diff --git a/meeting_notes/Reginald 08_09_23.md b/meeting_notes/Reginald 08_09_23.md
new file mode 100644
index 00000000..b49aecb0
--- /dev/null
+++ b/meeting_notes/Reginald 08_09_23.md	
@@ -0,0 +1,26 @@
+# Reginald 08/09/23
+
+## Notes
+- Ryan and Rosie are close to finishing a PR contribution to llama-index for the following issue: https://github.com/jerryjliu/llama_index/issues/7596
+    - ReAct engine seems to be a bit broken after this due to the default system prompt
+        - Potentially can make another issue and PR to fix
+- Other potential llama-index contributions
+    - Some llama-cpp chat engine example notebooks - there aren't many (if any at all) in the repo, and they would very much welcome more examples
+    - Ryan to try to fix an old issue posted during Hackweek (https://github.com/jerryjliu/llama_index/issues/6465)
+- Rosie has been working on getting GitHub issues and files from within our Hut23
+    - Will need to think about which personal token we should use
+        - Maybe check whether an institutional token already exists
+    - Will need to upload this to Azure as a secret
+- Idea: look into logging how good the answers are and saving this into a database
+    - In future, we could maybe use this for RLHF fine-tuning of our LLM
+
+## Actions
+- Ryan & Rosie: look at putting this on Azure
+    - Figure out how to have multiple chat instances at the same time
+    - Can start with running the bot locally
+- Rosie to think about using the reader for pure markdown files so there's no need to process them into csv
+    - Alternatively, to figure out how the csv files were created from pure markdown files
+    - Maybe it was from Andy's scripts to pull data from wiki and handbook
+- Ryan to see if we can update the OpenAI code and compare with Llama2
+- Ryan & Rosie: Try to get a quantized Llama2-70b running (for Tomas)
+- Merge Rosie's current PRs at some point next week
diff --git a/models/llama-index-hack/llama2_ccp_chat.ipynb b/models/llama-index-hack/llama2_ccp_chat.ipynb
index e5022422..c24cc03a 100644
--- a/models/llama-index-hack/llama2_ccp_chat.ipynb
+++ b/models/llama-index-hack/llama2_ccp_chat.ipynb
@@ -2,8 +2,8 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 50,
-   "id": "0471137c",
+   "execution_count": 1,
+   "id": "e78eb6a3",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -16,8 +16,7 @@
     "from llama_index import (\n",
     "    SimpleDirectoryReader,\n",
     "    LangchainEmbedding,\n",
-    "    GPTListIndex,\n",
-    "    GPTVectorStoreIndex,\n",
+    "    VectorStoreIndex,\n",
     "    PromptHelper,\n",
     "    LLMPredictor,\n",
     "    ServiceContext,\n",
@@ -29,17 +28,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 66,
-   "id": "a0683044",
+   "execution_count": 2,
+   "id": "fff875d3",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'0.8.14'"
+       "'0.8.22'"
       ]
      },
-     "execution_count": 66,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -51,29 +50,27 @@
   },
   {
    "cell_type": "markdown",
-   "id": "131d2c1e",
+   "id": "bee0bc50",
    "metadata": {},
    "source": [
-    "Note: notebook assumes that in the reginald directory, there is a `gguf_models/` folder. Here we've downloaded the quantized 4-bit version of Llama2-13b-chat from [`TheBloke/Llama-2-13B-chat-GGML`](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML). \n",
-    "\n",
-    "Note that we're currently running a version of `llama-cpp-python` which no longer supports `ggmmlv3` model formats and has changed to `gguf`. We need to convert the above to `gguf` format using the `convert-llama-ggmlv3-to-gguf.py` script in [`llama.cpp`](https://github.com/ggerganov/llama.cpp).\n",
+    "Note: notebook assumes that in the reginald directory, there is a `gguf_models/` folder. Here we've downloaded the quantized 6-bit version of Llama-2-13b-Chat from [`TheBloke/Llama-2-13b-Chat-GGUF`](https://huggingface.co/TheBloke/Llama-2-13b-Chat-GGUF). \n",
     "\n",
     "## Quick example with llama-cpp-python"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
-   "id": "dc00ba0c",
+   "execution_count": 3,
+   "id": "6ae386f1",
    "metadata": {},
    "outputs": [],
    "source": [
-    "llama_2_13b_chat_path = \"../../gguf_models/llama-2-13b-chat.gguf.q4_K_S.bin\""
+    "llama_2_path = \"../../gguf_models/llama-2-13b-chat.Q6_K.gguf\""
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "9fbc2dfd",
+   "id": "17f77bb0",
    "metadata": {},
    "source": [
     "## Using metal acceleration"
@@ -81,8 +78,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
-   "id": "d4aace92",
+   "execution_count": 4,
+   "id": "821f26be",
    "metadata": {
     "scrolled": true
    },
@@ -91,397 +88,397 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "llama_model_loader: loaded meta data with 18 key-value pairs and 363 tensors from ../../gguf_models/llama-2-13b-chat.gguf.q4_K_S.bin (version GGUF V2 (latest))\n",
-      "llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  5120, 32000,     1,     1 ]\n",
-      "llama_model_loader: - tensor    1:               output_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor    2:                    output.weight q6_K     [  5120, 32000,     1,     1 ]\n",
-      "llama_model_loader: - tensor    3:              blk.0.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor    4:              blk.0.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor    5:              blk.0.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor    6:         blk.0.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor    7:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor    8:            blk.0.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor    9:            blk.0.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   10:              blk.0.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   11:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   12:              blk.1.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   13:              blk.1.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   14:              blk.1.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   15:         blk.1.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   16:           blk.1.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   17:            blk.1.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   18:            blk.1.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   19:              blk.1.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   20:            blk.1.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   21:              blk.2.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   22:              blk.2.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   23:              blk.2.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   24:         blk.2.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   25:           blk.2.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   26:            blk.2.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   27:            blk.2.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   28:              blk.2.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   29:            blk.2.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   30:              blk.3.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   31:              blk.3.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   32:              blk.3.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   33:         blk.3.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   34:           blk.3.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   35:            blk.3.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   36:            blk.3.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   37:              blk.3.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   38:            blk.3.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   39:              blk.4.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   40:              blk.4.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   41:              blk.4.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   42:         blk.4.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   43:           blk.4.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   44:            blk.4.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   45:            blk.4.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   46:              blk.4.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   47:            blk.4.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   48:              blk.5.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   49:              blk.5.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   50:              blk.5.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   51:         blk.5.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   52:           blk.5.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   53:            blk.5.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   54:            blk.5.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   55:              blk.5.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   56:            blk.5.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   57:              blk.6.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   58:              blk.6.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   59:              blk.6.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   60:         blk.6.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   61:           blk.6.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   62:            blk.6.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   63:            blk.6.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   64:              blk.6.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   65:            blk.6.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   66:              blk.7.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   67:              blk.7.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   68:              blk.7.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   69:         blk.7.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   70:           blk.7.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   71:            blk.7.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   72:            blk.7.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   73:              blk.7.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   74:            blk.7.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   75:              blk.8.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   76:              blk.8.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   77:              blk.8.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   78:         blk.8.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   79:           blk.8.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   80:            blk.8.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   81:            blk.8.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   82:              blk.8.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   83:            blk.8.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   84:              blk.9.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   85:              blk.9.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   86:              blk.9.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   87:         blk.9.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   88:           blk.9.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   89:            blk.9.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   90:            blk.9.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   91:              blk.9.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   92:            blk.9.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   93:             blk.10.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   94:             blk.10.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   95:             blk.10.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   96:        blk.10.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   97:          blk.10.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   98:           blk.10.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   99:           blk.10.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  100:             blk.10.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  101:           blk.10.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  102:             blk.11.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  103:             blk.11.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  104:             blk.11.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  105:        blk.11.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  106:          blk.11.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  107:           blk.11.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  108:           blk.11.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  109:             blk.11.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  110:           blk.11.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  111:             blk.12.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  112:             blk.12.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  113:             blk.12.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  114:        blk.12.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  115:          blk.12.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  116:           blk.12.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  117:           blk.12.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  118:             blk.12.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  119:           blk.12.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  120:             blk.13.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  121:             blk.13.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  122:             blk.13.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  123:        blk.13.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  124:          blk.13.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  125:           blk.13.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  126:           blk.13.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  127:             blk.13.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  128:           blk.13.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  129:             blk.14.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  130:             blk.14.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  131:             blk.14.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  132:        blk.14.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  133:          blk.14.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  134:           blk.14.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  135:           blk.14.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  136:             blk.14.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  137:           blk.14.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  138:             blk.15.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  139:             blk.15.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  140:             blk.15.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  141:        blk.15.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  142:          blk.15.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  143:           blk.15.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  144:           blk.15.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  145:             blk.15.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  146:           blk.15.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  147:             blk.16.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  148:             blk.16.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  149:             blk.16.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  150:        blk.16.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  151:          blk.16.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  152:           blk.16.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  153:           blk.16.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  154:             blk.16.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  155:           blk.16.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  156:             blk.17.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  157:             blk.17.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  158:             blk.17.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  159:        blk.17.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  160:          blk.17.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  161:           blk.17.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  162:           blk.17.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  163:             blk.17.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  164:           blk.17.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  165:             blk.18.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  166:             blk.18.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  167:             blk.18.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  168:        blk.18.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  169:          blk.18.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  170:           blk.18.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  171:           blk.18.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  172:             blk.18.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  173:           blk.18.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  174:             blk.19.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  175:             blk.19.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  176:             blk.19.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  177:        blk.19.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  178:          blk.19.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  179:           blk.19.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  180:           blk.19.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  181:             blk.19.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  182:           blk.19.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  183:             blk.20.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  184:             blk.20.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  185:             blk.20.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  186:        blk.20.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  187:          blk.20.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  188:           blk.20.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  189:           blk.20.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  190:             blk.20.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  191:           blk.20.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  192:             blk.21.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  193:             blk.21.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  194:             blk.21.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  195:        blk.21.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  196:          blk.21.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  197:           blk.21.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  198:           blk.21.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  199:             blk.21.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  200:           blk.21.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  201:             blk.22.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  202:             blk.22.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  203:             blk.22.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  204:        blk.22.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  205:          blk.22.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  206:           blk.22.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  207:           blk.22.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  208:             blk.22.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  209:           blk.22.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  210:             blk.23.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  211:             blk.23.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  212:             blk.23.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  213:        blk.23.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  214:          blk.23.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  215:           blk.23.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  216:           blk.23.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  217:             blk.23.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  218:           blk.23.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  219:             blk.24.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  220:             blk.24.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  221:             blk.24.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  222:        blk.24.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  223:          blk.24.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  224:           blk.24.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  225:           blk.24.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  226:             blk.24.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  227:           blk.24.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  228:             blk.25.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  229:             blk.25.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  230:             blk.25.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  231:        blk.25.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  232:          blk.25.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  233:           blk.25.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  234:           blk.25.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  235:             blk.25.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  236:           blk.25.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  237:             blk.26.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  238:             blk.26.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  239:             blk.26.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  240:        blk.26.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  241:          blk.26.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  242:           blk.26.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  243:           blk.26.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  244:             blk.26.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  245:           blk.26.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  246:             blk.27.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  247:             blk.27.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  248:             blk.27.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  249:        blk.27.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  250:          blk.27.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  251:           blk.27.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  252:           blk.27.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  253:             blk.27.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  254:           blk.27.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  255:             blk.28.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  256:             blk.28.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  257:             blk.28.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  258:        blk.28.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  259:          blk.28.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  260:           blk.28.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  261:           blk.28.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  262:             blk.28.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  263:           blk.28.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  264:             blk.29.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  265:             blk.29.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  266:             blk.29.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  267:        blk.29.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  268:          blk.29.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  269:           blk.29.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  270:           blk.29.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  271:             blk.29.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  272:           blk.29.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  273:             blk.30.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  274:             blk.30.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  275:             blk.30.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  276:        blk.30.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  277:          blk.30.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  278:           blk.30.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  279:           blk.30.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  280:             blk.30.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  281:           blk.30.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  282:             blk.31.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  283:             blk.31.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  284:             blk.31.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  285:        blk.31.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  286:          blk.31.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  287:           blk.31.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  288:           blk.31.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  289:             blk.31.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  290:           blk.31.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  291:             blk.32.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  292:             blk.32.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  293:             blk.32.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  294:        blk.32.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  295:          blk.32.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  296:           blk.32.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  297:           blk.32.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  298:             blk.32.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  299:           blk.32.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  300:             blk.33.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  301:             blk.33.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  302:             blk.33.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  303:        blk.33.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  304:          blk.33.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  305:           blk.33.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  306:           blk.33.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  307:             blk.33.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  308:           blk.33.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  309:             blk.34.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  310:             blk.34.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  311:             blk.34.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  312:        blk.34.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  313:          blk.34.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  314:           blk.34.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  315:           blk.34.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  316:             blk.34.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  317:           blk.34.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  318:             blk.35.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  319:             blk.35.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  320:             blk.35.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  321:        blk.35.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  322:          blk.35.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  323:           blk.35.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  324:           blk.35.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  325:             blk.35.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  326:           blk.35.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  327:             blk.36.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  328:             blk.36.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  329:             blk.36.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  330:        blk.36.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  331:          blk.36.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  332:           blk.36.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  333:           blk.36.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  334:             blk.36.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  335:           blk.36.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  336:             blk.37.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  337:             blk.37.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  338:             blk.37.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  339:        blk.37.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  340:          blk.37.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  341:           blk.37.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  342:           blk.37.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  343:             blk.37.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  344:           blk.37.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  345:             blk.38.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  346:             blk.38.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  347:             blk.38.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  348:        blk.38.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  349:          blk.38.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  350:           blk.38.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  351:           blk.38.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  352:             blk.38.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  353:           blk.38.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  354:             blk.39.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  355:             blk.39.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  356:             blk.39.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  357:        blk.39.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  358:          blk.39.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  359:           blk.39.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  360:           blk.39.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  361:             blk.39.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  362:           blk.39.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from ../../gguf_models/llama-2-13b-chat.Q6_K.gguf (version GGUF V2 (latest))\n",
+      "llama_model_loader: - tensor    0:                token_embd.weight q6_K     [  5120, 32000,     1,     1 ]\n",
+      "llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor    6:              blk.0.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor    7:         blk.0.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor    8:              blk.0.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor    9:              blk.0.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   10:           blk.1.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   11:            blk.1.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   12:            blk.1.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   13:              blk.1.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   14:            blk.1.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   15:              blk.1.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   16:         blk.1.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   17:              blk.1.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   18:              blk.1.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   19:          blk.10.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   20:           blk.10.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   21:           blk.10.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   22:             blk.10.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   23:           blk.10.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   24:             blk.10.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   25:        blk.10.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   26:             blk.10.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   27:             blk.10.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   28:          blk.11.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   29:           blk.11.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   30:           blk.11.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   31:             blk.11.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   32:           blk.11.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   33:             blk.11.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   34:        blk.11.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   35:             blk.11.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   36:             blk.11.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   37:          blk.12.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   38:           blk.12.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   39:           blk.12.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   40:             blk.12.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   41:           blk.12.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   42:             blk.12.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   43:        blk.12.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   44:             blk.12.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   45:             blk.12.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   46:          blk.13.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   47:           blk.13.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   48:           blk.13.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   49:             blk.13.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   50:           blk.13.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   51:             blk.13.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   52:        blk.13.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   53:             blk.13.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   54:             blk.13.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   55:          blk.14.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   56:           blk.14.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   57:           blk.14.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   58:             blk.14.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   59:           blk.14.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   60:             blk.14.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   61:        blk.14.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   62:             blk.14.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   63:             blk.14.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   64:             blk.15.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   65:             blk.15.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   66:           blk.2.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   67:            blk.2.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   68:            blk.2.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   69:              blk.2.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   70:            blk.2.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   71:              blk.2.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   72:         blk.2.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   73:              blk.2.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   74:              blk.2.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   75:           blk.3.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   76:            blk.3.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   77:            blk.3.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   78:              blk.3.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   79:            blk.3.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   80:              blk.3.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   81:         blk.3.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   82:              blk.3.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   83:              blk.3.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   84:           blk.4.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   85:            blk.4.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   86:            blk.4.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   87:              blk.4.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   88:            blk.4.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   89:              blk.4.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   90:         blk.4.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   91:              blk.4.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   92:              blk.4.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   93:           blk.5.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   94:            blk.5.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   95:            blk.5.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   96:              blk.5.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   97:            blk.5.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   98:              blk.5.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   99:         blk.5.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  100:              blk.5.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  101:              blk.5.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  102:           blk.6.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  103:            blk.6.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  104:            blk.6.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  105:              blk.6.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  106:            blk.6.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  107:              blk.6.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  108:         blk.6.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  109:              blk.6.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  110:              blk.6.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  111:           blk.7.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  112:            blk.7.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  113:            blk.7.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  114:              blk.7.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  115:            blk.7.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  116:              blk.7.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  117:         blk.7.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  118:              blk.7.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  119:              blk.7.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  120:           blk.8.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  121:            blk.8.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  122:            blk.8.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  123:              blk.8.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  124:            blk.8.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  125:              blk.8.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  126:         blk.8.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  127:              blk.8.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  128:              blk.8.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  129:           blk.9.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  130:            blk.9.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  131:            blk.9.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  132:              blk.9.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  133:            blk.9.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  134:              blk.9.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  135:         blk.9.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  136:              blk.9.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  137:              blk.9.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  138:          blk.15.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  139:           blk.15.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  140:           blk.15.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  141:             blk.15.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  142:           blk.15.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  143:        blk.15.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  144:             blk.15.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  145:          blk.16.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  146:           blk.16.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  147:           blk.16.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  148:             blk.16.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  149:           blk.16.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  150:             blk.16.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  151:        blk.16.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  152:             blk.16.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  153:             blk.16.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  154:          blk.17.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  155:           blk.17.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  156:           blk.17.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  157:             blk.17.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  158:           blk.17.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  159:             blk.17.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  160:        blk.17.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  161:             blk.17.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  162:             blk.17.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  163:          blk.18.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  164:           blk.18.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  165:           blk.18.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  166:             blk.18.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  167:           blk.18.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  168:             blk.18.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  169:        blk.18.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  170:             blk.18.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  171:             blk.18.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  172:          blk.19.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  173:           blk.19.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  174:           blk.19.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  175:             blk.19.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  176:           blk.19.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  177:             blk.19.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  178:        blk.19.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  179:             blk.19.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  180:             blk.19.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  181:          blk.20.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  182:           blk.20.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  183:           blk.20.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  184:             blk.20.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  185:           blk.20.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  186:             blk.20.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  187:        blk.20.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  188:             blk.20.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  189:             blk.20.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  190:          blk.21.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  191:           blk.21.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  192:           blk.21.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  193:             blk.21.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  194:           blk.21.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  195:             blk.21.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  196:        blk.21.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  197:             blk.21.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  198:             blk.21.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  199:          blk.22.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  200:           blk.22.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  201:           blk.22.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  202:             blk.22.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  203:           blk.22.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  204:             blk.22.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  205:        blk.22.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  206:             blk.22.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  207:             blk.22.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  208:          blk.23.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  209:           blk.23.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  210:           blk.23.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  211:             blk.23.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  212:           blk.23.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  213:             blk.23.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  214:        blk.23.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  215:             blk.23.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  216:             blk.23.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  217:          blk.24.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  218:           blk.24.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  219:           blk.24.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  220:             blk.24.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  221:           blk.24.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  222:             blk.24.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  223:        blk.24.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  224:             blk.24.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  225:             blk.24.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  226:          blk.25.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  227:           blk.25.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  228:           blk.25.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  229:             blk.25.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  230:           blk.25.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  231:             blk.25.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  232:        blk.25.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  233:             blk.25.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  234:             blk.25.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  235:          blk.26.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  236:           blk.26.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  237:           blk.26.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  238:             blk.26.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  239:           blk.26.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  240:             blk.26.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  241:        blk.26.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  242:             blk.26.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  243:             blk.26.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  244:          blk.27.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  245:           blk.27.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  246:           blk.27.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  247:             blk.27.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  248:           blk.27.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  249:             blk.27.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  250:        blk.27.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  251:             blk.27.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  252:             blk.27.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  253:          blk.28.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  254:           blk.28.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  255:           blk.28.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  256:             blk.28.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  257:           blk.28.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  258:             blk.28.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  259:        blk.28.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  260:             blk.28.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  261:             blk.28.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  262:          blk.29.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  263:           blk.29.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  264:           blk.29.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  265:             blk.29.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  266:           blk.29.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  267:             blk.29.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  268:        blk.29.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  269:             blk.29.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  270:             blk.29.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  271:           blk.30.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  272:             blk.30.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  273:             blk.30.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  274:        blk.30.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  275:             blk.30.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  276:             blk.30.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  277:                    output.weight q6_K     [  5120, 32000,     1,     1 ]\n",
+      "llama_model_loader: - tensor  278:          blk.30.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  279:           blk.30.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  280:           blk.30.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  281:          blk.31.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  282:           blk.31.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  283:           blk.31.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  284:             blk.31.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  285:           blk.31.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  286:             blk.31.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  287:        blk.31.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  288:             blk.31.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  289:             blk.31.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  290:          blk.32.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  291:           blk.32.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  292:           blk.32.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  293:             blk.32.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  294:           blk.32.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  295:             blk.32.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  296:        blk.32.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  297:             blk.32.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  298:             blk.32.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  299:          blk.33.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  300:           blk.33.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  301:           blk.33.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  302:             blk.33.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  303:           blk.33.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  304:             blk.33.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  305:        blk.33.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  306:             blk.33.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  307:             blk.33.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  308:          blk.34.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  309:           blk.34.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  310:           blk.34.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  311:             blk.34.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  312:           blk.34.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  313:             blk.34.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  314:        blk.34.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  315:             blk.34.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  316:             blk.34.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  317:          blk.35.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  318:           blk.35.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  319:           blk.35.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  320:             blk.35.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  321:           blk.35.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  322:             blk.35.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  323:        blk.35.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  324:             blk.35.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  325:             blk.35.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  326:          blk.36.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  327:           blk.36.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  328:           blk.36.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  329:             blk.36.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  330:           blk.36.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  331:             blk.36.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  332:        blk.36.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  333:             blk.36.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  334:             blk.36.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  335:          blk.37.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  336:           blk.37.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  337:           blk.37.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  338:             blk.37.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  339:           blk.37.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  340:             blk.37.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  341:        blk.37.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  342:             blk.37.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  343:             blk.37.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  344:          blk.38.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  345:           blk.38.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  346:           blk.38.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  347:             blk.38.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  348:           blk.38.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  349:             blk.38.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  350:        blk.38.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  351:             blk.38.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  352:             blk.38.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  353:          blk.39.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  354:           blk.39.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  355:           blk.39.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  356:             blk.39.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  357:           blk.39.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  358:             blk.39.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  359:        blk.39.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  360:             blk.39.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  361:             blk.39.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  362:               output_norm.weight f32      [  5120,     1,     1,     1 ]\n",
       "llama_model_loader: - kv   0:                       general.architecture str     \n",
       "llama_model_loader: - kv   1:                               general.name str     \n",
-      "llama_model_loader: - kv   2:                        general.description str     \n",
-      "llama_model_loader: - kv   3:                       llama.context_length u32     \n",
-      "llama_model_loader: - kv   4:                     llama.embedding_length u32     \n",
-      "llama_model_loader: - kv   5:                          llama.block_count u32     \n",
-      "llama_model_loader: - kv   6:                  llama.feed_forward_length u32     \n",
-      "llama_model_loader: - kv   7:                 llama.rope.dimension_count u32     \n",
-      "llama_model_loader: - kv   8:                 llama.attention.head_count u32     \n",
-      "llama_model_loader: - kv   9:              llama.attention.head_count_kv u32     \n",
-      "llama_model_loader: - kv  10:     llama.attention.layer_norm_rms_epsilon f32     \n",
+      "llama_model_loader: - kv   2:                       llama.context_length u32     \n",
+      "llama_model_loader: - kv   3:                     llama.embedding_length u32     \n",
+      "llama_model_loader: - kv   4:                          llama.block_count u32     \n",
+      "llama_model_loader: - kv   5:                  llama.feed_forward_length u32     \n",
+      "llama_model_loader: - kv   6:                 llama.rope.dimension_count u32     \n",
+      "llama_model_loader: - kv   7:                 llama.attention.head_count u32     \n",
+      "llama_model_loader: - kv   8:              llama.attention.head_count_kv u32     \n",
+      "llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32     \n",
+      "llama_model_loader: - kv  10:                          general.file_type u32     \n",
       "llama_model_loader: - kv  11:                       tokenizer.ggml.model str     \n",
       "llama_model_loader: - kv  12:                      tokenizer.ggml.tokens arr     \n",
       "llama_model_loader: - kv  13:                      tokenizer.ggml.scores arr     \n",
       "llama_model_loader: - kv  14:                  tokenizer.ggml.token_type arr     \n",
-      "llama_model_loader: - kv  15:            tokenizer.ggml.unknown_token_id u32     \n",
-      "llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32     \n",
-      "llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32     \n",
+      "llama_model_loader: - kv  15:                tokenizer.ggml.bos_token_id u32     \n",
+      "llama_model_loader: - kv  16:                tokenizer.ggml.eos_token_id u32     \n",
+      "llama_model_loader: - kv  17:            tokenizer.ggml.unknown_token_id u32     \n",
+      "llama_model_loader: - kv  18:               general.quantization_version u32     \n",
       "llama_model_loader: - type  f32:   81 tensors\n",
-      "llama_model_loader: - type q4_K:  281 tensors\n",
-      "llama_model_loader: - type q6_K:    1 tensors\n",
+      "llama_model_loader: - type q6_K:  282 tensors\n",
       "llm_load_print_meta: format         = GGUF V2 (latest)\n",
       "llm_load_print_meta: arch           = llama\n",
       "llm_load_print_meta: vocab type     = SPM\n",
       "llm_load_print_meta: n_vocab        = 32000\n",
       "llm_load_print_meta: n_merges       = 0\n",
-      "llm_load_print_meta: n_ctx_train    = 2048\n",
+      "llm_load_print_meta: n_ctx_train    = 4096\n",
       "llm_load_print_meta: n_ctx          = 512\n",
       "llm_load_print_meta: n_embd         = 5120\n",
       "llm_load_print_meta: n_head         = 40\n",
@@ -490,95 +487,89 @@
       "llm_load_print_meta: n_rot          = 128\n",
       "llm_load_print_meta: n_gqa          = 1\n",
       "llm_load_print_meta: f_norm_eps     = 1.0e-05\n",
-      "llm_load_print_meta: f_norm_rms_eps = 5.0e-06\n",
+      "llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n",
       "llm_load_print_meta: n_ff           = 13824\n",
       "llm_load_print_meta: freq_base      = 10000.0\n",
       "llm_load_print_meta: freq_scale     = 1\n",
       "llm_load_print_meta: model type     = 13B\n",
-      "llm_load_print_meta: model ftype    = mostly Q4_K - Medium (guessed)\n",
+      "llm_load_print_meta: model ftype    = mostly Q6_K\n",
       "llm_load_print_meta: model size     = 13.02 B\n",
-      "llm_load_print_meta: general.name   = llama-2-13b-chat.ggmlv3.q4_K_S.bin\n",
+      "llm_load_print_meta: general.name   = LLaMA v2\n",
       "llm_load_print_meta: BOS token = 1 '<s>'\n",
       "llm_load_print_meta: EOS token = 2 '</s>'\n",
       "llm_load_print_meta: UNK token = 0 '<unk>'\n",
       "llm_load_print_meta: LF token  = 13 '<0x0A>'\n",
       "llm_load_tensors: ggml ctx size =    0.12 MB\n",
-      "llm_load_tensors: mem required  = 7024.01 MB (+  400.00 MB per state)\n",
-      "...................................................................................................\n",
+      "llm_load_tensors: mem required  = 10183.83 MB (+  400.00 MB per state)\n",
+      "....................................................................................................\n",
       "llama_new_context_with_model: kv self size  =  400.00 MB\n",
       "ggml_metal_init: allocating\n",
       "ggml_metal_init: loading '/Users/rchan/opt/miniconda3/envs/reginald/lib/python3.11/site-packages/llama_cpp/ggml-metal.metal'\n",
-      "ggml_metal_init: loaded kernel_add                            0x1779615f0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_add_row                        0x177961850 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul                            0x177957ac0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_row                        0x177957d20 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_scale                          0x177957f80 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_silu                           0x1779581e0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_relu                           0x177955c40 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_gelu                           0x177955ea0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_soft_max                       0x177961e70 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_diag_mask_inf                  0x1779620d0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_get_rows_f16                   0x177962330 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_0                  0x177962590 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_1                  0x177965370 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_get_rows_q8_0                  0x1779655d0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_get_rows_q2_K                  0x177965830 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_get_rows_q3_K                  0x177965a90 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_K                  0x177965cf0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_get_rows_q5_K                  0x177965f50 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_get_rows_q6_K                  0x1779661b0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_rms_norm                       0x177966410 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_norm                           0x177966670 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mat_f16_f32                0x1779668d0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_0_f32               0x177966b30 | th_max =  896 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_1_f32               0x177966d90 | th_max =  896 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q8_0_f32               0x177966ff0 | th_max =  768 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q2_K_f32               0x177967250 | th_max =  640 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q3_K_f32               0x177967530 | th_max =  704 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_K_f32               0x177967790 | th_max =  576 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q5_K_f32               0x1779679f0 | th_max =  576 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q6_K_f32               0x177967c50 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mm_f16_f32                 0x177967eb0 | th_max =  768 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mm_q4_0_f32                0x177968110 | th_max =  768 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mm_q8_0_f32                0x177968370 | th_max =  768 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mm_q4_1_f32                0x1779685d0 | th_max =  768 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mm_q2_K_f32                0x177968830 | th_max =  768 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mm_q3_K_f32                0x177968a90 | th_max =  768 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mm_q4_K_f32                0x177968cf0 | th_max =  768 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mm_q5_K_f32                0x177968f50 | th_max =  704 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mm_q6_K_f32                0x1779691b0 | th_max =  704 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_rope                           0x177969410 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_alibi_f32                      0x177969670 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_cpy_f32_f16                    0x1779698d0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_cpy_f32_f32                    0x177969b30 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_cpy_f16_f16                    0x112775cc0 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_add                            0x162fd5370 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_add_row                        0x162fd55d0 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul                            0x162fd5830 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_row                        0x162fd5a90 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_scale                          0x162fd5cf0 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_silu                           0x162fd5f50 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_relu                           0x162fd61b0 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_gelu                           0x162fd6410 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_soft_max                       0x162fd6670 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_diag_mask_inf                  0x162fd68d0 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_get_rows_f16                   0x162fd6b30 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_get_rows_q4_0                  0x162fd6d90 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_get_rows_q4_1                  0x162fd6ff0 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_get_rows_q8_0                  0x162fd7250 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_get_rows_q2_K                  0x162fd74b0 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_get_rows_q3_K                  0x162fd7710 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_get_rows_q4_K                  0x162fd7970 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_get_rows_q5_K                  0x162fd7cf0 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_get_rows_q6_K                  0x162fd8200 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_rms_norm                       0x162fd8720 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_norm                           0x162fd8c30 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mat_f16_f32                0x162fd9340 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q4_0_f32               0x162fd9900 | th_max =  896 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q4_1_f32               0x162fda040 | th_max =  896 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q8_0_f32               0x162fda600 | th_max =  768 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q2_K_f32               0x162fdabc0 | th_max =  640 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q3_K_f32               0x162fdb180 | th_max =  704 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q4_K_f32               0x162fdb940 | th_max =  576 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q5_K_f32               0x162fdc160 | th_max =  576 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q6_K_f32               0x162fdc720 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mm_f16_f32                 0x162fdcd20 | th_max =  768 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mm_q4_0_f32                0x162fdd320 | th_max =  768 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mm_q8_0_f32                0x162fdd920 | th_max =  768 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mm_q4_1_f32                0x162fddf20 | th_max =  768 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mm_q2_K_f32                0x162fde520 | th_max =  768 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mm_q3_K_f32                0x162fdeb20 | th_max =  768 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mm_q4_K_f32                0x162fdf120 | th_max =  768 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mm_q5_K_f32                0x162fdf720 | th_max =  704 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mm_q6_K_f32                0x162fdfd20 | th_max =  704 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_rope                           0x162fe00a0 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_alibi_f32                      0x162fe07c0 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_cpy_f32_f16                    0x162fe0eb0 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_cpy_f32_f32                    0x162fe15a0 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_cpy_f16_f16                    0x162fe1c90 | th_max = 1024 | th_width =   32\n",
       "ggml_metal_init: recommendedMaxWorkingSetSize  = 21845.34 MB\n",
       "ggml_metal_init: hasUnifiedMemory              = true\n",
-      "ggml_metal_init: maxTransferRate               = built-in GPU\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
+      "ggml_metal_init: maxTransferRate               = built-in GPU\n",
       "llama_new_context_with_model: compute buffer total size =   91.47 MB\n",
       "llama_new_context_with_model: max tensor size =   128.17 MB\n",
-      "ggml_metal_add_buffer: allocated 'data            ' buffer, size =  7024.61 MB, (14549.28 / 21845.34)\n",
-      "ggml_metal_add_buffer: allocated 'eval            ' buffer, size =     1.48 MB, (14550.77 / 21845.34)\n",
-      "ggml_metal_add_buffer: allocated 'kv              ' buffer, size =   402.00 MB, (14952.77 / 21845.34)\n",
+      "ggml_metal_add_buffer: allocated 'data            ' buffer, size = 10184.42 MB, (10184.86 / 21845.34)\n",
+      "ggml_metal_add_buffer: allocated 'eval            ' buffer, size =     1.48 MB, (10186.34 / 21845.34)\n",
+      "ggml_metal_add_buffer: allocated 'kv              ' buffer, size =   402.00 MB, (10588.34 / 21845.34)\n",
       "AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | SSSE3 = 0 | VSX = 0 | \n",
-      "ggml_metal_add_buffer: allocated 'alloc           ' buffer, size =    90.02 MB, (15042.78 / 21845.34)\n"
+      "ggml_metal_add_buffer: allocated 'alloc           ' buffer, size =    90.02 MB, (10678.36 / 21845.34)\n"
      ]
     }
    ],
    "source": [
-    "llm = Llama(model_path=llama_2_13b_chat_path, n_gpu_layers=1)"
+    "llm = Llama(model_path=llama_2_path, n_gpu_layers=1)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
-   "id": "5aade599",
+   "execution_count": 5,
+   "id": "5c87e243",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -587,40 +578,40 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
-   "id": "4c1cf394",
-   "metadata": {},
+   "execution_count": 6,
+   "id": "1d0c196c",
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
       "\n",
-      "llama_print_timings:        load time =   552.96 ms\n",
-      "llama_print_timings:      sample time =   179.55 ms /   256 runs   (    0.70 ms per token,  1425.80 tokens per second)\n",
-      "llama_print_timings: prompt eval time =   552.93 ms /    17 tokens (   32.53 ms per token,    30.75 tokens per second)\n",
-      "llama_print_timings:        eval time = 14287.03 ms /   255 runs   (   56.03 ms per token,    17.85 tokens per second)\n",
-      "llama_print_timings:       total time = 15342.45 ms\n"
+      "llama_print_timings:        load time =   567.44 ms\n",
+      "llama_print_timings:      sample time =   229.39 ms /   327 runs   (    0.70 ms per token,  1425.54 tokens per second)\n",
+      "llama_print_timings: prompt eval time =   567.41 ms /    17 tokens (   33.38 ms per token,    29.96 tokens per second)\n",
+      "llama_print_timings:        eval time = 30608.18 ms /   326 runs   (   93.89 ms per token,    10.65 tokens per second)\n",
+      "llama_print_timings:       total time = 31823.15 ms\n"
      ]
     }
    ],
    "source": [
-    "output = llm(prompt_example,\n",
-    "             max_tokens=512,\n",
-    "             echo=True)"
+    "output = llm(prompt_example, max_tokens=512, echo=True)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
-   "id": "fc2c20fb",
+   "execution_count": 7,
+   "id": "d67c8401",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{'id': 'cmpl-618337c4-bc4d-4818-99d4-e87893cf21fb', 'object': 'text_completion', 'created': 1693518842, 'model': '../../gguf_models/llama-2-13b-chat.gguf.q4_K_S.bin', 'choices': [{'text': \"Name all the planets in the solar system and state their distances to the sun.\\n\\nThere are eight planets in the solar system: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. Here is a list of each planet along with its distance from the Sun (in astronomical units or AU):\\n\\n1. Mercury - 0.4 AU (very close to the Sun)\\n2. Venus - 1.0 AU (just inside Earth's orbit)\\n3. Earth - 1.0 AU (the distance from the Earth to the Sun is called an astronomical unit, or AU)\\n4. Mars - 1.6 AU (about 1.5 times the distance from the Earth to the Sun)\\n5. Jupiter - 5.2 AU (about 5 times the distance from the Earth to the Sun)\\n6. Saturn - 9.5 AU (almost twice the distance from the Earth to the Sun)\\n7. Uranus - 19.0 AU (about 4 times the distance from the Earth to the Sun)\\n8. Neptune - 30.1 AU (more than \", 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 17, 'completion_tokens': 256, 'total_tokens': 273}}\n"
+      "{'id': 'cmpl-35d1cb16-69fa-4ff5-9bf3-aaf53d3e866d', 'object': 'text_completion', 'created': 1694191828, 'model': '../../gguf_models/llama-2-13b-chat.Q6_K.gguf', 'choices': [{'text': 'Name all the planets in the solar system and state their distances to the sun.\\n\\nThere are eight planets in the solar system: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. Here is a list of the planets in order from closest to farthest from the Sun:\\n\\n1. Mercury - 57,909,227 km (0.387 AU)\\n2. Venus - 108,208,930 km (0.723 AU)\\n3. Earth - 149,597,870 km (1 AU)\\n4. Mars - 225,000,000 km (1.381 AU)\\n5. Jupiter - 778,299,000 km (5.203 AU)\\n6. Saturn - 1,426,666,400 km (8.388 AU)\\n7. Uranus - 2,870,972,200 km (19.218 AU)\\n8. Neptune - 4,497,072,000 km (30.05 AU)\\n\\nNote: One astronomical unit (AU) is the average distance between the Earth and the Sun, which is approximately 93 million miles or 149.6 million kilometers.', 'index': 0, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 17, 'completion_tokens': 326, 'total_tokens': 343}}\n"
      ]
     }
    ],
@@ -630,8 +621,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
-   "id": "7cc677a8",
+   "execution_count": 8,
+   "id": "336029d1",
    "metadata": {},
    "outputs": [
     {
@@ -640,16 +631,18 @@
      "text": [
       "Name all the planets in the solar system and state their distances to the sun.\n",
       "\n",
-      "There are eight planets in the solar system: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. Here is a list of each planet along with its distance from the Sun (in astronomical units or AU):\n",
+      "There are eight planets in the solar system: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. Here is a list of the planets in order from closest to farthest from the Sun:\n",
+      "\n",
+      "1. Mercury - 57,909,227 km (0.387 AU)\n",
+      "2. Venus - 108,208,930 km (0.723 AU)\n",
+      "3. Earth - 149,597,870 km (1 AU)\n",
+      "4. Mars - 225,000,000 km (1.381 AU)\n",
+      "5. Jupiter - 778,299,000 km (5.203 AU)\n",
+      "6. Saturn - 1,426,666,400 km (8.388 AU)\n",
+      "7. Uranus - 2,870,972,200 km (19.218 AU)\n",
+      "8. Neptune - 4,497,072,000 km (30.05 AU)\n",
       "\n",
-      "1. Mercury - 0.4 AU (very close to the Sun)\n",
-      "2. Venus - 1.0 AU (just inside Earth's orbit)\n",
-      "3. Earth - 1.0 AU (the distance from the Earth to the Sun is called an astronomical unit, or AU)\n",
-      "4. Mars - 1.6 AU (about 1.5 times the distance from the Earth to the Sun)\n",
-      "5. Jupiter - 5.2 AU (about 5 times the distance from the Earth to the Sun)\n",
-      "6. Saturn - 9.5 AU (almost twice the distance from the Earth to the Sun)\n",
-      "7. Uranus - 19.0 AU (about 4 times the distance from the Earth to the Sun)\n",
-      "8. Neptune - 30.1 AU (more than \n"
+      "Note: One astronomical unit (AU) is the average distance between the Earth and the Sun, which is approximately 93 million miles or 149.6 million kilometers.\n"
      ]
     }
    ],
@@ -659,7 +652,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "036f5ead",
+   "id": "365beee4",
    "metadata": {},
    "source": [
     "## Using CPU"
@@ -667,365 +660,465 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
-   "id": "67d96462",
+   "execution_count": 9,
+   "id": "ad96cb8c",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "llama_model_loader: loaded meta data with 18 key-value pairs and 363 tensors from ../../gguf_models/llama-2-13b-chat.gguf.q4_K_S.bin (version GGUF V2 (latest))\n",
-      "llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  5120, 32000,     1,     1 ]\n",
-      "llama_model_loader: - tensor    1:               output_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor    2:                    output.weight q6_K     [  5120, 32000,     1,     1 ]\n",
-      "llama_model_loader: - tensor    3:              blk.0.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor    4:              blk.0.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor    5:              blk.0.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor    6:         blk.0.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor    7:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor    8:            blk.0.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor    9:            blk.0.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   10:              blk.0.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   11:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   12:              blk.1.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   13:              blk.1.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   14:              blk.1.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   15:         blk.1.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   16:           blk.1.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   17:            blk.1.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   18:            blk.1.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   19:              blk.1.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   20:            blk.1.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   21:              blk.2.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   22:              blk.2.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   23:              blk.2.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   24:         blk.2.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   25:           blk.2.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   26:            blk.2.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   27:            blk.2.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   28:              blk.2.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   29:            blk.2.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   30:              blk.3.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   31:              blk.3.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   32:              blk.3.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   33:         blk.3.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   34:           blk.3.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   35:            blk.3.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   36:            blk.3.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   37:              blk.3.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   38:            blk.3.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   39:              blk.4.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   40:              blk.4.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   41:              blk.4.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   42:         blk.4.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   43:           blk.4.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   44:            blk.4.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   45:            blk.4.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   46:              blk.4.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   47:            blk.4.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   48:              blk.5.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   49:              blk.5.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   50:              blk.5.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   51:         blk.5.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   52:           blk.5.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   53:            blk.5.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   54:            blk.5.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   55:              blk.5.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   56:            blk.5.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   57:              blk.6.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   58:              blk.6.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   59:              blk.6.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   60:         blk.6.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   61:           blk.6.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   62:            blk.6.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   63:            blk.6.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   64:              blk.6.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   65:            blk.6.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   66:              blk.7.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   67:              blk.7.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   68:              blk.7.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   69:         blk.7.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   70:           blk.7.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   71:            blk.7.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   72:            blk.7.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   73:              blk.7.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   74:            blk.7.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   75:              blk.8.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   76:              blk.8.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   77:              blk.8.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   78:         blk.8.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   79:           blk.8.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   80:            blk.8.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   81:            blk.8.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   82:              blk.8.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   83:            blk.8.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   84:              blk.9.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   85:              blk.9.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   86:              blk.9.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   87:         blk.9.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   88:           blk.9.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   89:            blk.9.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   90:            blk.9.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   91:              blk.9.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   92:            blk.9.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   93:             blk.10.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   94:             blk.10.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   95:             blk.10.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   96:        blk.10.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   97:          blk.10.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   98:           blk.10.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   99:           blk.10.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  100:             blk.10.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  101:           blk.10.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  102:             blk.11.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  103:             blk.11.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  104:             blk.11.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  105:        blk.11.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  106:          blk.11.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  107:           blk.11.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  108:           blk.11.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  109:             blk.11.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  110:           blk.11.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  111:             blk.12.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  112:             blk.12.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  113:             blk.12.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  114:        blk.12.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  115:          blk.12.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  116:           blk.12.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  117:           blk.12.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  118:             blk.12.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  119:           blk.12.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  120:             blk.13.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  121:             blk.13.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  122:             blk.13.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  123:        blk.13.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  124:          blk.13.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  125:           blk.13.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  126:           blk.13.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  127:             blk.13.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  128:           blk.13.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  129:             blk.14.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  130:             blk.14.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  131:             blk.14.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  132:        blk.14.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  133:          blk.14.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  134:           blk.14.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  135:           blk.14.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  136:             blk.14.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  137:           blk.14.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  138:             blk.15.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  139:             blk.15.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  140:             blk.15.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  141:        blk.15.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  142:          blk.15.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  143:           blk.15.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  144:           blk.15.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  145:             blk.15.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  146:           blk.15.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  147:             blk.16.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  148:             blk.16.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  149:             blk.16.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  150:        blk.16.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  151:          blk.16.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  152:           blk.16.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  153:           blk.16.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  154:             blk.16.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  155:           blk.16.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  156:             blk.17.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  157:             blk.17.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  158:             blk.17.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  159:        blk.17.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  160:          blk.17.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  161:           blk.17.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  162:           blk.17.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  163:             blk.17.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  164:           blk.17.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  165:             blk.18.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  166:             blk.18.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  167:             blk.18.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  168:        blk.18.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  169:          blk.18.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  170:           blk.18.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  171:           blk.18.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  172:             blk.18.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  173:           blk.18.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  174:             blk.19.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  175:             blk.19.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  176:             blk.19.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  177:        blk.19.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  178:          blk.19.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  179:           blk.19.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  180:           blk.19.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  181:             blk.19.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  182:           blk.19.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  183:             blk.20.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  184:             blk.20.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  185:             blk.20.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  186:        blk.20.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  187:          blk.20.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  188:           blk.20.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  189:           blk.20.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  190:             blk.20.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  191:           blk.20.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  192:             blk.21.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  193:             blk.21.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  194:             blk.21.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  195:        blk.21.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  196:          blk.21.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  197:           blk.21.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  198:           blk.21.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  199:             blk.21.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  200:           blk.21.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  201:             blk.22.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  202:             blk.22.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  203:             blk.22.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  204:        blk.22.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  205:          blk.22.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  206:           blk.22.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  207:           blk.22.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  208:             blk.22.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  209:           blk.22.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  210:             blk.23.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  211:             blk.23.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  212:             blk.23.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  213:        blk.23.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  214:          blk.23.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  215:           blk.23.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  216:           blk.23.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  217:             blk.23.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  218:           blk.23.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  219:             blk.24.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  220:             blk.24.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  221:             blk.24.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  222:        blk.24.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  223:          blk.24.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  224:           blk.24.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  225:           blk.24.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  226:             blk.24.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  227:           blk.24.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  228:             blk.25.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  229:             blk.25.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  230:             blk.25.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  231:        blk.25.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  232:          blk.25.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  233:           blk.25.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  234:           blk.25.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  235:             blk.25.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  236:           blk.25.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  237:             blk.26.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  238:             blk.26.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  239:             blk.26.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  240:        blk.26.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  241:          blk.26.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  242:           blk.26.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  243:           blk.26.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  244:             blk.26.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  245:           blk.26.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  246:             blk.27.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  247:             blk.27.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  248:             blk.27.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  249:        blk.27.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  250:          blk.27.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  251:           blk.27.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  252:           blk.27.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  253:             blk.27.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  254:           blk.27.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  255:             blk.28.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  256:             blk.28.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  257:             blk.28.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  258:        blk.28.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  259:          blk.28.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  260:           blk.28.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  261:           blk.28.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  262:             blk.28.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  263:           blk.28.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  264:             blk.29.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  265:             blk.29.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  266:             blk.29.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  267:        blk.29.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  268:          blk.29.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  269:           blk.29.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  270:           blk.29.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  271:             blk.29.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  272:           blk.29.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  273:             blk.30.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  274:             blk.30.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  275:             blk.30.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  276:        blk.30.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  277:          blk.30.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  278:           blk.30.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  279:           blk.30.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  280:             blk.30.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  281:           blk.30.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  282:             blk.31.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  283:             blk.31.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  284:             blk.31.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  285:        blk.31.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  286:          blk.31.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  287:           blk.31.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llamAVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | SSSE3 = 0 | VSX = 0 | \n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "a_model_loader: - tensor  288:           blk.31.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  289:             blk.31.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  290:           blk.31.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  291:             blk.32.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  292:             blk.32.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  293:             blk.32.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  294:        blk.32.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  295:          blk.32.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  296:           blk.32.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  297:           blk.32.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  298:             blk.32.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  299:           blk.32.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  300:             blk.33.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  301:             blk.33.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  302:             blk.33.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  303:        blk.33.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  304:          blk.33.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  305:           blk.33.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  306:           blk.33.ffn_down"
+      "llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from ../../gguf_models/llama-2-13b-chat.Q6_K.gguf (version GGUF V2 (latest))\n",
+      "llama_model_loader: - tensor    0:                token_embd.weight q6_K     [  5120, 32000,     1,     1 ]\n",
+      "llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor    6:              blk.0.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor    7:         blk.0.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor    8:              blk.0.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor    9:              blk.0.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   10:           blk.1.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   11:            blk.1.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   12:            blk.1.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   13:              blk.1.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   14:            blk.1.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   15:              blk.1.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   16:         blk.1.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   17:              blk.1.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   18:              blk.1.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   19:          blk.10.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   20:           blk.10.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   21:           blk.10.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   22:             blk.10.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   23:           blk.10.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   24:             blk.10.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   25:        blk.10.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   26:             blk.10.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   27:             blk.10.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   28:          blk.11.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   29:           blk.11.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   30:           blk.11.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   31:             blk.11.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   32:           blk.11.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   33:             blk.11.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   34:        blk.11.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   35:             blk.11.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   36:             blk.11.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   37:          blk.12.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   38:           blk.12.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   39:           blk.12.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   40:             blk.12.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   41:           blk.12.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   42:             blk.12.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   43:        blk.12.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   44:             blk.12.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   45:             blk.12.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   46:          blk.13.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   47:           blk.13.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   48:           blk.13.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   49:             blk.13.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   50:           blk.13.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   51:             blk.13.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   52:        blk.13.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   53:             blk.13.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   54:             blk.13.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   55:          blk.14.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   56:           blk.14.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   57:           blk.14.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   58:             blk.14.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   59:           blk.14.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   60:             blk.14.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   61:        blk.14.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   62:             blk.14.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   63:             blk.14.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   64:             blk.15.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   65:             blk.15.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   66:           blk.2.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   67:            blk.2.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   68:            blk.2.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   69:              blk.2.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   70:            blk.2.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   71:              blk.2.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   72:         blk.2.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   73:              blk.2.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   74:              blk.2.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   75:           blk.3.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   76:            blk.3.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   77:            blk.3.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   78:              blk.3.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   79:            blk.3.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   80:              blk.3.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   81:         blk.3.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   82:              blk.3.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   83:              blk.3.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   84:           blk.4.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   85:            blk.4.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   86:            blk.4.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   87:              blk.4.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   88:            blk.4.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   89:              blk.4.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   90:         blk.4.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   91:              blk.4.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   92:              blk.4.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   93:           blk.5.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   94:            blk.5.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   95:            blk.5.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   96:              blk.5.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   97:            blk.5.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   98:              blk.5.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   99:         blk.5.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  100:              blk.5.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  101:              blk.5.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  102:           blk.6.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  103:            blk.6.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  104:            blk.6.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  105:              blk.6.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  106:            blk.6.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  107:              blk.6.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  108:         blk.6.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  109:              blk.6.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  110:              blk.6.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  111:           blk.7.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  112:            blk.7.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  113:            blk.7.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  114:              blk.7.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  115:            blk.7.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  116:              blk.7.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  117:         blk.7.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  118:              blk.7.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  119:              blk.7.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  120:           blk.8.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  121:            blk.8.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  122:            blk.8.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  123:              blk.8.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  124:            blk.8.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  125:              blk.8.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  126:         blk.8.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  127:              blk.8.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  128:              blk.8.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  129:           blk.9.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  130:            blk.9.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  131:            blk.9.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  132:              blk.9.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  133:            blk.9.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  134:              blk.9.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  135:         blk.9.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  136:              blk.9.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  137:              blk.9.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  138:          blk.15.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  139:           blk.15.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  140:           blk.15.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  141:             blk.15.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  142:           blk.15.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  143:        blk.15.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  144:             blk.15.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  145:          blk.16.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  146:           blk.16.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  147:           blk.16.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  148:             blk.16.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  149:           blk.16.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  150:             blk.16.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  151:        blk.16.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  152:             blk.16.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  153:             blk.16.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  154:          blk.17.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  155:           blk.17.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  156:           blk.17.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  157:             blk.17.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  158:           blk.17.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  159:             blk.17.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  160:        blk.17.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  161:             blk.17.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  162:             blk.17.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  163:          blk.18.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  164:           blk.18.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  165:           blk.18.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  166:             blk.18.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  167:           blk.18.ffn_norm.weight f32 AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | SSSE3 = 0 | VSX = 0 | \n",
+      "     [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  168:             blk.18.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  169:        blk.18.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  170:             blk.18.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  171:             blk.18.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  172:          blk.19.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  173:           blk.19.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  174:           blk.19.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  175:             blk.19.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  176:           blk.19.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  177:             blk.19.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  178:        blk.19.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  179:             blk.19.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  180:             blk.19.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  181:          blk.20.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  182:           blk.20.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  183:           blk.20.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  184:             blk.20.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  185:           blk.20.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  186:             blk.20.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  187:        blk.20.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  188:             blk.20.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  189:             blk.20.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  190:          blk.21.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  191:           blk.21.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  192:           blk.21.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  193:             blk.21.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  194:           blk.21.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  195:             blk.21.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  196:        blk.21.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  197:             blk.21.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  198:             blk.21.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  199:          blk.22.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  200:           blk.22.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  201:           blk.22.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  202:             blk.22.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  203:           blk.22.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  204:             blk.22.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  205:        blk.22.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  206:             blk.22.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  207:             blk.22.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  208:          blk.23.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  209:           blk.23.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  210:           blk.23.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  211:             blk.23.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  212:           blk.23.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  213:             blk.23.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  214:        blk.23.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  215:             blk.23.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  216:             blk.23.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  217:          blk.24.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  218:           blk.24.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  219:           blk.24.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  220:             blk.24.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  221:           blk.24.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  222:             blk.24.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  223:        blk.24.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  224:             blk.24.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  225:             blk.24.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  226:          blk.25.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  227:           blk.25.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  228:           blk.25.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  229:             blk.25.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  230:           blk.25.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  231:             blk.25.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  232:        blk.25.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  233:             blk.25.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  234:             blk.25.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  235:          blk.26.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  236:           blk.26.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  237:           blk.26.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  238:             blk.26.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  239:           blk.26.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  240:             blk.26.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  241:        blk.26.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  242:             blk.26.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  243:             blk.26.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  244:          blk.27.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  245:           blk.27.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  246:           blk.27.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  247:             blk.27.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  248:           blk.27.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  249:             blk.27.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  250:        blk.27.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  251:             blk.27.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  252:             blk.27.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  253:          blk.28.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  254:           blk.28.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  255:           blk.28.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  256:             blk.28.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  257:           blk.28.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  258:             blk.28.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  259:        blk.28.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  260:             blk.28.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  261:             blk.28.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  262:          blk.29.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  263:           blk.29.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  264:           blk.29.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  265:             blk.29.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  266:           blk.29.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  267:             blk.29.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  268:        blk.29.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  269:             blk.29.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  270:             blk.29.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  271:           blk.30.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  272:             blk.30.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  273:             blk.30.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  274:        blk.30.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  275:             blk.30.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  276:             blk.30.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  277:                    output.weight q6_K     [  5120, 32000,     1,     1 ]\n",
+      "llama_model_loader: - tensor  278:          blk.30.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  279:           blk.30.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  280:           blk.30.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  281:          blk.31.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  282:           blk.31.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  283:           blk.31.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  284:             blk.31.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  285:           blk.31.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  286:             blk.31.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  287:        blk.31.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  288:             blk.31.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  289:             blk.31.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  290:          blk.32.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  291:           blk.32.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  292:           blk.32.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  293:             blk.32.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  294:           blk.32.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  295:             blk.32.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  296:        blk.32.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  297:             blk.32.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  298:             blk.32.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  299:          blk.33.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  300:           blk.33.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  301:           blk.33.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  302:             blk.33.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  303:           blk.33.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  304:             blk.33.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  305:        blk.33.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  306:             blk.33.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  307:             blk.33.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  308:          blk.34.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  309:           blk.34.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  310:           blk.34.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  311:             blk.34.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  312:           blk.34.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  313:             blk.34.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  314:        blk.34.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  315:             blk.34.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  316:             blk.34.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  317:          blk.35.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  318:           blk.35.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  319:           blk.35.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  320:             blk.35.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  321:           blk.35.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  322:             blk.35.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  323:        blk.35.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  324:             blk.35.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  325:             blk.35.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  326:          blk.36.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  327:           blk.36.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  328:           blk.36.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  329:             blk.36.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  330:           blk.36.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  331:             blk.36.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  332:        blk.36.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  333:             blk.36.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  334:             blk.36.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  335:          blk.37.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  336:           blk.37.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  337:           blk.37.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  338:             blk.37.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  339:           blk.37.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  340:             blk.37.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  341:        blk.37.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  342:             blk.37.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  343:             blk.37.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  344:          blk.38.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  345:           blk.38.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  346:           blk.38.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  347:             blk.38.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  348:           blk.38.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  349:             blk.38.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  350:        blk.38.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  351:             blk.38.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  352:             blk.38.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  353:          blk.39.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  354:           blk.39.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  355:           blk.39.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  356:             blk.39.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  357:           blk.39.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  358:             blk.39.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  359:        blk.39.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  360:             blk.39.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  361:             blk.39.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  362:               output_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - kv   0:                       general.architecture str     \n",
+      "llama_model_loader: - kv   1:                               general.name str     \n",
+      "llama_model_loader: - kv   2:                       llama.context_length u32     \n",
+      "llama_model_loader: - kv   3:                     llama.embedding_length u32     \n",
+      "llama_model_loader: - kv   4:                          llama.block_count u32     \n",
+      "llama_model_loader: - kv   5:                  llama.feed_forward_length u32     \n",
+      "llama_model_loader: - kv   6:                 llama.rope.dimension_count u32     \n",
+      "llama_model_loader: - kv   7:                 llama.attention.head_count u32     \n",
+      "llama_model_loader: - kv   8:              llama.attention.head_count_kv u32     \n",
+      "llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32     \n",
+      "llama_model_loader: - kv  10:                          general.file_type u32     \n",
+      "llama_model_loader: - kv  11:                       tokenizer.ggml.model str     \n",
+      "llama_model_loader: - kv  12:                      tokenizer.ggml.tokens arr     \n",
+      "llama_model_loader: - kv  13:                      tokenizer.ggml.scores arr     \n",
+      "llama_model_loader: - kv  14:                  tokenizer.ggml.token_type arr     \n",
+      "llama_model_loader: - kv  15:                tokenizer.ggml.bos_token_id u32     \n",
+      "llama_model_loader: - kv  16:                tokenizer.ggml.eos_token_id u32     \n",
+      "llama_model_loader: - kv  17:            tokenizer.ggml.unknown_token_id u32     \n",
+      "llama_model_loader: - kv  18:               general.quantization_version u32     \n",
+      "llama_model_loader: - type  f32:   81 tensors\n",
+      "llama_model_loader: - type q6_K:  282 tensors\n",
+      "llm_load_print_meta: format         = GGUF V2 (latest)\n",
+      "llm_load_print_meta: arch           = llama\n",
+      "llm_load_print_meta: vocab type     = SPM\n",
+      "llm_load_print_meta: n_vocab        = 32000\n",
+      "llm_load_print_meta: n_merges       = 0\n",
+      "llm_load_print_meta: n_ctx_train    = 4096\n",
+      "llm_load_print_meta: n_ctx          = 512\n",
+      "llm_load_print_meta: n_embd         = 5120\n",
+      "llm_load_print_meta: n_head         = 40\n",
+      "llm_load_print_meta: n_head_kv      = 40\n",
+      "llm_load_print_meta: n_layer        = 40\n",
+      "llm_load_print_meta: n_rot          = 128\n",
+      "llm_load_print_meta: n_gqa          = 1\n",
+      "llm_load_print_meta: f_norm_eps     = 1.0e-05\n",
+      "llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n",
+      "llm_load_print_meta: n_ff           = 13824\n",
+      "llm_load_print_meta: freq_base      = 10000.0\n",
+      "llm_load_print_meta: freq_scale     = 1\n",
+      "llm_load_print_meta: model type     = 13B\n",
+      "llm_load_print_meta: model ftype    = mostly Q6_K\n",
+      "llm_load_print_meta: model size     = 13.02 B\n",
+      "llm_load_print_meta: general.name   = LLaMA v2\n",
+      "llm_load_print_meta: BOS token = 1 '<s>'\n",
+      "llm_load_print_meta: EOS token = 2 '</s>'\n",
+      "llm_load_print_meta: UNK token = 0 '<unk>'\n",
+      "llm_load_print_meta: LF token  = 13 '<0x0A>'\n",
+      "llm_load_tensors: ggml ctx size =    0.12 MB\n",
+      "llm_load_tensors: mem required  = 10183.83 MB (+  400.00 MB per state)\n",
+      "....................................................................................................\n",
+      "llama_new_context_with_model: kv self size  =  400.00 MB\n",
+      "llama_new_context_with_model: compute buffer total size =   75.47 MB\n",
+      "ggml_metal_free: deallocating\n"
      ]
     }
    ],
    "source": [
-    "llm = Llama(model_path=llama_2_13b_chat_path)"
+    "llm = Llama(model_path=llama_2_path)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
-   "id": "b9a0d5b2",
+   "execution_count": 10,
+   "id": "bcce886c",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Llama.generate: prefix-match hit\n",
       "\n",
-      "llama_print_timings:        load time =  1496.01 ms\n",
-      "llama_print_timings:      sample time =   182.77 ms /   256 runs   (    0.71 ms per token,  1400.66 tokens per second)\n",
-      "llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)\n",
-      "llama_print_timings:        eval time = 21947.42 ms /   256 runs   (   85.73 ms per token,    11.66 tokens per second)\n",
-      "llama_print_timings:       total time = 22482.00 ms\n"
+      "llama_print_timings:        load time =  2349.61 ms\n",
+      "llama_print_timings:      sample time =   258.02 ms /   329 runs   (    0.78 ms per token,  1275.08 tokens per second)\n",
+      "llama_print_timings: prompt eval time =  2349.57 ms /    17 tokens (  138.21 ms per token,     7.24 tokens per second)\n",
+      "llama_print_timings:        eval time = 44262.75 ms /   328 runs   (  134.95 ms per token,     7.41 tokens per second)\n",
+      "llama_print_timings:       total time = 47359.38 ms\n"
      ]
     }
    ],
    "source": [
-    "output = llm(prompt_example,\n",
-    "             max_tokens=512,\n",
-    "             echo=True)"
+    "output = llm(prompt_example, max_tokens=512, echo=True)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "55489fed",
+   "id": "80968f48",
    "metadata": {},
    "source": [
     "By inspection, we can see that the metal acceleration is faster as expected."
@@ -1033,8 +1126,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
-   "id": "243ff1a4",
+   "execution_count": 11,
+   "id": "fd921ba0",
    "metadata": {},
    "outputs": [
     {
@@ -1042,17 +1135,18 @@
      "output_type": "stream",
      "text": [
       "Name all the planets in the solar system and state their distances to the sun.\n",
+      "The eight planets of our solar system are: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. The average distance from each planet to the sun is as follows:\n",
       "\n",
-      "There are eight planets in our solar system, which are: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus and Neptune. Here's a list of the planets in order from closest to farthest from the Sun:\n",
-      "\n",
-      "1. Mercury - 57,909,227 km (0.38 AU)\n",
-      "2. Venus - 108,208,930 km (0.72 AU)\n",
+      "1. Mercury - 57,909,227 km (0.387 AU)\n",
+      "2. Venus - 108,208,930 km (0.723 AU)\n",
       "3. Earth - 149,597,890 km (1 AU)\n",
-      "4. Mars - 226,650,000 km (1.38 AU)\n",
-      "5. Jupiter - 778,299,000 km (5.2 AU)\n",
-      "6. Saturn - 1,426,666,400 km (9.5 AU)\n",
-      "7. Uranus - 2,870,972,200 km (19.2 AU)\n",
-      "8. Neptune - \n"
+      "4. Mars - 225,000,000 km (1.381 AU)\n",
+      "5. Jupiter - 778,299,000 km (5.203 AU)\n",
+      "6. Saturn - 1,426,666,400 km (8.388 AU)\n",
+      "7. Uranus - 2,870,972,200 km (19.18 AU)\n",
+      "8. Neptune - 4,497,072,000 km (30.05 AU)\n",
+      "\n",
+      "Note: One astronomical unit (AU) is the average distance between the Earth and the sun, which is about 149,600,000 km or 92,955,800 miles.\n"
      ]
     }
    ],
@@ -1062,7 +1156,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "e0014652",
+   "id": "c6830776",
    "metadata": {},
    "source": [
     "## Using Llama2 in `llama-index`"
@@ -1070,405 +1164,405 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 46,
-   "id": "b45709e0",
+   "execution_count": 12,
+   "id": "bfb1fd3b",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "llama_model_loader: loaded meta data with 18 key-value pairs and 363 tensors from ../../gguf_models/llama-2-13b-chat.gguf.q4_K_S.bin (version GGUF V2 (latest))\n",
-      "llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  5120, 32000,     1,     1 ]\n",
-      "llama_model_loader: - tensor    1:               output_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor    2:                    output.weight q6_K     [  5120, 32000,     1,     1 ]\n",
-      "llama_model_loader: - tensor    3:              blk.0.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor    4:              blk.0.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor    5:              blk.0.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor    6:         blk.0.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor    7:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor    8:            blk.0.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor    9:            blk.0.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   10:              blk.0.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   11:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   12:              blk.1.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   13:              blk.1.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   14:              blk.1.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   15:         blk.1.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   16:           blk.1.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   17:            blk.1.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   18:            blk.1.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   19:              blk.1.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   20:            blk.1.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   21:              blk.2.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   22:              blk.2.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   23:              blk.2.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   24:         blk.2.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   25:           blk.2.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   26:            blk.2.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   27:            blk.2.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   28:              blk.2.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   29:            blk.2.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   30:              blk.3.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   31:              blk.3.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   32:              blk.3.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   33:         blk.3.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   34:           blk.3.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   35:            blk.3.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   36:            blk.3.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   37:              blk.3.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   38:            blk.3.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   39:              blk.4.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   40:              blk.4.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   41:              blk.4.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   42:         blk.4.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   43:           blk.4.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   44:            blk.4.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   45:            blk.4.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   46:              blk.4.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   47:            blk.4.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   48:              blk.5.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   49:              blk.5.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   50:              blk.5.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   51:         blk.5.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   52:           blk.5.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   53:            blk.5.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   54:            blk.5.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   55:              blk.5.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   56:            blk.5.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   57:              blk.6.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   58:              blk.6.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   59:              blk.6.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   60:         blk.6.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   61:           blk.6.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   62:            blk.6.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   63:            blk.6.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   64:              blk.6.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   65:            blk.6.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   66:              blk.7.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   67:              blk.7.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   68:              blk.7.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   69:         blk.7.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   70:           blk.7.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   71:            blk.7.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   72:            blk.7.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   73:              blk.7.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   74:            blk.7.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   75:              blk.8.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   76:              blk.8.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   77:              blk.8.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   78:         blk.8.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   79:           blk.8.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   80:            blk.8.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   81:            blk.8.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   82:              blk.8.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   83:            blk.8.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   84:              blk.9.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   85:              blk.9.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   86:              blk.9.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   87:         blk.9.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   88:           blk.9.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   89:            blk.9.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   90:            blk.9.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   91:              blk.9.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   92:            blk.9.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   93:             blk.10.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   94:             blk.10.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   95:             blk.10.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   96:        blk.10.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor   97:          blk.10.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor   98:           blk.10.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor   99:           blk.10.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  100:             blk.10.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  101:           blk.10.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  102:             blk.11.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  103:             blk.11.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  104:             blk.11.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  105:        blk.11.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  106:          blk.11.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  107:           blk.11.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  108:           blk.11.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  109:             blk.11.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  110:           blk.11.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  111:             blk.12.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  112:             blk.12.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  113:             blk.12.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  114:        blk.12.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  115:          blk.12.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  116:           blk.12.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  117:           blk.12.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  118:             blk.12.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  119:           blk.12.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  120:             blk.13.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  121:             blk.13.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  122:             blk.13.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  123:        blk.13.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  124:          blk.13.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  125:           blk.13.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  126:           blk.13.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  127:             blk.13.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  128:           blk.13.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  129:             blk.14.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  130:             blk.14.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  131:             blk.14.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  132:        blk.14.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  133:          blk.14.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  134:           blk.14.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  135:           blk.14.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  136:             blk.14.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  137:           blk.14.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  138:             blk.15.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  139:             blk.15.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  140:             blk.15.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  141:        blk.15.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  142:          blk.15.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  143:           blk.15.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  144:           blk.15.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  145:             blk.15.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  146:           blk.15.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  147:             blk.16.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  148:             blk.16.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  149:             blk.16.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  150:        blk.16.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  151:          blk.16.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  152:           blk.16.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  153:           blk.16.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  154:             blk.16.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  155:           blk.16.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  156:             blk.17.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  157:             blk.17.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  158:             blk.17.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  159:        blk.17.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  160:          blk.17.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  161:           blk.17.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  162:           blk.17.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  163:             blk.17.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  164:           blk.17.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  165:             blk.18.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  166:             blk.18.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  167:             blk.18.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  168:        blk.18.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  169:          blk.18.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  170:           blk.18.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  171:           blk.18.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  172:             blk.18.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  173:           blk.18.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  174:             blk.19.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  175:             blk.19.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  176:             blk.19.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  177:        blk.19.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  178:          blk.19.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  179:           blk.19.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  180:           blk.19.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  181:             blk.19.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  182:           blk.19.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  183:             blk.20.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  184:             blk.20.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  185:             blk.20.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  186:        blk.20.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  187:          blk.20.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  188:           blk.20.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  189:           blk.20.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  190:             blk.20.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  191:           blk.20.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  192:             blk.21.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  193:             blk.21.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  194:             blk.21.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  195:        blk.21.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  196:          blk.21.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  197:           blk.21.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  198:           blk.21.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  199:             blk.21.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  200:           blk.21.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  201:             blk.22.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  202:             blk.22.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  203:             blk.22.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  204:        blk.22.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  205:          blk.22.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  206:           blk.22.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  207:           blk.22.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  208:             blk.22.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  209:           blk.22.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  210:             blk.23.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  211:             blk.23.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  212:             blk.23.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  213:        blk.23.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  214:          blk.23.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  215:           blk.23.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  216:           blk.23.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  217:             blk.23.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  218:           blk.23.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  219:             blk.24.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  220:             blk.24.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  221:             blk.24.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  222:        blk.24.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  223:          blk.24.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  224:           blk.24.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  225:           blk.24.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  226:             blk.24.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  227:           blk.24.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  228:             blk.25.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  229:             blk.25.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  230:             blk.25.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  231:        blk.25.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  232:          blk.25.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  233:           blk.25.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  234:           blk.25.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  235:             blk.25.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  236:           blk.25.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  237:             blk.26.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  238:             blk.26.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  239:             blk.26.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  240:        blk.26.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  241:          blk.26.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  242:           blk.26.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  243:           blk.26.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  244:             blk.26.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  245:           blk.26.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  246:             blk.27.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  247:             blk.27.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  248:             blk.27.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  249:        blk.27.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  250:          blk.27.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  251:           blk.27.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  252:           blk.27.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  253:             blk.27.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  254:           blk.27.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  255:             blk.28.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  256:             blk.28.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  257:             blk.28.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  258:        blk.28.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  259:          blk.28.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  260:           blk.28.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  261:           blk.28.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  262:             blk.28.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  263:           blk.28.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  264:             blk.29.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  265:             blk.29.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  266:             blk.29.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  267:        blk.29.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  268:          blk.29.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  269:           blk.29.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  270:           blk.29.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  271:             blk.29.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  272:           blk.29.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  273:             blk.30.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  274:             blk.30.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  275:             blk.30.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  276:        blk.30.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  277:          blk.30.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  278:           blk.30.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  279:           blk.30.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  280:             blk.30.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  281:           blk.30.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  282:             blk.31.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  283:             blk.31.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  284:             blk.31.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  285:        blk.31.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  286:          blk.31.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  287:           blk.31.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  288:           blk.31.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  289:             blk.31.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  290:           blk.31.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  291:             blk.32.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  292:             blk.32.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  293:             blk.32.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  294:        blk.32.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  295:          blk.32.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  296:           blk.32.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  297:           blk.32.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  298:             blk.32.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  299:           blk.32.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  300:             blk.33.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  301:             blk.33.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  302:             blk.33.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  303:        blk.33.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  304:          blk.33.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  305:           blk.33.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  306:           blk.33.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  307:             blk.33.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  308:           blk.33.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  309:             blk.34.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  310:             blk.34.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  311:             blk.34.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  312:        blk.34.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  313:          blk.34.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  314:           blk.34.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  315:           blk.34.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  316:             blk.34.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  317:           blk.34.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  318:             blk.35.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  319:             blk.35.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  320:             blk.35.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  321:        blk.35.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  322:          blk.35.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  323:           blk.35.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  324:           blk.35.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  325:             blk.35.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  326:           blk.35.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  327:             blk.36.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  328:             blk.36.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  329:             blk.36.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  330:        blk.36.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  331:          blk.36.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  332:           blk.36.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  333:           blk.36.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  334:             blk.36.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  335:           blk.36.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  336:             blk.37.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  337:             blk.37.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  338:             blk.37.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  339:        blk.37.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  340:          blk.37.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  341:           blk.37.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  342:           blk.37.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  343:             blk.37.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  344:           blk.37.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  345:             blk.38.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  346:             blk.38.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  347:             blk.38.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  348:        blk.38.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  349:          blk.38.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  350:           blk.38.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  351:           blk.38.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  352:             blk.38.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  353:           blk.38.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  354:             blk.39.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  355:             blk.39.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  356:             blk.39.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  357:        blk.39.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  358:          blk.39.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
-      "llama_model_loader: - tensor  359:           blk.39.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  360:           blk.39.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]\n",
-      "llama_model_loader: - tensor  361:             blk.39.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]\n",
-      "llama_model_loader: - tensor  362:           blk.39.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from ../../gguf_models/llama-2-13b-chat.Q6_K.gguf (version GGUF V2 (latest))\n",
+      "llama_model_loader: - tensor    0:                token_embd.weight q6_K     [  5120, 32000,     1,     1 ]\n",
+      "llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor    6:              blk.0.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor    7:         blk.0.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor    8:              blk.0.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor    9:              blk.0.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   10:           blk.1.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   11:            blk.1.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   12:            blk.1.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   13:              blk.1.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   14:            blk.1.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   15:              blk.1.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   16:         blk.1.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   17:              blk.1.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   18:              blk.1.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   19:          blk.10.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   20:           blk.10.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   21:           blk.10.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   22:             blk.10.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   23:           blk.10.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   24:             blk.10.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   25:        blk.10.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   26:             blk.10.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   27:             blk.10.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   28:          blk.11.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   29:           blk.11.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   30:           blk.11.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   31:             blk.11.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   32:           blk.11.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   33:             blk.11.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   34:        blk.11.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   35:             blk.11.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   36:             blk.11.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   37:          blk.12.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   38:           blk.12.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   39:           blk.12.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   40:             blk.12.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   41:           blk.12.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   42:             blk.12.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   43:        blk.12.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   44:             blk.12.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   45:             blk.12.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   46:          blk.13.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   47:           blk.13.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   48:           blk.13.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   49:             blk.13.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   50:           blk.13.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   51:             blk.13.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   52:        blk.13.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   53:             blk.13.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   54:             blk.13.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   55:          blk.14.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   56:           blk.14.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   57:           blk.14.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   58:             blk.14.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   59:           blk.14.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   60:             blk.14.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   61:        blk.14.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   62:             blk.14.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   63:             blk.14.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   64:             blk.15.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   65:             blk.15.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   66:           blk.2.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   67:            blk.2.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   68:            blk.2.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   69:              blk.2.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   70:            blk.2.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   71:              blk.2.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   72:         blk.2.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   73:              blk.2.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   74:              blk.2.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   75:           blk.3.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   76:            blk.3.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   77:            blk.3.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   78:              blk.3.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   79:            blk.3.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   80:              blk.3.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   81:         blk.3.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   82:              blk.3.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   83:              blk.3.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   84:           blk.4.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   85:            blk.4.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   86:            blk.4.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   87:              blk.4.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   88:            blk.4.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   89:              blk.4.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   90:         blk.4.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   91:              blk.4.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   92:              blk.4.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   93:           blk.5.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   94:            blk.5.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   95:            blk.5.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   96:              blk.5.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor   97:            blk.5.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   98:              blk.5.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor   99:         blk.5.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  100:              blk.5.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  101:              blk.5.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  102:           blk.6.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  103:            blk.6.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  104:            blk.6.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  105:              blk.6.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  106:            blk.6.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  107:              blk.6.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  108:         blk.6.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  109:              blk.6.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  110:              blk.6.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  111:           blk.7.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  112:            blk.7.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  113:            blk.7.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  114:              blk.7.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  115:            blk.7.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  116:              blk.7.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  117:         blk.7.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  118:              blk.7.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  119:              blk.7.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  120:           blk.8.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  121:            blk.8.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  122:            blk.8.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  123:              blk.8.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  124:            blk.8.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  125:              blk.8.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  126:         blk.8.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  127:              blk.8.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  128:              blk.8.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  129:           blk.9.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  130:            blk.9.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  131:            blk.9.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  132:              blk.9.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  133:            blk.9.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  134:              blk.9.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  135:         blk.9.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  136:              blk.9.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  137:              blk.9.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  138:          blk.15.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  139:           blk.15.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  140:           blk.15.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  141:             blk.15.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  142:           blk.15.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  143:        blk.15.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  144:             blk.15.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  145:          blk.16.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  146:           blk.16.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  147:           blk.16.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  148:             blk.16.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  149:           blk.16.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  150:             blk.16.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  151:        blk.16.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  152:             blk.16.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  153:             blk.16.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  154:          blk.17.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  155:           blk.17.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  156:           blk.17.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  157:             blk.17.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  158:           blk.17.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  159:             blk.17.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  160:        blk.17.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  161:             blk.17.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  162:             blk.17.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  163:          blk.18.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  164:           blk.18.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  165:           blk.18.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  166:             blk.18.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  167:           blk.18.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  168:             blk.18.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  169:        blk.18.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  170:             blk.18.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  171:             blk.18.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  172:          blk.19.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  173:           blk.19.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  174:           blk.19.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  175:             blk.19.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  176:           blk.19.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  177:             blk.19.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  178:        blk.19.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  179:             blk.19.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  180:             blk.19.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  181:          blk.20.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  182:           blk.20.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  183:           blk.20.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  184:             blk.20.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  185:           blk.20.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  186:             blk.20.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  187:        blk.20.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  188:             blk.20.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  189:             blk.20.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  190:          blk.21.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  191:           blk.21.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  192:           blk.21.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  193:             blk.21.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  194:           blk.21.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  195:             blk.21.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  196:        blk.21.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  197:             blk.21.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  198:             blk.21.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  199:          blk.22.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  200:           blk.22.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  201:           blk.22.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  202:             blk.22.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  203:           blk.22.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  204:             blk.22.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  205:        blk.22.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  206:             blk.22.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  207:             blk.22.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  208:          blk.23.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  209:           blk.23.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  210:           blk.23.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  211:             blk.23.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  212:           blk.23.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  213:             blk.23.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  214:        blk.23.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  215:             blk.23.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  216:             blk.23.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  217:          blk.24.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  218:           blk.24.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  219:           blk.24.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  220:             blk.24.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  221:           blk.24.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  222:             blk.24.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  223:        blk.24.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  224:             blk.24.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  225:             blk.24.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  226:          blk.25.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  227:           blk.25.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  228:           blk.25.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  229:             blk.25.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  230:           blk.25.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  231:             blk.25.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  232:        blk.25.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  233:             blk.25.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  234:             blk.25.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  235:          blk.26.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  236:           blk.26.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  237:           blk.26.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  238:             blk.26.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  239:           blk.26.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  240:             blk.26.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  241:        blk.26.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  242:             blk.26.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  243:             blk.26.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  244:          blk.27.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  245:           blk.27.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  246:           blk.27.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  247:             blk.27.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  248:           blk.27.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  249:             blk.27.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  250:        blk.27.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  251:             blk.27.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  252:             blk.27.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  253:          blk.28.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  254:           blk.28.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  255:           blk.28.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  256:             blk.28.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  257:           blk.28.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  258:             blk.28.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  259:        blk.28.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  260:             blk.28.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  261:             blk.28.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  262:          blk.29.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  263:           blk.29.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  264:           blk.29.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  265:             blk.29.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  266:           blk.29.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  267:             blk.29.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  268:        blk.29.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  269:             blk.29.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  270:             blk.29.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  271:           blk.30.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  272:             blk.30.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  273:             blk.30.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  274:        blk.30.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  275:             blk.30.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  276:             blk.30.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  277:                    output.weight q6_K     [  5120, 32000,     1,     1 ]\n",
+      "llama_model_loader: - tensor  278:          blk.30.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  279:           blk.30.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  280:           blk.30.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  281:          blk.31.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  282:           blk.31.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  283:           blk.31.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  284:             blk.31.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  285:           blk.31.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  286:             blk.31.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  287:        blk.31.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  288:             blk.31.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  289:             blk.31.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  290:          blk.32.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  291:           blk.32.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  292:           blk.32.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  293:             blk.32.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  294:           blk.32.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  295:             blk.32.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  296:        blk.32.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  297:             blk.32.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  298:             blk.32.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  299:          blk.33.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  300:           blk.33.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  301:           blk.33.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  302:             blk.33.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  303:           blk.33.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  304:             blk.33.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  305:        blk.33.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  306:             blk.33.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  307:             blk.33.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  308:          blk.34.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  309:           blk.34.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  310:           blk.34.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  311:             blk.34.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  312:           blk.34.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  313:             blk.34.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  314:        blk.34.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  315:             blk.34.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  316:             blk.34.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  317:          blk.35.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  318:           blk.35.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  319:           blk.35.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  320:             blk.35.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  321:           blk.35.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  322:             blk.35.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  323:        blk.35.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  324:             blk.35.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  325:             blk.35.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  326:          blk.36.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  327:           blk.36.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  328:           blk.36.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  329:             blk.36.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  330:           blk.36.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  331:             blk.36.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  332:        blk.36.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  333:             blk.36.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  334:             blk.36.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  335:          blk.37.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  336:           blk.37.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  337:           blk.37.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  338:             blk.37.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  339:           blk.37.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  340:             blk.37.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  341:        blk.37.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  342:             blk.37.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  343:             blk.37.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  344:          blk.38.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  345:           blk.38.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  346:           blk.38.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  347:             blk.38.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  348:           blk.38.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  349:             blk.38.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  350:        blk.38.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  351:             blk.38.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  352:             blk.38.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  353:          blk.39.attn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  354:           blk.39.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  355:           blk.39.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  356:             blk.39.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]\n",
+      "llama_model_loader: - tensor  357:           blk.39.ffn_norm.weight f32      [  5120,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  358:             blk.39.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  359:        blk.39.attn_output.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  360:             blk.39.attn_q.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  361:             blk.39.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]\n",
+      "llama_model_loader: - tensor  362:               output_norm.weight f32      [  5120,     1,     1,     1 ]\n",
       "llama_model_loader: - kv   0:                       general.architecture str     \n",
       "llama_model_loader: - kv   1:                               general.name str     \n",
-      "llama_model_loader: - kv   2:                        general.description str     \n",
-      "llama_model_loader: - kv   3:                       llama.context_length u32     \n",
-      "llama_model_loader: - kv   4:                     llama.embedding_length u32     \n",
-      "llama_model_loader: - kv   5:                          llama.block_count u32     \n",
-      "llama_model_loader: - kv   6:                  llama.feed_forward_length u32     \n",
-      "llama_model_loader: - kv   7:                 llama.rope.dimension_count u32     \n",
-      "llama_model_loader: - kv   8:                 llama.attention.head_count u32     \n",
-      "llama_model_loader: - kv   9:              llama.attention.head_count_kv u32     \n",
-      "llama_model_loader: - kv  10:     llama.attention.layer_norm_rms_epsilon f32     \n",
+      "llama_model_loader: - kv   2:                       llama.context_length u32     \n",
+      "llama_model_loader: - kv   3:                     llama.embedding_length u32     \n",
+      "llama_model_loader: - kv   4:                          llama.block_count u32     \n",
+      "llama_model_loader: - kv   5:                  llama.feed_forward_length u32     \n",
+      "llama_model_loader: - kv   6:                 llama.rope.dimension_count u32     \n",
+      "llama_model_loader: - kv   7:                 llama.attention.head_count u32     \n",
+      "llama_model_loader: - kv   8:              llama.attention.head_count_kv u32     \n",
+      "llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32     \n",
+      "llama_model_loader: - kv  10:                          general.file_type u32     \n",
       "llama_model_loader: - kv  11:                       tokenizer.ggml.model str     \n",
       "llama_model_loader: - kv  12:                      tokenizer.ggml.tokens arr     \n",
       "llama_model_loader: - kv  13:                      tokenizer.ggml.scores arr     \n",
       "llama_model_loader: - kv  14:                  tokenizer.ggml.token_type arr     \n",
-      "llama_model_loader: - kv  15:            tokenizer.ggml.unknown_token_id u32     \n",
-      "llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32     \n",
-      "llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32     \n",
+      "llama_model_loader: - kv  15:                tokenizer.ggml.bos_token_id u32     \n",
+      "llama_model_loader: - kv  16:                tokenizer.ggml.eos_token_id u32     \n",
+      "llama_model_loader: - kv  17:            tokenizer.ggml.unknown_token_id u32     \n",
+      "llama_model_loader: - kv  18:               general.quantization_version u32     \n",
       "llama_model_loader: - type  f32:   81 tensors\n",
-      "llama_model_loader: - type q4_K:  281 tensors\n",
-      "llama_model_loader: - type q6_K:    1 tensors\n",
+      "llama_model_loader: - type q6_K:  282 tensors\n",
       "llm_load_print_meta: format         = GGUF V2 (latest)\n",
       "llm_load_print_meta: arch           = llama\n",
       "llm_load_print_meta: vocab type     = SPM\n",
       "llm_load_print_meta: n_vocab        = 32000\n",
       "llm_load_print_meta: n_merges       = 0\n",
-      "llm_load_print_meta: n_ctx_train    = 2048\n",
+      "llm_load_print_meta: n_ctx_train    = 4096\n",
       "llm_load_print_meta: n_ctx          = 3900\n",
       "llm_load_print_meta: n_embd         = 5120\n",
       "llm_load_print_meta: n_head         = 40\n",
@@ -1477,95 +1571,86 @@
       "llm_load_print_meta: n_rot          = 128\n",
       "llm_load_print_meta: n_gqa          = 1\n",
       "llm_load_print_meta: f_norm_eps     = 1.0e-05\n",
-      "llm_load_print_meta: f_norm_rms_eps = 5.0e-06\n",
+      "llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n",
       "llm_load_print_meta: n_ff           = 13824\n",
       "llm_load_print_meta: freq_base      = 10000.0\n",
       "llm_load_print_meta: freq_scale     = 1\n",
       "llm_load_print_meta: model type     = 13B\n",
-      "llm_load_print_meta: model ftype    = mostly Q4_K - Medium (guessed)\n",
+      "llm_load_print_meta: model ftype    = mostly Q6_K\n",
       "llm_load_print_meta: model size     = 13.02 B\n",
-      "llm_load_print_meta: general.name   = llama-2-13b-chat.ggmlv3.q4_K_S.bin\n",
+      "llm_load_print_meta: general.name   = LLaMA v2\n",
       "llm_load_print_meta: BOS token = 1 '<s>'\n",
       "llm_load_print_meta: EOS token = 2 '</s>'\n",
       "llm_load_print_meta: UNK token = 0 '<unk>'\n",
       "llm_load_print_meta: LF token  = 13 '<0x0A>'\n",
       "llm_load_tensors: ggml ctx size =    0.12 MB\n",
-      "llm_load_tensors: mem required  = 7024.01 MB (+ 3046.88 MB per state)\n",
-      "...................................................................................................\n",
+      "llm_load_tensors: mem required  = 10183.83 MB (+ 3046.88 MB per state)\n",
+      "....................................................................................................\n",
       "llama_new_context_with_model: kv self size  = 3046.88 MB\n",
       "ggml_metal_init: allocating\n",
       "ggml_metal_init: loading '/Users/rchan/opt/miniconda3/envs/reginald/lib/python3.11/site-packages/llama_cpp/ggml-metal.metal'\n",
-      "ggml_metal_init: loaded kernel_add                            0x17796b1b0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_add_row                        0x17796ab00 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul                            0x17796b860 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_row                        0x177962df0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_scale                          0x177964200 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_silu                           0x177963b50 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_relu                           0x177952de0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_gelu                           0x17796c190 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_soft_max                       0x17796c3f0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_diag_mask_inf                  0x17796c650 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_get_rows_f16                   0x17796c8b0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_0                  0x17796cb10 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_1                  0x17796cd70 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_get_rows_q8_0                  0x17796cfd0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_get_rows_q2_K                  0x17796d230 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_get_rows_q3_K                  0x17796d490 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_K                  0x17796d6f0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_get_rows_q5_K                  0x17796d950 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_get_rows_q6_K                  0x17796dbb0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_rms_norm                       0x17796de10 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_norm                           0x17796e070 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mat_f16_f32                0x17796e2d0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_0_f32               0x17796e530 | th_max =  896 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_1_f32               0x17796e790 | th_max =  896 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q8_0_f32               0x17796e9f0 | th_max =  768 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q2_K_f32               0x17796ec50 | th_max =  640 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q3_K_f32               0x17796eeb0 | th_max =  704 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_K_f32               0x17796f110 | th_max =  576 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q5_K_f32               0x17796f370 | th_max =  576 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q6_K_f32               0x17796f5d0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mm_f16_f32                 0x17796f830 | th_max =  768 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mm_q4_0_f32                0x17796fa90 | th_max =  768 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mm_q8_0_f32                0x17796fcf0 | th_max =  768 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mm_q4_1_f32                0x17796ff50 | th_max =  768 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mm_q2_K_f32                0x1779701b0 | th_max =  768 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mm_q3_K_f32                0x177970410 | th_max =  768 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mm_q4_K_f32                0x177970670 | th_max =  768 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mm_q5_K_f32                0x1779708d0 | th_max =  704 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_mul_mm_q6_K_f32                0x177970b30 | th_max =  704 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_rope                           0x177970d90 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_alibi_f32                      0x177970ff0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_cpy_f32_f16                    0x177971250 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_cpy_f32_f32                    0x1779714b0 | th_max = 1024 | th_width =   32\n",
-      "ggml_metal_init: loaded kernel_cpy_f16_f16                    0x177971710 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_add                            0x1010135f0 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_add_row                        0x101013a80 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul                            0x101013e00 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_row                        0x101014290 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_scale                          0x101014610 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_silu                           0x101014990 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_relu                           0x101014d10 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_gelu                           0x101015090 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_soft_max                       0x1010155a0 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_diag_mask_inf                  0x101015a60 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_get_rows_f16                   0x101015f70 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_get_rows_q4_0                  0x101016480 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_get_rows_q4_1                  0x101016990 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_get_rows_q8_0                  0x101016ea0 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_get_rows_q2_K                  0x1010173b0 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_get_rows_q3_K                  0x1010178c0 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_get_rows_q4_K                  0x101017dd0 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_get_rows_q5_K                  0x1010182e0 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_get_rows_q6_K                  0x1010187f0 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_rms_norm                       0x101018d10 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_norm                           0x101019220 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mat_f16_f32                0x101019930 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q4_0_f32               0x101019ef0 | th_max =  896 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q4_1_f32               0x10101a4b0 | th_max =  896 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q8_0_f32               0x10101aa70 | th_max =  768 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q2_K_f32               0x10101b030 | th_max =  640 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q3_K_f32               0x10101b5f0 | th_max =  704 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q4_K_f32               0x10101bbb0 | th_max =  576 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q5_K_f32               0x10101c170 | th_max =  576 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q6_K_f32               0x10101c730 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mm_f16_f32                 0x10101cd30 | th_max =  768 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mm_q4_0_f32                0x10101d330 | th_max =  768 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mm_q8_0_f32                0x10101d930 | th_max =  768 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mm_q4_1_f32                0x10101df30 | th_max =  768 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mm_q2_K_f32                0x10101e530 | th_max =  768 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mm_q3_K_f32                0x10101eb30 | th_max =  768 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mm_q4_K_f32                0x10101f390 | th_max =  768 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mm_q5_K_f32                0x10101f990 | th_max =  704 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_mul_mm_q6_K_f32                0x10101ff90 | th_max =  704 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_rope                           0x101020310 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_alibi_f32                      0x101020a30 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_cpy_f32_f16                    0x101021120 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_cpy_f32_f32                    0x101021810 | th_max = 1024 | th_width =   32\n",
+      "ggml_metal_init: loaded kernel_cpy_f16_f16                    0x101021f00 | th_max = 1024 | th_width =   32\n",
       "ggml_metal_init: recommendedMaxWorkingSetSize  = 21845.34 MB\n",
       "ggml_metal_init: hasUnifiedMemory              = true\n",
-      "ggml_metal_init: maxTransferRate               = built-in GPU\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
+      "ggml_metal_init: maxTransferRate               = built-in GPU\n",
       "llama_new_context_with_model: compute buffer total size =  356.16 MB\n",
       "llama_new_context_with_model: max tensor size =   128.17 MB\n",
-      "ggml_metal_add_buffer: allocated 'data            ' buffer, size =  7024.61 MB, (22067.39 / 21845.34), warning: current allocated size is greater than the recommended max working set size\n",
-      "ggml_metal_add_buffer: allocated 'eval            ' buffer, size =     1.48 MB, (22068.88 / 21845.34), warning: current allocated size is greater than the recommended max working set size\n",
-      "ggml_metal_add_buffer: allocated 'kv              ' buffer, size =  3048.88 MB, (25117.75 / 21845.34), warning: current allocated size is greater than the recommended max working set size\n",
+      "ggml_metal_add_buffer: allocated 'data            ' buffer, size = 10184.42 MB, (10184.92 / 21845.34)\n",
+      "ggml_metal_add_buffer: allocated 'eval            ' buffer, size =     1.48 MB, (10186.41 / 21845.34)\n",
+      "ggml_metal_add_buffer: allocated 'kv              ' buffer, size =  3048.88 MB, (13235.28 / 21845.34)\n",
       "AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | SSSE3 = 0 | VSX = 0 | \n",
-      "ggml_metal_add_buffer: allocated 'alloc           ' buffer, size =   354.70 MB, (25472.45 / 21845.34), warning: current allocated size is greater than the recommended max working set size\n",
-      "ggml_metal_free: deallocating\n"
+      "ggml_metal_add_buffer: allocated 'alloc           ' buffer, size =   354.70 MB, (13589.98 / 21845.34)\n"
      ]
     }
    ],
    "source": [
     "llm = LlamaCPP(\n",
-    "    model_path=\"../../gguf_models/llama-2-13b-chat.gguf.q4_K_S.bin\",\n",
+    "    model_path=llama_2_path,\n",
     "    temperature=0.1,\n",
-    "    max_new_tokens=256,\n",
-    "    # llama2 has a context window of 4096 tokens,\n",
-    "    # but we set it lower to allow for some wiggle room\n",
+    "    max_new_tokens=1024,\n",
     "    context_window=3900,\n",
     "    # kwargs to pass to __call__()\n",
     "    generate_kwargs={},\n",
@@ -1581,22 +1666,44 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 56,
-   "id": "d694eda6",
+   "execution_count": 13,
+   "id": "8760d520",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "LLMMetadata(context_window=3900, num_output=1024, is_chat_model=False, is_function_calling_model=False, model_name='../../gguf_models/llama-2-13b-chat.Q6_K.gguf')"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "llm.metadata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "843abeb0",
    "metadata": {},
    "outputs": [],
    "source": [
     "handbook = pd.read_csv(\"../../data/public/handbook-scraped.csv\")\n",
-    "turing = pd.read_csv(\"../../data/public/turingacuk-no-boilerplate.csv\")\n",
+    "wiki = pd.read_csv(\"../../data/turing_internal/wiki-scraped.csv\")\n",
+    "# turing = pd.read_csv(\"../../data/public/turingacuk-no-boilerplate.csv\")\n",
     "\n",
-    "text_list = list(handbook[\"body\"].astype(\"str\")) + list(turing[\"body\"].astype(\"str\"))\n",
+    "text_list = list(handbook[\"body\"].astype(\"str\")) + list(wiki[\"body\"].astype(\"str\"))\n",
     "documents = [Document(text=t) for t in text_list]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 60,
-   "id": "99089231",
+   "execution_count": 15,
+   "id": "518887f1",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1606,21 +1713,46 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 61,
-   "id": "42d1da70",
+   "execution_count": 16,
+   "id": "68b0c63e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "HuggingFaceEmbeddings(client=SentenceTransformer(\n",
+       "  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel \n",
+       "  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})\n",
+       "  (2): Normalize()\n",
+       "), model_name='sentence-transformers/all-mpnet-base-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False)"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "hfemb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "88198eb3",
    "metadata": {},
    "outputs": [],
    "source": [
     "# set number of output tokens\n",
-    "num_output = 256\n",
+    "num_output = 1024\n",
     "# set maximum input size\n",
-    "max_input_size = 2048\n",
+    "context_window = 4096\n",
     "# set maximum chunk overlap\n",
-    "chunk_size_limit = 1024\n",
+    "chunk_size_limit = 512\n",
     "chunk_overlap_ratio = 0.1\n",
     "\n",
     "prompt_helper = PromptHelper(\n",
-    "    context_window=max_input_size,\n",
+    "    context_window=context_window,\n",
     "    num_output=num_output,\n",
     "    chunk_size_limit=chunk_size_limit,\n",
     "    chunk_overlap_ratio=chunk_overlap_ratio,\n",
@@ -1629,67 +1761,89 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 62,
-   "id": "da87ad9c",
+   "execution_count": 18,
+   "id": "7a2ee69a",
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "[nltk_data] Downloading package punkt to\n",
-      "[nltk_data]     /Users/rchan/Library/Caches/llama_index...\n",
-      "[nltk_data]   Unzipping tokenizers/punkt.zip.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     " service_context = ServiceContext.from_defaults(\n",
-    "    llm_predictor=llm,\n",
+    "    llm_predictor=LLMPredictor(llm=llm),\n",
     "    embed_model=embed_model,\n",
     "    prompt_helper=prompt_helper,\n",
-    "    chunk_size_limit=chunk_size_limit,\n",
+    "    chunk_size=chunk_size_limit,\n",
     ")\n",
     "\n",
-    "index = GPTVectorStoreIndex.from_documents(\n",
+    "index = VectorStoreIndex.from_documents(\n",
     "    documents, service_context=service_context\n",
     ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 63,
-   "id": "fac67c98",
+   "execution_count": 19,
+   "id": "a614a37a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response_mode = \"simple_summarize\"\n",
+    "similarity_top_k = 3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "6e6a29f3",
    "metadata": {},
    "outputs": [],
    "source": [
-    "query_engine = index.as_query_engine()"
+    "query_engine = index.as_query_engine(response_mode=response_mode,\n",
+    "                                     similarity_top_k=similarity_top_k)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 65,
-   "id": "4e603ec1",
+   "execution_count": 21,
+   "id": "eecc1b2f",
    "metadata": {},
    "outputs": [
     {
-     "ename": "AttributeError",
-     "evalue": "'LlamaCPP' object has no attribute 'predict'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[65], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mquery_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mquery\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mwhat should a new starter in REG do?\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m      2\u001b[0m \u001b[38;5;28mprint\u001b[39m(response\u001b[38;5;241m.\u001b[39mresponse)\n",
-      "File \u001b[0;32m~/opt/miniconda3/envs/reginald/lib/python3.11/site-packages/llama_index/indices/query/base.py:23\u001b[0m, in \u001b[0;36mBaseQueryEngine.query\u001b[0;34m(self, str_or_query_bundle)\u001b[0m\n\u001b[1;32m     21\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(str_or_query_bundle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m     22\u001b[0m     str_or_query_bundle \u001b[38;5;241m=\u001b[39m QueryBundle(str_or_query_bundle)\n\u001b[0;32m---> 23\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_query\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstr_or_query_bundle\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     24\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m response\n",
-      "File \u001b[0;32m~/opt/miniconda3/envs/reginald/lib/python3.11/site-packages/llama_index/query_engine/retriever_query_engine.py:171\u001b[0m, in \u001b[0;36mRetrieverQueryEngine._query\u001b[0;34m(self, query_bundle)\u001b[0m\n\u001b[1;32m    165\u001b[0m         nodes \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mretrieve(query_bundle)\n\u001b[1;32m    167\u001b[0m         retrieve_event\u001b[38;5;241m.\u001b[39mon_end(\n\u001b[1;32m    168\u001b[0m             payload\u001b[38;5;241m=\u001b[39m{EventPayload\u001b[38;5;241m.\u001b[39mNODES: nodes},\n\u001b[1;32m    169\u001b[0m         )\n\u001b[0;32m--> 171\u001b[0m     response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_response_synthesizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msynthesize\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    172\u001b[0m \u001b[43m        \u001b[49m\u001b[43mquery\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mquery_bundle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    173\u001b[0m \u001b[43m        \u001b[49m\u001b[43mnodes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnodes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    174\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    176\u001b[0m     query_event\u001b[38;5;241m.\u001b[39mon_end(payload\u001b[38;5;241m=\u001b[39m{EventPayload\u001b[38;5;241m.\u001b[39mRESPONSE: response})\n\u001b[1;32m    178\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m response\n",
-      "File \u001b[0;32m~/opt/miniconda3/envs/reginald/lib/python3.11/site-packages/llama_index/response_synthesizers/base.py:125\u001b[0m, in \u001b[0;36mBaseSynthesizer.synthesize\u001b[0;34m(self, query, nodes, additional_source_nodes)\u001b[0m\n\u001b[1;32m    120\u001b[0m     query \u001b[38;5;241m=\u001b[39m QueryBundle(query_str\u001b[38;5;241m=\u001b[39mquery)\n\u001b[1;32m    122\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_callback_manager\u001b[38;5;241m.\u001b[39mevent(\n\u001b[1;32m    123\u001b[0m     CBEventType\u001b[38;5;241m.\u001b[39mSYNTHESIZE, payload\u001b[38;5;241m=\u001b[39m{EventPayload\u001b[38;5;241m.\u001b[39mQUERY_STR: query\u001b[38;5;241m.\u001b[39mquery_str}\n\u001b[1;32m    124\u001b[0m ) \u001b[38;5;28;01mas\u001b[39;00m event:\n\u001b[0;32m--> 125\u001b[0m     response_str \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_response\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    126\u001b[0m \u001b[43m        \u001b[49m\u001b[43mquery_str\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mquery\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mquery_str\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    127\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtext_chunks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\n\u001b[1;32m    128\u001b[0m \u001b[43m            \u001b[49m\u001b[43mn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnode\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_content\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmetadata_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mMetadataMode\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mLLM\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mn\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m 
\u001b[49m\u001b[43mnodes\u001b[49m\n\u001b[1;32m    129\u001b[0m \u001b[43m        \u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    130\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    132\u001b[0m     additional_source_nodes \u001b[38;5;241m=\u001b[39m additional_source_nodes \u001b[38;5;129;01mor\u001b[39;00m []\n\u001b[1;32m    133\u001b[0m     source_nodes \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(nodes) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlist\u001b[39m(additional_source_nodes)\n",
-      "File \u001b[0;32m~/opt/miniconda3/envs/reginald/lib/python3.11/site-packages/llama_index/response_synthesizers/compact_and_refine.py:34\u001b[0m, in \u001b[0;36mCompactAndRefine.get_response\u001b[0;34m(self, query_str, text_chunks, **response_kwargs)\u001b[0m\n\u001b[1;32m     30\u001b[0m \u001b[38;5;66;03m# use prompt helper to fix compact text_chunks under the prompt limitation\u001b[39;00m\n\u001b[1;32m     31\u001b[0m \u001b[38;5;66;03m# TODO: This is a temporary fix - reason it's temporary is that\u001b[39;00m\n\u001b[1;32m     32\u001b[0m \u001b[38;5;66;03m# the refine template does not account for size of previous answer.\u001b[39;00m\n\u001b[1;32m     33\u001b[0m new_texts \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_compact_text_chunks(query_str, text_chunks)\n\u001b[0;32m---> 34\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_response\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m     35\u001b[0m \u001b[43m    \u001b[49m\u001b[43mquery_str\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mquery_str\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtext_chunks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnew_texts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mresponse_kwargs\u001b[49m\n\u001b[1;32m     36\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     37\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m response\n",
-      "File \u001b[0;32m~/opt/miniconda3/envs/reginald/lib/python3.11/site-packages/llama_index/response_synthesizers/refine.py:120\u001b[0m, in \u001b[0;36mRefine.get_response\u001b[0;34m(self, query_str, text_chunks, **response_kwargs)\u001b[0m\n\u001b[1;32m    116\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m text_chunk \u001b[38;5;129;01min\u001b[39;00m text_chunks:\n\u001b[1;32m    117\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m prev_response_obj \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    118\u001b[0m         \u001b[38;5;66;03m# if this is the first chunk, and text chunk already\u001b[39;00m\n\u001b[1;32m    119\u001b[0m         \u001b[38;5;66;03m# is an answer, then return it\u001b[39;00m\n\u001b[0;32m--> 120\u001b[0m         response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_give_response_single\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    121\u001b[0m \u001b[43m            \u001b[49m\u001b[43mquery_str\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    122\u001b[0m \u001b[43m            \u001b[49m\u001b[43mtext_chunk\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    123\u001b[0m \u001b[43m        \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    124\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    125\u001b[0m         response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_refine_response_single(\n\u001b[1;32m    126\u001b[0m             prev_response_obj, query_str, text_chunk\n\u001b[1;32m    127\u001b[0m         )\n",
-      "File \u001b[0;32m~/opt/miniconda3/envs/reginald/lib/python3.11/site-packages/llama_index/response_synthesizers/refine.py:177\u001b[0m, in \u001b[0;36mRefine._give_response_single\u001b[0;34m(self, query_str, text_chunk, **response_kwargs)\u001b[0m\n\u001b[1;32m    174\u001b[0m query_satisfied \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m    175\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_streaming:\n\u001b[1;32m    176\u001b[0m     structured_response \u001b[38;5;241m=\u001b[39m cast(\n\u001b[0;32m--> 177\u001b[0m         StructuredRefineResponse, \u001b[43mprogram\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcontext_str\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcur_text_chunk\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    178\u001b[0m     )\n\u001b[1;32m    179\u001b[0m     query_satisfied \u001b[38;5;241m=\u001b[39m structured_response\u001b[38;5;241m.\u001b[39mquery_satisfied\n\u001b[1;32m    180\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m query_satisfied:\n",
-      "File \u001b[0;32m~/opt/miniconda3/envs/reginald/lib/python3.11/site-packages/llama_index/response_synthesizers/refine.py:60\u001b[0m, in \u001b[0;36mDefaultRefineProgram.__call__\u001b[0;34m(self, *args, **kwds)\u001b[0m\n\u001b[1;32m     59\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs: Any, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds: Any) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m StructuredRefineResponse:\n\u001b[0;32m---> 60\u001b[0m     answer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_llm_predictor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m(\n\u001b[1;32m     61\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_prompt,\n\u001b[1;32m     62\u001b[0m         \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds,\n\u001b[1;32m     63\u001b[0m     )\n\u001b[1;32m     64\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m StructuredRefineResponse(answer\u001b[38;5;241m=\u001b[39manswer, query_satisfied\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
-      "\u001b[0;31mAttributeError\u001b[0m: 'LlamaCPP' object has no attribute 'predict'"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  As a new starter in REG, you should:\n",
+      "\n",
+      "1. Attend buddy meetings with your assigned buddies to get familiarized with the team and ask any questions you may have.\n",
+      "2. Attend HR induction and IT induction meetings to discuss general information such as pay, health, leaves, benefits, and accounts.\n",
+      "3. Meet with your line manager to discuss your role and responsibilities.\n",
+      "4. Shadow meetings across the group to get a feel for how REG works and meet people.\n",
+      "5. Complete all necessary forms on Cezanne, including personal details, bank details, NI Health form, Additional characteristics form, and signed and dated scanned contract.\n",
+      "6. Upload your photo to the Documents section on Cezanne.\n",
+      "7. Request a British Library pass to access the office.\n",
+      "8. Complete Agenda screening (if you haven't already done so).\n",
+      "9. Read about health and dental insurance and decide whether to sign up.\n",
+      "10. Check the Turing Benefits site for useful discounts.\n",
+      "11. Provide a description for your public profile on the Turing website.\n",
+      "12. Verify your MoorePay account for payslips.\n",
+      "\n",
+      "It is also recommended that you:\n",
+      "\n",
+      "* Join in for welcome coffee(s) to introduce yourself to the whole REG team.\n",
+      "* Attend 1-on-1 meetings with REG's Director within the first few weeks.\n",
+      "* Use the time before being assigned to a project to set up your laptop and tools, get to know people, read the handbook and internal wiki, and shadow meetings.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =   254.43 ms /   363 runs   (    0.70 ms per token,  1426.70 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 29778.92 ms /  1296 tokens (   22.98 ms per token,    43.52 tokens per second)\n",
+      "llama_print_timings:        eval time = 41385.82 ms /   362 runs   (  114.33 ms per token,     8.75 tokens per second)\n",
+      "llama_print_timings:       total time = 71899.77 ms\n"
      ]
     }
    ],
@@ -1699,100 +1853,1215 @@
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "9e301002",
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "5a338a7b",
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "3"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "## Chat engine"
+    "len(response.source_nodes)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "f4a6b68d",
+   "execution_count": 23,
+   "id": "b4b62bea",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  Based on the provided context information, the starting salary for a standard RSE in REG would be £40,000. This is the bottom third baseline for the Standard role in the 2023-24 Institute-wide HERA Bands.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =    42.72 ms /    61 runs   (    0.70 ms per token,  1427.84 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 49457.50 ms /  1809 tokens (   27.34 ms per token,    36.58 tokens per second)\n",
+      "llama_print_timings:        eval time =  7267.23 ms /    60 runs   (  121.12 ms per token,     8.26 tokens per second)\n",
+      "llama_print_timings:       total time = 56845.16 ms\n"
+     ]
+    }
+   ],
    "source": [
-    "chat_engine = index.as_chat_engine(chat_mode=\"react\", verbose=True)"
+    "response = query_engine.query(\"What is the starting salary for a standard RSE in REG?\")\n",
+    "print(response.response)"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5f8ce782",
+   "cell_type": "markdown",
+   "id": "d4212724",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "response = chat_engine.chat(\n",
-    "    \"what should a new starter in REG do?\"\n",
-    ")"
+    "## Context chat engine"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "351013fc",
+   "execution_count": 24,
+   "id": "dedcf17d",
    "metadata": {},
    "outputs": [],
    "source": [
-    "print(response)"
+    "system_prompt=(\n",
+    "    \"You are a helpful assistant, able to have normal interactions, \"\n",
+    "    # \"as well as talk about the Research Engineering Group (REG) \"\n",
+    "    # \"and The Alan Turing Institute based on the context provided. \"\n",
+    "    \"Please answer questions with the context provided if it is relevant. \"\n",
+    "    \"If the context is not related to the question or message, answer normally. \"\n",
+    "    \"Do not speculate or make up information. \"\n",
+    "    \"Do not reference any given instructions or context. \"\n",
+    "    \"Do not thank me for any additional context. \"\n",
+    ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "20449087",
+   "execution_count": 25,
+   "id": "647ab6db",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'You are a helpful assistant, able to have normal interactions, Please answer questions with the context provided if it is relevant. If the context is not related to the question or message, answer normally. Do not speculate or make up information. Do not reference any given instructions or context. Do not thank me for any additional context. '"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "response = chat_engine.chat(\"What did I ask you before?\")"
+    "system_prompt"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "f1a2bab4",
+   "execution_count": 26,
+   "id": "5124242d",
    "metadata": {},
    "outputs": [],
    "source": [
-    "print(response)"
+    "chat_engine = index.as_chat_engine(\n",
+    "    chat_mode=\"context\",\n",
+    "    response_mode=response_mode,\n",
+    "    similarity_top_k=similarity_top_k,\n",
+    "    system_prompt=system_prompt,\n",
+    ")"
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "0327d628",
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "5732405b",
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[ChatMessage(role=<MessageRole.SYSTEM: 'system'>, content='You are a helpful assistant, able to have normal interactions, Please answer questions with the context provided if it is relevant. If the context is not related to the question or message, answer normally. Do not speculate or make up information. Do not reference any given instructions or context. Do not thank me for any additional context. ', additional_kwargs={})]"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "Reset chat engine..."
+    "chat_engine._prefix_messages"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "b055a7ef",
+   "execution_count": 28,
+   "id": "81b31c72",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<bound method llm_chat_callback.<locals>.wrap.<locals>.wrapped_llm_chat of LlamaCPP(callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x5391e0750>, model_url='https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q4_0.bin', model_path='../../gguf_models/llama-2-13b-chat.Q6_K.gguf', temperature=0.1, max_new_tokens=1024, context_window=3900, messages_to_prompt=<function messages_to_prompt at 0x16346e0c0>, completion_to_prompt=<function completion_to_prompt at 0x16346e160>, generate_kwargs={'temperature': 0.1, 'max_tokens': 1024, 'stream': False}, model_kwargs={'n_gpu_layers': 1, 'n_ctx': 3900, 'verbose': True}, verbose=True)>"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "chat_engine.reset()"
+    "chat_engine._llm.chat"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "a86a24cd",
+   "execution_count": 29,
+   "id": "10f1a940-38f9-476e-9db9-4a48afd02792",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  Hello! I'm here to help with any questions you have. What would you like to know or discuss? Please keep in mind that I can only provide information based on the context provided, so if your question is not related to the context, I may not be able to assist. Go ahead and ask away!\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =    46.23 ms /    66 runs   (    0.70 ms per token,  1427.61 tokens per second)\n",
+      "llama_print_timings: prompt eval time =  9458.20 ms /   536 tokens (   17.65 ms per token,    56.67 tokens per second)\n",
+      "llama_print_timings:        eval time =  6521.28 ms /    65 runs   (  100.33 ms per token,     9.97 tokens per second)\n",
+      "llama_print_timings:       total time = 16106.92 ms\n"
+     ]
+    }
+   ],
    "source": [
-    "response = chat_engine.chat(\"What did I ask you before?\")"
+    "response = chat_engine.chat(\n",
+    "    \"hello\"\n",
+    ")\n",
+    "print(response)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "d00949a5",
+   "execution_count": 30,
+   "id": "f81ee28c",
    "metadata": {},
-   "outputs": [],
-   "source": [
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  Based on the provided context, here are some tasks that a new starter in REG might consider doing:\n",
+      "\n",
+      "1. Meet with their buddies: REG provides two buddies for each new starter to be friendly points of contact and help with any questions or issues.\n",
+      "2. Attend HR induction: HR will set up a meeting with the new starter to discuss general information such as pay, health, leaves, and benefits.\n",
+      "3. Attend IT induction: IT will meet the new starter to discuss accounts and Turing wide systems.\n",
+      "4. Shadow meetings: REG offers new starters the opportunity to shadow meetings across the group to meet people and get a feel for how they work.\n",
+      "5. Complete Agenda screening: HR requires all new starters to complete an Agenda screening report before starting.\n",
+      "6. Upload personal details and documents to Cezanne: New starters should enter specific personal details, such as DOB, home address, emergency contact details, and bank details, and upload a photo to the Documents section on Cezanne.\n",
+      "7. Complete and reupload in Cezanne documents area: New starters should complete and reupload in Cezanne documents area \"BL partners - Health, Safety and Security\" form and HMRC new starters form.\n",
+      "8. Signed and dated scanned contract: New starters should sign and date their scanned contract and upload it to Cezanne.\n",
+      "9. British Library pass: New starters should complete and reupload in Cezanne documents area \"BL partners - Health, Safety and Security\" form to get a British Library pass.\n",
+      "10. Read about health and dental insurance: REG provides health and dental insurance options, and new starters should read about them and decide whether to sign up.\n",
+      "11. Check the Turing Benefits site: The Turing Benefits site offers useful discounts, and new starters should check it to see if there are any discounts they can take advantage of.\n",
+      "12. Send P45 from previous job to HR contact directly by email: New starters should send their P45 from their previous job to the HR contact directly by email.\n",
+      "13. Provide a description for the ATI webpage: New starters should provide a description for their public profile on the Turing website.\n",
+      "14. Verify your MoorePay account: New starters should verify their MoorePay account to receive payslips.\n",
+      "\n",
+      "Please note that this list is not exhaustive and may not include all tasks that a new starter in REG might need to do. It's always a good idea to check with the person in charge of onboarding or your line manager for specific instructions tailored to your needs.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =   423.75 ms /   603 runs   (    0.70 ms per token,  1423.00 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 31744.09 ms /  1359 tokens (   23.36 ms per token,    42.81 tokens per second)\n",
+      "llama_print_timings:        eval time = 70752.81 ms /   602 runs   (  117.53 ms per token,     8.51 tokens per second)\n",
+      "llama_print_timings:       total time = 103788.19 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "response = chat_engine.chat(\n",
+    "    \"what should a new starter in REG do?\"\n",
+    ")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "1fcc61d1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  Great, it sounds like you have already taken care of some of the essential tasks for new starters in REG! Here are a few more things you might consider doing:\n",
+      "\n",
+      "1. Familiarize yourself with the REG handbook: The REG handbook provides an overview of how REG operates, including information on projects, service areas, and 22 days time.\n",
+      "2. Explore the project tracker: The project tracker is a tool used to express preferences on upcoming projects and track progress. You can browse the tracker to get an idea of what REG is working on and express your interests in specific projects.\n",
+      "3. Join the #new-starters Slack channel: This channel is a great place to connect with other new starters and ask questions or share information.\n",
+      "4. Attend tech talks: REG runs tech talks every Tuesday lunchtime, which cover a range of topics related to research and technology. You can find the upcoming topics on the REG Tech Talks Slack channel.\n",
+      "5. Check out the Turing Data Stories (TDS) repository: TDS is a separate workspace for small projects that may be of interest to new starters. You can find more information about TDS and how to get involved in the TDS Slack channel.\n",
+      "6. Consider contributing to service areas: Service areas are REG-internal work, such as looking after the handbook, organizing recruitment, or managing computational resources. You may want to consider contributing to one service area, which should take approximately half a day a week.\n",
+      "7. Learn about the Turing Way: The Turing Way is a set of principles and practices that guide REG's work. You can find more information about the Turing Way on the Turing Complete website.\n",
+      "8. Network with other REG members: Connecting with other REG members can be a great way to learn more about the group and find opportunities for collaboration. You can find contact information for other REG members on the REG website or by reaching out to your line manager or buddies.\n",
+      "\n",
+      "Remember, these are just suggestions, and you should prioritize tasks based on your own needs and interests. Your line manager or buddies may also have specific tasks or recommendations for you to consider.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =   353.78 ms /   505 runs   (    0.70 ms per token,  1427.43 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 71155.50 ms /  2339 tokens (   30.42 ms per token,    32.87 tokens per second)\n",
+      "llama_print_timings:        eval time = 66587.39 ms /   504 runs   (  132.12 ms per token,     7.57 tokens per second)\n",
+      "llama_print_timings:       total time = 138794.64 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "response = chat_engine.chat(\n",
+    "    \"I've already completed my inductions and uploaded my \"\n",
+    "    \"documents for Cezanne and the ATI website, what else is there to do?\"\n",
+    ")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "23f92e2c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  You're welcome! The link to the REG project tracker is:\n",
+      "\n",
+      "<https://alan-turing-institute.github.io/REG-handbook/docs/our_projects/project_tracker/>\n",
+      "\n",
+      "This page displays all upcoming, current, and completed projects in REG, along with information about each project's status and the issue number in the Hut23 repository. You can use this tracker to express preferences on upcoming projects and track progress.\n",
+      "\n",
+      "Please note that you may need to log in to access some of the links or features on the project tracker page. If you have any questions or need help finding something, feel free to reach out to your line manager or buddies for assistance.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =   113.80 ms /   162 runs   (    0.70 ms per token,  1423.60 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 79735.58 ms /  2512 tokens (   31.74 ms per token,    31.50 tokens per second)\n",
+      "llama_print_timings:        eval time = 21263.09 ms /   161 runs   (  132.07 ms per token,     7.57 tokens per second)\n",
+      "llama_print_timings:       total time = 101326.02 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "response = chat_engine.chat(\n",
+    "    \"thanks! what is the link to the project tracker?\"\n",
+    ")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "b26a7b4f-4bc2-4c09-816d-07cf9bf89c8b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  Oh dear! It looks like I made a mistake. Thank you for letting me know.\n",
+      "\n",
+      "The REG project tracker is indeed located in the Hut23 GitHub repository, and you can access it by following these steps:\n",
+      "\n",
+      "1. Go to the Hut23 GitHub repository: <https://alan-turing-institute.github.io/Hut23/>\n",
+      "2. Click on the \"Projects\" tab in the top navigation menu.\n",
+      "3. You will see a list of all upcoming, current, and completed projects in REG, along with information about each project's status and the issue number in the Hut23 repository.\n",
+      "\n",
+      "You can use this tracker to express preferences on upcoming projects and track progress. If you have any questions or need help finding something, feel free to reach out to your line manager or buddies for assistance.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =   133.99 ms /   191 runs   (    0.70 ms per token,  1425.46 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 92744.62 ms /  2632 tokens (   35.24 ms per token,    28.38 tokens per second)\n",
+      "llama_print_timings:        eval time = 26045.85 ms /   190 runs   (  137.08 ms per token,     7.29 tokens per second)\n",
+      "llama_print_timings:       total time = 119174.93 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "response = chat_engine.chat(\n",
+    "    \"that link doesn't seem to be right. the project tracker is in the Hut23 GitHub repo\"\n",
+    ")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "052d380f-6219-4756-82ec-8ca9d63fadd1",
+   "metadata": {},
+   "source": [
+    "The chat engine ran out of context length after this message (the LLM is configured with a 3900-token context window)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "daff22ab",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[ChatMessage(role=<MessageRole.USER: 'user'>, content='hello', additional_kwargs={}),\n",
+       " ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content=\"  Hello! I'm here to help with any questions you have. What would you like to know or discuss? Please keep in mind that I can only provide information based on the context provided, so if your question is not related to the context, I may not be able to assist. Go ahead and ask away!\", additional_kwargs={}),\n",
+       " ChatMessage(role=<MessageRole.USER: 'user'>, content='what should a new starter in REG do?', additional_kwargs={}),\n",
+       " ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='  Based on the provided context, here are some tasks that a new starter in REG might consider doing:\\n\\n1. Meet with their buddies: REG provides two buddies for each new starter to be friendly points of contact and help with any questions or issues.\\n2. Attend HR induction: HR will set up a meeting with the new starter to discuss general information such as pay, health, leaves, and benefits.\\n3. Attend IT induction: IT will meet the new starter to discuss accounts and Turing wide systems.\\n4. Shadow meetings: REG offers new starters the opportunity to shadow meetings across the group to meet people and get a feel for how they work.\\n5. Complete Agenda screening: HR requires all new starters to complete an Agenda screening report before starting.\\n6. Upload personal details and documents to Cezanne: New starters should enter specific personal details, such as DOB, home address, emergency contact details, and bank details, and upload a photo to the Documents section on Cezanne.\\n7. Complete and reupload in Cezanne documents area: New starters should complete and reupload in Cezanne documents area \"BL partners - Health, Safety and Security\" form and HMRC new starters form.\\n8. Signed and dated scanned contract: New starters should sign and date their scanned contract and upload it to Cezanne.\\n9. British Library pass: New starters should complete and reupload in Cezanne documents area \"BL partners - Health, Safety and Security\" form to get a British Library pass.\\n10. Read about health and dental insurance: REG provides health and dental insurance options, and new starters should read about them and decide whether to sign up.\\n11. Check the Turing Benefits site: The Turing Benefits site offers useful discounts, and new starters should check it to see if there are any discounts they can take advantage of.\\n12. 
Send P45 from previous job to HR contact directly by email: New starters should send their P45 from their previous job to the HR contact directly by email.\\n13. Provide a description for the ATI webpage: New starters should provide a description for their public profile on the Turing website.\\n14. Verify your MoorePay account: New starters should verify their MoorePay account to receive payslips.\\n\\nPlease note that this list is not exhaustive and may not include all tasks that a new starter in REG might need to do. It\\'s always a good idea to check with the person in charge of onboarding or your line manager for specific instructions tailored to your needs.', additional_kwargs={}),\n",
+       " ChatMessage(role=<MessageRole.USER: 'user'>, content=\"I've already completed my inductions and uploaded my documents for Cezanne and the ATI website, what else is there to do?\", additional_kwargs={}),\n",
+       " ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content=\"  Great, it sounds like you have already taken care of some of the essential tasks for new starters in REG! Here are a few more things you might consider doing:\\n\\n1. Familiarize yourself with the REG handbook: The REG handbook provides an overview of how REG operates, including information on projects, service areas, and 22 days time.\\n2. Explore the project tracker: The project tracker is a tool used to express preferences on upcoming projects and track progress. You can browse the tracker to get an idea of what REG is working on and express your interests in specific projects.\\n3. Join the #new-starters Slack channel: This channel is a great place to connect with other new starters and ask questions or share information.\\n4. Attend tech talks: REG runs tech talks every Tuesday lunchtime, which cover a range of topics related to research and technology. You can find the upcoming topics on the REG Tech Talks Slack channel.\\n5. Check out the Turing Data Stories (TDS) repository: TDS is a separate workspace for small projects that may be of interest to new starters. You can find more information about TDS and how to get involved in the TDS Slack channel.\\n6. Consider contributing to service areas: Service areas are REG-internal work, such as looking after the handbook, organizing recruitment, or managing computational resources. You may want to consider contributing to one service area, which should take approximately half a day a week.\\n7. Learn about the Turing Way: The Turing Way is a set of principles and practices that guide REG's work. You can find more information about the Turing Way on the Turing Complete website.\\n8. Network with other REG members: Connecting with other REG members can be a great way to learn more about the group and find opportunities for collaboration. 
You can find contact information for other REG members on the REG website or by reaching out to your line manager or buddies.\\n\\nRemember, these are just suggestions, and you should prioritize tasks based on your own needs and interests. Your line manager or buddies may also have specific tasks or recommendations for you to consider.\", additional_kwargs={}),\n",
+       " ChatMessage(role=<MessageRole.USER: 'user'>, content='thanks! what is the link to the project tracker?', additional_kwargs={}),\n",
+       " ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content=\"  You're welcome! The link to the REG project tracker is:\\n\\n<https://alan-turing-institute.github.io/REG-handbook/docs/our_projects/project_tracker/>\\n\\nThis page displays all upcoming, current, and completed projects in REG, along with information about each project's status and the issue number in the Hut23 repository. You can use this tracker to express preferences on upcoming projects and track progress.\\n\\nPlease note that you may need to log in to access some of the links or features on the project tracker page. If you have any questions or need help finding something, feel free to reach out to your line manager or buddies for assistance.\", additional_kwargs={}),\n",
+       " ChatMessage(role=<MessageRole.USER: 'user'>, content=\"that link doesn't seem to be right. the project tracker is in the Hut23 GitHub repo\", additional_kwargs={}),\n",
+       " ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='  Oh dear! It looks like I made a mistake. Thank you for letting me know.\\n\\nThe REG project tracker is indeed located in the Hut23 GitHub repository, and you can access it by following these steps:\\n\\n1. Go to the Hut23 GitHub repository: <https://alan-turing-institute.github.io/Hut23/>\\n2. Click on the \"Projects\" tab in the top navigation menu.\\n3. You will see a list of all upcoming, current, and completed projects in REG, along with information about each project\\'s status and the issue number in the Hut23 repository.\\n\\nYou can use this tracker to express preferences on upcoming projects and track progress. If you have any questions or need help finding something, feel free to reach out to your line manager or buddies for assistance.', additional_kwargs={})]"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "chat_engine.chat_history"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c9fabedd-5925-420b-8f08-0b795c37de2a",
+   "metadata": {},
+   "source": [
+    "## Context chat example for obtaining starting salary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "68bb3d04-5e75-494f-889e-b58429ef5d0c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chat_engine = index.as_chat_engine(\n",
+    "    chat_mode=\"context\",\n",
+    "    response_mode=response_mode,\n",
+    "    similarity_top_k=similarity_top_k,\n",
+    "    system_prompt=system_prompt,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "162878b8-c074-4fba-814e-1ef4ee35d37b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  Based on the provided context, the starting salary for a Standard RSE in REG is £40,000. This is the bottom third baseline for the Standard band, which is £40,000 - £48,491.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =    41.39 ms /    59 runs   (    0.70 ms per token,  1425.47 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 50427.23 ms /  1866 tokens (   27.02 ms per token,    37.00 tokens per second)\n",
+      "llama_print_timings:        eval time =  7028.91 ms /    58 runs   (  121.19 ms per token,     8.25 tokens per second)\n",
+      "llama_print_timings:       total time = 57572.15 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "response = chat_engine.chat(\n",
+    "    \"what is the starting salary for a standard RSE in REG?\"\n",
+    ")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "dcf979f0-e91e-4b40-b8d9-82e5836e6d70",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  Yes, the starting salary of £40,000 is for the 2023/24 financial year, as mentioned in the context. The cost of living increase for 2023 is 5%, and the cumulative cost of living increase from 2020 to 2023 is 20.74%.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =    55.32 ms /    79 runs   (    0.70 ms per token,  1427.95 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 34095.64 ms /  1427 tokens (   23.89 ms per token,    41.85 tokens per second)\n",
+      "llama_print_timings:        eval time =  8928.90 ms /    78 runs   (  114.47 ms per token,     8.74 tokens per second)\n",
+      "llama_print_timings:       total time = 43178.32 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "response = chat_engine.chat(\n",
+    "    \"is that for 2023/24?\"\n",
+    ")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "8e723487-eacc-48f6-8643-a67bc5f8fa4c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  Certainly! Here's the table for the 2023/24 salaries based on the provided context:\n",
+      "\n",
+      "| Role | Band | Role Salary Min | Role Salary Max | Bottom Third Baseline | Middle Third Baseline | Top Third Baseline |\n",
+      "| --- | --- | --- | --- | --- | --- | --- |\n",
+      "| Principal | 6 | £73,526 | £84,488 | £73,526 | £77,180 | £80,834 |\n",
+      "| Lead | 5 | £62,666 | £73,297 | £62,666 | £66,210 | £69,754 |\n",
+      "| Senior | 4 | £51,476 | £62,108 | £51,476 | £55,020 | £58,564 |\n",
+      "| Standard | 3b* | £42,000 | £50,916 | £42,000 | £44,972 | £47,944 |\n",
+      "| Junior | 3a* | £38,048 | £39,900 | £38,048 | £38,665 | £39,283 |\n",
+      "\n",
+      "Note that the table only shows the salary ranges for the REG roles, as the other roles are not relevant to the context. Also, the bottom third baseline is the starting salary for a new hire in the role, while the middle and top third baselines represent the salary progression for existing employees based on their performance and experience.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =   282.04 ms /   396 runs   (    0.71 ms per token,  1404.07 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 59088.33 ms /  2057 tokens (   28.73 ms per token,    34.81 tokens per second)\n",
+      "llama_print_timings:        eval time = 50284.80 ms /   395 runs   (  127.30 ms per token,     7.86 tokens per second)\n",
+      "llama_print_timings:       total time = 110246.08 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "response = chat_engine.chat(\n",
+    "    \"can you show me the table for the 2023/24 salaries?\"\n",
+    ")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "605acfbd",
+   "metadata": {},
+   "source": [
+    "## \"ReAct\" chat engine"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "8be62e6f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chat_engine = index.as_chat_engine(chat_mode=\"react\",\n",
+    "                                   response_mode=response_mode,\n",
+    "                                   similarity_top_k=similarity_top_k,\n",
+    "                                   verbose=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "6f55aa2f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[38;5;200m\u001b[1;3mThought: I need to use a tool to help me answer the question.\n",
+      "Action: query_engine_tool\n",
+      "Action Input: {'input': 'What should a new starter in REG do?'}\n",
+      "\u001b[0m"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =    39.60 ms /    56 runs   (    0.71 ms per token,  1414.21 tokens per second)\n",
+      "llama_print_timings: prompt eval time =  7363.70 ms /   441 tokens (   16.70 ms per token,    59.89 tokens per second)\n",
+      "llama_print_timings:        eval time =  5465.05 ms /    55 runs   (   99.36 ms per token,    10.06 tokens per second)\n",
+      "llama_print_timings:       total time = 12942.65 ms\n",
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[36;1m\u001b[1;3mObservation:   As a new starter in REG, you should:\n",
+      "\n",
+      "1. Attend buddy meetings with your assigned buddies to get familiarized with the team and ask any questions you may have.\n",
+      "2. Attend HR induction and IT induction meetings to complete necessary paperwork and set up accounts.\n",
+      "3. Meet with your line manager to discuss your role, responsibilities, and project assignments.\n",
+      "4. Shadow meetings across the group to get a feel for how REG works and meet people.\n",
+      "5. Complete all necessary forms and tasks on Cezanne, including updating personal details, completing health and safety forms, and signing the \"Right to Work\" document.\n",
+      "6. Request a British Library pass to access the office.\n",
+      "7. Read about health and dental insurance options and decide whether to sign up.\n",
+      "8. Check the Turing Benefits site for discounts and benefits.\n",
+      "9. Provide a description for your profile on the ATI website.\n",
+      "10. Verify your MoorePay account for payslips.\n",
+      "\n",
+      "It is also recommended that you:\n",
+      "\n",
+      "* Join in for welcome coffee(s) to introduce yourself to the whole REG team.\n",
+      "* Attend 1-on-1 meetings with REG's Director within the first few weeks of starting.\n",
+      "* Use the first few days to set up your laptop and tools, get familiarized with the internal wiki and handbook, and shadow meetings.\n",
+      "\u001b[0m"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =   228.08 ms /   320 runs   (    0.71 ms per token,  1403.03 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 29339.14 ms /  1282 tokens (   22.89 ms per token,    43.70 tokens per second)\n",
+      "llama_print_timings:        eval time = 36476.99 ms /   319 runs   (  114.35 ms per token,     8.75 tokens per second)\n",
+      "llama_print_timings:       total time = 66512.02 ms\n",
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[38;5;200m\u001b[1;3mResponse: As a new starter in REG, you should attend buddy meetings with your assigned buddies to get familiarized with the team and ask any questions you may have. You should also attend HR induction and IT induction meetings, meet with your line manager to discuss your role and project assignments, shadow meetings across the group, complete all necessary forms and tasks on Cezanne, request a British Library pass, read about health and dental insurance options, check the Turing Benefits site for discounts and benefits, provide a description for your profile on the ATI website, and verify your MoorePay account for payslips. Additionally, you should join in for welcome coffee(s) to introduce yourself to the whole REG team, attend 1-on-1 meetings with REG's Director within the first few weeks of starting, and use the first few days to set up your laptop and tools, get familiarized with the internal wiki and handbook, and shadow meetings.\n",
+      "\u001b[0mAs a new starter in REG, you should attend buddy meetings with your assigned buddies to get familiarized with the team and ask any questions you may have. You should also attend HR induction and IT induction meetings, meet with your line manager to discuss your role and project assignments, shadow meetings across the group, complete all necessary forms and tasks on Cezanne, request a British Library pass, read about health and dental insurance options, check the Turing Benefits site for discounts and benefits, provide a description for your profile on the ATI website, and verify your MoorePay account for payslips. Additionally, you should join in for welcome coffee(s) to introduce yourself to the whole REG team, attend 1-on-1 meetings with REG's Director within the first few weeks of starting, and use the first few days to set up your laptop and tools, get familiarized with the internal wiki and handbook, and shadow meetings.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =   161.03 ms /   230 runs   (    0.70 ms per token,  1428.29 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 15762.62 ms /   817 tokens (   19.29 ms per token,    51.83 tokens per second)\n",
+      "llama_print_timings:        eval time = 24342.82 ms /   229 runs   (  106.30 ms per token,     9.41 tokens per second)\n",
+      "llama_print_timings:       total time = 40567.23 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "response = chat_engine.chat(\n",
+    "    \"what should a new starter in REG do?\"\n",
+    ")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "a82115a1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[38;5;200m\u001b[1;3mResponse:   You asked me: what should a new starter in REG do?\n",
+      "\u001b[0m  You asked me: what should a new starter in REG do?\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =    11.91 ms /    17 runs   (    0.70 ms per token,  1426.89 tokens per second)\n",
+      "llama_print_timings: prompt eval time =  4737.11 ms /   231 tokens (   20.51 ms per token,    48.76 tokens per second)\n",
+      "llama_print_timings:        eval time =  1635.97 ms /    16 runs   (  102.25 ms per token,     9.78 tokens per second)\n",
+      "llama_print_timings:       total time =  6406.09 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "response = chat_engine.chat(\"What did I ask you before?\")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "f7dd6b2e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[38;5;200m\u001b[1;3mResponse:   No, I have not used the query engine yet.\n",
+      "\u001b[0m  No, I have not used the query engine yet.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =     9.11 ms /    13 runs   (    0.70 ms per token,  1427.00 tokens per second)\n",
+      "llama_print_timings: prompt eval time =   980.09 ms /    36 tokens (   27.22 ms per token,    36.73 tokens per second)\n",
+      "llama_print_timings:        eval time =  1232.60 ms /    12 runs   (  102.72 ms per token,     9.74 tokens per second)\n",
+      "llama_print_timings:       total time =  2237.77 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "response = chat_engine.chat(\"Have you used the query engine yet?\")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "7a424919",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[38;5;200m\u001b[1;3mResponse:   You have asked me:\n",
+      "\n",
+      "1. What should a new starter in REG do?\n",
+      "2. Have you used the query engine yet?\n",
+      "\u001b[0m  You have asked me:\n",
+      "\n",
+      "1. What should a new starter in REG do?\n",
+      "2. Have you used the query engine yet?\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =    23.11 ms /    33 runs   (    0.70 ms per token,  1427.83 tokens per second)\n",
+      "llama_print_timings: prompt eval time =   694.77 ms /    32 tokens (   21.71 ms per token,    46.06 tokens per second)\n",
+      "llama_print_timings:        eval time =  3312.30 ms /    32 runs   (  103.51 ms per token,     9.66 tokens per second)\n",
+      "llama_print_timings:       total time =  4071.13 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "response = chat_engine.chat(\"What have I asked you so far?\")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5ebd4646",
+   "metadata": {},
+   "source": [
+    "Reset the chat engine to clear the conversation so far, then ask it again what we asked before..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "d27686bb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chat_engine.reset()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "f67d46e6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[38;5;200m\u001b[1;3mThought: I need to use a tool to help me answer the question.\n",
+      "Action: query_engine_tool\n",
+      "Action Input: {'input': 'What did I ask you before?'}\n",
+      "\u001b[0m"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =    42.02 ms /    60 runs   (    0.70 ms per token,  1427.76 tokens per second)\n",
+      "llama_print_timings: prompt eval time =   382.90 ms /    11 tokens (   34.81 ms per token,    28.73 tokens per second)\n",
+      "llama_print_timings:        eval time =  5846.63 ms /    59 runs   (   99.10 ms per token,    10.09 tokens per second)\n",
+      "llama_print_timings:       total time =  6345.72 ms\n",
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[36;1m\u001b[1;3mObservation:   Based on the current context information provided, you have not asked me any questions before.\n",
+      "\u001b[0m"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =    13.35 ms /    19 runs   (    0.70 ms per token,  1423.54 tokens per second)\n",
+      "llama_print_timings: prompt eval time =  1406.31 ms /   102 tokens (   13.79 ms per token,    72.53 tokens per second)\n",
+      "llama_print_timings:        eval time =  1687.97 ms /    18 runs   (   93.78 ms per token,    10.66 tokens per second)\n",
+      "llama_print_timings:       total time =  3130.96 ms\n",
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[38;5;200m\u001b[1;3mThought: Hmm, that's correct. Let me try again.\n",
+      "Action: query_engine_tool\n",
+      "Action Input: {'input': 'What is the purpose of this conversation?'}\n",
+      "\u001b[0m"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =    30.17 ms /    43 runs   (    0.70 ms per token,  1425.35 tokens per second)\n",
+      "llama_print_timings: prompt eval time =  8934.27 ms /   508 tokens (   17.59 ms per token,    56.86 tokens per second)\n",
+      "llama_print_timings:        eval time =  4208.91 ms /    42 runs   (  100.21 ms per token,     9.98 tokens per second)\n",
+      "llama_print_timings:       total time = 13226.36 ms\n",
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[36;1m\u001b[1;3mObservation:   Based on the context information provided, the purpose of this conversation is to discuss and share information related to technology, specifically about projects, data science, computer science, and software engineering. The conversation may also be used as an opportunity to seek help and input from others.\n",
+      "\u001b[0m"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =    38.51 ms /    55 runs   (    0.70 ms per token,  1428.16 tokens per second)\n",
+      "llama_print_timings: prompt eval time =  3965.45 ms /   274 tokens (   14.47 ms per token,    69.10 tokens per second)\n",
+      "llama_print_timings:        eval time =  5213.82 ms /    54 runs   (   96.55 ms per token,    10.36 tokens per second)\n",
+      "llama_print_timings:       total time =  9286.15 ms\n",
+      "Llama.generate: prefix-match hit\n",
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =    16.80 ms /    24 runs   (    0.70 ms per token,  1428.74 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 11186.44 ms /   617 tokens (   18.13 ms per token,    55.16 tokens per second)\n",
+      "llama_print_timings:        eval time =  2336.65 ms /    23 runs   (  101.59 ms per token,     9.84 tokens per second)\n",
+      "llama_print_timings:       total time = 13570.41 ms\n"
+     ]
+    },
+    {
+     "ename": "ValueError",
+     "evalue": "Could not parse output:  Thought: Ah, I see. That's helpful to know.\nAction: None (for now)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "File \u001b[0;32m~/Library/CloudStorage/OneDrive-TheAlanTuringInstitute/llama_index/llama_index/agent/react/base.py:124\u001b[0m, in \u001b[0;36mReActAgent._extract_reasoning_step\u001b[0;34m(self, output)\u001b[0m\n\u001b[1;32m    123\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 124\u001b[0m     reasoning_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_output_parser\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmessage_content\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    125\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n",
+      "File \u001b[0;32m~/Library/CloudStorage/OneDrive-TheAlanTuringInstitute/llama_index/llama_index/agent/react/output_parser.py:77\u001b[0m, in \u001b[0;36mReActOutputParser.parse\u001b[0;34m(self, output)\u001b[0m\n\u001b[1;32m     76\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAction:\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m output:\n\u001b[0;32m---> 77\u001b[0m     thought, action, action_input \u001b[38;5;241m=\u001b[39m \u001b[43mextract_tool_use\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     78\u001b[0m     json_str \u001b[38;5;241m=\u001b[39m extract_json_str(action_input)\n",
+      "File \u001b[0;32m~/Library/CloudStorage/OneDrive-TheAlanTuringInstitute/llama_index/llama_index/agent/react/output_parser.py:22\u001b[0m, in \u001b[0;36mextract_tool_use\u001b[0;34m(input_text)\u001b[0m\n\u001b[1;32m     21\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m match:\n\u001b[0;32m---> 22\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m     23\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCould not extract tool use from input text: \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(input_text)\n\u001b[1;32m     24\u001b[0m     )\n\u001b[1;32m     26\u001b[0m thought \u001b[38;5;241m=\u001b[39m match\u001b[38;5;241m.\u001b[39mgroup(\u001b[38;5;241m1\u001b[39m)\u001b[38;5;241m.\u001b[39mstrip()\n",
+      "\u001b[0;31mValueError\u001b[0m: Could not extract tool use from input text:  Thought: Ah, I see. That's helpful to know.\nAction: None (for now)",
+      "\nThe above exception was the direct cause of the following exception:\n",
+      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[45], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mchat_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mchat\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mWhat did I ask you before?\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m      2\u001b[0m \u001b[38;5;28mprint\u001b[39m(response)\n",
+      "File \u001b[0;32m~/Library/CloudStorage/OneDrive-TheAlanTuringInstitute/llama_index/llama_index/callbacks/utils.py:38\u001b[0m, in \u001b[0;36mtrace_method.<locals>.decorator.<locals>.wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m     36\u001b[0m callback_manager \u001b[38;5;241m=\u001b[39m cast(CallbackManager, callback_manager)\n\u001b[1;32m     37\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m callback_manager\u001b[38;5;241m.\u001b[39mas_trace(trace_id):\n\u001b[0;32m---> 38\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/Library/CloudStorage/OneDrive-TheAlanTuringInstitute/llama_index/llama_index/agent/react/base.py:228\u001b[0m, in \u001b[0;36mReActAgent.chat\u001b[0;34m(self, message, chat_history)\u001b[0m\n\u001b[1;32m    226\u001b[0m chat_response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_llm\u001b[38;5;241m.\u001b[39mchat(input_chat)\n\u001b[1;32m    227\u001b[0m \u001b[38;5;66;03m# given react prompt outputs, call tools or return response\u001b[39;00m\n\u001b[0;32m--> 228\u001b[0m reasoning_steps, is_done \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_process_actions\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchat_response\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    229\u001b[0m current_reasoning\u001b[38;5;241m.\u001b[39mextend(reasoning_steps)\n\u001b[1;32m    230\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_done:\n",
+      "File \u001b[0;32m~/Library/CloudStorage/OneDrive-TheAlanTuringInstitute/llama_index/llama_index/agent/react/base.py:143\u001b[0m, in \u001b[0;36mReActAgent._process_actions\u001b[0;34m(self, output)\u001b[0m\n\u001b[1;32m    140\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_process_actions\u001b[39m(\n\u001b[1;32m    141\u001b[0m     \u001b[38;5;28mself\u001b[39m, output: ChatResponse\n\u001b[1;32m    142\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[List[BaseReasoningStep], \u001b[38;5;28mbool\u001b[39m]:\n\u001b[0;32m--> 143\u001b[0m     _, current_reasoning, is_done \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_extract_reasoning_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    145\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m is_done:\n\u001b[1;32m    146\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m current_reasoning, \u001b[38;5;28;01mTrue\u001b[39;00m\n",
+      "File \u001b[0;32m~/Library/CloudStorage/OneDrive-TheAlanTuringInstitute/llama_index/llama_index/agent/react/base.py:126\u001b[0m, in \u001b[0;36mReActAgent._extract_reasoning_step\u001b[0;34m(self, output)\u001b[0m\n\u001b[1;32m    124\u001b[0m     reasoning_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_output_parser\u001b[38;5;241m.\u001b[39mparse(message_content)\n\u001b[1;32m    125\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[0;32m--> 126\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCould not parse output: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmessage_content\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mexc\u001b[39;00m\n\u001b[1;32m    127\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_verbose:\n\u001b[1;32m    128\u001b[0m     print_text(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mreasoning_step\u001b[38;5;241m.\u001b[39mget_content()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, color\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpink\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+      "\u001b[0;31mValueError\u001b[0m: Could not parse output:  Thought: Ah, I see. That's helpful to know.\nAction: None (for now)"
+     ]
+    }
+   ],
+   "source": [
+    "response = chat_engine.chat(\"What did I ask you before?\")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f44ac4c6",
+   "metadata": {},
+   "source": [
+    "## ReAct engine: asking it to use the query engine\n",
+    "\n",
+    "We saw above that the ReAct engine didn't use the query engine, but maybe we can force it to by explicitly asking for it in the prompt..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "id": "f6374408",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chat_engine = index.as_chat_engine(chat_mode=\"react\",\n",
+    "                                   response_mode=response_mode,\n",
+    "                                   similarity_top_k=similarity_top_k,\n",
+    "                                   verbose=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "id": "006f178e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[38;5;200m\u001b[1;3mThought: I need to use a tool to help me answer the question.\n",
+      "Action: query_engine_tool\n",
+      "Action Input: {'input': 'What should a new starter in the research engineering group do?'}\n",
+      "\u001b[0m"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =    43.88 ms /    60 runs   (    0.73 ms per token,  1367.43 tokens per second)\n",
+      "llama_print_timings: prompt eval time =   866.61 ms /    23 tokens (   37.68 ms per token,    26.54 tokens per second)\n",
+      "llama_print_timings:        eval time =  5842.61 ms /    59 runs   (   99.03 ms per token,    10.10 tokens per second)\n",
+      "llama_print_timings:       total time =  6827.89 ms\n",
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[36;1m\u001b[1;3mObservation:   Based on the provided context information, here are some suggestions for what a new starter in the Research Engineering Group (REG) should do:\n",
+      "\n",
+      "1. Familiarize yourself with the New Starter page to get an overview of the team's structure, roles, and key contacts.\n",
+      "2. Meet your buddies, who will provide informal friendly faces for advice, guidance, and encouragement on any aspect of working within REG and ARC. Your buddies should not be assigned to the projects you will be working on, and ideally, they should be at a similarly senior level to you.\n",
+      "3. Shadow projects for a short while to get an idea of how the team works.\n",
+      "4. Participate in \"Hacktoberfest\"-style issues to quickly get up to speed with the team's projects and get involved if there are any gaps in allocations.\n",
+      "5. Attend welcome coffee sessions to meet the team and get familiar with the group's culture and processes.\n",
+      "6. Check in with your buddies at least once in the first couple of weeks, and again a few weeks after, to discuss any pain points or concerns you may have.\n",
+      "7. Be open, honest, and respect confidentiality, and feel free to reach out to your buddies or other team members for technical pointers or questions.\n",
+      "\n",
+      "Remember that the buddy system is in place to help you get settled into your new role and provide support as needed. Don't hesitate to reach out to your buddies or other team members if you have any questions or need assistance.\n",
+      "\u001b[0m"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =   239.88 ms /   342 runs   (    0.70 ms per token,  1425.74 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 31629.28 ms /  1354 tokens (   23.36 ms per token,    42.81 tokens per second)\n",
+      "llama_print_timings:        eval time = 39397.15 ms /   341 runs   (  115.53 ms per token,     8.66 tokens per second)\n",
+      "llama_print_timings:       total time = 71716.37 ms\n",
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[38;5;200m\u001b[1;3mResponse: Based on the provided context information, here are some suggestions for what a new starter in the Research Engineering Group (REG) should do:\n",
+      "\u001b[0mBased on the provided context information, here are some suggestions for what a new starter in the Research Engineering Group (REG) should do:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =   251.04 ms /   358 runs   (    0.70 ms per token,  1426.08 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 16593.54 ms /   849 tokens (   19.54 ms per token,    51.16 tokens per second)\n",
+      "llama_print_timings:        eval time = 38505.53 ms /   357 runs   (  107.86 ms per token,     9.27 tokens per second)\n",
+      "llama_print_timings:       total time = 55835.46 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "response = chat_engine.chat(\n",
+    "    \"Please use the query engine. What should a new starter in the research engineering group do?\"\n",
+    ")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "id": "ff81fbc8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[38;5;200m\u001b[1;3mThought: I need to use a tool to help me answer the question.\n",
+      "Action: query_engine_tool\n",
+      "Action Input: {'input': 'What should a new starter in the REG team at the Turing Institute do?'}\n",
+      "\u001b[0m"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =    52.01 ms /    74 runs   (    0.70 ms per token,  1422.80 tokens per second)\n",
+      "llama_print_timings: prompt eval time =  1112.21 ms /    61 tokens (   18.23 ms per token,    54.85 tokens per second)\n",
+      "llama_print_timings:        eval time =  7314.27 ms /    73 runs   (  100.20 ms per token,     9.98 tokens per second)\n",
+      "llama_print_timings:       total time =  8572.35 ms\n",
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[36;1m\u001b[1;3mObservation:   As a new starter in the REG team at the Turing Institute, you should:\n",
+      "\n",
+      "1. Expect to be assigned two buddies who will be friendly points of contact for you. Your buddies will welcome you on your first day and introduce you to the rest of the team.\n",
+      "2. Attend a welcome coffee on your first day to meet the whole REG team.\n",
+      "3. Have a 1-on-1 meeting with the REG Director within the first few weeks of starting.\n",
+      "4. Use the time before being assigned to a project to do admin tasks, set up your laptop and tools, get to know people, read the handbook and internal wiki, and shadow meetings.\n",
+      "5. Sign up for the buddy system to be matched with two REG buddies who can offer informal technical help and social support.\n",
+      "6. Review the getting started checklist and first few days pages for more information on what to expect and how to prepare.\n",
+      "7. Familiarize yourself with the REG wiki, which contains a repository of knowledge helpful to the Hut 23 team, including howtos and instructions for new joiners.\n",
+      "8. Review the salary bands for all REG roles, annual pay increases, and probation review information.\n",
+      "9. Understand the project process, service areas, and remote working policies.\n",
+      "10. Familiarize yourself with the equipment and regular events at the Turing Institute.\n",
+      "11. Take care of your wellbeing and EDI, and participate in team outputs and knowledge sharing.\n",
+      "\n",
+      "Remember to update the buddy sign-up sheet if you have any preferences for being a buddy or if you need to change your buddy assignment.\n",
+      "\u001b[0m"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =   260.69 ms /   372 runs   (    0.70 ms per token,  1426.95 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 39191.95 ms /  1566 tokens (   25.03 ms per token,    39.96 tokens per second)\n",
+      "llama_print_timings:        eval time = 44213.05 ms /   371 runs   (  119.17 ms per token,     8.39 tokens per second)\n",
+      "llama_print_timings:       total time = 84170.13 ms\n",
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[38;5;200m\u001b[1;3mResponse: Based on the provided information, a new starter in the REG team at the Turing Institute should take the following actions to prepare for their role and integrate into the team: attend a welcome coffee, schedule a 1-on-1 meeting with the REG Director, complete admin tasks, sign up for the buddy system, review the getting started checklist and first few days pages, familiarize themselves with the REG wiki, and take care of their wellbeing and EDI.\n",
+      "\u001b[0mBased on the provided information, a new starter in the REG team at the Turing Institute should take the following actions to prepare for their role and integrate into the team: attend a welcome coffee, schedule a 1-on-1 meeting with the REG Director, complete admin tasks, sign up for the buddy system, review the getting started checklist and first few days pages, familiarize themselves with the REG wiki, and take care of their wellbeing and EDI.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  9387.84 ms\n",
+      "llama_print_timings:      sample time =   305.88 ms /   435 runs   (    0.70 ms per token,  1422.12 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 19322.01 ms /   945 tokens (   20.45 ms per token,    48.91 tokens per second)\n",
+      "llama_print_timings:        eval time = 47720.26 ms /   434 runs   (  109.95 ms per token,     9.09 tokens per second)\n",
+      "llama_print_timings:       total time = 67951.26 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "response = chat_engine.chat(\n",
+    "    \"I want to specifically know about a new starter in the REG team at the Turing institute\"\n",
+    ")\n",
     "print(response)"
    ]
   }
diff --git a/poetry.lock b/poetry.lock
index eaf1e98d..cf0db93c 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1283,6 +1283,7 @@ files = [
     {file = "greenlet-2.0.2-cp27-cp27m-win32.whl", hash = "sha256:6c3acb79b0bfd4fe733dff8bc62695283b57949ebcca05ae5c129eb606ff2d74"},
     {file = "greenlet-2.0.2-cp27-cp27m-win_amd64.whl", hash = "sha256:283737e0da3f08bd637b5ad058507e578dd462db259f7f6e4c5c365ba4ee9343"},
     {file = "greenlet-2.0.2-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:d27ec7509b9c18b6d73f2f5ede2622441de812e7b1a80bbd446cb0633bd3d5ae"},
+    {file = "greenlet-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d967650d3f56af314b72df7089d96cda1083a7fc2da05b375d2bc48c82ab3f3c"},
     {file = "greenlet-2.0.2-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:30bcf80dda7f15ac77ba5af2b961bdd9dbc77fd4ac6105cee85b0d0a5fcf74df"},
     {file = "greenlet-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:26fbfce90728d82bc9e6c38ea4d038cba20b7faf8a0ca53a9c07b67318d46088"},
     {file = "greenlet-2.0.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9190f09060ea4debddd24665d6804b995a9c122ef5917ab26e1566dcc712ceeb"},
@@ -1291,6 +1292,7 @@ files = [
     {file = "greenlet-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:76ae285c8104046b3a7f06b42f29c7b73f77683df18c49ab5af7983994c2dd91"},
     {file = "greenlet-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:2d4686f195e32d36b4d7cf2d166857dbd0ee9f3d20ae349b6bf8afc8485b3645"},
     {file = "greenlet-2.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c4302695ad8027363e96311df24ee28978162cdcdd2006476c43970b384a244c"},
+    {file = "greenlet-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d4606a527e30548153be1a9f155f4e283d109ffba663a15856089fb55f933e47"},
     {file = "greenlet-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c48f54ef8e05f04d6eff74b8233f6063cb1ed960243eacc474ee73a2ea8573ca"},
     {file = "greenlet-2.0.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a1846f1b999e78e13837c93c778dcfc3365902cfb8d1bdb7dd73ead37059f0d0"},
     {file = "greenlet-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a06ad5312349fec0ab944664b01d26f8d1f05009566339ac6f63f56589bc1a2"},
@@ -1320,6 +1322,7 @@ files = [
     {file = "greenlet-2.0.2-cp37-cp37m-win32.whl", hash = "sha256:3f6ea9bd35eb450837a3d80e77b517ea5bc56b4647f5502cd28de13675ee12f7"},
     {file = "greenlet-2.0.2-cp37-cp37m-win_amd64.whl", hash = "sha256:7492e2b7bd7c9b9916388d9df23fa49d9b88ac0640db0a5b4ecc2b653bf451e3"},
     {file = "greenlet-2.0.2-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:b864ba53912b6c3ab6bcb2beb19f19edd01a6bfcbdfe1f37ddd1778abfe75a30"},
+    {file = "greenlet-2.0.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1087300cf9700bbf455b1b97e24db18f2f77b55302a68272c56209d5587c12d1"},
     {file = "greenlet-2.0.2-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:ba2956617f1c42598a308a84c6cf021a90ff3862eddafd20c3333d50f0edb45b"},
     {file = "greenlet-2.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc3a569657468b6f3fb60587e48356fe512c1754ca05a564f11366ac9e306526"},
     {file = "greenlet-2.0.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8eab883b3b2a38cc1e050819ef06a7e6344d4a990d24d45bc6f2cf959045a45b"},
@@ -1328,6 +1331,7 @@ files = [
     {file = "greenlet-2.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b0ef99cdbe2b682b9ccbb964743a6aca37905fda5e0452e5ee239b1654d37f2a"},
     {file = "greenlet-2.0.2-cp38-cp38-win32.whl", hash = "sha256:b80f600eddddce72320dbbc8e3784d16bd3fb7b517e82476d8da921f27d4b249"},
     {file = "greenlet-2.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:4d2e11331fc0c02b6e84b0d28ece3a36e0548ee1a1ce9ddde03752d9b79bba40"},
+    {file = "greenlet-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8512a0c38cfd4e66a858ddd1b17705587900dd760c6003998e9472b77b56d417"},
     {file = "greenlet-2.0.2-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:88d9ab96491d38a5ab7c56dd7a3cc37d83336ecc564e4e8816dbed12e5aaefc8"},
     {file = "greenlet-2.0.2-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:561091a7be172ab497a3527602d467e2b3fbe75f9e783d8b8ce403fa414f71a6"},
     {file = "greenlet-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:971ce5e14dc5e73715755d0ca2975ac88cfdaefcaab078a284fea6cfabf866df"},
@@ -1688,7 +1692,7 @@ i18n = ["Babel (>=2.7)"]
 name = "joblib"
 version = "1.3.2"
 description = "Lightweight pipelining with Python functions"
-optional = true
+optional = false
 python-versions = ">=3.7"
 files = [
     {file = "joblib-1.3.2-py3-none-any.whl", hash = "sha256:ef4331c65f239985f3f2220ecc87db222f08fd22097a3dd5698f693875f8cbb9"},
@@ -1997,13 +2001,13 @@ retrying = "*"
 
 [[package]]
 name = "llama-index"
-version = "0.8.21"
+version = "0.8.24.post1"
 description = "Interface between LLMs and your data"
 optional = false
 python-versions = "*"
 files = [
-    {file = "llama_index-0.8.21-py3-none-any.whl", hash = "sha256:41cf1e5f7ff856c08f3ed70435a3e29b96226af43d45115276ae3fff57c5f973"},
-    {file = "llama_index-0.8.21.tar.gz", hash = "sha256:1058c0ec574d964f7209cf923687994845aadce5947612cc05121ed16a5f5730"},
+    {file = "llama_index-0.8.24.post1-py3-none-any.whl", hash = "sha256:4b7645a445d394640bad8c66a67483df29f7f0af25c53360cb382075be0c6c34"},
+    {file = "llama_index-0.8.24.post1.tar.gz", hash = "sha256:7cd47cf6ba64d24dbc6db712bcd4834767e0d35890559feee139bd4fa90ad916"},
 ]
 
 [package.dependencies]
@@ -2012,6 +2016,7 @@ dataclasses-json = "*"
 fsspec = ">=2023.5.0"
 langchain = ">=0.0.262"
 nest-asyncio = "*"
+nltk = "*"
 numpy = "*"
 openai = ">=0.26.4"
 pandas = "*"
@@ -2453,7 +2458,7 @@ test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"]
 name = "nltk"
 version = "3.8.1"
 description = "Natural Language Toolkit"
-optional = true
+optional = false
 python-versions = ">=3.7"
 files = [
     {file = "nltk-3.8.1-py3-none-any.whl", hash = "sha256:fd5c9109f976fa86bcadba8f91e47f5e9293bd034474752e92a520f81c93dda5"},
@@ -3362,6 +3367,7 @@ files = [
     {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"},
     {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"},
     {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"},
+    {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"},
     {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"},
     {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"},
     {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"},
@@ -3369,8 +3375,15 @@ files = [
     {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"},
     {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"},
     {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"},
+    {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"},
     {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"},
     {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
+    {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
+    {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
+    {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
+    {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
+    {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"},
     {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"},
     {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"},
     {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"},
@@ -3387,6 +3400,7 @@ files = [
     {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"},
     {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"},
     {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"},
+    {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"},
     {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"},
     {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"},
     {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"},
@@ -3394,6 +3408,7 @@ files = [
     {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"},
     {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"},
     {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"},
+    {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"},
     {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"},
     {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"},
     {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"},
@@ -3788,35 +3803,57 @@ files = [
     {file = "safetensors-0.3.3-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:17f41344d9a075f2f21b289a49a62e98baff54b5754240ba896063bce31626bf"},
     {file = "safetensors-0.3.3-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:f1045f798e1a16a6ced98d6a42ec72936d367a2eec81dc5fade6ed54638cd7d2"},
     {file = "safetensors-0.3.3-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:eaf0e4bc91da13f21ac846a39429eb3f3b7ed06295a32321fa3eb1a59b5c70f3"},
+    {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25149180d4dc8ca48bac2ac3852a9424b466e36336a39659b35b21b2116f96fc"},
+    {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c9e943bf78c39de8865398a71818315e7d5d1af93c7b30d4da3fc852e62ad9bc"},
+    {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cccfcac04a010354e87c7a2fe16a1ff004fc4f6e7ef8efc966ed30122ce00bc7"},
     {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a07121f427e646a50d18c1be0fa1a2cbf6398624c31149cd7e6b35486d72189e"},
     {file = "safetensors-0.3.3-cp310-cp310-win32.whl", hash = "sha256:a85e29cbfddfea86453cc0f4889b4bcc6b9c155be9a60e27be479a34e199e7ef"},
+    {file = "safetensors-0.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:e13adad4a3e591378f71068d14e92343e626cf698ff805f61cdb946e684a218e"},
     {file = "safetensors-0.3.3-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:cbc3312f134baf07334dd517341a4b470b2931f090bd9284888acb7dfaf4606f"},
     {file = "safetensors-0.3.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:d15030af39d5d30c22bcbc6d180c65405b7ea4c05b7bab14a570eac7d7d43722"},
     {file = "safetensors-0.3.3-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:f84a74cbe9859b28e3d6d7715ac1dd3097bebf8d772694098f6d42435245860c"},
     {file = "safetensors-0.3.3-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:10d637423d98ab2e6a4ad96abf4534eb26fcaf8ca3115623e64c00759374e90d"},
     {file = "safetensors-0.3.3-cp311-cp311-macosx_13_0_universal2.whl", hash = "sha256:3b46f5de8b44084aff2e480874c550c399c730c84b2e8ad1bddb062c94aa14e9"},
+    {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e76da691a82dfaf752854fa6d17c8eba0c8466370c5ad8cf1bfdf832d3c7ee17"},
+    {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4e342fd54e66aa9512dd13e410f791e47aa4feeb5f4c9a20882c72f3d272f29"},
+    {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:178fd30b5dc73bce14a39187d948cedd0e5698e2f055b7ea16b5a96c9b17438e"},
     {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e8fdf7407dba44587ed5e79d5de3533d242648e1f2041760b21474bd5ea5c8c"},
     {file = "safetensors-0.3.3-cp311-cp311-win32.whl", hash = "sha256:7d3b744cee8d7a46ffa68db1a2ff1a1a432488e3f7a5a97856fe69e22139d50c"},
+    {file = "safetensors-0.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f579877d30feec9b6ba409d05fa174633a4fc095675a4a82971d831a8bb60b97"},
     {file = "safetensors-0.3.3-cp37-cp37m-macosx_10_11_x86_64.whl", hash = "sha256:2fff5b19a1b462c17322998b2f4b8bce43c16fe208968174d2f3a1446284ceed"},
     {file = "safetensors-0.3.3-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:41adb1d39e8aad04b16879e3e0cbcb849315999fad73bc992091a01e379cb058"},
     {file = "safetensors-0.3.3-cp37-cp37m-macosx_12_0_x86_64.whl", hash = "sha256:0f2b404250b3b877b11d34afcc30d80e7035714a1116a3df56acaca6b6c00096"},
     {file = "safetensors-0.3.3-cp37-cp37m-macosx_13_0_x86_64.whl", hash = "sha256:b43956ef20e9f4f2e648818a9e7b3499edd6b753a0f5526d4f6a6826fbee8446"},
+    {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d61a99b34169981f088ccfbb2c91170843efc869a0a0532f422db7211bf4f474"},
+    {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c0008aab36cd20e9a051a68563c6f80d40f238c2611811d7faa5a18bf3fd3984"},
+    {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:93d54166072b143084fdcd214a080a088050c1bb1651016b55942701b31334e4"},
     {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c32ee08f61cea56a5d62bbf94af95df6040c8ab574afffaeb7b44ae5da1e9e3"},
     {file = "safetensors-0.3.3-cp37-cp37m-win32.whl", hash = "sha256:351600f367badd59f7bfe86d317bb768dd8c59c1561c6fac43cafbd9c1af7827"},
+    {file = "safetensors-0.3.3-cp37-cp37m-win_amd64.whl", hash = "sha256:034717e297849dae1af0a7027a14b8647bd2e272c24106dced64d83e10d468d1"},
     {file = "safetensors-0.3.3-cp38-cp38-macosx_10_11_x86_64.whl", hash = "sha256:8530399666748634bc0b301a6a5523756931b0c2680d188e743d16304afe917a"},
     {file = "safetensors-0.3.3-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:9d741c1f1621e489ba10aa3d135b54202684f6e205df52e219d5eecd673a80c9"},
+    {file = "safetensors-0.3.3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:0c345fd85b4d2093a5109596ff4cd9dfc2e84992e881b4857fbc4a93a3b89ddb"},
     {file = "safetensors-0.3.3-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:69ccee8d05f55cdf76f7e6c87d2bdfb648c16778ef8acfd2ecc495e273e9233e"},
+    {file = "safetensors-0.3.3-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:c08a9a4b7a4ca389232fa8d097aebc20bbd4f61e477abc7065b5c18b8202dede"},
     {file = "safetensors-0.3.3-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:a002868d2e3f49bbe81bee2655a411c24fa1f8e68b703dec6629cb989d6ae42e"},
+    {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3bd2704cb41faa44d3ec23e8b97330346da0395aec87f8eaf9c9e2c086cdbf13"},
+    {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b2951bf3f0ad63df5e6a95263652bd6c194a6eb36fd4f2d29421cd63424c883"},
+    {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:07114cec116253ca2e7230fdea30acf76828f21614afd596d7b5438a2f719bd8"},
     {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ab43aeeb9eadbb6b460df3568a662e6f1911ecc39387f8752afcb6a7d96c087"},
     {file = "safetensors-0.3.3-cp38-cp38-win32.whl", hash = "sha256:f2f59fce31dd3429daca7269a6b06f65e6547a0c248f5116976c3f1e9b73f251"},
+    {file = "safetensors-0.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:c31ca0d8610f57799925bf08616856b39518ab772c65093ef1516762e796fde4"},
     {file = "safetensors-0.3.3-cp39-cp39-macosx_10_11_x86_64.whl", hash = "sha256:59a596b3225c96d59af412385981f17dd95314e3fffdf359c7e3f5bb97730a19"},
     {file = "safetensors-0.3.3-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:82a16e92210a6221edd75ab17acdd468dd958ef5023d9c6c1289606cc30d1479"},
     {file = "safetensors-0.3.3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:98a929e763a581f516373ef31983ed1257d2d0da912a8e05d5cd12e9e441c93a"},
     {file = "safetensors-0.3.3-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:12b83f1986cd16ea0454c636c37b11e819d60dd952c26978310a0835133480b7"},
     {file = "safetensors-0.3.3-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:f439175c827c2f1bbd54df42789c5204a10983a30bc4242bc7deaf854a24f3f0"},
     {file = "safetensors-0.3.3-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:0085be33b8cbcb13079b3a8e131656e05b0bc5e6970530d4c24150f7afd76d70"},
+    {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e3ec70c87b1e910769034206ad5efc051069b105aac1687f6edcd02526767f4"},
+    {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f490132383e5e490e710608f4acffcb98ed37f91b885c7217d3f9f10aaff9048"},
+    {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:79d1b6c7ed5596baf79c80fbce5198c3cdcc521ae6a157699f427aba1a90082d"},
     {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad3cc8006e7a86ee7c88bd2813ec59cd7cc75b03e6fa4af89b9c7b235b438d68"},
     {file = "safetensors-0.3.3-cp39-cp39-win32.whl", hash = "sha256:ab29f54c6b8c301ca05fa014728996bd83aac6e21528f893aaf8945c71f42b6d"},
+    {file = "safetensors-0.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:0fa82004eae1a71e2aa29843ef99de9350e459a0fc2f65fc6ee0da9690933d2d"},
     {file = "safetensors-0.3.3.tar.gz", hash = "sha256:edb7072d788c4f929d0f5735d3a2fb51e5a27f833587828583b7f5747af1a2b8"},
 ]
 
@@ -5101,4 +5138,4 @@ bot = ["adapter-transformers", "datasets", "einops", "faiss-cpu", "gradio", "lan
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "e26a2e65fc12563fce8c7dd0a7cd7da8ad557fb9824eff7e1d3b1f31403310aa"
+content-hash = "56c1ceefcb939dd090c641de32bf25cf5ba2718cceb2148bcf2a24a255b2e945"
diff --git a/pyproject.toml b/pyproject.toml
index 7043c7f9..2e84d427 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,7 +22,9 @@ einops = { version="^0.6.1", optional=true }
 faiss-cpu = { version="^1.7.4", optional=true }
 gradio = { version="^3.34.0", optional=true }
 langchain = "^0.0.278"
-llama-index = "^0.8.14"
+llama-index = "^0.8.24"
+llama-cpp-python = "^0.1.83"
+llama-hub = "^0.0.26"
 nbconvert = { version="^7.5.0", optional=true }
 openai = { version="^0.27.8", optional=true }
 pandas = "^2.0.2"
@@ -38,8 +40,6 @@ torch = [
 transformers = "=4.30.2"
 ipykernel = "^6.23.2"
 xformers = { version="^0.0.20", optional=true }
-llama-cpp-python = "^0.1.83"
-llama-hub = "^0.0.26"
 ipywidgets = "^8.1.0"
 gitpython = "^3.1.36"
 
@@ -81,9 +81,6 @@ url = "https://download.pytorch.org/whl/cpu/"
 priority = "explicit"
 
 
-[tool.poetry.group.bot.dependencies]
-llama-cpp-python = {version = "^0.1.83", optional = true}
-
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"