cohere-ai · sanderland · May 30, 2024 · May 14, 2024
diff --git a/README.md b/README.md
@@ -49,7 +49,8 @@ If you want to contribute results for additional models, please include:
   * A line in `run_verification.sh`
   * All files in `results` that are not `.gitignore`'d
 
+## Model requests
 
-
+If you know of a model that may be interesting to analyze but do not have the resources to run it yourself, feel free to open an issue. Please include the Hugging Face model ID and some information on why it is interesting in terms of tokenization. Keep in mind that the larger the model is, the less likely it is to be prioritized.
 
 
diff --git a/find_tokens_in_training_data.ipynb b/find_tokens_in_training_data.ipynb
@@ -54,7 +54,7 @@
     "# group phrases derived from the same docs\n",
     "for p in phrases:\n",
     "    count = sum(s['content'].count(p) for s in finds[p])\n",
-    "    print(f\"Phrase {repr(p):<40} found in {len(finds[p])}/{len(ds['train'])} samples\\t  hash {hashes[p]}\\t  {count} occurences\")\n"
+    "    print(f\"Phrase {repr(p):<40} found in {len(finds[p])}/{len(ds['train'])} samples\\t  hash {hashes[p]}\\t  {count} occurrences\")\n"
    ]
   },
   {

diff --git a/magikarp/fishing.py b/magikarp/fishing.py
@@ -139,6 +139,7 @@ def load_analyzers(
                 token_infos[token_id][k] = res[k]  # overwritten if not avoid_loading_model
             if "verification" in token_infos[token_id]:
                 classify_verification(token_infos[token_id])
+    write_verification_results(token_infos, model_id)  # mainly for first inspection
 
     if not avoid_loading_model:  # by default we don't trust saved results for easy experimentation
         moda = ModelAnalyzer(

diff --git a/magikarp/unused_tokens.py b/magikarp/unused_tokens.py
@@ -18,6 +18,8 @@
 
 DEEPSEEK_LANG_UNUSED_TOKENS = np.arange(100002, 100015)  # unused utf8
 DEEPSEEK_CODE_UNUSED_TOKENS = np.arange(171, 173)  # f1/f2
+MAP_NEO_UNUSED_TOKENS = np.arange(248, 259)  # byte-fallback tokens for bytes 245-255, which never occur in valid UTF-8
+FUGAKU_UNUSED_TOKENS = np.arange(278,289) # byte-fallback tokens for bytes 245-255, which never occur in valid UTF-8
 
 # Defines reference unused tokens for models
 # optional for most models, but also functions as a kind of registry of models to process
@@ -36,6 +38,7 @@
     "openai-community/gpt2-xl": GPT2_UNUSED_TOKENS,
     "EleutherAI/gpt-j-6b": GPT2_UNUSED_TOKENS,
     "microsoft/phi-2": GPT2_UNUSED_TOKENS,
+    "benjamin/Mistral-7B-v0.1-zett-gpt2": [x+3 for x in GPT2_UNUSED_TOKENS],
     # llama2 and variants
     "meta-llama/Llama-2-13b-hf": LLAMA2_UNUSED_TOKENS,
     "meta-llama/Llama-2-7b-hf": LLAMA2_UNUSED_TOKENS,
@@ -47,7 +50,6 @@
     "EleutherAI/gpt-neox-20b": NEOX_UNUSED_TOKENS,
     "allenai/OLMo-7B-hf": OLMO_UNUSED_TOKENS,  # required since we use secondary metric
     "allenai/OLMo-1.7-7B-hf": OLMO_UNUSED_TOKENS,  # required since we use secondary metric
-    #    "allenai/OLMo-1.7-7B": OLMO_UNUSED_TOKENS,# required since we use secondary metric
     # mistral variants
     "mistralai/Mistral-7B-v0.1": MISTRAL_UNUSED_TOKENS,
     "mistralai/Mistral-7B-Instruct-v0.2": MISTRAL_UNUSED_TOKENS,
@@ -68,7 +70,10 @@
     # others
     "bigcode/starcoder2-15b": STARCODER2_UNUSED_TOKENS,
     "01-ai/Yi-9B": YI_UNUSED_TOKENS,
+    "01-ai/Yi-1.5-9B": YI_UNUSED_TOKENS,
     "ai21labs/Jamba-v0.1": JAMBA_UNUSED_TOKENS,
     "deepseek-ai/deepseek-llm-7b-base": DEEPSEEK_LANG_UNUSED_TOKENS,
     "deepseek-ai/deepseek-coder-33b-base": DEEPSEEK_CODE_UNUSED_TOKENS,
+    "m-a-p/neo_7b": MAP_NEO_UNUSED_TOKENS,
+    "Fugaku-LLM/Fugaku-LLM-13B": FUGAKU_UNUSED_TOKENS,
 }
diff --git a/poetry.lock b/poetry.lock
diff --git a/results/metrics_pairplot_byid/01_ai_Yi_1_5_9B.png b/results/metrics_pairplot_byid/01_ai_Yi_1_5_9B.png
diff --git a/results/metrics_pairplot_byid/Fugaku_LLM_Fugaku_LLM_13B.png b/results/metrics_pairplot_byid/Fugaku_LLM_Fugaku_LLM_13B.png
diff --git a/results/metrics_pairplot_byid/benjamin_Mistral_7B_v0_1_zett_gpt2.png b/results/metrics_pairplot_byid/benjamin_Mistral_7B_v0_1_zett_gpt2.png
diff --git a/results/metrics_pairplot_byid/m_a_p_neo_7b.png b/results/metrics_pairplot_byid/m_a_p_neo_7b.png
diff --git a/results/reports/01_ai_Yi_1_5_9B.md b/results/reports/01_ai_Yi_1_5_9B.md
diff --git a/results/reports/Fugaku_LLM_Fugaku_LLM_13B.md b/results/reports/Fugaku_LLM_Fugaku_LLM_13B.md
diff --git a/results/reports/benjamin_Mistral_7B_v0_1_zett_gpt2.md b/results/reports/benjamin_Mistral_7B_v0_1_zett_gpt2.md
diff --git a/results/reports/m_a_p_neo_7b.md b/results/reports/m_a_p_neo_7b.md
diff --git a/results/verifications/01_ai_Yi_1_5_9B.jsonl.gz b/results/verifications/01_ai_Yi_1_5_9B.jsonl.gz
diff --git a/results/verifications/Fugaku_LLM_Fugaku_LLM_13B.jsonl.gz b/results/verifications/Fugaku_LLM_Fugaku_LLM_13B.jsonl.gz
diff --git a/results/verifications/benjamin_Mistral_7B_v0_1_zett_gpt2.jsonl.gz b/results/verifications/benjamin_Mistral_7B_v0_1_zett_gpt2.jsonl.gz
diff --git a/results/verifications/m_a_p_neo_7b.jsonl.gz b/results/verifications/m_a_p_neo_7b.jsonl.gz
diff --git a/results/verifications/openai_community_gpt2_medium.jsonl.gz b/results/verifications/openai_community_gpt2_medium.jsonl.gz
diff --git a/results/verifications_scatterplot/01_ai_Yi_1_5_9B.png b/results/verifications_scatterplot/01_ai_Yi_1_5_9B.png
diff --git a/results/verifications_scatterplot/Fugaku_LLM_Fugaku_LLM_13B.png b/results/verifications_scatterplot/Fugaku_LLM_Fugaku_LLM_13B.png
diff --git a/results/verifications_scatterplot/benjamin_Mistral_7B_v0_1_zett_gpt2.png b/results/verifications_scatterplot/benjamin_Mistral_7B_v0_1_zett_gpt2.png
diff --git a/results/verifications_scatterplot/m_a_p_neo_7b.png b/results/verifications_scatterplot/m_a_p_neo_7b.png
diff --git a/run_verification.sh b/run_verification.sh
@@ -61,12 +61,17 @@ for arg in "$@"; do
             python magikarp/fishing.py --model_id "Qwen/Qwen1.5-72B-Chat"
             python magikarp/fishing.py --model_id "stabilityai/stablelm-2-12b" --trust-remote-code # missing weights if not trust remote
             python magikarp/fishing.py --model_id "meta-llama/Meta-Llama-3-8B"
+            python generate_results.py "Qwen|Llama-3" --load
+            python generate_results.py "stablelm" --load --trust-remote-code
             ;;
         "misc")
             python magikarp/fishing.py --model_id "01-ai/Yi-9B"
             python magikarp/fishing.py --model_id "bigcode/starcoder2-15b"
             python magikarp/fishing.py --model_id "ai21labs/Jamba-v0.1" --trust_remote_code
-            python generate_results.py "Yi-9B|starcoder2|Jamba" --load
+            python magikarp/fishing.py --model_id "Fugaku-LLM/Fugaku-LLM-13B"
+            python generate_results.py "Yi-9B|starcoder2|Jamba|Fugaku" --load
+            python magikarp/fishing.py --model_id "m-a-p/neo_7b" --trust-remote-code
+            python generate_results.py "m-a-p" --load --trust-remote-code
             ;;
         *)
             echo "Error: Invalid argument '$arg'. Supported arguments are: 'gpt2', 'neox', 'llama2', 'mistral', 'gemma', 'cohere', 'tiktoken', 'misc'"