monarch-initiative · leokim-l · Nov 20, 2024 · Nov 20, 2024 · Nov 20, 2024
diff --git a/caches/cache_log.txt b/caches/cache_log.txt
@@ -164,3 +164,17 @@ CacheInfo: hits=176206, misses=6703, maxsize=524288, currsize=29413
 omim_mappings cache info:
 CacheInfo: hits=80073, misses=2993, maxsize=524288, currsize=23613
 
+Timestamp: 20241120-174511
+
+gpt-4o/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=45, misses=30, maxsize=524288        , currsize=29443
+omim_mappings cache info:
+CacheInfo: hits=615, misses=5, maxsize=524288        , currsize=23618
+
+gpt-4o-mini/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=98, misses=70, maxsize=524288        , currsize=29483
+omim_mappings cache info:
+CacheInfo: hits=1153, misses=14, maxsize=524288        , currsize=23627
+
diff --git a/caches/omim_mappings_cache.db b/caches/omim_mappings_cache.db
diff --git a/caches/score_grounded_result_cache.db b/caches/score_grounded_result_cache.db
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -13,6 +13,7 @@ setuptools = "^69.5.1"
 shelved-cache = "^0.3.1"
 curategpt = "^0.2.2"
 psutil = "^6.1.0"
+autopep8 = "^2.3.1"
 
 [tool.poetry.plugins."pheval.plugins"]
 template = "malco.runner:MalcoRunner"

diff --git a/src/malco/analysis/check_lens.py b/src/malco/analysis/check_lens.py
@@ -1,11 +1,12 @@
-import pandas as pd 
+import sys
+
+# read_raw_result_yaml is defined below instead of: from malco.post_process.post_process_results_format import read_raw_result_yaml
+from pathlib import Path
 from typing import List
 
 import pandas as pd
 import yaml
-#from malco.post_process.post_process_results_format import read_raw_result_yaml
-from pathlib import Path
-import sys
+
 
 def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
     """
@@ -17,17 +18,20 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
     Returns:
         dict: Contents of the raw result file.
     """
-    with open(raw_result_path, 'r') as raw_result:
-        return list(yaml.safe_load_all(raw_result.read().replace(u'\x04','')))  # Load and convert to list
+    with open(raw_result_path, "r") as raw_result:
+        return list(
+            yaml.safe_load_all(raw_result.read().replace("\x04", ""))
+        )  # Load and convert to list
+
 
 unique_ppkts = {}
-#model=str(sys.argv[1])
+# Single-model alternative (model name taken from the command line): model = str(sys.argv[1])
 models = ["gpt-3.5-turbo", "gpt-4-turbo", "gpt-4", "gpt-4o"]
 for model in models:
-    print("==="*10, "\nEvaluating now: ", model, "\n"+"==="*10)
-   
+    print("===" * 10, "\nEvaluating now: ", model, "\n" + "===" * 10)
+
     yamlfile = f"out_openAI_models/raw_results/multimodel/{model}/results.yaml"
-    all_results=read_raw_result_yaml(yamlfile)
+    all_results = read_raw_result_yaml(yamlfile)
 
     counter = 0
     labelvec = []
@@ -36,20 +40,27 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
     for this_result in all_results:
         extracted_object = this_result.get("extracted_object")
         if extracted_object:
-            label = extracted_object.get('label')
+            label = extracted_object.get("label")
             labelvec.append(label)
-            terms = extracted_object.get('terms')
+            terms = extracted_object.get("terms")
             if terms:
                 counter += 1
 
     full_df_file = f"out_openAI_models/multimodel/{model}/results.tsv"
-    df = pd.read_csv(full_df_file, sep='\t')
-    num_ppkts = df['label'].nunique()
-    unique_ppkts[model] = df['label'].unique()
+    df = pd.read_csv(full_df_file, sep="\t")
+    num_ppkts = df["label"].nunique()
+    unique_ppkts[model] = df["label"].unique()
     # The first should be equivalent to grepping "raw_" in some results.yaml
     print("The number of prompts that have something in results.yaml are: ", len(labelvec))
-    print("The number of prompts that have a non-empty differential (i.e. term is not None) is:", counter)
-    print("The number of unique prompts/ppkts with a non-empty differential in results.tsv are:", num_ppkts, "\n")
+    print(
+        "The number of prompts that have a non-empty differential (i.e. term is not None) is:",
+        counter,
+    )
+    print(
+        "The number of unique prompts/ppkts with a non-empty differential in results.tsv are:",
+        num_ppkts,
+        "\n",
+    )
 
 # This we know a posteriori, gpt-4o and gpt-4-turbo both have 5213 phenopackets
 # Thus, let's print out what is missing in the others
@@ -64,4 +75,4 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
     if i in unique_ppkts["gpt-3.5-turbo"]:
         continue
     else:
-        print(f"Missing ppkt in gpt-3.5-turbo is:\t", i)
+        print(f"Missing ppkt in gpt-3.5-turbo is:\t", i)
diff --git a/src/malco/analysis/count_grounding_failures.py b/src/malco/analysis/count_grounding_failures.py
@@ -1,11 +1,10 @@
 # Quick check how often the grounding failed
 # Need to be in short_letter branch
 import pandas as pd
+
 mfile = "../outputdir_all_2024_07_04/en/results.tsv"
 
-df = pd.read_csv(
-        mfile, sep="\t" #, header=None, names=["description", "term", "label"]
-    )
+df = pd.read_csv(mfile, sep="\t")  # , header=None, names=["description", "term", "label"]
 
 terms = df["term"]
 counter = 0
@@ -17,4 +16,4 @@
         counter += 1
 
 print(counter)
-print(grounded)
+print(grounded)
diff --git a/src/malco/analysis/count_translated_prompts.py b/src/malco/analysis/count_translated_prompts.py
@@ -1,37 +1,39 @@
 import os
 import re
+
 fp = "/Users/leonardo/IdeaProjects/phenopacket2prompt/prompts/"
 
-langs = ["en",
-         "es",
-         "de",
-         "it",
-         "nl",
-         "tr",
-         "zh",
-         ]
+langs = [
+    "en",
+    "es",
+    "de",
+    "it",
+    "nl",
+    "tr",
+    "zh",
+]
 
 promptfiles = {}
 for lang in langs:
     promptfiles[lang] = []
-    for (dirpath, dirnames, filenames) in os.walk(fp+lang):
+    for dirpath, dirnames, filenames in os.walk(fp + lang):
         for fn in filenames:
-            fn = fn[0:-14] # TODO may be problematic if there are 2 "_" before "{langcode}-"
+            fn = fn[0:-14]  # TODO may be problematic if there are 2 "_" before "{langcode}-"
             # Maybe something along the lines of other script disease_avail_knowledge.py
             # ppkt_label = ppkt[0].replace('_en-prompt.txt','')
             promptfiles[lang].append(fn)
         break
 
 intersection = set()
 
-enset = set(promptfiles['en'])
-esset = set(promptfiles['es'])
-deset = set(promptfiles['de'])
-itset = set(promptfiles['it'])
-nlset = set(promptfiles['nl'])
-zhset = set(promptfiles['zh'])
-trset = set(promptfiles['tr'])
+enset = set(promptfiles["en"])
+esset = set(promptfiles["es"])
+deset = set(promptfiles["de"])
+itset = set(promptfiles["it"])
+nlset = set(promptfiles["nl"])
+zhset = set(promptfiles["zh"])
+trset = set(promptfiles["tr"])
 
 intersection = enset & esset & deset & itset & nlset & zhset & trset
 
-print("Common ppkts are: ", len(intersection))
+print("Common ppkts are: ", len(intersection))