Apply autopep8 formatting to conform to PEP 8 (#58)
* Apply autopep8 formatting to conform to PEP 8

* Fixed several things, among them the flake8 version, the tox.ini formatting, and much of the source code formatting; ran the pipeline once to make sure it still works

Not perfect yet, but merged since it works. Further polish and edits can follow in the future.
leokim-l authored Nov 20, 2024
1 parent bb6d78d commit 11cbc1e
Showing 27 changed files with 746 additions and 629 deletions.
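
For context, autopep8 can be applied via its CLI (e.g. autopep8 --in-place --recursive src/) or programmatically. Below is a minimal programmatic sketch; the exact flags used for this commit are not recorded here, so this is illustrative only:

# Illustrative sketch, not the exact invocation used in this commit:
# autopep8.fix_code() returns a PEP 8-conformant rewrite of a source string.
import autopep8

messy = "x=1;y=[ 1,2 ,3]\n"
print(autopep8.fix_code(messy), end="")
# Expected (roughly):
# x = 1
# y = [1, 2, 3]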
14 changes: 14 additions & 0 deletions caches/cache_log.txt
@@ -164,3 +164,17 @@
 CacheInfo: hits=176206, misses=6703, maxsize=524288, currsize=29413
 omim_mappings cache info:
 CacheInfo: hits=80073, misses=2993, maxsize=524288, currsize=23613
+
+Timestamp: 20241120-174511
+
+gpt-4o/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=45, misses=30, maxsize=524288, currsize=29443
+omim_mappings cache info:
+CacheInfo: hits=615, misses=5, maxsize=524288, currsize=23618
+
+gpt-4o-mini/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=98, misses=70, maxsize=524288, currsize=29483
+omim_mappings cache info:
+CacheInfo: hits=1153, misses=14, maxsize=524288, currsize=23627

Binary file modified caches/omim_mappings_cache.db
Binary file not shown.
Binary file modified caches/score_grounded_result_cache.db
Binary file not shown.
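
The CacheInfo lines in cache_log.txt above have the shape reported by a functools.lru_cache-style cache_info() call (the project itself pulls in shelved-cache, per pyproject.toml below). A minimal sketch of how such counters accumulate; the function body here is a hypothetical stand-in:

from functools import lru_cache


@lru_cache(maxsize=524288)
def omim_mappings(term: str) -> list:
    # Hypothetical stand-in body; the real function resolves OMIM mappings.
    return [term]


omim_mappings("OMIM:154700")  # first call: a miss, result computed and stored
omim_mappings("OMIM:154700")  # second call: a hit, served from the cache
print(omim_mappings.cache_info())
# CacheInfo(hits=1, misses=1, maxsize=524288, currsize=1)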
27 changes: 21 additions & 6 deletions poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -13,6 +13,7 @@ setuptools = "^69.5.1"
 shelved-cache = "^0.3.1"
 curategpt = "^0.2.2"
 psutil = "^6.1.0"
+autopep8 = "^2.3.1"
 
 [tool.poetry.plugins."pheval.plugins"]
 template = "malco.runner:MalcoRunner"
47 changes: 29 additions & 18 deletions src/malco/analysis/check_lens.py
@@ -1,11 +1,12 @@
-import pandas as pd
-import yaml
-#from malco.post_process.post_process_results_format import read_raw_result_yaml
-from pathlib import Path
-import sys
+import sys
+
+# from malco.post_process.post_process_results_format import read_raw_result_yaml
+from pathlib import Path
+from typing import List
+
+import pandas as pd
+import yaml
 
 
 def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
     """
@@ -17,17 +18,20 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
     Returns:
         dict: Contents of the raw result file.
     """
-    with open(raw_result_path, 'r') as raw_result:
-        return list(yaml.safe_load_all(raw_result.read().replace(u'\x04',''))) # Load and convert to list
+    with open(raw_result_path, "r") as raw_result:
+        return list(
+            yaml.safe_load_all(raw_result.read().replace("\x04", ""))
+        )  # Load and convert to list
 
 
 unique_ppkts = {}
-#model=str(sys.argv[1])
+# model=str(sys.argv[1])
 models = ["gpt-3.5-turbo", "gpt-4-turbo", "gpt-4", "gpt-4o"]
 for model in models:
-    print("==="*10, "\nEvaluating now: ", model, "\n"+"==="*10)
+    print("===" * 10, "\nEvaluating now: ", model, "\n" + "===" * 10)
 
     yamlfile = f"out_openAI_models/raw_results/multimodel/{model}/results.yaml"
-    all_results=read_raw_result_yaml(yamlfile)
+    all_results = read_raw_result_yaml(yamlfile)
 
     counter = 0
     labelvec = []
@@ -36,20 +40,27 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
     for this_result in all_results:
         extracted_object = this_result.get("extracted_object")
         if extracted_object:
-            label = extracted_object.get('label')
+            label = extracted_object.get("label")
             labelvec.append(label)
-            terms = extracted_object.get('terms')
+            terms = extracted_object.get("terms")
             if terms:
                 counter += 1
 
     full_df_file = f"out_openAI_models/multimodel/{model}/results.tsv"
-    df = pd.read_csv(full_df_file, sep='\t')
-    num_ppkts = df['label'].nunique()
-    unique_ppkts[model] = df['label'].unique()
+    df = pd.read_csv(full_df_file, sep="\t")
+    num_ppkts = df["label"].nunique()
+    unique_ppkts[model] = df["label"].unique()
     # The first should be equivalent to grepping "raw_" in some results.yaml
     print("The number of prompts that have something in results.yaml are: ", len(labelvec))
-    print("The number of prompts that have a non-empty differential (i.e. term is not None) is:", counter)
-    print("The number of unique prompts/ppkts with a non-empty differential in results.tsv are:", num_ppkts, "\n")
+    print(
+        "The number of prompts that have a non-empty differential (i.e. term is not None) is:",
+        counter,
+    )
+    print(
+        "The number of unique prompts/ppkts with a non-empty differential in results.tsv are:",
+        num_ppkts,
+        "\n",
+    )
 
 # This we know a posteriori, gpt-4o and gpt-4-turbo both have 5213 phenopackets
 # Thus, let's print out what is missing in the others
@@ -64,4 +75,4 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
     if i in unique_ppkts["gpt-3.5-turbo"]:
         continue
     else:
-        print(f"Missing ppkt in gpt-3.5-turbo is:\t", i)
\ No newline at end of file
+        print(f"Missing ppkt in gpt-3.5-turbo is:\t", i)
7 changes: 3 additions & 4 deletions src/malco/analysis/count_grounding_failures.py
@@ -1,11 +1,10 @@
 # Quick check how often the grounding failed
 # Need to be in short_letter branch
 import pandas as pd
 
 mfile = "../outputdir_all_2024_07_04/en/results.tsv"
 
-df = pd.read_csv(
-    mfile, sep="\t" #, header=None, names=["description", "term", "label"]
-)
+df = pd.read_csv(mfile, sep="\t")  # , header=None, names=["description", "term", "label"]
 
 terms = df["term"]
 counter = 0
@@ -17,4 +16,4 @@
         counter += 1
 
 print(counter)
-print(grounded)
\ No newline at end of file
+print(grounded)
38 changes: 20 additions & 18 deletions src/malco/analysis/count_translated_prompts.py
@@ -1,37 +1,39 @@
 import os
 import re
 
 fp = "/Users/leonardo/IdeaProjects/phenopacket2prompt/prompts/"
 
-langs = ["en",
-         "es",
-         "de",
-         "it",
-         "nl",
-         "tr",
-         "zh",
-         ]
+langs = [
+    "en",
+    "es",
+    "de",
+    "it",
+    "nl",
+    "tr",
+    "zh",
+]
 
 promptfiles = {}
 for lang in langs:
     promptfiles[lang] = []
-    for (dirpath, dirnames, filenames) in os.walk(fp+lang):
+    for dirpath, dirnames, filenames in os.walk(fp + lang):
         for fn in filenames:
-            fn = fn[0:-14] # TODO may be problematic if there are 2 "_" before "{langcode}-"
+            fn = fn[0:-14]  # TODO may be problematic if there are 2 "_" before "{langcode}-"
             # Maybe something along the lines of other script disease_avail_knowledge.py
             # ppkt_label = ppkt[0].replace('_en-prompt.txt','')
             promptfiles[lang].append(fn)
         break
 
 intersection = set()
 
-enset = set(promptfiles['en'])
-esset = set(promptfiles['es'])
-deset = set(promptfiles['de'])
-itset = set(promptfiles['it'])
-nlset = set(promptfiles['nl'])
-zhset = set(promptfiles['zh'])
-trset = set(promptfiles['tr'])
+enset = set(promptfiles["en"])
+esset = set(promptfiles["es"])
+deset = set(promptfiles["de"])
+itset = set(promptfiles["it"])
+nlset = set(promptfiles["nl"])
+zhset = set(promptfiles["zh"])
+trset = set(promptfiles["tr"])
 
 intersection = enset & esset & deset & itset & nlset & zhset & trset
 
-print("Common ppkts are: ", len(intersection))
\ No newline at end of file
+print("Common ppkts are: ", len(intersection))
