
Commit

trying to edit formatting with autopep8 to conform to pep8
leokim-l committed Nov 20, 2024
1 parent bb6d78d commit 988404c
Showing 23 changed files with 415 additions and 371 deletions.
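
For context, a minimal sketch of how a reformatting pass like this one could be applied with autopep8's Python API. The commit itself most likely used the command-line tool (e.g. autopep8 --in-place --recursive src/); the target directory and the max_line_length option below are illustrative assumptions, not taken from the repository.

from pathlib import Path

import autopep8  # pinned in pyproject.toml as autopep8 = "^2.3.1"

SRC_DIR = Path("src/malco")  # assumed target tree; adjust as needed

for py_file in SRC_DIR.rglob("*.py"):
    original = py_file.read_text()
    # fix_code() returns a PEP 8-conformant copy of the source string;
    # the kinds of fixes visible in this diff (spaces after '#', spacing
    # around operators, blank lines around defs) are typical of autopep8.
    fixed = autopep8.fix_code(original, options={"max_line_length": 100})
    if fixed != original:
        py_file.write_text(fixed)
        print(f"reformatted {py_file}")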
27 changes: 21 additions & 6 deletions poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -13,6 +13,7 @@ setuptools = "^69.5.1"
shelved-cache = "^0.3.1"
curategpt = "^0.2.2"
psutil = "^6.1.0"
autopep8 = "^2.3.1"

[tool.poetry.plugins."pheval.plugins"]
template = "malco.runner:MalcoRunner"
18 changes: 10 additions & 8 deletions src/malco/analysis/check_lens.py
@@ -1,12 +1,13 @@
import pandas as pd
import pandas as pd
from typing import List

import pandas as pd
import yaml
#from malco.post_process.post_process_results_format import read_raw_result_yaml
# from malco.post_process.post_process_results_format import read_raw_result_yaml
from pathlib import Path
import sys


def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
"""
Read the raw result file.
@@ -18,16 +19,17 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
dict: Contents of the raw result file.
"""
with open(raw_result_path, 'r') as raw_result:
return list(yaml.safe_load_all(raw_result.read().replace(u'\x04',''))) # Load and convert to list
return list(yaml.safe_load_all(raw_result.read().replace(u'\x04', ''))) # Load and convert to list


unique_ppkts = {}
#model=str(sys.argv[1])
# model=str(sys.argv[1])
models = ["gpt-3.5-turbo", "gpt-4-turbo", "gpt-4", "gpt-4o"]
for model in models:
print("==="*10, "\nEvaluating now: ", model, "\n"+"==="*10)
print("===" * 10, "\nEvaluating now: ", model, "\n" + "===" * 10)

yamlfile = f"out_openAI_models/raw_results/multimodel/{model}/results.yaml"
all_results=read_raw_result_yaml(yamlfile)
all_results = read_raw_result_yaml(yamlfile)

counter = 0
labelvec = []
@@ -64,4 +66,4 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
if i in unique_ppkts["gpt-3.5-turbo"]:
continue
else:
print(f"Missing ppkt in gpt-3.5-turbo is:\t", i)
print(f"Missing ppkt in gpt-3.5-turbo is:\t", i)
6 changes: 3 additions & 3 deletions src/malco/analysis/count_grounding_failures.py
@@ -4,8 +4,8 @@
mfile = "../outputdir_all_2024_07_04/en/results.tsv"

df = pd.read_csv(
mfile, sep="\t" #, header=None, names=["description", "term", "label"]
)
mfile, sep="\t" # , header=None, names=["description", "term", "label"]
)

terms = df["term"]
counter = 0
@@ -17,4 +17,4 @@
counter += 1

print(counter)
print(grounded)
print(grounded)
6 changes: 3 additions & 3 deletions src/malco/analysis/count_translated_prompts.py
@@ -14,9 +14,9 @@
promptfiles = {}
for lang in langs:
promptfiles[lang] = []
for (dirpath, dirnames, filenames) in os.walk(fp+lang):
for (dirpath, dirnames, filenames) in os.walk(fp + lang):
for fn in filenames:
fn = fn[0:-14] # TODO may be problematic if there are 2 "_" before "{langcode}-"
fn = fn[0:-14] # TODO may be problematic if there are 2 "_" before "{langcode}-"
# Maybe something along the lines of other script disease_avail_knowledge.py
# ppkt_label = ppkt[0].replace('_en-prompt.txt','')
promptfiles[lang].append(fn)
@@ -34,4 +34,4 @@

intersection = enset & esset & deset & itset & nlset & zhset & trset

print("Common ppkts are: ", len(intersection))
print("Common ppkts are: ", len(intersection))
68 changes: 38 additions & 30 deletions src/malco/analysis/eval_diagnose_category.py
@@ -17,27 +17,33 @@
outpath = "disease_groups/"

pc_cache_file = outpath + "diagnoses_hereditary_cond"
pc = PersistentCache(LRUCache, pc_cache_file, maxsize=4096)
pc = PersistentCache(LRUCache, pc_cache_file, maxsize=4096)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


def mondo_adapter() -> OboGraphInterface:
"""
Get the adapter for the MONDO ontology.
Returns:
Adapter: The adapter.
"""
return get_adapter("sqlite:obo:mondo")
return get_adapter("sqlite:obo:mondo")

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def mondo_mapping(term, adapter):


def mondo_mapping(term, adapter):
mondos = []
for m in adapter.sssom_mappings([term], source="OMIM"):
if m.predicate_id == "skos:exactMatch":
mondos.append(m.subject_id)
return mondos

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


@cached(pc, key=lambda omim_term, disease_categories, mondo: hashkey(omim_term))
def find_category(omim_term, disease_categories, mondo):
if not isinstance(mondo, MappingProviderInterface):
@@ -47,61 +53,63 @@ def find_category(omim_term, disease_categories, mondo):
if not mondo_term:
print(omim_term)
return None

ancestor_list = mondo.ancestors(mondo_term, # only IS_A->same result
predicates=[IS_A, PART_OF]) #, reflexive=True) # method=GraphTraversalMethod.ENTAILMENT


ancestor_list = mondo.ancestors(mondo_term, # only IS_A->same result
# , reflexive=True) # method=GraphTraversalMethod.ENTAILMENT
predicates=[IS_A, PART_OF])

for mondo_ancestor in ancestor_list:
if mondo_ancestor in disease_categories:
#TODO IMPORTANT! Like this, at the first match the function exits!!
return mondo_ancestor # This should be smt like MONDO:0045024 (cancer or benign tumor)
# TODO IMPORTANT! Like this, at the first match the function exits!!
return mondo_ancestor # This should be smt like MONDO:0045024 (cancer or benign tumor)

print("Special issue following: ")
print(omim_term)


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#=====================================================
# =====================================================
# Script starts here. Name model:
model=str(sys.argv[1])
#=====================================================
model = str(sys.argv[1])
# =====================================================
# Find 42 diseases categories

mondo = mondo_adapter()

disease_categories = mondo.relationships(objects = ["MONDO:0003847"], # hereditary diseases
disease_categories = mondo.relationships(objects=["MONDO:0003847"], # hereditary diseases
predicates=[IS_A, PART_OF]) # only IS_A->same result
#disease_categories = mondo.relationships(objects = ["MONDO:0700096"], # only IS_A->same result
# disease_categories = mondo.relationships(objects = ["MONDO:0700096"], # only IS_A->same result
# predicates=[IS_A, PART_OF])

# make df contingency table with header=diseases_category, correct, incorrect and initialize all to 0.
header = ["label","correct", "incorrect"]
header = ["label", "correct", "incorrect"]
dc_list = [i[0] for i in list(disease_categories)]
contingency_table = pd.DataFrame(0, index=dc_list, columns=header)
for j in dc_list:
contingency_table.loc[j,"label"] = mondo.label(j)
contingency_table.loc[j, "label"] = mondo.label(j)
breakpoint()
filename = f"out_openAI_models/multimodel/{model}/full_df_results.tsv"
# label term score rank correct_term is_correct reciprocal_rank
# PMID_35962790_Family_B_Individual_3__II_6__en-prompt.txt MONDO:0008675 1.0 1.0 OMIM:620545 False 0.0

df = pd.read_csv(
filename, sep="\t"
)
filename, sep="\t"
)

ppkts = df.groupby("label")[["term", "correct_term", "is_correct"]]
count_fails=0
ppkts = df.groupby("label")[["term", "correct_term", "is_correct"]]
count_fails = 0

omim_wo_match = {}
for ppkt in ppkts:
# find this phenopackets category <cat> from OMIM
category_index = find_category(ppkt[1].iloc[0]["correct_term"], dc_list, mondo)
if not category_index:
count_fails += 1
#print(f"Category index for {ppkt[1].iloc[0]["correct_term"]} ")
# print(f"Category index for {ppkt[1].iloc[0]["correct_term"]} ")
omim_wo_match[ppkt[0]] = ppkt[1].iloc[0]["correct_term"]
continue
#cat_ind = find_cat_index(category)
# is there a true? ppkt is tuple ("filename"/"label"/what has been used for grouping, dataframe) --> ppkt[1] is a dataframe
# cat_ind = find_cat_index(category)
# is there a true? ppkt is tuple ("filename"/"label"/what has been used for grouping, dataframe) --> ppkt[1] is a dataframe
if not any(ppkt[1]["is_correct"]):
# no --> increase <cat> incorrect
try:
@@ -117,12 +125,12 @@ def find_category(omim_term, disease_categories, mondo):
print("issue here")
continue

print("\n\n", "==="*15,"\n")
print(f"For whatever reason find_category() returned None in {count_fails} cases, wich follow:\n") # print to file!
#print(contingency_table)
print("\n\n", "===" * 15, "\n")
print(f"For whatever reason find_category() returned None in {count_fails} cases, wich follow:\n") # print to file!
# print(contingency_table)
print("\n\nOf which the following are unique OMIMs:\n", set(list(omim_wo_match.values())))
#print(omim_wo_match, "\n\nOf which the following are unique OMIMs:\n", set(list(omim_wo_match.values())))
# print(omim_wo_match, "\n\nOf which the following are unique OMIMs:\n", set(list(omim_wo_match.values())))

cont_table_file = f"{outpath}{model}.tsv"
# Will overwrite
#contingency_table.to_csv(cont_table_file, sep='\t')
# contingency_table.to_csv(cont_table_file, sep='\t')
10 changes: 5 additions & 5 deletions src/malco/analysis/monarchKG_classifier.py
@@ -1,8 +1,8 @@
# Monarch KG
# Idea: for each ppkt, make contingency table NF/F and in box write
# Monarch KG
# Idea: for each ppkt, make contingency table NF/F and in box write
# average number of connections. Thus 7 K of entries with num_edges, y=0,1
# Think about mouse weight and obesity as an example.
import numpy
import numpy
from neo4j import GraphDatabase

# Connect to the Neo4j database
@@ -11,7 +11,7 @@

# From results take ppkts ground truth correct result and 0,1
# Map OMIM to MONDO
#
#
# Need to decide what to project out. Maybe simply all edges connected to the MONDO terms I have.
# At this point for each MONDO term I have count the edges
# Define the Cypher query
@@ -26,4 +26,4 @@
with driver.session() as session:
results = session.run(query)
for record in results:
data.append(record)
data.append(record)
22 changes: 11 additions & 11 deletions src/malco/analysis/test_curate_script.py
@@ -1,9 +1,9 @@
import yaml
import yaml
from pathlib import Path
from typing import List
from malco.post_process.extended_scoring import clean_service_answer, ground_diagnosis_text_to_mondo
from oaklib import get_adapter


def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
"""
@@ -16,7 +16,7 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
dict: Contents of the raw result file.
"""
with open(raw_result_path, 'r') as raw_result:
return list(yaml.safe_load_all(raw_result.read().replace(u'\x04',''))) # Load and convert to list
return list(yaml.safe_load_all(raw_result.read().replace(u'\x04', ''))) # Load and convert to list


annotator = get_adapter("sqlite:obo:mondo")
@@ -29,25 +29,25 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
j = 0
for this_result in all_results:
extracted_object = this_result.get("extracted_object")
if extracted_object: # Necessary because this is how I keep track of multiple runs
if extracted_object: # Necessary because this is how I keep track of multiple runs
ontogpt_text = this_result.get("input_text")
# its a single string, should be parseable through curategpt
cleaned_text = clean_service_answer(ontogpt_text)
assert cleaned_text != "", "Cleaning failed: the cleaned text is empty."
result = ground_diagnosis_text_to_mondo(annotator, cleaned_text, verbose=False)

label = extracted_object.get('label') # pubmed id
label = extracted_object.get('label') # pubmed id
# terms will now ONLY contain MONDO IDs OR 'N/A'. The latter should be dealt with downstream
terms = [i[1][0][0] for i in result]
#terms = extracted_object.get('terms') # list of strings, the mondo id or description
terms = [i[1][0][0] for i in result]
# terms = extracted_object.get('terms') # list of strings, the mondo id or description
if terms:
# Note, the if allows for rerunning ppkts that failed due to connection issues
# We can have multiple identical ppkts/prompts in results.yaml as long as only one has a terms field
# Note, the if allows for rerunning ppkts that failed due to connection issues
# We can have multiple identical ppkts/prompts in results.yaml as long as only one has a terms field
num_terms = len(terms)
score = [1 / (i + 1) for i in range(num_terms)] # score is reciprocal rank
rank_list = [ i+1 for i in range(num_terms)]
rank_list = [i + 1 for i in range(num_terms)]
for term, scr, rank in zip(terms, score, rank_list):
data.append({'label': label, 'term': term, 'score': scr, 'rank': rank})
if j>20:
if j > 20:
break
j += 1
