
Commit

trying to edit formatting with autopep8 to conform to pep8
leokim-l committed Nov 20, 2024
1 parent bb6d78d commit 988404c
Showing 23 changed files with 415 additions and 371 deletions.
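
For context, a minimal sketch of how a reformatting pass like this one could be applied with autopep8's Python API. The commit itself most likely used the command-line tool (e.g. autopep8 --in-place --recursive src/); the target directory and the max_line_length option below are illustrative assumptions, not taken from the repository.

from pathlib import Path

import autopep8  # pinned in pyproject.toml as autopep8 = "^2.3.1"

SRC_DIR = Path("src/malco")  # assumed target tree; adjust as needed

for py_file in SRC_DIR.rglob("*.py"):
    original = py_file.read_text()
    # fix_code() returns a PEP 8-conformant copy of the source string;
    # the kinds of fixes visible in this diff (spaces after '#', spacing
    # around operators, blank lines around defs) are typical of autopep8.
    fixed = autopep8.fix_code(original, options={"max_line_length": 100})
    if fixed != original:
        py_file.write_text(fixed)
        print(f"reformatted {py_file}")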
27 changes: 21 additions & 6 deletions poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -13,6 +13,7 @@ setuptools = "^69.5.1"
shelved-cache = "^0.3.1"
curategpt = "^0.2.2"
psutil = "^6.1.0"
autopep8 = "^2.3.1"

[tool.poetry.plugins."pheval.plugins"]
template = "malco.runner:MalcoRunner"
18 changes: 10 additions & 8 deletions src/malco/analysis/check_lens.py
@@ -1,12 +1,13 @@
import pandas as pd
import pandas as pd
from typing import List

import pandas as pd
import yaml
#from malco.post_process.post_process_results_format import read_raw_result_yaml
# from malco.post_process.post_process_results_format import read_raw_result_yaml
from pathlib import Path
import sys


def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
"""
Read the raw result file.
@@ -18,16 +19,17 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
dict: Contents of the raw result file.
"""
with open(raw_result_path, 'r') as raw_result:
return list(yaml.safe_load_all(raw_result.read().replace(u'\x04',''))) # Load and convert to list
return list(yaml.safe_load_all(raw_result.read().replace(u'\x04', ''))) # Load and convert to list


unique_ppkts = {}
#model=str(sys.argv[1])
# model=str(sys.argv[1])
models = ["gpt-3.5-turbo", "gpt-4-turbo", "gpt-4", "gpt-4o"]
for model in models:
print("==="*10, "\nEvaluating now: ", model, "\n"+"==="*10)
print("===" * 10, "\nEvaluating now: ", model, "\n" + "===" * 10)

yamlfile = f"out_openAI_models/raw_results/multimodel/{model}/results.yaml"
all_results=read_raw_result_yaml(yamlfile)
all_results = read_raw_result_yaml(yamlfile)

counter = 0
labelvec = []
@@ -64,4 +66,4 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
if i in unique_ppkts["gpt-3.5-turbo"]:
continue
else:
print(f"Missing ppkt in gpt-3.5-turbo is:\t", i)
print(f"Missing ppkt in gpt-3.5-turbo is:\t", i)
6 changes: 3 additions & 3 deletions src/malco/analysis/count_grounding_failures.py
@@ -4,8 +4,8 @@
mfile = "../outputdir_all_2024_07_04/en/results.tsv"

df = pd.read_csv(
mfile, sep="\t" #, header=None, names=["description", "term", "label"]
)
mfile, sep="\t" # , header=None, names=["description", "term", "label"]
)

terms = df["term"]
counter = 0
@@ -17,4 +17,4 @@
counter += 1

print(counter)
print(grounded)
print(grounded)
6 changes: 3 additions & 3 deletions src/malco/analysis/count_translated_prompts.py
@@ -14,9 +14,9 @@
promptfiles = {}
for lang in langs:
promptfiles[lang] = []
for (dirpath, dirnames, filenames) in os.walk(fp+lang):
for (dirpath, dirnames, filenames) in os.walk(fp + lang):
for fn in filenames:
fn = fn[0:-14] # TODO may be problematic if there are 2 "_" before "{langcode}-"
fn = fn[0:-14] # TODO may be problematic if there are 2 "_" before "{langcode}-"
# Maybe something along the lines of other script disease_avail_knowledge.py
# ppkt_label = ppkt[0].replace('_en-prompt.txt','')
promptfiles[lang].append(fn)
@@ -34,4 +34,4 @@

intersection = enset & esset & deset & itset & nlset & zhset & trset

print("Common ppkts are: ", len(intersection))
print("Common ppkts are: ", len(intersection))
68 changes: 38 additions & 30 deletions src/malco/analysis/eval_diagnose_category.py
@@ -17,27 +17,33 @@
outpath = "disease_groups/"

pc_cache_file = outpath + "diagnoses_hereditary_cond"
pc = PersistentCache(LRUCache, pc_cache_file, maxsize=4096)
pc = PersistentCache(LRUCache, pc_cache_file, maxsize=4096)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


def mondo_adapter() -> OboGraphInterface:
"""
Get the adapter for the MONDO ontology.
Returns:
Adapter: The adapter.
"""
return get_adapter("sqlite:obo:mondo")
return get_adapter("sqlite:obo:mondo")

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def mondo_mapping(term, adapter):


def mondo_mapping(term, adapter):
mondos = []
for m in adapter.sssom_mappings([term], source="OMIM"):
if m.predicate_id == "skos:exactMatch":
mondos.append(m.subject_id)
return mondos

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


@cached(pc, key=lambda omim_term, disease_categories, mondo: hashkey(omim_term))
def find_category(omim_term, disease_categories, mondo):
if not isinstance(mondo, MappingProviderInterface):
@@ -47,61 +53,63 @@ def find_category(omim_term, disease_categories, mondo):
if not mondo_term:
print(omim_term)
return None

ancestor_list = mondo.ancestors(mondo_term, # only IS_A->same result
predicates=[IS_A, PART_OF]) #, reflexive=True) # method=GraphTraversalMethod.ENTAILMENT


ancestor_list = mondo.ancestors(mondo_term, # only IS_A->same result
# , reflexive=True) # method=GraphTraversalMethod.ENTAILMENT
predicates=[IS_A, PART_OF])

for mondo_ancestor in ancestor_list:
if mondo_ancestor in disease_categories:
#TODO IMPORTANT! Like this, at the first match the function exits!!
return mondo_ancestor # This should be smt like MONDO:0045024 (cancer or benign tumor)
# TODO IMPORTANT! Like this, at the first match the function exits!!
return mondo_ancestor # This should be smt like MONDO:0045024 (cancer or benign tumor)

print("Special issue following: ")
print(omim_term)


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#=====================================================
# =====================================================
# Script starts here. Name model:
model=str(sys.argv[1])
#=====================================================
model = str(sys.argv[1])
# =====================================================
# Find 42 diseases categories

mondo = mondo_adapter()

disease_categories = mondo.relationships(objects = ["MONDO:0003847"], # hereditary diseases
disease_categories = mondo.relationships(objects=["MONDO:0003847"], # hereditary diseases
predicates=[IS_A, PART_OF]) # only IS_A->same result
#disease_categories = mondo.relationships(objects = ["MONDO:0700096"], # only IS_A->same result
# disease_categories = mondo.relationships(objects = ["MONDO:0700096"], # only IS_A->same result
# predicates=[IS_A, PART_OF])

# make df contingency table with header=diseases_category, correct, incorrect and initialize all to 0.
header = ["label","correct", "incorrect"]
header = ["label", "correct", "incorrect"]
dc_list = [i[0] for i in list(disease_categories)]
contingency_table = pd.DataFrame(0, index=dc_list, columns=header)
for j in dc_list:
contingency_table.loc[j,"label"] = mondo.label(j)
contingency_table.loc[j, "label"] = mondo.label(j)
breakpoint()
filename = f"out_openAI_models/multimodel/{model}/full_df_results.tsv"
# label term score rank correct_term is_correct reciprocal_rank
# PMID_35962790_Family_B_Individual_3__II_6__en-prompt.txt MONDO:0008675 1.0 1.0 OMIM:620545 False 0.0

df = pd.read_csv(
filename, sep="\t"
)
filename, sep="\t"
)

ppkts = df.groupby("label")[["term", "correct_term", "is_correct"]]
count_fails=0
ppkts = df.groupby("label")[["term", "correct_term", "is_correct"]]
count_fails = 0

omim_wo_match = {}
for ppkt in ppkts:
# find this phenopackets category <cat> from OMIM
category_index = find_category(ppkt[1].iloc[0]["correct_term"], dc_list, mondo)
if not category_index:
count_fails += 1
#print(f"Category index for {ppkt[1].iloc[0]["correct_term"]} ")
# print(f"Category index for {ppkt[1].iloc[0]["correct_term"]} ")
omim_wo_match[ppkt[0]] = ppkt[1].iloc[0]["correct_term"]
continue
#cat_ind = find_cat_index(category)
# is there a true? ppkt is tuple ("filename"/"label"/what has been used for grouping, dataframe) --> ppkt[1] is a dataframe
# cat_ind = find_cat_index(category)
# is there a true? ppkt is tuple ("filename"/"label"/what has been used for grouping, dataframe) --> ppkt[1] is a dataframe
if not any(ppkt[1]["is_correct"]):
# no --> increase <cat> incorrect
try:
@@ -117,12 +125,12 @@ def find_category(omim_term, disease_categories, mondo):
print("issue here")
continue

print("\n\n", "==="*15,"\n")
print(f"For whatever reason find_category() returned None in {count_fails} cases, wich follow:\n") # print to file!
#print(contingency_table)
print("\n\n", "===" * 15, "\n")
print(f"For whatever reason find_category() returned None in {count_fails} cases, wich follow:\n") # print to file!
# print(contingency_table)
print("\n\nOf which the following are unique OMIMs:\n", set(list(omim_wo_match.values())))
#print(omim_wo_match, "\n\nOf which the following are unique OMIMs:\n", set(list(omim_wo_match.values())))
# print(omim_wo_match, "\n\nOf which the following are unique OMIMs:\n", set(list(omim_wo_match.values())))

cont_table_file = f"{outpath}{model}.tsv"
# Will overwrite
#contingency_table.to_csv(cont_table_file, sep='\t')
# contingency_table.to_csv(cont_table_file, sep='\t')
10 changes: 5 additions & 5 deletions src/malco/analysis/monarchKG_classifier.py
@@ -1,8 +1,8 @@
# Monarch KG
# Idea: for each ppkt, make contingency table NF/F and in box write
# Monarch KG
# Idea: for each ppkt, make contingency table NF/F and in box write
# average number of connections. Thus 7 K of entries with num_edges, y=0,1
# Think about mouse weight and obesity as an example.
import numpy
import numpy
from neo4j import GraphDatabase

# Connect to the Neo4j database
@@ -11,7 +11,7 @@

# From results take ppkts ground truth correct result and 0,1
# Map OMIM to MONDO
#
#
# Need to decide what to project out. Maybe simply all edges connected to the MONDO terms I have.
# At this point for each MONDO term I have count the edges
# Define the Cypher query
@@ -26,4 +26,4 @@
with driver.session() as session:
results = session.run(query)
for record in results:
data.append(record)
data.append(record)
22 changes: 11 additions & 11 deletions src/malco/analysis/test_curate_script.py
@@ -1,9 +1,9 @@
import yaml
import yaml
from pathlib import Path
from typing import List
from malco.post_process.extended_scoring import clean_service_answer, ground_diagnosis_text_to_mondo
from oaklib import get_adapter


def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
"""
@@ -16,7 +16,7 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
dict: Contents of the raw result file.
"""
with open(raw_result_path, 'r') as raw_result:
return list(yaml.safe_load_all(raw_result.read().replace(u'\x04',''))) # Load and convert to list
return list(yaml.safe_load_all(raw_result.read().replace(u'\x04', ''))) # Load and convert to list


annotator = get_adapter("sqlite:obo:mondo")
@@ -29,25 +29,25 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
j = 0
for this_result in all_results:
extracted_object = this_result.get("extracted_object")
if extracted_object: # Necessary because this is how I keep track of multiple runs
if extracted_object: # Necessary because this is how I keep track of multiple runs
ontogpt_text = this_result.get("input_text")
# its a single string, should be parseable through curategpt
cleaned_text = clean_service_answer(ontogpt_text)
assert cleaned_text != "", "Cleaning failed: the cleaned text is empty."
result = ground_diagnosis_text_to_mondo(annotator, cleaned_text, verbose=False)

label = extracted_object.get('label') # pubmed id
label = extracted_object.get('label') # pubmed id
# terms will now ONLY contain MONDO IDs OR 'N/A'. The latter should be dealt with downstream
terms = [i[1][0][0] for i in result]
#terms = extracted_object.get('terms') # list of strings, the mondo id or description
terms = [i[1][0][0] for i in result]
# terms = extracted_object.get('terms') # list of strings, the mondo id or description
if terms:
# Note, the if allows for rerunning ppkts that failed due to connection issues
# We can have multiple identical ppkts/prompts in results.yaml as long as only one has a terms field
# Note, the if allows for rerunning ppkts that failed due to connection issues
# We can have multiple identical ppkts/prompts in results.yaml as long as only one has a terms field
num_terms = len(terms)
score = [1 / (i + 1) for i in range(num_terms)] # score is reciprocal rank
rank_list = [ i+1 for i in range(num_terms)]
rank_list = [i + 1 for i in range(num_terms)]
for term, scr, rank in zip(terms, score, rank_list):
data.append({'label': label, 'term': term, 'score': scr, 'rank': rank})
if j>20:
if j > 20:
break
j += 1
