Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

trying to edit formatting with autopep8 to conform to pep8 #58

Merged
merged 2 commits into from
Nov 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions caches/cache_log.txt
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,17 @@ CacheInfo: hits=176206, misses=6703, maxsize=524288, currsize=29413
omim_mappings cache info:
CacheInfo: hits=80073, misses=2993, maxsize=524288, currsize=23613

Timestamp: 20241120-174511

gpt-4o/results.tsv
score_grounded_result cache info:
CacheInfo: hits=45, misses=30, maxsize=524288 , currsize=29443
omim_mappings cache info:
CacheInfo: hits=615, misses=5, maxsize=524288 , currsize=23618

gpt-4o-mini/results.tsv
score_grounded_result cache info:
CacheInfo: hits=98, misses=70, maxsize=524288 , currsize=29483
omim_mappings cache info:
CacheInfo: hits=1153, misses=14, maxsize=524288 , currsize=23627

Binary file modified caches/omim_mappings_cache.db
Binary file not shown.
Binary file modified caches/score_grounded_result_cache.db
Binary file not shown.
27 changes: 21 additions & 6 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ setuptools = "^69.5.1"
shelved-cache = "^0.3.1"
curategpt = "^0.2.2"
psutil = "^6.1.0"
autopep8 = "^2.3.1"

[tool.poetry.plugins."pheval.plugins"]
template = "malco.runner:MalcoRunner"
Expand Down
47 changes: 29 additions & 18 deletions src/malco/analysis/check_lens.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import pandas as pd
import sys

# from malco.post_process.post_process_results_format import read_raw_result_yaml
from pathlib import Path
from typing import List

import pandas as pd
import yaml
#from malco.post_process.post_process_results_format import read_raw_result_yaml
from pathlib import Path
import sys


def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
"""
Expand All @@ -17,17 +18,20 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
Returns:
dict: Contents of the raw result file.
"""
with open(raw_result_path, 'r') as raw_result:
return list(yaml.safe_load_all(raw_result.read().replace(u'\x04',''))) # Load and convert to list
with open(raw_result_path, "r") as raw_result:
return list(
yaml.safe_load_all(raw_result.read().replace("\x04", ""))
) # Load and convert to list


unique_ppkts = {}
#model=str(sys.argv[1])
# model=str(sys.argv[1])
models = ["gpt-3.5-turbo", "gpt-4-turbo", "gpt-4", "gpt-4o"]
for model in models:
print("==="*10, "\nEvaluating now: ", model, "\n"+"==="*10)
print("===" * 10, "\nEvaluating now: ", model, "\n" + "===" * 10)

yamlfile = f"out_openAI_models/raw_results/multimodel/{model}/results.yaml"
all_results=read_raw_result_yaml(yamlfile)
all_results = read_raw_result_yaml(yamlfile)

counter = 0
labelvec = []
Expand All @@ -36,20 +40,27 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
for this_result in all_results:
extracted_object = this_result.get("extracted_object")
if extracted_object:
label = extracted_object.get('label')
label = extracted_object.get("label")
labelvec.append(label)
terms = extracted_object.get('terms')
terms = extracted_object.get("terms")
if terms:
counter += 1

full_df_file = f"out_openAI_models/multimodel/{model}/results.tsv"
df = pd.read_csv(full_df_file, sep='\t')
num_ppkts = df['label'].nunique()
unique_ppkts[model] = df['label'].unique()
df = pd.read_csv(full_df_file, sep="\t")
num_ppkts = df["label"].nunique()
unique_ppkts[model] = df["label"].unique()
# The first should be equivalent to grepping "raw_" in some results.yaml
print("The number of prompts that have something in results.yaml are: ", len(labelvec))
print("The number of prompts that have a non-empty differential (i.e. term is not None) is:", counter)
print("The number of unique prompts/ppkts with a non-empty differential in results.tsv are:", num_ppkts, "\n")
print(
"The number of prompts that have a non-empty differential (i.e. term is not None) is:",
counter,
)
print(
"The number of unique prompts/ppkts with a non-empty differential in results.tsv are:",
num_ppkts,
"\n",
)

# This we know a posteriori, gpt-4o and gpt-4-turbo both have 5213 phenopackets
# Thus, let's print out what is missing in the others
Expand All @@ -64,4 +75,4 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
if i in unique_ppkts["gpt-3.5-turbo"]:
continue
else:
print(f"Missing ppkt in gpt-3.5-turbo is:\t", i)
print(f"Missing ppkt in gpt-3.5-turbo is:\t", i)
7 changes: 3 additions & 4 deletions src/malco/analysis/count_grounding_failures.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
# Quick check how often the grounding failed
# Need to be in short_letter branch
import pandas as pd

mfile = "../outputdir_all_2024_07_04/en/results.tsv"

df = pd.read_csv(
mfile, sep="\t" #, header=None, names=["description", "term", "label"]
)
df = pd.read_csv(mfile, sep="\t") # , header=None, names=["description", "term", "label"]

terms = df["term"]
counter = 0
Expand All @@ -17,4 +16,4 @@
counter += 1

print(counter)
print(grounded)
print(grounded)
38 changes: 20 additions & 18 deletions src/malco/analysis/count_translated_prompts.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,39 @@
import os
import re

fp = "/Users/leonardo/IdeaProjects/phenopacket2prompt/prompts/"

langs = ["en",
"es",
"de",
"it",
"nl",
"tr",
"zh",
]
langs = [
"en",
"es",
"de",
"it",
"nl",
"tr",
"zh",
]

promptfiles = {}
for lang in langs:
promptfiles[lang] = []
for (dirpath, dirnames, filenames) in os.walk(fp+lang):
for dirpath, dirnames, filenames in os.walk(fp + lang):
for fn in filenames:
fn = fn[0:-14] # TODO may be problematic if there are 2 "_" before "{langcode}-"
fn = fn[0:-14] # TODO may be problematic if there are 2 "_" before "{langcode}-"
# Maybe something along the lines of other script disease_avail_knowledge.py
# ppkt_label = ppkt[0].replace('_en-prompt.txt','')
promptfiles[lang].append(fn)
break

intersection = set()

enset = set(promptfiles['en'])
esset = set(promptfiles['es'])
deset = set(promptfiles['de'])
itset = set(promptfiles['it'])
nlset = set(promptfiles['nl'])
zhset = set(promptfiles['zh'])
trset = set(promptfiles['tr'])
enset = set(promptfiles["en"])
esset = set(promptfiles["es"])
deset = set(promptfiles["de"])
itset = set(promptfiles["it"])
nlset = set(promptfiles["nl"])
zhset = set(promptfiles["zh"])
trset = set(promptfiles["tr"])

intersection = enset & esset & deset & itset & nlset & zhset & trset

print("Common ppkts are: ", len(intersection))
print("Common ppkts are: ", len(intersection))
Loading
Loading