Apply autopep8 formatting to conform to PEP 8 (#58)
* Apply autopep8 formatting to conform to PEP 8

* Fixed several things, among them the flake8 version, the tox.ini formatting, and much of the source code formatting; ran the pipeline once to make sure it still works

Not perfect yet, but merged since it works. Further polish and edits can follow in the future.
leokim-l authored Nov 20, 2024
1 parent bb6d78d commit 11cbc1e
Showing 27 changed files with 746 additions and 629 deletions.
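
For context, autopep8 can be applied via its CLI (e.g. autopep8 --in-place --recursive src/) or programmatically. Below is a minimal programmatic sketch; the exact flags used for this commit are not recorded here, so this is illustrative only:

# Illustrative sketch, not the exact invocation used in this commit:
# autopep8.fix_code() returns a PEP 8-conformant rewrite of a source string.
import autopep8

messy = "x=1;y=[ 1,2 ,3]\n"
print(autopep8.fix_code(messy), end="")
# Expected (roughly):
# x = 1
# y = [1, 2, 3]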
14 changes: 14 additions & 0 deletions caches/cache_log.txt
@@ -164,3 +164,17 @@
 CacheInfo: hits=176206, misses=6703, maxsize=524288, currsize=29413
 omim_mappings cache info:
 CacheInfo: hits=80073, misses=2993, maxsize=524288, currsize=23613
+
+Timestamp: 20241120-174511
+
+gpt-4o/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=45, misses=30, maxsize=524288, currsize=29443
+omim_mappings cache info:
+CacheInfo: hits=615, misses=5, maxsize=524288, currsize=23618
+
+gpt-4o-mini/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=98, misses=70, maxsize=524288, currsize=29483
+omim_mappings cache info:
+CacheInfo: hits=1153, misses=14, maxsize=524288, currsize=23627

Binary file modified caches/omim_mappings_cache.db
Binary file not shown.
Binary file modified caches/score_grounded_result_cache.db
Binary file not shown.
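
The CacheInfo lines in cache_log.txt above have the shape reported by a functools.lru_cache-style cache_info() call (the project itself pulls in shelved-cache, per pyproject.toml below). A minimal sketch of how such counters accumulate; the function body here is a hypothetical stand-in:

from functools import lru_cache


@lru_cache(maxsize=524288)
def omim_mappings(term: str) -> list:
    # Hypothetical stand-in body; the real function resolves OMIM mappings.
    return [term]


omim_mappings("OMIM:154700")  # first call: a miss, result computed and stored
omim_mappings("OMIM:154700")  # second call: a hit, served from the cache
print(omim_mappings.cache_info())
# CacheInfo(hits=1, misses=1, maxsize=524288, currsize=1)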
27 changes: 21 additions & 6 deletions poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -13,6 +13,7 @@ setuptools = "^69.5.1"
 shelved-cache = "^0.3.1"
 curategpt = "^0.2.2"
 psutil = "^6.1.0"
+autopep8 = "^2.3.1"
 
 [tool.poetry.plugins."pheval.plugins"]
 template = "malco.runner:MalcoRunner"
47 changes: 29 additions & 18 deletions src/malco/analysis/check_lens.py
@@ -1,11 +1,12 @@
-import pandas as pd
-import yaml
-#from malco.post_process.post_process_results_format import read_raw_result_yaml
-from pathlib import Path
-import sys
+import sys
+
+# from malco.post_process.post_process_results_format import read_raw_result_yaml
+from pathlib import Path
+from typing import List
+
+import pandas as pd
+import yaml
 
 
 def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
     """
@@ -17,17 +18,20 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
     Returns:
         dict: Contents of the raw result file.
     """
-    with open(raw_result_path, 'r') as raw_result:
-        return list(yaml.safe_load_all(raw_result.read().replace(u'\x04',''))) # Load and convert to list
+    with open(raw_result_path, "r") as raw_result:
+        return list(
+            yaml.safe_load_all(raw_result.read().replace("\x04", ""))
+        )  # Load and convert to list
 
 
 unique_ppkts = {}
-#model=str(sys.argv[1])
+# model=str(sys.argv[1])
 models = ["gpt-3.5-turbo", "gpt-4-turbo", "gpt-4", "gpt-4o"]
 for model in models:
-    print("==="*10, "\nEvaluating now: ", model, "\n"+"==="*10)
+    print("===" * 10, "\nEvaluating now: ", model, "\n" + "===" * 10)
 
     yamlfile = f"out_openAI_models/raw_results/multimodel/{model}/results.yaml"
-    all_results=read_raw_result_yaml(yamlfile)
+    all_results = read_raw_result_yaml(yamlfile)
 
     counter = 0
     labelvec = []
@@ -36,20 +40,27 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
     for this_result in all_results:
         extracted_object = this_result.get("extracted_object")
         if extracted_object:
-            label = extracted_object.get('label')
+            label = extracted_object.get("label")
             labelvec.append(label)
-            terms = extracted_object.get('terms')
+            terms = extracted_object.get("terms")
             if terms:
                 counter += 1
 
     full_df_file = f"out_openAI_models/multimodel/{model}/results.tsv"
-    df = pd.read_csv(full_df_file, sep='\t')
-    num_ppkts = df['label'].nunique()
-    unique_ppkts[model] = df['label'].unique()
+    df = pd.read_csv(full_df_file, sep="\t")
+    num_ppkts = df["label"].nunique()
+    unique_ppkts[model] = df["label"].unique()
     # The first should be equivalent to grepping "raw_" in some results.yaml
     print("The number of prompts that have something in results.yaml are: ", len(labelvec))
-    print("The number of prompts that have a non-empty differential (i.e. term is not None) is:", counter)
-    print("The number of unique prompts/ppkts with a non-empty differential in results.tsv are:", num_ppkts, "\n")
+    print(
+        "The number of prompts that have a non-empty differential (i.e. term is not None) is:",
+        counter,
+    )
+    print(
+        "The number of unique prompts/ppkts with a non-empty differential in results.tsv are:",
+        num_ppkts,
+        "\n",
+    )
 
 # This we know a posteriori, gpt-4o and gpt-4-turbo both have 5213 phenopackets
 # Thus, let's print out what is missing in the others
@@ -64,4 +75,4 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
     if i in unique_ppkts["gpt-3.5-turbo"]:
         continue
     else:
-        print(f"Missing ppkt in gpt-3.5-turbo is:\t", i)
\ No newline at end of file
+        print(f"Missing ppkt in gpt-3.5-turbo is:\t", i)
7 changes: 3 additions & 4 deletions src/malco/analysis/count_grounding_failures.py
@@ -1,11 +1,10 @@
 # Quick check how often the grounding failed
 # Need to be in short_letter branch
 import pandas as pd
 
 mfile = "../outputdir_all_2024_07_04/en/results.tsv"
 
-df = pd.read_csv(
-    mfile, sep="\t" #, header=None, names=["description", "term", "label"]
-)
+df = pd.read_csv(mfile, sep="\t")  # , header=None, names=["description", "term", "label"]
 
 terms = df["term"]
 counter = 0
@@ -17,4 +16,4 @@
         counter += 1
 
 print(counter)
-print(grounded)
\ No newline at end of file
+print(grounded)
38 changes: 20 additions & 18 deletions src/malco/analysis/count_translated_prompts.py
@@ -1,37 +1,39 @@
 import os
 import re
 
 fp = "/Users/leonardo/IdeaProjects/phenopacket2prompt/prompts/"
 
-langs = ["en",
-         "es",
-         "de",
-         "it",
-         "nl",
-         "tr",
-         "zh",
-         ]
+langs = [
+    "en",
+    "es",
+    "de",
+    "it",
+    "nl",
+    "tr",
+    "zh",
+]
 
 promptfiles = {}
 for lang in langs:
     promptfiles[lang] = []
-    for (dirpath, dirnames, filenames) in os.walk(fp+lang):
+    for dirpath, dirnames, filenames in os.walk(fp + lang):
         for fn in filenames:
-            fn = fn[0:-14] # TODO may be problematic if there are 2 "_" before "{langcode}-"
+            fn = fn[0:-14]  # TODO may be problematic if there are 2 "_" before "{langcode}-"
             # Maybe something along the lines of other script disease_avail_knowledge.py
             # ppkt_label = ppkt[0].replace('_en-prompt.txt','')
             promptfiles[lang].append(fn)
         break
 
 intersection = set()
 
-enset = set(promptfiles['en'])
-esset = set(promptfiles['es'])
-deset = set(promptfiles['de'])
-itset = set(promptfiles['it'])
-nlset = set(promptfiles['nl'])
-zhset = set(promptfiles['zh'])
-trset = set(promptfiles['tr'])
+enset = set(promptfiles["en"])
+esset = set(promptfiles["es"])
+deset = set(promptfiles["de"])
+itset = set(promptfiles["it"])
+nlset = set(promptfiles["nl"])
+zhset = set(promptfiles["zh"])
+trset = set(promptfiles["tr"])
 
 intersection = enset & esset & deset & itset & nlset & zhset & trset
 
-print("Common ppkts are: ", len(intersection))
\ No newline at end of file
+print("Common ppkts are: ", len(intersection))
