Added support for multiple models, along the lines of multiple languages. (#40)

* Added multiple models, along the lines of multiple languages. Several functions are now called twice; output goes into `multilingual` and `multimodel` within the output and raw_results dirs. Two hardcoded model names in ontogpt have to be edited in order for this to work.
* Deleted comments.
* Polished and finished the multimodel plots etc.
monarch-initiative · Aug 2, 2024 · d90eec6 · d90eec6
1 parent b23dd69
commit d90eec6
Show file tree

Hide file tree

Showing 5 changed files with 89 additions and 45 deletions.
diff --git a/src/malco/post_process/compute_mrr.py b/src/malco/post_process/compute_mrr.py
@@ -23,14 +23,14 @@ def mondo_adapter() -> OboGraphInterface:
     """
     return get_adapter("sqlite:obo:mondo") 
 
-def compute_mrr(output_dir, prompt_dir, correct_answer_file,
+def compute_mrr(comparing, output_dir, prompt_dir, correct_answer_file,
                 raw_results_dir) -> Path:
     # Read in results TSVs from self.output_dir that match glob results*tsv 
     results_data = []
     results_files = []
     num_ppkt = 0
 
-    for subdir, dirs, files in os.walk(output_dir):
+    for subdir, dirs, files in os.walk(output_dir): # maybe change this so it only looks into multilingual/multimodel? I.e. use that as outputdir...?
         for filename in files:
             if filename.startswith("result") and filename.endswith(".tsv"):
                 file_path = os.path.join(subdir, filename)
@@ -49,7 +49,7 @@ def compute_mrr(output_dir, prompt_dir, correct_answer_file,
     label_to_correct_term = answers.set_index("label")["term"].to_dict()
     # Calculate the Mean Reciprocal Rank (MRR) for each file
     mrr_scores = []
-    header = ["lang", "n1", "n2", "n3", "n4", "n5", "n6", "n7", "n8", "n9", "n10", "n10p", "nf"]
+    header = [comparing, "n1", "n2", "n3", "n4", "n5", "n6", "n7", "n8", "n9", "n10", "n10p", "nf"]
     rank_df = pd.DataFrame(0, index=np.arange(len(results_files)), columns=header)
 
     cache_file = output_dir / "cache_log.txt"
@@ -85,15 +85,15 @@ def compute_mrr(output_dir, prompt_dir, correct_answer_file,
             )
 
             # Save full data frame
-            full_df_file = raw_results_dir / results_files[i][0:2] / "full_df_results.tsv"
+            full_df_file = raw_results_dir / results_files[i].split("/")[0] / "full_df_results.tsv"
             df.to_csv(full_df_file, sep='\t', index=False)
 
             # Calculate MRR for this file
             mrr = df.groupby("label")["reciprocal_rank"].max().mean()
             mrr_scores.append(mrr)
 
             # Calculate top<n> of each rank
-            rank_df.loc[i,"lang"] = results_files[i][0:2]
+            rank_df.loc[i, comparing] = results_files[i].split("/")[0]
 
             ppkts = df.groupby("label")[["rank","is_correct"]] 
             index_matches = df.index[df['is_correct']]
@@ -133,12 +133,12 @@ def compute_mrr(output_dir, prompt_dir, correct_answer_file,
 
     print("MRR scores are:\n")
     print(mrr_scores)
-    plot_data_file = plot_dir / "plotting_data.tsv"
+    mrr_file = plot_dir / "mrr_result.tsv"
 
     # write out results for plotting 
-    with plot_data_file.open('w', newline = '') as dat:
+    with mrr_file.open('w', newline = '') as dat:
         writer = csv.writer(dat, quoting = csv.QUOTE_NONNUMERIC, delimiter = '\t', lineterminator='\n')
         writer.writerow(results_files)
         writer.writerow(mrr_scores)
 
-    return plot_data_file, plot_dir, num_ppkt, topn_file
+    return mrr_file, plot_dir, num_ppkt, topn_file
diff --git a/src/malco/post_process/generate_plots.py b/src/malco/post_process/generate_plots.py
@@ -6,12 +6,16 @@
 
 # Make a nice plot, use it as function or as script
 
-def make_plots(plot_data_file, plot_dir, languages, num_ppkt, topn_file):
-    with plot_data_file.open('r', newline = '') as f:
+def make_plots(mrr_file, plot_dir, languages, num_ppkt, models, topn_file, comparing):
+    if comparing=="model":
+        name_string = str(len(models))
+    else:
+        name_string = str(len(languages))
+
+    with mrr_file.open('r', newline = '') as f:
         lines = csv.reader(f, quoting = csv.QUOTE_NONNUMERIC, delimiter = '\t', lineterminator='\n')
         results_files = next(lines)
         mrr_scores = next(lines)
-        #lines = f.read().splitlines()
 
     print(results_files)
     print(mrr_scores)
@@ -21,8 +25,9 @@ def make_plots(plot_data_file, plot_dir, languages, num_ppkt, topn_file):
     plt.xlabel("Results File")
     plt.ylabel("Mean Reciprocal Rank (MRR)")
     plt.title("MRR of Correct Answers Across Different Results Files")
-    plot_path = plot_dir /  (str(len(languages)) + "_langs_" + str(num_ppkt) + "ppkt.png")
+    plot_path = plot_dir /  (name_string + "_" + comparing + "_" + str(num_ppkt) + "ppkt.png")
     plt.savefig(plot_path)
+    plt.close()
 
     # Plotting bar-plots with top<n> ranks
     df = pd.read_csv(topn_file, delimiter='\t')
@@ -33,17 +38,19 @@ def make_plots(plot_data_file, plot_dir, languages, num_ppkt, topn_file):
     df["not_found"] = df["nf"]
 
     df_aggr = pd.DataFrame()
-    df_aggr = pd.melt(df, id_vars="lang", value_vars=["top1", "top3", "top5", "top10", "not_found"], var_name="Rank_in", value_name="counts")
+    df_aggr = pd.melt(df, id_vars=comparing, value_vars=["top1", "top3", "top5", "top10", "not_found"], var_name="Rank_in", value_name="counts")
+    df_aggr["percentage"] = df_aggr["counts"]/num_ppkt
     bar_data_file = plot_dir / "topn_aggr.tsv"
     df_aggr.to_csv(bar_data_file, sep='\t', index=False)
 
-    sns.barplot(x="Rank_in", y="counts", data = df_aggr, hue = "lang")
+    sns.barplot(x="Rank_in", y="percentage", data = df_aggr, hue = comparing)
 
-    plt.xlabel("Number of Ranks")
-    plt.ylabel("Number of Correct Diagnoses")
-    plt.title("Rank Comparison for Different Languages")
-    plot_path = plot_dir /  ("barplot_" + str(len(languages)) + "_langs_" + str(num_ppkt) + "ppkt.png")
+    plt.xlabel("Number of Ranks in")
+    plt.ylabel("Percentage of Cases")
+    plt.title("Rank Comparison for Differential Diagnosis")
+    breakpoint()
+    plt.legend(title=comparing)
+    plot_path = plot_dir /  ("barplot_" + name_string + "_" + comparing + "_" + str(num_ppkt) + "ppkt.png")
     plt.savefig(plot_path)
-    plt.show()
-
+    plt.close()
 
diff --git a/src/malco/post_process/post_process.py b/src/malco/post_process/post_process.py
@@ -4,7 +4,7 @@
 import os
 
 
-def post_process(raw_results_dir: Path, output_dir: Path, langs: tuple) -> None:
+def post_process(raw_results_dir: Path, output_dir: Path, langs: tuple, models: tuple) -> None:
     """
     Post-process the raw results output to standardised PhEval TSV format.
 
@@ -14,10 +14,19 @@ def post_process(raw_results_dir: Path, output_dir: Path, langs: tuple) -> None:
     """
 
     for lang in langs:
-        raw_results_lang = raw_results_dir / lang
-        output_lang = output_dir / lang
-        raw_results_lang.mkdir(exist_ok=True)
-        output_lang.mkdir(exist_ok=True)
+        raw_results_lang = raw_results_dir / "multilingual" / lang
+        output_lang = output_dir / "multilingual" / lang
+        raw_results_lang.mkdir(exist_ok=True, parents=True)
+        output_lang.mkdir(exist_ok=True, parents=True)
 
         create_standardised_results(raw_results_dir=raw_results_lang,
                                     output_dir=output_lang, output_file_name="results.tsv")
+
+    for model in models:
+        raw_results_model = raw_results_dir / "multimodel" / model
+        output_model = output_dir / "multimodel" / model
+        raw_results_model.mkdir(exist_ok=True, parents=True)
+        output_model.mkdir(exist_ok=True, parents=True)
+
+        create_standardised_results(raw_results_dir=raw_results_model,
+                                    output_dir=output_model, output_file_name="results.tsv")
diff --git a/src/malco/run/run.py b/src/malco/run/run.py
@@ -3,23 +3,36 @@
 import subprocess
 
 
-def call_ontogpt(lang, raw_results_dir, input_dir):
-    command = (
-        f"ontogpt -v run-multilingual-analysis "
-        f"--output={raw_results_dir}/{lang}/results.yaml "  # save raw OntoGPT output
-        f"{input_dir}/prompts/{lang}/ "
-        f"{raw_results_dir}/{lang}/differentials_by_file/"
-    )
+def call_ontogpt(lang, raw_results_dir, input_dir, model, modality): 
+    if modality=="several_languages":
+        command = (
+            f"ontogpt -v run-multilingual-analysis "
+            f"--output={raw_results_dir}/{lang}/results.yaml "  # save raw OntoGPT output
+            f"{input_dir}/prompts/{lang}/ "
+            f"{raw_results_dir}/{lang}/differentials_by_file/ "
+            f"--model={model}"
+        )
+    elif modality=="several_models":
+        command = (
+            f"ontogpt -v run-multilingual-analysis "
+            f"--output={raw_results_dir}/{model}/results.yaml "  # save raw OntoGPT output
+            f"{input_dir}/prompts/{lang}/ "
+            f"{raw_results_dir}/{model}/differentials_by_file/ "
+            f"--model={model}"
+        )
+    else:
+        command(f"echo Something is not working...")
     print(f"Running command: {command}")
     process = subprocess.Popen(command, shell=True)
     process.communicate()
-    print(f"Finished command for {lang}")
+    print(f"Finished command for language {lang} and model {model}") 
 
 
 def run(testdata_dir: Path,
         raw_results_dir: Path,
         input_dir: Path,
         langs: tuple,
+        models: tuple,
         max_workers: int = None) -> None:
     """
     Run the tool to obtain the raw results.
@@ -35,5 +48,11 @@ def run(testdata_dir: Path,
     if max_workers is None:
         max_workers = multiprocessing.cpu_count()
 
+    modality = "several_languages"
     with multiprocessing.Pool(processes=max_workers) as pool:
-        pool.starmap(call_ontogpt, [(lang, raw_results_dir, input_dir) for lang in langs])
+        pool.starmap(call_ontogpt, [(lang, raw_results_dir / "multilingual", input_dir, "gpt-4-turbo", modality) for lang in langs])
+
+    # English only many models
+    modality = "several_models"
+    with multiprocessing.Pool(processes=max_workers) as pool:
+        pool.starmap(call_ontogpt, [("en", raw_results_dir / "multimodel", input_dir, model, modality) for model in models])
diff --git a/src/malco/runner.py b/src/malco/runner.py
@@ -20,17 +20,13 @@ class MalcoRunner(PhEvalRunner):
     version: str
     # Declare a tuple (immutable!) of languages
     languages = ("en", "es", "nl", "it", "de")
+    models = ('gpt-4o', 'gpt-4') # Decide on list of models: Claude-Sonnet (Anthropic key), 
 
     def prepare(self):
         """
         Pre-process any data and inputs necessary to run the tool.
         """
         print("Preparing...\n")
-        # Before this prepare step:
-        # We start with cohort with 1 phenopacket per disease, run
-        # phenopacket2prompt.jar to get prompts
-        # We then commit this to the repo, and the phenopackets and prompts here
-        # are the source of truth
         pass
 
     def run(self):
@@ -41,7 +37,8 @@ def run(self):
         run(testdata_dir=self.testdata_dir,
             raw_results_dir=self.raw_results_dir,
             input_dir=self.input_dir,
-            langs=self.languages)
+            langs=self.languages,
+            models=self.models)
 
 
     def post_process(self,
@@ -56,13 +53,25 @@ def post_process(self,
 
         post_process(raw_results_dir=self.raw_results_dir,
                      output_dir=self.output_dir,
-                     langs=self.languages)
+                     langs=self.languages,
+                     models=self.models)
 
-        plot_data_file, plot_dir, num_ppkt, topn_file = compute_mrr(
-            output_dir=self.output_dir,
+        comparing = "language"
+        mrr_file, plot_dir, num_ppkt, topn_file = compute_mrr(comparing,
+            output_dir=self.output_dir / "multilingual" ,
             prompt_dir=os.path.join(self.input_dir, prompts_subdir_name),
             correct_answer_file=correct_answer_file,
-            raw_results_dir=self.raw_results_dir)
+            raw_results_dir=self.raw_results_dir / "multilingual")
 
         if print_plot:
-            make_plots(plot_data_file, plot_dir, self.languages, num_ppkt, topn_file)
+            make_plots(mrr_file, plot_dir, self.languages, num_ppkt, self.models, topn_file, comparing)
+
+        comparing = "model"
+        mrr_file, plot_dir, num_ppkt, topn_file = compute_mrr( comparing,
+            output_dir=self.output_dir / "multimodel" ,
+            prompt_dir=os.path.join(self.input_dir, prompts_subdir_name),
+            correct_answer_file=correct_answer_file,
+            raw_results_dir=self.raw_results_dir / "multimodel" )
+
+        if print_plot:
+            make_plots(mrr_file, plot_dir, self.languages, num_ppkt, self.models, topn_file, comparing)