Merge branch 'main' into 6th-mode

JetBrains-Research · Dec 19, 2024 · 2edd6d2 · 2edd6d2
2 parents 2a11fee + 42ff9f0
commit 2edd6d2
Show file tree

Hide file tree

Showing 6 changed files with 127 additions and 1 deletion.
diff --git a/prompts/dafny_eval_comment_without_impls_textd/ask_for_fixed.txt b/prompts/dafny_eval_comment_without_impls_textd/ask_for_fixed.txt
@@ -0,0 +1,5 @@
+This answer got Dafny verification error:
+{error}
+Please try again by taking the Dafny feedback.
+
+If an error appears in one of method ending with `_valid`, that means annotations of an existing function were altered.
diff --git a/prompts/dafny_eval_comment_without_impls_textd/rewrite.txt b/prompts/dafny_eval_comment_without_impls_textd/rewrite.txt
@@ -0,0 +1,6 @@
+Given a Dafny program with function signature, preconditions and postconditions, but with some implementations and annotations missing.
+Please return a complete Dafny program with all functions implemented, and annotated using the strongest possible annotations (loop invariants, assert statements, etc.).
+Use extensive comments in the code, but don't explain anything outside of it.
+Please use exactly the same function signature, preconditions, and postconditions. Do not ever modify the given lines.
+Below is the program:
+{program}
diff --git a/prompts/dafny_eval_comment_without_impls_textd/sys.txt b/prompts/dafny_eval_comment_without_impls_textd/sys.txt
@@ -0,0 +1,4 @@
+You are an expert in Dafny.
+You will be given tasks dealing with Dafny programs including precise annotations.
+You respond only with code blocks.
+You will write a comment explaining each significant point of the program on a line before.
diff --git a/scripts/generate_heatmap.py b/scripts/generate_heatmap.py
@@ -0,0 +1,62 @@
+# %%
+import matplotlib.pyplot as plt
+import json
+from itertools import accumulate
+import pathlib
+import seaborn as sns
+import pandas as pd
+
+# %%
+
+
+def read_result(file):
+    with open(file) as f:
+        return {name[:3]: t if t >= 0 else 99 for name, t in json.load(f).items()}
+
+
+# %%
+
+
+def plot_results_heatmap(results_dict, width=35):
+    df = pd.DataFrame.from_dict({
+        source: entry_dict
+        for source, entry_dict in results_dict.items()
+    }, orient='index')
+
+    # Create the heatmap
+    plt.figure(figsize=(width, 4))
+    sns.heatmap(
+        df,
+        cmap='YlOrRd',  # Color scheme - you can change this
+        annot=True,  # Show numbers in cells
+        fmt='d',  # Format as integers
+        cbar_kws={'label': 'Result'}
+    )
+
+    plt.title('Results Heatmap')
+    plt.xlabel('Entries')
+    plt.ylabel('Sources')
+    plt.tight_layout()
+
+
+# %%
+data = pathlib.Path('heatmap_data')
+results = {str(x.name).removesuffix(".json"): read_result(x) for x in sorted(data.glob('*.json'))}
+# %%
+plot_results_heatmap(results, width=33)
+plt.show()
+
+# %%
+all_tasks = list(next(iter(results.values())).keys())
+solve_count = {t: sum(r[t] <= 10 for r in results.values()) for t in all_tasks}
+# %%
+nontrivial_tasks = [t for t in all_tasks if 0 < solve_count[t] < len(results)]
+nontrivial_results = {s: {t: r[t] for t in nontrivial_tasks} for s, r in results.items()}
+variant_tasks = [t for t in all_tasks if 2 <= solve_count[t] <= len(results) - 2]
+variant_results = {s: {t: r[t] for t in variant_tasks} for s, r in results.items()}
+# %%
+plot_results_heatmap(nontrivial_results, width=19)
+plt.show()
+# %%
+plot_results_heatmap(variant_results, width=10)
+plt.show()
diff --git a/scripts/generate_result_graph.py b/scripts/generate_result_graph.py
@@ -5,7 +5,7 @@
 import pathlib
 
 # %%
-path = "../results/tries_HumanEval-Dafny.json"
+path = "../results/tries_HumanEval-Dafny_comments.json"
 bench = pathlib.Path("../benches/HumanEval-Dafny")
 file_cnt = len(list(bench.glob("*.dfy")))
 with open(path) as f:

diff --git a/scripts/generate_result_graph_dafny_nagini.py b/scripts/generate_result_graph_dafny_nagini.py
@@ -0,0 +1,49 @@
+# %%
+import matplotlib.pyplot as plt
+import json
+from itertools import accumulate
+import pathlib
+
+# %%
+
+
+def read_bench(d, ext):
+    bench = pathlib.Path(d)
+    return [x.name.removesuffix(f'.{ext}') for x in bench.glob(f'*.{ext}')]
+
+
+# %%
+
+intersect_bench = True
+
+path = "results/dafny_4_comments.json"
+dafny_bench = pathlib.Path("benches/HumanEval-Dafny")
+nagini_bench = pathlib.Path("benches/HumanEval-Nagini/Bench")
+dafny_files = read_bench("benches/HumanEval-Dafny", "dfy")
+nagini_files = read_bench("benches/HumanEval-Nagini/Bench", "py")
+shared_files = [x for x in dafny_files if x in nagini_files]
+files = shared_files if intersect_bench else dafny_files
+# %%
+file_cnt = len(files)
+with open(path) as f:
+    data = json.load(f)
+
+tries = [value for file, value in data.items() if file.removesuffix('.dfy') in files and value != -1]
+max_tries = max(tries)
+cnt = [0] * (max_tries + 1)
+for t in tries:
+    cnt[t] += 1
+cnt = list(accumulate(cnt))
+perc = [100 * (c / file_cnt) for c in cnt]
+
+fig, ax = plt.subplots()  # type: ignore
+plt.title("Percentage of files verified")  # type: ignore
+plt.xlabel("Number of tries")  # type: ignore
+plt.ylabel("Percentage of files")  # type: ignore
+ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f"{x}%"))  # type: ignore
+ax.plot(perc)  # type: ignore
+# %%
+plt.show()
+# %%
+print(f"{cnt[-1]} / {file_cnt}")
+print(f"{round(perc[-1],2)}% of the files were successfully verified")