diff --git a/prompts/dafny_eval_comment_without_impls_textd/ask_for_fixed.txt b/prompts/dafny_eval_comment_without_impls_textd/ask_for_fixed.txt
new file mode 100644
index 0000000..cbb6770
--- /dev/null
+++ b/prompts/dafny_eval_comment_without_impls_textd/ask_for_fixed.txt
@@ -0,0 +1,5 @@
+This answer got a Dafny verification error:
+{error}
+Please try again, taking the Dafny feedback into account.
+
+If an error appears in one of the methods ending with `_valid`, it means that the annotations of an existing function were altered.
diff --git a/prompts/dafny_eval_comment_without_impls_textd/rewrite.txt b/prompts/dafny_eval_comment_without_impls_textd/rewrite.txt
new file mode 100644
index 0000000..8d5718d
--- /dev/null
+++ b/prompts/dafny_eval_comment_without_impls_textd/rewrite.txt
@@ -0,0 +1,6 @@
+You are given a Dafny program with function signatures, preconditions, and postconditions, but with some implementations and annotations missing.
+Please return a complete Dafny program with all functions implemented and annotated using the strongest possible annotations (loop invariants, assert statements, etc.).
+Use extensive comments in the code, but don't explain anything outside of it.
+Please use exactly the same function signatures, preconditions, and postconditions. Never modify the given lines.
+Below is the program:
+{program}
diff --git a/prompts/dafny_eval_comment_without_impls_textd/sys.txt b/prompts/dafny_eval_comment_without_impls_textd/sys.txt
new file mode 100644
index 0000000..6522a1c
--- /dev/null
+++ b/prompts/dafny_eval_comment_without_impls_textd/sys.txt
@@ -0,0 +1,4 @@
+You are an expert in Dafny.
+You will be given tasks dealing with Dafny programs that include precise annotations.
+You respond only with code blocks.
+You will write a comment explaining each significant point of the program on the line before it.
\ No newline at end of file
diff --git a/scripts/generate_heatmap.py b/scripts/generate_heatmap.py
new file mode 100644
index 0000000..f444a8d
--- /dev/null
+++ b/scripts/generate_heatmap.py
@@ -0,0 +1,62 @@
+# %%
+import matplotlib.pyplot as plt
+import json
+from itertools import accumulate
+import pathlib
+import seaborn as sns
+import pandas as pd
+
+# %%
+
+# Load one results file: task id (first three characters of the name) -> tries; failures (< 0) become 99.
+def read_result(file):
+    with open(file) as f:
+        return {name[:3]: t if t >= 0 else 99 for name, t in json.load(f).items()}
+
+
+# %%
+
+
+def plot_results_heatmap(results_dict, width=35):
+    df = pd.DataFrame.from_dict({
+        source: entry_dict
+        for source, entry_dict in results_dict.items()
+    }, orient='index')
+
+    # Create the heatmap
+    plt.figure(figsize=(width, 4))
+    sns.heatmap(
+        df,
+        cmap='YlOrRd',  # Color scheme - you can change this
+        annot=True,  # Show numbers in cells
+        fmt='d',  # Format as integers
+        cbar_kws={'label': 'Result'}
+    )
+
+    plt.title('Results Heatmap')
+    plt.xlabel('Entries')
+    plt.ylabel('Sources')
+    plt.tight_layout()
+
+
+# %%
+data = pathlib.Path('heatmap_data')
+results = {str(x.name).removesuffix(".json"): read_result(x) for x in sorted(data.glob('*.json'))}
+# %%
+plot_results_heatmap(results, width=33)
+plt.show()
+
+# %%
+all_tasks = list(next(iter(results.values())).keys())
+solve_count = {t: sum(r[t] <= 10 for r in results.values()) for t in all_tasks}  # solved = verified within 10 tries
+# %%
+nontrivial_tasks = [t for t in all_tasks if 0 < solve_count[t] < len(results)]  # solved by some runs but not all
+nontrivial_results = {s: {t: r[t] for t in nontrivial_tasks} for s, r in results.items()}
+variant_tasks = [t for t in all_tasks if 2 <= solve_count[t] <= len(results) - 2]  # solved and unsolved by at least two runs each
+variant_results = {s: {t: r[t] for t in variant_tasks} for s, r in results.items()}
+# %%
+plot_results_heatmap(nontrivial_results, width=19)
+plt.show()
+# %%
+plot_results_heatmap(variant_results, width=10)
+plt.show()
\ No newline at end of file
diff --git a/scripts/generate_result_graph.py b/scripts/generate_result_graph.py
index 099d7ff..34d51ce 100644
--- a/scripts/generate_result_graph.py
+++ b/scripts/generate_result_graph.py
@@ -5,7 +5,7 @@
 import pathlib
 
 # %%
-path = "../results/tries_HumanEval-Dafny.json"
+path = "../results/tries_HumanEval-Dafny_comments.json"
 bench = pathlib.Path("../benches/HumanEval-Dafny")
 file_cnt = len(list(bench.glob("*.dfy")))
 with open(path) as f:
diff --git a/scripts/generate_result_graph_dafny_nagini.py b/scripts/generate_result_graph_dafny_nagini.py
new file mode 100644
index 0000000..ab570fe
--- /dev/null
+++ b/scripts/generate_result_graph_dafny_nagini.py
@@ -0,0 +1,49 @@
+# %%
+import matplotlib.pyplot as plt
+import json
+from itertools import accumulate
+import pathlib
+
+# %%
+
+
+def read_bench(d, ext):
+    bench = pathlib.Path(d)
+    return [x.name.removesuffix(f'.{ext}') for x in bench.glob(f'*.{ext}')]
+
+
+# %%
+
+intersect_bench = True  # restrict to tasks present in both benchmarks
+
+path = "results/dafny_4_comments.json"
+dafny_bench = pathlib.Path("benches/HumanEval-Dafny")
+nagini_bench = pathlib.Path("benches/HumanEval-Nagini/Bench")
+dafny_files = read_bench("benches/HumanEval-Dafny", "dfy")
+nagini_files = read_bench("benches/HumanEval-Nagini/Bench", "py")
+shared_files = [x for x in dafny_files if x in nagini_files]
+files = shared_files if intersect_bench else dafny_files
+# %%
+file_cnt = len(files)
+with open(path) as f:
+    data = json.load(f)
+
+tries = [value for file, value in data.items() if file.removesuffix('.dfy') in files and value != -1]  # -1 means never verified
+max_tries = max(tries)
+cnt = [0] * (max_tries + 1)
+for t in tries:
+    cnt[t] += 1
+cnt = list(accumulate(cnt))  # cumulative count of files verified within n tries
+perc = [100 * (c / file_cnt) for c in cnt]
+
+fig, ax = plt.subplots()  # type: ignore
+plt.title("Percentage of files verified")  # type: ignore
+plt.xlabel("Number of tries")  # type: ignore
+plt.ylabel("Percentage of files")  # type: ignore
+ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f"{x}%"))  # type: ignore
+ax.plot(perc)  # type: ignore
+# %%
+plt.show()
+# %%
+print(f"{cnt[-1]} / {file_cnt}")
+print(f"{round(perc[-1],2)}% of the files were successfully verified")
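
For reference, a minimal sketch (not part of the patch) of the results-file format the new scripts appear to assume, inferred from read_result() above and from the `value != -1` filter: each JSON file maps a benchmark file name to the number of tries it took to verify, with -1 marking a file that never verified. The task names and the output path below are hypothetical.

# sketch.py -- hypothetical helper; assumes the JSON layout described above
import json

sample = {
    "000-has-close-elements.dfy": 1,    # verified on the first try (hypothetical task names)
    "001-separate-paren-groups.dfy": 4,
    "002-truncate-number.dfy": -1,      # never verified; read_result() maps this to 99
}

with open("heatmap_data/example_run.json", "w") as f:  # directory globbed by generate_heatmap.py
    json.dump(sample, f, indent=2)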