diff --git a/.gitignore b/.gitignore
index 6edb45a..0bc51c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,8 +18,9 @@ run.sh
 **/.pytest_cache
 run_nagini.py
 /tmp
-results
 results/*
+!results/tries_HumanEval-Dafny_1.json
+!results/tries_HumanEval-RustBench_1.json
 /log_tries/
 /log_tries/*
 .direnv
diff --git a/benches/HumanEval-Dafny b/benches/HumanEval-Dafny
index 197aca2..9223614 160000
--- a/benches/HumanEval-Dafny
+++ b/benches/HumanEval-Dafny
@@ -1 +1 @@
-Subproject commit 197aca2480d30357e97dd087df64cd8808c97a5c
+Subproject commit 922361457f4c7cfb7bfd4dc71311b225649b7751
diff --git a/gui/src/helpers.rs b/gui/src/helpers.rs
index f44a529..06218fa 100644
--- a/gui/src/helpers.rs
+++ b/gui/src/helpers.rs
@@ -59,11 +59,6 @@ fn add_common_arguments<'a>(
     token: &str,
     settings: &Settings,
 ) -> &'a mut Command {
-    let bench_type = if settings.incremental_run {
-        String::from("validating")
-    } else {
-        settings.bench_type.to_string()
-    };
     if settings.do_filter {
         cmd.args(["--filter-by-ext", &settings.filter_by_ext]);
     }
@@ -72,7 +67,7 @@ fn add_common_arguments<'a>(
         .args(["--insert-conditions-mode", "llm-single-step"])
         .args(["--llm-profile", settings.llm_profile.as_grazie()])
         .args(["--grazie-token", token])
-        .args(["--bench-type", &bench_type])
+        .args(["--bench-type", &settings.bench_type.to_string()])
         .args(["--tries", &make_tries(&settings.tries)])
         .args(["--retries", &make_retries(&settings.retries)])
         .args(["--verifier-timeout", &make_timeout(&settings.timeout)])
diff --git a/gui/src/main.rs b/gui/src/main.rs
index 234831c..89e0e7b 100644
--- a/gui/src/main.rs
+++ b/gui/src/main.rs
@@ -72,21 +72,23 @@ enum FileMode {
     Directory,
 }
 
-#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Default, Serialize, Deserialize)]
 enum BenchMode {
     #[default]
     Invariants,
     Generic,
     Generate,
     Validating,
+    StepByStep,
 }
 
 impl BenchMode {
     fn llm_generated_path(&self, path: &str) -> PathBuf {
         let name = match self {
-            BenchMode::Invariants | BenchMode::Generic | BenchMode::Validating => {
-                basename(path).to_string()
-            }
+            BenchMode::Invariants
+            | BenchMode::Generic
+            | BenchMode::Validating
+            | BenchMode::StepByStep => basename(path).to_string(),
             BenchMode::Generate => {
                 let base = basename(path);
                 base.chars()
@@ -97,6 +99,27 @@ impl BenchMode {
         };
         APP_DIRS.cache_dir().join("llm-generated").join(name)
     }
+
+    fn name(&self) -> &str {
+        match self {
+            BenchMode::Invariants => "Invariants",
+            BenchMode::Generic => "Generic",
+            BenchMode::Generate => "Generate",
+            BenchMode::Validating => "Validating",
+            BenchMode::StepByStep => "Step by step",
+        }
+    }
+
+    fn all() -> &'static [BenchMode] {
+        const MODES: &[BenchMode] = &[
+            BenchMode::Invariants,
+            BenchMode::Generic,
+            BenchMode::Generate,
+            BenchMode::Validating,
+            BenchMode::StepByStep,
+        ];
+        MODES
+    }
 }
 
 impl Display for BenchMode {
@@ -106,6 +129,7 @@ impl Display for BenchMode {
             BenchMode::Generic => write!(f, "generic"),
             BenchMode::Generate => write!(f, "generate"),
             BenchMode::Validating => write!(f, "validating"),
+            BenchMode::StepByStep => write!(f, "step-by-step"),
         }
     }
 }
diff --git a/gui/src/ui/settings.rs b/gui/src/ui/settings.rs
index ae3a451..4a3c720 100644
--- a/gui/src/ui/settings.rs
+++ b/gui/src/ui/settings.rs
@@ -37,27 +37,16 @@ impl AppState {
                 self.settings.file_mode = FileMode::Directory;
             }
             ui.add_space(2.0);
-            if !self.settings.incremental_run {
-                ui.horizontal(|ui| {
-                    ui.label("Bench mode: ");
-                    ui.radio_value(
-                        &mut self.settings.bench_type,
-                        BenchMode::Invariants,
-                        "Invariants",
-                    );
-                    ui.radio_value(&mut self.settings.bench_type, BenchMode::Generic, "Generic");
-                    ui.radio_value(
-                        &mut self.settings.bench_type,
-                        BenchMode::Generate,
-                        "Generate",
-                    );
-                    ui.radio_value(
-                        &mut self.settings.bench_type,
-                        BenchMode::Validating,
-                        "Validating",
-                    );
-                });
-            }
+            ui.horizontal(|ui| {
+                ui.label("Bench mode:");
+                egui::ComboBox::from_id_source("bench-mode-select")
+                    .selected_text(format!("{}", self.settings.bench_type.name()))
+                    .show_ui(ui, |ui| {
+                        for mode in BenchMode::all() {
+                            ui.selectable_value(&mut self.settings.bench_type, *mode, mode.name());
+                        }
+                    });
+            });
 
             ui.separator();
             ui.horizontal(|ui| {
diff --git a/prompts/humaneval-dafny-cot/ask_for_fixed.txt b/prompts/humaneval-dafny-cot/ask_for_fixed.txt
new file mode 100644
index 0000000..d5c6c30
--- /dev/null
+++ b/prompts/humaneval-dafny-cot/ask_for_fixed.txt
@@ -0,0 +1,4 @@
+The following errors occurred during verification:
+{error}
+
+Please fix the error by adding, removing or modifying the invariants and/or assertions and return the fixed program.
diff --git a/prompts/humaneval-dafny-cot/ask_for_fixed_had_errors.txt b/prompts/humaneval-dafny-cot/ask_for_fixed_had_errors.txt
new file mode 100644
index 0000000..e2cb9ad
--- /dev/null
+++ b/prompts/humaneval-dafny-cot/ask_for_fixed_had_errors.txt
@@ -0,0 +1,4 @@
+There are still some errors:
+{error}
+
+Could you please fix them?
diff --git a/prompts/humaneval-dafny-cot/rewrite/examples/001/answer.txt b/prompts/humaneval-dafny-cot/rewrite/examples/001/answer.txt
new file mode 100644
index 0000000..30b39e4
--- /dev/null
+++ b/prompts/humaneval-dafny-cot/rewrite/examples/001/answer.txt
@@ -0,0 +1,83 @@
+During the previous steps, we've found that the `sum_loop` function has a loop that goes over the elements of the array `numbers` and adds them to the `s` variable, which is equal to the `sum(numbers[..i])` for all `i`, such that `0 <= i <= |numbers|`. This means that during the execution of the loop the following invariant holds:
+1. `0 <= i <= |numbers|`
+2. `s == sum(numbers[..i])`
+
+Let's put them in:
+```dafny
+method sum_loop(numbers: seq<int>) returns (s: int)
+    ensures s == sum(numbers)
+{
+    s := 0;
+    var i := 0;
+    while (i < |numbers|)
+        invariant 0 <= i <= |numbers|
+        invariant s == sum(numbers[..i])
+    {
+        s := s + numbers[i];
+
+        i := i + 1;
+    }
+}
+```
+
+We've also worked out how to prove the second invariant. For that we need to prove to the verifier that `sum(numbers[..i + 1]) == sum(numbers[..i]) + numbers[i]`. For that we need to use the `sum_prop` function, as follows:
+```dafny
+assert numbers[..i + 1][..i] == numbers[..i]; // assert that taking a slice of a slice is the same as taking a slice of the original array
+sum_prop(numbers[..i + 1]); // use a lemma to prove the invariant
+assert sum(numbers[..i + 1]) == sum(numbers[..i + 1][..i]) + numbers[i];
+```
+Let's add these lines to the code:
+```dafny
+method sum_loop(numbers: seq<int>) returns (s: int)
+    ensures s == sum(numbers)
+{
+    s := 0;
+    var i := 0;
+    while (i < |numbers|)
+        invariant 0 <= i <= |numbers|
+        invariant s == sum(numbers[..i])
+    {
+        s := s + numbers[i];
+
+        assert numbers[..i + 1][..i] == numbers[..i]; // assert that taking a slice of a slice is the same as taking a slice of the original array
+        sum_prop(numbers[..i + 1]); // use a lemma to prove the invariant
+        assert sum(numbers[..i + 1]) == sum(numbers[..i + 1][..i]) + numbers[i];
+
+        i := i + 1;
+    }
+}
+```
+
+Other than that, we didn't find any other invariants or patterns that hold during the execution of the program. Therefore, the resulting program is (notice the lack of backticks):
+<answer>
+function sum(s: seq<int>) : int {
+    if |s| == 0 then 0 else s[0] + sum(s[1..])
+}
+
+lemma sum_prop(s: seq<int>)
+    requires |s| > 0
+    ensures sum(s) == sum(s[..|s| - 1]) + s[ |s| - 1 ]
+{
+    if (|s| > 1) {
+        assert (s[1..][..|s[1..]| - 1]) == s[1..|s| - 1];
+    }
+}
+method sum_loop(numbers: seq<int>) returns (s: int)
+    ensures s == sum(numbers)
+{
+    s := 0;
+    var i := 0;
+    while (i < |numbers|)
+        invariant 0 <= i <= |numbers|
+        invariant s == sum(numbers[..i])
+    {
+        s := s + numbers[i];
+
+        assert numbers[..i + 1][..i] == numbers[..i]; // assert that taking a slice of a slice is the same as taking a slice of the original array
+        sum_prop(numbers[..i + 1]); // use a lemma to prove the invariant
+        assert sum(numbers[..i + 1]) == sum(numbers[..i + 1][..i]) + numbers[i];
+
+        i := i + 1;
+    }
+}
+</answer>
diff --git a/prompts/humaneval-dafny-cot/rewrite/examples/001/question.txt b/prompts/humaneval-dafny-cot/rewrite/examples/001/question.txt
new file mode 100644
index 0000000..2295ed0
--- /dev/null
+++ b/prompts/humaneval-dafny-cot/rewrite/examples/001/question.txt
@@ -0,0 +1,28 @@
+It's time to rewrite the program to contain the invariants and assertions that we've found. Work on the following program and go step by step through the reasoning process. After that, output the rewritten program in the `<answer>` tag without backticks.
+
+```dafny
+function sum(s: seq<int>) : int {
+    if |s| == 0 then 0 else s[0] + sum(s[1..])
+}
+
+lemma sum_prop(s: seq<int>)
+    requires |s| > 0
+    ensures sum(s) == sum(s[..|s| - 1]) + s[ |s| - 1 ]
+{
+    if (|s| > 1) {
+        assert (s[1..][..|s[1..]| - 1]) == s[1..|s| - 1];
+    }
+}
+method sum_loop(numbers: seq<int>) returns (s: int)
+    ensures s == sum(numbers)
+{
+    s := 0;
+    var i := 0;
+    while (i < |numbers|)
+    {
+        s := s + numbers[i];
+
+        i := i + 1;
+    }
+}
+```
diff --git a/prompts/humaneval-dafny-cot/rewrite/question.txt b/prompts/humaneval-dafny-cot/rewrite/question.txt
new file mode 100644
index 0000000..17e9fcc
--- /dev/null
+++ b/prompts/humaneval-dafny-cot/rewrite/question.txt
@@ -0,0 +1,3 @@
+It's time to rewrite the program to contain the invariants and assertions that we've found. Work on the following program and go step by step through the reasoning process. After that, output the rewritten program in the `<answer>` tag without backticks.
+
+{program}
diff --git a/prompts/humaneval-dafny-cot/steps/001/examples/001/answer.txt b/prompts/humaneval-dafny-cot/steps/001/examples/001/answer.txt
new file mode 100644
index 0000000..0a6dd47
--- /dev/null
+++ b/prompts/humaneval-dafny-cot/steps/001/examples/001/answer.txt
@@ -0,0 +1,10 @@
+Let's go function by function. The first function, `sum` does not have a specification, as can be seen by the absence of either a `requires` or `ensures` clause. This means that this function is probably a specification itself. Let's consider, what it can be step by step. We can see that the aruguments to this function are a sequence of integers. We can see that the function returns an integer. We can conclude that the function does some accumulation of integers from the sequence. Looking at the body, we can see that the function takes the first element of the sequence and adds it to the accumulated result of evaluating the `sum` function on the rest of the sequence. This means the function is a specification for the sum of the elements of the sequence.
+
+Next, we have the `sum_prop` lemma. Considering it's specification, we can see that it requires a non-empty sequence of integers, as seen by the `requires |s| > 0` clause. Looking at the ensures clause, we can see that the lemma ensures that the sum of the sequence (as computed by the `sum` function) is equal to the sum of the elements of the sequence without the last element, plus that last element. This is important to prove, because the sum function drops the first element and thus it is not obvious for the prover that the last element can be dropped in the same way.
+
+Finally, we have the `sum_loop` method. This method has a specification, as indicated by the `ensures` clause. The specification states that the returned value `s` should be equal to the sum of the input sequence `numbers`, as computed by the `sum` function, as seen by the `ensures s == sum(numbers)` clause.
+
+In summary:
+1. The `sum` function is itself a specification for computing the sum of a sequence of integers.
+2. The `sum_prop` lemma has a specification that proves a property about the `sum` function.
+3. The `sum_loop` method has a specification that ensures it correctly computes the sum of a sequence of integers, matching the result of the `sum` function.
diff --git a/prompts/humaneval-dafny-cot/steps/001/examples/001/question.txt b/prompts/humaneval-dafny-cot/steps/001/examples/001/question.txt
new file mode 100644
index 0000000..0aead6f
--- /dev/null
+++ b/prompts/humaneval-dafny-cot/steps/001/examples/001/question.txt
@@ -0,0 +1,27 @@
+Given the following program, for each function, determine if it has a specification. If it does not, is it itself a specification? If it is a specification, what does it specify?
+```dafny
+function sum(s: seq<int>) : int {
+    if |s| == 0 then 0 else s[0] + sum(s[1..])
+}
+
+lemma sum_prop(s: seq<int>)
+    requires |s| > 0
+    ensures sum(s) == sum(s[..|s| - 1]) + s[ |s| - 1 ]
+{
+    if (|s| > 1) {
+        assert (s[1..][..|s[1..]| - 1]) == s[1..|s| - 1];
+    }
+}
+method sum_loop(numbers: seq<int>) returns (s: int)
+    ensures s == sum(numbers)
+{
+    s := 0;
+    var i := 0;
+    while (i < |numbers|)
+    {
+        s := s + numbers[i];
+
+        i := i + 1;
+    }
+}
+```
diff --git a/prompts/humaneval-dafny-cot/steps/001/question.txt b/prompts/humaneval-dafny-cot/steps/001/question.txt
new file mode 100644
index 0000000..4bc2633
--- /dev/null
+++ b/prompts/humaneval-dafny-cot/steps/001/question.txt
@@ -0,0 +1,5 @@
+Given the following program, for each function, determine if it has a specification. If it does not, is it itself a specification? If it is a specification, what does it specify?
+
+```dafny
+{program}
+```
diff --git a/prompts/humaneval-dafny-cot/steps/002/examples/001/answer.txt b/prompts/humaneval-dafny-cot/steps/002/examples/001/answer.txt
new file mode 100644
index 0000000..c4037db
--- /dev/null
+++ b/prompts/humaneval-dafny-cot/steps/002/examples/001/answer.txt
@@ -0,0 +1,6 @@
+Looking at the functions, we can see that the 'sum' function uses recursion and thus contains no loops. The 'sum_prop' lemma, in the same way, does not contain any loops, as evidenced by the absence of 'while' and 'for' keywords.
+Now, let's examine the 'sum_loop' method. This method contains a while loop that iterates through the elements of the 'numbers' sequence. The loop starts with 's' and 'i' initialized to 0, and continues as long as 'i' is less than the length of 'numbers'. Let's try to understand what 's' and 'i' represent in this context.
+
+Looking at the body of the loop, we can see that it adds the current element of 'numbers' to 's'. This means that 's' is the running sum of the elements in 'numbers', namely 's = sum(numbers[..i])'. 'i' is easier to analyze. It is the index of the current element being processed. In each iteration of the loop, 'i' is incremented by 1, so 'i' represents the index of the next element to be processed.
+
+Each iteration of the loop adds the current element to 's', and then increments 'i' to move to the next element. This process continues until 'i' reaches the length of 'numbers', at which point the loop terminates and 's' holds the final sum of the elements in 'numbers'.
diff --git a/prompts/humaneval-dafny-cot/steps/002/examples/001/question.txt b/prompts/humaneval-dafny-cot/steps/002/examples/001/question.txt
new file mode 100644
index 0000000..e004327
--- /dev/null
+++ b/prompts/humaneval-dafny-cot/steps/002/examples/001/question.txt
@@ -0,0 +1,28 @@
+Now we are finished with the specifications. Let's move onto the loops.
+Given the following program, for each function, determine if it has loops. If it does, try to determine what the loop does in detail. Go through all your thinking and reasoning process step by step.
+```dafny
+function sum(s: seq<int>) : int {
+    if |s| == 0 then 0 else s[0] + sum(s[1..])
+}
+
+lemma sum_prop(s: seq<int>)
+    requires |s| > 0
+    ensures sum(s) == sum(s[..|s| - 1]) + s[ |s| - 1 ]
+{
+    if (|s| > 1) {
+        assert (s[1..][..|s[1..]| - 1]) == s[1..|s| - 1];
+    }
+}
+method sum_loop(numbers: seq<int>) returns (s: int)
+    ensures s == sum(numbers)
+{
+    s := 0;
+    var i := 0;
+    while (i < |numbers|)
+    {
+        s := s + numbers[i];
+
+        i := i + 1;
+    }
+}
+```
diff --git a/prompts/humaneval-dafny-cot/steps/002/question.txt b/prompts/humaneval-dafny-cot/steps/002/question.txt
new file mode 100644
index 0000000..6abd123
--- /dev/null
+++ b/prompts/humaneval-dafny-cot/steps/002/question.txt
@@ -0,0 +1,4 @@
+Given the following program, for each function, determine if it has loops. If it does, try to determine what the loop does in detail. Go through all your thinking and reasoning process step by step.
+```dafny
+{program}
+```
diff --git a/prompts/humaneval-dafny-cot/steps/003/examples/001/answer.txt b/prompts/humaneval-dafny-cot/steps/003/examples/001/answer.txt
new file mode 100644
index 0000000..334a899
--- /dev/null
+++ b/prompts/humaneval-dafny-cot/steps/003/examples/001/answer.txt
@@ -0,0 +1,15 @@
+As we've seen, the loop in the `sum_loop` function goes over the elements of the array `numbers` and adds them to the `s` variable, which is equal to the `sum(numbers[..i])` for all `i`, such that `0 <= i <= |numbers|`. This means that during the execution of the loop the following invariant holds:
+1. `0 <= i <= |numbers|`
+2. `s == sum(numbers[..i])`
+
+The second invariant might be a bit difficult to prove, as it adds the last element of the array to the sum, while, as I've said before, the `sum` function is defined as `sum(a) == a[0] + sum(a[1..])`. However, as we've seen, it's possible to use the `sum_prop` function to prove that this invariant holds.
+Let's work out how to use it. If applied to the `numbers[..i + 1]`, we get:
+`sum(numbers[..i + 1]) == sum(numbers[..i + 1][..i]) + numbers[i]`. It might be difficult for dafny to prove that `sum(numbers[..i + 1][..i]) == sum(numbers[..i])`, so I'd suggest adding an assertion before calling `sum_prop` to make sure that this is the case, namely `assert numbers[..i + 1][..i] == numbers[..i]`.
+So, to use the `sum_prop`, I'd suggest the following code:
+```
+assert numbers[..i + 1][..i] == numbers[..i]; // assert that taking a slice of a slice is the same as taking a slice of the original array
+sum_prop(numbers[..i + 1]); // use a lemma to prove the invariant
+assert sum(numbers[..i + 1]) == sum(numbers[..i + 1][..i]) + numbers[i];
+```
+
+In conclusion, we can say that two invariants hold and the second one could be proved using the `sum_prop` function.
diff --git a/prompts/humaneval-dafny-cot/steps/003/examples/001/question.txt b/prompts/humaneval-dafny-cot/steps/003/examples/001/question.txt
new file mode 100644
index 0000000..d7a9fb2
--- /dev/null
+++ b/prompts/humaneval-dafny-cot/steps/003/examples/001/question.txt
@@ -0,0 +1,27 @@
+As we've already analyzed the specifications and the loops, it's time to try to analyze what invariants and patterns hold during the execution of the program. Go over the found loops in the following program in the code and try to find out what invariants and patterns they hold. Go through all your thinking and reasoning process step by step.
+```dafny
+function sum(s: seq<int>) : int {
+    if |s| == 0 then 0 else s[0] + sum(s[1..])
+}
+
+lemma sum_prop(s: seq<int>)
+    requires |s| > 0
+    ensures sum(s) == sum(s[..|s| - 1]) + s[ |s| - 1 ]
+{
+    if (|s| > 1) {
+        assert (s[1..][..|s[1..]| - 1]) == s[1..|s| - 1];
+    }
+}
+method sum_loop(numbers: seq<int>) returns (s: int)
+    ensures s == sum(numbers)
+{
+    s := 0;
+    var i := 0;
+    while (i < |numbers|)
+    {
+        s := s + numbers[i];
+
+        i := i + 1;
+    }
+}
+```
diff --git a/prompts/humaneval-dafny-cot/steps/003/question.txt b/prompts/humaneval-dafny-cot/steps/003/question.txt
new file mode 100644
index 0000000..437bd31
--- /dev/null
+++ b/prompts/humaneval-dafny-cot/steps/003/question.txt
@@ -0,0 +1,4 @@
+As we've already analyzed the specifications and the loops, it's time to try to analyze what invariants and patterns hold during the execution of the program. Go over the found loops in the following program in the code and try to find out what invariants and patterns they hold. Go through all your thinking and reasoning process step by step.
+```dafny
+{program}
+```
diff --git a/prompts/humaneval-dafny-cot/sys.txt b/prompts/humaneval-dafny-cot/sys.txt
new file mode 100644
index 0000000..0af33ba
--- /dev/null
+++ b/prompts/humaneval-dafny-cot/sys.txt
@@ -0,0 +1,3 @@
+You are an expert in Dafny.
+You will be given tasks dealing with Dafny programs including precise annotations.
+You respond only with code blocks.
diff --git a/prompts/humaneval-dafny-cot/timeout.txt b/prompts/humaneval-dafny-cot/timeout.txt
new file mode 100644
index 0000000..f69fd69
--- /dev/null
+++ b/prompts/humaneval-dafny-cot/timeout.txt
@@ -0,0 +1,3 @@
+The verifier timed out during the verification.
+This usually means that the provided invariants were too broad or were difficult to check.
+Could you please try to improve the invariants and try again?
diff --git a/prompts/humaneval-nagini-cot/ask_for_fixed.txt b/prompts/humaneval-nagini-cot/ask_for_fixed.txt
new file mode 100644
index 0000000..38f5c7b
--- /dev/null
+++ b/prompts/humaneval-nagini-cot/ask_for_fixed.txt
@@ -0,0 +1,6 @@
+The following errors occurred during verification:
+{error}
+
+Please fix the error by adding, removing or modifying the invariants and return the fixed program.
+Don't add any additional text comments, your response must contain only program with invariants.
+Do not provide ANY explanations. Don't include markdown backticks. Respond only in Python code, nothing else.
\ No newline at end of file
diff --git a/prompts/humaneval-nagini-cot/ask_for_fixed_had_errors.txt b/prompts/humaneval-nagini-cot/ask_for_fixed_had_errors.txt
new file mode 100644
index 0000000..9e6f1ab
--- /dev/null
+++ b/prompts/humaneval-nagini-cot/ask_for_fixed_had_errors.txt
@@ -0,0 +1,6 @@
+There are still some errors:
+{error}
+
+Could you please fix them?
+Don't add any additional text comments, your response must contain only program with invariants.
+Do not provide ANY explanations. Don't include markdown backticks. Respond only in Python code, nothing else.
\ No newline at end of file
diff --git a/prompts/humaneval-nagini-cot/rewrite/examples/001/answer.txt b/prompts/humaneval-nagini-cot/rewrite/examples/001/answer.txt
new file mode 100644
index 0000000..5db533d
--- /dev/null
+++ b/prompts/humaneval-nagini-cot/rewrite/examples/001/answer.txt
@@ -0,0 +1,103 @@
+During the previous steps, we've found that the `sum_loop` function has a loop that goes over the elements of the array `numbers` and adds them to the `s` variable, which is equal to the `Sum(numbers, 0, i)` for all `i`, such that `0 <= i and i <= len(numbers)`. Also we need list access to numbers. This means that during the execution of the loop the following invariant holds:
+1. `Acc(list_pred(numbers))`
+2. `0 <= i and i <= len(numbers)`
+3. `s == Sum(numbers, 0, i)`
+
+Let's put them in:
+```
+from typing import cast, List, Dict, Set, Optional, Union
+from nagini_contracts.contracts import *
+
+@Pure
+def Sum(a : List[int], s : int, t : int) -> int :
+    Requires(Acc(list_pred(a)))
+    Requires(((0) <= (s)) and ((s) <= (t)) and ((t) <= (len(a))))
+
+    if s == t:
+        return 0
+    else:
+        return (a)[t - 1] + (Sum(a, s, t - 1))
+
+def sum_loop(numbers: List[int]) -> int:
+    Requires(Acc(list_pred(numbers)))
+    Ensures(Acc(list_pred(numbers)))
+    Ensures(Result() == Sum(numbers, 0, len(numbers)))
+    s = int(0)
+    i = int(0)
+    while (i) < (len(numbers)):
+        s = s + (numbers)[i]
+        i = i + 1
+    return s
+```
+
+We've also worked out how to prove the third invariant. For that we need to prove to the verifier that `Sum(numbers, 0, i + 1)== Sum(numbers, 0, i) + numbers[i]`. For that we need to add the following invariants and asserts:
+```
+Invariant(Forall(int, lambda d_1_p_:
+    (Implies(0 <= d_1_p_ and d_1_p_ < len(numbers), Sum(numbers, 0, d_1_p_ + 1) == Sum(numbers, 0, d_1_p_) + numbers[d_1_p_]), [[Sum(numbers, 0, d_1_p_ + 1)]])))
+Assert(Sum(numbers, 0, i + 1) == Sum(numbers, 0, i) + numbers[i])
+```
+Let's add these lines to the code:
+```
+from typing import cast, List, Dict, Set, Optional, Union
+from nagini_contracts.contracts import *
+
+@Pure
+def Sum(a : List[int], s : int, t : int) -> int :
+    Requires(Acc(list_pred(a)))
+    Requires(((0) <= (s)) and ((s) <= (t)) and ((t) <= (len(a))))
+
+    if s == t:
+        return 0
+    else:
+        return (a)[t - 1] + (Sum(a, s, t - 1))
+
+def sum_loop(numbers: List[int]) -> int:
+    Requires(Acc(list_pred(numbers)))
+    Ensures(Acc(list_pred(numbers)))
+    Ensures(Result() == Sum(numbers, 0, len(numbers)))
+    s = int(0)
+    i = int(0)
+    while (i) < (len(numbers)):
+        Invariant(Acc(list_pred(numbers)))
+        Invariant(0 <= i and i <= len(numbers))
+        Invariant(Forall(int, lambda d_1_p_:
+            (Implies(0 <= d_1_p_ and d_1_p_ < len(numbers), Sum(numbers, 0, d_1_p_ + 1) == Sum(numbers, 0, d_1_p_) + numbers[d_1_p_]), [[Sum(numbers, 0, d_1_p_ + 1)]])))
+        Invariant(s == Sum(numbers, 0, i))
+        Assert(Sum(numbers, 0, i + 1) == Sum(numbers, 0, i) + numbers[i])
+        s = s + (numbers)[i]
+        i = i + 1
+    return s
+```
+
+Other than that, we didn't find any other invariants or patterns that hold during the execution of the program. Therefore, the resulting program is (notice the lack of backticks):
+<answer>
+from typing import cast, List, Dict, Set, Optional, Union
+from nagini_contracts.contracts import *
+
+@Pure
+def Sum(a : List[int], s : int, t : int) -> int :
+    Requires(Acc(list_pred(a)))
+    Requires(((0) <= (s)) and ((s) <= (t)) and ((t) <= (len(a))))
+
+    if s == t:
+        return 0
+    else:
+        return (a)[t - 1] + (Sum(a, s, t - 1))
+
+def sum_loop(numbers: List[int]) -> int:
+    Requires(Acc(list_pred(numbers)))
+    Ensures(Acc(list_pred(numbers)))
+    Ensures(Result() == Sum(numbers, 0, len(numbers)))
+    s = int(0)
+    i = int(0)
+    while (i) < (len(numbers)):
+        Invariant(Acc(list_pred(numbers)))
+        Invariant(0 <= i and i <= len(numbers))
+        Invariant(Forall(int, lambda d_1_p_:
+            (Implies(0 <= d_1_p_ and d_1_p_ < len(numbers), Sum(numbers, 0, d_1_p_ + 1) == Sum(numbers, 0, d_1_p_) + numbers[d_1_p_]), [[Sum(numbers, 0, d_1_p_ + 1)]])))
+        Invariant(s == Sum(numbers, 0, i))
+        Assert(Sum(numbers, 0, i + 1) == Sum(numbers, 0, i) + numbers[i])
+        s = s + (numbers)[i]
+        i = i + 1
+    return s
+</answer>
\ No newline at end of file
diff --git a/prompts/humaneval-nagini-cot/rewrite/examples/001/question.txt b/prompts/humaneval-nagini-cot/rewrite/examples/001/question.txt
new file mode 100644
index 0000000..6b36103
--- /dev/null
+++ b/prompts/humaneval-nagini-cot/rewrite/examples/001/question.txt
@@ -0,0 +1,27 @@
+It's time to rewrite the program to contain the invariants and assertions that we've found. Work on the following program and go step by step through the reasoning process. After that, output the rewritten program in the `<answer>` tag without backticks.
+
+```
+from typing import cast, List, Dict, Set, Optional, Union
+from nagini_contracts.contracts import *
+
+@Pure
+def Sum(a : List[int], s : int, t : int) -> int :
+    Requires(Acc(list_pred(a)))
+    Requires(((0) <= (s)) and ((s) <= (t)) and ((t) <= (len(a))))
+
+    if s == t:
+        return 0
+    else:
+        return (a)[t - 1] + (Sum(a, s, t - 1))
+
+def sum_loop(numbers: List[int]) -> int:
+    Requires(Acc(list_pred(numbers)))
+    Ensures(Acc(list_pred(numbers)))
+    Ensures(Result() == Sum(numbers, 0, len(numbers)))
+    s = int(0)
+    i = int(0)
+    while (i) < (len(numbers)):
+        s = s + (numbers)[i]
+        i = i + 1
+    return s
+```
\ No newline at end of file
diff --git a/prompts/humaneval-nagini-cot/rewrite/question.txt b/prompts/humaneval-nagini-cot/rewrite/question.txt
new file mode 100644
index 0000000..17e9fcc
--- /dev/null
+++ b/prompts/humaneval-nagini-cot/rewrite/question.txt
@@ -0,0 +1,3 @@
+It's time to rewrite the program to contain the invariants and assertions that we've found. Work on the following program and go step by step through the reasoning process. After that, output the rewritten program in the `<answer>` tag without backticks.
+
+{program}
diff --git a/prompts/humaneval-nagini-cot/steps/001/examples/001/answer.txt b/prompts/humaneval-nagini-cot/steps/001/examples/001/answer.txt
new file mode 100644
index 0000000..4d31107
--- /dev/null
+++ b/prompts/humaneval-nagini-cot/steps/001/examples/001/answer.txt
@@ -0,0 +1,7 @@
+Let's go function by function. The first function, `Sum` has @Pure annotation. This means that this function can be probably used in specifications. We can see that the aruguments to this function are a sequence of integers and two indices. We can see that the function returns an integer. We can conclude that the function does some accumulation of integers from the subsequence [l;r) of the given sequence. Looking at the body, we can see that the function takes the first element of the subsequence and adds it to the accumulated result of evaluating the `sum` function on the rest of the subsequence. This means the function is a specification for the sum of the elements of the subsequence.
+
+Finally, we have the `sum_loop` function. This function has a specification, as indicated by the `Requires` and `Ensures` clauses. The specification states that the returned value should be equal to the sum of the input sequence `numbers`, as computed by the `Sum` function, as seen by the `Result() == Sum(numbers, 0, len(numbers))` clause.
+
+In summary:
+1. The `sum` function is used in a specification for computing the sum of a sequence of integers.
+2. The `sum_loop` function has a specification that ensures it correctly computes the sum of a sequence of integers, matching the result of the `Sum` function.
\ No newline at end of file
diff --git a/prompts/humaneval-nagini-cot/steps/001/examples/001/question.txt b/prompts/humaneval-nagini-cot/steps/001/examples/001/question.txt
new file mode 100644
index 0000000..9266bfc
--- /dev/null
+++ b/prompts/humaneval-nagini-cot/steps/001/examples/001/question.txt
@@ -0,0 +1,26 @@
+Given the following program, for each function, determine if it has a specification. If it does not, is it itself a specification? If it is a specification, what does it specify?
+```
+from typing import cast, List, Dict, Set, Optional, Union
+from nagini_contracts.contracts import *
+
+@Pure
+def Sum(a : List[int], s : int, t : int) -> int :
+    Requires(Acc(list_pred(a)))
+    Requires(((0) <= (s)) and ((s) <= (t)) and ((t) <= (len(a))))
+
+    if s == t:
+        return 0
+    else:
+        return (a)[t - 1] + (Sum(a, s, t - 1))
+
+def sum_loop(numbers: List[int]) -> int:
+    Requires(Acc(list_pred(numbers)))
+    Ensures(Acc(list_pred(numbers)))
+    Ensures(Result() == Sum(numbers, 0, len(numbers)))
+    s = int(0)
+    i = int(0)
+    while (i) < (len(numbers)):
+        s = s + (numbers)[i]
+        i = i + 1
+    return s
+```
\ No newline at end of file
diff --git a/prompts/humaneval-nagini-cot/steps/001/question.txt b/prompts/humaneval-nagini-cot/steps/001/question.txt
new file mode 100644
index 0000000..bbf83ca
--- /dev/null
+++ b/prompts/humaneval-nagini-cot/steps/001/question.txt
@@ -0,0 +1,5 @@
+Given the following program, for each function, determine if it has a specification. Determine also functions, that can be used in specifications.
+
+```
+{program}
+```
\ No newline at end of file
diff --git a/prompts/humaneval-nagini-cot/steps/002/examples/001/answer.txt b/prompts/humaneval-nagini-cot/steps/002/examples/001/answer.txt
new file mode 100644
index 0000000..d336822
--- /dev/null
+++ b/prompts/humaneval-nagini-cot/steps/002/examples/001/answer.txt
@@ -0,0 +1,6 @@
+Looking at the functions, we can see that the 'sum' function uses recursion and thus contains no loops.
+Now, let's examine the 'sum_loop' function. This method contains a while loop that iterates through the elements of the 'numbers' list. The loop starts with 's' and 'i' initialized to 0, and continues as long as 'i' is less than the length of 'numbers'. Let's try to understand what 's' and 'i' represent in this context.
+
+Looking at the body of the loop, we can see that it adds the current element of 'numbers' to 's'. This means that 's' is the running sum of the elements in 'numbers', namely 's = Sum(numbers, 0, i)'. 'i' is easier to analyze. It is the index of the current element being processed. In each iteration of the loop, 'i' is incremented by 1, so 'i' represents the index of the next element to be processed.
+
+Each iteration of the loop adds the current element to 's', and then increments 'i' to move to the next element. This process continues until 'i' reaches the length of 'numbers', at which point the loop terminates and 's' holds the final sum of the elements in 'numbers'.
diff --git a/prompts/humaneval-nagini-cot/steps/002/examples/001/question.txt b/prompts/humaneval-nagini-cot/steps/002/examples/001/question.txt
new file mode 100644
index 0000000..a8d91f0
--- /dev/null
+++ b/prompts/humaneval-nagini-cot/steps/002/examples/001/question.txt
@@ -0,0 +1,27 @@
+Now we are finished with the specifications. Let's move onto the loops.
+Given the following program, for each function, determine if it has loops. If it does, try to determine what the loop does in detail. Go through all your thinking and reasoning process step by step.
+```
+from typing import cast, List, Dict, Set, Optional, Union
+from nagini_contracts.contracts import *
+
+@Pure
+def Sum(a : List[int], s : int, t : int) -> int :
+    Requires(Acc(list_pred(a)))
+    Requires(((0) <= (s)) and ((s) <= (t)) and ((t) <= (len(a))))
+
+    if s == t:
+        return 0
+    else:
+        return (a)[t - 1] + (Sum(a, s, t - 1))
+
+def sum_loop(numbers: List[int]) -> int:
+    Requires(Acc(list_pred(numbers)))
+    Ensures(Acc(list_pred(numbers)))
+    Ensures(Result() == Sum(numbers, 0, len(numbers)))
+    s = int(0)
+    i = int(0)
+    while (i) < (len(numbers)):
+        s = s + (numbers)[i]
+        i = i + 1
+    return s
+```
\ No newline at end of file
diff --git a/prompts/humaneval-nagini-cot/steps/002/question.txt b/prompts/humaneval-nagini-cot/steps/002/question.txt
new file mode 100644
index 0000000..f9997f3
--- /dev/null
+++ b/prompts/humaneval-nagini-cot/steps/002/question.txt
@@ -0,0 +1,4 @@
+Given the following program, for each function, determine if it has loops. If it does, try to determine what the loop does in detail. Go through all your thinking and reasoning process step by step.
+```
+{program}
+```
diff --git a/prompts/humaneval-nagini-cot/steps/003/examples/001/answer.txt b/prompts/humaneval-nagini-cot/steps/003/examples/001/answer.txt
new file mode 100644
index 0000000..2f54b07
--- /dev/null
+++ b/prompts/humaneval-nagini-cot/steps/003/examples/001/answer.txt
@@ -0,0 +1,14 @@
+During the previous steps, we've found that the `sum_loop` function has a loop that goes over the elements of the array `numbers` and adds them to the `s` variable, which is equal to the `Sum(numbers, 0, i)` for all `i`, such that `0 <= i and i <= len(numbers)`. Also we need list access to numbers. This means that during the execution of the loop the following invariant holds:
+1. `Acc(list_pred(numbers))`
+2. `0 <= i and i <= len(numbers)`
+3. `s == Sum(numbers, 0, i)`
+
+The third invariant might be a bit difficult to prove, as it adds the last element of the array to the sum, while, as I've said before, the `Sum` function is defined as `Sum(a, 0, i + 1) == Sum(a, 0, i) + a[i]`. However, as we've seen, it's possible to use additional invariants and assertions to prove this invariants holds.
+Let's work out how to use it. One of the ways to prove it is to add the following invariants and assertions:
+```
+Invariant(Forall(int, lambda d_1_p_:
+    (Implies(0 <= d_1_p_ and d_1_p_ < len(numbers), Sum(numbers, 0, d_1_p_ + 1) == Sum(numbers, 0, d_1_p_) + numbers[d_1_p_]), [[Sum(numbers, 0, d_1_p_ + 1)]])))
+Assert(Sum(numbers, 0, i + 1) == Sum(numbers, 0, i) + numbers[i])
+```
+
+In conclusion, we can say that two invariants hold and the third one could be proved using additional invariants and assertions.
diff --git a/prompts/humaneval-nagini-cot/steps/003/examples/001/question.txt b/prompts/humaneval-nagini-cot/steps/003/examples/001/question.txt
new file mode 100644
index 0000000..2e91037
--- /dev/null
+++ b/prompts/humaneval-nagini-cot/steps/003/examples/001/question.txt
@@ -0,0 +1,26 @@
+As we've already analyzed the specifications and the loops, it's time to try to analyze what invariants and patterns hold during the execution of the program. Go over the found loops in the following program in the code and try to find out what invariants and patterns they hold. Go through all your thinking and reasoning process step by step.
+```
+from typing import cast, List, Dict, Set, Optional, Union
+from nagini_contracts.contracts import *
+
+@Pure
+def Sum(a : List[int], s : int, t : int) -> int :
+    Requires(Acc(list_pred(a)))
+    Requires(((0) <= (s)) and ((s) <= (t)) and ((t) <= (len(a))))
+
+    if s == t:
+        return 0
+    else:
+        return (a)[t - 1] + (Sum(a, s, t - 1))
+
+def sum_loop(numbers: List[int]) -> int:
+    Requires(Acc(list_pred(numbers)))
+    Ensures(Acc(list_pred(numbers)))
+    Ensures(Result() == Sum(numbers, 0, len(numbers)))
+    s = int(0)
+    i = int(0)
+    while (i) < (len(numbers)):
+        s = s + (numbers)[i]
+        i = i + 1
+    return s
+```
\ No newline at end of file
diff --git a/prompts/humaneval-nagini-cot/steps/003/question.txt b/prompts/humaneval-nagini-cot/steps/003/question.txt
new file mode 100644
index 0000000..e1d7365
--- /dev/null
+++ b/prompts/humaneval-nagini-cot/steps/003/question.txt
@@ -0,0 +1,4 @@
+As we've already analyzed the specifications and the loops, it's time to try to analyze what invariants and patterns hold during the execution of the program. Go over the found loops in the following program in the code and try to find out what invariants and patterns they hold. Go through all your thinking and reasoning process step by step.
+```
+{program}
+```
diff --git a/prompts/humaneval-nagini-cot/sys.txt b/prompts/humaneval-nagini-cot/sys.txt
new file mode 100644
index 0000000..e9979b1
--- /dev/null
+++ b/prompts/humaneval-nagini-cot/sys.txt
@@ -0,0 +1,4 @@
+You are an expert in a Python verification framework Nagini.
+You will be given tasks dealing with Python programs including precise docstrings and specifications.
+Do not provide ANY explanations. Don't include markdown backticks. Respond only in Python code, nothing else.
+Take into account that arrays inside the invariants are indexed by type `int`.
\ No newline at end of file
diff --git a/prompts/humaneval-nagini-cot/timeout.txt b/prompts/humaneval-nagini-cot/timeout.txt
new file mode 100644
index 0000000..1c50276
--- /dev/null
+++ b/prompts/humaneval-nagini-cot/timeout.txt
@@ -0,0 +1,3 @@
+The verifier timed out during the verification.
+This usually means that the provided invariants were too broad or were difficult to check.
+Could you please try to improve the invariants and try again?
\ No newline at end of file
diff --git a/prompts/humaneval-verus-cot/steps/001/examples/001/answer.txt b/prompts/humaneval-verus-cot/steps/001/examples/001/answer.txt
new file mode 100644
index 0000000..f4ba9eb
--- /dev/null
+++ b/prompts/humaneval-verus-cot/steps/001/examples/001/answer.txt
@@ -0,0 +1,10 @@
+Let's go function by function. The first function, `sum` is a specification as it is declared with the `spec` keyword. It does not have a specification, as can be seen by the absence of either a `requires` or `ensures` clause, which are infact disallowed for `spec` functions. Note also the `decreases` clause indicating this is a recursive function. Let's consider, what it can be step by step. We can see that the aruguments to this function are a sequence of `u32` integers. We can see that the function returns an `int`. We can conclude that the function does some accumulation of integers from the sequence. Looking at the body, we can see that the function takes the first element of the sequence and adds it to the accumulated result of evaluating the `sum` function on the rest of the sequence. This means the function is a specification for the sum of the elements of the sequence.
+
+Next, we have the `sum_prop` lemma. Considering it's specification, we can see that it requires a non-empty sequence of integers, as seen by the `requires s.len() > 0` clause. Looking at the ensures clause, we can see that the lemma ensures that the sum of the sequence (as computed by the `sum` function) is equal to the sum of the elements of the sequence without the last element, plus that last element. This is important to prove, because the sum function drops the first element and thus it is not obvious for the prover that the last element can be dropped in the same way.
+
+Finally, we have the `sum_loop` method. This method has a specification, as indicated by the `ensures` clause. The return value is an `Option` type. The specification states that if `Some` is returned `s.unwrap()` should be equal to the sum of the input sequence `numbers`, as computed by the `sum` function, as seen by `s.unwrap() == sum(numbers@)` in the `ensures` clause.
+
+In summary:
+1. The `sum` function is itself a specification for computing the sum of a sequence of integers.
+2. The `sum_prop` lemma has a specification that proves a property about the `sum` function.
+3. The `sum_loop` method has a specification that ensures it correctly computes the sum of a sequence of integers, matching the result of the `sum` function.
diff --git a/prompts/humaneval-verus-cot/steps/001/examples/001/question.txt b/prompts/humaneval-verus-cot/steps/001/examples/001/question.txt
new file mode 100644
index 0000000..c6cbb88
--- /dev/null
+++ b/prompts/humaneval-verus-cot/steps/001/examples/001/question.txt
@@ -0,0 +1,38 @@
+Given the following program, for each function, determine if it has a specification. If it does not, is it itself a specification? If it is a specification, what does it specify?
+```verus
+spec fn sum(s: Seq<u32>) -> int 
+    decreases s.len()
+{
+    if s.len() == 0 { 0 } else { s[0] + sum(s.subrange(1, s.len() as int)) }
+}
+
+proof fn sum_prop(s: Seq<u32>)
+    requires s.len() > 0,
+    ensures sum(s) == sum(s.subrange(0, s.len() - 1)) + s[s.len() - 1],
+    decreases s.len(),
+{
+    if s.len() > 1 {
+        sum_prop(s.subrange(1, s.len() as int));
+        assert (s.subrange(0, s.len() - 1).drop_first() == s.subrange(1, s.len() as int).drop_last());
+        assert(s[0] + sum(s.subrange(0, s.len() - 1).drop_first()) + s[s.len() - 1] == sum(s));
+    } else {
+        assert (sum(s) == sum(s.subrange(0, s.len() - 1)) + s[s.len() - 1]) by {
+            assert (sum(s) == s[0] + sum(s.subrange(1, s.len() as int)));
+        }
+    }
+}
+
+fn sum_loop(numbers: &Vec<u32>) -> (s: Option<u32>)
+    ensures s.is_some() ==> s.unwrap() == sum(numbers@),
+{
+    let mut s: u32 = 0;
+    let mut i = 0;
+    while i < numbers.len()
+    {
+        s = s.checked_add(numbers[i])?;
+
+        i += 1;
+    }
+    Some(s)
+}
+```
diff --git a/prompts/humaneval-verus-cot/steps/001/question.txt b/prompts/humaneval-verus-cot/steps/001/question.txt
new file mode 100644
index 0000000..77cdba9
--- /dev/null
+++ b/prompts/humaneval-verus-cot/steps/001/question.txt
@@ -0,0 +1,5 @@
+Given the following program, for each function, determine if it has a specification. If it does not, is it itself a specification? If it is a specification, what does it specify?
+
+```verus
+{program}
+```
diff --git a/prompts/humaneval-verus-cot/steps/002/examples/001/answer.txt b/prompts/humaneval-verus-cot/steps/002/examples/001/answer.txt
new file mode 100644
index 0000000..47d09e8
--- /dev/null
+++ b/prompts/humaneval-verus-cot/steps/002/examples/001/answer.txt
@@ -0,0 +1,6 @@
+We know that `spec` and `proof` functions cannot have loops. Since `sum` is a `spec` function and `sum_prop` is a `proof` function, they have no loops.
+Now, let's examine the 'sum_loop' method. This method contains a while loop that iterates through the elements of the 'numbers' sequence. The loop starts with 's' and 'i' initialized to 0, and continues as long as 'i' is less than the length of 'numbers'. Let's try to understand what 's' and 'i' represent in this context.
+
+Looking at the body of the loop, we can see that it adds the current element of 'numbers' to 's'. This means that 's' is the running sum of the elements in 'numbers', namely 's = sum(numbers@.subrange(0, i as int))'. 'i' is easier to analyze. It is the index of the current element being processed. In each iteration of the loop, 'i' is incremented by 1, so 'i' represents the index of the next element to be processed.
+
+Each iteration of the loop adds the current element to 's' after checking for overflow. If there is overflow `None` is returned using the shorthand `?` operator. Then 'i' is incremented to move to the next element. This process continues until 'i' reaches the length of 'numbers', at which point the loop terminates and 's' holds the final sum of the elements in 'numbers'.
diff --git a/prompts/humaneval-verus-cot/steps/002/examples/001/question.txt b/prompts/humaneval-verus-cot/steps/002/examples/001/question.txt
new file mode 100644
index 0000000..0c2e155
--- /dev/null
+++ b/prompts/humaneval-verus-cot/steps/002/examples/001/question.txt
@@ -0,0 +1,39 @@
+Now we are finished with the specifications. Let's move onto the loops.
+Given the following program, for each function, determine if it has loops. If it does, try to determine what the loop does in detail. Go through all your thinking and reasoning process step by step.
+```verus
+spec fn sum(s: Seq<u32>) -> int 
+    decreases s.len()
+{
+    if s.len() == 0 { 0 } else { s[0] + sum(s.subrange(1, s.len() as int)) }
+}
+
+proof fn sum_prop(s: Seq<u32>)
+    requires s.len() > 0,
+    ensures sum(s) == sum(s.subrange(0, s.len() - 1)) + s[s.len() - 1],
+    decreases s.len(),
+{
+    if s.len() > 1 {
+        sum_prop(s.subrange(1, s.len() as int));
+        assert (s.subrange(0, s.len() - 1).drop_first() == s.subrange(1, s.len() as int).drop_last());
+        assert(s[0] + sum(s.subrange(0, s.len() - 1).drop_first()) + s[s.len() - 1] == sum(s));
+    } else {
+        assert (sum(s) == sum(s.subrange(0, s.len() - 1)) + s[s.len() - 1]) by {
+            assert (sum(s) == s[0] + sum(s.subrange(1, s.len() as int)));
+        }
+    }
+}
+
+fn sum_loop(numbers: &Vec<u32>) -> (s: Option<u32>)
+    ensures s.is_some() ==> s.unwrap() == sum(numbers@),
+{
+    let mut s: u32 = 0;
+    let mut i = 0;
+    while i < numbers.len()
+    {
+        s = s.checked_add(numbers[i])?;
+
+        i += 1;
+    }
+    Some(s)
+}
+```
diff --git a/prompts/humaneval-verus-cot/steps/002/question.txt b/prompts/humaneval-verus-cot/steps/002/question.txt
new file mode 100644
index 0000000..83ec0a7
--- /dev/null
+++ b/prompts/humaneval-verus-cot/steps/002/question.txt
@@ -0,0 +1,4 @@
+Given the following program, for each function, determine if it has loops. If it does, try to determine what the loop does in detail. Go through all your thinking and reasoning process step by step.
+```verus
+{program}
+```
diff --git a/prompts/humaneval-verus-cot/steps/003/examples/001/answer.txt b/prompts/humaneval-verus-cot/steps/003/examples/001/answer.txt
new file mode 100644
index 0000000..3520792
--- /dev/null
+++ b/prompts/humaneval-verus-cot/steps/003/examples/001/answer.txt
@@ -0,0 +1,22 @@
+As we've seen, the loop in the `sum_loop` function goes over the elements of the array `numbers` and adds them to the `s` variable, which is equal to the `sum(numbers@.subrange(0, i as int))` for all `i`, such that `0 <= i <= numbers@.len()`. This means that during the execution of the loop the following invariant holds:
+1. `0 <= i <= numbers@.len()`
+2. `s == sum(numbers@.take(i as int))`
+            
+The second invariant might be a bit difficult to prove, as it adds the last element of the array to the sum, while, as I've said before, the `sum` function is defined as `sum(a) == a[0] + sum(a.subrange(1, a.len() - 1))`. However, as we've seen, it's possible to use the `sum_prop` function to prove that this invariant holds.
+Let's work out how to use it. If applied to the `numbers@.subrange(0, i + 1)`, we get:
+`sum(numbers@.subrange(0, i + 1)) == sum(numbers@.subrange(0, i + 1).subrange(0, i as int)) + numbers@[i as int]`. It might be difficult for verus to prove that `sum(numbers@.subrange(0, i + 1).subrange(0, i as int)) == sum(numbers@.subrange(0, i as int))`, so I'd suggest adding an assertion before calling `sum_prop` to make sure that this is the case, namely `assert (numbers@.subrange(0, i + 1).subrange(0, i as int) == numbers@.subrange(0, i as int))`.
+A lemma can only be used inside a proof block and can also add the assertions in this block. So, to use the `sum_prop`, I'd suggest the following code:
+```
+proof {
+    assert (numbers@.subrange(0, i + 1).subrange(0, i as int) == numbers@.subrange(0, i as int));
+    sum_prop(numbers@.subrange(0, i + 1));
+    assert(sum(numbers@.subrange(0, i + 1)) == sum(numbers@.subrange(0, i + 1).subrange(0, i as int)) + numbers@[i as int]);
+}
+```
+
+After the end of the loop we know that `i == numbers@.len()` from the first invariant and loop condition `i < numbers.len()` being false. Applying this to the second invaraint we deduce `s == sum(numbers@.subrange(0, numbers@.len() as int)). But in order to prove the postcondition, we need to let verus know that this `numbers@.subrange(0, numbers@.len() as int)` is the entire sequence `numbers@`. So i suggest adding this after the end of the loop
+```
+assert(numbers@.subrange(0, numbers.len() as int) == numbers@);
+```
+
+In conclusion, we can say that two invariants hold and the second one could be proved using the `sum_prop` function.
diff --git a/prompts/humaneval-verus-cot/steps/003/examples/001/question.txt b/prompts/humaneval-verus-cot/steps/003/examples/001/question.txt
new file mode 100644
index 0000000..ce7bb1f
--- /dev/null
+++ b/prompts/humaneval-verus-cot/steps/003/examples/001/question.txt
@@ -0,0 +1,38 @@
+As we've already analyzed the specifications and the loops, it's time to try to analyze what invariants and patterns hold during the execution of the program. Go over the found loops in the following program in the code and try to find out what invariants and patterns they hold. Go through all your thinking and reasoning process step by step.
+```verus
+spec fn sum(s: Seq<u32>) -> int 
+    decreases s.len()
+{
+    if s.len() == 0 { 0 } else { s[0] + sum(s.subrange(1, s.len() as int)) }
+}
+
+proof fn sum_prop(s: Seq<u32>)
+    requires s.len() > 0,
+    ensures sum(s) == sum(s.subrange(0, s.len() - 1)) + s[s.len() - 1],
+    decreases s.len(),
+{
+    if s.len() > 1 {
+        sum_prop(s.subrange(1, s.len() as int));
+        assert (s.subrange(0, s.len() - 1).drop_first() == s.subrange(1, s.len() as int).drop_last());
+        assert(s[0] + sum(s.subrange(0, s.len() - 1).drop_first()) + s[s.len() - 1] == sum(s));
+    } else {
+        assert (sum(s) == sum(s.subrange(0, s.len() - 1)) + s[s.len() - 1]) by {
+            assert (sum(s) == s[0] + sum(s.subrange(1, s.len() as int)));
+        }
+    }
+}
+
+fn sum_loop(numbers: &Vec<u32>) -> (s: Option<u32>)
+    ensures s.is_some() ==> s.unwrap() == sum(numbers@),
+{
+    let mut s: u32 = 0;
+    let mut i = 0;
+    while i < numbers.len()
+    {
+        s = s.checked_add(numbers[i])?;
+
+        i += 1;
+    }
+    Some(s)
+}
+```
diff --git a/prompts/humaneval-verus-cot/steps/003/question.txt b/prompts/humaneval-verus-cot/steps/003/question.txt
new file mode 100644
index 0000000..fa0167e
--- /dev/null
+++ b/prompts/humaneval-verus-cot/steps/003/question.txt
@@ -0,0 +1,4 @@
+As we've already analyzed the specifications and the loops, it's time to try to analyze what invariants and patterns hold during the execution of the program. Go over the found loops in the following program in the code and try to find out what invariants and patterns they hold. Go through all your thinking and reasoning process step by step.
+```verus
+{program}
+```
diff --git a/pyproject.toml b/pyproject.toml
index c8c4ad0..17c475f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,6 +15,7 @@ src_paths = ["verified_cogen"]
 
 [tool.pyright]
 typeCheckingMode = "strict"
+pythonVersion = "3.9"
 
 [[tool.poetry.source]]
 name = "PyPI"
diff --git a/results/tries_HumanEval-Dafny_1.json b/results/tries_HumanEval-Dafny_1.json
new file mode 100644
index 0000000..21c5734
--- /dev/null
+++ b/results/tries_HumanEval-Dafny_1.json
@@ -0,0 +1,113 @@
+{
+  "002-truncate.dfy": 0,
+  "000-has_close_elements.dfy": 1,
+  "003-below_zero.dfy": 1,
+  "005-intersperse.dfy": 4,
+  "004-mean_absolute_derivation.dfy": 4,
+  "006-parse_nested_parens.dfy": 4,
+  "009-rolling_max.dfy": 6,
+  "011-string_xor.dfy": 1,
+  "012-longest.dfy": 5,
+  "013-greatest_common_divisor.dfy": 3,
+  "014-all_prefixes.dfy": 1,
+  "016-count_distinct_characters.dfy": 6,
+  "021-rescale_to_unit.dfy": 3,
+  "023-strlen.dfy": 0,
+  "024-largest-divisor.dfy": 1,
+  "025-factorize.dfy": 3,
+  "026-remove_duplicates.dfy": 7,
+  "027-flip_case.dfy": 0,
+  "029-filter_by_prefix.dfy": 1,
+  "031-is-prime.dfy": 1,
+  "034-unique.dfy": 6,
+  "035-max-element.dfy": 2,
+  "036-fizz_buzz.dfy": 5,
+  "040-triples-sum-to-zero.dfy": 3,
+  "041-car_race_collision.dfy": 0,
+  "042-incr-list.dfy": 1,
+  "045-triangle_area.dfy": 0,
+  "046-fib4.dfy": 1,
+  "048-is-palindrome.dfy": 3,
+  "049-modp.dfy": 1,
+  "050-encode_shift.dfy": 1,
+  "051-remove-vowels.dfy": 1,
+  "052-below-threshold.dfy": 1,
+  "053-add.dfy": 0,
+  "054-same-chars.dfy": 0,
+  "055-fib.dfy": 1,
+  "057-monotonic.dfy": 1,
+  "059-largest-prime-factor.dfy": 1,
+  "060-sum_to_n.dfy": 0,
+  "062-derivative.dfy": 1,
+  "063-fibfib.dfy": 1,
+  "064-vowel_count.dfy": 2,
+  "065-circular_shift.dfy": 1,
+  "066-digitSum.dfy": 4,
+  "068-pluck.dfy": 3,
+  "069-search.dfy": 1,
+  "072-will_it_fly.dfy": 6,
+  "074-total_match.dfy": 2,
+  "075-is_multiply_prime.dfy": 5,
+  "077-is_cube.dfy": 1,
+  "078-hex_key.dfy": 2,
+  "079-decimal_to_binary.dfy": 0,
+  "080-is_happy.dfy": 1,
+  "081-numerical_letter_grade.dfy": 1,
+  "082-prime_length.dfy": 0,
+  "083-starts_one_ends.dfy": 0,
+  "084-solve.dfy": 2,
+  "085-add.dfy": 2,
+  "088-sort_array.dfy": 2,
+  "089-encrypt.dfy": 1,
+  "092-any_int.dfy": 0,
+  "093-encode.dfy": 2,
+  "094-skjkasdkd.dfy": 5,
+  "096-count_up_to.dfy": 2,
+  "097-multiply.dfy": 0,
+  "098-count_upper.dfy": 3,
+  "100-make_a_pile.dfy": 1,
+  "102-choose_num.dfy": 1,
+  "104-unique_digits.dfy": 5,
+  "106-f.dfy": 1,
+  "108-count_nums.dfy": 2,
+  "110-exchange.dfy": 0,
+  "115-max_fill.dfy": 6,
+  "116-sort_array.dfy": 6,
+  "121-solution.dfy": 10,
+  "122-add_elements.dfy": 2,
+  "123-get_odd_collatz.dfy": 10,
+  "127-intersection.dfy": 0,
+  "128-prod_signs.dfy": 2,
+  "130-tri.dfy": 1,
+  "133-sum_squares.dfy": 2,
+  "134-check_if_last_char_is_a_letter.dfy": 0,
+  "135_can_arrange.dfy": 1,
+  "138_is_equal_to_sum_even.dfy": 0,
+  "139-special_factorial.dfy": 2,
+  "142-sum_squares.dfy": 6,
+  "145-order_by_points.dfy": 2,
+  "146_specialFilter.dfy": 2,
+  "148-bf.dfy": 0,
+  "150-x_or_y.dfy": 0,
+  "151-double_the_difference.dfy": 3,
+  "152-compare.dfy": 1,
+  "153-Strongest_Extension.dfy": 1,
+  "154-cycpattern_check.dfy": 1,
+  "157-right_angle_triangle.dfy": 0,
+  "158-find_max.dfy": 1,
+  "159-eat.dfy": 0,
+  "161-solve.dfy": 1,
+  "010-is_palindrome.dfy": 3,
+  "033-sort_third.dfy": 6,
+  "043-pairs-sum-to-zero.dfy": 1,
+  "058-common.dfy": 1,
+  "007-filter_by_substring.dfy": 8,
+  "076-is_simple_power.dfy": 8,
+  "163-generate_integers.dfy": 9,
+  "008-sum_product.dfy": 1,
+  "070-strange_sort_list.dfy": 2,
+  "107-even_odd_palindrome.dfy": 3,
+  "112-reverse_delete.dfy": 2,
+  "136-largest_smallest_integers.dfy": 2,
+  "155_even_odd_count.dfy": 4
+}
\ No newline at end of file
diff --git a/results/tries_HumanEval-RustBench_1.json b/results/tries_HumanEval-RustBench_1.json
new file mode 100644
index 0000000..0584721
--- /dev/null
+++ b/results/tries_HumanEval-RustBench_1.json
@@ -0,0 +1,34 @@
+{
+  "014-all_prefixes.rs": 4,
+  "023-strlen.rs": 0,
+  "035-max-element.rs": 1,
+  "052-below-threshold.rs": 2,
+  "053-add.rs": 0,
+  "057-monotonic.rs": 1,
+  "062-derivative.rs": 2,
+  "abs.rs": 0,
+  "arithmetic_weird.rs": 1,
+  "array_concat.rs": 1,
+  "array_copy.rs": 2,
+  "array_product.rs": 1,
+  "barrier.rs": 6,
+  "binary_search_recursive.rs": 0,
+  "has_close_elements.rs": 2,
+  "has_only_one_distinct_element.rs": 6,
+  "integer_square_root.rs": 8,
+  "intersperse.rs": 1,
+  "is_non_prime.rs": 2,
+  "is_sorted.rs": 6,
+  "largest_prime_factor.rs": 6,
+  "last_position.rs": 5,
+  "max_array.rs": 1,
+  "max_dafny_lsp.rs": 4,
+  "remove_element.rs": 1,
+  "remove_elements.rs": 9,
+  "replace.rs": 1,
+  "reverse.rs": 2,
+  "smallest_list_length.rs": 2,
+  "two_way_sort.rs": 1,
+  "unique.rs": 3,
+  "unique_better.rs": 3
+}
\ No newline at end of file
diff --git a/scripts/generate_result_graph.py b/scripts/generate_result_graph.py
index a3bab94..4e76f95 100644
--- a/scripts/generate_result_graph.py
+++ b/scripts/generate_result_graph.py
@@ -17,9 +17,14 @@
 for t in tries:
     cnt[t] += 1
 cnt = list(accumulate(cnt))
-cnt = [c / file_cnt for c in cnt]
-plt.plot(cnt)
-plt.show()
+cnt = [100 * (c / file_cnt) for c in cnt]
+
+fig, ax = plt.subplots()  # type: ignore
+plt.title("Percentage of files verified")  # type: ignore
+plt.xlabel("Number of tries")  # type: ignore
+plt.ylabel("Percentage of files")  # type: ignore
+ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f"{x}%"))  # type: ignore
+ax.plot(cnt)  # type: ignore
 
 # %%
 f"{cnt[-1] * 100}% of the files were successfully verified"
diff --git a/tests/test_dafny.py b/tests/test_dafny.py
index a8809df..cc914c9 100644
--- a/tests/test_dafny.py
+++ b/tests/test_dafny.py
@@ -21,7 +21,32 @@ def test_dafny_generate():
         method main_valid(value: int) returns (result: int)
             requires value >= 10
             ensures result >= 20
-        { var ret := main(value); return ret; }
+        { var ret0 := main(value); return ret0; }
+        """
+    )
+
+
+def test_dafny_generate_multiple_returns():
+    dafny_lang = LanguageDatabase().get("dafny")
+    code = dedent(
+        """\
+        method main(value: int) returns (result: int, result2: int)
+            requires value >= 10
+            ensures result >= 20
+            ensures result2 >= 30
+        {
+            assert value * 2 >= 20; // assert-line
+            result := value * 2;
+            result2 := value * 3;
+        }"""
+    )
+    assert dafny_lang.generate_validators(code) == dedent(
+        """\
+        method main_valid(value: int) returns (result: int, result2: int)
+            requires value >= 10
+            ensures result >= 20
+            ensures result2 >= 30
+        { var ret0, ret1 := main(value); return ret0, ret1; }
         """
     )
 
diff --git a/tests/test_step.py b/tests/test_step.py
new file mode 100644
index 0000000..bfa9942
--- /dev/null
+++ b/tests/test_step.py
@@ -0,0 +1,8 @@
+from verified_cogen.runners.chain_of_thought import Step
+import pathlib
+
+
+def test_step_file():
+    step = Step(pathlib.Path("prompts/humaneval-dafny-cot/steps/001"))
+    assert "{program}" in step.question
+    assert len(step.examples) == 1
diff --git a/verified_cogen/args.py b/verified_cogen/args.py
index 22a604c..d4fbb0b 100644
--- a/verified_cogen/args.py
+++ b/verified_cogen/args.py
@@ -61,7 +61,7 @@ def get_default_parser():
     )
     parser.add_argument(
         "--bench-type",
-        help="benchmark type, available: {invariants, generic, generate, validating}",
+        help="benchmark type, available: {invariants, generic, generate, validating, step-by-step}",
         default="invariants",
     )
     parser.add_argument("--temperature", help="model temperature", default=0, type=int)
diff --git a/verified_cogen/experiments/incremental_run.py b/verified_cogen/experiments/incremental_run.py
index dcdfb73..97dbeb0 100644
--- a/verified_cogen/experiments/incremental_run.py
+++ b/verified_cogen/experiments/incremental_run.py
@@ -1,28 +1,24 @@
 import logging
 import pathlib
-import sys
 import json
 
+from pathlib import Path
 from verified_cogen.llm.llm import LLM
 from verified_cogen.args import get_args
-from verified_cogen.tools import rename_file, ext_glob, extension_from_file_list
-from verified_cogen.runners.invariants import InvariantRunner
+from verified_cogen.tools import (
+    rename_file,
+    ext_glob,
+    extension_from_file_list,
+    register_output_handler,
+)
 from verified_cogen.runners.languages import register_basic_languages
-from verified_cogen.runners.languages.language import LanguageDatabase
-from verified_cogen.runners.validating import ValidatingRunner
 from verified_cogen.tools.modes import Mode
 from verified_cogen.tools.verifier import Verifier
+from verified_cogen.main import make_runner_cls
 
 logger = logging.getLogger(__name__)
 
 
-def register_output_handler():
-    handler = logging.StreamHandler(sys.stdout)
-    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
-    handler.setFormatter(formatter)
-    logger.addHandler(handler)
-
-
 def main():
     register_basic_languages()
 
@@ -30,12 +26,11 @@ def main():
     mode = Mode(args.insert_conditions_mode)
     assert mode != Mode.REGEX
     assert args.dir is not None
-    assert args.bench_type == "validating", args.bench_type
     assert args.runs == 1
     assert args.retries == 0
 
     if args.output_logging:
-        register_output_handler()
+        register_output_handler(logger)
 
     directory = pathlib.Path(args.dir)
     log_tries = pathlib.Path(args.log_tries) if args.log_tries is not None else None
@@ -52,7 +47,6 @@ def main():
     assert len(files) > 0, "No files found in the directory"
     files.sort()
 
-    language = LanguageDatabase().get(extension_from_file_list(files))
     verifier = Verifier(args.shell, args.verifier_command)
 
     for file in files:
@@ -62,11 +56,9 @@ def main():
             args.prompts_directory,
             args.temperature,
         )
-        runner = ValidatingRunner(
-            wrapping=InvariantRunner(llm, logger, verifier),
-            language=language,
-            log_tries=log_tries,
-        )
+        runner = make_runner_cls(
+            args.bench_type, extension_from_file_list([file]), log_tries
+        )(llm, logger, verifier)
         display_name = rename_file(file)
         marker_name = str(file.relative_to(directory))
         if marker_name in results and isinstance(results[marker_name], int):
@@ -88,6 +80,8 @@ def main():
         with open(json_results, "w") as f:
             json.dump(results, f, indent=2)
 
+        llm.dump_history(Path("history") / f"{file.stem}.txt")
+
 
 if __name__ == "__main__":
     main()
diff --git a/verified_cogen/experiments/use_houdini.py b/verified_cogen/experiments/use_houdini.py
index e955d8c..4aec06b 100644
--- a/verified_cogen/experiments/use_houdini.py
+++ b/verified_cogen/experiments/use_houdini.py
@@ -125,10 +125,10 @@ def collect_invariants(args: ProgramArgs, prg: str) -> list[str]:
             temperature=temperature,
         )
 
-        llm.user_prompts.append(
+        llm.add_user_prompt(
             INVARIANTS_JSON_PROMPT.replace("{program}", prg).replace("{function}", func)
         )
-        response = llm._make_request()  # type: ignore
+        response = llm.make_request()
         try:
             invariants = json.loads(response)
             result_invariants.extend(invariants)
@@ -143,8 +143,8 @@ def collect_invariants(args: ProgramArgs, prg: str) -> list[str]:
 def remove_failed_invariants(
     llm: LLM, invariants: list[str], err: str
 ) -> Optional[list[str]]:
-    llm.user_prompts.append(REMOVE_FAILED_INVARIANTS_PROMPT.format(error=err))
-    response = llm._make_request()  # type: ignore
+    llm.add_user_prompt(REMOVE_FAILED_INVARIANTS_PROMPT.format(error=err))
+    response = llm.make_request()
     try:
         new_invariants = json.loads(response)
         log.debug("REMOVED: {}".format(set(invariants).difference(set(new_invariants))))
diff --git a/verified_cogen/llm/llm.py b/verified_cogen/llm/llm.py
index c5cc99b..4341b30 100644
--- a/verified_cogen/llm/llm.py
+++ b/verified_cogen/llm/llm.py
@@ -1,11 +1,16 @@
 import logging
 from http.client import RemoteDisconnected
+from pathlib import Path
 from typing import Optional
 
 from grazie.api.client.chat.prompt import ChatPrompt
 from grazie.api.client.chat.response import ChatResponse
 from grazie.api.client.endpoints import GrazieApiGatewayUrls
-from grazie.api.client.gateway import AuthType, GrazieApiGatewayClient
+from grazie.api.client.gateway import (
+    AuthType,
+    GrazieApiGatewayClient,
+    RequestFailedException,
+)
 from grazie.api.client.llm_parameters import LLMParameters
 from grazie.api.client.parameters import Parameters
 from grazie.api.client.profiles import Profile
@@ -34,13 +39,37 @@ def __init__(
         self.prompt_dir = prompt_dir
         self.is_gpt = "gpt" in self.profile.name
         self.user_prompts: list[str] = []
+        self.is_user_prompt_temporary: list[bool] = []
         self.responses: list[str] = []
+        self.is_response_temporary: list[bool] = []
         self.had_errors = False
         self.temperature = temperature
         self.system_prompt = (
             system_prompt if system_prompt else prompts.sys_prompt(self.prompt_dir)
         )
 
+    def add_user_prompt(self, prompt: str, temporary: bool = False):
+        self.user_prompts.append(prompt)
+        self.is_user_prompt_temporary.append(temporary)
+
+    def add_response(self, response: str, temporary: bool = False):
+        self.responses.append(response)
+        self.is_response_temporary.append(temporary)
+
+    def wipe_temporary(self):
+        self.user_prompts = [
+            prompt
+            for prompt, temporary in zip(
+                self.user_prompts, self.is_user_prompt_temporary
+            )
+            if not temporary
+        ]
+        self.responses = [
+            response
+            for response, temporary in zip(self.responses, self.is_response_temporary)
+            if not temporary
+        ]
+
     def _request(
         self, temperature: Optional[float] = None, tries: int = 5
     ) -> ChatResponse:
@@ -72,30 +101,33 @@ def _request(
         except RemoteDisconnected:
             logger.warning("Grazie API is down, retrying...")
             return self._request(temperature, tries - 1)
+        except RequestFailedException as e:
+            self.dump_history(Path("err_dump.txt"))
+            raise e
 
-    def _make_request(self) -> str:
+    def make_request(self) -> str:
         response = self._request().content
-        self.responses.append(response)
+        self.add_response(response)
         return extract_code_from_llm_output(response)
 
     def produce(self, prg: str) -> str:
-        self.user_prompts.append(
+        self.add_user_prompt(
             prompts.produce_prompt(self.prompt_dir).format(program=prg)
         )
-        return self._make_request()
+        return self.make_request()
 
     def add(self, prg: str, checks: str, function: Optional[str] = None) -> str:
         prompt = prompts.add_prompt(self.prompt_dir).format(program=prg, checks=checks)
         if "{function}" in prompt and function is not None:
             prompt = prompt.replace("{function}", function)
-        self.user_prompts.append(prompt)
-        return self._make_request()
+        self.add_user_prompt(prompt, False)
+        return self.make_request()
 
     def rewrite(self, prg: str) -> str:
-        self.user_prompts.append(
+        self.add_user_prompt(
             prompts.rewrite_prompt(self.prompt_dir).replace("{program}", prg)
         )
-        return self._make_request()
+        return self.make_request()
 
     def ask_for_fixed(self, err: str) -> str:
         prompt = (
@@ -103,9 +135,24 @@ def ask_for_fixed(self, err: str) -> str:
             if self.had_errors
             else prompts.ask_for_fixed_prompt(self.prompt_dir)
         )
-        self.user_prompts.append(prompt.format(error=err))
-        return self._make_request()
+        self.add_user_prompt(prompt.format(error=err))
+        return self.make_request()
 
     def ask_for_timeout(self) -> str:
-        self.user_prompts.append(prompts.ask_for_timeout_prompt(self.prompt_dir))
-        return self._make_request()
+        self.add_user_prompt(prompts.ask_for_timeout_prompt(self.prompt_dir))
+        return self.make_request()
+
+    def dump_history(self, file: Path):
+        with open(file, "w") as f:
+            current_prompt_user = 0
+            current_response = 0
+            while current_prompt_user < len(
+                self.user_prompts
+            ) or current_response < len(self.responses):
+                if current_prompt_user < len(self.user_prompts):
+                    user_prompt = self.user_prompts[current_prompt_user]
+                    f.write(f"User: {user_prompt}\n\n")
+                    current_prompt_user += 1
+                if current_response < len(self.responses):
+                    f.write(f"LLM: {self.responses[current_response]}\n\n")
+                    current_response += 1
diff --git a/verified_cogen/main.py b/verified_cogen/main.py
index 0709b63..32e1d50 100644
--- a/verified_cogen/main.py
+++ b/verified_cogen/main.py
@@ -9,6 +9,7 @@
 from verified_cogen.runners.languages import register_basic_languages
 from verified_cogen.runners.languages.language import LanguageDatabase
 from verified_cogen.runners.validating import ValidatingRunner
+from verified_cogen.runners.step_by_step import StepByStepRunner
 from verified_cogen.tools import (
     ext_glob,
     pprint_stat,
@@ -16,6 +17,7 @@
     tabulate_list,
     extension_from_file_list,
     get_cache_dir,
+    register_output_handler,
 )
 from verified_cogen.tools.modes import Mode
 from verified_cogen.tools.verifier import Verifier
@@ -42,15 +44,15 @@ def run_once(
 
     for file in files:
         llm = LLM(
-            args.grazie_token,  # type: ignore
-            args.llm_profile,  # type: ignore
-            args.prompts_directory,  # type: ignore
-            args.temperature,  # type: ignore
+            args.grazie_token,
+            args.llm_profile,
+            args.prompts_directory,
+            args.temperature,
         )
 
         runner = runner_cls(llm, logger, verifier)
 
-        retries = args.retries + 1  # type: ignore
+        retries = args.retries + 1
         tries = None
         while retries > 0 and tries is None:
             tries = runner.run_on_file(mode, args.tries, str(file))
@@ -70,6 +72,8 @@ def run_once(
         if tries is not None:
             cnt[name] = tries
 
+        llm.dump_history(Path(get_cache_dir()) / "history" / f"{name}.txt")
+
     if is_once:
         if args.output_style == "full":
             success_zero_tries_tabbed = tabulate_list(success_zero_tries)
@@ -95,20 +99,24 @@ def make_runner_cls(
     bench_type: str, extension: str, log_tries: Optional[pathlib.Path]
 ) -> Callable[[LLM, Logger, Verifier], Runner]:
     def runner_cls(llm: LLM, logger: Logger, verifier: Verifier):
-        match bench_type:
-            case "invariants":
-                return InvariantRunner(llm, logger, verifier, log_tries)
-            case "generic":
-                return GenericRunner(llm, logger, verifier, log_tries)
-            case "generate":
-                return GenerateRunner(llm, logger, verifier, log_tries)
-            case "validating":
-                return ValidatingRunner(
-                    InvariantRunner(llm, logger, verifier, log_tries),
-                    LanguageDatabase().get(extension),
-                )
-            case _:
-                raise ValueError(f"Unexpected bench_type: {bench_type}")
+        if bench_type == "invariants":
+            return InvariantRunner(llm, logger, verifier, log_tries)
+        elif bench_type == "generic":
+            return GenericRunner(llm, logger, verifier, log_tries)
+        elif bench_type == "generate":
+            return GenerateRunner(llm, logger, verifier, log_tries)
+        elif bench_type == "validating":
+            return ValidatingRunner(
+                InvariantRunner(llm, logger, verifier, log_tries),
+                LanguageDatabase().get(extension),
+            )
+        elif bench_type == "step-by-step":
+            return ValidatingRunner(
+                StepByStepRunner(InvariantRunner(llm, logger, verifier, log_tries)),
+                LanguageDatabase().get(extension),
+            )
+        else:
+            raise ValueError(f"Unexpected bench_type: {bench_type}")
 
     return runner_cls
 
@@ -125,6 +133,9 @@ def main():
         if args.bench_type == "generic":
             raise ValueError("Regex mode only works with invariants")
 
+    if args.output_logging:
+        register_output_handler(logger)
+
     if args.input is None and args.dir is None:
         args.input = input("Input file: ").strip()
     log_tries = pathlib.Path(args.log_tries) if args.log_tries is not None else None
@@ -200,3 +211,5 @@ def main():
             print("Verified with modification on try", tries)
         else:
             print("Failed to verify")
+
+        llm.dump_history(Path("dump.txt"))
diff --git a/verified_cogen/runners/__init__.py b/verified_cogen/runners/__init__.py
index 78b818e..01e6dbc 100644
--- a/verified_cogen/runners/__init__.py
+++ b/verified_cogen/runners/__init__.py
@@ -15,6 +15,7 @@ class Runner:
     logger: Logger
     verifier: Verifier
     log_tries: Optional[pathlib.Path]
+    starting_prg: Optional[str] = None
 
     def __init__(
         self,
@@ -56,6 +57,9 @@ def precheck(self, prg: str, mode: Mode):
     def preprocess(self, prg: str, mode: Mode) -> str:
         return prg
 
+    def postprocess(self, inv_prg: str) -> str:
+        return inv_prg
+
     def invoke(self, prg: str, mode: Mode) -> str:
         self.logger.info("Invoking LLM")
         if mode == Mode.LLM_SINGLE_STEP:
@@ -97,7 +101,7 @@ def try_fixing(
                 self.logger.info("Verification timed out")
                 tries -= 1
                 if tries > 0:
-                    inv_prg = self.ask_for_timeout()
+                    inv_prg = self.postprocess(self.ask_for_timeout())
             else:
                 verified_inv, out_inv, err_inv = verification_result
                 if verified_inv:
@@ -109,7 +113,9 @@ def try_fixing(
                     tries -= 1
                     if tries > 0:
                         self.logger.info(f"Retrying with {tries} tries left...")
-                        inv_prg = self.ask_for_fixed(out_inv + err_inv)
+                        inv_prg = self.postprocess(
+                            self.ask_for_fixed(out_inv + err_inv)
+                        )
         return None
 
     def run_on_file(
@@ -124,11 +130,13 @@ def run_on_file(
         with open(file, "r") as f:
             prg = self.preprocess(f.read(), mode)
 
+        self.starting_prg = prg
+
         verification_result = self.verify_program(name, 0, prg)
         if verification_result is not None and verification_result[0]:
             return 0
         elif verification_result is None:
             self.logger.info("Verification timed out")
         self.precheck(prg, mode)
-        inv_prg = self.invoke(prg, mode)
+        inv_prg = self.postprocess(self.invoke(prg, mode))
         return self.try_fixing(total_tries, inv_prg, name)
diff --git a/verified_cogen/runners/chain_of_thought/__init__.py b/verified_cogen/runners/chain_of_thought/__init__.py
new file mode 100644
index 0000000..3977110
--- /dev/null
+++ b/verified_cogen/runners/chain_of_thought/__init__.py
@@ -0,0 +1,3 @@
+from .step import Step as Step
+
+__ALL__ = ["Step"]
diff --git a/verified_cogen/runners/chain_of_thought/step.py b/verified_cogen/runners/chain_of_thought/step.py
new file mode 100644
index 0000000..f49ec4f
--- /dev/null
+++ b/verified_cogen/runners/chain_of_thought/step.py
@@ -0,0 +1,45 @@
+import pathlib
+
+
+class Substep:
+    question: str
+    answer: str
+
+    def __init__(self, question: str, answer: str):
+        self.question = question
+        self.answer = answer
+
+    def __repr__(self) -> str:
+        return f"Substep(question={self.question}, answer={self.answer})"
+
+    def __eq__(self, value: object, /) -> bool:
+        if not isinstance(value, Substep):
+            return False
+        return self.question == value.question and self.answer == value.answer
+
+
+class Step:
+    examples: list[Substep]
+    question: str
+
+    def __init__(self, dir: pathlib.Path):
+        self.examples = []
+        for substep in sorted((dir / "examples").iterdir()):
+            assert substep.is_dir()
+            self.examples.append(
+                Substep(
+                    (substep / "question.txt").read_text(),
+                    (substep / "answer.txt").read_text(),
+                )
+            )
+
+        self.question = (dir / "question.txt").read_text()
+
+    def __repr__(self) -> str:
+        return f"Step(question={self.question}, substeps={self.examples})"
+
+    def __str__(self) -> str:
+        result = ""
+        for substep in self.examples:
+            result += f"===== QUESTION_EXAMPLE =====\n{substep.question}\n===== ANSWER =====\n{substep.answer}\n"
+        return result + f"===== QUESTION =====\n{self.question}\n"
diff --git a/verified_cogen/runners/languages/dafny.py b/verified_cogen/runners/languages/dafny.py
index 7991632..c2958bd 100644
--- a/verified_cogen/runners/languages/dafny.py
+++ b/verified_cogen/runners/languages/dafny.py
@@ -5,7 +5,7 @@
 
 DAFNY_VALIDATOR_TEMPLATE = """\
 method {method_name}_valid({parameters}) returns ({returns}){specs}\
-{ var ret := {method_name}({param_names}); return ret; }
+{ var {return_values} := {method_name}({param_names}); return {return_values}; }
 """
 
 
@@ -25,3 +25,13 @@ def __init__(self):  # type: ignore
             "// assert-line",
             "//",
         )
+
+    def _validators_from(
+        self, method_name: str, parameters: str, returns: str, specs: str
+    ) -> str:
+        result = super()._validators_from(method_name, parameters, returns, specs)
+        result = result.replace(
+            "{return_values}",
+            ", ".join(f"ret{i}" for i in range(len(returns.split(",")))),
+        )
+        return result
diff --git a/verified_cogen/runners/languages/language.py b/verified_cogen/runners/languages/language.py
index 3895f6f..627abd5 100644
--- a/verified_cogen/runners/languages/language.py
+++ b/verified_cogen/runners/languages/language.py
@@ -41,6 +41,28 @@ def __init__(  # type: ignore
         self.assert_invariant_patterns = assert_invariants_pattern
         self.inline_assert_comment = inline_assert_comment
 
+    def _validators_from(
+        self,
+        method_name: str,
+        parameters: str,
+        returns: str,
+        specs: str,
+    ) -> str:
+        return (
+            self.validator_template.replace("{method_name}", method_name)
+            .replace("{parameters}", parameters or "")
+            .replace("{returns}", returns or "")
+            .replace("{specs}", specs or "\n")
+            .replace(
+                "{param_names}",
+                ", ".join(
+                    param.split(":")[0].strip()
+                    for param in parameters.split(",")
+                    if param.strip()
+                ),
+            )
+        )
+
     def generate_validators(self, code: str) -> str:
         methods = list(self.method_regex.finditer(code))
 
@@ -55,18 +77,7 @@ def generate_validators(self, code: str) -> str:
             )
 
             validators.append(
-                self.validator_template.replace("{method_name}", method_name)
-                .replace("{parameters}", parameters or "")
-                .replace("{returns}", returns or "")
-                .replace("{specs}", specs or "\n")
-                .replace(
-                    "{param_names}",
-                    ", ".join(
-                        param.split(":")[0].strip()
-                        for param in parameters.split(",")
-                        if param.strip()
-                    ),
-                )
+                self._validators_from(method_name, parameters, returns, specs)
             )
 
         return "\n".join(validators)
diff --git a/verified_cogen/runners/step_by_step.py b/verified_cogen/runners/step_by_step.py
new file mode 100644
index 0000000..b1c508b
--- /dev/null
+++ b/verified_cogen/runners/step_by_step.py
@@ -0,0 +1,112 @@
+import pathlib
+from typing import Optional
+
+from verified_cogen.runners import Runner
+from verified_cogen.tools import extract_code_from_llm_output
+from verified_cogen.tools.modes import Mode
+from verified_cogen.runners.chain_of_thought.step import Substep, Step
+
+
+class StepByStepConfig:
+    remove_old_examples: bool = False
+    full_examples: bool = True
+
+    @classmethod
+    def default(cls) -> "StepByStepConfig":
+        return cls()
+
+
+class StepByStepRunner(Runner):
+    wrapped_runner: Runner
+    config: StepByStepConfig
+
+    def __init__(self, wrapping: Runner, config: Optional[StepByStepConfig] = None):
+        super().__init__(wrapping.llm, wrapping.logger, wrapping.verifier)
+        self.wrapped_runner = wrapping
+        self.config = StepByStepConfig.default() if config is None else config
+
+    def preprocess(self, prg: str, mode: Mode) -> str:
+        return self.wrapped_runner.preprocess(prg, mode)
+
+    def rewrite(self, prg: str) -> str:
+        return (
+            self.rewrite_full_examples(prg)
+            if self.config.full_examples
+            else self.rewrite_step_by_step(prg)
+        )
+
+    def rewrite_step_by_step(self, prg: str) -> str:
+        steps: list[Step] = []
+        for step in sorted((pathlib.Path(self.llm.prompt_dir) / "steps").iterdir()):
+            assert step.is_dir()
+            steps.append(Step(step))
+        for it, step in enumerate(steps):
+            for substep in step.examples:
+                self.llm.add_user_prompt(
+                    substep.question, self.config.remove_old_examples
+                )
+                self.llm.add_response(substep.answer, self.config.remove_old_examples)
+            self.llm.add_user_prompt(step.question.replace("{program}", prg))
+            _ = self.llm.make_request()
+            if self.config.remove_old_examples:
+                self.llm.wipe_temporary()
+            self.logger.info(f"Step {it + 1} done")
+
+        rewrite_step = Step(pathlib.Path(self.llm.prompt_dir) / "rewrite")
+        for substep in rewrite_step.examples:
+            self.llm.add_user_prompt(substep.question, self.config.remove_old_examples)
+            self.llm.add_response(substep.answer, self.config.remove_old_examples)
+        self.llm.add_user_prompt(rewrite_step.question.replace("{program}", prg))
+        response = self.llm.make_request()
+        if self.config.remove_old_examples:
+            self.llm.wipe_temporary()
+
+        return extract_code_from_llm_output(response)
+
+    def rewrite_full_examples(self, prg: str) -> str:
+        steps: list[Step] = []
+        for step in sorted((pathlib.Path(self.llm.prompt_dir) / "steps").iterdir()):
+            assert step.is_dir()
+            steps.append(Step(step))
+
+        assert all(len(step.examples) == len(steps[0].examples) for step in steps)
+
+        rewrite_step = Step(pathlib.Path(self.llm.prompt_dir) / "rewrite")
+        assert len(steps) == 0 or len(rewrite_step.examples) == len(steps[0].examples)
+
+        examples: list[list[Substep]] = list(
+            zip(*([step.examples for step in steps] + [rewrite_step.examples]))
+        )
+
+        for example in examples:
+            for substep in example:
+                self.llm.add_user_prompt(substep.question)
+                self.llm.add_response(substep.answer)
+
+        for it, step in enumerate(steps):
+            self.llm.add_user_prompt(step.question.replace("{program}", prg))
+            _ = self.llm.make_request()
+            self.logger.info(f"Step {it + 1} done")
+
+        self.llm.add_user_prompt(rewrite_step.question.replace("{program}", prg))
+        response = self.llm.make_request()
+
+        return extract_code_from_llm_output(response)
+
+    def postprocess(self, inv_prg: str) -> str:
+        return self.wrapped_runner.postprocess(inv_prg)
+
+    def produce(self, prg: str) -> str:
+        return self.wrapped_runner.produce(prg)
+
+    def insert(self, prg: str, checks: str, mode: Mode) -> str:
+        return self.wrapped_runner.insert(prg, checks, mode)
+
+    def ask_for_timeout(self) -> str:
+        return self.wrapped_runner.ask_for_timeout()
+
+    def ask_for_fixed(self, err: str) -> str:
+        return self.wrapped_runner.ask_for_fixed(err)
+
+    def precheck(self, prg: str, mode: Mode):
+        return self.wrapped_runner.precheck(prg, mode)
diff --git a/verified_cogen/runners/validating.py b/verified_cogen/runners/validating.py
index 28f6d02..6199490 100644
--- a/verified_cogen/runners/validating.py
+++ b/verified_cogen/runners/validating.py
@@ -8,7 +8,6 @@
 class ValidatingRunner(Runner):
     wrapped_runner: Runner
     language: Language
-    prg: Optional[str] = None
 
     def __init__(
         self,
@@ -29,29 +28,32 @@ def _add_validators(self, prg: str, inv_prg: str):
     def preprocess(self, prg: str, mode: Mode) -> str:
         return self.language.remove_asserts_and_invariants(prg)
 
+    def postprocess(self, inv_prg: str) -> str:
+        assert self.starting_prg is not None
+        return self._add_validators(
+            self.starting_prg, self.wrapped_runner.postprocess(inv_prg)
+        )
+
     def rewrite(self, prg: str) -> str:
-        self.prg = prg
-        return self._add_validators(prg, self.wrapped_runner.rewrite(prg))
+        return self.wrapped_runner.rewrite(prg)
 
     def produce(self, prg: str) -> str:
-        self.prg = prg
         return self.wrapped_runner.produce(prg)
 
     def insert(self, prg: str, checks: str, mode: Mode) -> str:
-        self.prg = prg
-        return self._add_validators(prg, self.wrapped_runner.insert(prg, checks, mode))
+        return self.wrapped_runner.insert(prg, checks, mode)
 
     def ask_for_timeout(self) -> str:
         assert (
-            self.prg is not None
+            self.starting_prg is not None
         ), "one of: rewrite, produce, insert must be called before ask_for_timeout"
-        return self._add_validators(self.prg, self.wrapped_runner.ask_for_timeout())
+        return self.wrapped_runner.ask_for_timeout()
 
     def ask_for_fixed(self, err: str) -> str:
         assert (
-            self.prg is not None
+            self.starting_prg is not None
         ), "one of: rewrite, produce, insert must be called before ask_for_fixed"
-        return self._add_validators(self.prg, self.wrapped_runner.ask_for_fixed(err))
+        return self.wrapped_runner.ask_for_fixed(err)
 
     def precheck(self, prg: str, mode: Mode):
         return self.wrapped_runner.precheck(prg, mode)
diff --git a/verified_cogen/tools/__init__.py b/verified_cogen/tools/__init__.py
index 8470633..259be67 100644
--- a/verified_cogen/tools/__init__.py
+++ b/verified_cogen/tools/__init__.py
@@ -1,5 +1,7 @@
+import logging
 import pathlib
 import re
+import sys
 from typing import Optional
 
 import appdirs  # type: ignore
@@ -57,3 +59,10 @@ def extract_code_from_llm_output(reply: str) -> str:
         reply = reply[:i]
         return reply.strip()
     return reply
+
+
+def register_output_handler(logger: logging.Logger):
+    handler = logging.StreamHandler(sys.stdout)
+    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)