diff --git a/test_framework/basic.py b/test_framework/basic.py
index b9d50251..12f71089 100644
--- a/test_framework/basic.py
+++ b/test_framework/basic.py
@@ -1,6 +1,7 @@
 """Basic tests for Parts I & II"""
 from __future__ import annotations
 
+import difflib
 import json
 import platform
 import subprocess
@@ -144,6 +145,49 @@ def replace_stem(path: Path, new_stem: str) -> Path:
     return path.with_name(new_stem).with_suffix(path.suffix)
 
 
+def build_error_message(
+    expected_retcode: int,
+    expected_stdout: str,
+    actual: subprocess.CompletedProcess[str],
+    exe_name: str,
+) -> str:
+    """Build the error message for when a compiled test program behaves incorrectly
+    Called when a unittest assert* message fails
+    Args:
+        expected_retcode: expected return code from EXPECTED_RESULTS
+        expected_stdout: expected stdout from EXPECTED_RESULTS (often empty)
+        actual: result from calling subprocess.run() on compiled test program
+        exe_name: full path to compiled test program
+    Returns:
+        an error message
+    """
+
+    msg_lines = [f"Incorrect behavior in {exe_name}"]
+
+    # report on incorrect return code
+    if expected_retcode != actual.returncode:
+        msg_lines += [
+            f"* Bad return code: expected {expected_retcode} and got {actual.returncode}"
+        ]
+
+    # report on incorrect stdout
+    if actual.stdout != expected_stdout:
+        msg_lines.append(
+            f"* Bad stdout: expected {repr(expected_stdout)} and got {repr(actual.stdout)}"
+        )
+        diff = list(
+            difflib.ndiff(expected_stdout.splitlines(), actual.stdout.splitlines())
+        )
+        msg_lines.extend(diff)
+
+    # report on incorrect stderr (note: we always expect stderr to be empty)
+    if actual.stderr:
+        msg_lines.append("* Expected no output to stderr, but found:\n")
+        msg_lines.extend(actual.stderr)
+
+    return "\n".join(msg_lines)
+
+
 class TestChapter(unittest.TestCase):
     """Base per-chapter test class - should be subclassed, not instantiated directly.
 
@@ -269,17 +313,18 @@ def validate_runs(
         self.assertEqual(
             expected_retcode,
             actual.returncode,
-            msg=f"Expected return code {expected_retcode}, found {actual.returncode} in {exe}",
+            msg=build_error_message(expected_retcode, expected_stdout, actual, exe),
         )
         self.assertEqual(
             expected_stdout,
             actual.stdout,
-            msg=f"Expected output {expected_stdout}, found {actual.stdout} in {exe}",
+            msg=build_error_message(expected_retcode, expected_stdout, actual, exe),
         )
 
         # none of our test programs write to stderr
         self.assertFalse(
-            actual.stderr, msg=f"Unexpected error output {actual.stderr} in {exe}"
+            actual.stderr,
+            msg=build_error_message(expected_retcode, expected_stdout, actual, exe),
         )
 
     def compile_failure(self, source_file: Path) -> None:
@@ -437,7 +482,7 @@ class TestDirs:
     INVALID_LABELS = "invalid_labels"
     INVALID_STRUCT_TAGS = "invalid_struct_tags"
     # valid test programs for parts I & II
-    # (we'll handle part III test sdifferently)
+    # (we'll handle part III tests differently)
     VALID = "valid"
 
 
diff --git a/test_framework/regalloc.py b/test_framework/regalloc.py
index 7cac2174..623c4ecb 100644
--- a/test_framework/regalloc.py
+++ b/test_framework/regalloc.py
@@ -183,8 +183,8 @@ def spill_test(
             len(spill_instructions),
             max_spilled_instructions,
             msg=common.build_msg(
-                f"Should only need {max_spilled_instructions} instructions \
-                    involving spilled pseudo but found {len(spill_instructions)}",
+                f"Should only need {max_spilled_instructions} instructions "
+                "involving spilled pseudo but found {len(spill_instructions)}",
                 bad_instructions=spill_instructions,
                 full_prog=parsed_asm,
                 program_path=program_path,
@@ -203,8 +203,8 @@ def spill_test(
             len(spilled_operands),
             max_spilled_pseudos,
             msg=common.build_msg(
-                f"At most {max_spilled_pseudos} pseudoregs should have been spilled, \
-                    looks like {len(spilled_operands)} were",
+                f"At most {max_spilled_pseudos} pseudoregs should have been spilled, "
+                "looks like {len(spilled_operands)} were",
                 bad_instructions=spill_instructions,
                 full_prog=parsed_asm,
                 program_path=program_path,
diff --git a/test_framework/tacky/common.py b/test_framework/tacky/common.py
index 2fc5d272..bdb15b02 100644
--- a/test_framework/tacky/common.py
+++ b/test_framework/tacky/common.py
@@ -159,6 +159,7 @@ def build_msg(
     bad_instructions: Optional[Sequence[asm.AsmItem]] = None,
     full_prog: Optional[asm.AssemblyFunction] = None,
     program_path: Optional[Path] = None,
+    max_prog_disp_length: int = 20,
 ) -> str:
     """Utility function for validators to report invalid assembly code"""
     msg_lines = [msg]
@@ -167,7 +168,10 @@ def build_msg(
         msg_lines.append("Bad instructions:")
         msg_lines.extend(printed_instructions)
     if full_prog:
-        msg_lines.extend(["Complete program:", str(full_prog)])
+        if len(full_prog.instructions) > max_prog_disp_length:
+            msg_lines.append("Complete assembly function: <too long, not shown>")
+        else:
+            msg_lines.extend(["Complete assembly function:", str(full_prog)])
     if program_path:
         msg_lines.append(f"Program: {program_path}")
     return "\n".join(msg_lines)
diff --git a/test_framework/tacky/copy_prop.py b/test_framework/tacky/copy_prop.py
index 6143a161..a8a5d6bd 100644
--- a/test_framework/tacky/copy_prop.py
+++ b/test_framework/tacky/copy_prop.py
@@ -275,8 +275,8 @@ def same_arg_test(self, program: Path) -> None:
         )
         self.assertTrue(
             same_value,
-            msg=f"Bad arguments {actual_args[0]} and {actual_args[1]} to callee: \
-                both args should have same value",
+            msg=f"Bad arguments {actual_args[0]} and {actual_args[1]} to callee: "
+            "both args should have same value",
         )
 
     def redundant_copies_test(self, program: Path) -> None:
diff --git a/test_framework/test_tests/test_toplevel.py b/test_framework/test_tests/test_toplevel.py
index 09849833..e28f8ee5 100644
--- a/test_framework/test_tests/test_toplevel.py
+++ b/test_framework/test_tests/test_toplevel.py
@@ -7,6 +7,7 @@
 
 from __future__ import annotations
 
+import os
 import re
 import shutil
 import subprocess
@@ -24,9 +25,14 @@
     TEST_DIR,
     excluded_extra_credit,
     ExtraCredit,
+    build_test_class,
 )
 from ..tacky.dead_store_elim import STORE_ELIMINATED
 
+NQCC = os.getenv("NQCC")
+assert NQCC  # make sure this environment variable is actually set
+NQCC_PATH: Path = Path(NQCC)
+
 TEST_PATTERN = re.compile("^Ran ([0-9]+) tests", flags=re.MULTILINE)
 FAILURE_PATTERN = re.compile("failures=([0-9]+)")
 
@@ -212,7 +218,7 @@ def test_expected_error_code(self) -> None:
         # NQCC throws error code 125 in all cases
         # This should succeed b/c it specifies the error code we'll actually throw
         try:
-            testrun = run_test_script(
+            run_test_script(
                 "./test_compiler $NQCC --chapter 1 --expected-error-codes 125"
             )
         except subprocess.CalledProcessError as err:
@@ -220,7 +226,7 @@ def test_expected_error_code(self) -> None:
 
         # Specify multiple error codes including the one we'll actually throw; this should also succeed
         try:
-            testrun = run_test_script(
+            run_test_script(
                 "./test_compiler $NQCC --chapter 1 --expected-error-codes 125 127"
             )
         except subprocess.CalledProcessError as err:
@@ -243,6 +249,8 @@ class BadSourceTest(unittest.TestCase):
     ret2 = TEST_DIR / "chapter_1/valid/return_2.c"
     ret0 = TEST_DIR / "chapter_1/valid/return_0.c"
     hello_world = TEST_DIR / "chapter_9/valid/arguments_in_registers/hello_world.c"
+    # like hello_world, this writes to stdout
+    static_recursive_call = TEST_DIR / "chapter_10/valid/static_recursive_call.c"
     dse_relative = Path("chapter_19/dead_store_elimination/int_only/fig_19_11.c")
     dse = TEST_DIR / dse_relative
 
@@ -254,9 +262,14 @@ def setUpClass(cls) -> None:
         # save these to a temporary directory before overwriting them
         cls.tmpdir = tempfile.TemporaryDirectory()
         shutil.copy(cls.hello_world, cls.tmpdir.name)
+        shutil.copy(cls.static_recursive_call, cls.tmpdir.name)
         shutil.copy(cls.ret0, cls.tmpdir.name)
         shutil.copy(cls.dse, cls.tmpdir.name)
 
+        # overwrite static_recursive_call with another file that has
+        # a different retcode and different stdout
+        shutil.copy(cls.ret2, cls.static_recursive_call)
+
         # overwrite hello_world with another file that has same retcode but different stdout
         shutil.copy(cls.ret0, cls.hello_world)
 
@@ -292,11 +305,13 @@ def tearDownClass(cls) -> None:
         tmp_path = Path(cls.tmpdir.name)
         tmp_ret0 = tmp_path / cls.ret0.name
         tmp_helloworld = tmp_path / cls.hello_world.name
+        tmp_static_recursive_call = tmp_path / cls.static_recursive_call.name
         tmp_dse = tmp_path / cls.dse.name
 
         shutil.copy(tmp_ret0, cls.ret0)
         shutil.copy(tmp_helloworld, cls.hello_world)
         shutil.copy(tmp_dse, cls.dse)
+        shutil.copy(tmp_static_recursive_call, cls.static_recursive_call)
         cls.tmpdir.cleanup()
 
         # remove intermediate files produced by --keep-asm-on-failure
@@ -304,6 +319,28 @@ def tearDownClass(cls) -> None:
             if f.name not in ASSEMBLY_LIBS:
                 f.unlink(missing_ok=True)
 
+    @classmethod
+    def run_chapter_tests(cls, chapter: int) -> unittest.TestResult:
+        """Utility function to run test suite for a given chapter and return TestResult.
+        Use this instead of run_test_script so we can inspect individual test results.
+        """
+        test_class = build_test_class(
+            NQCC_PATH,
+            chapter,
+            options=[],
+            stage="run",
+            extra_credit_flags=ExtraCredit.NONE,
+            skip_invalid=True,
+            error_codes=[],
+        )
+        test_suite = unittest.defaultTestLoader.loadTestsFromTestCase(test_class)
+
+        # test failure is expected and shouldn't be displayed to stdout,
+        # so direct test output to /dev/null
+        with open(os.devnull, "w") as devnull:
+            result = unittest.TextTestRunner(stream=devnull).run(test_suite)
+            return result
+
     def assert_no_intermediate_files(self, chapter: int) -> None:
         # Executables, *.i files, etc should have been cleaned up
         intermediate_files = [
@@ -378,7 +415,7 @@ def test_intermediate(self) -> None:
 
     def test_keep_asm(self) -> None:
         """Use --keep-asm-on-failure option to generate assembly for failures"""
-        with self.assertRaises(subprocess.CalledProcessError) as cpe:
+        with self.assertRaises(subprocess.CalledProcessError):
             run_test_script("./test_compiler $NQCC --chapter 1")
         # make sure we preserved .s file for ret0, which should fail
         expected_asm = self.__class__.ret0.with_suffix(".s")
@@ -389,7 +426,7 @@ def test_keep_asm(self) -> None:
 
     def test_keep_asm_optimize(self) -> None:
         """Make sure --keep-asm-on-failure works for chapter 19 tests"""
-        with self.assertRaises(subprocess.CalledProcessError) as cpe:
+        with self.assertRaises(subprocess.CalledProcessError):
             run_test_script("./test_compiler $NQCC --chapter 19")
         # make sure we preserved .s file for ret0, which should fail
         expected_asm = self.__class__.dse.with_suffix(".s")
@@ -397,3 +434,52 @@ def test_keep_asm_optimize(self) -> None:
             expected_asm.exists,
             msg=f"{expected_asm} should be preserved on failure but wasn't found",
         )
+
+    def test_bad_retcode_output(self) -> None:
+        """If return code is incorrect, only report that, not stdout or stderr."""
+
+        chapter1_results = self.run_chapter_tests(chapter=1)
+
+        # Expected failure in return_0.c, which we replaced with return_2.c
+        self.assertEqual(len(chapter1_results.failures), 1)
+        # chapter1_results.failures is a list of (TestCase instance, error message) tuples
+        err_output: str = chapter1_results.failures[0][1]
+        expected_err = (
+            "/tests/chapter_1/valid/return_0"
+            "\n* Bad return code: expected 0 and got 2\n"
+        )
+        err_output_end = err_output[-len(expected_err) :]
+        self.assertEqual(err_output_end, expected_err)
+
+    def test_bad_stdout_output(self) -> None:
+        """If only stdout is incorrect, report that but not return code."""
+        chapter9_results = self.run_chapter_tests(chapter=9)
+
+        # Expected failure in hello_world.c, which we replaced with return_0.c
+        self.assertEqual(len(chapter9_results.failures), 1)
+        # chapter9_results.failures is a list of (TestCase instance, error message) tuples
+        err_output: str = chapter9_results.failures[0][1]
+        expected_err = (
+            "/tests/chapter_9/valid/arguments_in_registers/hello_world"
+            "\n* Bad stdout: expected 'Hello, World!\\n' and got ''"
+            "\n- Hello, World!\n"
+        )
+        err_output_end = err_output[-len(expected_err) :]
+        self.assertEqual(err_output_end, expected_err)
+
+    def test_bad_stdout_and_retcode(self) -> None:
+        """If stdout and return code are both incorrect, report both of them."""
+        chapter10_results = self.run_chapter_tests(chapter=10)
+
+        # Expected failure in static_recursive_call.c, which we replaced with return_2.c
+        self.assertEqual(len(chapter10_results.failures), 1)
+        # chapter10_results.failures is a list of (TestCase instance, error message) tuples
+        err_output: str = chapter10_results.failures[0][1]
+        expected_err = (
+            "/tests/chapter_10/valid/static_recursive_call"
+            "\n* Bad return code: expected 0 and got 2"
+            "\n* Bad stdout: expected 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' and got ''"
+            "\n- ABCDEFGHIJKLMNOPQRSTUVWXYZ\n"
+        )
+        err_output_end = err_output[-len(expected_err) :]
+        self.assertEqual(err_output_end, expected_err)