Implement Feat/expectations #307

Draft: wants to merge 27 commits into master

Commits (27)
3e5b21d
Partial implementation of expectations proposal 0.4.0
thorehusfeldt Sep 8, 2023
d7165c6
Minimum Viable Implementation
thorehusfeldt Sep 10, 2023
5f7a2bd
Correct semantics of required (some, not all, must exist)
thorehusfeldt Sep 10, 2023
a357011
Change semantics of "accepted" (only "AC" allowed, but nothing required)
thorehusfeldt Sep 10, 2023
ea889a8
is_allowed_verdict: change argument order, give default path
thorehusfeldt Sep 10, 2023
b8bc3dd
bt run: check for required testcase results
thorehusfeldt Sep 10, 2023
c10108e
Add problem/spanishinquisition for testing expectations
thorehusfeldt Sep 10, 2023
c7bbef2
introduce is_allowed_verdict_for_testcase
thorehusfeldt Sep 11, 2023
0d84659
Use bar for logging errors
thorehusfeldt Sep 11, 2023
509461d
spanishinquisition: more example submissions
thorehusfeldt Sep 11, 2023
d4eb2de
Allow regex matching of patterns
thorehusfeldt Sep 11, 2023
9989ebc
Regex match examples in spanishinquisition
thorehusfeldt Sep 11, 2023
2e5861c
Notation change: allowed -> permitted
thorehusfeldt Sep 11, 2023
1dc0e49
Draft documentation
thorehusfeldt Sep 12, 2023
86ebef2
fix markdown syntax for list
thorehusfeldt Sep 12, 2023
1cccbd8
Rename some files in spanishinquisition
thorehusfeldt Sep 12, 2023
3b85390
Verbose verdicts; also hack the progressbar
thorehusfeldt Sep 12, 2023
372e4c1
bt run: more informative logging of violated expectations
thorehusfeldt Sep 12, 2023
4edfd33
refactor, rename some methods in expectations
thorehusfeldt Sep 12, 2023
6c3d945
mixed submission to spanishinquisition
thorehusfeldt Sep 12, 2023
8b3ddae
Refactor Registry again (again)
thorehusfeldt Sep 14, 2023
b68afec
Refactor into BaseExpecations
thorehusfeldt Sep 15, 2023
c783540
short_verdicts
thorehusfeldt Sep 15, 2023
aa6ddd7
bt run uses refactored expectations to detect missing requirements
thorehusfeldt Sep 15, 2023
f6f2bcc
Smoother methods for Registry
thorehusfeldt Sep 16, 2023
db01d9d
Fix doctest
thorehusfeldt Sep 16, 2023
b4d4df9
Document Registry.__setitem__(); better __repr__
thorehusfeldt Sep 17, 2023
409 changes: 409 additions & 0 deletions bin/expectations.py

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions bin/problem.py
@@ -5,8 +5,10 @@
import sys

from pathlib import Path
from functools import lru_cache

import config
import expectations
import parallel
import program
import run
@@ -37,6 +39,7 @@ def __init__(self, path, tmpdir, label=None):
self._program_callbacks = dict()
# Dictionary from path to parsed file contents.
self._testdata_yamls = dict()
self._expectations_registry = None

# The label for the problem: A, B, A1, A2, X, ...
self.label = label
@@ -438,6 +441,23 @@ def build_program(p):

problem._validators[key] = validators
return validators

# TODO Q from Thore: ok to use self here instead of problem?
def get_expectations_registry(self):
Comment on lines +445 to +446
Collaborator:

It's probably best to stay consistent with existing code, so I would use problem here instead of self, unless we rename the first parameter of all class methods from problem to self. I don't mind either way, as long as it's consistent, so this decision is better for Ragnar to make 🙂

Reply:

I'm not really a fan of self somehow. I think it's because, with the big functions everywhere, it's not always clear which class you're currently reading code in, so naming things after what they are seems clearer generally. (Especially given that self doesn't really mean anything special inside the member functions anyway.)

So yes, I'd prefer problem.

""" Parse yaml file (if any) describing the expectations for this problem.
"""
if self._expectations_registry is None:
path = self.path / 'submissions' / 'expectations.yaml'
if has_ryaml:
try:
yamldata = read_yaml_settings(path)
except ruamel.yaml.scanner.ScannerError:
fatal('Make sure expectations.yaml does not contain any more {% ... %}.')
else:
yamldata = read_yaml_settings(path)
self._expectations_registry = expectations.Registry.from_dict(yamldata)
return self._expectations_registry


def run_submissions(problem):
needans = False if problem.interactive else True
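
An illustrative aside (not part of the diff): the registry returned by get_expectations_registry() is consumed in bin/run.py further down roughly as follows. The submission path and test case name are made up for the example; the registry itself is implemented in bin/expectations.py, whose diff is not rendered above.

# Hedged usage sketch; method names are the ones visible in the bin/run.py diff below.
registry = problem.get_expectations_registry()               # parsed once, then cached
expectations = registry.for_path('wrong_answer/greedy.py')   # expectations for one submission
if not expectations.is_permitted('WA', 'secret/huge-case'):
    print('WA is not a permitted test case verdict for this submission')
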
36 changes: 28 additions & 8 deletions bin/run.py
@@ -328,7 +328,7 @@ def __init__(self, problem, path, skip_double_build_warning=False):

# The first element will match the directory the file is in, if possible.
self.expected_verdicts = self._get_expected_verdicts()

self.expectations = self.problem.get_expectations_registry().for_path(self.short_path)
# NOTE: Judging of interactive problems on systems without `os.wait4` is
# suboptimal because we cannot determine which of the submission and
# interactor exits first. Thus, we don't distinguish the different non-AC
@@ -400,9 +400,10 @@ def _get_expected_verdicts(self):
verdicts = [subdir]
else:
if len(verdicts) == 0:
error(
f'Submission {self.short_path} must have @EXPECTED_RESULTS@. Defaulting to ACCEPTED.'
)
pass # TODO (Thore): made this shut up!
#error(
# f'Submission {self.short_path} must have @EXPECTED_RESULTS@. Defaulting to ACCEPTED.'
#)
Collaborator:

This error is probably still necessary. DOMjudge does not support an expectations.yaml, and reads the @EXPECTED_RESULTS@ tag to know which final verdicts are allowed. So if the submission is not in one of the four default directories, and it does not have such a tag, DOMjudge will default to ACCEPTED, which is not intended in most cases 😅

I think that the @EXPECTED_RESULTS@ tag converted to expectations.yaml syntax would look something like:

rejected/broken.py:  # contains `@EXPECTED_RESULTS@: TIME_LIMIT_EXCEEDED, WRONG_ANSWER`
  permitted: [AC, TLE, WA]  # only these verdicts may appear
  required: [TLE, WA]       # at least one of these verdicts must appear

It would be nice if an existing @EXPECTED_RESULTS@ tag could be checked for consistency with expectations.yaml (a rough sketch of such a check follows below). Perhaps we could even infer the tag from the YAML on export, but that may result in too much seemingly magic behaviour?
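
A hedged sketch of such a consistency check (illustrative only; the helper name and the exact shape of the permitted/required sets are assumptions, not code from this PR):

LONG_TO_SHORT = {'ACCEPTED': 'AC', 'WRONG_ANSWER': 'WA',
                 'RUN_TIME_ERROR': 'RTE', 'TIME_LIMIT_EXCEEDED': 'TLE'}

def tag_matches_expectations(tag_verdicts, permitted, required=frozenset()):
    """tag_verdicts: long-form verdicts listed in @EXPECTED_RESULTS@;
    permitted/required: sets of short verdicts declared in expectations.yaml."""
    tag = {LONG_TO_SHORT[v.strip()] for v in tag_verdicts}
    # The tag always implicitly allows AC test cases.
    if permitted != tag | {'AC'}:
        return False
    # For a tag without AC, the required verdicts should all come from the tag.
    if 'AC' not in tag and not required <= tag:
        return False
    return True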

Collaborator Author (@thorehusfeldt, Sep 16, 2023):

All correct, and all (in my mind) part of the tool more than of the expectations framework.

Note that the framework does allow file-based expectations; in principle a submission itself can supply “its own” expectations, which then compose with expectations in expectations.yaml in well-defined ways.

I’ve silenced @EXPECTED_RESULTS@ in my branch simply because handling it involves too many decisions that I don’t feel strongly about. (A tiny method that translates @EXPECTED_RESULTS@ into a specific expectations.BaseExpectations would be the quickest way to be compatible. Very easy to do.)

Collaborator Author (@thorehusfeldt, Sep 16, 2023):

By the way, if

permitted: [AC, TLE, WA] 
required: [TLE, WA]  

is sufficiently popular, the cleanest way is to elevate it to a common abbreviation next to accepted, does not terminate, etc. In principle, there are 16 different possible permitted subsets (two of which make no sense), each of which can have a number of required subsets. I just took the standard set and added a few that seemed useful to me (such as not accepted).

If somebody has a large repository of @EXPECTED_RESULTS@-tagged files, we could run statistics on them! Please do so.

Collaborator:

Running the following command on my folder that contains all Git repos that I've worked with in the past years (FPC from 2018, BAPC+preliminaries from 2020, NWERC from 2022):

$ grep -r @EXPECTED_RESULTS@ 2> /dev/null | grep 20 | cut -d ':' -f 3 | sed 's/ //g' | sort | uniq -c
  • The grep 20 is to only select repositories that have a year number (2018–2023), because I want to exclude things like BAPCtools and domjudge that use this tag many times in their tests.
  • The sed command removes the optional space after the comma that separates the verdicts.
  • The verdicts may still have different orders (e.g. both ACCEPTED,WRONG_ANSWER and WRONG_ANSWER,ACCEPTED occur), but I've manually aggregated those in the results below.
      3 ACCEPTED,RUN_TIME_ERROR
      2 ACCEPTED,RUN_TIME_ERROR,TIME_LIMIT_EXCEEDED
      2 ACCEPTED,RUN_TIME_ERROR,WRONG_ANSWER
     47 ACCEPTED,TIME_LIMIT_EXCEEDED
     16 ACCEPTED,WRONG_ANSWER
     10 RUN_TIME_ERROR,TIME_LIMIT_EXCEEDED
      3 RUN_TIME_ERROR,TIME_LIMIT_EXCEEDED,WRONG_ANSWER
     22 RUN_TIME_ERROR,WRONG_ANSWER
     49 TIME_LIMIT_EXCEEDED,WRONG_ANSWER

After running this command, I realize that my earlier example only works if the tag only has non-AC verdicts. For example, a combined AC,TLE verdict may better be written without required:

mixed/sometimes-too-slow.py:  # contains `@EXPECTED_RESULTS@: ACCEPTED, TIME_LIMIT_EXCEEDED`
  permitted: [AC, TLE]  # only these verdicts may appear

Adding the required field is probably not even strictly necessary in any of these cases, but for the cases that do not include AC, it's probably helpful if the tool warns about a submission that we expect to give some non-AC verdict, but is in fact AC.


And as a final remark, I'm finding it difficult to switch my mental model from "the verdict that the submission receives" to "the set of verdicts that the test cases receive". The fact that DOMjudge (and @EXPECTED_RESULTS@) use the former and expectations.yaml uses the latter, probably doesn't help either. 😂

Collaborator Author (@thorehusfeldt, Sep 17, 2023):

Thank you for this. Also for this formulation:

I'm finding it difficult to switch my mental model from "the verdict that the submission receives" to "the set of verdicts that the test cases receive".

That is exactly why I’m doing this; and I’m trying to be terminologically stringent about it. To repeat this in my own words,

  1. The separation of these two concepts is crucial for even defining the semantics of, e.g., time_limit_exceeded, in accordance with the specification.
  2. Another separation is between the behaviour of the construction and validation tool (which is used by authors) and the electronic judge (which is used by contest organisers, teachers, and solvers). The former (say, BAPCtools or problemtools) needs a way of saying “no testcase for the brute-force solution can ever be WA, but TLE and RTE are acceptable, and the final verdict shan’t be AC”, and construction tools should support this to ensure problem quality. On the other hand, the judging environment (Kattis, DOMjudge, etc.) needs well-defined semantics for what the final verdict is (or maybe the subtask scores of IOI), and for when it is OK to abort prematurely to reduce server load. For instance, a judge can report the lexicographically first (by testcase path) non-AC verdict.

By the way, I’m happy to add final to the expectations specification, and tried many ideas. Here’s how that could look:

final: TLE
permitted: [AC, TLE, WA]
required: [TLE]

This is then either a requirement about the lexicographic ordering of testcases, ensuring that the TLE verdict is encountered before the WA verdict (what the spec calls first_error), or that the judge has an ordering of rejection-verdicts that makes TLE worse than WA, continues grading after encountering WA, and then reports the worst error.

But I don’t really see a need for final, nor are its semantics clear to me. This is the kind of feedback I’m soliciting here. (In my current BAPCtools integration, these issues would need to be resolved to understand whether the final verdict should be red or green.) See, for instance, submission bad-tle-wa.py in the large screenshot above, which fails (because it gets a WA), yet gets a green TLE (which, after all, is a permitted testcase verdict). I have no opinion on what the semantics or visualisation should be.

Collaborator:

Thanks for the elaboration! I don't think that a final expectation is needed, indeed 🙂

Collaborator Author (@thorehusfeldt, Sep 17, 2023):

I’ve updated (b4d4df9) the pydoc documentation of expectations.yaml to spell out the workflow that avoids expectation.yaml altogether and specifies expectations submission-by-submission by writing

registry['wrong_answer/greedy.py'] = Expectations({'sample': 'accepted'})

This would be my suggestion for translating submissions that contain @EXPECTED_RESULTS@. But I simply do not understand the semantics of the latter, because I can’t wrap my head around

Use this to annotate the possible outcomes that a submission can have, that will be accepted by the Judging verifier.

in the DOMjudge appendix.

As far as I can tell, the final verdict (“outcome”) in DOMjudge depends on results_prio, which is not part of the problem package, and in the default setting seems to be what the problem package specification calls first_error. Thus, the outcome depends on the lexicographic ordering of the testcases. I don’t know if this granularity is intended or used, nor do I know how to translate it.

closed PR @ problem_package_format

Collaborator (@mpsijm, Sep 18, 2023):

For DOMjudge, the final verdict not only depends on results_prio, but also on whether lazy judging is enabled: https://www.domjudge.org/docs/manual/8.2/judging.html#lazy-judging-and-results-priority

When [lazy judging is] enabled, judging of a submission will stop when a highest priority result has been found for any testcase. You can find these priorities under the results_prio setting. In the default configuration, when enabling this, judging will stop with said verdict when a testcase results in e.g. run-error, timelimit or wrong-answer. When a testcase is correct (lower priority), judging will continue to the next test case. In other words, to arrive at a verdict of correct, all testcases will have been evaluated, while any of the ‘error’ verdicts will immediately return this answer for the submission and the other testcases will never be tested [...].

[...]

When not using lazy judging, all testcases will always be ran for each submission. The results_prio list will then determine which of the individual testcase results will be the overall submission result: the highest priority one. In case of a tie, the first occurring testcase result with highest priority is returned.

I think we can assume the default DOMjudge settings for BAPCtools. At least, we have never changed this setting in DOMjudge in the past years 🙂 Note that lazy judging, especially when running multiple runners in parallel, may not necessarily return the result of the first non-accepted test case. In the cases where we use @EXPECTED_RESULTS@, I think we usually don't care about the ordering, as long as the submission never receives any other verdict than a verdict from this list (i.e. we don't use first_error currently, but I also don't think we knew that it existed). Thus, at least for now, I don't think we have to translate first_error to expectations.

I would translate @EXPECTED_RESULTS@: <verdict_list> as follows (and it would be nice if BAPCtools did this automatically when an expectations.yaml does not exist); a sketch of this translation follows after the list:

  • if AC in <verdict_list>:
    permitted: <verdict_list>
  • if AC not in <verdict_list>:
    permitted: [AC, *<verdict_list>]  # some test cases may pass, so also include `AC`
    required: <verdict_list>  # at least one of the specified verdicts must appear for at least one of the test cases
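
A sketch of this translation rule (the function name and returned dict layout are illustrative; no such helper exists in this PR yet):

def expectations_from_tag(verdict_list):
    """Translate a long-form @EXPECTED_RESULTS@ verdict list into the
    permitted/required mapping proposed above."""
    short = {'ACCEPTED': 'AC', 'WRONG_ANSWER': 'WA',
             'RUN_TIME_ERROR': 'RTE', 'TIME_LIMIT_EXCEEDED': 'TLE'}
    tag = [short[v.strip()] for v in verdict_list]
    if 'AC' in tag:
        return {'permitted': tag}
    # Some test cases may still pass, so AC is always permitted;
    # at least one of the tagged verdicts must actually appear.
    return {'permitted': ['AC'] + tag, 'required': tag}

# e.g. expectations_from_tag(['TIME_LIMIT_EXCEEDED', 'WRONG_ANSWER'])
#      -> {'permitted': ['AC', 'TLE', 'WA'], 'required': ['TLE', 'WA']}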


if len(verdicts) == 0:
verdicts = ['ACCEPTED']
@@ -452,9 +453,10 @@ def run_all_testcases(

verdict = (-100, 'ACCEPTED', 'ACCEPTED', 0) # priority, verdict, print_verdict, duration
verdict_run = None
verdict_for_testcase = dict()

def process_run(run, p):
nonlocal max_duration, verdict, verdict_run
nonlocal max_duration, verdict, verdict_run, verdict_for_testcase

localbar = bar.start(run)
result = run.run()
@@ -476,7 +478,10 @@ def process_run(run, p):
if table_dict is not None:
table_dict[run.name] = result.verdict == 'ACCEPTED'

got_expected = result.verdict in ['ACCEPTED'] + self.expected_verdicts
verdict_short = short_verdict(result.verdict)
verdict_for_testcase[run.name] = verdict_short
#got_expected = result.verdict in ['ACCEPTED'] + self.expected_verdicts
got_expected = self.expectations.is_permitted(verdict_short, run.name)

# Print stderr whenever something is printed
if result.out and result.err:
@@ -514,8 +519,17 @@ def process_run(run, p):
data += '\n'
data += f'{f.name}:' + localbar._format_data(t) + '\n'

if not got_expected:
localbar.error(f'{result.duration:6.3f}s {result.print_verdict()}', data)
short = short_verdict(result.verdict)
for prefix, pattern, verdicts in self.expectations.violated_permissions(short, run.name):
prefix = (f'{Fore.CYAN}{prefix:>{len(localbar.prefix)}}{Style.RESET_ALL}:' +
f'{pattern:<{localbar.item_width}}')
localbar.warn(f"permits {verbose_verdicts(verdicts)}", prefix=prefix)

localbar.done(got_expected, f'{result.duration:6.3f}s {result.print_verdict()}', data)


# Lazy judging: stop on the first error when not in verbose mode.
if (
not config.args.verbose and not config.args.table
@@ -534,16 +548,22 @@ def process_run(run, p):
self.print_verdict = verdict[2]
self.duration = max_duration

# Check presence of required verdicts among testgroups
for prefix, pattern, verdicts in self.expectations.unsatisfied_requirements(verdict_for_testcase):
prefix = (f'{Fore.CYAN}{prefix:>{len(bar.prefix)}}{Style.RESET_ALL}: ' +
f'{pattern:<{bar.item_width}}')
bar.warn(f"no test case got {verbose_verdicts(verdicts)}", prefix=prefix)

# Use a bold summary line if things were printed before.
if bar.logged:
color = (
Style.BRIGHT + Fore.GREEN
if self.verdict in self.expected_verdicts
if self.expectations.is_permitted(short_verdict(self.verdict), Path())
else Style.BRIGHT + Fore.RED
)
boldcolor = Style.BRIGHT
else:
color = Fore.GREEN if self.verdict in self.expected_verdicts else Fore.RED
color = Fore.GREEN if self.expectations.is_permitted(short_verdict(self.verdict), Path()) else Fore.RED
boldcolor = ''

printed_newline = bar.finalize(
39 changes: 35 additions & 4 deletions bin/util.py
@@ -186,6 +186,7 @@ def clearline(self):
return
print(self.carriage_return, end='', flush=True, file=sys.stderr)

@staticmethod
def action(prefix, item, width=None, total_width=None):
if width is not None and total_width is not None and len(prefix) + 2 + width > total_width:
width = total_width - len(prefix) - 2
@@ -273,7 +274,7 @@ def _format_data(data):

# Log can be called multiple times to make multiple persistent lines.
# Make sure that the message does not end in a newline.
def log(self, message='', data='', color=Fore.GREEN, *, resume=True):
def log(self, message='', data='', color=Fore.GREEN, *, resume=True, prefix=None):
with self.lock:
if message is None:
message = ''
@@ -292,7 +293,7 @@ def log(self, message='', data='', color=Fore.GREEN, *, resume=True):
self.needs_leading_newline = False

print(
self.get_prefix(),
self.get_prefix() if prefix is None else prefix,
color,
message,
ProgressBar._format_data(data),
@@ -313,9 +314,9 @@ def debug(self, message, data=''):
if config.args.verbose:
self.log(message, data)

def warn(self, message='', data=''):
def warn(self, message='', data='', prefix=None):
config.n_warn += 1
self.log(message, data, Fore.YELLOW)
self.log(message, data, Fore.YELLOW, prefix=prefix)

# Error removes the current item from the in_progress set.
def error(self, message='', data=''):
@@ -981,3 +982,33 @@ def combine_hashes_dict(d):
if d[key] is not None:
hasher.update(d[key].encode())
return hasher.hexdigest()


def short_verdict(verdict):
return {
'ACCEPTED': 'AC',
'WRONG_ANSWER': 'WA',
'RUN_TIME_ERROR': 'RTE',
'TLE (aborted)': 'TLE',
'TIME_LIMIT_EXCEEDED': 'TLE'
}[verdict]

def verbose_verdicts(verdicts: str | set[str], oxford_comma=True) -> str:
long_form = {
'AC': 'ACCEPTED',
'WA': 'WRONG_ANSWER',
'RTE': 'RUN_TIME_ERROR',
'TLE': 'TLE (aborted)'
}
if isinstance(verdicts, str):
verdicts = {verdicts}
verdicts = list(sorted(verdicts))
if len(verdicts) == 1:
return long_form[verdicts[0]]
elif len(verdicts) == 2:
return f"{long_form[verdicts[0]]} or {long_form[verdicts[1]]}"
else:
assert len(verdicts) == 3, len(verdicts)
return f"{long_form[verdicts[0]]}, {long_form[verdicts[1]]}, or {long_form[verdicts[2]]}"

