Implement Feat/expectations #307

Draft: wants to merge 27 commits into master

Commits (27)
3e5b21d
Partial implementation of expectations proposal 0.4.0
thorehusfeldt Sep 8, 2023
d7165c6
Minimum Viable Implementation
thorehusfeldt Sep 10, 2023
5f7a2bd
Correct semantics of required (some, not all, must exist)
thorehusfeldt Sep 10, 2023
a357011
Change semantics of "accepted" (only "AC" allowed, but nothing required)
thorehusfeldt Sep 10, 2023
ea889a8
is_allowed_verdict: change argument order, give default path
thorehusfeldt Sep 10, 2023
b8bc3dd
bt run: check for required testcase results
thorehusfeldt Sep 10, 2023
c10108e
Add problem/spanishinquisition for testing expectations
thorehusfeldt Sep 10, 2023
c7bbef2
introduce is_allowed_verdict_for_testcase
thorehusfeldt Sep 11, 2023
0d84659
Use bar for logging errors
thorehusfeldt Sep 11, 2023
509461d
spanishinquisition: more example submissions
thorehusfeldt Sep 11, 2023
d4eb2de
Allow regex matching of patterns
thorehusfeldt Sep 11, 2023
9989ebc
Regex match examples in spanishinquisition
thorehusfeldt Sep 11, 2023
2e5861c
Notation change: allowed -> permitted
thorehusfeldt Sep 11, 2023
1dc0e49
Draft documentation
thorehusfeldt Sep 12, 2023
86ebef2
fix markdown syntax for list
thorehusfeldt Sep 12, 2023
1cccbd8
Rename some files in spanishinquisition
thorehusfeldt Sep 12, 2023
3b85390
Verbose verdicts; also hack the progressbar
thorehusfeldt Sep 12, 2023
372e4c1
bt run: more informative logging of violated expectations
thorehusfeldt Sep 12, 2023
4edfd33
refactor, rename some methods in expectations
thorehusfeldt Sep 12, 2023
6c3d945
mixed submission to spanishinquisition
thorehusfeldt Sep 12, 2023
8b3ddae
Refactor Registry again (again)
thorehusfeldt Sep 14, 2023
b68afec
Refactor into BaseExpecations
thorehusfeldt Sep 15, 2023
c783540
short_verdicts
thorehusfeldt Sep 15, 2023
aa6ddd7
bt run uses refactored expectations to detect missing requirements
thorehusfeldt Sep 15, 2023
f6f2bcc
Smoother methods for Registry
thorehusfeldt Sep 16, 2023
db01d9d
Fix doctest
thorehusfeldt Sep 16, 2023
b4d4df9
Document Registry.__setitem__(); better __repr__
thorehusfeldt Sep 17, 2023
409 changes: 409 additions & 0 deletions bin/expectations.py

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions bin/problem.py
@@ -5,8 +5,10 @@
import sys

from pathlib import Path
from functools import lru_cache

import config
import expectations
import parallel
import program
import run
@@ -37,6 +39,7 @@ def __init__(self, path, tmpdir, label=None):
self._program_callbacks = dict()
# Dictionary from path to parsed file contents.
self._testdata_yamls = dict()
self._expectations_registry = None

# The label for the problem: A, B, A1, A2, X, ...
self.label = label
@@ -438,6 +441,23 @@ def build_program(p):

problem._validators[key] = validators
return validators

# TODO Q from Thore: ok to use self here instead of problem?
def get_expectations_registry(self):
Comment on lines +445 to +446
Collaborator:

It's probably best to stay consistent with existing code, so I would use problem here instead of self, unless we rename the first parameter of all class methods from problem to self. I don't mind either way, as long as it's consistent, so this decision is better for Ragnar to make 🙂

Reply:

I'm not really a fan of self somehow. I think it's because, with the big functions everywhere, it's not always clear which class you're currently reading code in, so naming things after what they are seems clearer generally. (Especially given that self doesn't really mean anything special inside the member functions anyway.)

So yes, I'd prefer problem.

""" Parse yaml file (if any) describing the expectations for this problem.
"""
if self._expectations_registry is None:
path = self.path / 'submissions' / 'expectations.yaml'
if has_ryaml:
try:
yamldata = read_yaml_settings(path)
except ruamel.yaml.scanner.ScannerError:
fatal('Make sure expectations.yaml does not contain any more {% ... %}.')
else:
yamldata = read_yaml_settings(path)
self._expectations_registry = expectations.Registry.from_dict(yamldata)
return self._expectations_registry


def run_submissions(problem):
needans = False if problem.interactive else True
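
An illustrative aside (not part of the diff): the registry returned by get_expectations_registry() is consumed in bin/run.py further down roughly as follows. The submission path and test case name are made up for the example; the registry itself is implemented in bin/expectations.py, whose diff is not rendered above.

# Hedged usage sketch; method names are the ones visible in the bin/run.py diff below.
registry = problem.get_expectations_registry()               # parsed once, then cached
expectations = registry.for_path('wrong_answer/greedy.py')   # expectations for one submission
if not expectations.is_permitted('WA', 'secret/huge-case'):
    print('WA is not a permitted test case verdict for this submission')
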
36 changes: 28 additions & 8 deletions bin/run.py
@@ -328,7 +328,7 @@ def __init__(self, problem, path, skip_double_build_warning=False):

# The first element will match the directory the file is in, if possible.
self.expected_verdicts = self._get_expected_verdicts()

self.expectations = self.problem.get_expectations_registry().for_path(self.short_path)
# NOTE: Judging of interactive problems on systems without `os.wait4` is
# suboptimal because we cannot determine which of the submission and
# interactor exits first. Thus, we don't distinguish the different non-AC
@@ -400,9 +400,10 @@ def _get_expected_verdicts(self):
verdicts = [subdir]
else:
if len(verdicts) == 0:
error(
f'Submission {self.short_path} must have @EXPECTED_RESULTS@. Defaulting to ACCEPTED.'
)
pass # TODO (Thore): made this shut up!
#error(
# f'Submission {self.short_path} must have @EXPECTED_RESULTS@. Defaulting to ACCEPTED.'
#)
Collaborator:

This error is probably still necessary. DOMjudge does not support an expectations.yaml, and reads the @EXPECTED_RESULTS@ tag to know which final verdicts are allowed. So if the submission is not in one of the four default directories, and it does not have such a tag, DOMjudge will default to ACCEPTED, which is not intended in most cases 😅

I think that the @EXPECTED_RESULTS@ tag converted to expectations.yaml syntax would look something like:

rejected/broken.py:  # contains `@EXPECTED_RESULTS@: TIME_LIMIT_EXCEEDED, WRONG_ANSWER`
  permitted: [AC, TLE, WA]  # only these verdicts may appear
  required: [TLE, WA]       # at least one of these verdicts must appear

It would be nice if an existing @EXPECTED_RESULTS@ tag could be checked for consistency with expectations.yaml (a rough sketch of such a check follows below). Perhaps we could even infer the tag from the YAML on export, but that may result in too much seemingly magic behaviour?
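
A hedged sketch of such a consistency check (illustrative only; the helper name and the exact shape of the permitted/required sets are assumptions, not code from this PR):

LONG_TO_SHORT = {'ACCEPTED': 'AC', 'WRONG_ANSWER': 'WA',
                 'RUN_TIME_ERROR': 'RTE', 'TIME_LIMIT_EXCEEDED': 'TLE'}

def tag_matches_expectations(tag_verdicts, permitted, required=frozenset()):
    """tag_verdicts: long-form verdicts listed in @EXPECTED_RESULTS@;
    permitted/required: sets of short verdicts declared in expectations.yaml."""
    tag = {LONG_TO_SHORT[v.strip()] for v in tag_verdicts}
    # The tag always implicitly allows AC test cases.
    if permitted != tag | {'AC'}:
        return False
    # For a tag without AC, the required verdicts should all come from the tag.
    if 'AC' not in tag and not required <= tag:
        return False
    return True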

Collaborator Author (@thorehusfeldt, Sep 16, 2023):

All correct, and all (in my mind) part of the tool more than of the expectations framework.

Note that the framework does allow file-based expectations; in principle a submission itself can supply “its own” expectations, which then compose with expectations in expectations.yaml in well-defined ways.

I’ve silenced @EXPECTED_RESULTS@ in my branch simply because handling it involves too many decisions that I don’t feel strongly about. (A tiny method that translates @EXPECTED_RESULTS@ into a specific expectations.BaseExpectations would be the quickest way to be compatible. Very easy to do.)

Collaborator Author (@thorehusfeldt, Sep 16, 2023):

By the way, if

permitted: [AC, TLE, WA] 
required: [TLE, WA]  

is sufficiently popular, the cleanest way is to elevate it to a common abbreviation next to accepted, does not terminate, etc. In principle, there are 16 different possible permitted subsets (two of which make no sense), each of which can have a number of required subsets. I just took the standard set and added a few that seemed useful to me (such as not accepted).

If somebody has a large repository of @EXPECTED_RESULTS@-tagged files, we could run statistics on them! Please do so.

Collaborator:

Running the following command on my folder that contains all Git repos that I've worked with in the past years (FPC from 2018, BAPC+preliminaries from 2020, NWERC from 2022):

$ grep -r @EXPECTED_RESULTS@ 2> /dev/null | grep 20 | cut -d ':' -f 3 | sed 's/ //g' | sort | uniq -c
  • The grep 20 is to only select repositories that have a year number (2018–2023), because I want to exclude things like BAPCtools and domjudge that use this tag many times in their tests.
  • The sed command removes the optional space after the comma that separates the verdicts.
  • The verdicts may still have different orders (e.g. both ACCEPTED,WRONG_ANSWER and WRONG_ANSWER,ACCEPTED occur), but I've manually aggregated those in the results below.
      3 ACCEPTED,RUN_TIME_ERROR
      2 ACCEPTED,RUN_TIME_ERROR,TIME_LIMIT_EXCEEDED
      2 ACCEPTED,RUN_TIME_ERROR,WRONG_ANSWER
     47 ACCEPTED,TIME_LIMIT_EXCEEDED
     16 ACCEPTED,WRONG_ANSWER
     10 RUN_TIME_ERROR,TIME_LIMIT_EXCEEDED
      3 RUN_TIME_ERROR,TIME_LIMIT_EXCEEDED,WRONG_ANSWER
     22 RUN_TIME_ERROR,WRONG_ANSWER
     49 TIME_LIMIT_EXCEEDED,WRONG_ANSWER

After running this command, I realize that my earlier example only works if the tag only has non-AC verdicts. For example, a combined AC,TLE verdict may better be written without required:

mixed/sometimes-too-slow.py:  # contains `@EXPECTED_RESULTS@: ACCEPTED, TIME_LIMIT_EXCEEDED`
  permitted: [AC, TLE]  # only these verdicts may appear

Adding the required field is probably not even strictly necessary in any of these cases, but for the cases that do not include AC, it's probably helpful if the tool warns about a submission that we expect to give some non-AC verdict, but is in fact AC.


And as a final remark, I'm finding it difficult to switch my mental model from "the verdict that the submission receives" to "the set of verdicts that the test cases receive". The fact that DOMjudge (and @EXPECTED_RESULTS@) use the former and expectations.yaml uses the latter, probably doesn't help either. 😂

Collaborator Author (@thorehusfeldt, Sep 17, 2023):

Thank you for this. Also for this formulation:

I'm finding it difficult to switch my mental model from "the verdict that the submission receives" to "the set of verdicts that the test cases receive".

That is exactly why I’m doing this; and I’m trying to be terminologically stringent about it. To repeat this in my own words,

  1. The separation of these two concepts is crucial for even defining the semantics of, e.g., time_limit_exceeded, in accordance with the specification.
  2. Another separation is between the behaviour of the construction and validation tool (which is used by authors) and the electronic judge (which is used by contest organisers, teachers, and solvers). The former (say, BAPCtools or problemtools) needs a way of saying “no testcase for the brute-force solution can ever be WA, but TLE and RTE are acceptable, and the final verdict shan’t be AC”, and construction tools should support this to ensure problem quality. On the other hand, the judging environment (Kattis, DOMjudge, etc.) needs well-defined semantics for what the final verdict is (or maybe the subtask scores of IOI), and for when it is OK to abort prematurely to reduce server load. For instance, a judge can report the lexicographically first (by testcase path) non-AC verdict.

By the way, I’m happy to add final to the expectations specification, and tried many ideas. Here’s how that could look:

final: TLE
permitted: [AC, TLE, WA]
required: [TLE]

This is then either a requirement about the lexicographic ordering of testcases, ensuring that the TLE verdict is encountered before the WA verdict (what the spec calls first_error), or that the judge has an ordering of rejection-verdicts that makes TLE worse than WA, continues grading after encountering WA, and then reports the worst error.

But I don’t really see a need for final, nor are its semantics clear to me. This is the kind of feedback I’m soliciting here. (In my current BAPCtools integration, these issues would need to be resolved to understand whether the final verdict should be red or green.) See, for instance, submission bad-tle-wa.py in the large screenshot above, which fails (because it gets a WA), yet gets a green TLE (which, after all, is a permitted testcase verdict). I have no opinion on what the semantics or visualisation should be.

Collaborator:

Thanks for the elaboration! I don't think that a final expectation is needed, indeed 🙂

Collaborator Author (@thorehusfeldt, Sep 17, 2023):

I’ve updated (b4d4df9) the pydoc documentation of expectations.yaml to spell out the workflow that avoids expectation.yaml altogether and specifies expectations submission-by-submission by writing

registry['wrong_answer/greedy.py'] = Expectations({'sample': 'accepted'})

This would be my suggestion for translating submissions that contain @EXPECTED_RESULTS@. But I simply do not understand the semantics of the latter, because I can’t wrap my head around

Use this to annotate the possible outcomes that a submission can have, that will be accepted by the Judging verifier.

in the DOMjudge appendix.

As far as I can tell, the final verdict (“outcome”) in DOMjudge depends on results_prio, which is not part of the problem package, and in the default setting seems to be what the problem package specification calls first_error. Thus, the outcome depends on the lexicographic ordering of the testcases. I don’t know if this granularity is intended or used, nor do I know how to translate it.

closed PR @ problem_package_format

Collaborator (@mpsijm, Sep 18, 2023):

For DOMjudge, the final verdict not only depends on results_prio, but also on whether lazy judging is enabled: https://www.domjudge.org/docs/manual/8.2/judging.html#lazy-judging-and-results-priority

When [lazy judging is] enabled, judging of a submission will stop when a highest priority result has been found for any testcase. You can find these priorities under the results_prio setting. In the default configuration, when enabling this, judging will stop with said verdict when a testcase results in e.g. run-error, timelimit or wrong-answer. When a testcase is correct (lower priority), judging will continue to the next test case. In other words, to arrive at a verdict of correct, all testcases will have been evaluated, while any of the ‘error’ verdicts will immediately return this answer for the submission and the other testcases will never be tested [...].

[...]

When not using lazy judging, all testcases will always be ran for each submission. The results_prio list will then determine which of the individual testcase results will be the overall submission result: the highest priority one. In case of a tie, the first occurring testcase result with highest priority is returned.

I think we can assume the default DOMjudge settings for BAPCtools. At least, we have never changed this setting in DOMjudge in the past years 🙂 Note that lazy judging, especially when running multiple runners in parallel, may not necessarily return the result of the first non-accepted test case. In the cases where we use @EXPECTED_RESULTS@, I think we usually don't care about the ordering, as long as the submission never receives any other verdict than a verdict from this list (i.e. we don't use first_error currently, but I also don't think we knew that it existed). Thus, at least for now, I don't think we have to translate first_error to expectations.

I would translate @EXPECTED_RESULTS@: <verdict_list> as follows (and it would be nice if BAPCtools did this automatically when an expectations.yaml does not exist); a sketch of this translation follows after the list:

  • if AC in <verdict_list>:
    permitted: <verdict_list>
  • if AC not in <verdict_list>:
    permitted: [AC, *<verdict_list>]  # some test cases may pass, so also include `AC`
    required: <verdict_list>  # at least one of the specified verdicts must appear for at least one of the test cases
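
A sketch of this translation rule (the function name and returned dict layout are illustrative; no such helper exists in this PR yet):

def expectations_from_tag(verdict_list):
    """Translate a long-form @EXPECTED_RESULTS@ verdict list into the
    permitted/required mapping proposed above."""
    short = {'ACCEPTED': 'AC', 'WRONG_ANSWER': 'WA',
             'RUN_TIME_ERROR': 'RTE', 'TIME_LIMIT_EXCEEDED': 'TLE'}
    tag = [short[v.strip()] for v in verdict_list]
    if 'AC' in tag:
        return {'permitted': tag}
    # Some test cases may still pass, so AC is always permitted;
    # at least one of the tagged verdicts must actually appear.
    return {'permitted': ['AC'] + tag, 'required': tag}

# e.g. expectations_from_tag(['TIME_LIMIT_EXCEEDED', 'WRONG_ANSWER'])
#      -> {'permitted': ['AC', 'TLE', 'WA'], 'required': ['TLE', 'WA']}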


if len(verdicts) == 0:
verdicts = ['ACCEPTED']
@@ -452,9 +453,10 @@ def run_all_testcases(

verdict = (-100, 'ACCEPTED', 'ACCEPTED', 0) # priority, verdict, print_verdict, duration
verdict_run = None
verdict_for_testcase = dict()

def process_run(run, p):
nonlocal max_duration, verdict, verdict_run
nonlocal max_duration, verdict, verdict_run, verdict_for_testcase

localbar = bar.start(run)
result = run.run()
@@ -476,7 +478,10 @@ def process_run(run, p):
if table_dict is not None:
table_dict[run.name] = result.verdict == 'ACCEPTED'

got_expected = result.verdict in ['ACCEPTED'] + self.expected_verdicts
verdict_short = short_verdict(result.verdict)
verdict_for_testcase[run.name] = verdict_short
#got_expected = result.verdict in ['ACCEPTED'] + self.expected_verdicts
got_expected = self.expectations.is_permitted(verdict_short, run.name)

# Print stderr whenever something is printed
if result.out and result.err:
@@ -514,8 +519,17 @@ def process_run(run, p):
data += '\n'
data += f'{f.name}:' + localbar._format_data(t) + '\n'

if not got_expected:
localbar.error(f'{result.duration:6.3f}s {result.print_verdict()}', data)
short = short_verdict(result.verdict)
for prefix, pattern, verdicts in self.expectations.violated_permissions(short, run.name):
prefix = (f'{Fore.CYAN}{prefix:>{len(localbar.prefix)}}{Style.RESET_ALL}:' +
f'{pattern:<{localbar.item_width}}')
localbar.warn(f"permits {verbose_verdicts(verdicts)}", prefix=prefix)

localbar.done(got_expected, f'{result.duration:6.3f}s {result.print_verdict()}', data)


# Lazy judging: stop on the first error when not in verbose mode.
if (
not config.args.verbose and not config.args.table
@@ -534,16 +548,22 @@ def process_run(run, p):
self.print_verdict = verdict[2]
self.duration = max_duration

# Check presence of required verdicts among testgroups
for prefix, pattern, verdicts in self.expectations.unsatisfied_requirements(verdict_for_testcase):
prefix = (f'{Fore.CYAN}{prefix:>{len(bar.prefix)}}{Style.RESET_ALL}: ' +
f'{pattern:<{bar.item_width}}')
bar.warn(f"no test case got {verbose_verdicts(verdicts)}", prefix=prefix)

# Use a bold summary line if things were printed before.
if bar.logged:
color = (
Style.BRIGHT + Fore.GREEN
if self.verdict in self.expected_verdicts
if self.expectations.is_permitted(short_verdict(self.verdict), Path())
else Style.BRIGHT + Fore.RED
)
boldcolor = Style.BRIGHT
else:
color = Fore.GREEN if self.verdict in self.expected_verdicts else Fore.RED
color = Fore.GREEN if self.expectations.is_permitted(short_verdict(self.verdict), Path()) else Fore.RED
boldcolor = ''

printed_newline = bar.finalize(
39 changes: 35 additions & 4 deletions bin/util.py
@@ -186,6 +186,7 @@ def clearline(self):
return
print(self.carriage_return, end='', flush=True, file=sys.stderr)

@staticmethod
def action(prefix, item, width=None, total_width=None):
if width is not None and total_width is not None and len(prefix) + 2 + width > total_width:
width = total_width - len(prefix) - 2
@@ -273,7 +274,7 @@ def _format_data(data):

# Log can be called multiple times to make multiple persistent lines.
# Make sure that the message does not end in a newline.
def log(self, message='', data='', color=Fore.GREEN, *, resume=True):
def log(self, message='', data='', color=Fore.GREEN, *, resume=True, prefix=None):
with self.lock:
if message is None:
message = ''
@@ -292,7 +293,7 @@ def log(self, message='', data='', color=Fore.GREEN, *, resume=True):
self.needs_leading_newline = False

print(
self.get_prefix(),
self.get_prefix() if prefix is None else prefix,
color,
message,
ProgressBar._format_data(data),
@@ -313,9 +314,9 @@ def debug(self, message, data=''):
if config.args.verbose:
self.log(message, data)

def warn(self, message='', data=''):
def warn(self, message='', data='', prefix=None):
config.n_warn += 1
self.log(message, data, Fore.YELLOW)
self.log(message, data, Fore.YELLOW, prefix=prefix)

# Error removes the current item from the in_progress set.
def error(self, message='', data=''):
@@ -981,3 +982,33 @@ def combine_hashes_dict(d):
if d[key] is not None:
hasher.update(d[key].encode())
return hasher.hexdigest()


def short_verdict(verdict):
return {
'ACCEPTED': 'AC',
'WRONG_ANSWER': 'WA',
'RUN_TIME_ERROR': 'RTE',
'TLE (aborted)': 'TLE',
'TIME_LIMIT_EXCEEDED': 'TLE'
}[verdict]

def verbose_verdicts(verdicts: str | set[str], oxford_comma=True) -> str:
long_form = {
'AC': 'ACCEPTED',
'WA': 'WRONG_ANSWER',
'RTE': 'RUN_TIME_ERROR',
'TLE': 'TLE (aborted)'
}
if isinstance(verdicts, str):
verdicts = {verdicts}
verdicts = list(sorted(verdicts))
if len(verdicts) == 1:
return long_form[verdicts[0]]
elif len(verdicts) == 2:
return f"{long_form[verdicts[0]]} or {long_form[verdicts[1]]}"
else:
assert len(verdicts) == 3, len(verdicts)
return f"{long_form[verdicts[0]]}, {long_form[verdicts[1]]}, or {long_form[verdicts[2]]}"

