#!/usr/bin/env python3
# This file is part of Cockpit.
#
# Copyright (C) 2021 Red Hat, Inc.
#
# Cockpit is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# Cockpit is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Cockpit; If not, see <http://www.gnu.org/licenses/>.
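
# Read the test results database and print statistics about test runs in the
# Prometheus text exposition format; optionally upload the result to an S3 URL.
#
# Example invocation (database path and URL are made up):
#   ./prometheus-stats --db test-results.db --prune 90 \
#       --s3 https://s3.example.com/bucket/prometheus-stats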

import argparse
import collections
import logging
import sqlite3
import sys
import urllib.parse

from lib import s3, stores

# Seconds in an hour
HOUR = 3600


def main() -> int:
    parser = argparse.ArgumentParser(description='Generate infrastructure and test statistics')
    parser.add_argument("--db", default="test-results.db", help="Database name")
    parser.add_argument("--s3", metavar="URL", type=urllib.parse.urlparse,
                        help="Upload generated file to given S3 URL")
    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
    parser.add_argument("--prune", type=int, metavar="DAYS", help="Remove entries older than DAYS")
    opts = parser.parse_args()

    if opts.verbose:
        logging.basicConfig(level=logging.DEBUG)

    db_conn = sqlite3.connect(opts.db)
    cursor = db_conn.cursor()

    if opts.prune:
        # This would be much more elegant with just "PRAGMA foreign_keys = ON" and letting SQLite
        # auto-clean up rows referencing deleted TestRuns in the other tables, but that takes
        # unbearably long.
        cursor.execute(f"CREATE TEMP TABLE to_delete AS SELECT id FROM TestRuns "
                       f"WHERE time < unixepoch('now', '-{opts.prune} days')")
        cursor.execute("DELETE FROM Tests WHERE run IN to_delete")
        cursor.execute("DELETE FROM TestCoverage WHERE run IN to_delete")
        cursor.execute("DELETE FROM TestRuns WHERE id IN to_delete")
        db_conn.commit()
        # Rewrite the database file to actually reclaim the space freed above
        cursor.execute("VACUUM")
        db_conn.commit()

    # Output in the Prometheus text exposition format, see:
    # https://prometheus.io/docs/instrumenting/exposition_formats/
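    # In that format, a histogram metric consists of cumulative "_bucket" samples
    # (le="X" counts all observations <= X, and the le="+Inf" bucket equals
    # "_count"), plus a "_sum" and a "_count" sample.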
output = ""
# Get total number of runs and wait time sum
(test_runs, test_runs_queue_wait_sum) = cursor.execute("""\
SELECT COUNT(*), SUM(wait_seconds)
FROM TestRuns""").fetchone()
if test_runs_queue_wait_sum is None:
test_runs_queue_wait_sum = 0
# How many test runs waited for at most 5 minutes
test_runs_wait_5m = cursor.execute("""\
SELECT COUNT(*)
FROM TestRuns
WHERE wait_seconds <= 300""").fetchone()[0]
# Get how many test runs waited for at most 1 hour
test_runs_wait_1h = cursor.execute("""\
SELECT COUNT(*)
FROM TestRuns
WHERE wait_seconds <= 3600""").fetchone()[0]
output += f"""# HELP queue_time_wait_seconds histogram of queue wait times
# TYPE queue_time_wait_seconds histogram
queue_time_wait_seconds_bucket{{le="300"}} {test_runs_wait_5m}
queue_time_wait_seconds_bucket{{le="3600"}} {test_runs_wait_1h}
queue_time_wait_seconds_bucket{{le="+Inf"}} {test_runs}
queue_time_wait_seconds_sum {test_runs_queue_wait_sum}
queue_time_wait_seconds_count {test_runs}
"""

    # Total test runtime, summed over all runs
    test_run_seconds_sum = cursor.execute("""\
        SELECT SUM(run_seconds)
        FROM TestRuns""").fetchone()[0]
    if test_run_seconds_sum is None:
        test_run_seconds_sum = 0

    # How many test runs took at most 1 hour
    test_run_seconds_1h = cursor.execute(f"""\
        SELECT COUNT(*)
        FROM TestRuns
        WHERE run_seconds <= {HOUR}""").fetchone()[0]

    # How many test runs took at most 2 hours
    test_run_seconds_2h = cursor.execute(f"""\
        SELECT COUNT(*)
        FROM TestRuns
        WHERE run_seconds <= {2 * HOUR}""").fetchone()[0]
output += f"""
# HELP test_run_seconds histogram of test execution time
# TYPE test_run_seconds histogram
test_run_seconds_bucket{{le="3600"}} {test_run_seconds_1h}
test_run_seconds_bucket{{le="7200"}} {test_run_seconds_2h}
test_run_seconds_bucket{{le="+Inf"}} {test_runs}
test_run_seconds_sum {test_run_seconds_sum}
test_run_seconds_count {test_runs}
"""
    # Top failures of the last 7 days: for each (test, context), the percentage of
    # its runs that failed, considering only tests which failed more than once
    top_failures = cursor.execute("""
        SELECT project, allruns.testname, failruns.context, (failruns.failed * 100.0) / COUNT(run) AS percent
        FROM Tests AS allruns
        JOIN TestRuns ON allruns.run = TestRuns.id
        JOIN (
            SELECT testname, context, COUNT(run) AS failed
            FROM Tests
            JOIN TestRuns ON Tests.run = TestRuns.id
            WHERE TestRuns.time > strftime('%s', 'now', '-7 days') AND Tests.failed = 1
            GROUP BY testname, context
        ) AS failruns ON allruns.testname = failruns.testname AND TestRuns.context = failruns.context
        WHERE TestRuns.time > strftime('%s', 'now', '-7 days') AND failruns.failed > 1
        GROUP BY allruns.testname, failruns.context
        ORDER BY percent DESC
        LIMIT 50""").fetchall()
output += """
# HELP top_failures_pct Most unstable tests in the last 7 days (more than 10% failure rate)
# TYPE top_failures_pct gauge
"""
    for (project, test, context, fail_pct) in top_failures:
        if fail_pct > 10:
            output += f'top_failures_pct{{project="{project}",test="{test}",context="{context}"}} {fail_pct}\n'

    # PR retries: for each (project, revision), the highest retry number and the
    # state of that final run (with a single MAX() aggregate, SQLite returns the
    # other selected columns from the row holding the maximum)
    pr_retries = cursor.execute("""
        SELECT project, revision, state, MAX(retry)
        FROM TestRuns
        GROUP BY project, revision""").fetchall()

    retry_buckets: dict[int, int] = {}
    merged_failures = 0
    for _project, _revision, state, retries in pr_retries:
        retry_buckets[retries] = retry_buckets.get(retries, 0) + 1
        if state != 'success':
            merged_failures += 1

    acc = 0
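    # Prometheus histogram buckets are cumulative: the bucket for le="N" counts all
    # PRs that needed at most N retries, hence the running total in `acc`.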
output += """
# HELP pr_retries histogram of how many retries it took to get PRs to green
# TYPE pr_retries histogram
"""
    for retry_count in sorted(retry_buckets):
        acc += retry_buckets[retry_count]
        output += f'pr_retries_bucket{{le="{retry_count}"}} {acc}\n'
    output += f'''pr_retries_bucket{{le="+Inf"}} {acc}
pr_retries_sum {acc}
pr_retries_count {acc}
'''
output += f"""
# HELP merged_failures How many PRs were merged with non-successful tests
# TYPE merged_failures counter
merged_failures {merged_failures}
"""

    # Slowest tests
    output += """
# HELP top_slowest_tests The slowest tests in the last 7 days
# TYPE top_slowest_tests gauge
"""
slowest_tests = """
SELECT project, testname, max(seconds) FROM Tests AS t1
JOIN TestRuns ON t1.run = TestRuns.id
WHERE TestRuns.time > strftime('%s', 'now', '-7 days') AND seconds > 0
GROUP BY project, testname
ORDER BY seconds DESC
LIMIT 200
"""
slowest_tests_map: collections.defaultdict[str, int] = collections.defaultdict(int)
# Number of slowest tests per project to output
slowest_tests_limit = 10
for (project, test, seconds) in cursor.execute(slowest_tests).fetchall():
if slowest_tests_map[project] > slowest_tests_limit:
continue
output += f'top_slowest_tests{{project="{project}",test="{test}"}} {seconds}\n'
slowest_tests_map[project] += 1

    # S3 space usage
    space_usage = {}
    for uri in stores.PUBLIC_STORES:
        url = urllib.parse.urlparse(uri)
        if s3.is_key_present(url):
            # parse_list() yields the "Size" field of each object as a 1-tuple,
            # hence the `size,` unpacking
            space_usage[uri] = sum(int(size) for size, in s3.parse_list(s3.list_bucket(url), "Size"))

    if space_usage:
        output += """
# HELP s3_usage_bytes How much disk space an S3 store is using
# TYPE s3_usage_bytes gauge
"""
        for uri, size in space_usage.items():
            output += f's3_usage_bytes{{store="{uri}"}} {size}\n'

    # Code coverage
    output += """
# HELP test_coverage Overall percentage of code covered by tests
# TYPE test_coverage gauge
"""
code_coverage = """
SELECT project, coverage, MAX(time) FROM TestCoverage AS tc
JOIN TestRuns ON tc.run = TestRuns.id
GROUP BY TestRuns.project
"""
for project, coverage, _ in cursor.execute(code_coverage).fetchall():
output += f'test_coverage{{project="{project}"}} {coverage}\n'
    db_conn.close()

    print(output)
    if opts.s3:
        with s3.urlopen(opts.s3, data=output.encode("UTF-8"), method="PUT",
                        headers={"Content-Type": "text/plain", s3.ACL: s3.PUBLIC}):
            pass

    return 0


if __name__ == '__main__':
    sys.exit(main())