#!/usr/bin/env python3
# This file is part of Cockpit.
#
# Copyright (C) 2021 Red Hat, Inc.
#
# Cockpit is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# Cockpit is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Cockpit; If not, see <http://www.gnu.org/licenses/>.
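
# Read the test results database and print statistics about test runs in the
# Prometheus text exposition format; optionally upload the result to an S3 URL.
#
# Example invocation (database path and URL are made up):
#   ./prometheus-stats --db test-results.db --prune 90 \
#       --s3 https://s3.example.com/bucket/prometheus-stats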

import argparse
import collections
import logging
import sqlite3
import sys
import urllib.parse

from lib import s3, stores

# Seconds in an hour
HOUR = 3600


def main() -> int:
    parser = argparse.ArgumentParser(description='Generate infrastructure and test statistics')
    parser.add_argument("--db", default="test-results.db", help="Database name")
    parser.add_argument("--s3", metavar="URL", type=urllib.parse.urlparse,
                        help="Upload generated file to given S3 URL")
    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
    parser.add_argument("--prune", type=int, metavar="DAYS", help="Remove entries older than DAYS")
    opts = parser.parse_args()

    if opts.verbose:
        logging.basicConfig(level=logging.DEBUG)

    db_conn = sqlite3.connect(opts.db)
    cursor = db_conn.cursor()

    if opts.prune:
        # This would be much more elegant with just "PRAGMA foreign_keys = ON" and letting SQLite
        # auto-clean up rows referencing deleted TestRuns in the other tables, but that takes
        # unbearably long.
        cursor.execute(f"CREATE TEMP TABLE to_delete AS SELECT id FROM TestRuns "
                       f"WHERE time < unixepoch('now', '-{opts.prune} days')")
        cursor.execute("DELETE FROM Tests WHERE run IN to_delete")
        cursor.execute("DELETE FROM TestCoverage WHERE run IN to_delete")
        cursor.execute("DELETE FROM TestRuns WHERE id IN to_delete")
        db_conn.commit()
        # Rewrite the database file to actually reclaim the space freed above
        cursor.execute("VACUUM")
        db_conn.commit()

    # Output in the Prometheus text exposition format, see:
    # https://prometheus.io/docs/instrumenting/exposition_formats/
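    # In that format, a histogram metric consists of cumulative "_bucket" samples
    # (le="X" counts all observations <= X, and the le="+Inf" bucket equals
    # "_count"), plus a "_sum" and a "_count" sample.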
output = ""
# Get total number of runs and wait time sum
(test_runs, test_runs_queue_wait_sum) = cursor.execute("""\
SELECT COUNT(*), SUM(wait_seconds)
FROM TestRuns""").fetchone()
if test_runs_queue_wait_sum is None:
test_runs_queue_wait_sum = 0
# How many test runs waited for at most 5 minutes
test_runs_wait_5m = cursor.execute("""\
SELECT COUNT(*)
FROM TestRuns
WHERE wait_seconds <= 300""").fetchone()[0]
# Get how many test runs waited for at most 1 hour
test_runs_wait_1h = cursor.execute("""\
SELECT COUNT(*)
FROM TestRuns
WHERE wait_seconds <= 3600""").fetchone()[0]
output += f"""# HELP queue_time_wait_seconds histogram of queue wait times
# TYPE queue_time_wait_seconds histogram
queue_time_wait_seconds_bucket{{le="300"}} {test_runs_wait_5m}
queue_time_wait_seconds_bucket{{le="3600"}} {test_runs_wait_1h}
queue_time_wait_seconds_bucket{{le="+Inf"}} {test_runs}
queue_time_wait_seconds_sum {test_runs_queue_wait_sum}
queue_time_wait_seconds_count {test_runs}
"""

    # Total test runtime, summed over all runs
    test_run_seconds_sum = cursor.execute("""\
        SELECT SUM(run_seconds)
        FROM TestRuns""").fetchone()[0]
    if test_run_seconds_sum is None:
        test_run_seconds_sum = 0

    # How many test runs took at most 1 hour
    test_run_seconds_1h = cursor.execute(f"""\
        SELECT COUNT(*)
        FROM TestRuns
        WHERE run_seconds <= {HOUR}""").fetchone()[0]

    # How many test runs took at most 2 hours
    test_run_seconds_2h = cursor.execute(f"""\
        SELECT COUNT(*)
        FROM TestRuns
        WHERE run_seconds <= {2 * HOUR}""").fetchone()[0]
output += f"""
# HELP test_run_seconds histogram of test execution time
# TYPE test_run_seconds histogram
test_run_seconds_bucket{{le="3600"}} {test_run_seconds_1h}
test_run_seconds_bucket{{le="7200"}} {test_run_seconds_2h}
test_run_seconds_bucket{{le="+Inf"}} {test_runs}
test_run_seconds_sum {test_run_seconds_sum}
test_run_seconds_count {test_runs}
"""
    # Top failures of the last 7 days: for each (test, context), the percentage of
    # its runs that failed, considering only tests which failed more than once
    top_failures = cursor.execute("""
        SELECT project, allruns.testname, failruns.context, (failruns.failed * 100.0) / COUNT(run) AS percent
        FROM Tests AS allruns
        JOIN TestRuns ON allruns.run = TestRuns.id
        JOIN (
            SELECT testname, context, COUNT(run) AS failed
            FROM Tests
            JOIN TestRuns ON Tests.run = TestRuns.id
            WHERE TestRuns.time > strftime('%s', 'now', '-7 days') AND Tests.failed = 1
            GROUP BY testname, context
        ) AS failruns ON allruns.testname = failruns.testname AND TestRuns.context = failruns.context
        WHERE TestRuns.time > strftime('%s', 'now', '-7 days') AND failruns.failed > 1
        GROUP BY allruns.testname, failruns.context
        ORDER BY percent DESC
        LIMIT 50""").fetchall()
output += """
# HELP top_failures_pct Most unstable tests in the last 7 days (more than 10% failure rate)
# TYPE top_failures_pct gauge
"""
    for (project, test, context, fail_pct) in top_failures:
        if fail_pct > 10:
            output += f'top_failures_pct{{project="{project}",test="{test}",context="{context}"}} {fail_pct}\n'

    # PR retries: for each (project, revision), the highest retry number and the
    # state of that final run (with a single MAX() aggregate, SQLite returns the
    # other selected columns from the row holding the maximum)
    pr_retries = cursor.execute("""
        SELECT project, revision, state, MAX(retry)
        FROM TestRuns
        GROUP BY project, revision""").fetchall()

    retry_buckets: dict[int, int] = {}
    merged_failures = 0
    for _project, _revision, state, retries in pr_retries:
        retry_buckets[retries] = retry_buckets.get(retries, 0) + 1
        if state != 'success':
            merged_failures += 1

    acc = 0
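    # Prometheus histogram buckets are cumulative: the bucket for le="N" counts all
    # PRs that needed at most N retries, hence the running total in `acc`.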
output += """
# HELP pr_retries histogram of how many retries it took to get PRs to green
# TYPE pr_retries histogram
"""
    for retry_count in sorted(retry_buckets):
        acc += retry_buckets[retry_count]
        output += f'pr_retries_bucket{{le="{retry_count}"}} {acc}\n'
    output += f'''pr_retries_bucket{{le="+Inf"}} {acc}
pr_retries_sum {acc}
pr_retries_count {acc}
'''
output += f"""
# HELP merged_failures How many PRs were merged with non-successful tests
# TYPE merged_failures counter
merged_failures {merged_failures}
"""

    # Slowest tests
    output += """
# HELP top_slowest_tests The slowest tests in the last 7 days
# TYPE top_slowest_tests gauge
"""
slowest_tests = """
SELECT project, testname, max(seconds) FROM Tests AS t1
JOIN TestRuns ON t1.run = TestRuns.id
WHERE TestRuns.time > strftime('%s', 'now', '-7 days') AND seconds > 0
GROUP BY project, testname
ORDER BY seconds DESC
LIMIT 200
"""
slowest_tests_map: collections.defaultdict[str, int] = collections.defaultdict(int)
# Number of slowest tests per project to output
slowest_tests_limit = 10
for (project, test, seconds) in cursor.execute(slowest_tests).fetchall():
if slowest_tests_map[project] > slowest_tests_limit:
continue
output += f'top_slowest_tests{{project="{project}",test="{test}"}} {seconds}\n'
slowest_tests_map[project] += 1

    # S3 space usage
    space_usage = {}
    for uri in stores.PUBLIC_STORES:
        url = urllib.parse.urlparse(uri)
        if s3.is_key_present(url):
            # parse_list() yields the "Size" field of each object as a 1-tuple,
            # hence the `size,` unpacking
            space_usage[uri] = sum(int(size) for size, in s3.parse_list(s3.list_bucket(url), "Size"))

    if space_usage:
        output += """
# HELP s3_usage_bytes How much disk space an S3 store is using
# TYPE s3_usage_bytes gauge
"""
        for uri, size in space_usage.items():
            output += f's3_usage_bytes{{store="{uri}"}} {size}\n'

    # Code coverage
    output += """
# HELP test_coverage Overall percentage of code covered by tests
# TYPE test_coverage gauge
"""
code_coverage = """
SELECT project, coverage, MAX(time) FROM TestCoverage AS tc
JOIN TestRuns ON tc.run = TestRuns.id
GROUP BY TestRuns.project
"""
for project, coverage, _ in cursor.execute(code_coverage).fetchall():
output += f'test_coverage{{project="{project}"}} {coverage}\n'
    db_conn.close()

    print(output)
    if opts.s3:
        with s3.urlopen(opts.s3, data=output.encode("UTF-8"), method="PUT",
                        headers={"Content-Type": "text/plain", s3.ACL: s3.PUBLIC}):
            pass

    return 0


if __name__ == '__main__':
    sys.exit(main())