Skip to content

Commit

Permalink
feat: Enhancements in stats server
Browse files Browse the repository at this point in the history
* Add admin-users CLI arg. Admin users can impersonate other users by querying with a different user set in a specific header

* Add ability to query for a single job based on jobid and/or jobuuid

* Pass entire DB config to server

* Limit query window to 3 months to prevent making big queries

* Add new test scenarios for testing different endpoints

* Update test fixtures and Makefile

Signed-off-by: Mahendra Paipuri <[email protected]>
  • Loading branch information
mahendrapaipuri committed Jan 1, 2024
1 parent 7dfa6c3 commit ced4ad3
Show file tree
Hide file tree
Showing 12 changed files with 606 additions and 235 deletions.
6 changes: 5 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,11 @@ else
.PHONY: test-e2e
test-e2e: build pkg/collector/fixtures/sys/.unpacked pkg/collector/fixtures/proc/.unpacked
@echo ">> running end-to-end tests"
./scripts/e2e-test.sh -s stats
./scripts/e2e-test.sh -s stats-account-query
./scripts/e2e-test.sh -s stats-jobuuid-query
./scripts/e2e-test.sh -s stats-jobid-query
./scripts/e2e-test.sh -s stats-jobuuid-jobid-query
./scripts/e2e-test.sh -s stats-admin-query
endif

.PHONY: skip-test-e2e
Expand Down
67 changes: 46 additions & 21 deletions pkg/jobstats/cli/cli.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"os/signal"
"path/filepath"
"runtime"
"strings"
"sync"
"syscall"
"time"
Expand All @@ -18,6 +19,7 @@ import (
"github.com/mahendrapaipuri/batchjob_monitoring/pkg/jobstats/db"
"github.com/mahendrapaipuri/batchjob_monitoring/pkg/jobstats/schedulers"
"github.com/mahendrapaipuri/batchjob_monitoring/pkg/jobstats/server"
"github.com/prometheus/common/model"
"github.com/prometheus/common/promlog"
"github.com/prometheus/common/promlog/flag"
"github.com/prometheus/common/version"
Expand Down Expand Up @@ -48,14 +50,18 @@ func (b *BatchJobStatsServer) Main() {
"web.config.file",
"Path to configuration file that can enable TLS or authentication. See: https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md",
).Default("").String()
adminUsers = b.App.Flag(
"web.admin-users",
"Comma separated list of admin users.",
).Default("").String()
dataPath = b.App.Flag(
"data.path",
"Absolute path to a directory where job data is stored. SQLite DB that contains jobs stats will be saved to this directory.",
).Default("/var/lib/jobstats").String()
retentionPeriod = b.App.Flag(
retentionPeriodString = b.App.Flag(
"data.retention.period",
"Period in days for which job stats data will be retained.",
).Default("365").Int()
"Period in days for which job stats data will be retained. Units Supported: y, w, d, h, m, s, ms.",
).Default("1y").String()
jobstatDBFile = b.App.Flag(
"db.name",
"Name of the SQLite DB file that contains job stats.",
Expand All @@ -66,12 +72,12 @@ func (b *BatchJobStatsServer) Main() {
).Default("jobs").String()
lastUpdateTime = b.App.Flag(
"db.last.update.time",
"Last time the DB was updated. Job stats from this time will be added for new DB.",
"Last time the DB was updated. Job stats from this time will be added for new DB. Supported formate: YYYY-MM-DD.",
).Default(time.Now().Format("2006-01-02")).String()
updateInterval = b.App.Flag(
updateIntervalString = b.App.Flag(
"db.update.interval",
"Time period in seconds at which DB will be updated with job stats.",
).Default("1800").Int()
"Time period at which DB will be updated with job stats. Units Supported: y, w, d, h, m, s, ms.",
).Default("15m").String()
maxProcs = b.App.Flag(
"runtime.gomaxprocs", "The target number of CPUs Go will run on (GOMAXPROCS)",
).Envar("GOMAXPROCS").Default("1").Int()
Expand All @@ -97,6 +103,25 @@ func (b *BatchJobStatsServer) Main() {
os.Exit(1)
}

// Parse retentionPeriod and updateInterval
retentionPeriod, err := model.ParseDuration(*retentionPeriodString)
if err != nil {
fmt.Printf("Failed to parse --data.retention.period flag. Error: %s", err)
os.Exit(1)
}
updateInterval, err := model.ParseDuration(*updateIntervalString)
if err != nil {
fmt.Printf("Failed to parse --db.update.interval flag. Error: %s", err)
os.Exit(1)
}

// Parse lastUpdateTime to check if it is in correct format
_, err = time.Parse("2006-01-02", *lastUpdateTime)
if err != nil {
fmt.Printf("Failed to parse --db.last.update.time flag. Error: %s", err)
os.Exit(1)
}

// Set logger here after properly configuring promlog
logger := promlog.New(promlogConfig)

Expand All @@ -119,14 +144,25 @@ func (b *BatchJobStatsServer) Main() {
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGINT, syscall.SIGTERM)
defer stop()

// Make DB config
dbConfig := &db.Config{
Logger: logger,
JobstatsDBPath: jobstatDBPath,
JobstatsDBTable: *jobstatDBTable,
RetentionPeriod: time.Duration(retentionPeriod),
LastUpdateTimeString: *lastUpdateTime,
LastUpdateTimeStampFile: jobsLastTimeStampFile,
BatchScheduler: schedulers.NewBatchScheduler,
}

// Make server config
serverConfig := &server.Config{
Logger: logger,
Address: *webListenAddresses,
WebSystemdSocket: *systemdSocket,
WebConfigFile: *webConfigFile,
JobstatDBFile: jobstatDBPath,
JobstatDBTable: *jobstatDBTable,
DBConfig: *dbConfig,
AdminUsers: strings.Split(*adminUsers, ","),
}

// Create server instance
Expand All @@ -137,17 +173,6 @@ func (b *BatchJobStatsServer) Main() {
return
}

// Make DB config
dbConfig := &db.Config{
Logger: logger,
JobstatsDBPath: jobstatDBPath,
JobstatsDBTable: *jobstatDBTable,
RetentionPeriod: *retentionPeriod,
LastUpdateTimeString: *lastUpdateTime,
LastUpdateTimeStampFile: jobsLastTimeStampFile,
BatchScheduler: schedulers.NewBatchScheduler,
}

// Create DB instance
jobCollector, err := db.NewJobStatsDB(dbConfig)
if err != nil {
Expand All @@ -161,7 +186,7 @@ func (b *BatchJobStatsServer) Main() {
wg.Add(1)
go func() {
// Start a ticker
ticker := time.NewTicker(time.Second * time.Duration(*updateInterval))
ticker := time.NewTicker(time.Duration(updateInterval))
defer ticker.Stop()

loop:
Expand Down
1 change: 0 additions & 1 deletion pkg/jobstats/fixtures/e2e-test-stats-server-output.txt

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"status":"success","errorType":"","error":"","warnings":null,"data":[{"id":"acc1"}]}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"status":"success","errorType":"","error":"","warnings":null,"data":[{"jobid":"147973","id":"d8b28c2c-2011-d572-de94-8ec4facb4a2a","partition":"part1","QoS":"qos1","account":"acc3","group":"grp3","gid":"1003","user":"usr3","uid":"1003","submit":"2023-02-21T14:37:02","start":"2023-02-21T14:37:07","end":"2023-02-21T15:26:29","elapsed":"00:49:22","exitcode":"0:0","state":"CANCELLED by 1003","nnodes":"1","ncpus":"8","nodelist":"compute-0","nodelistexp":"compute-0","jobname":"test_script1","workdir":"/home/usr3"},{"jobid":"1481510","id":"b76ecf69-4d2f-076b-047d-2bcc8503b4cb","partition":"part1","QoS":"qos1","account":"acc3","group":"grp3","gid":"1003","user":"usr3","uid":"1003","submit":"2023-02-21T15:48:20","start":"2023-02-21T15:49:06","end":"2023-02-21T15:57:23","elapsed":"00:00:17","exitcode":"0:0","state":"CANCELLED by 1003","nnodes":"2","ncpus":"16","nodelist":"compute-[0-2]","nodelistexp":"compute-0|compute-1|compute-2","jobname":"test_script2","workdir":"/home/usr3"}]}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"status":"success","errorType":"","error":"","warnings":null,"data":[{"jobid":"1479763","id":"a04088e8-2699-2a9b-bc27-30282679ebb3","partition":"part1","QoS":"qos1","account":"acc1","group":"grp8","gid":"1008","user":"usr8","uid":"1008","submit":"2023-02-21T14:37:02","start":"2023-02-21T14:37:07","end":"2023-02-21T15:26:29","elapsed":"00:49:22","exitcode":"0:0","state":"CANCELLED by 1008","nnodes":"1","ncpus":"8","nodelist":"compute-0","nodelistexp":"compute-0","jobname":"test_script1","workdir":"/home/usr8"}]}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"status":"success","errorType":"","error":"","warnings":null,"data":[{"jobid":"11508","id":"d4956307-af17-870a-2fa0-38375105d257","partition":"part1","QoS":"qos1","account":"acc1","group":"grp15","gid":"1015","user":"usr15","uid":"1015","submit":"2023-02-21T15:48:20","start":"2023-02-21T15:49:06","end":"2023-02-21T15:57:23","elapsed":"00:08:17","exitcode":"0:0","state":"CANCELLED by 1015","nnodes":"2","ncpus":"16","nodelist":"compute-[0-2]","nodelistexp":"compute-0|compute-1|compute-2","jobname":"test_script2","workdir":"/home/usr15"},{"jobid":"81510","id":"938832b4-33b4-3303-b002-8150f737de7e","partition":"part1","QoS":"qos1","account":"acc1","group":"grp15","gid":"1015","user":"usr15","uid":"1015","submit":"2023-02-21T15:48:20","start":"2023-02-21T15:49:06","end":"2023-02-21T15:57:23","elapsed":"00:00:17","exitcode":"0:0","state":"CANCELLED by 1015","nnodes":"2","ncpus":"16","nodelist":"compute-[0-2]","nodelistexp":"compute-0|compute-1|compute-2","jobname":"test_script2","workdir":"/home/usr23"}]}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"status":"success","errorType":"","error":"","warnings":null,"data":[{"jobid":"1481508","id":"baee651d-df44-af2c-fa09-50f5523b5e19","partition":"part1","QoS":"qos1","account":"acc2","group":"grp2","gid":"1002","user":"usr2","uid":"1002","submit":"2023-02-21T15:48:20","start":"2023-02-21T15:49:06","end":"2023-02-21T15:57:23","elapsed":"00:08:17","exitcode":"0:0","state":"CANCELLED by 1002","nnodes":"2","ncpus":"16","nodelist":"compute-[0-2]","nodelistexp":"compute-0|compute-1|compute-2","jobname":"test_script2","workdir":"/home/usr2"}]}
14 changes: 10 additions & 4 deletions pkg/jobstats/fixtures/sacct
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
#!/bin/bash

echo """JobID|Partition|Account|Group|GID|User|UID|Submit|Start|End|Elapsed|ElapsedRaw|ExitCode|State|NNodes|NodeList|JobName|WorkDir
1479763|part1|acc1|grp|1000|usr|1000|2023-02-21T14:37:02|2023-02-21T14:37:07|2023-02-21T15:26:29|00:49:22|3000|0:0|CANCELLED by 1000|1|compute-0|test_script1|/home/usr
1481508|part1|acc1|grp|1000|usr|1000|2023-02-21T15:48:20|2023-02-21T15:49:06|2023-02-21T15:57:23|00:08:17|4500|0:0|CANCELLED by 1000|2|compute-[0-2]|test_script2|/home/usr
1481510|part1|acc1|grp|1000|usr|1000|2023-02-21T15:48:20|2023-02-21T15:49:06|2023-02-21T15:57:23|00:00:17|17|0:0|CANCELLED by 302137|2|compute-[0-2]|test_script2|/home/usr
echo """JobID|Partition|QoS|Account|Group|GID|User|UID|Submit|Start|End|Elapsed|ElapsedRaw|ExitCode|State|NNodes|Ncpus|NodeList|JobName|WorkDir
1479763|part1|qos1|acc1|grp1|1001|usr1|1001|2022-02-21T14:37:02|2022-02-21T14:37:07|2022-02-21T15:26:29|00:49:22|3000|0:0|CANCELLED by 1001|1|8|compute-0|test_script1|/home/usr1
1481508|part1|qos1|acc2|grp2|1002|usr2|1002|2023-02-21T15:48:20|2023-02-21T15:49:06|2023-02-21T15:57:23|00:08:17|4500|0:0|CANCELLED by 1002|2|16|compute-[0-2]|test_script2|/home/usr2
1481510|part1|qos1|acc3|grp3|1003|usr3|1003|2023-02-21T15:48:20|2023-02-21T15:49:06|2023-02-21T15:57:23|00:00:17|789|0:0|CANCELLED by 1003|2|16|compute-[0-2]|test_script2|/home/usr3
147973|part1|qos1|acc3|grp3|1003|usr3|1003|2023-02-21T14:37:02|2023-02-21T14:37:07|2023-02-21T15:26:29|00:49:22|3000|0:0|CANCELLED by 1003|1|8|compute-0|test_script1|/home/usr3
14508|part1|qos1|acc4|grp4|1004|usr4|1004|2023-02-21T15:48:20|2023-02-21T15:49:06|2023-02-21T15:57:23|00:08:17|4500|0:0|CANCELLED by 1004|2|16|compute-[0-2]|test_script2|/home/usr4
147973|part1|qos1|acc1|gr1|1001|usr1|1001|2023-12-21T15:48:20|2023-12-21T15:49:06|2023-12-21T15:57:23|00:00:17|567|0:0|CANCELLED by 1001|2|16|compute-[0-2]|test_script2|/home/usr1
1479763|part1|qos1|acc1|grp8|1008|usr8|1008|2023-02-21T14:37:02|2023-02-21T14:37:07|2023-02-21T15:26:29|00:49:22|3000|0:0|CANCELLED by 1008|1|8|compute-0|test_script1|/home/usr8
11508|part1|qos1|acc1|grp15|1015|usr15|1015|2023-02-21T15:48:20|2023-02-21T15:49:06|2023-02-21T15:57:23|00:08:17|4500|0:0|CANCELLED by 1015|2|16|compute-[0-2]|test_script2|/home/usr15
81510|part1|qos1|acc1|grp15|1015|usr15|1015|2023-02-21T15:48:20|2023-02-21T15:49:06|2023-02-21T15:57:23|00:00:17|3533|0:0|CANCELLED by 1015|2|16|compute-[0-2]|test_script2|/home/usr23
"""
Loading

0 comments on commit ced4ad3

Please sign in to comment.