Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add API server to serve job stats #8

Merged
merged 21 commits into from
Dec 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
a939294
build: Update dependencies
mahendrapaipuri Dec 4, 2023
115c37b
fix: Correct relative path to binary
mahendrapaipuri Dec 11, 2023
d868116
refactor: Move utility functions to internal pkg
mahendrapaipuri Dec 11, 2023
c099b2e
refactor: Move emissions to separate pkg
mahendrapaipuri Dec 11, 2023
3fb57c7
refactor: Changes based on new organization
mahendrapaipuri Dec 11, 2023
8a9a0e8
test: Update test fixtures for collector pkg
mahendrapaipuri Dec 11, 2023
3956e5d
refactor: Rename jobstats DB pkg
mahendrapaipuri Dec 11, 2023
26f85d7
refactor: Move utilities to pkg helpers
mahendrapaipuri Dec 11, 2023
485502d
refactor: Update jobstats DB pkg
mahendrapaipuri Dec 11, 2023
e8cd7c2
refactor: Rename jobstats_db cmd
mahendrapaipuri Dec 11, 2023
0c41212
feat: Add jobstats API server
mahendrapaipuri Dec 11, 2023
bfad411
test: Update test fixtures and add new fixtures
mahendrapaipuri Dec 11, 2023
1261f12
build: use CGO_ENABLED env var to control builds
mahendrapaipuri Dec 11, 2023
8323734
ci: Update CI configs with new repo pkgs
mahendrapaipuri Dec 11, 2023
2a32349
build: Add linux archs in promu config
mahendrapaipuri Dec 11, 2023
e3375b0
build: Control CGO_ENABLED variable
mahendrapaipuri Dec 11, 2023
1d9d3a5
test: Update test fixture
mahendrapaipuri Dec 11, 2023
3e351cf
ci: Debug CI build job
mahendrapaipuri Dec 11, 2023
828566d
build: Replace CGO_ENABLED by CGO_BUILD
mahendrapaipuri Dec 11, 2023
848b014
build: Use only supported cross-compiles
mahendrapaipuri Dec 11, 2023
ba64c19
ci: Install sqlite3 before tests
mahendrapaipuri Dec 11, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,31 @@ jobs:
- prometheus/setup_environment
- run: go mod download
- run: make
- run: sudo apt-get update && sudo apt-get install sqlite3 -y && sqlite3 --version
- run: CGO_BUILD=1 make
test-arm:
executor: arm
steps:
- checkout
- run: uname -a
- run: make test-e2e
- run: sudo apt-get update && sudo apt-get install sqlite3 -y && sqlite3 --version
- run: CGO_BUILD=1 make test-e2e
build:
machine:
image: ubuntu-2204:current
parallelism: 3
steps:
- prometheus/setup_environment
- run: docker run --privileged linuxkit/binfmt:af88a591f9cc896a52ce596b9cf7ca26a061ef97
# - run: docker run --privileged linuxkit/binfmt:af88a591f9cc896a52ce596b9cf7ca26a061ef97
- run: promu crossbuild -v --parallelism $CIRCLE_NODE_TOTAL --parallelism-thread $CIRCLE_NODE_INDEX
# - run: promu --config .promu-cgo.yml crossbuild -v --parallelism $CIRCLE_NODE_TOTAL --parallelism-thread $CIRCLE_NODE_INDEX
# Replace default CGO_BUILD before we run CGO. There is no way to inject this env
# variable into golang-builder container. So we artificially replace the default
# CGO_BUILD variable in Makefile and run CGO promu
# Do not use CGO_ENABLED env var as golang-builder image will set this var by default to zer
# and our override will not have any effect
- run: sed -i -e 's/CGO_BUILD ?= 0/CGO_BUILD ?= 1/g' Makefile
- run: promu --config .promu-cgo.yml crossbuild -v --parallelism $CIRCLE_NODE_TOTAL --parallelism-thread $CIRCLE_NODE_INDEX
- persist_to_workspace:
root: .
paths:
Expand All @@ -43,7 +53,7 @@ jobs:
destination: /build
workflows:
version: 2
batchjob_exporter:
batchjob_monitoring:
jobs:
- test:
filters:
Expand Down
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

# Test binary, built with `go test -c`
*.test
coverage.txt

# Output of the go coverage tool, specifically when used with LiteIDE
*.out
Expand Down Expand Up @@ -38,4 +39,7 @@ run.sh
# Ignore test files
*.db
*.prof
lasttimestamp
lasttimestamp

# Ignore promu related files
promu*
20 changes: 12 additions & 8 deletions .promu-cgo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ repository:
path: github.com/mahendrapaipuri/batchjob_monitoring
build:
binaries:
- name: batchjob_exporter
path: ./cmd/batchjob_exporter
- name: batchjob_stats
path: ./cmd/batchjob_stats
- name: batchjob_stats_db
path: ./cmd/batchjob_stats_db
- name: batchjob_stats_server
path: ./cmd/batchjob_stats_server
flags: -a -tags 'netgo osusergo static_build'
ldflags: |
-X github.com/prometheus/common/version.Version={{.Version}}
Expand All @@ -21,13 +21,17 @@ build:
tarball:
files:
- LICENSE
- NOTICE
- README.md
crossbuild:
platforms:
- netbsd/amd64
- netbsd/386
- linux/386
- linux/amd64
- linux/arm64
- linux/mips
- linux/mipsle
- linux/mips64
- linux/mips64le
- linux/mipsle
# - linux/ppc64
- linux/ppc64le
- linux/riscv64
# - linux/s390x
14 changes: 10 additions & 4 deletions .promu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@ build:
binaries:
- name: batchjob_exporter
path: ./cmd/batchjob_exporter
- name: batchjob_stats
path: ./cmd/batchjob_stats
flags: -a -tags 'netgo osusergo static_build'
ldflags: |
-X github.com/prometheus/common/version.Version={{.Version}}
Expand All @@ -20,9 +18,17 @@ build:
tarball:
files:
- LICENSE
- NOTICE
- README.md
crossbuild:
platforms:
- linux/amd64
- linux/386
- linux/amd64
- linux/arm64
- linux/mips
- linux/mips64
- linux/mips64le
- linux/mipsle
# - linux/ppc64
- linux/ppc64le
- linux/riscv64
# - linux/s390x
78 changes: 53 additions & 25 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ CGROUPS_MODE ?= $([ $(stat -fc %T /sys/fs/cgroup/) = "cgroup2fs" ] &&

STATICCHECK_IGNORE =

CGO_BUILD ?= 0

ifeq ($(GOHOSTOS), linux)
test-e2e := test-e2e
else
Expand All @@ -33,30 +35,46 @@ else
test-docker := test-docker
endif

# Use CGO for non-Linux builds.
ifeq ($(GOOS), linux)
# Use CGO for batchjob_stats_* and GO for batchjob_exporter.
ifeq ($(CGO_BUILD), 1)
PROMU_CONF ?= .promu-cgo.yml
pkgs := ./pkg/jobstats ./cmd/batchjob_stats_db ./cmd/batchjob_stats_server
else
PROMU_CONF ?= .promu.yml
pkgs := ./pkg/collector ./pkg/emissions ./cmd/batchjob_exporter
endif

ifeq ($(GOHOSTOS), linux)
test-e2e := test-e2e
else
ifndef GOOS
ifeq ($(GOHOSTOS), linux)
PROMU_CONF ?= .promu.yml
else
PROMU_CONF ?= .promu-cgo.yml
endif
else
# Do not use CGO for openbsd/amd64 builds
ifeq ($(GOOS), openbsd)
ifeq ($(GOARCH), amd64)
PROMU_CONF ?= .promu.yml
else
PROMU_CONF ?= .promu-cgo.yml
endif
else
PROMU_CONF ?= .promu-cgo.yml
endif
endif
test-e2e := skip-test-e2e
endif

# We are using SQLite3 which needs CGO and thus, this logic
# is not relevant for us anymore
# ifeq ($(GOOS), linux)
# PROMU_CONF ?= .promu.yml
# else
# ifndef GOOS
# ifeq ($(GOHOSTOS), linux)
# PROMU_CONF ?= .promu.yml
# else
# PROMU_CONF ?= .promu-cgo.yml
# endif
# else
# # Do not use CGO for openbsd/amd64 builds
# ifeq ($(GOOS), openbsd)
# ifeq ($(GOARCH), amd64)
# PROMU_CONF ?= .promu.yml
# else
# PROMU_CONF ?= .promu-cgo.yml
# endif
# else
# PROMU_CONF ?= .promu-cgo.yml
# endif
# endif
# endif

PROMU := $(FIRST_GOPATH)/bin/promu --config $(PROMU_CONF)

e2e-cgroupsv2-out = pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt
Expand All @@ -71,10 +89,12 @@ endif
# 64bit -> 32bit mapping for cross-checking. At least for amd64/386, the 64bit CPU can execute 32bit code but not the other way around, so we don't support cross-testing upwards.
cross-test = skip-test-32bit
define goarch_pair
ifeq ($$(GOHOSTOS),linux)
ifeq ($$(GOHOSTARCH),$1)
GOARCH_CROSS = $2
cross-test = test-32bit
ifeq ($$(GOHOSTOS), linux)
ifndef CGO_BUILD
ifeq ($$(GOHOSTARCH), $1)
GOARCH_CROSS = $2
cross-test = test-32bit
endif
endif
endif
endef
Expand Down Expand Up @@ -110,10 +130,18 @@ update_fixtures:
rm -vf pkg/collector/fixtures/sys/.unpacked
./scripts/ttar -C pkg/collector/fixtures -c -f pkg/collector/fixtures/sys.ttar sys

ifeq ($(CGO_BUILD), 0)
.PHONY: test-e2e
test-e2e: build pkg/collector/fixtures/sys/.unpacked
@echo ">> running end-to-end tests"
./scripts/e2e-test.sh
./scripts/e2e-test.sh -p exporter
else
.PHONY: test-e2e
test-e2e: build pkg/collector/fixtures/sys/.unpacked
@echo ">> running end-to-end tests"
./scripts/e2e-test.sh -p stats_db
./scripts/e2e-test.sh -p stats_server
endif

.PHONY: skip-test-e2e
skip-test-e2e:
Expand Down
2 changes: 1 addition & 1 deletion cmd/batchjob_exporter/batchjob_exporter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import (
)

var (
binary, _ = filepath.Abs("batchjob_exporter")
binary, _ = filepath.Abs("bin/batchjob_exporter")
)

const (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,20 @@
package main

import (
"fmt"
_ "net/http/pprof"
"os"
"os/signal"
"path/filepath"
"runtime"
"syscall"
"time"

"github.com/alecthomas/kingpin/v2"
"github.com/go-kit/log/level"
_ "github.com/mattn/go-sqlite3"
"github.com/prometheus/common/promlog"
"github.com/prometheus/common/promlog/flag"
"github.com/prometheus/common/version"
_ "modernc.org/sqlite"

"github.com/mahendrapaipuri/batchjob_monitoring/pkg/jobstats"
)
Expand All @@ -25,40 +28,47 @@ var (

func main() {
var (
batchScheduler = kingpin.Flag(
batchScheduler = jobstats.JobstatDBApp.Flag(
"batch.scheduler",
"Name of batch scheduler (eg slurm, lsf, pbs).",
).Default("slurm").String()
dataPath = kingpin.Flag(
dataPath = jobstats.JobstatDBApp.Flag(
"path.data",
"Absolute path to a directory where job data is placed. SQLite DB that contains jobs stats will be saved to this directory.",
).Default("/var/lib/jobstats").String()
jobstatDBFile = kingpin.Flag(
"path.db.name",
jobstatDBFile = jobstats.JobstatDBApp.Flag(
"db.name",
"Name of the SQLite DB file that contains jobs stats.",
).Default("jobstats.db").String()
jobstatDBTable = kingpin.Flag(
"path.db.table.name",
jobstatDBTable = jobstats.JobstatDBApp.Flag(
"db.table.name",
"Name of the table in SQLite DB file that contains jobs stats.",
).Default("jobs").String()
retentionPeriod = kingpin.Flag(
updateInterval = jobstats.JobstatDBApp.Flag(
"db.update.interval",
"Time period in seconds at which DB will be updated with jobs stats.",
).Default("1800").Int()
retentionPeriod = jobstats.JobstatDBApp.Flag(
"data.retention.period",
"Period in days for which job stats data will be retained.",
).Default("365").Int()
maxProcs = kingpin.Flag(
maxProcs = jobstats.JobstatDBApp.Flag(
"runtime.gomaxprocs", "The target number of CPUs Go will run on (GOMAXPROCS)",
).Envar("GOMAXPROCS").Default("1").Int()
)

promlogConfig := &promlog.Config{}
flag.AddFlags(kingpin.CommandLine, promlogConfig)
kingpin.Version(version.Print("batchjobstat"))
kingpin.CommandLine.UsageWriter(os.Stdout)
kingpin.HelpFlag.Short('h')
kingpin.Parse()
flag.AddFlags(jobstats.JobstatDBApp, promlogConfig)
jobstats.JobstatDBApp.Version(version.Print(jobstats.JobstatDBAppName))
jobstats.JobstatDBApp.UsageWriter(os.Stdout)
jobstats.JobstatDBApp.HelpFlag.Short('h')
_, err := jobstats.JobstatDBApp.Parse(os.Args[1:])
if err != nil {
panic(fmt.Sprintf("Failed to parse %s command", jobstats.JobstatDBAppName))
}
logger := promlog.New(promlogConfig)

level.Info(logger).Log("msg", "Running batchjob_jobstat", "version", version.Info())
level.Info(logger).Log("msg", fmt.Sprintf("Running %s", jobstats.JobstatDBAppName), "version", version.Info())
level.Info(logger).Log("msg", "Build context", "build_context", version.BuildContext())

runtime.GOMAXPROCS(*maxProcs)
Expand All @@ -74,7 +84,7 @@ func main() {
// return
// }
// pprof.StartCPUProfile(f)
jobCollector := jobstats.NewJobStats(
jobCollector := jobstats.NewJobStatsDB(
logger,
*batchScheduler,
jobstatDBPath,
Expand All @@ -83,9 +93,30 @@ func main() {
jobsLastTimeStampFile,
vacuumLastTimeStampFile,
)
err := jobCollector.GetJobStats()
if err != nil {
level.Error(logger).Log("msg", "Failed to get job stats", "err", err)

// Create a channel to propagate signals
interrupt := make(chan os.Signal, 1)
signal.Notify(interrupt, os.Interrupt, syscall.SIGINT, syscall.SIGTERM)

// Start a ticker
ticker := time.NewTicker(time.Second * time.Duration(*updateInterval))
defer ticker.Stop()

loop:
for {
level.Info(logger).Log("msg", "Updating JobStats DB")
err := jobCollector.GetJobStats()
if err != nil {
level.Error(logger).Log("msg", "Failed to get job stats", "err", err)
}

select {
case <-ticker.C:
continue
case <-interrupt:
level.Info(logger).Log("msg", "Received Interrupt. Stopping DB update")
break loop
}
}
// defer pprof.StopCPUProfile()
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ import (
)

var (
binary, _ = filepath.Abs("batchjob_stats")
binary, _ = filepath.Abs("bin/batchjob_stats_db")
)

func TestBatchjobStatExecutable(t *testing.T) {
if _, err := os.Stat(binary); err != nil {
t.Skipf("batchjob_stats binary not available, try to run `make build` first: %s", err)
t.Skipf("batchjob_stats_db binary not available, try to run `make build` first: %s", err)
}
tmpDir := t.TempDir()

Expand Down
Loading