Skip to content

Commit

Permalink
Merge pull request #8 from mahendrapaipuri/json_ds
Browse files Browse the repository at this point in the history
Add API server to serve job stats
  • Loading branch information
mahendrapaipuri authored Dec 11, 2023
2 parents b67948b + ba64c19 commit add5824
Show file tree
Hide file tree
Showing 41 changed files with 1,560 additions and 656 deletions.
16 changes: 13 additions & 3 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,31 @@ jobs:
- prometheus/setup_environment
- run: go mod download
- run: make
- run: sudo apt-get update && sudo apt-get install sqlite3 -y && sqlite3 --version
- run: CGO_BUILD=1 make
test-arm:
executor: arm
steps:
- checkout
- run: uname -a
- run: make test-e2e
- run: sudo apt-get update && sudo apt-get install sqlite3 -y && sqlite3 --version
- run: CGO_BUILD=1 make test-e2e
build:
machine:
image: ubuntu-2204:current
parallelism: 3
steps:
- prometheus/setup_environment
- run: docker run --privileged linuxkit/binfmt:af88a591f9cc896a52ce596b9cf7ca26a061ef97
# - run: docker run --privileged linuxkit/binfmt:af88a591f9cc896a52ce596b9cf7ca26a061ef97
- run: promu crossbuild -v --parallelism $CIRCLE_NODE_TOTAL --parallelism-thread $CIRCLE_NODE_INDEX
# - run: promu --config .promu-cgo.yml crossbuild -v --parallelism $CIRCLE_NODE_TOTAL --parallelism-thread $CIRCLE_NODE_INDEX
# Replace default CGO_BUILD before we run CGO. There is no way to inject this env
# variable into golang-builder container. So we artificially replace the default
# CGO_BUILD variable in Makefile and run CGO promu
# Do not use CGO_ENABLED env var as golang-builder image will set this var by default to zer
# and our override will not have any effect
- run: sed -i -e 's/CGO_BUILD ?= 0/CGO_BUILD ?= 1/g' Makefile
- run: promu --config .promu-cgo.yml crossbuild -v --parallelism $CIRCLE_NODE_TOTAL --parallelism-thread $CIRCLE_NODE_INDEX
- persist_to_workspace:
root: .
paths:
Expand All @@ -43,7 +53,7 @@ jobs:
destination: /build
workflows:
version: 2
batchjob_exporter:
batchjob_monitoring:
jobs:
- test:
filters:
Expand Down
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

# Test binary, built with `go test -c`
*.test
coverage.txt

# Output of the go coverage tool, specifically when used with LiteIDE
*.out
Expand Down Expand Up @@ -38,4 +39,7 @@ run.sh
# Ignore test files
*.db
*.prof
lasttimestamp
lasttimestamp

# Ignore promu related files
promu*
20 changes: 12 additions & 8 deletions .promu-cgo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ repository:
path: github.com/mahendrapaipuri/batchjob_monitoring
build:
binaries:
- name: batchjob_exporter
path: ./cmd/batchjob_exporter
- name: batchjob_stats
path: ./cmd/batchjob_stats
- name: batchjob_stats_db
path: ./cmd/batchjob_stats_db
- name: batchjob_stats_server
path: ./cmd/batchjob_stats_server
flags: -a -tags 'netgo osusergo static_build'
ldflags: |
-X github.com/prometheus/common/version.Version={{.Version}}
Expand All @@ -21,13 +21,17 @@ build:
tarball:
files:
- LICENSE
- NOTICE
- README.md
crossbuild:
platforms:
- netbsd/amd64
- netbsd/386
- linux/386
- linux/amd64
- linux/arm64
- linux/mips
- linux/mipsle
- linux/mips64
- linux/mips64le
- linux/mipsle
# - linux/ppc64
- linux/ppc64le
- linux/riscv64
# - linux/s390x
14 changes: 10 additions & 4 deletions .promu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@ build:
binaries:
- name: batchjob_exporter
path: ./cmd/batchjob_exporter
- name: batchjob_stats
path: ./cmd/batchjob_stats
flags: -a -tags 'netgo osusergo static_build'
ldflags: |
-X github.com/prometheus/common/version.Version={{.Version}}
Expand All @@ -20,9 +18,17 @@ build:
tarball:
files:
- LICENSE
- NOTICE
- README.md
crossbuild:
platforms:
- linux/amd64
- linux/386
- linux/amd64
- linux/arm64
- linux/mips
- linux/mips64
- linux/mips64le
- linux/mipsle
# - linux/ppc64
- linux/ppc64le
- linux/riscv64
# - linux/s390x
78 changes: 53 additions & 25 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ CGROUPS_MODE ?= $([ $(stat -fc %T /sys/fs/cgroup/) = "cgroup2fs" ] &&

STATICCHECK_IGNORE =

CGO_BUILD ?= 0

ifeq ($(GOHOSTOS), linux)
test-e2e := test-e2e
else
Expand All @@ -33,30 +35,46 @@ else
test-docker := test-docker
endif

# Use CGO for non-Linux builds.
ifeq ($(GOOS), linux)
# Use CGO for batchjob_stats_* and GO for batchjob_exporter.
ifeq ($(CGO_BUILD), 1)
PROMU_CONF ?= .promu-cgo.yml
pkgs := ./pkg/jobstats ./cmd/batchjob_stats_db ./cmd/batchjob_stats_server
else
PROMU_CONF ?= .promu.yml
pkgs := ./pkg/collector ./pkg/emissions ./cmd/batchjob_exporter
endif

ifeq ($(GOHOSTOS), linux)
test-e2e := test-e2e
else
ifndef GOOS
ifeq ($(GOHOSTOS), linux)
PROMU_CONF ?= .promu.yml
else
PROMU_CONF ?= .promu-cgo.yml
endif
else
# Do not use CGO for openbsd/amd64 builds
ifeq ($(GOOS), openbsd)
ifeq ($(GOARCH), amd64)
PROMU_CONF ?= .promu.yml
else
PROMU_CONF ?= .promu-cgo.yml
endif
else
PROMU_CONF ?= .promu-cgo.yml
endif
endif
test-e2e := skip-test-e2e
endif

# We are using SQLite3 which needs CGO and thus, this logic
# is not relevant for us anymore
# ifeq ($(GOOS), linux)
# PROMU_CONF ?= .promu.yml
# else
# ifndef GOOS
# ifeq ($(GOHOSTOS), linux)
# PROMU_CONF ?= .promu.yml
# else
# PROMU_CONF ?= .promu-cgo.yml
# endif
# else
# # Do not use CGO for openbsd/amd64 builds
# ifeq ($(GOOS), openbsd)
# ifeq ($(GOARCH), amd64)
# PROMU_CONF ?= .promu.yml
# else
# PROMU_CONF ?= .promu-cgo.yml
# endif
# else
# PROMU_CONF ?= .promu-cgo.yml
# endif
# endif
# endif

PROMU := $(FIRST_GOPATH)/bin/promu --config $(PROMU_CONF)

e2e-cgroupsv2-out = pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt
Expand All @@ -71,10 +89,12 @@ endif
# 64bit -> 32bit mapping for cross-checking. At least for amd64/386, the 64bit CPU can execute 32bit code but not the other way around, so we don't support cross-testing upwards.
cross-test = skip-test-32bit
define goarch_pair
ifeq ($$(GOHOSTOS),linux)
ifeq ($$(GOHOSTARCH),$1)
GOARCH_CROSS = $2
cross-test = test-32bit
ifeq ($$(GOHOSTOS), linux)
ifndef CGO_BUILD
ifeq ($$(GOHOSTARCH), $1)
GOARCH_CROSS = $2
cross-test = test-32bit
endif
endif
endif
endef
Expand Down Expand Up @@ -110,10 +130,18 @@ update_fixtures:
rm -vf pkg/collector/fixtures/sys/.unpacked
./scripts/ttar -C pkg/collector/fixtures -c -f pkg/collector/fixtures/sys.ttar sys

ifeq ($(CGO_BUILD), 0)
.PHONY: test-e2e
test-e2e: build pkg/collector/fixtures/sys/.unpacked
@echo ">> running end-to-end tests"
./scripts/e2e-test.sh
./scripts/e2e-test.sh -p exporter
else
.PHONY: test-e2e
test-e2e: build pkg/collector/fixtures/sys/.unpacked
@echo ">> running end-to-end tests"
./scripts/e2e-test.sh -p stats_db
./scripts/e2e-test.sh -p stats_server
endif

.PHONY: skip-test-e2e
skip-test-e2e:
Expand Down
2 changes: 1 addition & 1 deletion cmd/batchjob_exporter/batchjob_exporter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import (
)

var (
binary, _ = filepath.Abs("batchjob_exporter")
binary, _ = filepath.Abs("bin/batchjob_exporter")
)

const (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,20 @@
package main

import (
"fmt"
_ "net/http/pprof"
"os"
"os/signal"
"path/filepath"
"runtime"
"syscall"
"time"

"github.com/alecthomas/kingpin/v2"
"github.com/go-kit/log/level"
_ "github.com/mattn/go-sqlite3"
"github.com/prometheus/common/promlog"
"github.com/prometheus/common/promlog/flag"
"github.com/prometheus/common/version"
_ "modernc.org/sqlite"

"github.com/mahendrapaipuri/batchjob_monitoring/pkg/jobstats"
)
Expand All @@ -25,40 +28,47 @@ var (

func main() {
var (
batchScheduler = kingpin.Flag(
batchScheduler = jobstats.JobstatDBApp.Flag(
"batch.scheduler",
"Name of batch scheduler (eg slurm, lsf, pbs).",
).Default("slurm").String()
dataPath = kingpin.Flag(
dataPath = jobstats.JobstatDBApp.Flag(
"path.data",
"Absolute path to a directory where job data is placed. SQLite DB that contains jobs stats will be saved to this directory.",
).Default("/var/lib/jobstats").String()
jobstatDBFile = kingpin.Flag(
"path.db.name",
jobstatDBFile = jobstats.JobstatDBApp.Flag(
"db.name",
"Name of the SQLite DB file that contains jobs stats.",
).Default("jobstats.db").String()
jobstatDBTable = kingpin.Flag(
"path.db.table.name",
jobstatDBTable = jobstats.JobstatDBApp.Flag(
"db.table.name",
"Name of the table in SQLite DB file that contains jobs stats.",
).Default("jobs").String()
retentionPeriod = kingpin.Flag(
updateInterval = jobstats.JobstatDBApp.Flag(
"db.update.interval",
"Time period in seconds at which DB will be updated with jobs stats.",
).Default("1800").Int()
retentionPeriod = jobstats.JobstatDBApp.Flag(
"data.retention.period",
"Period in days for which job stats data will be retained.",
).Default("365").Int()
maxProcs = kingpin.Flag(
maxProcs = jobstats.JobstatDBApp.Flag(
"runtime.gomaxprocs", "The target number of CPUs Go will run on (GOMAXPROCS)",
).Envar("GOMAXPROCS").Default("1").Int()
)

promlogConfig := &promlog.Config{}
flag.AddFlags(kingpin.CommandLine, promlogConfig)
kingpin.Version(version.Print("batchjobstat"))
kingpin.CommandLine.UsageWriter(os.Stdout)
kingpin.HelpFlag.Short('h')
kingpin.Parse()
flag.AddFlags(jobstats.JobstatDBApp, promlogConfig)
jobstats.JobstatDBApp.Version(version.Print(jobstats.JobstatDBAppName))
jobstats.JobstatDBApp.UsageWriter(os.Stdout)
jobstats.JobstatDBApp.HelpFlag.Short('h')
_, err := jobstats.JobstatDBApp.Parse(os.Args[1:])
if err != nil {
panic(fmt.Sprintf("Failed to parse %s command", jobstats.JobstatDBAppName))
}
logger := promlog.New(promlogConfig)

level.Info(logger).Log("msg", "Running batchjob_jobstat", "version", version.Info())
level.Info(logger).Log("msg", fmt.Sprintf("Running %s", jobstats.JobstatDBAppName), "version", version.Info())
level.Info(logger).Log("msg", "Build context", "build_context", version.BuildContext())

runtime.GOMAXPROCS(*maxProcs)
Expand All @@ -74,7 +84,7 @@ func main() {
// return
// }
// pprof.StartCPUProfile(f)
jobCollector := jobstats.NewJobStats(
jobCollector := jobstats.NewJobStatsDB(
logger,
*batchScheduler,
jobstatDBPath,
Expand All @@ -83,9 +93,30 @@ func main() {
jobsLastTimeStampFile,
vacuumLastTimeStampFile,
)
err := jobCollector.GetJobStats()
if err != nil {
level.Error(logger).Log("msg", "Failed to get job stats", "err", err)

// Create a channel to propagate signals
interrupt := make(chan os.Signal, 1)
signal.Notify(interrupt, os.Interrupt, syscall.SIGINT, syscall.SIGTERM)

// Start a ticker
ticker := time.NewTicker(time.Second * time.Duration(*updateInterval))
defer ticker.Stop()

loop:
for {
level.Info(logger).Log("msg", "Updating JobStats DB")
err := jobCollector.GetJobStats()
if err != nil {
level.Error(logger).Log("msg", "Failed to get job stats", "err", err)
}

select {
case <-ticker.C:
continue
case <-interrupt:
level.Info(logger).Log("msg", "Received Interrupt. Stopping DB update")
break loop
}
}
// defer pprof.StopCPUProfile()
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ import (
)

var (
binary, _ = filepath.Abs("batchjob_stats")
binary, _ = filepath.Abs("bin/batchjob_stats_db")
)

func TestBatchjobStatExecutable(t *testing.T) {
if _, err := os.Stat(binary); err != nil {
t.Skipf("batchjob_stats binary not available, try to run `make build` first: %s", err)
t.Skipf("batchjob_stats_db binary not available, try to run `make build` first: %s", err)
}
tmpDir := t.TempDir()

Expand Down
Loading

0 comments on commit add5824

Please sign in to comment.