Skip to content

Commit

Permalink
Merge pull request #211 from EURODEO/logging
Browse files Browse the repository at this point in the history
Add Monitoring to test local performance improvements
  • Loading branch information
Jeffrey-Vervoort-KNMI authored Nov 6, 2024
2 parents d8f3f44 + 6d45f48 commit 63e2a3e
Show file tree
Hide file tree
Showing 19 changed files with 4,972 additions and 21 deletions.
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,18 @@ ci/scripts/install-just.sh

The Justfile is a simple text file that contains a list of tasks. Each task is a shell command. For example:

To run build and run the services locally:
To run build and run the services locally without the monitoring:

```bash
just up test
```

To run build and run the services locally with the monitoring:

```bash
just local test
```

To run everything including client and do a cleanup of the database afterward:

```bash
Expand Down
6 changes: 6 additions & 0 deletions datastore/database/extra.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# postgres-exporter
shared_preload_libraries = 'pg_cron,pg_stat_statements'
track_activity_query_size = 2048
pg_stat_statements.track = 'all'
pg_stat_statements.max = 10000
pg_stat_statements.save = 'on'
1 change: 1 addition & 0 deletions datastore/datastore/.dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
!common
!dsimpl
!storagebackend
!metrics
27 changes: 20 additions & 7 deletions datastore/datastore/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,27 @@ go 1.21
require google.golang.org/grpc v1.64.0

require (
github.com/cridenour/go-postgis v1.0.0
google.golang.org/protobuf v1.34.1
github.com/cridenour/go-postgis v1.0.0
github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1
github.com/prometheus/client_golang v1.20.4
google.golang.org/protobuf v1.34.2
)

require (
github.com/lib/pq v1.10.9
golang.org/x/net v0.26.0 // indirect
golang.org/x/sys v0.21.0 // indirect
golang.org/x/text v0.16.0 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240604185151-ef581f913117 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0 // indirect
github.com/klauspost/compress v1.17.9 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.55.0 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
)

require (
github.com/lib/pq v1.10.9
golang.org/x/net v0.26.0 // indirect
golang.org/x/sys v0.22.0 // indirect
golang.org/x/text v0.16.0 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240604185151-ef581f913117 // indirect
)
54 changes: 52 additions & 2 deletions datastore/datastore/main/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,23 @@ import (
"datastore/storagebackend"
"datastore/storagebackend/postgresql"
"fmt"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
"github.com/prometheus/client_golang/prometheus/promhttp"
"log"
"net"
"time"

// gRPC
"google.golang.org/grpc"
"google.golang.org/grpc/health"
"google.golang.org/grpc/health/grpc_health_v1"
"google.golang.org/grpc/peer"

// Monitoring
"datastore/metrics"
grpcprometheus "github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus"

_ "expvar"
"net/http"
_ "net/http/pprof"
Expand Down Expand Up @@ -52,8 +60,34 @@ func main() {
return resp, err
}

// create gRPC server
server := grpc.NewServer(grpc.UnaryInterceptor(reqTimeLogger))
grpcMetrics := grpcprometheus.NewServerMetrics(
grpcprometheus.WithServerHandlingTimeHistogram(
grpcprometheus.WithHistogramBuckets([]float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120}),
),
)
reg := prometheus.NewRegistry()
reg.MustRegister(
grpcMetrics,
promservermetrics.InFlightRequests,
promservermetrics.UptimeCounter,
promservermetrics.ResponseSizeSummary,
collectors.NewGoCollector(),
collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}),
)

go promservermetrics.TrackUptime()

// create gRPC server with middleware
server := grpc.NewServer(
grpc.ChainUnaryInterceptor(
reqTimeLogger,
promservermetrics.InFlightRequestInterceptor,
promservermetrics.ResponseSizeUnaryInterceptor,
grpcMetrics.UnaryServerInterceptor(),
),
)

grpcMetrics.InitializeMetrics(server)
grpc_health_v1.RegisterHealthServer(server, health.NewServer())

// create storage backend
Expand Down Expand Up @@ -81,6 +115,22 @@ func main() {
http.ListenAndServe("0.0.0.0:6060", nil)
}()

// serve go metrics for monitoring
go func() {
httpSrv := &http.Server{Addr: "0.0.0.0:8081"}
m := http.NewServeMux()
// Create HTTP handler for Prometheus metrics.
m.Handle("/metrics", promhttp.HandlerFor(
reg,
promhttp.HandlerOpts{
EnableOpenMetrics: true,
},
))
httpSrv.Handler = m
log.Println("Starting HTTP server for Prometheus metrics on :8081")
log.Fatal(httpSrv.ListenAndServe())
}()

// serve incoming requests
log.Printf("starting server\n")
if err := server.Serve(listener); err != nil {
Expand Down
67 changes: 67 additions & 0 deletions datastore/datastore/metrics/promservermetrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
package promservermetrics

import (
"context"
"github.com/prometheus/client_golang/prometheus"
"google.golang.org/grpc"
"sync"
"time"
)

var (
UptimeCounter = prometheus.NewCounter(prometheus.CounterOpts{
Name: "grpc_server_uptime_seconds",
Help: "Total uptime of the gRPC server in seconds",
})

InFlightRequests = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "grpc_in_flight_requests",
Help: "Current number of in-flight gRPC requests",
})

ResponseSizeSummary = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Name: "grpc_response_size_summary_bytes",
Help: "Summary of response sizes in bytes for each gRPC method, with mean, min, and max",
Objectives: map[float64]float64{0.0: 0.001, 1.0: 0.001}, // Track min (0.0 quantile) and max (1.0 quantile)
},
[]string{"method"},
)

responseSizeMu sync.Mutex
responseSizeSum = make(map[string]float64)
responseSizeCount = make(map[string]float64)
)

func TrackUptime() {
// Increment the uptime every second
for {
UptimeCounter.Inc()
time.Sleep(1 * time.Second)
}
}

func InFlightRequestInterceptor(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) {
InFlightRequests.Inc() // Increment at the start of the request
defer InFlightRequests.Dec() // Decrement at the end of the request
return handler(ctx, req)
}

func ResponseSizeUnaryInterceptor(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (resp interface{}, err error) {
resp, err = handler(ctx, req)

if resp != nil {
responseSize := float64(len(resp.(interface{ String() string }).String()))

ResponseSizeSummary.WithLabelValues(info.FullMethod).Observe(responseSize)

// Used a mutex to synchronise the access for the responseSizeSum and responseSizeCount.
// To prevent race conditions and multiple goroutines accessing the variables at the same time.
responseSizeMu.Lock()
responseSizeSum[info.FullMethod] += responseSize
responseSizeCount[info.FullMethod]++
responseSizeMu.Unlock()
}

return resp, err
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
DROP EXTENSION pg_stat_statements;
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
DROP EXTENSION IF EXISTS pg_stat_statements;
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
52 changes: 48 additions & 4 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
version: "3.8"
name: datastore

services:
Expand All @@ -10,8 +9,10 @@ services:
volumes:
# - ts-data:/home/postgres/pgdata/data # for timescale image
- ts-data:/var/lib/postgresql # for postgres image
- ./datastore/database/healthcheck_postgis_uptime.sh:/healthcheck_postgis_uptime.sh # for the healthcheck
- ./datastore/database/extra.conf:/etc/conf_settings/extra.conf:ro # Extra Postgres configuration
- ./datastore/database/healthcheck_postgis_uptime.sh:/healthcheck_postgis_uptime.sh:ro # for the healthcheck
environment:
- EXTRA_CONF_DIR=/etc/conf_settings
- POSTGRES_USER=postgres
- POSTGRES_PASSWORD=mysecretpassword
- POSTGRES_DB=data
Expand All @@ -26,7 +27,7 @@ services:
]
interval: 5s
timeout: 1s
retries: 3
retries: 30
start_period: 30s # Failures in 30 seconds do not mark container as unhealthy

migrate:
Expand All @@ -46,6 +47,7 @@ services:
ports:
- "50050:50050"
- "6060:6060" # for flame graphing
- "8081:8081"
environment:
- PGHOST=db
- PGPORT=5432
Expand Down Expand Up @@ -156,10 +158,52 @@ services:
- DSHOST=store
- DSPORT=50050
volumes:
- ./datastore/load-test/output:/load-test/output
- ./datastore/load-test/output:/load-test/output:rw
depends_on:
store:
condition: service_healthy

prometheus:
profiles: ["monitoring"]
image: prom/prometheus
ports:
- "9090:9090"
volumes:
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- prometheus-data:/prometheus

prometheus-postgres-exporter:
profiles: ["monitoring"]
image: quay.io/prometheuscommunity/postgres-exporter
environment:
- DATA_SOURCE_URI=db:5432/data
- DATA_SOURCE_USER=postgres
- DATA_SOURCE_PASS=mysecretpassword
ports:
- "9187:9187"
volumes:
- ./prometheus/postgres_exporter.yml:/postgres_exporter.yml:ro
depends_on:
db:
condition: service_healthy
command: ["--collector.stat_statements", "--collector.stat_user_tables", "--collector.stat_activity_autovacuum"]

grafana:
profiles: ["monitoring"]
image: grafana/grafana-oss:11.2.0
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_USER=admin
- GF_SECURITY_ADMIN_PASSWORD=mysecretpassword
volumes:
- grafana-storage:/var/lib/grafana
- ./grafana/provisioning:/etc/grafana/provisioning:rw
- ./grafana/dashboards:/var/lib/grafana/dashboards:rw
depends_on:
- prometheus

volumes:
ts-data:
prometheus-data:
grafana-storage:
Loading

1 comment on commit 63e2a3e

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

API Unit Test Coverage Report
FileStmtsMissCoverMissing
\_\_init\_\_.py00100% 
datastore_pb2.py614821%34–81
datastore_pb2_grpc.py542750%15–16, 19, 65–80, 121–123, 128–130, 135–137, 142–144, 148–173, 219, 246, 273, 300
export_metrics.py100100% 
grpc_getter.py201145%15–19, 23–26, 30–32, 36–38
locustfile.py15150%1–31
main.py43784%45, 50, 60, 70–71, 81–82
metadata_endpoints.py653152%45–54, 58, 85, 100–219, 223
response_classes.py50100% 
utilities.py1744674%20, 38, 45, 67–70, 78–89, 94–101, 121, 125, 127, 155, 161, 179, 193–194, 198, 214–218, 222–228, 232–234, 264, 268, 290, 295
custom_geo_json
   edr_feature_collection.py60100% 
formatters
   \_\_init\_\_.py110100% 
   covjson.py59198%91
   geojson.py21290%27, 52
openapi
   custom_dimension_examples.py40100% 
   edr_query_parameter_descriptions.py110100% 
   openapi_examples.py130100% 
routers
   \_\_init\_\_.py00100% 
   edr.py101496%348–349, 438–439
   feature.py471960%99–132, 148–153, 159–181
TOTAL72021171% 

API Unit Test Coverage Summary

Tests Skipped Failures Errors Time
30 0 💤 0 ❌ 0 🔥 1.826s ⏱️

Please sign in to comment.