Skip to content

Commit

Permalink
Merge development for v4.4-RC6 (#209)
Browse files Browse the repository at this point in the history
* Trim whitespace (#208)

* trim whitespace in: crunchy-alert-rules-node.yml.example

* trim whitespace in: crunchy-alert-rules-etcd.yml.example

* trim whitespace in: crunchy-alert-rules-blackbox.yml.example

* trim whitespace in: crunchy-alert-rules-pg.yml.containers.example

* trim whitespace in: crunchy-alert-rules-pg.yml.example

* Add new files for PG13 support (#201)

* Add metric for monitoring for blocked queries (#207)

* Add pg_stat_statements metrics & dashboard (#200)

Co-authored-by: Joseph Mckulka <[email protected]>
Co-authored-by: Keith Fiske <[email protected]>
  • Loading branch information
3 people authored Sep 22, 2020
1 parent 328f831 commit 41ef700
Show file tree
Hide file tree
Showing 35 changed files with 1,764 additions and 528 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ start() {

echo -n $"Starting postgres_exporter: "
set -o pipefail
cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')
cat $QUERY_FILE_LIST | sed -e "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" -e "s/#PG_STAT_STATEMENTS_LIMIT#/${PG_STAT_STATEMENTS_LIMIT}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')
echo $DATA_SOURCE_NAME
daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT
RETVAL=$?
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ start() {

echo -n $"Starting postgres_exporter: "
set -o pipefail
cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')
cat $QUERY_FILE_LIST | sed -e "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" -e "s/#PG_STAT_STATEMENTS_LIMIT#/${PG_STAT_STATEMENTS_LIMIT}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')
echo $DATA_SOURCE_NAME
daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT
RETVAL=$?
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ start() {

echo -n $"Starting postgres_exporter: "
set -o pipefail
cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')
cat $QUERY_FILE_LIST | sed -e "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" -e "s/#PG_STAT_STATEMENTS_LIMIT#/${PG_STAT_STATEMENTS_LIMIT}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')
echo $DATA_SOURCE_NAME
daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT
RETVAL=$?
Expand Down
83 changes: 0 additions & 83 deletions exporter/postgres/crunchy-postgres-exporter-pg94-el6.service

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ start() {

echo -n $"Starting postgres_exporter: "
set -o pipefail
cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')
cat $QUERY_FILE_LIST | sed -e "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" -e "s/#PG_STAT_STATEMENTS_LIMIT#/${PG_STAT_STATEMENTS_LIMIT}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')
echo $DATA_SOURCE_NAME
daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT
RETVAL=$?
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ start() {

echo -n $"Starting postgres_exporter: "
set -o pipefail
cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')
cat $QUERY_FILE_LIST | sed -e "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" -e "s/#PG_STAT_STATEMENTS_LIMIT#/${PG_STAT_STATEMENTS_LIMIT}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')
echo $DATA_SOURCE_NAME
daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT
RETVAL=$?
Expand Down
3 changes: 1 addition & 2 deletions exporter/postgres/[email protected]
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,11 @@ After=network.target
PermissionsStartOnly=true
User=ccp_monitoring
EnvironmentFile=/etc/sysconfig/%i
ExecStartPre=/bin/bash -c "set -o pipefail; cat $QUERY_FILE_LIST | sed 's/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g'> $$(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')"
ExecStartPre=/bin/bash -c "set -o pipefail; cat $QUERY_FILE_LIST | sed -e 's/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g' -e 's/#PG_STAT_STATEMENTS_LIMIT#/${PG_STAT_STATEMENTS_LIMIT}/g' > $$(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')"
ExecStart=/usr/bin/postgres_exporter $OPT
ExecReload=/usr/bin/kill -HUP $MAINPID
Restart=always

[Install]
WantedBy=multi-user.target
DefaultInstance=postgres_exporter

4 changes: 4 additions & 0 deletions exporter/postgres/queries_pg10.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ ccp_connection_stats:
, idle_in_txn
, (select coalesce(extract(epoch from (max(now() - state_change))),0) from pg_catalog.pg_stat_activity where state = 'idle in transaction') as max_idle_in_txn_time
, (select coalesce(extract(epoch from (max(now() - query_start))),0) from pg_catalog.pg_stat_activity where backend_type = 'client backend' and state <> 'idle' ) as max_query_time
, (select coalesce(extract(epoch from (max(now() - query_start))),0) from pg_catalog.pg_stat_activity where backend_type = 'client backend' and wait_event_type = 'Lock' ) as max_blocked_query_time
, max_connections
from (
select count(*) as total
Expand All @@ -36,6 +37,9 @@ ccp_connection_stats:
- max_query_time:
usage: "GAUGE"
description: "Length of time in seconds of the longest running query"
- max_blocked_query_time:
usage: "GAUGE"
description: "Length of time in seconds of the longest running query that has been blocked by a heavyweight lock"
- max_connections:
usage: "GAUGE"
description: "Value of max_connections for the monitored database"
Expand Down
4 changes: 4 additions & 0 deletions exporter/postgres/queries_pg11.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ ccp_connection_stats:
, idle_in_txn
, (select coalesce(extract(epoch from (max(now() - state_change))),0) from pg_catalog.pg_stat_activity where state = 'idle in transaction') as max_idle_in_txn_time
, (select coalesce(extract(epoch from (max(now() - query_start))),0) from pg_catalog.pg_stat_activity where backend_type = 'client backend' and state <> 'idle' ) as max_query_time
, (select coalesce(extract(epoch from (max(now() - query_start))),0) from pg_catalog.pg_stat_activity where backend_type = 'client backend' and wait_event_type = 'Lock' ) as max_blocked_query_time
, max_connections
from (
select count(*) as total
Expand All @@ -36,6 +37,9 @@ ccp_connection_stats:
- max_query_time:
usage: "GAUGE"
description: "Length of time in seconds of the longest running query"
- max_blocked_query_time:
usage: "GAUGE"
description: "Length of time in seconds of the longest running query that has been blocked by a heavyweight lock"
- max_connections:
usage: "GAUGE"
description: "Value of max_connections for the monitored database"
Expand Down
4 changes: 4 additions & 0 deletions exporter/postgres/queries_pg12.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ ccp_connection_stats:
, idle_in_txn
, (select coalesce(extract(epoch from (max(now() - state_change))),0) from pg_catalog.pg_stat_activity where state = 'idle in transaction') as max_idle_in_txn_time
, (select coalesce(extract(epoch from (max(now() - query_start))),0) from pg_catalog.pg_stat_activity where backend_type = 'client backend' and state <> 'idle' ) as max_query_time
, (select coalesce(extract(epoch from (max(now() - query_start))),0) from pg_catalog.pg_stat_activity where backend_type = 'client backend' and wait_event_type = 'Lock' ) as max_blocked_query_time
, max_connections
from (
select count(*) as total
Expand All @@ -36,6 +37,9 @@ ccp_connection_stats:
- max_query_time:
usage: "GAUGE"
description: "Length of time in seconds of the longest running query"
- max_blocked_query_time:
usage: "GAUGE"
description: "Length of time in seconds of the longest running query that has been blocked by a heavyweight lock"
- max_connections:
usage: "GAUGE"
description: "Value of max_connections for the monitored database"
Expand Down
138 changes: 138 additions & 0 deletions exporter/postgres/queries_pg13.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
###
#
# Begin File: queries_pg13.yml
#
###

# Connection usage summary built from pg_catalog.pg_stat_activity and the
# max_connections setting. The query yields one row; every column is exported
# as a gauge.
ccp_connection_stats:
  # "active" is derived: total minus idle minus idle-in-transaction sessions.
  # The three max_*_time columns are scalar subqueries that fall back to 0 when
  # no session matches their filter.
  query: "SELECT ((counts.total - counts.idle) - counts.idle_in_txn) AS active
      , counts.total
      , counts.idle
      , counts.idle_in_txn
      , (SELECT COALESCE(EXTRACT(EPOCH FROM (MAX(now() - state_change))), 0)
           FROM pg_catalog.pg_stat_activity
          WHERE state = 'idle in transaction') AS max_idle_in_txn_time
      , (SELECT COALESCE(EXTRACT(EPOCH FROM (MAX(now() - query_start))), 0)
           FROM pg_catalog.pg_stat_activity
          WHERE backend_type = 'client backend'
            AND state <> 'idle') AS max_query_time
      , (SELECT COALESCE(EXTRACT(EPOCH FROM (MAX(now() - query_start))), 0)
           FROM pg_catalog.pg_stat_activity
          WHERE backend_type = 'client backend'
            AND wait_event_type = 'Lock') AS max_blocked_query_time
      , limits.max_connections
    FROM (
      SELECT COUNT(*) AS total
        , COUNT(*) FILTER (WHERE state = 'idle') AS idle
        , COUNT(*) FILTER (WHERE state = 'idle in transaction') AS idle_in_txn
      FROM pg_catalog.pg_stat_activity) AS counts
    CROSS JOIN (
      SELECT setting::float AS max_connections
      FROM pg_settings
      WHERE name = 'max_connections') AS limits;"
  metrics:
    # Session counts by state.
    - active:
        usage: "GAUGE"
        description: "Total non-idle connections"
    - total:
        usage: "GAUGE"
        description: "Total idle and non-idle connections"
    - idle:
        usage: "GAUGE"
        description: "Total idle connections"
    - idle_in_txn:
        usage: "GAUGE"
        description: "Total idle in transaction connections"
    # Longest-duration figures, in seconds.
    - max_idle_in_txn_time:
        usage: "GAUGE"
        description: "Length of time in seconds of the longest idle in transaction session"
    - max_query_time:
        usage: "GAUGE"
        description: "Length of time in seconds of the longest running query"
    - max_blocked_query_time:
        usage: "GAUGE"
        description: "Length of time in seconds of the longest running query that has been blocked by a heavyweight lock"
    # Configured ceiling, for computing utilisation against "total".
    - max_connections:
        usage: "GAUGE"
        description: "Value of max_connections for the monitored database"


# Replay delay on a replica, in whole seconds.
# Reports 0 when the last received WAL location equals the last replayed one;
# otherwise reports the age of the last replayed transaction's timestamp.
ccp_replication_lag:
  query: "SELECT
      CASE
        WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
        ELSE CAST(EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) AS INTEGER)
      END AS replay_time"
  metrics:
    - replay_time:
        usage: "GAUGE"
        description: "Length of time since the last transaction was replayed on replica. Will always increase if no writes on primary."


# Per-replica byte lag, one row per entry in pg_catalog.pg_stat_replication.
# The client address, hostname and port become labels; the gauge is the WAL
# distance between what has been sent to the replica and what it has replayed.
ccp_replication_lag_size:
  query: "SELECT repl.client_addr AS replica
      , repl.client_hostname AS replica_hostname
      , repl.client_port AS replica_port
      , pg_wal_lsn_diff(repl.sent_lsn, repl.replay_lsn) AS bytes
    FROM pg_catalog.pg_stat_replication AS repl"
  metrics:
    # Labels identifying the replica connection.
    - replica:
        usage: "LABEL"
        description: "Replica address"
    - replica_hostname:
        usage: "LABEL"
        description: "Replica hostname"
    - replica_port:
        usage: "LABEL"
        description: "Replica port"
    # Measured value.
    - bytes:
        usage: "GAUGE"
        description: "Replication lag in bytes"


# Replication slot status, one row per slot in pg_catalog.pg_replication_slots:
# whether the slot is active and how much WAL it is holding back.
ccp_replication_slots:
  # retained_bytes is the WAL distance from the slot's restart_lsn to the
  # server's current WAL position.
  # pg_current_wal_insert_lsn() raises "recovery is in progress" on a standby,
  # which would fail the whole query on any replica that has slots (e.g.
  # cascading replication). While in recovery, use the last replayed LSN
  # instead; on a primary the result is unchanged.
  # NOTE(review): the replay LSN can trail the received LSN, so the standby
  # figure may slightly under-report retained WAL - confirm this is acceptable.
  query: "SELECT slot_name
      , active::int AS active
      , pg_wal_lsn_diff(
          CASE
            WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn()
            ELSE pg_current_wal_insert_lsn()
          END
          , restart_lsn) AS retained_bytes
    FROM pg_catalog.pg_replication_slots"
  metrics:
    - slot_name:
        usage: "LABEL"
        description: "Name of replication slot"
    - active:
        usage: "GAUGE"
        description: "Active state of slot. 1 = true. 0 = false."
    - retained_bytes:
        usage: "GAUGE"
        description: "The amount of WAL (in bytes) being retained for this slot"


# WAL directory size figures taken from pg_catalog.pg_ls_waldir(): bytes in
# files modified within the last 5 minutes, and bytes in all files.
ccp_wal_activity:
  # Both sums come from a single listing of the WAL directory, so the two
  # values describe the same set of files and the directory is read only once
  # per scrape. The FILTER clause restricts the first sum to recently modified
  # files; COALESCE turns an empty result into 0.
  query: "SELECT COALESCE(SUM(size) FILTER (WHERE modification > CURRENT_TIMESTAMP - '5 minutes'::interval), 0) AS last_5_min_size_bytes
      , COALESCE(SUM(size), 0) AS total_size_bytes
    FROM pg_catalog.pg_ls_waldir();"
  metrics:
    - last_5_min_size_bytes:
        usage: "GAUGE"
        description: "Current size in bytes of the last 5 minutes of WAL generation. Includes recycled WALs."
    - total_size_bytes:
        usage: "GAUGE"
        description: "Current size in bytes of the WAL directory"


# Data checksum failure counters, one row per entry in
# pg_catalog.pg_stat_database, labelled by database name.
ccp_data_checksum_failure:
  # time_since_last_failure_seconds falls back to 0 when
  # checksum_last_failure is NULL.
  query: "SELECT db.datname AS dbname
      , db.checksum_failures AS count
      , COALESCE(EXTRACT(EPOCH FROM (now() - db.checksum_last_failure)), 0) AS time_since_last_failure_seconds
    FROM pg_catalog.pg_stat_database AS db;"
  metrics:
    - dbname:
        usage: "LABEL"
        description: "Database name"
    - count:
        usage: "GAUGE"
        description: "Total number of checksum failures on this database"
    - time_since_last_failure_seconds:
        usage: "GAUGE"
        description: "Time interval in seconds since the last checksum failure was encountered"


# Exposes the result of monitor.pg_hba_checksum() as a gauge. Per the metric
# description, 0 means the pg_hba configuration matches the known-valid state
# and 1 means it has changed.
# NOTE(review): monitor.pg_hba_checksum() is defined outside this file -
# presumably it must be installed in the monitored database; confirm.
ccp_pg_hba_checksum:
  query: "SELECT monitor.pg_hba_checksum() AS status"
  metrics:
    - status:
        usage: "GAUGE"
        description: "Value of checksum monitoring status for pg_catalog.pg_hba_file_rules (pg_hba.conf). 0 = valid config. 1 = settings changed. To reset current config to valid after alert, run monitor.pg_hba_checksum_set_valid()."


###
#
# End File: queries_pg13.yml
#
###
Loading

0 comments on commit 41ef700

Please sign in to comment.