Skip to content

Commit

Permalink
Merge Development for 4.3-RC1 (#183)
Browse files Browse the repository at this point in the history
* Fix syntax error in PG prometheus rules file (#171)

* Fix spelling of "gauge" (#174)

* Fix syntax error in PG prometheus rules file (#171) (#172)

* Fix spelling of "gauge"

Co-authored-by: Keith Fiske <[email protected]>

* Add metrics list. Fix duped and mislabeled metrics. (#173)

* Add metrics list. Fix duped and mislabeled metrics.

* Use set -o pipefail to fail when cat command errors out (#179)

* Blackbox exporter & etcd example alerts (#176)

* Add new query files for windows (#181)

Co-authored-by: Keith Fiske <[email protected]>
Co-authored-by: yulicrunchy <[email protected]>
  • Loading branch information
3 people authored Jul 10, 2020
1 parent c42ac6f commit 9b18f16
Show file tree
Hide file tree
Showing 21 changed files with 615 additions and 7 deletions.
4 changes: 4 additions & 0 deletions exporter/blackbox/blackbox_exporter.sysconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# This file must be in a pathname that matches the EnvironmentFile entry in the service file (Default: /etc/sysconfig/blackbox_exporter)
#
OPT="--config.file=/etc/blackbox_exporter/crunchy-blackbox.yml"

6 changes: 6 additions & 0 deletions exporter/blackbox/crunchy-blackbox.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# blackbox_exporter probe modules.
modules:
  # Plain TCP connect probe.
  tcp_connect:
    prober: tcp
    tcp:
      # Prefer IPv4 when resolving the probe target.
      preferred_ip_protocol: "ip4"

Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ start() {
# do_start_prepare

echo -n $"Starting postgres_exporter: "
set -o pipefail
cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')
echo $DATA_SOURCE_NAME
daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ start() {
# do_start_prepare

echo -n $"Starting postgres_exporter: "
set -o pipefail
cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')
echo $DATA_SOURCE_NAME
daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ start() {
# do_start_prepare

echo -n $"Starting postgres_exporter: "
set -o pipefail
cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')
echo $DATA_SOURCE_NAME
daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ start() {
# do_start_prepare

echo -n $"Starting postgres_exporter: "
set -o pipefail
cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')
echo $DATA_SOURCE_NAME
daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ start() {
# do_start_prepare

echo -n $"Starting postgres_exporter: "
set -o pipefail
cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')
echo $DATA_SOURCE_NAME
daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ start() {
# do_start_prepare

echo -n $"Starting postgres_exporter: "
set -o pipefail
cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')
echo $DATA_SOURCE_NAME
daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT
Expand Down
2 changes: 1 addition & 1 deletion exporter/postgres/crunchy-postgres-exporter@.service
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ After=network.target
PermissionsStartOnly=true
User=ccp_monitoring
EnvironmentFile=/etc/sysconfig/%i
ExecStartPre=/bin/bash -c "cat $QUERY_FILE_LIST | sed 's/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g'> $$(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')"
ExecStartPre=/bin/bash -c "set -o pipefail; cat $QUERY_FILE_LIST | sed 's/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g'> $$(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')"
ExecStart=/usr/bin/postgres_exporter $OPT
ExecReload=/usr/bin/kill -HUP $MAINPID
Restart=always
Expand Down
2 changes: 1 addition & 1 deletion exporter/postgres/queries_bloat.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ ccp_bloat_check:
description: "Size of object in bytes"
- total_wasted_space_bytes:
usage: "GAUGE"
description: "Total wasted space in bytes of current database"
description: "Total wasted space in bytes of given object"

###
#
Expand Down
4 changes: 2 additions & 2 deletions exporter/postgres/queries_common.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,9 @@ ccp_postmaster_runtime:
metrics:
- start_time_seconds:
usage: "GAUGE"
description: "Time at which postmaster started"
description: "Time at which postmaster started. Note this metric has been deprecated as of pgMonitor 4.3 and will be removed in a future version. Use ccp_postmaster_uptime_seconds instead."

ccp_settings_guage:
ccp_settings_gauge:
query: "select (select setting::int from pg_catalog.pg_settings where name = 'checkpoint_timeout') as checkpoint_timeout
, (select setting::float from pg_catalog.pg_settings where name = 'checkpoint_completion_target') as checkpoint_completion_target
, (select 8192*setting::bigint as bytes from pg_catalog.pg_settings where name = 'shared_buffers') as shared_buffers"
Expand Down
225 changes: 225 additions & 0 deletions exporter/postgres/queries_common.yml.win
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
###
#
# Begin File: queries_common.yml
#
###

# Exposes the server_version_num setting, cast to an integer, as a gauge.
ccp_postgresql_version:
  query: "SELECT current_setting('server_version_num')::int AS current"
  metrics:
    - current:
        usage: "GAUGE"
        description: "The current version of PostgreSQL that this exporter is running on as a 6 digit integer (######)."


# Maps pg_is_in_recovery() to a numeric status: 1 when the function returns
# true (in recovery), 2 otherwise (primary).
ccp_is_in_recovery:
  query: "SELECT CASE WHEN pg_is_in_recovery = true THEN 1 ELSE 2 END AS status from pg_is_in_recovery();"
  metrics:
    - status:
        usage: "GAUGE"
        description: "Return value of 1 means database is in recovery. Otherwise, a value of 2 means it is a primary."


# Lock counts per database and lock mode.
# The fixed VALUES list of lock modes is cross joined with pg_database so that
# every (database, mode) pair yields a row; COALESCE reports 0 for pairs that
# have no matching rows in pg_locks.
ccp_locks:
  query: "SELECT pg_database.datname as dbname, tmp.mode, COALESCE(count,0) as count
    FROM
    (
    VALUES ('accesssharelock'),
    ('rowsharelock'),
    ('rowexclusivelock'),
    ('shareupdateexclusivelock'),
    ('sharelock'),
    ('sharerowexclusivelock'),
    ('exclusivelock'),
    ('accessexclusivelock')
    ) AS tmp(mode) CROSS JOIN pg_catalog.pg_database
    LEFT JOIN
    (SELECT database, lower(mode) AS mode,count(*) AS count
    FROM pg_catalog.pg_locks WHERE database IS NOT NULL
    GROUP BY database, lower(mode)
    ) AS tmp2
    ON tmp.mode=tmp2.mode and pg_database.oid = tmp2.database"
  metrics:
    - dbname:
        usage: "LABEL"
        description: "Database name"
    - mode:
        usage: "LABEL"
        description: "Lock type"
    - count:
        usage: "GAUGE"
        description: "Number of locks"


# Postmaster start time as epoch seconds.
# The description marks this metric as deprecated in favour of
# ccp_postmaster_uptime_seconds.
ccp_postmaster_runtime:
  query: "SELECT extract('epoch' from pg_postmaster_start_time) as start_time_seconds from pg_catalog.pg_postmaster_start_time()"
  metrics:
    - start_time_seconds:
        usage: "GAUGE"
        description: "Time at which postmaster started. Note this metric has been deprecated as of pgMonitor 4.3 and will be removed in a future version. Use ccp_postmaster_uptime_seconds instead."

# Selected pg_settings values exported as gauges.
# shared_buffers is multiplied by 8192 in the query so it is reported in bytes.
ccp_settings_gauge:
  query: "select (select setting::int from pg_catalog.pg_settings where name = 'checkpoint_timeout') as checkpoint_timeout
    , (select setting::float from pg_catalog.pg_settings where name = 'checkpoint_completion_target') as checkpoint_completion_target
    , (select 8192*setting::bigint as bytes from pg_catalog.pg_settings where name = 'shared_buffers') as shared_buffers"
  metrics:
    - checkpoint_timeout:
        usage: "GAUGE"
        description: "Checkpoint timeout in seconds"
    - checkpoint_completion_target:
        usage: "GAUGE"
        description: "Checkpoint completion target, ranging from 0 to 1"
    - shared_buffers:
        usage: "GAUGE"
        description: "Size of shared_buffers in bytes"


# Columns of pg_catalog.pg_stat_bgwriter, one metric per selected column.
ccp_stat_bgwriter:
  query: "SELECT checkpoints_timed, checkpoints_req, checkpoint_write_time, checkpoint_sync_time, buffers_checkpoint, buffers_clean, maxwritten_clean, buffers_backend, buffers_backend_fsync, buffers_alloc, stats_reset FROM pg_catalog.pg_stat_bgwriter"
  metrics:
    # Checkpoint activity
    - checkpoints_timed:
        usage: "GAUGE"
        description: "Number of scheduled checkpoints that have been performed"
    - checkpoints_req:
        usage: "GAUGE"
        description: "Number of requested checkpoints that have been performed"
    - checkpoint_write_time:
        usage: "GAUGE"
        description: "Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in milliseconds"
    - checkpoint_sync_time:
        usage: "GAUGE"
        description: "Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in milliseconds"
    # Buffer activity
    - buffers_checkpoint:
        usage: "GAUGE"
        description: "Number of buffers written during checkpoints"
    - buffers_clean:
        usage: "GAUGE"
        description: "Number of buffers written by the background writer"
    - maxwritten_clean:
        usage: "GAUGE"
        description: "Number of times the background writer stopped a cleaning scan because it had written too many buffers"
    - buffers_backend:
        usage: "GAUGE"
        description: "Number of buffers written directly by a backend"
    - buffers_backend_fsync:
        usage: "GAUGE"
        description: "Number of times a backend had to execute its own fsync call (normally the background writer handles those even when the backend does its own write)"
    - buffers_alloc:
        usage: "GAUGE"
        description: "Number of buffers allocated"
    # Statistics reset timestamp
    - stats_reset:
        usage: "GAUGE"
        description: "Time at which these statistics were last reset"


# Per-database statistics from pg_catalog.pg_stat_database, joined to
# pg_database so that template databases (datistemplate = true) are excluded.
# NOTE(review): every value column is declared GAUGE and is left that way to
# keep the exported metric types unchanged - confirm whether COUNTER was
# intended for the cumulative columns.
ccp_stat_database:
  query: "SELECT s.datname as dbname, xact_commit, xact_rollback, blks_read, blks_hit, tup_returned, tup_fetched, tup_inserted, tup_updated, tup_deleted, conflicts, temp_files, temp_bytes, deadlocks FROM pg_catalog.pg_stat_database s JOIN pg_catalog.pg_database d on d.datname = s.datname WHERE d.datistemplate = false"
  metrics:
    - dbname:
        usage: "LABEL"
        description: "Name of database"
    - xact_commit:
        usage: "GAUGE"
        description: "Number of transactions in this database that have been committed"
    - xact_rollback:
        usage: "GAUGE"
        description: "Number of transactions in this database that have been rolled back"
    - blks_read:
        usage: "GAUGE"
        description: "Number of disk blocks read in this database"
    - blks_hit:
        usage: "GAUGE"
        description: "Number of times disk blocks were found already in the buffer cache, so that a read was not necessary"
    - tup_returned:
        usage: "GAUGE"
        description: "Number of rows returned by queries in this database"
    - tup_fetched:
        usage: "GAUGE"
        description: "Number of rows fetched by queries in this database"
    - tup_inserted:
        usage: "GAUGE"
        description: "Number of rows inserted by queries in this database"
    - tup_updated:
        usage: "GAUGE"
        description: "Number of rows updated by queries in this database"
    - tup_deleted:
        usage: "GAUGE"
        description: "Number of rows deleted by queries in this database"
    - conflicts:
        usage: "GAUGE"
        description: "Number of queries canceled due to conflicts with recovery in this database"
    # Help text previously duplicated the tup_deleted description; temp_files
    # counts temporary files, not deleted rows.
    - temp_files:
        usage: "GAUGE"
        description: "Number of temporary files created by queries in this database"
    - temp_bytes:
        usage: "GAUGE"
        description: "Total amount of data written to temporary files by queries in this database"
    - deadlocks:
        usage: "GAUGE"
        description: "Number of deadlocks detected in this database"


# Transaction ID age across connectable databases (datallowconn).
# The query takes the maximum age(datfrozenxid) and expresses it as a rounded
# percentage of a fixed 2000000000 limit and of autovacuum_freeze_max_age.
ccp_transaction_wraparound:
  query: "WITH max_age AS ( SELECT 2000000000 as max_old_xid, setting AS autovacuum_freeze_max_age FROM pg_catalog.pg_settings WHERE name = 'autovacuum_freeze_max_age'), per_database_stats AS ( SELECT datname , m.max_old_xid::int , m.autovacuum_freeze_max_age::int , age(d.datfrozenxid) AS oldest_current_xid FROM pg_catalog.pg_database d JOIN max_age m ON (true) WHERE d.datallowconn) SELECT max(oldest_current_xid) AS oldest_current_xid , max(ROUND(100*(oldest_current_xid/max_old_xid::float))) AS percent_towards_wraparound , max(ROUND(100*(oldest_current_xid/autovacuum_freeze_max_age::float))) AS percent_towards_emergency_autovac FROM per_database_stats"
  metrics:
    - oldest_current_xid:
        usage: "GAUGE"
        description: "Oldest current transaction ID in cluster"
    - percent_towards_wraparound:
        usage: "GAUGE"
        description: "Percentage towards transaction ID wraparound"
    - percent_towards_emergency_autovac:
        usage: "GAUGE"
        description: "Percentage towards emergency autovacuum process starting"


# Archiver failure indicator from pg_catalog.pg_stat_archiver.
# The query returns (last_failed_time - last_archived_time) in seconds, or 0
# when that difference is NULL or negative.
ccp_archive_command_status:
  query: "SELECT CASE
    WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) IS NULL THEN 0
    WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) < 0 THEN 0
    ELSE EXTRACT(epoch from (last_failed_time - last_archived_time))
    END AS seconds_since_last_fail
    FROM pg_catalog.pg_stat_archiver"
  metrics:
    - seconds_since_last_fail:
        usage: "GAUGE"
        description: "Seconds since the last recorded failure of the archive_command"


# Calls monitor.sequence_exhaustion(75) and exports its count column.
# The function is defined outside this file; presumably it must already exist
# in the monitored database - verify against the setup scripts.
ccp_sequence_exhaustion:
  query: "SELECT count FROM monitor.sequence_exhaustion(75)"
  metrics:
    - count:
        usage: "GAUGE"
        description: "Count of sequences that have reached greater than or equal to 75% of their max available numbers. Function monitor.sequence_status() can provide more details if run directly on system."


# Seconds elapsed between pg_postmaster_start_time() and now().
ccp_postmaster_uptime:
  query: "SELECT extract(epoch from (now() - pg_postmaster_start_time() )) AS seconds;"
  metrics:
    - seconds:
        usage: "GAUGE"
        description: "Time interval in seconds since PostgreSQL database was last restarted"


# Exports the result of monitor.pg_settings_checksum() as a status gauge.
# The function is defined outside this file; the meaning of its return values
# is taken from the description text below.
ccp_pg_settings_checksum:
  query: "SELECT monitor.pg_settings_checksum() AS status"
  metrics:
    - status:
        usage: "GAUGE"
        description: "Value of checksum monitoring status for pg_catalog.pg_settings (postgresql.conf). 0 = valid config. 1 = settings changed. To reset current config to valid after alert, run monitor.pg_settings_checksum_set_valid()."


# Counts pg_settings rows whose pending_restart flag is true.
ccp_settings_pending_restart:
  query: "SELECT count(*) AS count FROM pg_catalog.pg_settings WHERE pending_restart = true"
  metrics:
    - count:
        usage: "GAUGE"
        description: "Number of settings from pg_settings catalog in a pending_restart state"


###
#
# End File: queries_common.yml
#
###
66 changes: 66 additions & 0 deletions exporter/postgres/queries_per_db.yml.win
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
###
#
# Begin File: queries_per_db.yml
#
###

# Per-table statistics from pg_catalog.pg_stat_user_tables.
# current_database() is selected as dbname so each series carries the database
# it was collected from, alongside the schema and table name labels.
ccp_stat_user_tables:
  query: "SELECT current_database() as dbname, schemaname, relname, seq_scan, seq_tup_read, idx_scan, idx_tup_fetch, n_tup_ins, n_tup_upd, n_tup_del, n_tup_hot_upd, n_live_tup, n_dead_tup, vacuum_count, autovacuum_count, analyze_count, autoanalyze_count FROM pg_catalog.pg_stat_user_tables"
  metrics:
    # Labels
    - dbname:
        usage: "LABEL"
        description: "Database name"
    - schemaname:
        usage: "LABEL"
        description: "Name of the schema that this table is in"
    - relname:
        usage: "LABEL"
        description: "Name of this table"
    # Scan activity
    - seq_scan:
        usage: "COUNTER"
        description: "Number of sequential scans initiated on this table"
    - seq_tup_read:
        usage: "COUNTER"
        description: "Number of live rows fetched by sequential scans"
    - idx_scan:
        usage: "COUNTER"
        description: "Number of index scans initiated on this table"
    - idx_tup_fetch:
        usage: "COUNTER"
        description: "Number of live rows fetched by index scans"
    # Row modifications
    - n_tup_ins:
        usage: "COUNTER"
        description: "Number of rows inserted"
    - n_tup_upd:
        usage: "COUNTER"
        description: "Number of rows updated"
    - n_tup_del:
        usage: "COUNTER"
        description: "Number of rows deleted"
    - n_tup_hot_upd:
        usage: "COUNTER"
        description: "Number of rows HOT updated (i.e., with no separate index update required)"
    # Row estimates
    - n_live_tup:
        usage: "GAUGE"
        description: "Estimated number of live rows"
    - n_dead_tup:
        usage: "GAUGE"
        description: "Estimated number of dead rows"
    # Vacuum and analyze activity
    - vacuum_count:
        usage: "COUNTER"
        description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)"
    - autovacuum_count:
        usage: "COUNTER"
        description: "Number of times this table has been vacuumed by the autovacuum daemon"
    - analyze_count:
        usage: "COUNTER"
        description: "Number of times this table has been manually analyzed"
    - autoanalyze_count:
        usage: "COUNTER"
        description: "Number of times this table has been analyzed by the autovacuum daemon"

###
#
# End File: queries_per_db.yml
#
###
Loading

0 comments on commit 9b18f16

Please sign in to comment.