diff --git a/exporter/blackbox/blackbox_exporter.sysconfig b/exporter/blackbox/blackbox_exporter.sysconfig new file mode 100644 index 00000000..0bf0aa54 --- /dev/null +++ b/exporter/blackbox/blackbox_exporter.sysconfig @@ -0,0 +1,4 @@ +# This file must be in a pathname that matches the EnvironmentFile entry in the service file (Default: /etc/sysconfig/blackbox_exporter) +# +OPT="--config.file=/etc/blackbox_exporter/crunchy-blackbox.yml" + diff --git a/exporter/blackbox/crunchy-blackbox.yml b/exporter/blackbox/crunchy-blackbox.yml new file mode 100644 index 00000000..b46478b7 --- /dev/null +++ b/exporter/blackbox/crunchy-blackbox.yml @@ -0,0 +1,6 @@ +modules: + tcp_connect: + prober: tcp + tcp: + preferred_ip_protocol: "ip4" + diff --git a/exporter/postgres/crunchy-postgres-exporter-pg10-el6.service b/exporter/postgres/crunchy-postgres-exporter-pg10-el6.service index e428263a..f486aba7 100644 --- a/exporter/postgres/crunchy-postgres-exporter-pg10-el6.service +++ b/exporter/postgres/crunchy-postgres-exporter-pg10-el6.service @@ -35,6 +35,7 @@ start() { # do_start_prepare echo -n $"Starting postgres_exporter: " + set -o pipefail cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/') echo $DATA_SOURCE_NAME daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT diff --git a/exporter/postgres/crunchy-postgres-exporter-pg11-el6.service b/exporter/postgres/crunchy-postgres-exporter-pg11-el6.service index d19ee9a1..b4cfb9c7 100644 --- a/exporter/postgres/crunchy-postgres-exporter-pg11-el6.service +++ b/exporter/postgres/crunchy-postgres-exporter-pg11-el6.service @@ -35,6 +35,7 @@ start() { # do_start_prepare echo -n $"Starting postgres_exporter: " + set -o pipefail cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 
's/.*--extend.query-path=\(.*\.yml\).*/\1/') echo $DATA_SOURCE_NAME daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT diff --git a/exporter/postgres/crunchy-postgres-exporter-pg12-el6.service b/exporter/postgres/crunchy-postgres-exporter-pg12-el6.service index 52516e2c..fbf02d19 100644 --- a/exporter/postgres/crunchy-postgres-exporter-pg12-el6.service +++ b/exporter/postgres/crunchy-postgres-exporter-pg12-el6.service @@ -35,6 +35,7 @@ start() { # do_start_prepare echo -n $"Starting postgres_exporter: " + set -o pipefail cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/') echo $DATA_SOURCE_NAME daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT diff --git a/exporter/postgres/crunchy-postgres-exporter-pg94-el6.service b/exporter/postgres/crunchy-postgres-exporter-pg94-el6.service index e35d90e5..da3e71b5 100644 --- a/exporter/postgres/crunchy-postgres-exporter-pg94-el6.service +++ b/exporter/postgres/crunchy-postgres-exporter-pg94-el6.service @@ -35,6 +35,7 @@ start() { # do_start_prepare echo -n $"Starting postgres_exporter: " + set -o pipefail cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/') echo $DATA_SOURCE_NAME daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT diff --git a/exporter/postgres/crunchy-postgres-exporter-pg95-el6.service b/exporter/postgres/crunchy-postgres-exporter-pg95-el6.service index 45cb93da..5d9d5c09 100644 --- a/exporter/postgres/crunchy-postgres-exporter-pg95-el6.service +++ b/exporter/postgres/crunchy-postgres-exporter-pg95-el6.service @@ -35,6 +35,7 @@ start() { # do_start_prepare echo -n $"Starting postgres_exporter: " + set -o pipefail 
cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/') echo $DATA_SOURCE_NAME daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT diff --git a/exporter/postgres/crunchy-postgres-exporter-pg96-el6.service b/exporter/postgres/crunchy-postgres-exporter-pg96-el6.service index e7bbf12e..eebb4df5 100644 --- a/exporter/postgres/crunchy-postgres-exporter-pg96-el6.service +++ b/exporter/postgres/crunchy-postgres-exporter-pg96-el6.service @@ -35,6 +35,7 @@ start() { # do_start_prepare echo -n $"Starting postgres_exporter: " + set -o pipefail cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/') echo $DATA_SOURCE_NAME daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT diff --git a/exporter/postgres/crunchy-postgres-exporter@.service b/exporter/postgres/crunchy-postgres-exporter@.service index d787565b..9a1815ee 100644 --- a/exporter/postgres/crunchy-postgres-exporter@.service +++ b/exporter/postgres/crunchy-postgres-exporter@.service @@ -10,7 +10,7 @@ After=network.target PermissionsStartOnly=true User=ccp_monitoring EnvironmentFile=/etc/sysconfig/%i -ExecStartPre=/bin/bash -c "cat $QUERY_FILE_LIST | sed 's/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g'> $$(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')" +ExecStartPre=/bin/bash -c "set -o pipefail; cat $QUERY_FILE_LIST | sed 's/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g'> $$(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')" ExecStart=/usr/bin/postgres_exporter $OPT ExecReload=/usr/bin/kill -HUP $MAINPID Restart=always diff --git a/exporter/postgres/queries_bloat.yml b/exporter/postgres/queries_bloat.yml index 
671d5009..7cfd3d69 100644 --- a/exporter/postgres/queries_bloat.yml +++ b/exporter/postgres/queries_bloat.yml @@ -26,7 +26,7 @@ ccp_bloat_check: description: "Size of object in bytes" - total_wasted_space_bytes: usage: "GAUGE" - description: "Total wasted space in bytes of current database" + description: "Total wasted space in bytes of given object" ### # diff --git a/exporter/postgres/queries_common.yml b/exporter/postgres/queries_common.yml index 7ffd50bb..18d4267f 100644 --- a/exporter/postgres/queries_common.yml +++ b/exporter/postgres/queries_common.yml @@ -67,9 +67,9 @@ ccp_postmaster_runtime: metrics: - start_time_seconds: usage: "GAUGE" - description: "Time at which postmaster started" + description: "Time at which postmaster started. Note this metric has been deprecated as of pgMonitor 4.3 and will be removed in a future version. Use ccp_postmaster_uptime_seconds instead." -ccp_settings_guage: +ccp_settings_gauge: query: "select (select setting::int from pg_catalog.pg_settings where name = 'checkpoint_timeout') as checkpoint_timeout , (select setting::float from pg_catalog.pg_settings where name = 'checkpoint_completion_target') as checkpoint_completion_target , (select 8192*setting::bigint as bytes from pg_catalog.pg_settings where name = 'shared_buffers') as shared_buffers" diff --git a/exporter/postgres/queries_common.yml.win b/exporter/postgres/queries_common.yml.win new file mode 100644 index 00000000..b571d175 --- /dev/null +++ b/exporter/postgres/queries_common.yml.win @@ -0,0 +1,225 @@ +### +# +# Begin File: queries_common.yml +# +### + +ccp_postgresql_version: + query: "SELECT current_setting('server_version_num')::int AS current" + metrics: + - current: + usage: "GAUGE" + description: "The current version of PostgreSQL that this exporter is running on as a 6 digit integer (######)." 
+ + +ccp_is_in_recovery: + query: "SELECT CASE WHEN pg_is_in_recovery = true THEN 1 ELSE 2 END AS status from pg_is_in_recovery();" + metrics: + - status: + usage: "GAUGE" + description: "Return value of 1 means database is in recovery. Otherwise 2 it is a primary." + + +ccp_locks: + query: "SELECT pg_database.datname as dbname, tmp.mode, COALESCE(count,0) as count + FROM + ( + VALUES ('accesssharelock'), + ('rowsharelock'), + ('rowexclusivelock'), + ('shareupdateexclusivelock'), + ('sharelock'), + ('sharerowexclusivelock'), + ('exclusivelock'), + ('accessexclusivelock') + ) AS tmp(mode) CROSS JOIN pg_catalog.pg_database + LEFT JOIN + (SELECT database, lower(mode) AS mode,count(*) AS count + FROM pg_catalog.pg_locks WHERE database IS NOT NULL + GROUP BY database, lower(mode) + ) AS tmp2 + ON tmp.mode=tmp2.mode and pg_database.oid = tmp2.database" + metrics: + - dbname: + usage: "LABEL" + description: "Database name" + - mode: + usage: "LABEL" + description: "Lock type" + - count: + usage: "GAUGE" + description: "Number of locks" + + +ccp_postmaster_runtime: + query: "SELECT extract('epoch' from pg_postmaster_start_time) as start_time_seconds from pg_catalog.pg_postmaster_start_time()" + metrics: + - start_time_seconds: + usage: "GAUGE" + description: "Time at which postmaster started. Note this metric has been deprecated as of pgMonitor 4.3 and will be removed in a future version. Use ccp_postmaster_uptime_seconds instead." 
+ +ccp_settings_gauge: + query: "select (select setting::int from pg_catalog.pg_settings where name = 'checkpoint_timeout') as checkpoint_timeout + , (select setting::float from pg_catalog.pg_settings where name = 'checkpoint_completion_target') as checkpoint_completion_target + , (select 8192*setting::bigint as bytes from pg_catalog.pg_settings where name = 'shared_buffers') as shared_buffers" + metrics: + - checkpoint_timeout: + usage: "GAUGE" + description: "Checkpoint timeout in seconds" + - checkpoint_completion_target: + usage: "GAUGE" + description: "Checkpoint completion target, ranging from 0 to 1" + - shared_buffers: + usage: "GAUGE" + description: "Size of shared_buffers in bytes" + + +ccp_stat_bgwriter: + query: "SELECT checkpoints_timed, checkpoints_req, checkpoint_write_time, checkpoint_sync_time, buffers_checkpoint, buffers_clean, maxwritten_clean, buffers_backend, buffers_backend_fsync, buffers_alloc, stats_reset FROM pg_catalog.pg_stat_bgwriter" + metrics: + - checkpoints_timed: + usage: "GAUGE" + description: "Number of scheduled checkpoints that have been performed" + - checkpoints_req: + usage: "GAUGE" + description: "Number of requested checkpoints that have been performed" + - checkpoint_write_time: + usage: "GAUGE" + description: "Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in milliseconds" + - checkpoint_sync_time: + usage: "GAUGE" + description: "Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in milliseconds" + - buffers_checkpoint: + usage: "GAUGE" + description: "Number of buffers written during checkpoints" + - buffers_clean: + usage: "GAUGE" + description: "Number of buffers written by the background writer" + - maxwritten_clean: + usage: "GAUGE" + description: "Number of times the background writer stopped a cleaning scan because it had written too many buffers" + - buffers_backend: + usage: 
"GAUGE" + description: "Number of buffers written directly by a backend" + - buffers_backend_fsync: + usage: "GAUGE" + description: "Number of times a backend had to execute its own fsync call (normally the background writer handles those even when the backend does its own write)" + - buffers_alloc: + usage: "GAUGE" + description: "Number of buffers allocated" + - stats_reset: + usage: "GAUGE" + description: "Time at which these statistics were last reset" + + +ccp_stat_database: + query: "SELECT s.datname as dbname, xact_commit, xact_rollback, blks_read, blks_hit, tup_returned, tup_fetched, tup_inserted, tup_updated, tup_deleted, conflicts, temp_files, temp_bytes, deadlocks FROM pg_catalog.pg_stat_database s JOIN pg_catalog.pg_database d on d.datname = s.datname WHERE d.datistemplate = false" + metrics: + - dbname: + usage: "LABEL" + description: "Name of database" + - xact_commit: + usage: "GAUGE" + description: "Number of transactions in this database that have been committed" + - xact_rollback: + usage: "GAUGE" + description: "Number of transactions in this database that have been rolled back" + - blks_read: + usage: "GAUGE" + description: "Number of disk blocks read in this database" + - blks_hit: + usage: "GAUGE" + description: "Number of times disk blocks were found already in the buffer cache, so that a read was not necessary" + - tup_returned: + usage: "GAUGE" + description: "Number of rows returned by queries in this database" + - tup_fetched: + usage: "GAUGE" + description: "Number of rows fetched by queries in this database" + - tup_inserted: + usage: "GAUGE" + description: "Number of rows inserted by queries in this database" + - tup_updated: + usage: "GAUGE" + description: "Number of rows updated by queries in this database" + - tup_deleted: + usage: "GAUGE" + description: "Number of rows deleted by queries in this database" + - conflicts: + usage: "GAUGE" + description: "Number of queries canceled due to conflicts with recovery in this database" + - 
temp_files: + usage: "GAUGE" + description: "Number of temporary files created by queries in this database" + - temp_bytes: + usage: "GAUGE" + description: "Total amount of data written to temporary files by queries in this database" + - deadlocks: + usage: "GAUGE" + description: "Number of deadlocks detected in this database" + + +ccp_transaction_wraparound: + query: "WITH max_age AS ( SELECT 2000000000 as max_old_xid, setting AS autovacuum_freeze_max_age FROM pg_catalog.pg_settings WHERE name = 'autovacuum_freeze_max_age'), per_database_stats AS ( SELECT datname , m.max_old_xid::int , m.autovacuum_freeze_max_age::int , age(d.datfrozenxid) AS oldest_current_xid FROM pg_catalog.pg_database d JOIN max_age m ON (true) WHERE d.datallowconn) SELECT max(oldest_current_xid) AS oldest_current_xid , max(ROUND(100*(oldest_current_xid/max_old_xid::float))) AS percent_towards_wraparound , max(ROUND(100*(oldest_current_xid/autovacuum_freeze_max_age::float))) AS percent_towards_emergency_autovac FROM per_database_stats" + metrics: + - oldest_current_xid: + usage: "GAUGE" + description: "Oldest current transaction ID in cluster" + - percent_towards_wraparound: + usage: "GAUGE" + description: "Percentage towards transaction ID wraparound" + - percent_towards_emergency_autovac: + usage: "GAUGE" + description: "Percentage towards emergency autovacuum process starting" + + +ccp_archive_command_status: + query: "SELECT CASE + WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) IS NULL THEN 0 + WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) < 0 THEN 0 + ELSE EXTRACT(epoch from (last_failed_time - last_archived_time)) +END AS seconds_since_last_fail +FROM pg_catalog.pg_stat_archiver" + metrics: + - seconds_since_last_fail: + usage: "GAUGE" + description: "Seconds since the last recorded failure of the archive_command" + + +ccp_sequence_exhaustion: + query: "SELECT count FROM monitor.sequence_exhaustion(75)" + metrics: + - count: + usage: "GAUGE" + description: 
"Count of sequences that have reached greater than or equal to 75% of their max available numbers. Function monitor.sequence_status() can provide more details if run directly on system." + + +ccp_postmaster_uptime: + query: "SELECT extract(epoch from (now() - pg_postmaster_start_time() )) AS seconds;" + metrics: + - seconds: + usage: "GAUGE" + description: "Time interval in seconds since PostgreSQL database was last restarted" + + +ccp_pg_settings_checksum: + query: "SELECT monitor.pg_settings_checksum() AS status" + metrics: + - status: + usage: "GAUGE" + description: "Value of checksum monitioring status for pg_catalog.pg_settings (postgresql.conf). 0 = valid config. 1 = settings changed. To reset current config to valid after alert, run monitor.pg_settings_checksum_set_valid()." + + +ccp_settings_pending_restart: + query: "SELECT count(*) AS count FROM pg_catalog.pg_settings WHERE pending_restart = true" + metrics: + - count: + usage: "GAUGE" + description: "Number of settings from pg_settings catalog in a pending_restart state" + + +### +# +# End File: queries_common.yml +# +### diff --git a/exporter/postgres/queries_per_db.yml.win b/exporter/postgres/queries_per_db.yml.win new file mode 100644 index 00000000..3aed6aea --- /dev/null +++ b/exporter/postgres/queries_per_db.yml.win @@ -0,0 +1,66 @@ +### +# +# Begin File: queries_per_db.yml +# +### + +ccp_stat_user_tables: + query: "SELECT current_database() as dbname, schemaname, relname, seq_scan, seq_tup_read, idx_scan, idx_tup_fetch, n_tup_ins, n_tup_upd, n_tup_del, n_tup_hot_upd, n_live_tup, n_dead_tup, vacuum_count, autovacuum_count, analyze_count, autoanalyze_count FROM pg_catalog.pg_stat_user_tables" + metrics: + - dbname: + usage: "LABEL" + description: "Database name" + - schemaname: + usage: "LABEL" + description: "Name of the schema that this table is in" + - relname: + usage: "LABEL" + description: "Name of this table" + - seq_scan: + usage: "COUNTER" + description: "Number of sequential scans 
initiated on this table" + - seq_tup_read: + usage: "COUNTER" + description: "Number of live rows fetched by sequential scans" + - idx_scan: + usage: "COUNTER" + description: "Number of index scans initiated on this table" + - idx_tup_fetch: + usage: "COUNTER" + description: "Number of live rows fetched by index scans" + - n_tup_ins: + usage: "COUNTER" + description: "Number of rows inserted" + - n_tup_upd: + usage: "COUNTER" + description: "Number of rows updated" + - n_tup_del: + usage: "COUNTER" + description: "Number of rows deleted" + - n_tup_hot_upd: + usage: "COUNTER" + description: "Number of rows HOT updated (i.e., with no separate index update required)" + - n_live_tup: + usage: "GAUGE" + description: "Estimated number of live rows" + - n_dead_tup: + usage: "GAUGE" + description: "Estimated number of dead rows" + - vacuum_count: + usage: "COUNTER" + description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)" + - autovacuum_count: + usage: "COUNTER" + description: "Number of times this table has been vacuumed by the autovacuum daemon" + - analyze_count: + usage: "COUNTER" + description: "Number of times this table has been manually analyzed" + - autoanalyze_count: + usage: "COUNTER" + description: "Number of times this table has been analyzed by the autovacuum daemon" + +### +# +# End File: queries_per_db.yml +# +### diff --git a/exporter/postgres/queries_pg94.yml b/exporter/postgres/queries_pg94.yml index de083429..8c218084 100644 --- a/exporter/postgres/queries_pg94.yml +++ b/exporter/postgres/queries_pg94.yml @@ -88,13 +88,21 @@ ccp_replication_slots: usage: "GAUGE" description: "The amount of WAL (in bytes) being retained for this slot" - ccp_wal_activity: query: "SELECT (SELECT setting::int8 FROM pg_settings WHERE name = 'wal_segment_size') * (SELECT setting::int8 FROM pg_settings WHERE name = 'wal_block_size') * (SELECT count(*) FROM monitor.pg_ls_waldir()) AS total_size_bytes;" metrics: - count: + usage: "GAUGE" + 
description: "Current size in bytes of the WAL directory. Note this metric has been deprecated as of pgMonitor 4.3 and will be removed in a future version. Use ccp_wal_activity_total_size_bytes instead." + +ccp_wal_activity: + query: "SELECT (SELECT setting::int8 FROM pg_settings WHERE name = 'wal_segment_size') * + (SELECT setting::int8 FROM pg_settings WHERE name = 'wal_block_size') * + (SELECT count(*) FROM monitor.pg_ls_waldir()) AS total_size_bytes;" + metrics: + - total_size_bytes: usage: "GAUGE" description: "Current size in bytes of the WAL directory" diff --git a/exporter/postgres/queries_pg95.yml b/exporter/postgres/queries_pg95.yml index 6031d07d..080f1431 100644 --- a/exporter/postgres/queries_pg95.yml +++ b/exporter/postgres/queries_pg95.yml @@ -88,13 +88,21 @@ ccp_replication_slots: usage: "GAUGE" description: "The amount of WAL (in bytes) being retained for this slot" - ccp_wal_activity: query: "SELECT (SELECT setting::int8 FROM pg_settings WHERE name = 'wal_segment_size') * (SELECT setting::int8 FROM pg_settings WHERE name = 'wal_block_size') * (SELECT count(*) FROM monitor.pg_ls_waldir()) AS total_size_bytes;" metrics: - count: + usage: "GAUGE" + description: "Current size in bytes of the WAL directory. Note this metric has been deprecated as of pgMonitor 4.3 and will be removed in a future version. Use ccp_wal_activity_total_size_bytes instead." 
+ +ccp_wal_activity: + query: "SELECT (SELECT setting::int8 FROM pg_settings WHERE name = 'wal_segment_size') * + (SELECT setting::int8 FROM pg_settings WHERE name = 'wal_block_size') * + (SELECT count(*) FROM monitor.pg_ls_waldir()) AS total_size_bytes;" + metrics: + - total_size_bytes: usage: "GAUGE" description: "Current size in bytes of the WAL directory" diff --git a/exporter/postgres/queries_pg96.yml b/exporter/postgres/queries_pg96.yml index abe98306..8149050a 100644 --- a/exporter/postgres/queries_pg96.yml +++ b/exporter/postgres/queries_pg96.yml @@ -89,13 +89,21 @@ ccp_replication_slots: usage: "GAUGE" description: "The amount of WAL (in bytes) being retained for this slot" - ccp_wal_activity: query: "SELECT (SELECT setting::int8 FROM pg_settings WHERE name = 'wal_segment_size') * (SELECT setting::int8 FROM pg_settings WHERE name = 'wal_block_size') * (SELECT count(*) FROM monitor.pg_ls_waldir()) AS total_size_bytes;" metrics: - count: + usage: "GAUGE" + description: "Current size in bytes of the WAL directory. Note this metric has been deprecated as of pgMonitor 4.3 and will be removed in a future version. Use ccp_wal_activity_total_size_bytes instead." + +ccp_wal_activity: + query: "SELECT (SELECT setting::int8 FROM pg_settings WHERE name = 'wal_segment_size') * + (SELECT setting::int8 FROM pg_settings WHERE name = 'wal_block_size') * + (SELECT count(*) FROM monitor.pg_ls_waldir()) AS total_size_bytes;" + metrics: + - total_size_bytes: usage: "GAUGE" description: "Current size in bytes of the WAL directory" diff --git a/hugo/content/changelog/_index.md b/hugo/content/changelog/_index.md index 6649d9f1..d50813fb 100644 --- a/hugo/content/changelog/_index.md +++ b/hugo/content/changelog/_index.md @@ -3,6 +3,23 @@ title: "Changelog" draft: false weight: 5 --- +## 4.3 + +### New Features + +### Bug Fixes + + * Fixed syntax error in example prometheus alert rules file for postgresql for the pending restart rule. 
+ +### Non-backward Compatible Changes + +### Manual Intervention Changes + + * Renamed metric `ccp_postmaster_runtime_start_time_seconds` to `ccp_postmaster_uptime_seconds`. Note that the two metrics do not report the same value: the old metric reports the time at which the postmaster started (as an epoch timestamp), while the new metric reports the number of seconds elapsed since that start, so any queries or alerts using the old metric must be adjusted. Note the old metric name has not yet been dropped and will still work, but it will be dropped in an upcoming version of pgMonitor. + + * For PostgreSQL 9.4, 9.5 & 9.6, renamed metric `ccp_wal_activity_count` to `ccp_wal_activity_total_size_bytes`. The actual value being returned has always been the total size in bytes, so the previous name was misleading. PostgreSQL 10+ already had the metric with the proper bytes size name. Note the old metric name has not yet been dropped and will still work, but it will be dropped in an upcoming version of pgMonitor. + + ## 4.2 ### New Features diff --git a/hugo/content/exporter/_index.md b/hugo/content/exporter/_index.md index 89be3d37..9010638e 100644 --- a/hugo/content/exporter/_index.md +++ b/hugo/content/exporter/_index.md @@ -15,7 +15,11 @@ The Linux instructions below use RHEL, but any Linux-based system should work. [ - [RHEL / CentOS 7 (preferred)](#setup-on-rhel-centos-7-preferred) - [RHEL / CentOS 6](#installation-setup-on-rhel-centos-6) - [Windows Server 2012R2](#windows-server-2012r2) +- [Metrics Collected](#metrics-collected) + - [PostgreSQL](#postgresql) + - [System](#system) + ## Installation ### RPM installs @@ -393,3 +397,179 @@ Install the WMI and PostgreSQL exporters by: 14. Finally, confirm the per-db eporter is functional by loading [http://localhost:9188/metrics](http://localhost:9188/metrics) in your browser: ![client installer 13](/images/client_installer_13.png) + +## Metrics Collected + +The metrics collected by our exporters are outlined below. + +### PostgreSQL + +PostgreSQL metrics are collected by the [postgres_exporter](https://github.com/wrouesnel/postgres_exporter). pgMonitor uses custom queries for its PG metrics. 
The default metrics that postgres_exporter comes with are all disabled except for the `pg_up` metric. + +#### General Metrics + +*pg_up* - Database is up and connectable by metric collector. This is the only metric that comes with postgres_exporter that is currently used. + +#### Common Metrics + +Metrics contained in the `queries_common.yml` file. These metrics are common to all versions of PostgreSQL and are recommended as a minimum default for the global exporter. + + * *ccp_archive_command_status_seconds_since_last_fail* - Seconds since the last `archive_command` run failed. If zero, the `archive_command` is succeeding without error. + + * *ccp_database_size_bytes* - Total size of each database in PostgreSQL instance + + * *ccp_is_in_recovery_status* - Current value of the pg_is_in_recovery() function expressed as 1 for true (instance is a replica) and 2 for false (instance is a primary) + + * *ccp_locks_count* - Count of active lock types per database + + * *ccp_pg_settings_checksum_status* - Value of checksum monitoring status for pg_catalog.pg_settings (postgresql.conf). 0 = valid config. 1 = settings changed. Settings history is available for review in the table `monitor.pg_settings_checksum`. To reset current config to valid after alert, run monitor.pg_settings_checksum_set_valid(). Note this will clear the history table. + + * *ccp_postmaster_uptime_seconds* - Time interval in seconds since PostgreSQL database was last restarted + + * *ccp_postgresql_version_current* - Version of PostgreSQL that this exporter is monitoring. Value is the 6 digit integer returned by the `server_version_num` PostgreSQL configuration variable to allow easy monitoring for version changes. + + * *ccp_sequence_exhaustion_count* - Checks for any sequences that may be close to exhaustion (by default greater than 75% usage). Note this checks the sequences themselves, not the values contained in the columns that use said sequences. 
Function `monitor.sequence_status()` can provide more details if run directly on database instance. + + * *ccp_settings_pending_restart_count* - Number of settings from pg_settings catalog in a pending_restart state. This value is from the similarly named column found in pg_catalog.pg_settings. + +The meaning of the following `ccp_transaction_wraparound` metrics, and how to manage when they are triggered, is covered more extensively in this blog post: https://info.crunchydata.com/blog/managing-transaction-id-wraparound-in-postgresql + + * *ccp_transaction_wraparound_percent_towards_emergency_autovac* - Recommended thresholds set to 75%/95% when first evaluating vacuum settings on new systems. Once those have been reviewed and at least one instance-wide vacuum has been run, recommend thresholds of 110%/125%. Alerting above 100% for extended periods of time means that autovacuum is not able to keep up with current transaction rate and needs further tuning. + + * *ccp_transaction_wraparound_percent_towards_wraparound* - Recommend thresholds set to 50%/75%. If any of these thresholds is tripped, current vacuum settings must be evaluated and tuned ASAP. If critical threshold is reached, it is vitally important that vacuum be run on tables with old transaction IDs to avoid the cluster being forced to shut down and only be able to run in single user mode. + +The following `ccp_stat_bgwriter` metrics are statistics collected from the [pg_stat_bgwriter](https://www.postgresql.org/docs/current/monitoring-stats.html#PG-STAT-BGWRITER-VIEW) view for monitoring performance. These metrics cover important performance information about flushing data out to disk. Please see the documentation for further details on these metrics. 
+ + * *ccp_stat_bgwriter_buffers_alloc* + + * *ccp_stat_bgwriter_buffers_backend* + + * *ccp_stat_bgwriter_buffers_backend_fsync* + + * *ccp_stat_bgwriter_buffers_checkpoint* + + * *ccp_stat_bgwriter_buffers_clean* + +The following `ccp_stat_database_*` metrics are statistics collected from the [pg_stat_database](https://www.postgresql.org/docs/current/monitoring-stats.html#PG-STAT-DATABASE-VIEW) view. + + * *ccp_stat_database_blks_hit* + + * *ccp_stat_database_blks_read* + + * *ccp_stat_database_conflicts* + + * *ccp_stat_database_deadlocks* + + * *ccp_stat_database_tup_deleted* + + * *ccp_stat_database_tup_fetched* + + * *ccp_stat_database_tup_inserted* + + * *ccp_stat_database_tup_returned* + + * *ccp_stat_database_tup_updated* + + * *ccp_stat_database_xact_commit* + + * *ccp_stat_database_xact_rollback* + +#### PostgreSQL Version Specific Metrics + +The following metrics either require special considerations when monitoring specific versions of PostgreSQL, or are only available for specific versions. These metrics are found in the `queries_pg##.yml` files, where ## is the major version of PG. Unless otherwise noted, the below metrics are available for all versions of PG. These metrics are recommended as a minimum default for the global exporter. + + * *ccp_connection_stats_active* - Count of active connections + + * *ccp_connection_stats_idle* - Count of idle connections + + * *ccp_connection_stats_idle_in_txn* - Count of idle in transaction connections + + * *ccp_connection_stats_max_connections* - Current value of max_connections for reference + + * *ccp_connection_stats_max_idle_in_txn_time* - Runtime of longest idle in transaction (IIT) session. + + * *ccp_connection_stats_max_query_time* - Runtime of longest general query (inclusive of IIT). + + * *ccp_replication_lag_replay_time* - Only provides values on replica instances. Time since replica received and replayed a WAL file. Note this is not the main way to determine if a replica is behind its primary. 
It only monitors the time the replica replayed the WAL vs what it has received. It is a secondary metric for monitoring WAL replay on the replica itself. + + * *ccp_replication_lag_size_bytes* - Only provides values on instances that have attached replicas (primary, cascading replica). Tracks byte lag of every streaming replica connected to this database instance. This is the main way that replication lag is monitored. Note that if you have WAL replay only replicas, this will not be reflected here. + + * *ccp_replication_slots_active* - Active state of given replication slot. 1 = true. 0 = false. + + * *ccp_replication_slots_retained_bytes* - The amount of WAL (in bytes) being retained for given slot. + + * *ccp_wal_activity_total_size_bytes* - Current size in bytes of the WAL directory + + * *ccp_wal_activity_last_5_min_size_bytes* - PostgreSQL 10 and later only. Current size in bytes of the last 5 minutes of WAL generation. Includes recycled WALs. + + * *ccp_pg_hba_checksum_status* - PostgreSQL 10 and later only. Value of checksum monitoring status for pg_catalog.pg_hba_file_rules (pg_hba.conf). 0 = valid config. 1 = settings changed. Settings history is available for review in the table `monitor.pg_hba_checksum`. To reset current config to valid after alert, run monitor.pg_hba_checksum_set_valid(). Note this will clear the history table. + + * *ccp_data_checksum_failure_count* - PostgreSQL 12 and later only. Total number of checksum failures on this database. + + * *ccp_data_checksum_failure_time_since_last_failure_seconds* - PostgreSQL 12 and later only. Time interval in seconds since the last checksum failure was encountered. + +#### Backup Metrics + +Backup monitoring only covers pgBackRest at this time. These metrics are found in the `queries_backrest.yml` file. These metrics only need to be collected once per database instance so should be collected by the global postgres_exporter. 
+ + * *ccp_backrest_last_full_backup_time_since_completion_seconds* - Time since completion of last pgBackRest FULL backup + + * *ccp_backrest_last_diff_backup_time_since_completion_seconds* - Time since completion of last pgBackRest DIFFERENTIAL backup. Note that FULL backup counts as a successful DIFFERENTIAL for the given stanza. + + * *ccp_backrest_last_incr_backup_time_since_completion_seconds* - Time since completion of last pgBackRest INCREMENTAL backup. Note that both FULL and DIFFERENTIAL backups count as a successful INCREMENTAL for the given stanza. + + * *ccp_backrest_last_info_runtime_backup_runtime_seconds* - Last successful runtime of each backup type (full/diff/incr). + + * *ccp_backrest_last_info_repo_backup_size_bytes* - Actual size of only this individual backup in the pgbackrest repository + + * *ccp_backrest_last_info_repo_total_size_bytes* - Total size of this backup in the pgbackrest repository, including all required previous backups and WAL + +#### Per-Database Metrics + +These are metrics that are only available on a per-database level. These metrics are found in the `queries_per_db.yml` file. These metrics are optional and recommended for the non-global, per-db postgres_exporter. They can be included in the global exporter as well if the global database needs per-database metrics monitored. Please note that depending on the number of objects in your database, collecting these metrics can greatly increase the storage requirements for Prometheus since all of these metrics are being collected for each individual object. + + * *ccp_table_size_size_bytes* - Table size inclusive of all indexes in that table + +The following `ccp_stat_user_tables_*` metrics are statistics collected from the [pg_stat_user_tables](https://www.postgresql.org/docs/current/monitoring-stats.html#PG-STAT-ALL-TABLES-VIEW) view. Please see the PG documentation for descriptions of these metrics.
+ + * *ccp_stat_user_tables_analyze_count* + + * *ccp_stat_user_tables_autoanalyze_count* + + * *ccp_stat_user_tables_autovacuum_count* + + * *ccp_stat_user_tables_n_tup_del* + + * *ccp_stat_user_tables_n_tup_ins* + + * *ccp_stat_user_tables_n_tup_upd* + + * *ccp_stat_user_tables_vacuum_count* + +#### Bloat Metrics + +Bloat metrics are only available if the `pg_bloat_check` script has been set up to run. See instructions above. These metrics are found in the `queries_bloat.yml` file. These metrics are per-database, so they should be used by the per-db postgres_exporter. + + * *ccp_bloat_check_size_bytes* - Size of object in bytes + + * *ccp_bloat_check_total_wasted_space_bytes* - Total wasted space in bytes of given object + +#### pgBouncer Metrics + +The following metric prefixes correspond to the SHOW command views found in the [pgBouncer documentation](https://www.pgbouncer.org/usage.html). Each column found in the SHOW view is a separate metric under the respective prefix. Ex: `ccp_pgbouncer_pools_client_active` corresponds to the `SHOW POOLS` view's `client_active` column. These metrics are found in the `queries_bouncer.yml` file. These metrics only need to be collected once per database instance so should be collected by the global postgres_exporter. + + * *ccp_pgbouncer_pools* - SHOW POOLS + + * *ccp_pgbouncer_databases* - SHOW DATABASES + + * *ccp_pgbouncer_clients* - SHOW CLIENTS + + * *ccp_pgbouncer_servers* - SHOW SERVERS + + * *ccp_pgbouncer_lists* - SHOW LISTS + +### System + +\*NIX Operating System metrics (Linux, BSD, etc) are collected using the [node_exporter](https://github.com/prometheus/node_exporter) provided by the Prometheus team. pgMonitor only collects the default metrics provided by node_exporter, but many additional metrics are available if needed. + +Windows Operating System metrics are collected by the [wmi_exporter](https://github.com/martinlindhe/wmi_exporter).
diff --git a/prometheus/alert-rules.d/crunchy-alert-rules-blackbox.yml.example b/prometheus/alert-rules.d/crunchy-alert-rules-blackbox.yml.example new file mode 100644 index 00000000..c5d157de --- /dev/null +++ b/prometheus/alert-rules.d/crunchy-alert-rules-blackbox.yml.example @@ -0,0 +1,14 @@ +groups: +- name: alert-rules + rules: + + ########## BLACKBOX EXPORTER RULES ########## + - alert: BlackBoxProbe + expr: probe_success == 0 + for: 60s + labels: + service: blackbox + severity: critical + severity_num: 300 + annotations: + summary: 'Blackbox probe {{ $labels.job }} for instance {{ $labels.instance }} is failing' diff --git a/prometheus/alert-rules.d/crunchy-alert-rules-etcd.yml.example b/prometheus/alert-rules.d/crunchy-alert-rules-etcd.yml.example new file mode 100644 index 00000000..cb0dd837 --- /dev/null +++ b/prometheus/alert-rules.d/crunchy-alert-rules-etcd.yml.example @@ -0,0 +1,39 @@ +groups: +- name: alert-rules + rules: + +########## ETCD EXPORTER RULES ########## + +# Absence alerts must be configured per named job, otherwise there's no way to know which job is down +# Below are some examples using the leader metric for targets called "etcd#" for a 3 node etcd cluster + +# - alert: ECTDAbsent_etcd1 +# expr: absent(etcd_server_has_leader{job="etcd1"}) +# for: 10s +# labels: +# service: etcd +# severity: critical +# severity_num: 300 +# annotations: +# description: 'Leader metric is absent from target {{ $labels.job }}. Check that etcd is running on target host.' + +# - alert: ECTDAbsent_etcd2 +# expr: absent(etcd_server_has_leader{job="etcd2"}) +# for: 10s +# labels: +# service: etcd +# severity: critical +# severity_num: 300 +# annotations: +# description: 'Leader metric is absent from target {{ $labels.job }}. Check that etcd is running on target host.'
+ +# - alert: ECTDAbsent_etcd3 +# expr: absent(etcd_server_has_leader{job="etcd3"}) +# for: 10s +# labels: +# service: etcd +# severity: critical +# severity_num: 300 +# annotations: +# description: 'Leader metric is absent from target {{ $labels.job }}. Check that etcd is running on target host.' + diff --git a/prometheus/crunchy-prometheus.yml b/prometheus/crunchy-prometheus.yml index 2bcafbf6..33513d76 100644 --- a/prometheus/crunchy-prometheus.yml +++ b/prometheus/crunchy-prometheus.yml @@ -37,6 +37,33 @@ scrape_configs: target_label: server replacement: "" + ## Monitoring for tcp services that don't have an associated exporter can be accomplished using the tcp probe + ## of the blackbox_exporter provided by the Prometheus developers. + ## Note this only provides a simple up/down that the service is listening on the given IP/port. + ## If an exporter is available, it is advised to use an Absence alert vs the blackbox probe. + ## Below is an example to monitor the services indicated by the comment. + ## The "targets" list is all that should need to be edited to customize to your setup assuming blackbox_exporter runs + ## on same system as Prometheus. + # - job_name: 'blackbox_tcp_services' + # metrics_path: /probe + # params: + # module: [tcp_connect] + # relabel_configs: + # - source_labels: [__address__] + # target_label: __param_target + # - source_labels: [__param_target] + # target_label: instance + # - target_label: __address__ + # replacement: 127.0.0.1:9115 + # static_configs: + # - targets: + # - 127.0.0.1:3000 # grafana + # - 192.168.122.16:8009 # patroni + # - 192.168.122.26:8009 # patroni + # - 192.168.122.36:8009 # patroni + # - 192.168.122.12:7000 # haproxy + + #### Uncomment below if using alertmanager #### # rule_files: