From c5a8c21ee1dc2cfffedfc2a5c3586627e268c2d6 Mon Sep 17 00:00:00 2001 From: Keith Fiske Date: Tue, 22 Sep 2020 15:00:43 -0400 Subject: [PATCH] Add pg_stat_statements metrics & dashboard (#200) --- ...crunchy-postgres-exporter-pg10-el6.service | 2 +- ...crunchy-postgres-exporter-pg11-el6.service | 2 +- ...crunchy-postgres-exporter-pg12-el6.service | 2 +- ...crunchy-postgres-exporter-pg94-el6.service | 83 -- ...crunchy-postgres-exporter-pg95-el6.service | 2 +- ...crunchy-postgres-exporter-pg96-el6.service | 2 +- .../crunchy-postgres-exporter@.service | 3 +- exporter/postgres/queries_pg94.yml | 113 --- .../postgres/queries_pg_stat_statements.yml | 134 +++ .../queries_pg_stat_statements_full.yml | 108 +++ exporter/postgres/setup_pg94.sql | 264 ------ .../postgres/sysconfig.postgres_exporter_pg10 | 1 + .../postgres/sysconfig.postgres_exporter_pg11 | 1 + .../postgres/sysconfig.postgres_exporter_pg12 | 1 + .../postgres/sysconfig.postgres_exporter_pg13 | 1 + .../postgres/sysconfig.postgres_exporter_pg94 | 19 - .../sysconfig.postgres_exporter_pg94_per_db | 17 - .../postgres/sysconfig.postgres_exporter_pg95 | 1 + .../postgres/sysconfig.postgres_exporter_pg96 | 1 + grafana/Overview.json | 56 +- grafana/QueryStatistics.json | 813 ++++++++++++++++++ hugo/content/changelog/_index.md | 4 + hugo/content/exporter/_index.md | 21 + 23 files changed, 1140 insertions(+), 511 deletions(-) delete mode 100644 exporter/postgres/crunchy-postgres-exporter-pg94-el6.service delete mode 100644 exporter/postgres/queries_pg94.yml create mode 100644 exporter/postgres/queries_pg_stat_statements.yml create mode 100644 exporter/postgres/queries_pg_stat_statements_full.yml delete mode 100644 exporter/postgres/setup_pg94.sql delete mode 100644 exporter/postgres/sysconfig.postgres_exporter_pg94 delete mode 100644 exporter/postgres/sysconfig.postgres_exporter_pg94_per_db create mode 100644 grafana/QueryStatistics.json diff --git a/exporter/postgres/crunchy-postgres-exporter-pg10-el6.service b/exporter/postgres/crunchy-postgres-exporter-pg10-el6.service index f486aba7..c81bc639 100644 --- a/exporter/postgres/crunchy-postgres-exporter-pg10-el6.service +++ b/exporter/postgres/crunchy-postgres-exporter-pg10-el6.service @@ -36,7 +36,7 @@ start() { echo -n $"Starting postgres_exporter: " set -o pipefail - cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/') + cat $QUERY_FILE_LIST | sed -e "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" -e "s/#PG_STAT_STATEMENTS_LIMIT#/${PG_STAT_STATEMENTS_LIMIT}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/') echo $DATA_SOURCE_NAME daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT RETVAL=$? diff --git a/exporter/postgres/crunchy-postgres-exporter-pg11-el6.service b/exporter/postgres/crunchy-postgres-exporter-pg11-el6.service index b4cfb9c7..2e9d620a 100644 --- a/exporter/postgres/crunchy-postgres-exporter-pg11-el6.service +++ b/exporter/postgres/crunchy-postgres-exporter-pg11-el6.service @@ -36,7 +36,7 @@ start() { echo -n $"Starting postgres_exporter: " set -o pipefail - cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/') + cat $QUERY_FILE_LIST | sed -e "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" -e "s/#PG_STAT_STATEMENTS_LIMIT#/${PG_STAT_STATEMENTS_LIMIT}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/') echo $DATA_SOURCE_NAME daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT RETVAL=$? diff --git a/exporter/postgres/crunchy-postgres-exporter-pg12-el6.service b/exporter/postgres/crunchy-postgres-exporter-pg12-el6.service index fbf02d19..bb20e2c2 100644 --- a/exporter/postgres/crunchy-postgres-exporter-pg12-el6.service +++ b/exporter/postgres/crunchy-postgres-exporter-pg12-el6.service @@ -36,7 +36,7 @@ start() { echo -n $"Starting postgres_exporter: " set -o pipefail - cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/') + cat $QUERY_FILE_LIST | sed -e "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" -e "s/#PG_STAT_STATEMENTS_LIMIT#/${PG_STAT_STATEMENTS_LIMIT}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/') echo $DATA_SOURCE_NAME daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT RETVAL=$? diff --git a/exporter/postgres/crunchy-postgres-exporter-pg94-el6.service b/exporter/postgres/crunchy-postgres-exporter-pg94-el6.service deleted file mode 100644 index da3e71b5..00000000 --- a/exporter/postgres/crunchy-postgres-exporter-pg94-el6.service +++ /dev/null @@ -1,83 +0,0 @@ -#!/bin/bash -# -# /etc/init.d/postgres_exporter -# -# chkconfig: 2345 80 80 -# - - -# Source function library. -. /etc/init.d/functions - - -RETVAL=0 -PROG="postgres_exporter" -SYSCONFIG="postgres_exporter_pg94" -DAEMON_SYSCONFIG=/etc/sysconfig/${SYSCONFIG} -DAEMON=/usr/bin/${PROG} -PID_FILE=/var/run/${PROG}/${SYSCONFIG}.pid -LOCK_FILE=/var/lock/subsys/${SYSCONFIG} -LOG_FILE=/var/log/${PROG}/${SYSCONFIG}.log -DAEMON_USER="ccp_monitoring" -GOMAXPROCS=$(grep -c ^processor /proc/cpuinfo) - -. ${DAEMON_SYSCONFIG} - -# Env vars are reset by init.d, so explicitly export so exporter can see it -export DATA_SOURCE_NAME=$DATA_SOURCE_NAME - -start() { - if check_status > /dev/null; then - echo "postgres_exporter is already running" - exit 0 - fi - -# do_start_prepare - - echo -n $"Starting postgres_exporter: " - set -o pipefail - cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/') - echo $DATA_SOURCE_NAME - daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT - RETVAL=$? - echo - return $RETVAL -} - -stop() { - echo -n $"Stopping postgres_exporter: " - killproc -p ${PID_FILE} -d 10 ${DAEMON} - RETVAL=$? - echo - [ $RETVAL = 0 ] && rm -f ${LOCK_FILE} ${PID_FILE} - return $RETVAL -} - -check_status() { - status -p ${PID_FILE} ${DAEMON} - RETVAL=$? - return $RETVAL -} - -case "$1" in - start) - start - ;; - stop) - stop - ;; - status) - check_status - ;; - restart) - stop - start - ;; - *) - N=/etc/init.d/${NAME} - echo "Usage: $N {start|stop|status|restart}" >&2 - RETVAL=2 - ;; -esac - -exit ${RETVAL} diff --git a/exporter/postgres/crunchy-postgres-exporter-pg95-el6.service b/exporter/postgres/crunchy-postgres-exporter-pg95-el6.service index 5d9d5c09..ae05c7e7 100644 --- a/exporter/postgres/crunchy-postgres-exporter-pg95-el6.service +++ b/exporter/postgres/crunchy-postgres-exporter-pg95-el6.service @@ -36,7 +36,7 @@ start() { echo -n $"Starting postgres_exporter: " set -o pipefail - cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/') + cat $QUERY_FILE_LIST | sed -e "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" -e "s/#PG_STAT_STATEMENTS_LIMIT#/${PG_STAT_STATEMENTS_LIMIT}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/') echo $DATA_SOURCE_NAME daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT RETVAL=$? diff --git a/exporter/postgres/crunchy-postgres-exporter-pg96-el6.service b/exporter/postgres/crunchy-postgres-exporter-pg96-el6.service index eebb4df5..131e3f41 100644 --- a/exporter/postgres/crunchy-postgres-exporter-pg96-el6.service +++ b/exporter/postgres/crunchy-postgres-exporter-pg96-el6.service @@ -36,7 +36,7 @@ start() { echo -n $"Starting postgres_exporter: " set -o pipefail - cat $QUERY_FILE_LIST | sed "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/') + cat $QUERY_FILE_LIST | sed -e "s/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g" -e "s/#PG_STAT_STATEMENTS_LIMIT#/${PG_STAT_STATEMENTS_LIMIT}/g" > $(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/') echo $DATA_SOURCE_NAME daemonize -u ${DAEMON_USER} -p ${PID_FILE} -l ${LOCK_FILE} -a -e ${LOG_FILE} -o ${LOG_FILE} ${DAEMON} $OPT RETVAL=$? diff --git a/exporter/postgres/crunchy-postgres-exporter@.service b/exporter/postgres/crunchy-postgres-exporter@.service index 9a1815ee..45c60282 100644 --- a/exporter/postgres/crunchy-postgres-exporter@.service +++ b/exporter/postgres/crunchy-postgres-exporter@.service @@ -10,7 +10,7 @@ After=network.target PermissionsStartOnly=true User=ccp_monitoring EnvironmentFile=/etc/sysconfig/%i -ExecStartPre=/bin/bash -c "set -o pipefail; cat $QUERY_FILE_LIST | sed 's/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g'> $$(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')" +ExecStartPre=/bin/bash -c "set -o pipefail; cat $QUERY_FILE_LIST | sed -e 's/#PGBACKREST_INFO_THROTTLE_MINUTES#/${PGBACKREST_INFO_THROTTLE_MINUTES}/g' -e 's/#PG_STAT_STATEMENTS_LIMIT#/${PG_STAT_STATEMENTS_LIMIT}/g' > $$(echo $OPT | sed 's/.*--extend.query-path=\(.*\.yml\).*/\1/')" ExecStart=/usr/bin/postgres_exporter $OPT ExecReload=/usr/bin/kill -HUP $MAINPID Restart=always @@ -18,4 +18,3 @@ Restart=always [Install] WantedBy=multi-user.target DefaultInstance=postgres_exporter - diff --git a/exporter/postgres/queries_pg94.yml b/exporter/postgres/queries_pg94.yml deleted file mode 100644 index 8c218084..00000000 --- a/exporter/postgres/queries_pg94.yml +++ /dev/null @@ -1,113 +0,0 @@ -### -# -# Begin File: queries_pg94.yml -# -### - -ccp_connection_stats: - query: "select ((total - idle) - idle_in_txn) as active - , total - , idle - , idle_in_txn - , (select coalesce(extract(epoch from (max(now() - state_change))),0) from monitor.pg_stat_activity() where state = 'idle in transaction') as max_idle_in_txn_time - , (select coalesce(extract(epoch from (max(now() - query_start))),0) from monitor.pg_stat_activity() where state <> 'idle') as max_query_time - , max_connections - from ( - select count(*) as total - , coalesce(sum(case when state = 'idle' then 1 else 0 end),0) as idle - , coalesce(sum(case when state = 'idle in transaction' then 1 else 0 end),0) as idle_in_txn from monitor.pg_stat_activity()) x - join (select setting::float AS max_connections FROM pg_settings WHERE name = 'max_connections') xx ON (true);" - metrics: - - active: - usage: "GAUGE" - description: "Total non-idle connections" - - total: - usage: "GAUGE" - description: "Total idle and non-idle connections" - - idle: - usage: "GAUGE" - description: "Total idle connections" - - idle_in_txn: - usage: "GAUGE" - description: "Total idle in transaction connections" - - max_idle_in_txn_time: - usage: "GAUGE" - description: "Length of time in seconds of the longest idle in transaction session" - - max_query_time: - usage: "GAUGE" - description: "Length of time in seconds of the longest running query" - - max_connections: - usage: "GAUGE" - description: "Value of max_connections for the monitored database" - - -ccp_replication_lag: - query: "SELECT - CASE - WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location() THEN 0 - ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER - END - AS replay_time" - metrics: - - replay_time: - usage: "GAUGE" - description: "Length of time since the last transaction was replayed on replica. Will always increase if no writes on primary." - - -ccp_replication_lag_size: - query: "SELECT replica_addr::text as replica - , replica_hostname::text - , replica_port::text - , byte_lag::bigint as bytes - FROM monitor.streaming_replica_check();" - metrics: - - replica: - usage: "LABEL" - description: "Replica IP address" - - replica_hostname: - usage: "LABEL" - description: "Hostname of replica system (if available)" - - replica_port: - usage: "LABEL" - description: "Replica port" - - bytes: - usage: "GAUGE" - description: "Replication lag in bytes between primary and replica" - - -ccp_replication_slots: - query: "SELECT slot_name, active::int, pg_xlog_location_diff(pg_current_xlog_insert_location(), restart_lsn) AS retained_bytes FROM pg_catalog.pg_replication_slots" - metrics: - - slot_name: - usage: "LABEL" - description: "Name of replication slot" - - active: - usage: "GAUGE" - description: "Active state of slot. 1 = true. 0 = false." - - retained_bytes: - usage: "GAUGE" - description: "The amount of WAL (in bytes) being retained for this slot" - -ccp_wal_activity: - query: "SELECT (SELECT setting::int8 FROM pg_settings WHERE name = 'wal_segment_size') * - (SELECT setting::int8 FROM pg_settings WHERE name = 'wal_block_size') * - (SELECT count(*) FROM monitor.pg_ls_waldir()) AS total_size_bytes;" - metrics: - - count: - usage: "GAUGE" - description: "Current size in bytes of the WAL directory. Note this metric has been deprecated as of pgMonitor 4.3 and will be removed in a future version. Use ccp_wal_activity_total_size_bytes instead." - -ccp_wal_activity: - query: "SELECT (SELECT setting::int8 FROM pg_settings WHERE name = 'wal_segment_size') * - (SELECT setting::int8 FROM pg_settings WHERE name = 'wal_block_size') * - (SELECT count(*) FROM monitor.pg_ls_waldir()) AS total_size_bytes;" - metrics: - - total_size_bytes: - usage: "GAUGE" - description: "Current size in bytes of the WAL directory" - -### -# -# End File: queries_pg94.yml -# -### diff --git a/exporter/postgres/queries_pg_stat_statements.yml b/exporter/postgres/queries_pg_stat_statements.yml new file mode 100644 index 00000000..f044211a --- /dev/null +++ b/exporter/postgres/queries_pg_stat_statements.yml @@ -0,0 +1,134 @@ +### +# +# Begin File: queries_pg_stat_statements.yml +# +### + +ccp_pg_stat_statements_total: + query: "SELECT pg_get_userbyid(s.userid) as role, + d.datname AS dbname, + sum(s.calls) AS calls_count, + sum(s.total_time) AS time_ms, + avg(s.mean_time) AS mean_time_ms, + sum(s.rows) AS row_count + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d + ON d.oid = s.dbid + GROUP BY 1,2" + metrics: + - role: + usage: "LABEL" + description: "Role that executed the statement" + - dbname: + usage: "LABEL" + description: "Database in which the statement was executed" + - calls_count: + usage: "GAUGE" + description: "Total number of queries run per user/database" + - time_ms: + usage: "GAUGE" + description: "Total runtime of all queries per user/database" + - mean_time_ms: + usage: "GAUGE" + description: "Mean runtime of all queries per user/database" + - row_count: + usage: "GAUGE" + description: "Total rows returned from all queries per user/database" + + +ccp_pg_stat_statements_top_mean: + query: "SELECT pg_get_userbyid(s.userid) as role, + d.datname AS dbname, + s.queryid, + btrim(replace(left(s.query, 40), '\n', '')) AS query, + max(s.mean_time) time_ms + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d + ON d.oid = s.dbid + GROUP BY 1,2,3,4 + ORDER BY 5 DESC + LIMIT #PG_STAT_STATEMENTS_LIMIT#" + metrics: + - role: + usage: "LABEL" + description: "Role that executed the statement" + - dbname: + usage: "LABEL" + description: "Database in which the statement was executed" + - queryid: + usage: "LABEL" + description: "Internal hash code, computed from the statement's parse tree" + - query: + usage: "LABEL" + description: "First 40 characters of query text" + - time_ms: + usage: "GAUGE" + description: "Average query runtime in milliseconds" + + +# Note that individual query stats can only be reset in PG12 +ccp_pg_stat_statements_top_total: + query: "SELECT pg_get_userbyid(s.userid) as role, + d.datname AS dbname, + s.queryid, + btrim(replace(left(s.query, 40), '\n', '')) AS query, + s.total_time time_ms + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d + ON d.oid = s.dbid + ORDER BY 5 DESC + LIMIT #PG_STAT_STATEMENTS_LIMIT#" + metrics: + - role: + usage: "LABEL" + description: "Role that executed the statement" + - dbname: + usage: "LABEL" + description: "Database in which the statement was executed" + - queryid: + usage: "LABEL" + description: "Internal hash code, computed from the statement's parse tree" + - query: + usage: "LABEL" + description: "First 40 characters of query text" + - time_ms: + usage: "GAUGE" + description: "Total time spent in the statement in milliseconds" + + +# Note that individual query stats can only be reset in PG12 +ccp_pg_stat_statements_top_max: + query: "SELECT pg_get_userbyid(s.userid) as role, + d.datname AS dbname, + s.queryid, + btrim(replace(left(s.query, 40), '\n', '')) AS query, + s.max_time AS time_ms + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d + ON d.oid = s.dbid + ORDER BY 5 DESC + LIMIT #PG_STAT_STATEMENTS_LIMIT#" + metrics: + - role: + usage: "LABEL" + description: "Role that executed the statement" + - dbname: + usage: "LABEL" + description: "Database in which the statement was executed" + - queryid: + usage: "LABEL" + description: "Internal hash code, computed from the statement's parse tree" + - query: + usage: "LABEL" + description: "First 40 characters of query text" + - time_ms: + usage: "GAUGE" + description: "Maximum time spent in the statement in milliseconds" + + +### +# +# End File: queries_pg_stat_statements.yml +# +### + diff --git a/exporter/postgres/queries_pg_stat_statements_full.yml b/exporter/postgres/queries_pg_stat_statements_full.yml new file mode 100644 index 00000000..288282ee --- /dev/null +++ b/exporter/postgres/queries_pg_stat_statements_full.yml @@ -0,0 +1,108 @@ +### +# +# Begin File: queries_pg_stat_statements_full.yml +# +### + +# Gathers all metrics for per query monitoring. WARNING: This can greatly affect Prometheus storage and performance +ccp_pg_stat_statements: + query: "SELECT + pg_get_userbyid(s.userid) as role, + d.datname AS dbname, + s.queryid, + s.calls, + s.total_time AS total_time_ms, + s.min_time AS min_time_ms, + s.max_time AS max_time_ms, + s.mean_time AS mean_time_ms, + s.stddev_time AS stddev_time_ms, + s.rows, + s.shared_blks_hit, + s.shared_blks_read, + s.shared_blks_dirtied, + s.shared_blks_written, + s.local_blks_hit, + s.local_blks_read, + s.local_blks_dirtied, + s.local_blks_written, + s.temp_blks_read, + s.temp_blks_written, + s.blk_read_time AS blk_read_time_ms, + s.blk_write_time AS blk_write_time_ms + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d + ON d.oid = s.dbid" + metrics: + - role: + usage: "LABEL" + description: "Role that executed the statement" + - dbname: + usage: "LABEL" + description: "Database in which the statement was executed" + - queryid: + usage: "LABEL" + description: "Internal hash code, computed from the statement's parse tree" + - calls: + usage: "GAUGE" + description: "Number of times executed" + - total_time_ms: + usage: "GAUGE" + description: "Total time spent in the statement, in milliseconds" + - min_time_ms: + usage: "GAUGE" + description: "Minimum time spent in the statement, in milliseconds" + - max_time_ms: + usage: "GAUGE" + description: "Maximum time spent in the statement, in milliseconds" + - mean_time_ms: + usage: "GAUGE" + description: "Mean time spent in the statement, in milliseconds" + - stddev_time_ms: + usage: "GAUGE" + description: "Population standard deviation of time spent in the statement, in milliseconds" + - rows: + usage: "GAUGE" + description: "Total number of rows retrieved or affected by the statement" + - shared_blks_hit: + usage: "GAUGE" + description: "Total number of shared block cache hits by the statement" + - shared_blks_read: + usage: "GAUGE" + description: "Total number of shared blocks read by the statement" + - shared_blks_dirtied: + usage: "GAUGE" + description: "Total number of shared blocks dirtied by the statement" + - shared_blks_written: + usage: "GAUGE" + description: "Total number of shared blocks written by the statement" + - local_blks_hit: + usage: "GAUGE" + description: "Total number of local block cache hits by the statement" + - local_blks_read: + usage: "GAUGE" + description: "Total number of local blocks read by the statement" + - local_blks_dirtied: + usage: "GAUGE" + description: "Total number of local blocks dirtied by the statement" + - local_blks_written: + usage: "GAUGE" + description: "Total number of local blocks written by the statement" + - temp_blks_read: + usage: "GAUGE" + description: "Total number of temp blocks read by the statement" + - temp_blks_written: + usage: "GAUGE" + description: "Total number of temp blocks written by the statement" + - blk_read_time_ms: + usage: "GAUGE" + description: "Total time the statement spent reading blocks, in milliseconds (if track_io_timing is enabled, otherwise zero)" + - blk_write_time_ms: + usage: "GAUGE" + description: "Total time the statement spent writing blocks, in milliseconds (if track_io_timing is enabled, otherwise zero)" + +### +# +# End File: queries_pg_stat_statements_full.yml +# +### + diff --git a/exporter/postgres/setup_pg94.sql b/exporter/postgres/setup_pg94.sql deleted file mode 100644 index 91da8337..00000000 --- a/exporter/postgres/setup_pg94.sql +++ /dev/null @@ -1,264 +0,0 @@ -DO $$ -BEGIN - IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'ccp_monitoring') THEN - CREATE ROLE ccp_monitoring WITH LOGIN; - END IF; -END -$$; - -ALTER ROLE ccp_monitoring SET lock_timeout TO '2min'; - -CREATE SCHEMA IF NOT EXISTS monitor AUTHORIZATION ccp_monitoring; - -DROP FUNCTION IF EXISTS monitor.pg_stat_activity(); -CREATE OR REPLACE FUNCTION monitor.pg_stat_activity() RETURNS SETOF pg_catalog.pg_stat_activity - LANGUAGE plpgsql SECURITY DEFINER -AS $$ -BEGIN - RETURN query(SELECT * FROM pg_catalog.pg_stat_activity); -END -$$; - -REVOKE ALL ON FUNCTION monitor.pg_stat_activity() FROM PUBLIC; - - -DROP FUNCTION IF EXISTS monitor.streaming_replica_check(); -CREATE OR REPLACE FUNCTION monitor.streaming_replica_check() RETURNS TABLE (replica_hostname text, replica_addr inet, replica_port int, byte_lag numeric) - LANGUAGE SQL SECURITY DEFINER -AS $$ - SELECT client_hostname as replica_hostname - , client_addr as replica_addr - , client_port as replica_port - , pg_xlog_location_diff(pg_stat_replication.sent_location, pg_stat_replication.replay_location) AS byte_lag - FROM pg_catalog.pg_stat_replication; -$$; - -REVOKE ALL ON FUNCTION monitor.streaming_replica_check() FROM PUBLIC; - - --- Drop previously unused version of this function if it exists from older pgmonitor installs -DROP FUNCTION IF EXISTS monitor.pg_ls_wal_dir(text); - -CREATE OR REPLACE FUNCTION monitor.pg_ls_waldir() RETURNS SETOF TEXT - LANGUAGE plpgsql SECURITY DEFINER -as $$ -BEGIN - IF current_setting('server_version_num')::int >= 100000 THEN - RAISE EXCEPTION 'Use version of this function included with core in PG10+'; - ELSE - RETURN query(SELECT pg_catalog.pg_ls_dir('pg_xlog')); - END IF; -END -$$; -REVOKE ALL ON FUNCTION monitor.pg_ls_waldir() FROM PUBLIC; - - -DROP TABLE IF EXISTS monitor.pgbackrest_info CASCADE; -CREATE TABLE IF NOT EXISTS monitor.pgbackrest_info (config_file text NOT NULL, data jsonb NOT NULL, gather_timestamp timestamptz DEFAULT now() NOT NULL); --- Force more aggressive autovacuum to avoid table bloat over time -ALTER TABLE monitor.pgbackrest_info SET (autovacuum_analyze_scale_factor = 0, autovacuum_vacuum_scale_factor = 0, autovacuum_vacuum_threshold = 10, autovacuum_analyze_threshold = 10); - -DROP FUNCTION IF EXISTS monitor.pgbackrest_info(); -- old version from 2.3 -DROP FUNCTION IF EXISTS monitor.pgbackrest_info(int); -CREATE OR REPLACE FUNCTION monitor.pgbackrest_info(p_throttle_minutes int DEFAULT 10) RETURNS SETOF monitor.pgbackrest_info - LANGUAGE plpgsql SECURITY DEFINER -AS $function$ -DECLARE - -v_gather_timestamp timestamptz; -v_throttle interval; - -BEGIN --- Get pgBackRest info in JSON format - -v_throttle := make_interval(mins := p_throttle_minutes); - -SELECT COALESCE(max(gather_timestamp), '1970-01-01'::timestamptz) INTO v_gather_timestamp FROM monitor.pgbackrest_info; - -IF pg_catalog.pg_is_in_recovery() = 'f' THEN - IF ((CURRENT_TIMESTAMP - v_gather_timestamp) > v_throttle) THEN - - -- Ensure table is empty - DELETE FROM monitor.pgbackrest_info; - - -- Copy data into the table directory from the pgBackRest into command - COPY monitor.pgbackrest_info (config_file, data) FROM program '/usr/bin/pgbackrest-info.sh' WITH (format text,DELIMITER '|'); - - END IF; -END IF; - -RETURN QUERY SELECT * FROM monitor.pgbackrest_info; - -IF NOT FOUND THEN - RAISE EXCEPTION 'No backups being returned from pgbackrest info command'; -END IF; - -END -$function$; - - - -DROP FUNCTION IF EXISTS monitor.sequence_status(); -CREATE FUNCTION monitor.sequence_status() RETURNS TABLE (sequence_name text, last_value bigint, slots numeric, used numeric, percent bigint, cycle boolean, numleft numeric, table_usage text) - LANGUAGE plpgsql SECURITY DEFINER -AS $function$ -DECLARE - -v_int_max int8; -v_int2_max int2 := 32767; -v_int4_max int4 := 2147483647; -v_int8_max int8 := 9223372036854775807; -v_row record; -v_seq_locked text; -v_sql text; - -BEGIN - -/* - * Provide detailed status information of sequences in the current database - */ - -FOR v_row IN - WITH default_value_sequences AS ( - -- Get sequences defined as default values with related table - -- Note this subquery can be locked/hung by DDL that affects tables with sequences. - -- Use monitor.sequence_exhaustion() to actually monitor for sequences running out - SELECT s.oid AS seqrelid, c.oid, t.typname - FROM pg_catalog.pg_attribute a - JOIN pg_catalog.pg_attrdef ad ON (ad.adrelid,ad.adnum) = (a.attrelid,a.attnum) - JOIN pg_catalog.pg_type t ON a.atttypid = t.oid - JOIN pg_catalog.pg_class c ON a.attrelid = c.oid - JOIN pg_catalog.pg_class s ON s.oid = regexp_replace(pg_get_expr(ad.adbin,ad.adrelid), $re$^nextval\('(.+?)'::regclass\)$$re$, $re$\1$re$)::regclass - WHERE (pg_get_expr(ad.adbin,ad.adrelid)) ~ '^nextval\(' - AND t.typname IN ('int2', 'int4', 'int8') - ), dep_sequences AS ( - -- Get sequences set as dependencies with related tables (identities) - SELECT s.oid AS seqrelid, c.oid, t.typname - FROM pg_catalog.pg_class s - JOIN pg_catalog.pg_depend d ON s.oid = d.objid - JOIN pg_catalog.pg_attribute a ON (d.refobjid,d.refobjsubid) = (a.attrelid,a.attnum) - JOIN pg_catalog.pg_type t ON a.atttypid = t.oid - JOIN pg_catalog.pg_class c ON d.refobjid = c.oid - WHERE s.relkind = ('S') - AND t.typname IN ('int2', 'int4', 'int8') - UNION - SELECT seqrelid, oid, typname FROM default_value_sequences - ) - -- Get any remaining sequences - SELECT n.nspname AS schemaname - , s.relname AS sequencename - , s.oid AS sequenceoid - , n.oid AS schemaoid - , CASE WHEN typname IS NULL THEN 'int8' ELSE typname::text END AS typname - , string_agg(ds.oid::regclass::text, ', ') AS table_usage - FROM pg_catalog.pg_class s - JOIN pg_catalog.pg_namespace n ON s.relnamespace = n.oid - LEFT JOIN dep_sequences ds ON s.oid = ds.seqrelid - WHERE s.relkind = 'S' - AND n.nspname !~ 'pg_temp' - GROUP BY 1,2,3,4,5 -LOOP - IF v_row.typname = 'int2' THEN - v_int_max := v_int2_max; - ELSIF v_row.typname = 'int4' THEN - v_int_max := v_int4_max; - ELSIF v_row.typname = 'int8' THEN - v_int_max := v_int8_max; - ELSE - RAISE EXCEPTION 'Unexpected datatype encountered: %', v_row.typname; - END IF; - - v_sql := format ('SELECT relation FROM pg_catalog.pg_locks WHERE relation = %L AND mode IN (''AccessExclusiveLock'', ''ExclusiveLock'')', v_row.sequenceoid); - EXECUTE v_sql INTO v_seq_locked; - IF v_seq_locked IS NOT NULL THEN - RAISE DEBUG 'Sequence % (oid: %) locked and unable to obtain its status. Skipping.', v_row.sequencename, v_row.sequenceoid; - CONTINUE; - END IF; - - - v_sql := format ('SELECT sequence_name - , last_value - , slots - , used - , ROUND(used/slots*100)::bigint AS percent - , cycle - , CASE WHEN slots < used THEN 0 ELSE slots - used END AS numleft - , table_usage - FROM ( - SELECT ''%1$s.%2$s''::text AS sequence_name - , COALESCE(s.last_value,s.min_value) AS last_value - , s.is_cycled AS cycle - , CEIL((%3$L-min_value::NUMERIC+1)/s.increment_by::NUMERIC) AS slots - , CEIL((COALESCE(s.last_value,s.min_value)-s.min_value::NUMERIC+1)/s.increment_by::NUMERIC) AS used - , %4$L::text AS table_usage - FROM %1$I.%2$I s - ) x - ORDER BY ROUND(used/slots*100) DESC' - , v_row.schemaname - , v_row.sequencename - , v_int_max - , v_row.table_usage); - - RETURN QUERY EXECUTE v_sql; - -END LOOP; - -END -$function$; - - -DROP FUNCTION IF EXISTS monitor.sequence_exhaustion(int); -CREATE FUNCTION monitor.sequence_exhaustion(p_percent int DEFAULT 75, out count bigint) - LANGUAGE plpgsql SECURITY DEFINER -AS $function$ -DECLARE - -v_row record; -v_seq_locked text; -v_sql text; - -BEGIN - -/* - * Returns count of sequences that have used up the % value given via the p_percent parameter (default 75%) - */ - -FOR v_row IN - SELECT n.nspname AS schemaname - , s.relname AS sequencename - , s.oid AS sequenceoid - FROM pg_catalog.pg_class s - JOIN pg_catalog.pg_namespace n ON s.relnamespace = n.oid - WHERE s.relkind = 'S' - AND n.nspname !~ 'pg_temp' -LOOP - - v_sql := format ('SELECT relation FROM pg_catalog.pg_locks WHERE relation = %L AND mode IN (''AccessExclusiveLock'', ''ExclusiveLock'')', v_row.sequenceoid); - EXECUTE v_sql INTO v_seq_locked; - IF v_seq_locked IS NOT NULL THEN - RAISE DEBUG 'Sequence % (oid: %) locked and unable to obtain its status. Skipping.', v_row.sequencename, v_row.sequenceoid; - CONTINUE; - END IF; - - v_sql := format ('SELECT count(*) - FROM ( - SELECT CEIL((max_value-min_value::NUMERIC+1)/s.increment_by::NUMERIC) AS slots - , CEIL((COALESCE(s.last_value,s.min_value)-s.min_value::NUMERIC+1)/s.increment_by::NUMERIC) AS used - FROM %I.%I s - ) x - WHERE (ROUND(used/slots*100)::int) > %L' - , v_row.schemaname - , v_row.sequencename - , p_percent); - - EXECUTE v_sql INTO count; - -END LOOP; - - -END -$function$; - - -GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA monitor TO ccp_monitoring; -GRANT ALL ON ALL TABLES IN SCHEMA monitor TO ccp_monitoring; diff --git a/exporter/postgres/sysconfig.postgres_exporter_pg10 b/exporter/postgres/sysconfig.postgres_exporter_pg10 index 2dcb7a94..50aa6ad8 100644 --- a/exporter/postgres/sysconfig.postgres_exporter_pg10 +++ b/exporter/postgres/sysconfig.postgres_exporter_pg10 @@ -6,6 +6,7 @@ # QUERY_FILE_LIST: space delimitted yml files that will be concatenated to a single queries.yml file. This should only be set once. If set multiple times, last one wins. PGBACKREST_INFO_THROTTLE_MINUTES=10 +PG_STAT_STATEMENTS_LIMIT=20 OPT="--web.listen-address=0.0.0.0:9187 --extend.query-path=/etc/postgres_exporter/10/queries.yml --disable-default-metrics --disable-settings-metrics" DATA_SOURCE_NAME="postgresql:///postgres?host=/var/run/postgresql/&user=ccp_monitoring&sslmode=disable" diff --git a/exporter/postgres/sysconfig.postgres_exporter_pg11 b/exporter/postgres/sysconfig.postgres_exporter_pg11 index cb0cef55..f0db00d6 100644 --- a/exporter/postgres/sysconfig.postgres_exporter_pg11 +++ b/exporter/postgres/sysconfig.postgres_exporter_pg11 @@ -6,6 +6,7 @@ # QUERY_FILE_LIST: space delimitted yml files that will be concatenated to a single queries.yml file. This should only be set once. If set multiple times, last one wins. PGBACKREST_INFO_THROTTLE_MINUTES=10 +PG_STAT_STATEMENTS_LIMIT=20 OPT="--web.listen-address=0.0.0.0:9187 --extend.query-path=/etc/postgres_exporter/11/queries.yml --disable-default-metrics --disable-settings-metrics" DATA_SOURCE_NAME="postgresql:///postgres?host=/var/run/postgresql/&user=ccp_monitoring&sslmode=disable" diff --git a/exporter/postgres/sysconfig.postgres_exporter_pg12 b/exporter/postgres/sysconfig.postgres_exporter_pg12 index a73e596b..64c28159 100644 --- a/exporter/postgres/sysconfig.postgres_exporter_pg12 +++ b/exporter/postgres/sysconfig.postgres_exporter_pg12 @@ -6,6 +6,7 @@ # QUERY_FILE_LIST: space delimitted yml files that will be concatenated to a single queries.yml file. This should only be set once. If set multiple times, last one wins. PGBACKREST_INFO_THROTTLE_MINUTES=10 +PG_STAT_STATEMENTS_LIMIT=20 OPT="--web.listen-address=0.0.0.0:9187 --extend.query-path=/etc/postgres_exporter/12/queries.yml --disable-default-metrics --disable-settings-metrics" DATA_SOURCE_NAME="postgresql:///postgres?host=/var/run/postgresql/&user=ccp_monitoring&sslmode=disable" diff --git a/exporter/postgres/sysconfig.postgres_exporter_pg13 b/exporter/postgres/sysconfig.postgres_exporter_pg13 index fff3c464..03f5be5d 100644 --- a/exporter/postgres/sysconfig.postgres_exporter_pg13 +++ b/exporter/postgres/sysconfig.postgres_exporter_pg13 @@ -6,6 +6,7 @@ # QUERY_FILE_LIST: space delimitted yml files that will be concatenated to a single queries.yml file. This should only be set once. If set multiple times, last one wins. PGBACKREST_INFO_THROTTLE_MINUTES=10 +PG_STAT_STATEMENTS_LIMIT=20 OPT="--web.listen-address=0.0.0.0:9187 --extend.query-path=/etc/postgres_exporter/13/queries.yml --disable-default-metrics --disable-settings-metrics" DATA_SOURCE_NAME="postgresql:///postgres?host=/var/run/postgresql/&user=ccp_monitoring&sslmode=disable" diff --git a/exporter/postgres/sysconfig.postgres_exporter_pg94 b/exporter/postgres/sysconfig.postgres_exporter_pg94 deleted file mode 100644 index d0d02014..00000000 --- a/exporter/postgres/sysconfig.postgres_exporter_pg94 +++ /dev/null @@ -1,19 +0,0 @@ -# This file must be in a pathname that matches the EnvironmentFile entry in the service file (Default: /etc/sysconfig/postgres_exporter) - -# --web.listen-address: change '0.0.0.0' to the network IP assigned to this system if necessary, otherwise it will listen on any IP. Change port as necessary if running multiple instances. -# --extend.query-path: location of file containing custom queries to run. Location below is one recommended by crunchy setup steps. -# DATA_SOURCE_NAME: psql connection string. set the database that the exporter will connect to. Default is `postgres`. -# QUERY_FILE_LIST: space delimitted yml files that will be concatenated to a single queries.yml file. This should only be set once. If set multiple times, last one wins. - -PGBACKREST_INFO_THROTTLE_MINUTES=10 -OPT="--web.listen-address=0.0.0.0:9187 --extend.query-path=/etc/postgres_exporter/94/queries.yml --disable-default-metrics --disable-settings-metrics" -DATA_SOURCE_NAME="postgresql:///postgres?host=/var/run/postgresql/&user=ccp_monitoring&sslmode=disable" - -# For global / cluster metric -# Ex: Include recommended defaults + bloat and pgbackrest monitoring -# QUERY_FILE_LIST="/etc/postgres_exporter/94/queries_common.yml /etc/postgres_exporter/94/queries_pg94.yml /etc/postgres_exporter/94/queries_per_db.yml /etc/postgres_exporter/94/queries_bloat.yml /etc/postgres_exporter/94/queries_backrest.yml" -# Ex: For per db metric -# QUERY_FILE_LIST="/etc/postgres_exporter/94/queries_per_db.yml" -# QUERY_FILE_LIST="/etc/postgres_exporter/94/queries_per_db.yml /etc/postgres_exporter/94/queries_bloat.yml" -QUERY_FILE_LIST="/etc/postgres_exporter/94/queries_common.yml /etc/postgres_exporter/94/queries_pg94.yml" - diff --git a/exporter/postgres/sysconfig.postgres_exporter_pg94_per_db b/exporter/postgres/sysconfig.postgres_exporter_pg94_per_db deleted file mode 100644 index cf7d586d..00000000 --- a/exporter/postgres/sysconfig.postgres_exporter_pg94_per_db +++ /dev/null @@ -1,17 +0,0 @@ -# This file must be in a pathname that matches the EnvironmentFile entry in the service file (Default: /etc/sysconfig/postgres_exporter) - -# --web.listen-address: change '0.0.0.0' to the network IP assigned to this system if necessary, otherwise it will listen on any IP. Change port as necessary if running multiple instances. -# --extend.query-path: location of file containing custom queries to run. Location below is one recommended by crunchy setup steps. -# DATA_SOURCE_NAME: psql connection string. set the database that the exporter will connect to. Default is `postgres`. -# QUERY_FILE_LIST: space delimitted yml files that will be concatenated to a single queries.yml file. This should only be set once. If set multiple times, last one wins. - -# Ensure that listening port is different than global postgres_exporter (9188 below). Also ensure that query-path file is different from global one (queries_all_db_stats.yml below). -OPT="--web.listen-address=0.0.0.0:9188 --extend.query-path=/etc/postgres_exporter/94/queries_all_db_stats.yml --disable-default-metrics --disable-settings-metrics" -DATA_SOURCE_NAME="postgresql:///postgres?host=/var/run/postgresql/&user=ccp_monitoring&sslmode=disable" -# As of v0.5.x of postgres_exporter, multiple DSN's can be given to connect to multiple databases with a single exporter. Separate DSNs with a comma and no spaces. Database name in example below is first the value after the "///" before the "?". Additional connection options can be given after "?", separated by "&". -#DATA_SOURCE_NAME="postgresql:///postgres?host=/var/run/postgresql/&user=ccp_monitoring&sslmode=disable,postgresql:///mydb1?host=/var/run/postgresql/&user=ccp_monitoring&sslmode=disable,postgresql:///mydb2?host=/var/run/postgresql/&user=ccp_monitoring&sslmode=disable" - -# Ex: For per db metric. DO NOT add queries that return the same labels/values on all databases otherwise the exporter will throw errors. The queries that pgmonitor provides for per-db metrics all include a "dbname" label to distinguish them. -QUERY_FILE_LIST="/etc/postgres_exporter/94/queries_per_db.yml" -# QUERY_FILE_LIST="/etc/postgres_exporter/94/queries_per_db.yml /etc/postgres_exporter/94/queries_bloat.yml" - diff --git a/exporter/postgres/sysconfig.postgres_exporter_pg95 b/exporter/postgres/sysconfig.postgres_exporter_pg95 index 8f11ee5b..6e9e3d0c 100644 --- a/exporter/postgres/sysconfig.postgres_exporter_pg95 +++ b/exporter/postgres/sysconfig.postgres_exporter_pg95 @@ -6,6 +6,7 @@ # QUERY_FILE_LIST: space delimitted yml files that will be concatenated to a single queries.yml file. This should only be set once. If set multiple times, last one wins. PGBACKREST_INFO_THROTTLE_MINUTES=10 +PG_STAT_STATEMENTS_LIMIT=20 OPT="--web.listen-address=0.0.0.0:9187 --extend.query-path=/etc/postgres_exporter/95/queries.yml --disable-default-metrics --disable-settings-metrics" DATA_SOURCE_NAME="postgresql:///postgres?host=/var/run/postgresql/&user=ccp_monitoring&sslmode=disable" diff --git a/exporter/postgres/sysconfig.postgres_exporter_pg96 b/exporter/postgres/sysconfig.postgres_exporter_pg96 index 1075aeb4..229943f6 100644 --- a/exporter/postgres/sysconfig.postgres_exporter_pg96 +++ b/exporter/postgres/sysconfig.postgres_exporter_pg96 @@ -6,6 +6,7 @@ # QUERY_FILE_LIST: space delimitted yml files that will be concatenated to a single queries.yml file. This should only be set once. If set multiple times, last one wins. PGBACKREST_INFO_THROTTLE_MINUTES=10 +PG_STAT_STATEMENTS_LIMIT=20 OPT="--web.listen-address=0.0.0.0:9187 --extend.query-path=/etc/postgres_exporter/96/queries.yml --disable-default-metrics --disable-settings-metrics" DATA_SOURCE_NAME="postgresql:///postgres?host=/var/run/postgresql/&user=ccp_monitoring&sslmode=disable" diff --git a/grafana/Overview.json b/grafana/Overview.json index 6eafc93c..7d23d206 100644 --- a/grafana/Overview.json +++ b/grafana/Overview.json @@ -16,12 +16,18 @@ "editable": false, "gnetId": null, "graphTooltip": 0, - "id": 5, + "id": 15, "links": [], "panels": [ { "content": "\n
\n
\n
\n\n
\">OS
\n\n\n\n\n\n", "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, "gridPos": { "h": 7, "w": 12, @@ -31,7 +37,11 @@ "id": 7, "links": [], "mode": "html", - "options": {}, + "options": { + "content": "\n
\n
\n
\n\n
\">OS
\n\n\n\n\n\n", + "mode": "html" + }, + "pluginVersion": "7.1.0", "timeFrom": null, "timeShift": null, "title": "", @@ -40,6 +50,12 @@ { "content": "\n
\n
\n
\n\n
\">PostgreSQL
\n\n\n\n\n\n", "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, "gridPos": { "h": 7, "w": 12, @@ -49,7 +65,11 @@ "id": 8, "links": [], "mode": "html", - "options": {}, + "options": { + "content": "\n
\n
\n
\n\n
\">PostgreSQL
\n\n\n\n\n\n", + "mode": "html" + }, + "pluginVersion": "7.1.0", "timeFrom": null, "timeShift": null, "title": "", @@ -58,6 +78,12 @@ { "content": "\n
\n
\n
\n\n
\">etcd
\n\n\n\n\n\n", "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, "gridPos": { "h": 7, "w": 12, @@ -67,7 +93,11 @@ "id": 9, "links": [], "mode": "html", - "options": {}, + "options": { + "content": "\n
\n
\n
\n\n
\">etcd
\n\n\n\n\n\n", + "mode": "html" + }, + "pluginVersion": "7.1.0", "timeFrom": null, "timeShift": null, "title": "", @@ -76,6 +106,12 @@ { "content": "\n
\n
\n
\n\n
\">Prometheus
\n\n\n\n\n\n", "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, "gridPos": { "h": 7, "w": 12, @@ -85,7 +121,11 @@ "id": 10, "links": [], "mode": "html", - "options": {}, + "options": { + "content": "\n
\n
\n
\n\n
\">Prometheus
Alerts
\n\n\n\n\n\n", + "mode": "html" + }, + "pluginVersion": "7.1.0", "timeFrom": null, "timeShift": null, "title": "", @@ -93,7 +133,7 @@ } ], "refresh": false, - "schemaVersion": 21, + "schemaVersion": 26, "style": "dark", "tags": [], "templating": { @@ -123,5 +163,5 @@ "timezone": "", "title": "Overview", "uid": "VhzrGq2Wk", - "version": 1 -} \ No newline at end of file + "version": 1 +} diff --git a/grafana/QueryStatistics.json b/grafana/QueryStatistics.json new file mode 100644 index 00000000..a0fd5073 --- /dev/null +++ b/grafana/QueryStatistics.json @@ -0,0 +1,813 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "id": 14, + "iteration": 1599067369291, + "links": [], + "panels": [ + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 5, + "x": 0, + "y": 0 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "pluginVersion": "7.1.2", + "targets": [ + { + "expr": "sum(ccp_pg_stat_statements_total_calls_count{job=\"[[pgnodes]]\", dbname=~\"[[dbname]]\", role=~\"[[role]]\"})", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Total Queries Executed", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 5, + "y": 0 + }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "pluginVersion": "7.1.2", + "targets": [ + { + "expr": "sum(ccp_pg_stat_statements_total_time_ms{job=\"[[pgnodes]]\", dbname=~\"[[dbname]]\", role=~\"[[role]]\"})", + "format": "time_series", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Total Query Runtime ", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 11, + "y": 0 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "pluginVersion": "7.1.2", + "targets": [ + { + "expr": "avg(ccp_pg_stat_statements_total_mean_time_ms{job=\"[[pgnodes]]\", dbname=~\"[[dbname]]\", role=~\"[[role]]\"})", + "format": "time_series", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Total Query Mean Runtime", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 17, + "y": 0 + }, + "id": 11, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "pluginVersion": "7.1.2", + "targets": [ + { + "expr": "sum(ccp_pg_stat_statements_total_row_count{job=\"[[pgnodes]]\", dbname=~\"[[dbname]]\", role=~\"[[role]]\"})", + "format": "time_series", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Total Rows Retrieved or Affected", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 23, + "x": 0, + "y": 3 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(ccp_pg_stat_statements_total_calls_count{job=\"[[pgnodes]]\", dbname=~\"[[dbname]]\", role=~\"[[role]]\"}[1m])", + "interval": "", + "legendFormat": "db: {{dbname}}, user: {{role}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Query Executions (Calls) Per Minute", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": { + "align": null, + "displayMode": "auto" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Runtime" + }, + "properties": [ + { + "id": "unit", + "value": "ms" + }, + { + "id": "custom.displayMode", + "value": "color-background" + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 90 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 23, + "x": 0, + "y": 10 + }, + "id": 4, + "options": { + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Runtime" + } + ] + }, + "pluginVersion": "7.1.2", + "targets": [ + { + "expr": "ccp_pg_stat_statements_top_mean_time_ms{job=\"[[pgnodes]]\", dbname=~\"[[dbname]]\", role=~\"[[role]]\"}", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Query Mean Runtime (Top N)", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "__name__": true, + "exp_type": true, + "job": true, + "role": false + }, + "indexByName": { + "Time": 6, + "Value": 5, + "__name__": 7, + "dbname": 0, + "exp_type": 8, + "instance": 2, + "job": 9, + "query": 3, + "queryid": 4, + "role": 1 + }, + "renameByName": { + "Value": "Runtime" + } + } + } + ], + "type": "table" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": { + "align": null, + "displayMode": "auto" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Runtime" + }, + "properties": [ + { + "id": "unit", + "value": "ms" + }, + { + "id": "custom.displayMode", + "value": "color-background" + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 90 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 23, + "x": 0, + "y": 18 + }, + "id": 6, + "options": { + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Runtime" + } + ] + }, + "pluginVersion": "7.1.2", + "targets": [ + { + "expr": "ccp_pg_stat_statements_top_max_time_ms{job=\"[[pgnodes]]\", dbname=~\"[[dbname]]\", role=~\"[[role]]\"}", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Query Max Runtime (Top N)", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "__name__": true, + "exp_type": true, + "job": true, + "role": false + }, + "indexByName": { + "Time": 6, + "Value": 5, + "__name__": 7, + "dbname": 0, + "exp_type": 8, + "instance": 2, + "job": 9, + "query": 3, + "queryid": 4, + "role": 1 + }, + "renameByName": { + "Value": "Runtime" + } + } + } + ], + "type": "table" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": { + "align": null + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Runtime" + }, + "properties": [ + { + "id": "unit", + "value": "ms" + }, + { + "id": "custom.displayMode", + "value": "color-background" + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 90 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 23, + "x": 0, + "y": 26 + }, + "id": 5, + "options": { + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Runtime" + } + ] + }, + "pluginVersion": "7.1.2", + "targets": [ + { + "expr": "ccp_pg_stat_statements_top_total_time_ms{job=\"[[pgnodes]]\", dbname=~\"[[dbname]]\", role=~\"[[role]]\"}", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Query Total Runtime (Top N)", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "__name__": true, + "exp_type": true, + "job": true, + "role": false + }, + "indexByName": { + "Time": 6, + "Value": 5, + "__name__": 7, + "dbname": 0, + "exp_type": 8, + "instance": 2, + "job": 9, + "query": 3, + "queryid": 4, + "role": 1 + }, + "renameByName": { + "Value": "Runtime" + } + } + } + ], + "type": "table" + } + ], + "refresh": "15m", + "schemaVersion": 26, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "selected": false, + "text": "pg1_crunchy", + "value": "pg1_crunchy" + }, + "datasource": "PROMETHEUS", + "definition": "label_values(up{exp_type='pg'}, job)", + "hide": 0, + "includeAll": false, + "label": "PGCluster", + "multi": false, + "name": "pgnodes", + "options": [], + "query": "label_values(up{exp_type='pg'}, job)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": true, + "text": "All", + "value": [ + "$__all" + ] + }, + "datasource": "PROMETHEUS", + "definition": "label_values(ccp_pg_stat_statements_total_calls_count{job=~\"[[pgnodes]]\"}, dbname) ", + "hide": 0, + "includeAll": true, + "label": "dbname", + "multi": true, + "name": "dbname", + "options": [], + "query": "label_values(ccp_pg_stat_statements_total_calls_count{job=~\"[[pgnodes]]\"}, dbname) ", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": true, + "text": "All", + "value": [ + "$__all" + ] + }, + "datasource": "PROMETHEUS", + "definition": "label_values(ccp_pg_stat_statements_total_calls_count{job=~\"[[pgnodes]]\",dbname=~\"[[dbname]]\"}, role) ", + "hide": 0, + "includeAll": true, + "label": "role", + "multi": true, + "name": "role", + "options": [], + "query": "label_values(ccp_pg_stat_statements_total_calls_count{job=~\"[[pgnodes]]\",dbname=~\"[[dbname]]\"}, role) ", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Query Statistics (pg_stat_statements)", + "uid": "ZKoTOHDGk", + "version": 16 +} \ No newline at end of file diff --git a/hugo/content/changelog/_index.md b/hugo/content/changelog/_index.md index 3fd7253c..d1e46ce9 100644 --- a/hugo/content/changelog/_index.md +++ b/hugo/content/changelog/_index.md @@ -9,6 +9,7 @@ weight: 5 * Added support for PostgreSQL 13 * Added queries and dashboards for pgnodemx/container support + * Added metrics and Grafana dashboard for pg_stat_statements * Added metrics for monitoring longest blocked query time ### Bug Fixes @@ -16,6 +17,9 @@ weight: 5 ### Non-backward Compatible Changes ### Manual Intervention Changes + * To add pg_stat_statements metrics to an existing installation you will need to do the following: + * Add the queries_pg_stat_statements.yml file to the QUERY_FILE_LIST in the exporter sysconfig file + * Add a PG_STAT_STATEMENTS_LIMIT line to the exporter sysconfig file with a desired limit for the top N queries. Default for a new install is 20. ## 4.3 diff --git a/hugo/content/exporter/_index.md b/hugo/content/exporter/_index.md index fc0b49bb..de464990 100644 --- a/hugo/content/exporter/_index.md +++ b/hugo/content/exporter/_index.md @@ -189,6 +189,7 @@ psql -d template1 -c "CREATE EXTENSION pg_stat_statements;" | queries_pg##.yml | postgres_exporter query file for queries that are specific to the given version of PostgreSQL. | | queries_backrest.yml | postgres_exporter query file for monitoring pgBackRest backup status. By default, new backrest data is only collected every 10 minutes to avoid excessive load when there are large backup lists. See sysconfig file for exporter service to adjust this throttling. | | queries_pgbouncer.yml | postgres_exporter query file for monitoring pgbouncer. | +| queries_pg_stat_statements.yml | postgres_exporter query file for specific pg_stat_statements metrics that are most useful for monitoring and trending. | By default, there are two postgres_exporter services expected to be running as of pgMonitor 4.0 and higher. One connects to the default `postgres` database that most postgresql instances come with and is meant for collecting global metrics that are the same on all databases in the instance, for example connection and replication statistics. This service uses the sysconfig file postgres_exporter_pg##. Connect to this database and run the setup_pg##.sql script to install the required database objects for pgMonitor. @@ -514,6 +515,8 @@ The following metrics either require special considerations when monitoring spec * *ccp_connection_stats_max_idle_in_txn_time* - Runtime of longest idle in transaction (IIT) session. * *ccp_connection_stats_max_query_time* - Runtime of longest general query (inclusive of IIT). + + * *ccp_connection_stats_max_blocked_query_time* - Runtime of the longest running query that has been blocked by a heavyweight lock * *ccp_replication_lag_replay_time* - Only provides values on replica instances. Time since replica received and replayed a WAL file. Note this is not the main way to determine if a replica is behind its primary. It only monitors the time the replica replayed the WAL vs what it has received. It is a secondary metric for monitoring WAL replay on the replica itself. @@ -593,6 +596,24 @@ The following metric prefixes correspond to the SHOW command views found in the * *ccp_pgbouncer_lists* - SHOW LISTS +#### pg_stat_statements Metrics + +Collecting all per-query metrics into Prometheus could greatly increase storage requirements and heavily impact performance without sufficient resources. Therefore the metrics below give simplified numeric metrics on overall statistics and Top N queries. N can be set with the PG_STAT_STATEMENTS_LIMIT variable in the exporter sysconfig file (defaults to 20). Note that the statistics for individual queries can only be reset on PG12+. Prior to that, pg_stat_statements must have all statistics reset to redo the top N queries. + + * *ccp_pg_stat_statements_top_max_time_ms* - Maximum time spent in the statement in milliseconds per database/user/query for the top N queries + + * *ccp_pg_stat_statements_top_mean_time_ms* - Average query runtime in milliseconds per database/user/query for the top N queries + + * *ccp_pg_stat_statements_top_total_time_ms* - Total time spent in the statement in milliseconds per database/user/query for the top N queries + + * *ccp_pg_stat_statements_total_calls_count* - Total number of queries run per user/database + + * *ccp_pg_stat_statements_total_mean_time_ms* - Mean runtime of all queries per user/database + + * *ccp_pg_stat_statements_total_row_count* - Total rows returned from all queries per user/database + + * *ccp_pg_stat_statements_total_time_ms* - Total runtime of all queries per user/database + ### System \*NIX Operating System metrics (Linux, BSD, etc) are collected using the [node_exporter](https://github.com/prometheus/node_exporter) provided by the Prometheus team. pgMonitor only collects the default metrics provided by node_exporter, but many additional metrics are available if needed.