From 43f7b22665327532aaf6da4c582b9dd69527a3a4 Mon Sep 17 00:00:00 2001 From: Luca Canali Date: Fri, 15 Mar 2024 20:21:24 +0100 Subject: [PATCH] spark-dashboard V2 --- README.md | 168 +- {charts => charts_v1}/Chart.yaml | 0 {charts => charts_v1}/README.md | 0 ..._Perf_Dashboard_v03_with_SparkPlugins.json | 0 ...k_Perf_Dashboard_v03_with_annotations.json | 0 .../Spark_Perf_Dashboard_v04.json | 0 .../spark-dashboard-0.3.0.tgz | Bin {charts => charts_v1}/templates/_helpers.tpl | 0 .../templates/grafana_dashboards.yaml | 0 .../templates/grafana_datasource.yaml | 0 .../templates/grafana_pod.yaml | 0 .../templates/grafana_service.yaml | 0 .../templates/influx_graphiteconf.yaml | 0 .../templates/influx_pod.yaml | 0 .../templates/influx_pv.yaml | 0 .../templates/influx_service.yaml | 0 {charts => charts_v1}/values.yaml | 0 {dockerfiles => dockerfiles_v1}/Dockerfile | 0 {dockerfiles => dockerfiles_v1}/README.md | 0 {dockerfiles => dockerfiles_v1}/entrypoint.sh | 0 ..._Perf_Dashboard_v03_with_SparkPlugins.json | 0 ...k_Perf_Dashboard_v03_with_annotations.json | 0 .../Spark_Perf_Dashboard_v04.json | 0 {dockerfiles => dockerfiles_v1}/influx.yaml | 0 {dockerfiles => dockerfiles_v1}/influxdb.conf | 0 {dockerfiles => dockerfiles_v1}/spark.yaml | 0 dockerfiles_v2/Dockerfile | 48 + dockerfiles_v2/README.md | 31 + dockerfiles_v2/entrypoint.sh | 11 + dockerfiles_v2/grafana.ini | 5 + .../Spark_Perf_Dashboard_v04_PromQL.json | 6454 ++++++++++++ ...ashboard_v04_PromQL_with_SparkPlugins.json | 9094 +++++++++++++++++ dockerfiles_v2/spark.yaml | 14 + dockerfiles_v2/telegraf.conf | 29 + dockerfiles_v2/victoriametrics-datasource.yml | 9 + 35 files changed, 15826 insertions(+), 37 deletions(-) rename {charts => charts_v1}/Chart.yaml (100%) rename {charts => charts_v1}/README.md (100%) rename {charts => charts_v1}/grafana_dashboards/Spark_Perf_Dashboard_v03_with_SparkPlugins.json (100%) rename {charts => charts_v1}/grafana_dashboards/Spark_Perf_Dashboard_v03_with_annotations.json (100%) rename 
{charts => charts_v1}/grafana_dashboards/Spark_Perf_Dashboard_v04.json (100%) rename {charts => charts_v1}/spark-dashboard-0.3.0.tgz (100%) rename {charts => charts_v1}/templates/_helpers.tpl (100%) rename {charts => charts_v1}/templates/grafana_dashboards.yaml (100%) rename {charts => charts_v1}/templates/grafana_datasource.yaml (100%) rename {charts => charts_v1}/templates/grafana_pod.yaml (100%) rename {charts => charts_v1}/templates/grafana_service.yaml (100%) rename {charts => charts_v1}/templates/influx_graphiteconf.yaml (100%) rename {charts => charts_v1}/templates/influx_pod.yaml (100%) rename {charts => charts_v1}/templates/influx_pv.yaml (100%) rename {charts => charts_v1}/templates/influx_service.yaml (100%) rename {charts => charts_v1}/values.yaml (100%) rename {dockerfiles => dockerfiles_v1}/Dockerfile (100%) rename {dockerfiles => dockerfiles_v1}/README.md (100%) rename {dockerfiles => dockerfiles_v1}/entrypoint.sh (100%) rename {dockerfiles => dockerfiles_v1}/grafana_dashboards/Spark_Perf_Dashboard_v03_with_SparkPlugins.json (100%) rename {dockerfiles => dockerfiles_v1}/grafana_dashboards/Spark_Perf_Dashboard_v03_with_annotations.json (100%) rename {dockerfiles => dockerfiles_v1}/grafana_dashboards/Spark_Perf_Dashboard_v04.json (100%) rename {dockerfiles => dockerfiles_v1}/influx.yaml (100%) rename {dockerfiles => dockerfiles_v1}/influxdb.conf (100%) rename {dockerfiles => dockerfiles_v1}/spark.yaml (100%) create mode 100644 dockerfiles_v2/Dockerfile create mode 100644 dockerfiles_v2/README.md create mode 100755 dockerfiles_v2/entrypoint.sh create mode 100644 dockerfiles_v2/grafana.ini create mode 100644 dockerfiles_v2/grafana_dashboards/Spark_Perf_Dashboard_v04_PromQL.json create mode 100644 dockerfiles_v2/grafana_dashboards/Spark_Perf_Dashboard_v04_PromQL_with_SparkPlugins.json create mode 100644 dockerfiles_v2/spark.yaml create mode 100644 dockerfiles_v2/telegraf.conf create mode 100644 dockerfiles_v2/victoriametrics-datasource.yml diff --git 
a/README.md b/README.md index 794c490..f9af068 100644 --- a/README.md +++ b/README.md @@ -1,55 +1,74 @@ -# Apache Spark Performance Dashboard and Spark Monitoring +# Spark-Dashboard +![Docker Pulls](https://img.shields.io/docker/pulls/lucacanali/spark-dashboard) -This repository provides the tooling and configuration for deploying an Apache Spark Performance Dashboard using containers technology. -This provides monitoring for Apache Spark workloads. -The monitoring pipeline and dashboard are implemented from the [Spark metrics system](https://spark.apache.org/docs/latest/monitoring.html#metrics) using InfluxDB, and Grafana. +Spark-dashboard is a solution for monitoring Apache Spark jobs. -**Why:** Troubleshooting Spark jobs and understanding how system resources are used by Spark executors can be complicated. -This type of data is precious for visualizing and understanding root causes of performance issues. -Using the Spark Dashboard you can collect and visualize many of key metrics available by the Spark metrics system -as time series. This provides monitoring and help for Spark applications troubleshooting. +### Key Features +- You can find here all the components to deploy a monitoring application for Apache Spark +- Spark-dashboard collects metrics from Spark and visualizes them in a Grafana +- This tool is intended for performance troubleshooting and DevOps monitoring of Spark workloads. +- Use it with Spark 2.4 and higher (3.x) -**Compatibility:** -- Use with Spark 3.x and 2.4. 
-- The provided containers are for the Linux platform +### Contents +- [Architecture](#architecture) +- [How To Deploy the Spark Dashboard](#how-to-deploy-the-spark-dashboard) + - [How to run the Spark Dashboard V2 on a Docker container](#how-to-run-the-spark-dashboard-v2-on-a-docker-container) + - [Advanced configurations and notes](#advanced-configurations-and-notes) + - [Examples and testing the dashboard](#examples-and-testing-the-dashboard) +- [Old implementation (v1)](#old-implementation-v1) + - [How to run the Spark dashboard V1 on a Docker container](#how-to-run-the-spark-dashboard-v1-on-a-docker-container) + - [How to run the dashboard V1 on Kubernetes using Helm](#how-to-run-the-dashboard-v1-on-kubernetes-using-helm) +- [Advanced configurations and notes](#advanced-configurations-and-notes) -**Demos and blogs:** - - **[Short demo of the Spark dashboard](https://canali.web.cern.ch/docs/Spark_Dashboard_Demo.mp4)** +### Resources + - **[Short demo of Spark dashboard](https://canali.web.cern.ch/docs/Spark_Dashboard_Demo.mp4)** - [Blog entry on Spark Dashboard](https://db-blog.web.cern.ch/blog/luca-canali/2019-02-performance-dashboard-apache-spark) - Talk on Spark performance at [Data+AI Summit 2021](https://databricks.com/session_na21/monitor-apache-spark-3-on-kubernetes-using-metrics-and-plugins), [slides](http://canali.web.cern.ch/docs/Monitor_Spark3_on_Kubernetes_DataAI2021_LucaCanali.pdf) - Notes on [Spark Dashboard](https://github.com/LucaCanali/Miscellaneous/tree/master/Spark_Dashboard) - -**Related work:** -- **[sparkMeasure](https://github.com/LucaCanali/sparkMeasure)** a tool for performance troubleshooting of Apache Spark workloads -- **[TPCDS_PySpark](https://github.com/LucaCanali/Miscellaneous/tree/master/Performance_Testing/TPCDS_PySpark)** a TPC-DS workload generator written in Python and designed to run at scale using Apache Spark +- [sparkMeasure](https://github.com/LucaCanali/sparkMeasure) a tool for performance troubleshooting of Apache 
Spark workloads +- [TPCDS_PySpark](https://github.com/LucaCanali/Miscellaneous/tree/master/Performance_Testing/TPCDS_PySpark) a TPC-DS workload generator written in Python and designed to run at scale using Apache Spark Main author and contact: Luca.Canali@cern.ch --- ### Architecture -The Spark Dashboard collects and displays Apache Spark workload metrics produced by -the [Spark metrics system](https://spark.apache.org/docs/latest/monitoring.html#metrics). -Spark metrics are exported via a Graphite endpoint and stored in InfluxDB. -Metrics are then queried from InfluxDB and displayed using a set of pre-configured Grafana dashboards distributed with this repo. -Note that the provided installation instructions and code are intended as examples for testing and experimenting. -Hardening the installation will be necessary for production-quality use. -![Spark metrics dashboard architecture](https://raw.githubusercontent.com/LucaCanali/Miscellaneous/master/Spark_Dashboard/images/Spark_metrics_dashboard_arch.PNG "Spark metrics dashboard architecture") +![Spark metrics dashboard architecture](https://raw.githubusercontent.com/LucaCanali/Miscellaneous/master/Spark_Dashboard/images/Spark_MetricsSystem_Grafana_Dashboard_V2.0.png "Spark metrics dashboard architecture") + +This technical drawing outlines an integrated monitoring pipeline for Apache Spark using open-source components. The flow of the diagram illustrates the following components and their interactions: +- **Apache Spark's metrics:** This is the source of metrics data: [Spark metrics system](https://spark.apache.org/docs/latest/monitoring.html#metrics). Spark's executors and the driver emit metrics such + as executors' run time, CPU time, garbage collection (GC) time, memory usage, shuffle metrics, I/O metrics, and more. + Spark metrics are exported in Graphite format by Spark and then ingested by Telegraf. +- **Telegraf:** This component acts as the metrics collection agent (the sink in this context). 
It receives the + metrics emitted by Apache Spark's executors and driver, and it adds labels to the measurements to organize + the data effectively. Telegraf sends the measurements to VictoriaMetrics for storage and later querying. +- **VictoriaMetrics:** This is a time-series database that stores the labeled metrics data collected by Telegraf. + The use of a time-series database is appropriate for storing and querying the type of data emitted by + monitoring systems, which is often timestamped and sequential. +- **Grafana:** Finally, Grafana is used for visualization. It reads the metrics stored in VictoriaMetrics + using PromQL/MetricsQL, which is a query language for time series data in Prometheus. Grafana provides + dashboards that present the data in the form of metrics and graphs, offering insights into the performance + and health of the Spark application. + +Note: spark-dashboard v1 (the original implementation) uses InfluxDB as the time-series database, see also +[spark-dashboard v1 architecture](https://raw.githubusercontent.com/LucaCanali/Miscellaneous/master/Spark_Dashboard/images/Spark_metrics_dashboard_arch.PNG) --- ## How To Deploy the Spark Dashboard -This provides a quickstart guide to deploy the Spark Dashboard. Two methods are provided: one using a Docker container -and the other is deploying on Kubernetes via Helm. +This provides a quickstart guide to deploy the Spark Dashboard. Three different installation methods are described: +- **Recommended:** Dashboard v2 on a Docker container +- Dashboard v1 on a Docker container +- Dashboard v1 on Helm -### How to run the Spark dashboard on a Docker container +### How to run the Spark Dashboard V2 on a Docker container If you chose to run on container image, these are steps: **1. 
Start the container** The provided container image has been built configured to run InfluxDB and Grafana - -`docker run -p 3000:3000 -p 2003:2003 -d lucacanali/spark-dashboard` + - `docker run -p 3000:3000 -p 2003:2003 -d lucacanali/spark-dashboard` - Note: port 2003 is for Graphite ingestion, port 3000 is for Grafana - - More options, including on how to persist InfluxDB data across restarts at: [Spark dashboard in a container](dockerfiles) + - More options, including on how to persist VictoriaMetrics data across restarts at: [Spark dashboard in a container](dockerfiles_v2) **2. Spark configuration** You need to configure Spark to send the metrics to the desired Graphite endpoint + the add the related configuration. @@ -88,21 +107,95 @@ bin/spark-shell (or spark-submit or pyspark) --conf "spark.metrics.appStatusSource.enabled"=true ``` +Optional configuration if you want to display "Tree Process Memory Details": +``` +--conf spark.executor.processTreeMetrics.enabled=true +``` + **3. Visualize the metrics using a Grafana dashboard** - Point your browser to `http://hostname:3000` (edit `hostname` as relevant) - Credentials: use the default for the first login (user: admin, password: admin) - - Choose one of the provided dashboards (for example start with **Spark_Perf_Dashboard_v04**) and select the user, - applicationId and time range. + - Use the default dashboard bundled with the container (**Spark_Perf_Dashboard_v04_PromQL**) and select the user name, + applicationId and time range (default is last 5 minutes). - You will need a running Spark application configured to use the dashboard to be able to select an application and display the metrics. - See also [TPCDS_PySpark](https://github.com/LucaCanali/Miscellaneous/tree/master/Performance_Testing/TPCDS_PySpark) a TPC-DS workload generator written in Python and designed to run at scale using Apache Spark. 
-### Examples: +### Extended Spark dashboard +An extended Spark dashboard is available to collect and visualize OS and storage data. +This utilizes Spark Plugins to collect the extended metrics. The metrics are collected and stored in the +same VictoriaMetrics database as the Spark metrics. + +- Configuration: + - Add the following to the Spark configuration: + `--packages ch.cern.sparkmeasure:spark-plugins_2.12:0.3` + `--conf spark.plugins=ch.cern.HDFSMetrics,ch.cern.CgroupMetrics,ch.cern.CloudFSMetrics` +- Use the extended dashboard + - Manually select the dashboard **Spark_Perf_Dashboard_v04_PromQL_with_SparkPlugins** + - The dashboard includes additional graphs for OS and storage metrics. + - Three new tabs are available: + - CGroup Metrics (use with Spark running on Kubernetes) + - Cloud Storage (use with S3A, GS, WASB, and cloud storage in general) + - HDFS Advanced Statistics (use with HDFS) + +### Examples and testing the dashboard: - See some [examples of the graphs available in the dashboard at this link](https://github.com/LucaCanali/Miscellaneous/tree/master/Spark_Dashboard#example-graphs) +- You can use the [TPCDS_PySpark](https://github.com/LucaCanali/Miscellaneous/tree/master/Performance_Testing/TPCDS_PySpark) +package to generate a TPC-DS workload and test the dashboard. 
+ +- Example of running TPCDS on a YARN Spark cluster, monitor with the Spark dashboard: +``` +TPCDS_PYSPARK=`which tpcds_pyspark_run.py` + +spark-submit --master yarn --conf spark.log.level=error --conf spark.executor.cores=8 --conf spark.executor.memory=64g \ +--conf spark.driver.memory=16g --conf spark.driver.extraClassPath=tpcds_pyspark/spark-measure_2.12-0.24.jar \ +--conf spark.dynamicAllocation.enabled=false --conf spark.executor.instances=32 --conf spark.sql.shuffle.partitions=512 \ +$TPCDS_PYSPARK -d hdfs:///tpcds_10000_parquet_1.13.1 +``` + +- Example of running TPCDS on a Kubernetes cluster with S3 storage, monitor this with the extended dashboard using Spark plugins: +``` +TPCDS_PYSPARK=`which tpcds_pyspark_run.py` + +spark-submit --master k8s://https://xxx.xxx.xxx.xxx:6443 --conf spark.kubernetes.container.image=/spark:v3.5.1 --conf spark.kubernetes.namespace=xxx \ +--conf spark.eventLog.enabled=false --conf spark.task.maxDirectResultSize=2000000000 --conf spark.shuffle.service.enabled=false --conf spark.executor.cores=8 --conf spark.executor.memory=32g --conf spark.driver.memory=4g \ +--packages org.apache.hadoop:hadoop-aws:3.3.4,ch.cern.sparkmeasure:spark-measure_2.12:0.24,ch.cern.sparkmeasure:spark-plugins_2.12:0.3 --conf spark.plugins=ch.cern.HDFSMetrics,ch.cern.CgroupMetrics,ch.cern.CloudFSMetrics \ +--conf spark.cernSparkPlugin.cloudFsName=s3a \ +--conf spark.dynamicAllocation.enabled=false --conf spark.executor.instances=4 \ +--conf spark.hadoop.fs.s3a.secret.key=$SECRET_KEY \ +--conf spark.hadoop.fs.s3a.access.key=$ACCESS_KEY \ +--conf spark.hadoop.fs.s3a.endpoint="https://s3.cern.ch" \ +--conf spark.hadoop.fs.s3a.impl="org.apache.hadoop.fs.s3a.S3AFileSystem" \ +--conf spark.executor.metrics.fileSystemSchemes="file,hdfs,s3a" \ +--conf spark.hadoop.fs.s3a.fast.upload=true \ +--conf spark.hadoop.fs.s3a.path.style.access=true \ +--conf spark.hadoop.fs.s3a.list.version=1 \ +$TPCDS_PYSPARK -d s3a://luca/tpcds_100 +``` + +--- +## Old implementation 
(v1) + +### How to run the Spark dashboard V1 on a Docker container +This is the original implementation of the tool using InfluxDB and Grafana + +**1. Start the container** +The provided container image has been built and configured to run InfluxDB and Grafana + -`docker run -p 3000:3000 -p 2003:2003 -d lucacanali/spark-dashboard:v01` + - Note: port 2003 is for Graphite ingestion, port 3000 is for Grafana + - More options, including on how to persist InfluxDB data across restarts at: [Spark dashboard in a container](dockerfiles_v1) + +**2. Spark configuration** +See above + +**3. Visualize the metrics using a Grafana dashboard** + - Point your browser to `http://hostname:3000` (edit `hostname` as relevant) + - See details above + --- -### How to run the dashboard on Kubernetes using Helm +### How to run the dashboard V1 on Kubernetes using Helm If you chose to run on Kubernetes, these are steps: 1. The Helm chart takes care of configuring and running InfluxDB and Grafana: @@ -114,7 +207,7 @@ If you chose to run on Kubernetes, these are steps: - Use `INFLUXDB_ENDPOINT=spark-dashboard-influx.default.svc.cluster.local` as the InfluxDB endpoint in the Spark configuration. -3. Grafana visualization with Helm: +3. Grafana's visualization with Helm: - The Grafana dashboard is reachable at port 3000 of the spark-dashboard-service. - See service details: `kubectl get service spark-dashboard-grafana` - When using NodePort and an internal cluster IP address, this is how you can port forward to the service from @@ -126,7 +219,7 @@ More info at [Spark dashboard on Kubernetes](charts/README.md) ## Advanced configurations and notes ### Graph annotations: display query/job/stage start and end times -Optionally, you can add annotation instrumentation to the performance dashboard. +Optionally, you can add annotation instrumentation to the performance dashboard v1. Annotations provide additional info on start and end times for queries, jobs and stages. 
To activate annotations, add the following additional configuration, needed for collecting and writing extra performance data: @@ -140,10 +233,11 @@ INFLUXDB_HTTP_ENDPOINT="http://`hostname`:8086" ### Notes - More details on how this works and alternative configurations at [Spark Dashboard](https://github.com/LucaCanali/Miscellaneous/tree/master/Spark_Dashboard) - The dashboard can be used when running Spark on a cluster (Kubernetes, YARN, Standalone) or in local mode. -- When using Spark in local mode, it's best with Spark version 3.1 or higher, see [SPARK-31711](https://issues.apache.org/jira/browse/SPARK-31711) +- When using Spark in local mode, use Spark version 3.1 or higher, see [SPARK-31711](https://issues.apache.org/jira/browse/SPARK-31711) ### Docker -- InfluxDB will use port 2003 (graphite endpoint), and port 8086 (http endpoint) of +- Telegraf will use port 2003 (graphite endpoint) and port 8428 (VictoriaMetrics source) of your machine/VM. +- For dashboard v1: InfluxDB will use port 2003 (graphite endpoint), and port 8086 (http endpoint) of your machine/VM (when running using `--network=host`). - Note: the endpoints need to be available on the node where you started the Docker container and reachable by Spark executors and driver (mind the firewall). 
diff --git a/charts/Chart.yaml b/charts_v1/Chart.yaml similarity index 100% rename from charts/Chart.yaml rename to charts_v1/Chart.yaml diff --git a/charts/README.md b/charts_v1/README.md similarity index 100% rename from charts/README.md rename to charts_v1/README.md diff --git a/charts/grafana_dashboards/Spark_Perf_Dashboard_v03_with_SparkPlugins.json b/charts_v1/grafana_dashboards/Spark_Perf_Dashboard_v03_with_SparkPlugins.json similarity index 100% rename from charts/grafana_dashboards/Spark_Perf_Dashboard_v03_with_SparkPlugins.json rename to charts_v1/grafana_dashboards/Spark_Perf_Dashboard_v03_with_SparkPlugins.json diff --git a/charts/grafana_dashboards/Spark_Perf_Dashboard_v03_with_annotations.json b/charts_v1/grafana_dashboards/Spark_Perf_Dashboard_v03_with_annotations.json similarity index 100% rename from charts/grafana_dashboards/Spark_Perf_Dashboard_v03_with_annotations.json rename to charts_v1/grafana_dashboards/Spark_Perf_Dashboard_v03_with_annotations.json diff --git a/charts/grafana_dashboards/Spark_Perf_Dashboard_v04.json b/charts_v1/grafana_dashboards/Spark_Perf_Dashboard_v04.json similarity index 100% rename from charts/grafana_dashboards/Spark_Perf_Dashboard_v04.json rename to charts_v1/grafana_dashboards/Spark_Perf_Dashboard_v04.json diff --git a/charts/spark-dashboard-0.3.0.tgz b/charts_v1/spark-dashboard-0.3.0.tgz similarity index 100% rename from charts/spark-dashboard-0.3.0.tgz rename to charts_v1/spark-dashboard-0.3.0.tgz diff --git a/charts/templates/_helpers.tpl b/charts_v1/templates/_helpers.tpl similarity index 100% rename from charts/templates/_helpers.tpl rename to charts_v1/templates/_helpers.tpl diff --git a/charts/templates/grafana_dashboards.yaml b/charts_v1/templates/grafana_dashboards.yaml similarity index 100% rename from charts/templates/grafana_dashboards.yaml rename to charts_v1/templates/grafana_dashboards.yaml diff --git a/charts/templates/grafana_datasource.yaml b/charts_v1/templates/grafana_datasource.yaml similarity 
index 100% rename from charts/templates/grafana_datasource.yaml rename to charts_v1/templates/grafana_datasource.yaml diff --git a/charts/templates/grafana_pod.yaml b/charts_v1/templates/grafana_pod.yaml similarity index 100% rename from charts/templates/grafana_pod.yaml rename to charts_v1/templates/grafana_pod.yaml diff --git a/charts/templates/grafana_service.yaml b/charts_v1/templates/grafana_service.yaml similarity index 100% rename from charts/templates/grafana_service.yaml rename to charts_v1/templates/grafana_service.yaml diff --git a/charts/templates/influx_graphiteconf.yaml b/charts_v1/templates/influx_graphiteconf.yaml similarity index 100% rename from charts/templates/influx_graphiteconf.yaml rename to charts_v1/templates/influx_graphiteconf.yaml diff --git a/charts/templates/influx_pod.yaml b/charts_v1/templates/influx_pod.yaml similarity index 100% rename from charts/templates/influx_pod.yaml rename to charts_v1/templates/influx_pod.yaml diff --git a/charts/templates/influx_pv.yaml b/charts_v1/templates/influx_pv.yaml similarity index 100% rename from charts/templates/influx_pv.yaml rename to charts_v1/templates/influx_pv.yaml diff --git a/charts/templates/influx_service.yaml b/charts_v1/templates/influx_service.yaml similarity index 100% rename from charts/templates/influx_service.yaml rename to charts_v1/templates/influx_service.yaml diff --git a/charts/values.yaml b/charts_v1/values.yaml similarity index 100% rename from charts/values.yaml rename to charts_v1/values.yaml diff --git a/dockerfiles/Dockerfile b/dockerfiles_v1/Dockerfile similarity index 100% rename from dockerfiles/Dockerfile rename to dockerfiles_v1/Dockerfile diff --git a/dockerfiles/README.md b/dockerfiles_v1/README.md similarity index 100% rename from dockerfiles/README.md rename to dockerfiles_v1/README.md diff --git a/dockerfiles/entrypoint.sh b/dockerfiles_v1/entrypoint.sh similarity index 100% rename from dockerfiles/entrypoint.sh rename to dockerfiles_v1/entrypoint.sh diff 
--git a/dockerfiles/grafana_dashboards/Spark_Perf_Dashboard_v03_with_SparkPlugins.json b/dockerfiles_v1/grafana_dashboards/Spark_Perf_Dashboard_v03_with_SparkPlugins.json similarity index 100% rename from dockerfiles/grafana_dashboards/Spark_Perf_Dashboard_v03_with_SparkPlugins.json rename to dockerfiles_v1/grafana_dashboards/Spark_Perf_Dashboard_v03_with_SparkPlugins.json diff --git a/dockerfiles/grafana_dashboards/Spark_Perf_Dashboard_v03_with_annotations.json b/dockerfiles_v1/grafana_dashboards/Spark_Perf_Dashboard_v03_with_annotations.json similarity index 100% rename from dockerfiles/grafana_dashboards/Spark_Perf_Dashboard_v03_with_annotations.json rename to dockerfiles_v1/grafana_dashboards/Spark_Perf_Dashboard_v03_with_annotations.json diff --git a/dockerfiles/grafana_dashboards/Spark_Perf_Dashboard_v04.json b/dockerfiles_v1/grafana_dashboards/Spark_Perf_Dashboard_v04.json similarity index 100% rename from dockerfiles/grafana_dashboards/Spark_Perf_Dashboard_v04.json rename to dockerfiles_v1/grafana_dashboards/Spark_Perf_Dashboard_v04.json diff --git a/dockerfiles/influx.yaml b/dockerfiles_v1/influx.yaml similarity index 100% rename from dockerfiles/influx.yaml rename to dockerfiles_v1/influx.yaml diff --git a/dockerfiles/influxdb.conf b/dockerfiles_v1/influxdb.conf similarity index 100% rename from dockerfiles/influxdb.conf rename to dockerfiles_v1/influxdb.conf diff --git a/dockerfiles/spark.yaml b/dockerfiles_v1/spark.yaml similarity index 100% rename from dockerfiles/spark.yaml rename to dockerfiles_v1/spark.yaml diff --git a/dockerfiles_v2/Dockerfile b/dockerfiles_v2/Dockerfile new file mode 100644 index 0000000..bec3d6a --- /dev/null +++ b/dockerfiles_v2/Dockerfile @@ -0,0 +1,48 @@ +FROM ubuntu:22.04 + +ENV TELEGRAF_VERSION=1.30.0-1 +ENV GRAFANA_VERSION=10.4.1 +ENV VM_VERSION=v1.99.0 +ENV ARCH=amd64 +ENV GRAFANA_VM_PLUGIN_VERSION=v0.6.0 +ENV PLUGIN_PATH=/var/lib/grafana/plugins + +RUN set -ex && \ + apt-get update && \ + apt-get install -qq -y curl 
libfontconfig musl && \ + curl -O https://dl.grafana.com/oss/release/grafana_${GRAFANA_VERSION}_${ARCH}.deb && \ + dpkg -i grafana_${GRAFANA_VERSION}_${ARCH}.deb && \ + rm -f grafana_${GRAFANA_VERSION}_${ARCH}.deb && \ + curl -O https://repos.influxdata.com/debian/packages/telegraf_${TELEGRAF_VERSION}_${ARCH}.deb && \ + dpkg -i telegraf_${TELEGRAF_VERSION}_${ARCH}.deb && \ + rm -f telegraf_${TELEGRAF_VERSION}_${ARCH}.deb + +# Install the VictoriaMetrics Grafana datasource plugin (keeping only the linux_${ARCH} backend binary) and provision the datasource +RUN curl -L -O https://github.com/VictoriaMetrics/grafana-datasource/releases/download/${GRAFANA_VM_PLUGIN_VERSION}/victoriametrics-datasource-${GRAFANA_VM_PLUGIN_VERSION}.tar.gz && \ + tar -xzf victoriametrics-datasource-${GRAFANA_VM_PLUGIN_VERSION}.tar.gz && \ + find victoriametrics-datasource -type f -name "victoriametrics_backend_plugin*" ! -name "*linux_${ARCH}" -exec rm -f {} + && \ + mkdir -p ${PLUGIN_PATH} && \ + mv victoriametrics-datasource ${PLUGIN_PATH} && \ + rm victoriametrics-datasource-${GRAFANA_VM_PLUGIN_VERSION}.tar.gz +COPY grafana.ini /etc/grafana/grafana.ini +COPY victoriametrics-datasource.yml /etc/grafana/provisioning/datasources/victoriametrics-datasource.yml + +# Copy the bundled dashboards for the spark-dashboard +COPY grafana_dashboards /var/lib/grafana/dashboards +COPY spark.yaml /etc/grafana/provisioning/dashboards/spark.yaml + +# Configure telegraf +COPY telegraf.conf /etc/telegraf/telegraf.conf + +# Download VictoriaMetrics (VM); the victoria-metrics-prod binary is extracted into / and started by entrypoint.sh +RUN curl -L -O https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${VM_VERSION}/victoria-metrics-linux-${ARCH}-${VM_VERSION}.tar.gz && \ + tar -xzvf victoria-metrics-*.tar.gz && \ + rm -f victoria-metrics-linux-${ARCH}-${VM_VERSION}.tar.gz + +COPY entrypoint.sh /opt/entrypoint.sh + +EXPOSE 3000/tcp 2003/tcp 8428/tcp + +WORKDIR / +ENTRYPOINT [ "/opt/entrypoint.sh" ] + diff --git a/dockerfiles_v2/README.md b/dockerfiles_v2/README.md new file mode 100644 index 0000000..e67f5b6 --- /dev/null +++ 
b/dockerfiles_v2/README.md @@ -0,0 +1,31 @@ +# How to build and run the Spark dashboard in a container image + +## How to run +Run the dashboard using a container image from [Dockerhub](https://hub.docker.com/r/lucacanali/spark-dashboard): +- There are a few ports needed and multiple options on how to expose them +- Port 2003 is for Graphite ingestion, port 3000 is for Grafana, port 8428 is used internally by VictoriaMetrics source +- You can expose the ports from the container individually or just use `--network=host`. +- Examples: +``` +docker run --network=host -d lucacanali/spark-dashboard +or +docker run -p 3000:3000 -p 2003:2003 -d lucacanali/spark-dashboard +or +docker run -p 3000:3000 -p 2003:2003 -p 8428:8428 -d lucacanali/spark-dashboard +``` + +## Advanced: persist VictoriaMetrics data across restarts +- This shows an example of how to use a volume to store data on VictoriaMetrics. + It allows preserving the history across runs when the container is restarted, + otherwise VictoriaMetrics starts from scratch each time. +``` +mkdir metrics_data +docker run --network=host -v ./metrics_data:/victoria-metrics-data -d lucacanali/spark-dashboard:v02 +``` + +## Example of how to build the image: +``` +cd dockerfiles_v2 +docker build -t spark-dashboard:v02 . 
+``` + diff --git a/dockerfiles_v2/entrypoint.sh b/dockerfiles_v2/entrypoint.sh new file mode 100755 index 0000000..c117f13 --- /dev/null +++ b/dockerfiles_v2/entrypoint.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# Start Grafana and Telegraf as background services, then VictoriaMetrics in the foreground +service grafana-server start +service telegraf start +/victoria-metrics-prod -storageDataPath=/victoria-metrics-data + +# reached only if victoria-metrics exits: keeps the container (Grafana, Telegraf) running +tail -f /dev/null + + diff --git a/dockerfiles_v2/grafana.ini b/dockerfiles_v2/grafana.ini new file mode 100644 index 0000000..3ce36d1 --- /dev/null +++ b/dockerfiles_v2/grafana.ini @@ -0,0 +1,5 @@ +[plugins] +allow_loading_unsigned_plugins = victoriametrics-datasource +[dashboards] +default_home_dashboard_path = /var/lib/grafana/dashboards/Spark_Perf_Dashboard_v04_PromQL.json + diff --git a/dockerfiles_v2/grafana_dashboards/Spark_Perf_Dashboard_v04_PromQL.json b/dockerfiles_v2/grafana_dashboards/Spark_Perf_Dashboard_v04_PromQL.json new file mode 100644 index 0000000..5c7703b --- /dev/null +++ b/dockerfiles_v2/grafana_dashboards/Spark_Perf_Dashboard_v04_PromQL.json @@ -0,0 +1,6454 @@ +{ + "annotations": { + "list": [ + { + "$$hashKey": "object:47", + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "limit": 100, + "name": "Annotations & Alerts", + "showIn": 0, + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 2, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 56, + "panels": [], + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "refId": "A" + } + ], + "title": "Summary metrics", + "type": "row" + }, + { + "datasource": { + 
"type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Total task Run Time from task metrics in the selected time range. \n", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 85, + "interval": "5", + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum (\r\n max_over_time(runTime_count_value{applicationid=\"$ApplicationId\"}[$__range]) \r\n - \r\n min_over_time(runTime_count_value{applicationid=\"$ApplicationId\"}[$__range])\r\n)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "0" + ], + "type": "fill" + } + ], + "legendFormat": "__auto", + "measurement": "job.activeJobs", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(lastvalperprocess) - sum(firstvalperprocess) from \n(SELECT process, last(value) as lastvalperprocess FROM \"runTime.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process),\n(SELECT process, first(value) as firstvalperprocess FROM \"runTime.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process)\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + 
"params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Task Run Time", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Executor CPU time computed from executor metrics for the selected time range. Displays CPU time used by all the executors' JVMs active in the selected time filter interval. It contains CPU time used for executing tasks and for other activities in the JVM, notably garbage collection. Note, values from executors that are no more active in the selected time window are not accounted for.", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ns" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 1 + }, + "id": 83, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum (\r\n max_over_time(jvmCpuTime_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}[$__range]) \r\n - \r\n min_over_time(jvmCpuTime_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}[$__range])\r\n)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "0" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "__auto", + "measurement": 
"jvmCpuTime", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(lastvalperprocess) - sum(firstvalperprocess) from \n( SELECT process, last(value) as lastvalperprocess FROM jvmCpuTime WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process),\n( SELECT process, first(value) as firstvalperprocess FROM jvmCpuTime WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process) ", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Executors CPU time", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Total task CPU usage from task metrics in the selected time range. Displays CPU time used for all the completed tasks, from the task metrics data source. 
Note, values from executors that are no longer active in the selected time window are not counted.\n", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ns" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 8, + "y": 1 + }, + "id": 9, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum (\r\n max_over_time(cpuTime_count_value{applicationid=\"$ApplicationId\"}[$__range]) \r\n - \r\n min_over_time(cpuTime_count_value{applicationid=\"$ApplicationId\"}[$__range])\r\n)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "0" + ], + "type": "fill" + } + ], + "legendFormat": "__auto", + "measurement": "job.activeJobs", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(lastvalperprocess) - sum(firstvalperprocess) from \n(SELECT process, last(value) as lastvalperprocess FROM \"cpuTime.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process),\n(SELECT process, first(value) as firstvalperprocess FROM \"cpuTime.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process)\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { +
"key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Task CPU Usage ", + "type": "stat" + }, + { + "datasource": {}, + "description": "Total time spent in GC operations during task execution for the selected time range. Displays values for all the completed tasks, from the task metrics data source. Note, values from executors that are no more active in the selected time window are not counted.", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 12, + "y": 1 + }, + "id": 46, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum (\r\n max_over_time(jvmGCTime_count_value{applicationid=\"$ApplicationId\"}[$__range]) \r\n - \r\n min_over_time(jvmGCTime_count_value{applicationid=\"$ApplicationId\"}[$__range])\r\n)\r\n", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "0" + ], + "type": "fill" + } + ], + "legendFormat": "__auto", + "measurement": "job.activeJobs", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(lastperprocessval) - sum(firstperprocessval) from \n(SELECT process, last(\"value\") as lastperprocessval FROM \"jvmGCTime.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process),\n(SELECT process, first(\"value\") as 
firstperprocessval FROM \"jvmGCTime.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process) \n\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Task GC Time ", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Number of completed tasks, from JVM threadpool metrics in the selected time range. Note, values from executors that are no longer active in the selected time window are not counted.", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 16, + "y": 1 + }, + "id": 34, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum (\r\n max_over_time(threadpool_completeTasks_value{applicationid=\"$ApplicationId\"}[$__range]) \r\n - \r\n min_over_time(threadpool_completeTasks_value{applicationid=\"$ApplicationId\"}[$__range])\r\n)", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "__auto", + "measurement":
"threadpool.completeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(lastperprocessval) - sum(firstperprocessval) from \n(SELECT process, last(\"value\") as lastperprocessval FROM \"threadpool.completeTasks\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process),\n(SELECT process, first(\"value\") as firstperprocessval FROM \"threadpool.completeTasks\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process) ", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "N# of Tasks, Compled", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Number of running stages from the DAG scheduler.", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 79, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "stage_runningStages_value{applicationid=\"$ApplicationId\"}", + "groupBy": [ + { + "params": [ + "process" + ], + 
"type": "tag" + } + ], + "hide": false, + "legendFormat": "__auto", + "measurement": "stage.failedStages", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT \"value\" FROM \"stage.runningStages\" WHERE (\"applicationid\" =~ /^$ApplicationId$/) AND $timeFilter ", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Current N# of Running Stages - latest value", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Latest value of the % of heap memory used, averaging over executors and driver. JVM metric from MemoryMXBean. Conditional to spark.metrics.conf.*.source.jvm.class=org.apache.spark.metrics.source.JvmSource", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 6 + }, + "hideTimeOverride": false, + "id": 32, + "interval": "5s", + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": 
"avg(last_over_time(heap_usage_value{applicationid=\"$ApplicationId\"}[$__range]) * 100)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "__auto", + "measurement": "heap.usage", + "orderByTime": "ASC", + "policy": "default", + "query": "select mean(lastvalperprocess)*100 from (SELECT process, last(value) as lastvalperprocess FROM \"heap.usage\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process)\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [ + " * 100" + ], + "type": "math" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Heap Memory Used (% of maximum usable)", + "type": "gauge" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Total number of bytes read from task metrics in the selected time range. 
\n", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 6 + }, + "id": 86, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum (\r\n max_over_time(bytesRead_count_value{applicationid=\"$ApplicationId\"}[$__range]) \r\n - \r\n min_over_time(bytesRead_count_value{applicationid=\"$ApplicationId\"}[$__range])\r\n)\r\n", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "0" + ], + "type": "fill" + } + ], + "legendFormat": "__auto", + "measurement": "job.activeJobs", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(lastvalperprocess) - sum(firstvalperprocess) from \n(SELECT process, last(value) as lastvalperprocess FROM \"bytesRead.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process),\n(SELECT process, first(value) as firstvalperprocess FROM \"bytesRead.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process)\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] 
+ } + ], + "title": "Bytes Read", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Total number of bytes written from task metrics in the selected time range. \n", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 8, + "y": 6 + }, + "id": 87, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum (\r\n max_over_time(bytesWritten_count_value{applicationid=\"$ApplicationId\"}[$__range]) \r\n - \r\n min_over_time(bytesWritten_count_value{applicationid=\"$ApplicationId\"}[$__range])\r\n)\r\n", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "0" + ], + "type": "fill" + } + ], + "legendFormat": "__auto", + "measurement": "job.activeJobs", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(lastvalperprocess) - sum(firstvalperprocess) from \n(SELECT process, last(value) as lastvalperprocess FROM \"bytesWritten.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process),\n(SELECT process, first(value) as firstvalperprocess FROM \"bytesWritten.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process)\n", + "range": true, + "rawQuery": true, + "refId": "A", 
+ "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Bytes Written", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Total number of shuffle bytes written from task metrics in the selected time range. \n", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 12, + "y": 6 + }, + "id": 89, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum (\r\n max_over_time(shuffleBytesWritten_count_value{applicationid=\"$ApplicationId\"}[$__range]) \r\n - \r\n min_over_time(shuffleBytesWritten_count_value{applicationid=\"$ApplicationId\"}[$__range])\r\n)\r\n", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "0" + ], + "type": "fill" + } + ], + "legendFormat": "__auto", + "measurement": "job.activeJobs", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(lastvalperprocess) - sum(firstvalperprocess) from \n(SELECT process, last(value) as lastvalperprocess FROM \"shuffleBytesWritten.count\" 
WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process),\n(SELECT process, first(value) as firstvalperprocess FROM \"shuffleBytesWritten.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process)\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Shuffle Bytes Written", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Failed tasks for the appStatus metrics. Conditional to spark.metrics.appStatusSource.enabled=true (default is false)", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 16, + "y": 6 + }, + "id": 44, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "tasks_failedTasks_count_value{applicationid=\"$ApplicationId\"}", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "hide": false, + "legendFormat": "__auto", + "measurement": "tasks.completedTasks.count", + "orderByTime": "ASC", + "policy": "default", + "query": 
"SELECT \"value\" FROM \"tasks.failedTasks.count\" WHERE (\"applicationid\" =~ /^$ApplicationId$/) AND $timeFilter", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "N# of Failed Tasks", + "type": "stat" + }, + { + "datasource": {}, + "description": "Number of failed stages from the appStatus \n metrics.\nConditional to a configuration parameter:\nspark.metrics.appStatusSource.enabled=true (default is false)", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 20, + "y": 6 + }, + "id": 42, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "stages_failedStages_count_value{applicationid=\"$ApplicationId\"}", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "hide": false, + "legendFormat": "__auto", + "measurement": "stage.failedStages", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT \"value\" FROM \"stages.failedStages.count\" WHERE (\"applicationid\" =~ /^$ApplicationId$/) AND $timeFilter ", + "range": true, + "rawQuery": true, + "refId": "A", + 
"resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Failed Stages", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Latest sample of the value of number of active tasks, summed over all executors", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 11 + }, + "id": 41, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(last_over_time(threadpool_activeTasks_value{applicationid=\"$ApplicationId\"}[$__range]))", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "hide": false, + "legendFormat": "__auto", + "measurement": "stage.runningStages", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(perprocessval) from (SELECT process, last(\"value\") as perprocessval FROM \"threadpool.activeTasks\" WHERE \"applicationid\" = '$ApplicationId' and time > now() - 60s GROUP BY process) ", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + 
"params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Current N# of Active Tasks - latest value", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Average value of number of active tasks", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 11 + }, + "id": 90, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(avg_over_time(threadpool_activeTasks_value{applicationid=\"$ApplicationId\"}[$__range]))\r\n", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "hide": false, + "legendFormat": "__auto", + "measurement": "stage.runningStages", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(perprocessval) from (SELECT mean(\"value\") as perprocessval FROM \"threadpool.activeTasks\" WHERE $timeFilter and \"applicationid\" = '$ApplicationId' GROUP BY process) ", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + 
"value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Average N# of Active Tasks - mean value", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Total executor CPU time, from executor metrics since the start of the Spark application. Displays CPU time used by all the executors' JVMs for the duration of the application. It contains CPU time used for executing tasks and for other activities in the JVM, notably garbage collection.", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ns" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 8, + "y": 11 + }, + "id": 57, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(max_over_time(jvmCpuTime_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}[$__range]))", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "0" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "__auto", + "measurement": "jvmCpuTime", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(lastvalperprocess) as JCMCPU from ( SELECT process, last(value) as lastvalperprocess FROM jvmCpuTime WHERE \"applicationid\" = '$ApplicationId' GROUP BY process) ", + "range": true, + "rawQuery": true, + "refId": "A", + 
"resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Total - CPU Time, Executors", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Number of completed tasks, from JVM threadpool metrics since the start of the Spark application.", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 12, + "y": 11 + }, + "id": 84, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(max_over_time(threadpool_completeTasks_value{applicationid=\"$ApplicationId\"}))", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "__auto", + "measurement": "threadpool.completeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(lastperprocessval) from \n(SELECT process, last(\"value\") as lastperprocessval FROM \"threadpool.completeTasks\" WHERE \"applicationid\" = '$ApplicationId' GROUP BY process)", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + 
"type": "field" + }, + { + "params": [], + "type": "last" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Total - N# of Tasks, Completed ", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Number of succeeded jobs, from the appStatus metric (introduced in Spark 3.0). Conditional to spark.metrics.appStatusSource.enabled=true (default is false)", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 16, + "y": 11 + }, + "id": 13, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "jobs_succeededJobs_count_value{applicationid=\"$ApplicationId\"}", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "0" + ], + "type": "fill" + } + ], + "legendFormat": "__auto", + "measurement": "jobs.succeededJobs.count", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT value FROM \"jobs.succeededJobs.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" 
+ } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Succeeded Jobs Count", + "type": "stat" + }, + { + "collapsed": false, + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 28, + "panels": [], + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "refId": "A" + } + ], + "title": "Spark workload metrics", + "type": "row" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Number of active tasks per executor.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "N# Active Tasks", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 2, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "N# active tasks - Executor $tag_process", + 
"datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "avg_over_time(threadpool_activeTasks_value{applicationid=\"$ApplicationId\"}[$AggrInterval])", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "exec: {{executorid}}", + "measurement": "threadpool.activeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(value) FROM \"threadpool.activeTasks\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Number of Active Tasks", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Total executor CPU utilization computed from executors' JVM instrumentation. 
The value is normalized in \"N# of core equivalent\", that is the number of cores running at 100% that would give an equivalent value of CPU utilization.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU Utilization (N# cores equivalent)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 33, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "JVM CPU - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(jvmCpuTime_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}[$AggrInterval]) / 1000000000", + "format": "time_series", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "hide": false, + "instant": false, + "legendFormat": "exec: {{executorid}}", + "measurement": "jvmCpuTime", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT 
non_negative_derivative(mean(value), 1s) / 1000000000 FROM \"jvmCpuTime\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + }, + { + "params": [ + " / 1000000000" + ], + "type": "math" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Executors JVM CPU Utilization (N# cores-equivalent)", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Executor run time and wait time components compared. The run time spent by Spark executors is aggregated per area of activity. The difference between run time measurements and wait time measurements can be interpreted as due to time not instrumented (e.g. HDFS/filesystem I/O wait time). 
The values are normalized in \"N# of core equivalent\", that is the number of cores running at 100% that would yield an equivalent value for the given metric.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Time component (N# cores equivalent)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 18, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Run time" + }, + "properties": [ + { + "id": "custom.drawStyle", + "value": "line" + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 29 + }, + "id": 24, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "deserializeCpuTime", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(rate(cpuTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval])) / 1000000000", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + } + ], + "legendFormat": "CPU time", + "measurement": "deserializeCpuTime.count", + "orderByTime": "ASC", + 
"policy": "default", + "query": "select sum(perprocessval) from (SELECT non_negative_derivative(last(\"value\"), 1s) / 1000000000 as perprocessval FROM \"deserializeCpuTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process) GROUP BY time($groupbyInterval)", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [] + }, + { + "alias": "deserializeTime", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(rate(jvmGCTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval])) / 1000", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + } + ], + "legendFormat": "GC time", + "measurement": "deserializeTime.count", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(perprocessval) from (SELECT non_negative_derivative(last(\"value\"), 1s) / 1000 as perprocessval FROM \"deserializeTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process) GROUP BY time($groupbyInterval)", + "range": true, + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [] + }, + { + "alias": "cpuTime", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(rate(shuffleWriteTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval])) / 1000000000", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + } + ], + "legendFormat": "Shuffle write time", + 
"measurement": "cpuTime.count", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(perprocessval) from (SELECT non_negative_derivative(last(\"value\"), 1s) / 1000000000 as perprocessval FROM \"cpuTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process) GROUP BY time($groupbyInterval)", + "range": true, + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [] + }, + { + "alias": "shuffleWriteTime", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(rate(resultSerializationTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval])) / 1000", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + } + ], + "legendFormat": "Result serialization time", + "measurement": "shuffleWriteTime.count", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(perprocessval) from (SELECT non_negative_derivative(last(\"value\"), 1s) / 1000000000 as perprocessval FROM \"shuffleWriteTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process) GROUP BY time($groupbyInterval)", + "range": true, + "rawQuery": true, + "refId": "D", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [] + }, + { + "alias": "resultSerializationTime", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(rate(shuffleFetchWaitTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval])) / 1000", + "groupBy": [ + { + 
"params": [ + "$__interval" + ], + "type": "time" + } + ], + "legendFormat": "Shuffle fetch wait time", + "measurement": "resultSerializationTime.count", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(perprocessval) from (SELECT non_negative_derivative(last(\"value\"), 1s) / 1000 as perprocessval FROM \"resultSerializationTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process) GROUP BY time($groupbyInterval)", + "range": true, + "rawQuery": true, + "refId": "E", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [] + }, + { + "alias": "jvmGCTime", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(rate(deserializeTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval])) / 1000", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "legendFormat": "Deserialize time", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(perprocessval) from (SELECT non_negative_derivative(last(\"value\"), 1s) / 1000 as perprocessval FROM \"jvmGCTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process) GROUP BY time($groupbyInterval)", + "range": true, + "rawQuery": true, + "refId": "G", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "alias": "runTime", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(rate(runTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval])) / 1000", + "groupBy": [ + { + 
"params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "legendFormat": "Run time", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(perprocessval) from (SELECT non_negative_derivative(last(\"value\"), 1s) / 1000 as perprocessval FROM \"runTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process) GROUP BY time($groupbyInterval)", + "range": true, + "rawQuery": true, + "refId": "H", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "title": "Executor run time and wait time components", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Time the Executor has spent running tasks, including idle time and wait time (for I/O or other event). 
The value is normalized in \"N# of core equivalent\", that is the hypothetical number of cores working at 100% running tasks that would give an equivalent result.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Task Run Time (N# cores equivalent)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 29 + }, + "id": 54, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "run time - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(runTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval]) / 1000", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "shuffleFetchWaitTime.count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) / 
1000 FROM \"runTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Task Run Time - from task metrics", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Time the JVM process has spent in garbage collection. The value is normalized in \"N# of core equivalent\", that is the hypothetical number of cores working at 100% on GC that would give an equivalent result.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "GC Time (N# cores equivalent)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 7, + "interval": "5s", + "options": { + "legend": { + "calcs": [], 
+ "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "GCTime - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(jvmGCTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval]) / 1000", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "jvmGCTime.count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) / 1000 FROM \"jvmGCTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval),process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Garbage Collection Time (N# of cores equivalent) - from task metrics", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Total task CPU utilization computed from task metrics. 
The value is normalized in \"N# of core equivalent\", that is the number of cores running at 100% that would give an equivalent value of CPU utilization.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU Utilization (N# cores equivalent)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 3, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "Task CPU - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(cpuTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval]) / 1000000000", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "cpuTime.count", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(value), 1s) / 
1000000000 FROM \"cpuTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + }, + { + "params": [ + " / 1000000000" + ], + "type": "math" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Task CPU Utilization (N# cores equivalent) - from Task metrics", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Time the Executor has spent writing shuffle data. The value is normalized in \"N# of core equivalent\", that is the hypothetical number of cores working at 100% on shuffle writing that would give an equivalent result.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Shuffle Write Time (N# cores equivalent)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + 
"h": 11, + "w": 12, + "x": 0, + "y": 51 + }, + "id": 67, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "shuffle write time - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(shuffleWriteTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval]) / 1000000000\r\n", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "shuffleFetchWaitTime.count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) / 1000000000 FROM \"shuffleWriteTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Shuffle Write Time - from task metrics", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 62 + }, + "id": 69, + "panels": [], + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "refId": "A" + } + ], + "title": "Spark Executors Memory Metrics (Spark 3.0)", + "type": "row" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, 
+ "description": "JVM Heap Memory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "JVM Heap Memory", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 63 + }, + "id": 70, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "JVM heap memory - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "JVMHeapMemory_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "exec: {{executorid}}", + "measurement": "threadpool.activeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(value) FROM \"JVMHeapMemory\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + 
"refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "JVM On Heap Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "JVM Off Heap Memory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "JVM Off Heap Memory", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 63 + }, + "id": 80, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "JVM off heap memory - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "JVMOffHeapMemory_value{applicationid=\"$ApplicationId\"}", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + }, + { + 
"params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "exec: {{executorid}}", + "measurement": "threadpool.activeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(value) FROM \"JVMOffHeapMemory\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "JVM Off Heap Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "OnHeapUnifiedMemory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "OnHeapUnifiedMemory", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 75 + }, + "id": 81, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + 
"mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "OnHeapUnifiedMemory - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "OnHeapUnifiedMemory_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "exec: {{executorid}}", + "measurement": "threadpool.activeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(value) FROM \"OnHeapUnifiedMemory\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "OnHeapUnifiedMemory", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "OffHeapUnifiedMemory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "OffHeapUnifiedMemory", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 75 + }, + "id": 71, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "OffHeapUnifiedMemory - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "OffHeapUnifiedMemory_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "exec: {{executorid}}", + "measurement": "threadpool.activeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(value) FROM \"OffHeapUnifiedMemory\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "OffHeapUnifiedMemory", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "OnHeapExecutionMemory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "OnHeap Execution Memory", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, 
+ "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 87 + }, + "id": 76, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "OnHeapExecutionMemory - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "OnHeapExecutionMemory_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "exec: {{executorid}}", + "measurement": "threadpool.activeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(value) FROM \"OnHeapExecutionMemory\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "OnHeap Execution Memory", + "type": "timeseries" + }, + { + 
"datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "OnHeap Storage Memory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "OnHeap Storage Memory", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 87 + }, + "id": 77, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "OnHeapStorageMemory - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "OnHeapStorageMemory_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "exec: {{executorid}}", + "measurement": "threadpool.activeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(value) FROM \"OnHeapStorageMemory\" WHERE 
\"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "OnHeap Storage Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "MinorGCTime", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "MinorGCTime (normalized in core equivalent)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 99 + }, + "id": 75, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "GCTime - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": 
"rate(MinorGCTime_value{applicationid=\"$ApplicationId\"}[$AggrInterval]) / 1000", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "jvmGCTime.count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) / 1000 FROM \"MinorGCTime\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "MinorGCTime", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "MajorGCTime", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "MajorGCTime (normalized in core equivalent)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] 
+ }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 99 + }, + "id": 74, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "GCTime - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(MajorGCTime_value{applicationid=\"$ApplicationId\"}[$AggrInterval]) / 1000", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "jvmGCTime.count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) / 1000 FROM \"MajorGCTime\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "MajorGCTime", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "ProcessTreeJVMRSSMemory. 
Conditional to --conf spark.executor.processTreeMetrics.enabled=true", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "ProcessTreeJVMRSSMemory", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 109 + }, + "id": 73, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "JVM RSS - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "ProcessTreeJVMRSSMemory_value{applicationid=\"$ApplicationId\"}", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "exec: {{executorid}}", + "measurement": "threadpool.activeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(value) FROM \"ProcessTreeJVMRSSMemory\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": 
true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "ProcessTreeJVMRSSMemory", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "ProcessTreePythonRSSMemory. Conditional to --conf spark.executor.processTreeMetrics.enabled=true", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "ProcessTreePythonRSSMemory", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 109 + }, + "id": 82, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "Python RSS - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": 
"ProcessTreePythonRSSMemory_value{applicationid=\"$ApplicationId\"}", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "exec: {{executorid}}", + "measurement": "threadpool.activeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(value) FROM \"ProcessTreePythonRSSMemory\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "ProcessTreePythonRSSMemory", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 121 + }, + "id": 63, + "panels": [], + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "refId": "A" + } + ], + "title": "JVM source memory metrics (conditional to spark.metrics.conf.*.source.jvm.class=org.apache.spark.metrics.source.JvmSource)", + "type": "row" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Total memory (heap + non-heap) used per executor. 
From MemoryMXBean.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "JVM total memory used", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 122 + }, + "id": 50, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "Total Memory Used - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "total_used_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "heap.used", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(value) FROM \"total.used\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter AND process != 'driver' GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + 
"resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + }, + { + "condition": "AND", + "key": "process", + "operator": "!=", + "value": "driver" + } + ] + } + ], + "title": "Executor JVM Total Memory Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Heap memory used per executor. From MemoryMXBean.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "JVM heap memory used", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 122 + }, + "id": 65, + "interval": "15s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "Heap Memory Used - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": 
"code", + "expr": "heap_used_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "heap.used", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(value) FROM \"heap.used\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter AND process != 'driver' GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + }, + { + "condition": "AND", + "key": "process", + "operator": "!=", + "value": "driver" + } + ] + } + ], + "title": "Executor JVM Heap Memory Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Total memory (heap + non-heap) used by the driver. 
From MemoryMXBean.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "semi-dark-blue", + "mode": "fixed" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "JVM total memory used", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 134 + }, + "id": 64, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "Total Memory Used - $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "total_used_value{applicationid=\"$ApplicationId\", executorid=\"driver\"}", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "Total used memory, {{executorid}}", + "measurement": "heap.used", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(value) FROM \"total.used\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter AND process = 'driver' GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + 
"refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + }, + { + "condition": "AND", + "key": "process", + "operator": "!=", + "value": "driver" + } + ] + } + ], + "title": "Driver JVM Total Memory Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Driver CPU utilization computed from the JVM instrumentation. The value is normalized in \"N# of core equivalent\", that is the number of cores running at 100% that would give an equivalent value of CPU utilization.\n", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "semi-dark-green", + "mode": "shades", + "seriesBy": "last" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU Utilization (N# cores equivalent)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 134 + }, + "id": 91, + "interval": "5", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + 
"showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "JVM CPU - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(jvmCpuTime_value{applicationid=\"$ApplicationId\", executorid=\"driver\"}[$AggrInterval]) / 1000000000", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "hide": false, + "legendFormat": "JVM CPU time, {{executorid}}", + "measurement": "jvmCpuTime", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(value), 1s) / 1000000000 FROM \"jvmCpuTime\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + }, + { + "params": [ + " / 1000000000" + ], + "type": "math" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Driver JVM CPU Utilization (N# cores-equivalent)", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 144 + }, + "id": 60, + "panels": [], + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "refId": "A" + } + ], + "title": "Spark I/O metrics", + "type": "row" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "HDFS read throughput.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + 
"axisLabel": "HDFS bytes read/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 145 + }, + "id": 16, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "hdfs bytes read - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(filesystem_hdfs_read_bytes_value{applicationid=\"$ApplicationId\"}[$AggrInterval])", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"filesystem.hdfs.read_bytes\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + 
"params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "HDFS bytes read/sec", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "HDFS write throughput.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "HDFS bytes written/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 145 + }, + "id": 61, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "hdfs bytes written - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(filesystem_hdfs_write_bytes_value{applicationid=\"$ApplicationId\"}[$AggrInterval])", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + 
"process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"filesystem.hdfs.write_bytes\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "HDFS bytes written/sec", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Shuffle read throughput.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Shuffle total bytes read/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 155 + }, + "id": 18, + "interval": "5s", 
+ "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "shuffle Total Bytes Read - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(shuffleTotalBytesRead_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval])", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "shuffleTotalBytesRead.count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"shuffleTotalBytesRead.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Shuffle Total Bytes Read/sec - from task metrics", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Shuffle write throughput.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Shuffle bytes written/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 155 + }, + "id": 17, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "shuffleBytesWritten - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(shuffleBytesWritten_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval])", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "shuffleBytesWritten.count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"shuffleBytesWritten.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Shuffle Bytes Written/sec - from 
taskmetrics", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Details of shuffle I/O operations, local and remote.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Shuffle bytes read/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 166 + }, + "id": 22, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "shuffleRemoteBytesRead", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(rate(shuffleLocalBytesRead_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval]))", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + } + ], + "legendFormat": "Shuffle local bytes read", + "measurement": "shuffleRemoteBytesRead.count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "select sum(perprocessval) from (SELECT 
non_negative_derivative(last(\"value\"), 1s) as perprocessval FROM \"shuffleRemoteBytesRead.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process) GROUP BY time($groupbyInterval)\n", + "range": true, + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + }, + { + "alias": "shuffleLocalBytesRead", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(rate(shuffleRemoteBytesRead_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval]))", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + } + ], + "legendFormat": "Shuffle Remote bytes read", + "measurement": "shuffleLocalBytesRead.count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "select sum(perprocessval) from (SELECT non_negative_derivative(last(\"value\"), 1s) as perprocessval FROM \"shuffleLocalBytesRead.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), \"process\") GROUP BY time($groupbyInterval)", + "range": true, + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Shuffle Read Composition", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Bytes spilled to disk per second", + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Bytes spilled to disk/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 166 + }, + "id": 88, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "diskBytesSpilled - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(diskBytesSpilled_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval])", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "shuffleBytesWritten.count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"diskBytesSpilled.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process", + "range": true, + "rawQuery": true, + 
"refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Bytes spilled to disk per sec - from taskmetrics", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "luca", + "value": "luca" + }, + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "definition": "label_values(username)", + "hide": 0, + "includeAll": false, + "label": "User", + "multi": false, + "name": "UserName", + "options": [], + "query": { + "query": "label_values(username)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "application_1710156516635_16013", + "value": "application_1710156516635_16013" + }, + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "definition": "label_values({username=~\"$UserName\"}, applicationid)", + "hide": 0, + "includeAll": false, + "label": "Spark Application Id", + "multi": false, + "name": "ApplicationId", + "options": [], + "query": { + "query": "label_values({username=~\"$UserName\"}, applicationid)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 4, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "20s", + "value": "20s" + }, + "description": "The aggregation time interval applied when aggregating task metrics. 
Use a short interval such as 10s when your tasks run <= 10s, and a larger value when you have long-running tasks. The default is 20s.\n", + "hide": 0, + "includeAll": false, + "label": "Task metrics time average", + "multi": false, + "name": "AggrInterval", + "options": [ + { + "selected": false, + "text": "2s", + "value": "2s" + }, + { + "selected": false, + "text": "10s", + "value": "10s" + }, + { + "selected": true, + "text": "20s", + "value": "20s" + }, + { + "selected": false, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "2m", + "value": "2m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + } + ], + "query": "2s, 10s, 20s, 30s, 1m, 2m, 5m", + "queryValue": "20s", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Spark_Perf_Dashboard_v04_PromQL", + "uid": "cdfq4554jii9sa", + "version": 1, + "weekStart": "" +} diff --git a/dockerfiles_v2/grafana_dashboards/Spark_Perf_Dashboard_v04_PromQL_with_SparkPlugins.json b/dockerfiles_v2/grafana_dashboards/Spark_Perf_Dashboard_v04_PromQL_with_SparkPlugins.json new file mode 100644 index 0000000..38a4fa7 --- /dev/null +++ b/dockerfiles_v2/grafana_dashboards/Spark_Perf_Dashboard_v04_PromQL_with_SparkPlugins.json @@ -0,0 +1,9094 @@ +{ + "annotations": { + "list": [ + { + "$$hashKey": "object:47", + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "limit": 100, + "name": "Annotations & Alerts", + "showIn": 0, + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + 
"editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 3, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 92, + "panels": [ + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Total executor CPU utilization computed from executors' container cgroup instrumentation. The value is normalized in \"N# of core equivalent\", that is the number of cores running at 100% that would give an equivalent value of CPU utilization.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU Utilization (N# cores equivalent)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 94, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "Container CPU - Executor $tag_process", + "datasource": { + "uid": "influx-graphite" + }, + "editorMode": "code", + 
"expr": "rate(ch_cern_CgroupMetrics_CPUTimeNanosec_value) / 1000000000", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "hide": false, + "legendFormat": "exec: {{executorid}}", + "measurement": "jvmCpuTime", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(value), 1s) / 1000000000 FROM \"ch.cern.CgroupMetrics.CPUTimeNanosec\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + }, + { + "params": [ + " / 1000000000" + ], + "type": "math" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Executors Container CPU Utilization (N# cores-equivalent)", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Network bytes incoming, throughput from ch.cern.CgroupMetrics.NetworkBytesIn", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Network bytes in bytes/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + 
"steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 95, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "Network bytes in - Executor $tag_process", + "datasource": { + "uid": "influx-graphite" + }, + "editorMode": "code", + "expr": "rate(ch_cern_CgroupMetrics_NetworkBytesIn_value)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"ch.cern.CgroupMetrics.NetworkBytesIn\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Network bytes in/sec ", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Network bytes outgoing, throughput metric from ch.cern.CgroupMetrics.NetworkBytesOut", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Network 
bytes out bytes/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 96, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "network bytes out - Executor $tag_process", + "datasource": { + "uid": "influx-graphite" + }, + "editorMode": "code", + "expr": "rate(ch_cern_CgroupMetrics_NetworkBytesOut_value)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"ch.cern.CgroupMetrics.NetworkBytesOut\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] 
+ ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Network bytes out/sec ", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Memory RSS from ch.cern.CgroupMetrics.MemoryRss.\nBytes of anonymous and swap cache memory.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Memory RSS bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 97, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "Container memory RSS - Executor $tag_process", + "datasource": { + "uid": "influx-graphite" + }, + "editorMode": "code", + "expr": "ch_cern_CgroupMetrics_MemoryRss_value", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: 
{{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(value) FROM \"ch.cern.CgroupMetrics.MemoryRss\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Memory RSS", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Memory number of bytes of swap usage from ch.cern.CgroupMetrics.MemorySwap", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Memory Swap bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 98, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + 
"displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "Memory swap - Executor $tag_process", + "datasource": { + "uid": "influx-graphite" + }, + "editorMode": "code", + "expr": "rate(ch_cern_CgroupMetrics_MemorySwap_value) ", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(value) FROM \"ch.cern.CgroupMetrics.MemorySwap\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Memory Swap Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Number of bytes of page cache memory from ch.cern.CgroupMetrics.MemoryCache", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Memory cache bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + 
}, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 99, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "Memory page cache - Executor $tag_process", + "datasource": { + "uid": "influx-graphite" + }, + "editorMode": "code", + "expr": "rate(ch_cern_CgroupMetrics_MemoryCache_value) ", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(value) FROM \"ch.cern.CgroupMetrics.MemoryCache\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Memory Page Cache", + "type": "timeseries" + } + ], + "title": "Spark Plugins: CGroup Metrics (use with Spark on K8S)", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 109, + "panels": [ + 
{ + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Cloud storage read throughput from ch.cern.CloudFSMetrics", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Cloud storage bytes read/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 110, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "bytes read - Executor $tag_process", + "datasource": { + "uid": "influx-graphite" + }, + "editorMode": "code", + "expr": "rate(ch_cern_CloudFSMetrics_bytesRead_value)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"ch.cern.CloudFSMetrics.bytesRead\" WHERE 
\"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Cloud Storage bytes read/sec ", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Cloud storage write throughput from ch.cern.CloudFSMetrics", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Cloud storage bytes written/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 111, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": 
"bytes written- Executor $tag_process", + "datasource": { + "uid": "influx-graphite" + }, + "editorMode": "code", + "expr": "rate(ch_cern_CloudFSMetrics_bytesWritten_value)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"ch.cern.CloudFSMetrics.bytesWritten\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Cloud Storage bytes written/sec ", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Cloud storage read operations from ch.cern.CloudFSMetrics", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Cloud storage read ops/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, 
+ "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 112, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "read ops - Executor $tag_process", + "datasource": { + "uid": "influx-graphite" + }, + "editorMode": "code", + "expr": "rate(ch_cern_CloudFSMetrics_readOps_value)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"ch.cern.CloudFSMetrics.readOps\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Cloud Storage read ops/sec ", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Cloud storage write operations from ch.cern.CloudFSMetrics", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": 
"text", + "axisLabel": "Cloud storage write ops/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 113, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "write ops - Executor $tag_process", + "datasource": { + "uid": "influx-graphite" + }, + "editorMode": "code", + "expr": "rate(ch_cern_CloudFSMetrics_writeOps_value)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"ch.cern.CloudFSMetrics.writeOps\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": 
"non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Cloud Storage write ops/sec ", + "type": "timeseries" + } + ], + "title": "Spark Plugins: Cloud Storage (use with S3A, GS, WASB, etc)", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 2 + }, + "id": 100, + "panels": [ + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "HDFS read throughput from ch.cern.HDFSMetrics", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "HDFS bytes read/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 33 + }, + "id": 101, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "HDFS bytes read - Executor $tag_process", + "datasource": { + "uid": "influx-graphite" + }, + "editorMode": "code", + "expr": 
"rate(ch_cern_HDFSMetrics_bytesRead_value)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"ch.cern.HDFSMetrics.bytesRead\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "HDFS bytes read/sec ", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "HDFS write throughput from ch.cern.HDFSMetrics.bytesWritten", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "HDFS bytes written/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": 
"red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 33 + }, + "id": 102, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "HDFS bytes written- Executor $tag_process", + "datasource": { + "uid": "influx-graphite" + }, + "editorMode": "code", + "expr": "rate(ch_cern_HDFSMetrics_bytesWritten_value) ", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"ch.cern.HDFSMetrics.bytesWritten\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "HDFS bytes written/sec", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "HDFS read ops from ch.cern.HDFSMetrics.readOps", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "HDFS read ops/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + 
"gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 43 + }, + "id": 103, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "HDFS read ops - Executor $tag_process", + "datasource": { + "uid": "influx-graphite" + }, + "editorMode": "code", + "expr": "rate(ch_cern_HDFSMetrics_readOps_value)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"ch.cern.HDFSMetrics.readOps\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "HDFS read 
ops/sec ", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "HDFS large read ops from ch.cern.HDFSMetrics.largeReadOps", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "HDFS large read ops/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 43 + }, + "id": 104, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "HDFS large read ops - Executor $tag_process", + "datasource": { + "uid": "influx-graphite" + }, + "editorMode": "code", + "expr": "rate(ch_cern_HDFSMetrics_largeReadOps_value)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM 
\"ch.cern.HDFSMetrics.largeReadOps\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "HDFS large read ops/sec ", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "HDFS read throughput from ch.cern.HDFSMetrics.bytesReadLocalHost", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "HDFS bytes read/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 53 + }, + "id": 105, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": 
"10.4.0", + "targets": [ + { + "alias": "HDFS bytes read - Executor $tag_process", + "datasource": { + "uid": "influx-graphite" + }, + "editorMode": "code", + "expr": "rate(ch_cern_HDFSMetrics_bytesReadLocalHost_value)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"ch.cern.HDFSMetrics.bytesReadLocalHost\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "HDFS bytes read localhost (bytes/sec) ", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "HDFS read throughput from ch.cern.HDFSMetrics.bytesReadDistanceOfOneOrTwo", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "HDFS bytes read/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + 
"mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 53 + }, + "id": 107, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "hdfs bytes read - Executor $tag_process", + "datasource": { + "uid": "influx-graphite" + }, + "editorMode": "code", + "expr": "rate(ch_cern_HDFSMetrics_bytesReadDistanceOfOneOrTwo_value)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"ch.cern.HDFSMetrics.bytesReadDistanceOfOneOrTwo\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "HDFS bytes read distance 1 or 2 (bytes/sec) ", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "HDFS read throughput from ch.cern.HDFSMetrics.bytesReadDistanceOfThreeOrFour", + "fieldConfig": { + 
"defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "HDFS bytes read/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 63 + }, + "id": 108, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "hdfs bytes read - Executor $tag_process", + "datasource": { + "uid": "influx-graphite" + }, + "editorMode": "code", + "expr": "rate(ch_cern_HDFSMetrics_bytesReadDistanceOfThreeOrFour_value)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"ch.cern.HDFSMetrics.bytesReadDistanceOfThreeOrFour\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + 
"select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "HDFS bytes read distance 3 or 4 (bytes/sec) ", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "HDFS read throughput from ch.cern.HDFSMetrics.bytesReadDistanceOfFiveOrLarger", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "HDFS bytes read/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 63 + }, + "id": 106, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "HDFS bytes read - Executor $tag_process", + "datasource": { + "uid": "influx-graphite" + }, + "editorMode": "code", + "expr": 
"rate(ch_cern_HDFSMetrics_bytesReadDistanceOfFiveOrLarger_value)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"ch.cern.HDFSMetrics.bytesReadDistanceOfFiveOrLarger\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "HDFS bytes read distance 5 or larger (bytes/sec) ", + "type": "timeseries" + } + ], + "title": "Spark Plugins: HDFS Advanced Statistics", + "type": "row" + }, + { + "collapsed": false, + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 3 + }, + "id": 56, + "panels": [], + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "refId": "A" + } + ], + "title": "Summary metrics", + "type": "row" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Total task Run Time from task metrics in the selected time range. 
\n", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 4 + }, + "id": 85, + "interval": "5", + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum (\r\n max_over_time(runTime_count_value{applicationid=\"$ApplicationId\"}[$__range]) \r\n - \r\n min_over_time(runTime_count_value{applicationid=\"$ApplicationId\"}[$__range])\r\n)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "0" + ], + "type": "fill" + } + ], + "legendFormat": "__auto", + "measurement": "job.activeJobs", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(lastvalperprocess) - sum(firstvalperprocess) from \n(SELECT process, last(value) as lastvalperprocess FROM \"runTime.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process),\n(SELECT process, first(value) as firstvalperprocess FROM \"runTime.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process)\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + 
"title": "Task Run Time", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Executor CPU time computed from executor metrics for the selected time range. Displays CPU time used by all the executors' JVMs active in the selected time filter interval. It contains CPU time used for executing tasks and for other activities in the JVM, notably garbage collection. Note, values from executors that are no more active in the selected time window are not accounted for.", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ns" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 4 + }, + "id": 83, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum (\r\n max_over_time(jvmCpuTime_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}[$__range]) \r\n - \r\n min_over_time(jvmCpuTime_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}[$__range])\r\n)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "0" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "__auto", + "measurement": "jvmCpuTime", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(lastvalperprocess) - sum(firstvalperprocess) from \n( SELECT process, last(value) as 
lastvalperprocess FROM jvmCpuTime WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process),\n( SELECT process, first(value) as firstvalperprocess FROM jvmCpuTime WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process) ", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Executors CPU time", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Total task CPU usage from task metrics in the selected time range. Displays CPU time used for all the completed tasks, from the task metrics data source. Note, values from executors that are no longer active in the selected time window are not counted.\n", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ns" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 8, + "y": 4 + }, + "id": 9, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum (\r\n max_over_time(cpuTime_count_value{applicationid=\"$ApplicationId\"}[$__range]) \r\n - \r\n 
min_over_time(cpuTime_count_value{applicationid=\"$ApplicationId\"}[$__range])\r\n)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "0" + ], + "type": "fill" + } + ], + "legendFormat": "__auto", + "measurement": "job.activeJobs", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(lastvalperprocess) - sum(firstvalperprocess) from \n(SELECT process, last(value) as lastvalperprocess FROM \"cpuTime.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process),\n(SELECT process, first(value) as firstvalperprocess FROM \"cpuTime.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process)\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Task CPU Usage ", + "type": "stat" + }, + { + "datasource": {}, + "description": "Total time spent in GC operations during task execution for the selected time range. Displays values for all the completed tasks, from the task metrics data source. 
Note, values from executors that are no longer active in the selected time window are not counted.", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 12, + "y": 4 + }, + "id": 46, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum (\r\n max_over_time(jvmGCTime_count_value{applicationid=\"$ApplicationId\"}[$__range]) \r\n - \r\n min_over_time(jvmGCTime_count_value{applicationid=\"$ApplicationId\"}[$__range])\r\n)\r\n", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "0" + ], + "type": "fill" + } + ], + "legendFormat": "__auto", + "measurement": "job.activeJobs", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(lastperprocessval) - sum(firstperprocessval) from \n(SELECT process, last(\"value\") as lastperprocessval FROM \"jvmGCTime.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process),\n(SELECT process, first(\"value\") as firstperprocessval FROM \"jvmGCTime.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process) \n\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": 
"applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Task GC Time ", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Number of completed tasks, from JVM threadpool metrics in the selected time range. Note, values from executors that are no more active in the selected time window are not counted.", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 16, + "y": 4 + }, + "id": 34, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum (\r\n max_over_time(threadpool_completeTasks_value{applicationid=\"$ApplicationId\"}[$__range]) \r\n - \r\n min_over_time(threadpool_completeTasks_value{applicationid=\"$ApplicationId\"}[$__range])\r\n)", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "__auto", + "measurement": "threadpool.completeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(lastperprocessval) - sum(firstperprocessval) from \n(SELECT process, last(\"value\") as lastperprocessval FROM \"threadpool.completeTasks\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process),\n(SELECT process, first(\"value\") as firstperprocessval FROM 
\"threadpool.completeTasks\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process) ", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "N# of Tasks, Compled", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Number of running stages from the DAG scheduler.", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 20, + "y": 4 + }, + "id": 79, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "stage_runningStages_value{applicationid=\"$ApplicationId\"}", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "hide": false, + "legendFormat": "__auto", + "measurement": "stage.failedStages", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT \"value\" FROM \"stage.runningStages\" WHERE (\"applicationid\" =~ /^$ApplicationId$/) AND $timeFilter ", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ 
+ [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Current N# of Running Stages - latest value", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Latest value of the % of heap memory used, averaging over executors and driver. JVM metric from MemoryMXBean. Conditional to spark.metrics.conf.*.source.jvm.class=org.apache.spark.metrics.source.JvmSource", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 9 + }, + "hideTimeOverride": false, + "id": 32, + "interval": "5s", + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "avg(last_over_time(heap_usage_value{applicationid=\"$ApplicationId\"}[$__range]) * 100)", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "__auto", + "measurement": "heap.usage", + "orderByTime": "ASC", + "policy": "default", + "query": "select mean(lastvalperprocess)*100 from (SELECT process, last(value) as 
lastvalperprocess FROM \"heap.usage\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process)\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [ + " * 100" + ], + "type": "math" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Heap Memory Used (% of maximum usable)", + "type": "gauge" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Total number of bytes read from task metrics in the selected time range. \n", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 9 + }, + "id": 86, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum (\r\n max_over_time(bytesRead_count_value{applicationid=\"$ApplicationId\"}[$__range]) \r\n - \r\n min_over_time(bytesRead_count_value{applicationid=\"$ApplicationId\"}[$__range])\r\n)\r\n", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "0" + ], + "type": "fill" + } + ], + "legendFormat": "__auto", + "measurement": 
"job.activeJobs", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(lastvalperprocess) - sum(firstvalperprocess) from \n(SELECT process, last(value) as lastvalperprocess FROM \"bytesRead.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process),\n(SELECT process, first(value) as firstvalperprocess FROM \"bytesRead.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process)\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Bytes Read", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Total number of bytes written from task metrics in the selected time range. 
\n", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 8, + "y": 9 + }, + "id": 87, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum (\r\n max_over_time(bytesWritten_count_value{applicationid=\"$ApplicationId\"}[$__range]) \r\n - \r\n min_over_time(bytesWritten_count_value{applicationid=\"$ApplicationId\"}[$__range])\r\n)\r\n", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "0" + ], + "type": "fill" + } + ], + "legendFormat": "__auto", + "measurement": "job.activeJobs", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(lastvalperprocess) - sum(firstvalperprocess) from \n(SELECT process, last(value) as lastvalperprocess FROM \"bytesWritten.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process),\n(SELECT process, first(value) as firstvalperprocess FROM \"bytesWritten.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process)\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": 
"/^$ApplicationId$/" + } + ] + } + ], + "title": "Bytes Written", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Total number of shuffle bytes written from task metrics in the selected time range. \n", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 12, + "y": 9 + }, + "id": 89, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum (\r\n max_over_time(shuffleBytesWritten_count_value{applicationid=\"$ApplicationId\"}[$__range]) \r\n - \r\n min_over_time(shuffleBytesWritten_count_value{applicationid=\"$ApplicationId\"}[$__range])\r\n)\r\n", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "0" + ], + "type": "fill" + } + ], + "legendFormat": "__auto", + "measurement": "job.activeJobs", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(lastvalperprocess) - sum(firstvalperprocess) from \n(SELECT process, last(value) as lastvalperprocess FROM \"shuffleBytesWritten.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP BY process),\n(SELECT process, first(value) as firstvalperprocess FROM \"shuffleBytesWritten.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter GROUP 
BY process)\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Shuffle Bytes Written", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Failed tasks for the appStatus metrics. Conditional to spark.metrics.appStatusSource.enabled=true (default is false)", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 16, + "y": 9 + }, + "id": 44, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "tasks_failedTasks_count_value{applicationid=\"$ApplicationId\"}", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "hide": false, + "legendFormat": "__auto", + "measurement": "tasks.completedTasks.count", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT \"value\" FROM \"tasks.failedTasks.count\" WHERE (\"applicationid\" =~ /^$ApplicationId$/) AND $timeFilter", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + 
"params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "N# of Failed Tasks", + "type": "stat" + }, + { + "datasource": {}, + "description": "Number of failed stages from the appStatus \n metrics.\nConditional to a configuration parameter:\nspark.metrics.appStatusSource.enabled=true (default is false)", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 20, + "y": 9 + }, + "id": 42, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "stages_failedStages_count_value{applicationid=\"$ApplicationId\"}", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "hide": false, + "legendFormat": "__auto", + "measurement": "stage.failedStages", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT \"value\" FROM \"stages.failedStages.count\" WHERE (\"applicationid\" =~ /^$ApplicationId$/) AND $timeFilter ", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": 
"/^$ApplicationId$/" + } + ] + } + ], + "title": "Failed Stages", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Latest sample of the value of number of active tasks, summed over all executors", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 14 + }, + "id": 41, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(last_over_time(threadpool_activeTasks_value{applicationid=\"$ApplicationId\"}[$__range]))", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "hide": false, + "legendFormat": "__auto", + "measurement": "stage.runningStages", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(perprocessval) from (SELECT process, last(\"value\") as perprocessval FROM \"threadpool.activeTasks\" WHERE \"applicationid\" = '$ApplicationId' and time > now() - 60s GROUP BY process) ", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Current N# of Active Tasks - latest value", + 
"type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Average value of number of active tasks", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 14 + }, + "id": 90, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(avg_over_time(threadpool_activeTasks_value{applicationid=\"$ApplicationId\"}[$__range]))\r\n", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "hide": false, + "legendFormat": "__auto", + "measurement": "stage.runningStages", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(perprocessval) from (SELECT mean(\"value\") as perprocessval FROM \"threadpool.activeTasks\" WHERE $timeFilter and \"applicationid\" = '$ApplicationId' GROUP BY process) ", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Average N# of Active Tasks - mean value", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + 
"description": "Total executor CPU time, from executor metrics since the start of the Spark application. Displays CPU time used by all the executors' JVMs for the duration of the application. It contains CPU time used for executing tasks and for other activities in the JVM, notably garbage collection.", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ns" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 8, + "y": 14 + }, + "id": 57, + "interval": "5s", + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(max_over_time(jvmCpuTime_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}[$__range]))", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "0" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "__auto", + "measurement": "jvmCpuTime", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(lastvalperprocess) as JCMCPU from ( SELECT process, last(value) as lastvalperprocess FROM jvmCpuTime WHERE \"applicationid\" = '$ApplicationId' GROUP BY process) ", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": 
"/^$ApplicationId$/" + } + ] + } + ], + "title": "Total - CPU Time, Executors", + "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Number of completed tasks, from JVM threadpool metrics since the start of the Spark application.", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 12, + "y": 14 + }, + "id": 84, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(max_over_time(threadpool_completeTasks_value{applicationid=\"$ApplicationId\"}[$__range]))", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "__auto", + "measurement": "threadpool.completeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(lastperprocessval) from \n(SELECT process, last(\"value\") as lastperprocessval FROM \"threadpool.completeTasks\" WHERE \"applicationid\" = '$ApplicationId' GROUP BY process)", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Total - N# of Tasks, Completed",
+ "type": "stat" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Number of succeeded jobs, from the appStatus metric (introduced in Spark 3.0). Conditional to spark.metrics.appStatusSource.enabled=true (default is false)", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 16, + "y": 14 + }, + "id": 13, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "jobs_succeededJobs_count_value{applicationid=\"$ApplicationId\"}", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "0" + ], + "type": "fill" + } + ], + "legendFormat": "__auto", + "measurement": "jobs.succeededJobs.count", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT value FROM \"jobs.succeededJobs.count\" WHERE \"applicationid\" = '$ApplicationId' and $timeFilter", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Succeeded Jobs Count", + "type": "stat" + }, + { + "collapsed": false, + "datasource": { + 
"type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 28, + "panels": [], + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "refId": "A" + } + ], + "title": "Spark workload metrics", + "type": "row" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Number of active tasks per executor.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "N# Active Tasks", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 20 + }, + "id": 2, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "N# active tasks - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "avg_over_time(threadpool_activeTasks_value{applicationid=\"$ApplicationId\"}[$AggrInterval])", + "groupBy": [ + { + 
"params": [ + "process" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "exec: {{executorid}}", + "measurement": "threadpool.activeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(value) FROM \"threadpool.activeTasks\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Number of Active Tasks", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Total executor CPU utilization computed from executors' JVM instrumentation. The value is normalized in \"N# of core equivalent\", that is the number of cores running at 100% that would give an equivalent value of CPU utilization.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU Utilization (N# cores equivalent)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } 
+ ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 20 + }, + "id": 33, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "JVM CPU - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(jvmCpuTime_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}[$AggrInterval]) / 1000000000", + "format": "time_series", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "hide": false, + "instant": false, + "legendFormat": "exec: {{executorid}}", + "measurement": "jvmCpuTime", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(value), 1s) / 1000000000 FROM \"jvmCpuTime\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + }, + { + "params": [ + " / 1000000000" + ], + "type": "math" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Executors JVM CPU Utilization (N# cores-equivalent)", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Executor run time and wait time components compared. The run time spent by Spark executors is aggregated per area of activity. The difference between run time measurements and wait time measurements can be interpreted as due to time not instrumented (e.g.
HDFS/filesystem I/O wait time). The values are normalized in \"N# of core equivalent\", that is the number of cores running at 100% that would yield an equivalent value for the given metric.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Time component (N# cores equivalent)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 18, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Run time" + }, + "properties": [ + { + "id": "custom.drawStyle", + "value": "line" + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 24, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "deserializeCpuTime", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(rate(cpuTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval])) / 1000000000", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + } + ], + "legendFormat": "CPU time", + "measurement": "deserializeCpuTime.count", + "orderByTime": 
"ASC", + "policy": "default", + "query": "select sum(perprocessval) from (SELECT non_negative_derivative(last(\"value\"), 1s) / 1000000000 as perprocessval FROM \"deserializeCpuTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process) GROUP BY time($groupbyInterval)", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [] + }, + { + "alias": "deserializeTime", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(rate(jvmGCTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval])) / 1000", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + } + ], + "legendFormat": "GC time", + "measurement": "deserializeTime.count", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(perprocessval) from (SELECT non_negative_derivative(last(\"value\"), 1s) / 1000 as perprocessval FROM \"deserializeTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process) GROUP BY time($groupbyInterval)", + "range": true, + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [] + }, + { + "alias": "cpuTime", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(rate(shuffleWriteTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval])) / 1000000000", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + } + ], + "legendFormat": "Shuffle write time", 
+ "measurement": "cpuTime.count", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(perprocessval) from (SELECT non_negative_derivative(last(\"value\"), 1s) / 1000000000 as perprocessval FROM \"cpuTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process) GROUP BY time($groupbyInterval)", + "range": true, + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [] + }, + { + "alias": "shuffleWriteTime", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(rate(resultSerializationTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval])) / 1000", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + } + ], + "legendFormat": "Result serialization time", + "measurement": "shuffleWriteTime.count", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(perprocessval) from (SELECT non_negative_derivative(last(\"value\"), 1s) / 1000000000 as perprocessval FROM \"shuffleWriteTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process) GROUP BY time($groupbyInterval)", + "range": true, + "rawQuery": true, + "refId": "D", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [] + }, + { + "alias": "resultSerializationTime", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(rate(shuffleFetchWaitTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval])) / 1000", + "groupBy": [ + { + 
"params": [ + "$__interval" + ], + "type": "time" + } + ], + "legendFormat": "Shuffle fetch wait time", + "measurement": "resultSerializationTime.count", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(perprocessval) from (SELECT non_negative_derivative(last(\"value\"), 1s) / 1000 as perprocessval FROM \"resultSerializationTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process) GROUP BY time($groupbyInterval)", + "range": true, + "rawQuery": true, + "refId": "E", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [] + }, + { + "alias": "jvmGCTime", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(rate(deserializeTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval])) / 1000", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "legendFormat": "Deserialize time", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(perprocessval) from (SELECT non_negative_derivative(last(\"value\"), 1s) / 1000 as perprocessval FROM \"jvmGCTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process) GROUP BY time($groupbyInterval)", + "range": true, + "rawQuery": true, + "refId": "G", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "alias": "runTime", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(rate(runTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval])) / 1000", + "groupBy": [ + { + 
"params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "legendFormat": "Run time", + "orderByTime": "ASC", + "policy": "default", + "query": "select sum(perprocessval) from (SELECT non_negative_derivative(last(\"value\"), 1s) / 1000 as perprocessval FROM \"runTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process) GROUP BY time($groupbyInterval)", + "range": true, + "rawQuery": true, + "refId": "H", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "title": "Executor run time and wait time components", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Time the Executor has spent running tasks, including idle time and wait time (for I/O or other event). 
The value is normalized in \"N# of core equivalent\", that is the hypothetical number of cores working at 100% running tasks that would give an equivalent result.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Task Run Time (N# cores equivalent)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 54, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "run time - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(runTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval]) / 1000", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "shuffleFetchWaitTime.count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) / 1000 FROM 
\"runTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Task Run Time - from task metrics", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Time the JVM process has spent in garbage collection. The value is normalized in \"N# of core equivalent\", that is the hypothetical number of cores working at 100% on GC that would give an equivalent result.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "GC Time (N# cores equivalent)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 43 + }, + "id": 7, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + 
"placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "GCTime - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(jvmGCTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval]) / 1000", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "jvmGCTime.count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) / 1000 FROM \"jvmGCTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval),process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Garbage Collection Time (N# of cores equivalent) - from task metrics", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Total task CPU utilization computed from task metrics. 
The value is normalized in \"N# of core equivalent\", that is the number of cores running at 100% that would give an equivalent value of CPU utilization.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU Utilization (N# cores equivalent)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 43 + }, + "id": 3, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "Task CPU - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(cpuTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval]) / 1000000000", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "cpuTime.count", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(value), 1s) / 1000000000 FROM 
\"cpuTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + }, + { + "params": [ + " / 1000000000" + ], + "type": "math" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Task CPU Utilization (N# cores equivalent) - from Task metrics", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Time the Executor has spent writing shuffle data. The value is normalized in \"N# of core equivalent\", that is the hypothetical number of cores working at 100% on shuffle writing that would give an equivalent result.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Shuffle Write Time (N# cores equivalent)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 
54 + }, + "id": 67, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "shuffle write time - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(shuffleWriteTime_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval]) / 1000000000\r\n", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "shuffleFetchWaitTime.count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) / 1000000000 FROM \"shuffleWriteTime.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Shuffle Write Time - from task metrics", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 65 + }, + "id": 69, + "panels": [], + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "refId": "A" + } + ], + "title": "Spark Executors Memory Metrics (Spark 3.0)", + "type": "row" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "JVM Heap Memory", + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "JVM Heap Memory", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 66 + }, + "id": 70, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "JVM heap memory - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "JVMHeapMemory_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "exec: {{executorid}}", + "measurement": "threadpool.activeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(value) FROM \"JVMHeapMemory\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": 
[ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "JVM On Heap Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "JVM Off Heap Memory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "JVM Off Heap Memory", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 66 + }, + "id": 80, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "JVM off heap memory - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "JVMOffHeapMemory_value{applicationid=\"$ApplicationId\"}", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + 
"legendFormat": "exec: {{executorid}}", + "measurement": "threadpool.activeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(value) FROM \"JVMOffHeapMemory\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "JVM Off Heap Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "OnHeapUnifiedMemory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "OnHeapUnifiedMemory", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 78 + }, + "id": 81, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + 
"alias": "OnHeapUnifiedMemory - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "OnHeapUnifiedMemory_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "exec: {{executorid}}", + "measurement": "threadpool.activeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(value) FROM \"OnHeapUnifiedMemory\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "OnHeapUnifiedMemory", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "OffHeapUnifiedMemory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "OffHeapUnifiedMemory", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": 
"red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 78 + }, + "id": 71, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "OffHeapUnifiedMemory - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "OffHeapUnifiedMemory_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "exec: {{executorid}}", + "measurement": "threadpool.activeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(value) FROM \"OffHeapUnifiedMemory\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "OffHeapUnifiedMemory", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "OnHeapExecutionMemory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "OnHeap Execution Memory", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 90 + }, + "id": 76, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "OnHeapExecutionMemory - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "OnHeapExecutionMemory_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "exec: {{executorid}}", + "measurement": "threadpool.activeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(value) FROM \"OnHeapExecutionMemory\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "OnHeap Execution Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "OnHeap Storage 
Memory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "OnHeap Storage Memory", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 90 + }, + "id": 77, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "OnHeapStorageMemory - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "OnHeapStorageMemory_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "exec: {{executorid}}", + "measurement": "threadpool.activeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(value) FROM \"OnHeapStorageMemory\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + 
"resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "OnHeap Storage Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "MinorGCTime", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "MinorGCTime (normalized in core equivalent)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 102 + }, + "id": 75, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "GCTime - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(MinorGCTime_value{applicationid=\"$ApplicationId\"}[$AggrInterval]) / 1000", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + 
"process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "jvmGCTime.count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) / 1000 FROM \"MinorGCTime\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "MinorGCTime", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "MajorGCTime", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "MajorGCTime (normalized in core equivalent)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 102 + }, + "id": 74, + "interval": "5s", + "options": { + "legend": { + "calcs": 
[], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "GCTime - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(MajorGCTime_value{applicationid=\"$ApplicationId\"}[$AggrInterval]) / 1000", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "jvmGCTime.count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) / 1000 FROM \"MajorGCTime\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "MajorGCTime", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "ProcessTreeJVMRSSMemory. 
Conditional to --conf spark.executor.processTreeMetrics.enabled=true", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "ProcessTreeJVMRSSMemory", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 112 + }, + "id": 73, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "JVM RSS - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "ProcessTreeJVMRSSMemory_value{applicationid=\"$ApplicationId\"}", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "exec: {{executorid}}", + "measurement": "threadpool.activeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(value) FROM \"ProcessTreeJVMRSSMemory\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + 
"rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "ProcessTreeJVMRSSMemory", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "ProcessTreePythonRSSMemory. Conditional to --conf spark.executor.processTreeMetrics.enabled=true", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "ProcessTreePythonRSSMemory", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 112 + }, + "id": 82, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "Python RSS - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": 
"rate(ProcessTreePythonRSSMemory_value{applicationid=\"$ApplicationId\"}[$AggrInterval])", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "legendFormat": "exec: {{executorid}}", + "measurement": "threadpool.activeTasks", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(value) FROM \"ProcessTreePythonRSSMemory\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "ProcessTreePythonRSSMemory", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 124 + }, + "id": 63, + "panels": [], + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "refId": "A" + } + ], + "title": "JVM source memory metrics (conditional to spark.metrics.conf.*.source.jvm.class=org.apache.spark.metrics.source.JvmSource)", + "type": "row" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Total memory (heap + non-heap) used per executor. 
From MemoryMXBean.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "JVM total memory used", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 125 + }, + "id": 50, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "Total Memory Used - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "total_used_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "heap.used", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(value) FROM \"total.used\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter AND process != 'driver' GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": 
"time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + }, + { + "condition": "AND", + "key": "process", + "operator": "!=", + "value": "driver" + } + ] + } + ], + "title": "Executor JVM Total Memory Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Heap memory used per executor. From MemoryMXBean.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "JVM heap memory used", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 125 + }, + "id": 65, + "interval": "15s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "Heap Memory Used - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": 
"heap_used_value{applicationid=\"$ApplicationId\", executorid!=\"driver\"}", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "heap.used", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(value) FROM \"heap.used\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter AND process != 'driver' GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + }, + { + "condition": "AND", + "key": "process", + "operator": "!=", + "value": "driver" + } + ] + } + ], + "title": "Executor JVM Heap Memory Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Total memory (heap + non-heap) used by the driver. 
From MemoryMXBean.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "semi-dark-blue", + "mode": "fixed" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "JVM total memory used", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 137 + }, + "id": 64, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "Total Memory Used - $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "total_used_value{applicationid=\"$ApplicationId\", executorid=\"driver\"}", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "Total used memory, {{executorid}}", + "measurement": "heap.used", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(value) FROM \"total.used\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter AND process = 'driver' GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + 
"resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + }, + { + "condition": "AND", + "key": "process", + "operator": "!=", + "value": "driver" + } + ] + } + ], + "title": "Driver JVM Total Memory Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Driver CPU utilization computed from the JVM instrumentation. The value is normalized in \"N# of core equivalent\", that is the number of cores running at 100% that would give an equivalent value of CPU utilization.\n", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "semi-dark-green", + "mode": "shades", + "seriesBy": "last" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU Utilization (N# cores equivalent)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 137 + }, + "id": 91, + "interval": "5", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + 
"tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "JVM CPU - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(jvmCpuTime_value{applicationid=\"$ApplicationId\", executorid=\"driver\"}[$AggrInterval]) / 1000000000", + "groupBy": [ + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "hide": false, + "legendFormat": "JVM CPU time, {{executorid}}", + "measurement": "jvmCpuTime", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(value), 1s) / 1000000000 FROM \"jvmCpuTime\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process\n", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + }, + { + "params": [ + " / 1000000000" + ], + "type": "math" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Driver JVM CPU Utilization (N# cores-equivalent)", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 147 + }, + "id": 60, + "panels": [], + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "refId": "A" + } + ], + "title": "Spark I/O metrics", + "type": "row" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "HDFS read throughput.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "HDFS bytes 
read/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 148 + }, + "id": 16, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "hdfs bytes read - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(filesystem_hdfs_read_bytes_value{applicationid=\"$ApplicationId\"}[$AggrInterval])", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"filesystem.hdfs.read_bytes\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": 
"non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "HDFS bytes read/sec", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "HDFS write throughput.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "HDFS bytes written/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 148 + }, + "id": 61, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "hdfs bytes written - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(filesystem_hdfs_write_bytes_value{applicationid=\"$ApplicationId\"}[$AggrInterval])", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": 
"exec: {{executorid}}", + "measurement": "filesystem.hdfs.read_bytes", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"filesystem.hdfs.write_bytes\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($__interval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "HDFS bytes written/sec", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Shuffle read throughput.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Shuffle total bytes read/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 158 + }, + "id": 18, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + 
"placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "shuffle Total Bytes Read - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(shuffleTotalBytesRead_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval])", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "shuffleTotalBytesRead.count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"shuffleTotalBytesRead.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Shuffle Total Bytes Read/sec - from task metrics", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Shuffle write throughput.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Shuffle bytes written/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, 
+ "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 158 + }, + "id": 17, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "shuffleBytesWritten - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(shuffleBytesWritten_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval])", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "shuffleBytesWritten.count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"shuffleBytesWritten.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Shuffle Bytes Written/sec - from taskmetrics", + "type": "timeseries" + }, + { + "datasource": { + "type": 
"victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Details of shuffle I/O operations, local and remote.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Shuffle bytes read/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 169 + }, + "id": 22, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "shuffleRemoteBytesRead", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(rate(shuffleLocalBytesRead_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval]))", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + } + ], + "legendFormat": "Shuffle local bytes read", + "measurement": "shuffleRemoteBytesRead.count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "select sum(perprocessval) from (SELECT non_negative_derivative(last(\"value\"), 1s) as perprocessval FROM \"shuffleRemoteBytesRead.count\" WHERE 
\"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process) GROUP BY time($groupbyInterval)\n", + "range": true, + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + }, + { + "alias": "shuffleLocalBytesRead", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "sum(rate(shuffleRemoteBytesRead_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval]))", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + } + ], + "legendFormat": "Shuffle Remote bytes read", + "measurement": "shuffleLocalBytesRead.count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "select sum(perprocessval) from (SELECT non_negative_derivative(last(\"value\"), 1s) as perprocessval FROM \"shuffleLocalBytesRead.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), \"process\") GROUP BY time($groupbyInterval)", + "range": true, + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Shuffle Read Composition", + "type": "timeseries" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "description": "Bytes spilled to disk per second", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Bytes spilled to disk/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 169 + }, + "id": 88, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "diskBytesSpilled - Executor $tag_process", + "datasource": { + "type": "influxdb", + "uid": "PAB4C6DB7858A5C06" + }, + "editorMode": "code", + "expr": "rate(diskBytesSpilled_count_value{applicationid=\"$ApplicationId\"}[$AggrInterval])", + "groupBy": [ + { + "params": [ + "15s" + ], + "type": "time" + }, + { + "params": [ + "process" + ], + "type": "tag" + } + ], + "legendFormat": "exec: {{executorid}}", + "measurement": "shuffleBytesWritten.count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT non_negative_derivative(mean(value), 1s) FROM \"diskBytesSpilled.count\" WHERE \"applicationid\" = '$ApplicationId' AND $timeFilter GROUP BY time($groupbyInterval), process", + "range": true, + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": 
"field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "15s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "applicationid", + "operator": "=~", + "value": "/^$ApplicationId$/" + } + ] + } + ], + "title": "Bytes spilled to disk per sec - from taskmetrics", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "luca", + "value": "luca" + }, + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "definition": "label_values(username)", + "hide": 0, + "includeAll": false, + "label": "User", + "multi": false, + "name": "UserName", + "options": [], + "query": { + "query": "label_values(username)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": true, + "text": "spark-26669bff862048b180f8aa1d7c4d0ca5", + "value": "spark-26669bff862048b180f8aa1d7c4d0ca5" + }, + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P4169E866C3094E38" + }, + "definition": "label_values({username=~\"$UserName\"}, applicationid)", + "hide": 0, + "includeAll": false, + "label": "Spark Application Id", + "multi": false, + "name": "ApplicationId", + "options": [], + "query": { + "query": "label_values({username=~\"$UserName\"}, applicationid)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 4, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "20s", + "value": "20s" + }, + "description": "The aggregation time interval applied when aggregating task metrics. 
Use 10s when your tasks run <= 10s, use a larger value when you have long-running tasks (the dashboard default is 20s).\n", + "hide": 0, + "includeAll": false, + "label": "Task metrics time average", + "multi": false, + "name": "AggrInterval", + "options": [ + { + "selected": false, + "text": "2s", + "value": "2s" + }, + { + "selected": false, + "text": "10s", + "value": "10s" + }, + { + "selected": true, + "text": "20s", + "value": "20s" + }, + { + "selected": false, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "2m", + "value": "2m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + } + ], + "query": "2s, 10s, 20s, 30s, 1m, 2m, 5m", + "queryValue": "20s", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Spark_Perf_Dashboard_v04_PromQL_with_SparkPlugins", + "uid": "fqfq4554jii9sa", + "version": 1, + "weekStart": "" +} diff --git a/dockerfiles_v2/spark.yaml b/dockerfiles_v2/spark.yaml new file mode 100644 index 0000000..8946ee0 --- /dev/null +++ b/dockerfiles_v2/spark.yaml @@ -0,0 +1,14 @@ +apiVersion: 1 + +providers: +- name: spark-dashboard + orgId: 1 + folder: '' + folderUid: '' + type: file + disableDeletion: false + editable: true + updateIntervalSeconds: 10 + options: + path: /var/lib/grafana/dashboards + diff --git a/dockerfiles_v2/telegraf.conf b/dockerfiles_v2/telegraf.conf new file mode 100644 index 0000000..ca6e81b --- /dev/null +++ b/dockerfiles_v2/telegraf.conf @@ -0,0 +1,29 @@ +[[inputs.socket_listener]] + service_address = "tcp://:2003" + data_format = "graphite" + separator = "." 
+ templates = [ + # JVM source + "*.*.jvm.pools.* username.applicationid.executorid.namespace.namespace.measurement*", + # YARN source + "*.*.applicationMaster.* username.applicationid.namespace.measurement*", + # shuffle service source + "*.shuffleService.* username.namespace.measurement*", + # streaming + "*.*.*.spark.streaming.* username.applicationid.executorid.namespace.namespace.id.measurement*", + # generic template for driver and executor sources + "username.applicationid.executorid.namespace.measurement*" ] + +[[outputs.http]] + ## URL is the address to send metrics to + url = "http://localhost:8428/api/v1/write" + method = "POST" + data_format = "prometheusremotewrite" + tagexclude = ["host", "namespace"] + +# Configure if needed +#[agent] +# interval = "10s" +# flush_interval = "10s" +# flush_jitter = "0s" + diff --git a/dockerfiles_v2/victoriametrics-datasource.yml b/dockerfiles_v2/victoriametrics-datasource.yml new file mode 100644 index 0000000..5aa6847 --- /dev/null +++ b/dockerfiles_v2/victoriametrics-datasource.yml @@ -0,0 +1,9 @@ +apiVersion: 1 + +datasources: + - name: VictoriaMetrics + type: victoriametrics-datasource + access: proxy + url: http://localhost:8428 + isDefault: true +