From eb11da391bca3a4dc4e63df98509e06e9d1d1fd2 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Tue, 9 Apr 2024 22:14:01 +0200 Subject: [PATCH] migrate: another mash of detail fixes --- dashboards/cluster.jsonnet | 6 +- dashboards/common.libsonnet | 24 ++--- dashboards/jupyterhub.jsonnet | 167 +++++++++++++++++--------------- dashboards/jupyterhub.libsonnet | 56 ++++------- 4 files changed, 127 insertions(+), 126 deletions(-) diff --git a/dashboards/cluster.jsonnet b/dashboards/cluster.jsonnet index 52f26be..a6fb2ef 100755 --- a/dashboards/cluster.jsonnet +++ b/dashboards/cluster.jsonnet @@ -435,10 +435,8 @@ dashboard.new('Cluster Information') [ row.new('Cluster Utilization') + row.withPanels([ - userPods, // FIXME: previously width 24 + userPods, userNodes, - nodepoolMemoryCommitment, - nodepoolCPUCommitment, ]), row.new('Cluster Health') + row.withPanels([ @@ -451,6 +449,8 @@ dashboard.new('Cluster Information') nodeMemoryUtil, nodeCPUCommit, nodeMemoryCommit, + nodepoolCPUCommitment, + nodepoolMemoryCommitment, ]), ], panelWidth=12, diff --git a/dashboards/common.libsonnet b/dashboards/common.libsonnet index 97d5089..702a178 100644 --- a/dashboards/common.libsonnet +++ b/dashboards/common.libsonnet @@ -42,31 +42,34 @@ local var = grafonnet.dashboard.variable; // grafana ref: https://grafana.com/docs/grafana/v10.4/panels-visualizations/visualizations/heatmap/ // grafonnet ref: https://grafana.github.io/grafonnet/API/panel/heatmap/index.html heatmapOptions: - heatmap.standardOptions.withMin(0) + heatmap.options.withCalculate(true) + + heatmap.options.yAxis.withMin(0) , tableOptions: table.standardOptions.withMin(0) , - + // grafonnet ref: https://grafana.github.io/grafonnet/API/dashboard/variable.html variables: { prometheus: - var.datasource.new('PROMETHEUS_DS', 'prometheus'), + var.datasource.new('PROMETHEUS_DS', 'prometheus') + + var.datasource.generalOptions.showOnDashboard.withValueOnly() + , hub: var.query.new('hub') + var.query.withDatasourceFromVariable(self.prometheus) - + var.query.withRefresh('time') + var.query.selectionOptions.withMulti() - + var.query.selectionOptions.withIncludeAll() - + var.query.queryTypes.withLabelValues('namespace', 'kube_service_labels{service="hub"}'), + + var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') + + var.query.queryTypes.withLabelValues('namespace', 'kube_service_labels{service="hub"}') + , user_pod: var.query.new('user_pod') + var.query.withDatasourceFromVariable(self.prometheus) - + var.query.withRefresh('time') + var.query.selectionOptions.withMulti() - + var.query.selectionOptions.withIncludeAll() - + var.query.queryTypes.withLabelValues('pod', 'kube_pod_labels{label_app="jupyterhub", label_component="singleuser-server", namespace=~"$hub"}'), + + var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') + + var.query.queryTypes.withLabelValues('pod', 'kube_pod_labels{label_app="jupyterhub", label_component="singleuser-server", namespace=~"$hub"}') + , // Queries should use the 'instance' label when querying metrics that // come from collectors present on each node - such as node_exporter or // container_ metrics, and use the 'node' label when querying metrics @@ -75,9 +78,8 @@ local var = grafonnet.dashboard.variable; instance: var.query.new('instance') + var.query.withDatasourceFromVariable(self.prometheus) - + var.query.withRefresh('time') + var.query.selectionOptions.withMulti() - + var.query.selectionOptions.withIncludeAll() + + var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') + var.query.queryTypes.withLabelValues('node', 'kube_node_info'), }, diff --git a/dashboards/jupyterhub.jsonnet b/dashboards/jupyterhub.jsonnet index 36ab0f0..375a6c9 100755 --- a/dashboards/jupyterhub.jsonnet +++ b/dashboards/jupyterhub.jsonnet @@ -49,7 +49,6 @@ local dailyActiveUsers = Requires JupyterHub 3.1. |||, ) - // FIXME: not migrated config legend_hideZero=false, + ts.standardOptions.withDecimals(0) + ts.fieldConfig.defaults.custom.stacking.withMode('normal') + ts.queryOptions.withTargets([ @@ -74,7 +73,6 @@ local weeklyActiveUsers = Requires JupyterHub 3.1. ||| ) - // FIXME: not migrated config legend_hideZero=false, + ts.standardOptions.withDecimals(0) + ts.fieldConfig.defaults.custom.stacking.withMode('normal') + ts.queryOptions.withTargets([ @@ -99,7 +97,6 @@ local monthlyActiveUsers = Requires JupyterHub 3.1. ||| ) - // FIXME: not migrated config legend_hideZero=false, + ts.standardOptions.withDecimals(0) + ts.fieldConfig.defaults.custom.stacking.withMode('normal') + ts.queryOptions.withTargets([ @@ -117,8 +114,8 @@ local monthlyActiveUsers = local userMemoryDistribution = common.heatmapOptions + heatmap.new('User memory usage distribution') - + heatmap.standardOptions.withUnit('bytes') - + heatmap.options.color.HeatmapColorOptions.withScheme('interpolateViridis') + + heatmap.options.yAxis.withUnit('bytes') + + heatmap.options.color.HeatmapColorOptions.withScheme('Viridis') + heatmap.options.calculation.xBuckets.withMode('size') + heatmap.options.calculation.xBuckets.withValue('600s') // must align with interval + heatmap.queryOptions.withInterval('600s') // must align with xBuckets value @@ -142,8 +139,8 @@ local userMemoryDistribution = local userCPUDistribution = common.heatmapOptions + heatmap.new('User CPU usage distribution') - + heatmap.standardOptions.withUnit('percentunit') - + heatmap.options.color.HeatmapColorOptions.withScheme('interpolateViridis') + + heatmap.options.yAxis.withUnit('percentunit') + + heatmap.options.color.HeatmapColorOptions.withScheme('Viridis') + heatmap.options.calculation.xBuckets.withMode('size') + heatmap.options.calculation.xBuckets.withValue('600s') // must align with interval + heatmap.queryOptions.withInterval('600s') // must align with xBuckets value @@ -167,8 +164,8 @@ local userCPUDistribution = local userAgeDistribution = common.heatmapOptions + heatmap.new('User active age distribution') - + heatmap.standardOptions.withUnit('s') - + heatmap.options.color.HeatmapColorOptions.withScheme('interpolateViridis') + + heatmap.options.yAxis.withUnit('s') + + heatmap.options.color.HeatmapColorOptions.withScheme('Viridis') + heatmap.options.calculation.xBuckets.withMode('size') + heatmap.options.calculation.xBuckets.withValue('600s') // must align with interval + heatmap.queryOptions.withInterval('600s') // must align with xBuckets value @@ -193,7 +190,8 @@ local userAgeDistribution = local hubResponseLatency = common.tsOptions + ts.new('Hub response latency') - // formatY1='s', + + ts.standardOptions.withUnit('s') + + ts.queryOptions.withInterval('1m') + ts.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -257,6 +255,7 @@ local hubResponseLatency = local hubResponseCodes = common.tsOptions + ts.new('Hub response status codes') + + ts.standardOptions.withUnit('short') + ts.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -276,8 +275,14 @@ local hubResponseCodes = // with multi=true, component='singleuser-server' means all components *except* singleuser-server -local allComponentsMemory = jupyterhub.memoryPanel('All JupyterHub Components', component='singleuser-server', multi=true); -local allComponentsCPU = jupyterhub.cpuPanel('All JupyterHub Components', component='singleuser-server', multi=true); +local allComponentsMemory = jupyterhub.memoryPanel( + 'All JupyterHub Components', + component='singleuser-server', +); +local allComponentsCPU = jupyterhub.cpuPanel( + 'All JupyterHub Components', + component='singleuser-server', +); local hubDBUsage = common.tsOptions @@ -289,7 +294,7 @@ local hubDBUsage = ) + ts.standardOptions.withDecimals(0) + ts.standardOptions.withMax(1) - // formatY1='percentunit', + + ts.standardOptions.withUnit('percentunit') + ts.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -307,10 +312,9 @@ local hubDBUsage = local serverStartTimes = common.tsOptions + ts.new('Server Start Times') - // formatY1='s', - // lines=false, - // points=true, - // pointradius=2, + + ts.fieldConfig.defaults.custom.withDrawStyle('points') + + ts.standardOptions.withUnit('s') + + ts.queryOptions.withInterval('5m') + ts.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -333,11 +337,9 @@ local serverSpawnFailures = Attempts by users to start servers that failed. ||| ) - // lines=false, - // points=false, - // FIXME: not migrated config legend_hideZero=true, - // bars=true, - // pointradius=2, + + ts.fieldConfig.defaults.custom.withDrawStyle('points') + + ts.standardOptions.withDecimals(0) + + ts.queryOptions.withInterval('2m') + ts.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -408,7 +410,7 @@ local sharedVolumeFreeSpace = ) // decimalsY1=0, + ts.standardOptions.withMax(1) - // formatY1='percentunit', + + ts.standardOptions.withUnit('percentunit') + ts.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -434,19 +436,25 @@ local oldUserpods = This often indicates problems with the idle culler ||| ) - // styles=[ - // { - // pattern: 'Value', - // type: 'number', - // unit: 's', - // alias: 'Age', - // }, - // ], - + table.options.withSortBy({ - col: 2, - desc: true, - }) - + table.queryOptions.withTransformations('timeseries_to_rows') + + table.standardOptions.withUnit('s') + + table.options.withSortBy({ displayName: 'Age', desc: true }) + + table.queryOptions.withTransformations([ + { + id: 'reduce', + options: { + reducers: ['last'], + }, + }, + { + id: 'organize', + options: { + renameByName: { + Field: 'User pod', + Last: 'Age', + }, + }, + }, + ]) + table.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -457,10 +465,9 @@ local oldUserpods = ||| % jupyterhub.onComponentLabel('singleuser-server') ) + + prometheus.withInstant(true) + prometheus.withLegendFormat('{{ namespace }}/{{ pod }}'), - // instant=true ]); -// FIXME: not migrated config .hideColumn('Time') local highCPUUserPods = common.tableOptions @@ -473,19 +480,24 @@ local highCPUUserPods = unnecessarily. ||| ) - // styles=[ - // { - // pattern: 'Value', - // type: 'number', - // unit: 'percentunit', - // alias: 'CPU usage', - // }, - // ], - + table.options.withSortBy({ - col: 2, - desc: true, - }) - + table.queryOptions.withTransformations('timeseries_to_rows') + + table.options.withSortBy({ displayName: 'CPU used', desc: true }) + + table.queryOptions.withTransformations([ + { + id: 'reduce', + options: { + reducers: ['last'], + }, + }, + { + id: 'organize', + options: { + renameByName: { + Field: 'User pod', + Last: 'CPU used', + }, + }, + }, + ]) + table.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -497,10 +509,9 @@ local highCPUUserPods = ||| % jupyterhub.onComponentLabel('singleuser-server', group_left='') ) + + prometheus.withInstant(true) + prometheus.withLegendFormat('{{ namespace }}/{{ pod }}'), - // instant=true ]); -// FIXME: not migrated config .hideColumn('Time') local highMemoryUsagePods = common.tableOptions @@ -512,19 +523,25 @@ local highMemoryUsagePods = Once they hit their memory limit, user kernels will start dying. ||| ) - // styles=[ - // { - // pattern: 'Value', - // type: 'number', - // unit: 'percentunit', - // alias: '% of mem limit consumed', - // }, - // ], - + table.options.withSortBy({ - col: 2, - desc: true, - }) - + table.queryOptions.withTransformations('timeseries_to_rows') + + table.standardOptions.withUnit('percentunit') + + table.options.withSortBy({ displayName: '% of mem limit consumed', desc: true }) + + table.queryOptions.withTransformations([ + { + id: 'reduce', + options: { + reducers: ['last'], + }, + }, + { + id: 'organize', + options: { + renameByName: { + Field: 'User pod', + Last: '% of mem limit consumed', + }, + }, + }, + ]) + table.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -544,10 +561,9 @@ local highMemoryUsagePods = selector: jupyterhub.onComponentLabel('singleuser-server', group_left=''), } ) + + prometheus.withInstant(true) + prometheus.withLegendFormat('{{ namespace }}/{{ pod }}'), - // instant=true ]); -// FIXME: not migrated config .hideColumn('Time') // Show images used by different users on the hub local notebookImagesUsed = @@ -558,7 +574,6 @@ local notebookImagesUsed = Number of user servers using a container image. ||| ) - // FIXME: not migrated config legend_hideZero=false, + ts.standardOptions.withDecimals(0) + ts.fieldConfig.defaults.custom.stacking.withMode('normal') + ts.queryOptions.withTargets([ @@ -608,18 +623,18 @@ dashboard.new('JupyterHub Dashboard') serverSpawnFailures, hubResponseLatency, hubResponseCodes, - allComponentsCPU, // FIXME: previous height 12 - allComponentsMemory, // FIXME: previous height 12 - hubDBUsage, - nonRunningPods, + allComponentsCPU, + allComponentsMemory, usersPerNode, + nonRunningPods, + hubDBUsage, sharedVolumeFreeSpace, ]), row.new('Anomalous user pods') + row.withPanels([ - oldUserpods, // FIXME: previous height 12 - highCPUUserPods, // FIXME: previous height 12 - highMemoryUsagePods, // FIXME: previous height 12 + oldUserpods, + highCPUUserPods, + highMemoryUsagePods, ]), ], panelWidth=12, diff --git a/dashboards/jupyterhub.libsonnet b/dashboards/jupyterhub.libsonnet index 14145bc..4a79e1a 100644 --- a/dashboards/jupyterhub.libsonnet +++ b/dashboards/jupyterhub.libsonnet @@ -64,34 +64,24 @@ local prometheus = grafonnet.query.prometheus; ) , /** - * Creates a graph panel for a resource for one (or more) JupyterHub component(s). + * Creates a timeseries panel for a resource for one (or more) JupyterHub component(s). * The given metric will be summed across pods for the given component. - * if `multi` a multi-component chart will be produced, with sums for each component. * * @name jupyterhub.componentResourcePanel * - * @param title The title of the graph panel. + * @param title The title of the timeseries panel. * @param metric The metric to be observed. - * @param component The component to be measured (or excluded). Optional if `multi=true`, in which case it is an exclusion, otherwise required. - * @param formatY1 (optional) Passthrough `formatY1` to `ts.new` - * @param decimalsY1 (optional) Passthrough `decimalsY1` to `ts.new` - * @param multi (default `false`) If true, do a multi-component chart instead of single-component. - * The chart will have a legend table for each component. + * @param component The component to be measured (or excluded). */ - componentResourcePanel(title, metric, component='', formatY1=null, decimalsY1=null, multi=false):: + componentResourcePanel(title, metric, component):: ts.new(title) - // FIXME: not migrated config below commented out - //decimalsY1=decimalsY1, - //formatY1=formatY1, // show legend as a table with current, avg, max values - //legend_alignAsTable=true, - //legend_current=true, - //legend_avg=true, - //legend_max=true, //legend_hideZero=true, // legend_values is required for any of the above to work //legend_values=true, //min=0, + + ts.options.legend.withDisplayMode('table') + + ts.options.legend.withCalcs(['min', 'mean', 'max']) + ts.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -104,24 +94,22 @@ local prometheus = grafonnet.query.prometheus; |||, [ metric, - self.onComponentLabel(component, cmp=if multi then '!=' else '=', group_left='container, label_component'), + self.onComponentLabel(component, cmp='!=', group_left='container, label_component'), ], ) ) - + prometheus.withLegendFormat(if multi then '{{ label_component }}' else title), + + prometheus.withLegendFormat('{{ label_component }}'), ]), /** - * Creates a memory (working set) graph panel for one (or more) JupyterHub component(s). + * Creates a memory (working set) timeseries panel for one (or more) JupyterHub component(s). * * @name jupyterhub.memoryPanel * * @param name The name of the resource. Used to create the title. - * @param component The component to be measured (or excluded). Optional if `multi=true`, in which case it is an exclusion, otherwise required. - * @param multi (default `false`) If true, do a multi-component chart instead of single-component. - * The chart will have a legend table for each component. + * @param component The component to be measured (or excluded). */ - memoryPanel(name, component, multi=false):: + memoryPanel(name, component):: self.componentResourcePanel( std.format('%s Memory (Working Set)', [name]), component=component, @@ -131,21 +119,19 @@ local prometheus = grafonnet.query.prometheus; # in which case sum() reports double the actual metric container_memory_working_set_bytes{name!=""} |||, - formatY1='bytes', - multi=multi, - ), + ) + + ts.standardOptions.withUnit('bytes') + , /** - * Creates a CPU usage graph panel for one (or more) JupyterHub component(s). + * Creates a CPU usage timeseries panel for one (or more) JupyterHub component(s). * * @name jupyterhub.cpuPanel * * @param name The name of the resource. Used to create the title. - * @param component The component to be measured (or excluded). Optional if `multi=true`, in which case it is an exclusion, otherwise required. - * @param multi (default `false`) If true, do a multi-component chart instead of single-component. - * The chart will have a legend table for each component. + * @param component The component to be measured (or excluded). */ - cpuPanel(name, component, multi=false):: + cpuPanel(name, component):: self.componentResourcePanel( std.format('%s CPU', [name]), component=component, @@ -155,9 +141,7 @@ local prometheus = grafonnet.query.prometheus; # in which case sum() reports double the actual metric irate(container_cpu_usage_seconds_total{name!=""}[5m]) |||, - // decimals=1 with percentunit means round to nearest 10% - decimalsY1=1, - formatY1='percentunit', - multi=multi, - ), + ) + + ts.standardOptions.withDecimals(1) + + ts.standardOptions.withUnit('percentunit'), }