kafka replay speed: upstream push sharding (#9454) (#9481)

* kafka replay speed: upstream push sharding

Signed-off-by: Dimitar Dimitrov <[email protected]>
Co-authored-by: gotjosh <[email protected]>
Signed-off-by: Dimitar Dimitrov <[email protected]>

* Apply suggestions from code review

Co-authored-by: gotjosh <[email protected]>

---------

Signed-off-by: Dimitar Dimitrov <[email protected]>
Co-authored-by: gotjosh <[email protected]>
(cherry picked from commit 456bbfd)

Co-authored-by: Dimitar Dimitrov <[email protected]>
grafana · Sep 30, 2024 · b6c1fcd · b6c1fcd
1 parent 978c109
commit b6c1fcd
Show file tree

Hide file tree

Showing 12 changed files with 1,504 additions and 177 deletions.
diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json
@@ -6667,6 +6667,26 @@
               "fieldDefaultValue": true,
               "fieldFlag": "ingest-storage.kafka.use-compressed-bytes-as-fetch-max-bytes",
               "fieldType": "boolean"
+            },
+            {
+              "kind": "field",
+              "name": "ingestion_concurrency",
+              "required": false,
+              "desc": "The number of concurrent ingestion streams to the TSDB head. Every tenant has their own set of streams. 0 to disable.",
+              "fieldValue": null,
+              "fieldDefaultValue": 0,
+              "fieldFlag": "ingest-storage.kafka.ingestion-concurrency",
+              "fieldType": "int"
+            },
+            {
+              "kind": "field",
+              "name": "ingestion_concurrency_batch_size",
+              "required": false,
+              "desc": "The number of timeseries to batch together before ingesting into TSDB. This is only used when -ingest-storage.kafka.ingestion-concurrency is greater than 0.",
+              "fieldValue": null,
+              "fieldDefaultValue": 150,
+              "fieldFlag": "ingest-storage.kafka.ingestion-concurrency-batch-size",
+              "fieldType": "int"
             }
           ],
           "fieldValue": null,

diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl
@@ -1363,6 +1363,10 @@ Usage of ./cmd/mimir/mimir:
     	How frequently a consumer should commit the consumed offset to Kafka. The last committed offset is used at startup to continue the consumption from where it was left. (default 1s)
   -ingest-storage.kafka.dial-timeout duration
     	The maximum time allowed to open a connection to a Kafka broker. (default 2s)
+  -ingest-storage.kafka.ingestion-concurrency int
+    	The number of concurrent ingestion streams to the TSDB head. Every tenant has their own set of streams. 0 to disable.
+  -ingest-storage.kafka.ingestion-concurrency-batch-size int
+    	The number of timeseries to batch together before ingesting into TSDB. This is only used when -ingest-storage.kafka.ingestion-concurrency is greater than 0. (default 150)
   -ingest-storage.kafka.last-produced-offset-poll-interval duration
     	How frequently to poll the last produced offset, used to enforce strong read consistency. (default 1s)
   -ingest-storage.kafka.last-produced-offset-retry-timeout duration

diff --git a/cmd/mimir/help.txt.tmpl b/cmd/mimir/help.txt.tmpl
@@ -411,6 +411,10 @@ Usage of ./cmd/mimir/mimir:
     	How frequently a consumer should commit the consumed offset to Kafka. The last committed offset is used at startup to continue the consumption from where it was left. (default 1s)
   -ingest-storage.kafka.dial-timeout duration
     	The maximum time allowed to open a connection to a Kafka broker. (default 2s)
+  -ingest-storage.kafka.ingestion-concurrency int
+    	The number of concurrent ingestion streams to the TSDB head. Every tenant has their own set of streams. 0 to disable.
+  -ingest-storage.kafka.ingestion-concurrency-batch-size int
+    	The number of timeseries to batch together before ingesting into TSDB. This is only used when -ingest-storage.kafka.ingestion-concurrency is greater than 0. (default 150)
   -ingest-storage.kafka.last-produced-offset-poll-interval duration
     	How frequently to poll the last produced offset, used to enforce strong read consistency. (default 1s)
   -ingest-storage.kafka.last-produced-offset-retry-timeout duration

diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md
@@ -3866,6 +3866,17 @@ kafka:
   # CLI flag: -ingest-storage.kafka.use-compressed-bytes-as-fetch-max-bytes
   [use_compressed_bytes_as_fetch_max_bytes: <boolean> | default = true]
 
+  # The number of concurrent ingestion streams to the TSDB head. Every tenant
+  # has their own set of streams. 0 to disable.
+  # CLI flag: -ingest-storage.kafka.ingestion-concurrency
+  [ingestion_concurrency: <int> | default = 0]
+
+  # The number of timeseries to batch together before ingesting into TSDB. This
+  # is only used when -ingest-storage.kafka.ingestion-concurrency is greater
+  # than 0.
+  # CLI flag: -ingest-storage.kafka.ingestion-concurrency-batch-size
+  [ingestion_concurrency_batch_size: <int> | default = 150]
+
 migration:
   # When both this option and ingest storage are enabled, distributors write to
   # both Kafka and ingesters. A write request is considered successful only when

diff --git a/operations/mimir-mixin/dashboards/writes.libsonnet b/operations/mimir-mixin/dashboards/writes.libsonnet
@@ -274,10 +274,10 @@ local filename = 'mimir-writes.json';
         ) +
         $.queryPanel(
           [
-            'histogram_avg(sum(rate(cortex_ingest_storage_reader_processing_time_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)],
-            'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_processing_time_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)],
-            'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_processing_time_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)],
-            'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_processing_time_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)],
+            'histogram_avg(sum(rate(cortex_ingest_storage_reader_records_processing_time_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)],
+            'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_records_processing_time_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)],
+            'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_records_processing_time_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)],
+            'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_records_processing_time_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)],
           ],
           [
             'avg',

diff --git a/pkg/storage/ingest/config.go b/pkg/storage/ingest/config.go
@@ -98,6 +98,9 @@ type KafkaConfig struct {
 	OngoingFetchConcurrency           int  `yaml:"ongoing_fetch_concurrency"`
 	OngoingRecordsPerFetch            int  `yaml:"ongoing_records_per_fetch"`
 	UseCompressedBytesAsFetchMaxBytes bool `yaml:"use_compressed_bytes_as_fetch_max_bytes"`
+
+	IngestionConcurrency          int `yaml:"ingestion_concurrency"`
+	IngestionConcurrencyBatchSize int `yaml:"ingestion_concurrency_batch_size"`
 }
 
 func (cfg *KafkaConfig) RegisterFlags(f *flag.FlagSet) {
@@ -138,6 +141,9 @@ func (cfg *KafkaConfig) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet)
 	f.IntVar(&cfg.OngoingFetchConcurrency, prefix+".ongoing-fetch-concurrency", 0, "The number of concurrent fetch requests that the ingester makes when reading data continuously from Kafka after startup. Is disabled unless "+prefix+".startup-fetch-concurrency is greater than 0. It must be greater than 0.")
 	f.IntVar(&cfg.OngoingRecordsPerFetch, prefix+".ongoing-records-per-fetch", 30, "The number of records per fetch request that the ingester makes when reading data continuously from Kafka after startup. Depends on "+prefix+".ongoing-fetch-concurrency being greater than 0.")
 	f.BoolVar(&cfg.UseCompressedBytesAsFetchMaxBytes, prefix+".use-compressed-bytes-as-fetch-max-bytes", true, "When enabled, the fetch request MaxBytes field is computed using the compressed size of previous records. When disabled, MaxBytes is computed using uncompressed bytes. Different Kafka implementations interpret MaxBytes differently.")
+
+	f.IntVar(&cfg.IngestionConcurrency, prefix+".ingestion-concurrency", 0, "The number of concurrent ingestion streams to the TSDB head. Every tenant has their own set of streams. 0 to disable.")
+	f.IntVar(&cfg.IngestionConcurrencyBatchSize, prefix+".ingestion-concurrency-batch-size", 150, "The number of timeseries to batch together before ingesting into TSDB. This is only used when -ingest-storage.kafka.ingestion-concurrency is greater than 0.")
 }
 
 func (cfg *KafkaConfig) Validate() error {