From 271e33a66461c32ed2b6dba5b17497c46dd01f16 Mon Sep 17 00:00:00 2001 From: Ben Ye Date: Mon, 11 Sep 2023 02:59:49 -0700 Subject: [PATCH] Optimize postings fetching by checking postings and series size (#6465) * optimize postings fetching by checking postings and series size Signed-off-by: Ben Ye * address some review comments Signed-off-by: Ben Ye * add acceptance test and fixed bug of skipping posting groups with add keys Signed-off-by: Ben Ye * add lazy postings param to block series clinet Signed-off-by: Ben Ye * switch to use block estimated max series size Signed-off-by: Ben Ye * added two more metrics Signed-off-by: Ben Ye --------- Signed-off-by: Ben Ye --- cmd/thanos/store.go | 5 + docs/components/store.md | 5 + pkg/block/block_test.go | 10 +- pkg/block/indexheader/binary_reader.go | 19 +- pkg/block/indexheader/header.go | 8 +- pkg/block/indexheader/header_test.go | 32 ++ pkg/block/indexheader/lazy_binary_reader.go | 13 + pkg/store/acceptance_test.go | 148 +++--- pkg/store/bucket.go | 265 ++++++---- pkg/store/bucket_test.go | 208 +++++--- pkg/store/lazy_postings.go | 272 +++++++++++ pkg/store/lazy_postings_test.go | 504 ++++++++++++++++++++ pkg/testutil/e2eutil/prometheus.go | 61 ++- test/e2e/store_gateway_test.go | 146 +++++- 14 files changed, 1449 insertions(+), 247 deletions(-) create mode 100644 pkg/store/lazy_postings.go create mode 100644 pkg/store/lazy_postings_test.go diff --git a/cmd/thanos/store.go b/cmd/thanos/store.go index 9ddfbd89b76..29ac6921a29 100644 --- a/cmd/thanos/store.go +++ b/cmd/thanos/store.go @@ -88,6 +88,7 @@ type storeConfig struct { reqLogConfig *extflag.PathOrContent lazyIndexReaderEnabled bool lazyIndexReaderIdleTimeout time.Duration + lazyExpandedPostingsEnabled bool } func (sc *storeConfig) registerFlag(cmd extkingpin.FlagClause) { @@ -182,6 +183,9 @@ func (sc *storeConfig) registerFlag(cmd extkingpin.FlagClause) { cmd.Flag("store.index-header-lazy-reader-idle-timeout", "If index-header lazy reader is enabled and this idle timeout setting is > 0, memory map-ed index-headers will be automatically released after 'idle timeout' inactivity."). Hidden().Default("5m").DurationVar(&sc.lazyIndexReaderIdleTimeout) + cmd.Flag("store.enable-lazy-expanded-postings", "If true, Store Gateway will estimate postings size and try to lazily expand postings if it downloads less data than expanding all postings."). + Default("false").BoolVar(&sc.lazyExpandedPostingsEnabled) + cmd.Flag("web.disable", "Disable Block Viewer UI.").Default("false").BoolVar(&sc.disableWeb) cmd.Flag("web.external-prefix", "Static prefix for all HTML links and redirect URLs in the bucket web UI interface. Actual endpoints are still served on / or the web.route-prefix. This allows thanos bucket web UI to be served behind a reverse proxy that strips a URL sub-path."). @@ -382,6 +386,7 @@ func runStore( } return conf.estimatedMaxChunkSize }), + store.WithLazyExpandedPostings(conf.lazyExpandedPostingsEnabled), } if conf.debugLogging { diff --git a/docs/components/store.md b/docs/components/store.md index ac0234f1df3..26b359f3267 100644 --- a/docs/components/store.md +++ b/docs/components/store.md @@ -176,6 +176,11 @@ Flags: If true, Store Gateway will lazy memory map index-header only once the block is required by a query. + --store.enable-lazy-expanded-postings + If true, Store Gateway will estimate postings + size and try to lazily expand postings if + it downloads less data than expanding all + postings. 
--store.grpc.downloaded-bytes-limit=0 Maximum amount of downloaded (either fetched or touched) bytes in a single diff --git a/pkg/block/block_test.go b/pkg/block/block_test.go index 12eb5eed84b..a2712705904 100644 --- a/pkg/block/block_test.go +++ b/pkg/block/block_test.go @@ -144,7 +144,7 @@ func TestUpload(t *testing.T) { testutil.Equals(t, 3, len(bkt.Objects())) testutil.Equals(t, 3727, len(bkt.Objects()[path.Join(b1.String(), ChunksDirname, "000001")])) testutil.Equals(t, 401, len(bkt.Objects()[path.Join(b1.String(), IndexFilename)])) - testutil.Equals(t, 567, len(bkt.Objects()[path.Join(b1.String(), MetaFilename)])) + testutil.Equals(t, 595, len(bkt.Objects()[path.Join(b1.String(), MetaFilename)])) // File stats are gathered. testutil.Equals(t, fmt.Sprintf(`{ @@ -184,7 +184,9 @@ func TestUpload(t *testing.T) { "rel_path": "meta.json" } ], - "index_stats": {} + "index_stats": { + "series_max_size": 16 + } } } `, b1.String(), b1.String()), string(bkt.Objects()[path.Join(b1.String(), MetaFilename)])) @@ -195,7 +197,7 @@ func TestUpload(t *testing.T) { testutil.Equals(t, 3, len(bkt.Objects())) testutil.Equals(t, 3727, len(bkt.Objects()[path.Join(b1.String(), ChunksDirname, "000001")])) testutil.Equals(t, 401, len(bkt.Objects()[path.Join(b1.String(), IndexFilename)])) - testutil.Equals(t, 567, len(bkt.Objects()[path.Join(b1.String(), MetaFilename)])) + testutil.Equals(t, 595, len(bkt.Objects()[path.Join(b1.String(), MetaFilename)])) } { // Upload with no external labels should be blocked. @@ -227,7 +229,7 @@ func TestUpload(t *testing.T) { testutil.Equals(t, 6, len(bkt.Objects())) testutil.Equals(t, 3727, len(bkt.Objects()[path.Join(b2.String(), ChunksDirname, "000001")])) testutil.Equals(t, 401, len(bkt.Objects()[path.Join(b2.String(), IndexFilename)])) - testutil.Equals(t, 546, len(bkt.Objects()[path.Join(b2.String(), MetaFilename)])) + testutil.Equals(t, 574, len(bkt.Objects()[path.Join(b2.String(), MetaFilename)])) } } diff --git a/pkg/block/indexheader/binary_reader.go b/pkg/block/indexheader/binary_reader.go index 1befe63a7f2..16ef73ac3b0 100644 --- a/pkg/block/indexheader/binary_reader.go +++ b/pkg/block/indexheader/binary_reader.go @@ -47,6 +47,8 @@ const ( postingLengthFieldSize = 4 ) +var NotFoundRange = index.Range{Start: -1, End: -1} + // The table gets initialized with sync.Once but may still cause a race // with any other use of the crc32 package anywhere. Thus we initialize it // before. @@ -747,13 +749,18 @@ func (r *BinaryReader) IndexVersion() (int, error) { return r.indexVersion, nil } +// PostingsOffsets implements Reader. +func (r *BinaryReader) PostingsOffsets(name string, values ...string) ([]index.Range, error) { + return r.postingsOffset(name, values...) +} + // TODO(bwplotka): Get advantage of multi value offset fetch. func (r *BinaryReader) PostingsOffset(name, value string) (index.Range, error) { rngs, err := r.postingsOffset(name, value) if err != nil { return index.Range{}, err } - if len(rngs) != 1 { + if len(rngs) != 1 || rngs[0] == NotFoundRange { return index.Range{}, NotFoundRangeErr } return rngs[0], nil @@ -801,6 +808,7 @@ func (r *BinaryReader) postingsOffset(name string, values ...string) ([]index.Ra valueIndex := 0 for valueIndex < len(values) && values[valueIndex] < e.offsets[0].value { // Discard values before the start. 
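+ // Emit a NotFoundRange placeholder rather than silently dropping the value, so the
+ // ranges returned by PostingsOffsets stay index-aligned with the requested values.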
+ rngs = append(rngs, NotFoundRange) valueIndex++ } @@ -811,6 +819,9 @@ func (r *BinaryReader) postingsOffset(name string, values ...string) ([]index.Ra i := sort.Search(len(e.offsets), func(i int) bool { return e.offsets[i].value >= wantedValue }) if i == len(e.offsets) { // We're past the end. + for len(rngs) < len(values) { + rngs = append(rngs, NotFoundRange) + } break } if i > 0 && e.offsets[i].value != wantedValue { @@ -858,6 +869,8 @@ func (r *BinaryReader) postingsOffset(name string, values ...string) ([]index.Ra // Record on the way if wanted value is equal to the current value. if string(value) == wantedValue { newSameRngs = append(newSameRngs, index.Range{Start: postingOffset + postingLengthFieldSize}) + } else { + rngs = append(rngs, NotFoundRange) } valueIndex++ if valueIndex == len(values) { @@ -877,6 +890,10 @@ func (r *BinaryReader) postingsOffset(name string, values ...string) ([]index.Ra } if valueIndex != len(values) && wantedValue <= e.offsets[i+1].value { + // Increment i when wanted value is same as next offset. + if wantedValue == e.offsets[i+1].value { + i++ + } // wantedValue is smaller or same as the next offset we know about, let's iterate further to add those. continue } diff --git a/pkg/block/indexheader/header.go b/pkg/block/indexheader/header.go index 8ecef33564d..d0b4141afd8 100644 --- a/pkg/block/indexheader/header.go +++ b/pkg/block/indexheader/header.go @@ -20,10 +20,16 @@ type Reader interface { // IndexVersion returns version of index. IndexVersion() (int, error) + // PostingsOffsets returns start and end offsets for postings for given name and values. + // Input values need to be sorted. + // If the requested label name doesn't exist, then no posting and error will be returned. + // If the requested label name exists, but some values don't exist, the corresponding index range + // will be set to -1 for both start and end. + PostingsOffsets(name string, value ...string) ([]index.Range, error) + // PostingsOffset returns start and end offsets of postings for given name and value. // The end offset might be bigger than the actual posting ending, but not larger than the whole index file. // NotFoundRangeErr is returned when no index can be found for given name and value. - // TODO(bwplotka): Move to PostingsOffsets(name string, value ...string) []index.Range and benchmark. PostingsOffset(name string, value string) (index.Range, error) // LookupSymbol returns string based on given reference. 
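The interface change above is easiest to read with a small consumer in mind. Below is a minimal sketch, not part of this patch, of how a caller might use the PostingsOffsets contract: one index.Range per requested value, with NotFoundRange marking values absent from the block. The helper name estimatePostingsCount is illustrative; the 4-byte posting entry size and the 4-byte #entries header follow the TSDB index format.

package example

import (
	"github.com/thanos-io/thanos/pkg/block/indexheader"
)

// estimatePostingsCount sums how many posting entries sit behind the requested
// label values, skipping values the block does not contain. Illustrative only.
func estimatePostingsCount(r indexheader.Reader, name string, values ...string) (int64, error) {
	rngs, err := r.PostingsOffsets(name, values...)
	if err != nil {
		return 0, err
	}
	var total int64
	for _, rng := range rngs {
		if rng == indexheader.NotFoundRange {
			continue // requested value has no postings in this block
		}
		// Subtract the 4-byte #entries field, then divide by 4 bytes per posting.
		total += (rng.End - rng.Start - 4) / 4
	}
	return total, nil
}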
diff --git a/pkg/block/indexheader/header_test.go b/pkg/block/indexheader/header_test.go index d0d7eb5f7dd..540e4d6c579 100644 --- a/pkg/block/indexheader/header_test.go +++ b/pkg/block/indexheader/header_test.go @@ -141,6 +141,38 @@ func TestReaders(t *testing.T) { testutil.Ok(t, err) testutil.Equals(t, []string(nil), vals) + // single value + rngs, err := br.PostingsOffsets("a", "9") + testutil.Ok(t, err) + for _, rng := range rngs { + testutil.Assert(t, rng.End > rng.Start) + } + + rngs, err = br.PostingsOffsets("a", "2", "3", "4", "5", "6", "7", "8", "9") + testutil.Ok(t, err) + for _, rng := range rngs { + testutil.Assert(t, rng.End > rng.Start) + } + + rngs, err = br.PostingsOffsets("a", "0") + testutil.Ok(t, err) + testutil.Assert(t, len(rngs) == 1) + testutil.Equals(t, NotFoundRange, rngs[0]) + + rngs, err = br.PostingsOffsets("a", "0", "10", "99") + testutil.Ok(t, err) + testutil.Assert(t, len(rngs) == 3) + for _, rng := range rngs { + testutil.Equals(t, NotFoundRange, rng) + } + + rngs, err = br.PostingsOffsets("a", "1", "10", "9") + testutil.Ok(t, err) + testutil.Assert(t, len(rngs) == 3) + testutil.Assert(t, rngs[0].End > rngs[0].Start) + testutil.Assert(t, rngs[2].End > rngs[2].Start) + testutil.Equals(t, NotFoundRange, rngs[1]) + // Regression tests for https://github.com/thanos-io/thanos/issues/2213. // Most of not existing value was working despite bug, except in certain unlucky cases // it was causing "invalid size" errors. diff --git a/pkg/block/indexheader/lazy_binary_reader.go b/pkg/block/indexheader/lazy_binary_reader.go index c3bee382c2f..451a79b6ee5 100644 --- a/pkg/block/indexheader/lazy_binary_reader.go +++ b/pkg/block/indexheader/lazy_binary_reader.go @@ -154,6 +154,19 @@ func (r *LazyBinaryReader) IndexVersion() (int, error) { return r.reader.IndexVersion() } +// PostingsOffsets implements Reader. +func (r *LazyBinaryReader) PostingsOffsets(name string, values ...string) ([]index.Range, error) { + r.readerMx.RLock() + defer r.readerMx.RUnlock() + + if err := r.load(); err != nil { + return nil, err + } + + r.usedAt.Store(time.Now().UnixNano()) + return r.reader.PostingsOffsets(name, values...) +} + // PostingsOffset implements Reader. 
func (r *LazyBinaryReader) PostingsOffset(name, value string) (index.Range, error) { r.readerMx.RLock() diff --git a/pkg/store/acceptance_test.go b/pkg/store/acceptance_test.go index ecc23f5aa34..f6a5ef55ec3 100644 --- a/pkg/store/acceptance_test.go +++ b/pkg/store/acceptance_test.go @@ -722,78 +722,86 @@ func testStoreAPIsAcceptance(t *testing.T, startStore func(t *testing.T, extLset func TestBucketStore_Acceptance(t *testing.T) { t.Cleanup(func() { custom.TolerantVerifyLeak(t) }) - testStoreAPIsAcceptance(t, func(tt *testing.T, extLset labels.Labels, appendFn func(app storage.Appender)) storepb.StoreServer { - tmpDir := tt.TempDir() - bktDir := filepath.Join(tmpDir, "bkt") - auxDir := filepath.Join(tmpDir, "aux") - metaDir := filepath.Join(tmpDir, "meta") - - testutil.Ok(tt, os.MkdirAll(metaDir, os.ModePerm)) - testutil.Ok(tt, os.MkdirAll(auxDir, os.ModePerm)) - - bkt, err := filesystem.NewBucket(bktDir) - testutil.Ok(tt, err) - tt.Cleanup(func() { testutil.Ok(tt, bkt.Close()) }) - - headOpts := tsdb.DefaultHeadOptions() - headOpts.ChunkDirRoot = tmpDir - headOpts.ChunkRange = 1000 - h, err := tsdb.NewHead(nil, nil, nil, nil, headOpts, nil) - testutil.Ok(tt, err) - tt.Cleanup(func() { testutil.Ok(tt, h.Close()) }) - logger := log.NewNopLogger() - - appendFn(h.Appender(context.Background())) - - if h.NumSeries() == 0 { - tt.Skip("Bucket Store cannot handle empty HEAD") - } - - id := createBlockFromHead(tt, auxDir, h) - - auxBlockDir := filepath.Join(auxDir, id.String()) - _, err = metadata.InjectThanos(log.NewNopLogger(), auxBlockDir, metadata.Thanos{ - Labels: extLset.Map(), - Downsample: metadata.ThanosDownsample{Resolution: 0}, - Source: metadata.TestSource, - }, nil) - testutil.Ok(tt, err) - - testutil.Ok(tt, block.Upload(context.Background(), logger, bkt, auxBlockDir, metadata.NoneFunc)) - testutil.Ok(tt, block.Upload(context.Background(), logger, bkt, auxBlockDir, metadata.NoneFunc)) - - chunkPool, err := NewDefaultChunkBytesPool(2e5) - testutil.Ok(tt, err) + for _, lazyExpandedPosting := range []bool{false, true} { + testStoreAPIsAcceptance(t, func(tt *testing.T, extLset labels.Labels, appendFn func(app storage.Appender)) storepb.StoreServer { + tmpDir := tt.TempDir() + bktDir := filepath.Join(tmpDir, "bkt") + auxDir := filepath.Join(tmpDir, "aux") + metaDir := filepath.Join(tmpDir, "meta") + + testutil.Ok(tt, os.MkdirAll(metaDir, os.ModePerm)) + testutil.Ok(tt, os.MkdirAll(auxDir, os.ModePerm)) + + bkt, err := filesystem.NewBucket(bktDir) + testutil.Ok(tt, err) + tt.Cleanup(func() { testutil.Ok(tt, bkt.Close()) }) + + headOpts := tsdb.DefaultHeadOptions() + headOpts.ChunkDirRoot = tmpDir + headOpts.ChunkRange = 1000 + h, err := tsdb.NewHead(nil, nil, nil, nil, headOpts, nil) + testutil.Ok(tt, err) + tt.Cleanup(func() { testutil.Ok(tt, h.Close()) }) + logger := log.NewNopLogger() + + appendFn(h.Appender(context.Background())) + + if h.NumSeries() == 0 { + tt.Skip("Bucket Store cannot handle empty HEAD") + } - metaFetcher, err := block.NewMetaFetcher(logger, 20, objstore.WithNoopInstr(bkt), metaDir, nil, []block.MetadataFilter{ - block.NewTimePartitionMetaFilter(allowAllFilterConf.MinTime, allowAllFilterConf.MaxTime), + id := createBlockFromHead(tt, auxDir, h) + + auxBlockDir := filepath.Join(auxDir, id.String()) + meta, err := metadata.ReadFromDir(auxBlockDir) + testutil.Ok(t, err) + stats, err := block.GatherIndexHealthStats(logger, filepath.Join(auxBlockDir, block.IndexFilename), meta.MinTime, meta.MaxTime) + testutil.Ok(t, err) + _, err = 
metadata.InjectThanos(log.NewNopLogger(), auxBlockDir, metadata.Thanos{ + Labels: extLset.Map(), + Downsample: metadata.ThanosDownsample{Resolution: 0}, + Source: metadata.TestSource, + IndexStats: metadata.IndexStats{SeriesMaxSize: stats.SeriesMaxSize, ChunkMaxSize: stats.ChunkMaxSize}, + }, nil) + testutil.Ok(tt, err) + + testutil.Ok(tt, block.Upload(context.Background(), logger, bkt, auxBlockDir, metadata.NoneFunc)) + testutil.Ok(tt, block.Upload(context.Background(), logger, bkt, auxBlockDir, metadata.NoneFunc)) + + chunkPool, err := NewDefaultChunkBytesPool(2e5) + testutil.Ok(tt, err) + + metaFetcher, err := block.NewMetaFetcher(logger, 20, objstore.WithNoopInstr(bkt), metaDir, nil, []block.MetadataFilter{ + block.NewTimePartitionMetaFilter(allowAllFilterConf.MinTime, allowAllFilterConf.MaxTime), + }) + testutil.Ok(tt, err) + + bucketStore, err := NewBucketStore( + objstore.WithNoopInstr(bkt), + metaFetcher, + "", + NewChunksLimiterFactory(10e6), + NewSeriesLimiterFactory(10e6), + NewBytesLimiterFactory(10e6), + NewGapBasedPartitioner(PartitionerMaxGapSize), + 20, + true, + DefaultPostingOffsetInMemorySampling, + false, + false, + 1*time.Minute, + WithChunkPool(chunkPool), + WithFilterConfig(allowAllFilterConf), + WithLazyExpandedPostings(lazyExpandedPosting), + ) + testutil.Ok(tt, err) + tt.Cleanup(func() { testutil.Ok(tt, bucketStore.Close()) }) + + testutil.Ok(tt, bucketStore.SyncBlocks(context.Background())) + + return bucketStore }) - testutil.Ok(tt, err) - - bucketStore, err := NewBucketStore( - objstore.WithNoopInstr(bkt), - metaFetcher, - "", - NewChunksLimiterFactory(10e6), - NewSeriesLimiterFactory(10e6), - NewBytesLimiterFactory(10e6), - NewGapBasedPartitioner(PartitionerMaxGapSize), - 20, - true, - DefaultPostingOffsetInMemorySampling, - false, - false, - 1*time.Minute, - WithChunkPool(chunkPool), - WithFilterConfig(allowAllFilterConf), - ) - testutil.Ok(tt, err) - tt.Cleanup(func() { testutil.Ok(tt, bucketStore.Close()) }) - - testutil.Ok(tt, bucketStore.SyncBlocks(context.Background())) - - return bucketStore - }) + } } func TestPrometheusStore_Acceptance(t *testing.T) { diff --git a/pkg/store/bucket.go b/pkg/store/bucket.go index 027122447ef..4a8eae45727 100644 --- a/pkg/store/bucket.go +++ b/pkg/store/bucket.go @@ -138,6 +138,10 @@ type bucketStoreMetrics struct { chunkRefetches prometheus.Counter emptyPostingCount prometheus.Counter + lazyExpandedPostingsCount prometheus.Counter + lazyExpandedPostingSizeBytes prometheus.Counter + lazyExpandedPostingSeriesOverfetchedSizeBytes prometheus.Counter + cachedPostingsCompressions *prometheus.CounterVec cachedPostingsCompressionErrors *prometheus.CounterVec cachedPostingsCompressionTimeSeconds *prometheus.CounterVec @@ -302,6 +306,21 @@ func newBucketStoreMetrics(reg prometheus.Registerer) *bucketStoreMetrics { Help: "Total number of empty postings when fetching block series.", }) + m.lazyExpandedPostingsCount = promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "thanos_bucket_store_lazy_expanded_postings_total", + Help: "Total number of times when lazy expanded posting optimization applies.", + }) + + m.lazyExpandedPostingSizeBytes = promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "thanos_bucket_store_lazy_expanded_posting_size_bytes_total", + Help: "Total number of lazy posting group size in bytes.", + }) + + m.lazyExpandedPostingSeriesOverfetchedSizeBytes = promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "thanos_bucket_store_lazy_expanded_posting_series_overfetched_size_bytes_total", + 
Help: "Total number of series size in bytes overfetched due to posting lazy expansion.", + }) + return &m } @@ -366,6 +385,8 @@ type BucketStore struct { enableChunkHashCalculation bool + enabledLazyExpandedPostings bool + bmtx sync.Mutex labelNamesSet stringset.Set @@ -473,6 +494,13 @@ func WithBlockEstimatedMaxChunkFunc(f BlockEstimator) BucketStoreOption { } } +// WithLazyExpandedPostings enables lazy expanded postings. +func WithLazyExpandedPostings(enabled bool) BucketStoreOption { + return func(s *BucketStore) { + s.enabledLazyExpandedPostings = enabled + } +} + // NewBucketStore creates a new bucket backed store that implements the store API against // an object store bucket. It is optimized to work against high latency backends. func NewBucketStore( @@ -898,20 +926,27 @@ type blockSeriesClient struct { chunksLimiter ChunksLimiter bytesLimiter BytesLimiter + lazyExpandedPostingEnabled bool + lazyExpandedPostingsCount prometheus.Counter + lazyExpandedPostingSizeBytes prometheus.Counter + lazyExpandedPostingSeriesOverfetchedSizeBytes prometheus.Counter + skipChunks bool shardMatcher *storepb.ShardMatcher + blockMatchers []*labels.Matcher calculateChunkHash bool chunkFetchDuration prometheus.Histogram // Internal state. - i uint64 - postings []storage.SeriesRef - chkMetas []chunks.Meta - lset labels.Labels - symbolizedLset []symbolizedLabel - entries []seriesEntry - hasMorePostings bool - batchSize int + i uint64 + lazyPostings *lazyExpandedPostings + expandedPostings []storage.SeriesRef + chkMetas []chunks.Meta + lset labels.Labels + symbolizedLset []symbolizedLabel + entries []seriesEntry + hasMorePostings bool + batchSize int } func newBlockSeriesClient( @@ -921,11 +956,16 @@ func newBlockSeriesClient( req *storepb.SeriesRequest, limiter ChunksLimiter, bytesLimiter BytesLimiter, + blockMatchers []*labels.Matcher, shardMatcher *storepb.ShardMatcher, calculateChunkHash bool, batchSize int, chunkFetchDuration prometheus.Histogram, extLsetToRemove map[string]struct{}, + lazyExpandedPostingEnabled bool, + lazyExpandedPostingsCount prometheus.Counter, + lazyExpandedPostingSizeBytes prometheus.Counter, + lazyExpandedPostingSeriesOverfetchedSizeBytes prometheus.Counter, ) *blockSeriesClient { var chunkr *bucketChunkReader if !req.SkipChunks { @@ -952,8 +992,14 @@ func newBlockSeriesClient( skipChunks: req.SkipChunks, chunkFetchDuration: chunkFetchDuration, + lazyExpandedPostingEnabled: lazyExpandedPostingEnabled, + lazyExpandedPostingsCount: lazyExpandedPostingsCount, + lazyExpandedPostingSizeBytes: lazyExpandedPostingSizeBytes, + lazyExpandedPostingSeriesOverfetchedSizeBytes: lazyExpandedPostingSeriesOverfetchedSizeBytes, + loadAggregates: req.Aggregates, shardMatcher: shardMatcher, + blockMatchers: blockMatchers, calculateChunkHash: calculateChunkHash, hasMorePostings: true, batchSize: batchSize, @@ -996,22 +1042,30 @@ func (b *blockSeriesClient) ExpandPostings( matchers sortedMatchers, seriesLimiter SeriesLimiter, ) error { - ps, err := b.indexr.ExpandedPostings(b.ctx, matchers, b.bytesLimiter) + ps, err := b.indexr.ExpandedPostings(b.ctx, matchers, b.bytesLimiter, b.lazyExpandedPostingEnabled, b.lazyExpandedPostingSizeBytes) if err != nil { return errors.Wrap(err, "expanded matching posting") } - if len(ps) == 0 { + if ps == nil || len(ps.postings) == 0 { + b.lazyPostings = emptyLazyPostings return nil } + b.lazyPostings = ps - if err := seriesLimiter.Reserve(uint64(len(ps))); err != nil { + // If lazy expanded posting enabled, it is possible to fetch more series + // so easier to hit 
the series limit. + if err := seriesLimiter.Reserve(uint64(len(ps.postings))); err != nil { return httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded series limit: %s", err) } - b.postings = ps - if b.batchSize > len(ps) { - b.batchSize = len(ps) + if b.batchSize > len(ps.postings) { + b.batchSize = len(ps.postings) + } + if b.lazyPostings.lazyExpanded() { + // Assume lazy expansion could cut actual expanded postings length to 50%. + b.expandedPostings = make([]storage.SeriesRef, 0, len(b.lazyPostings.postings)/2) + b.lazyExpandedPostingsCount.Inc() } b.entries = make([]seriesEntry, 0, b.batchSize) return nil @@ -1043,14 +1097,26 @@ func (b *blockSeriesClient) Recv() (*storepb.SeriesResponse, error) { func (b *blockSeriesClient) nextBatch() error { start := b.i end := start + SeriesBatchSize - if end > uint64(len(b.postings)) { - end = uint64(len(b.postings)) + if end > uint64(len(b.lazyPostings.postings)) { + end = uint64(len(b.lazyPostings.postings)) } b.i = end - postingsBatch := b.postings[start:end] + postingsBatch := b.lazyPostings.postings[start:end] if len(postingsBatch) == 0 { b.hasMorePostings = false + if b.lazyPostings.lazyExpanded() { + v, err := b.indexr.IndexVersion() + if err != nil { + return errors.Wrap(err, "get index version") + } + if v >= 2 { + for i := range b.expandedPostings { + b.expandedPostings[i] = b.expandedPostings[i] / 16 + } + } + b.indexr.storeExpandedPostingsToCache(b.blockMatchers, index.NewListPostings(b.expandedPostings), len(b.expandedPostings)) + } return nil } @@ -1064,6 +1130,7 @@ func (b *blockSeriesClient) nextBatch() error { } b.entries = b.entries[:0] +OUTER: for i := 0; i < len(postingsBatch); i++ { if err := b.ctx.Err(); err != nil { return err @@ -1080,6 +1147,19 @@ func (b *blockSeriesClient) nextBatch() error { return errors.Wrap(err, "Lookup labels symbols") } + for _, matcher := range b.lazyPostings.matchers { + val := b.lset.Get(matcher.Name) + if !matcher.Matches(val) { + // Series not matched means series we overfetched due to lazy posting expansion. + seriesBytes := b.indexr.loadedSeries[postingsBatch[i]] + b.lazyExpandedPostingSeriesOverfetchedSizeBytes.Add(float64(len(seriesBytes))) + continue OUTER + } + } + if b.lazyPostings.lazyExpanded() { + b.expandedPostings = append(b.expandedPostings, postingsBatch[i]) + } + completeLabelset := labelpb.ExtendSortedLabels(b.lset, b.extLset) if b.extLsetToRemove != nil { completeLabelset = rmLabels(completeLabelset, b.extLsetToRemove) @@ -1318,7 +1398,8 @@ func (s *BucketStore) Series(req *storepb.SeriesRequest, seriesSrv storepb.Store if !ok { continue } - + // Sort matchers to make sure we generate the same cache key + // when fetching expanded postings. 
sortedBlockMatchers := newSortedMatchers(blockMatchers) blocks := bs.getFor(req.MinTime, req.MaxTime, req.MaxResolutionWindow, reqBlockMatchers) @@ -1345,11 +1426,16 @@ func (s *BucketStore) Series(req *storepb.SeriesRequest, seriesSrv storepb.Store req, chunksLimiter, bytesLimiter, + sortedBlockMatchers, shardMatcher, s.enableChunkHashCalculation, s.seriesBatchSize, s.metrics.chunkFetchDuration, extLsetToRemove, + s.enabledLazyExpandedPostings, + s.metrics.lazyExpandedPostingsCount, + s.metrics.lazyExpandedPostingSizeBytes, + s.metrics.lazyExpandedPostingSeriesOverfetchedSizeBytes, ) defer blockClient.Close() @@ -1369,7 +1455,10 @@ func (s *BucketStore) Series(req *storepb.SeriesRequest, seriesSrv storepb.Store mtx.Unlock() } - if err := blockClient.ExpandPostings(sortedBlockMatchers, seriesLimiter); err != nil { + if err := blockClient.ExpandPostings( + sortedBlockMatchers, + seriesLimiter, + ); err != nil { onClose() span.Finish() return errors.Wrapf(err, "fetch postings for block %s", blk.meta.ULID) @@ -1643,11 +1732,16 @@ func (s *BucketStore) LabelNames(ctx context.Context, req *storepb.LabelNamesReq seriesReq, nil, bytesLimiter, + reqSeriesMatchersNoExtLabels, nil, true, SeriesBatchSize, s.metrics.chunkFetchDuration, nil, + s.enabledLazyExpandedPostings, + s.metrics.lazyExpandedPostingsCount, + s.metrics.lazyExpandedPostingSizeBytes, + s.metrics.lazyExpandedPostingSeriesOverfetchedSizeBytes, ) defer blockClient.Close() @@ -1871,11 +1965,16 @@ func (s *BucketStore) LabelValues(ctx context.Context, req *storepb.LabelValuesR seriesReq, nil, bytesLimiter, + reqSeriesMatchersNoExtLabels, nil, true, SeriesBatchSize, s.metrics.chunkFetchDuration, nil, + s.enabledLazyExpandedPostings, + s.metrics.lazyExpandedPostingsCount, + s.metrics.lazyExpandedPostingSizeBytes, + s.metrics.lazyExpandedPostingSeriesOverfetchedSizeBytes, ) defer blockClient.Close() @@ -2275,6 +2374,8 @@ type bucketIndexReader struct { mtx sync.Mutex loadedSeries map[storage.SeriesRef][]byte + + indexVersion int } func newBucketIndexReader(block *bucketBlock) *bucketIndexReader { @@ -2288,6 +2389,20 @@ func newBucketIndexReader(block *bucketBlock) *bucketIndexReader { } return r } + +// IndexVersion caches the index header version. +func (r *bucketIndexReader) IndexVersion() (int, error) { + if r.indexVersion != 0 { + return r.indexVersion, nil + } + v, err := r.block.indexHeaderReader.IndexVersion() + if err != nil { + return 0, err + } + r.indexVersion = v + return v, nil +} + func (r *bucketIndexReader) reset() { r.loadedSeries = map[storage.SeriesRef][]byte{} } @@ -2301,7 +2416,7 @@ func (r *bucketIndexReader) reset() { // Reminder: A posting is a reference (represented as a uint64) to a series reference, which in turn points to the first // chunk where the series contains the matching label-value pair for a given block of data. Postings can be fetched by // single label name=value. -func (r *bucketIndexReader) ExpandedPostings(ctx context.Context, ms sortedMatchers, bytesLimiter BytesLimiter) ([]storage.SeriesRef, error) { +func (r *bucketIndexReader) ExpandedPostings(ctx context.Context, ms sortedMatchers, bytesLimiter BytesLimiter, lazyExpandedPostingEnabled bool, lazyExpandedPostingSizeBytes prometheus.Counter) (*lazyExpandedPostings, error) { // Shortcut the case of `len(postingGroups) == 0`. It will only happen when no // matchers specified, and we don't need to fetch expanded postings from cache. 
if len(ms) == 0 { @@ -2313,12 +2428,11 @@ func (r *bucketIndexReader) ExpandedPostings(ctx context.Context, ms sortedMatch return nil, err } if hit { - return postings, nil + return newLazyExpandedPostings(postings), nil } var ( allRequested = false hasAdds = false - keys []labels.Label ) postingGroups, err := matchersToPostingGroups(ctx, r.block.indexHeaderReader.LabelValues, ms) @@ -2329,83 +2443,50 @@ func (r *bucketIndexReader) ExpandedPostings(ctx context.Context, ms sortedMatch r.storeExpandedPostingsToCache(ms, index.EmptyPostings(), 0) return nil, nil } + i := 0 for _, pg := range postingGroups { allRequested = allRequested || pg.addAll hasAdds = hasAdds || len(pg.addKeys) > 0 - // Postings returned by fetchPostings will be in the same order as keys - // so it's important that we iterate them in the same order later. - // We don't have any other way of pairing keys and fetched postings. - for _, key := range pg.addKeys { - keys = append(keys, labels.Label{Name: pg.name, Value: key}) - } - for _, key := range pg.removeKeys { - keys = append(keys, labels.Label{Name: pg.name, Value: key}) + // If a posting group doesn't have any keys, like posting group created + // from `=~".*"`, we don't have to keep the posting group as long as we + // keep track of whether we need to add all postings or not. + if len(pg.addKeys) == 0 && len(pg.removeKeys) == 0 { + continue } + postingGroups[i] = pg + i++ } + postingGroups = postingGroups[:i] + addAllPostings := allRequested && !hasAdds // We only need special All postings if there are no other adds. If there are, we can skip fetching // special All postings completely. - if allRequested && !hasAdds { + if addAllPostings { // add group with label to fetch "special All postings". name, value := index.AllPostingsKey() - allPostingsLabel := labels.Label{Name: name, Value: value} - postingGroups = append(postingGroups, newPostingGroup(true, name, []string{value}, nil)) - keys = append(keys, allPostingsLabel) } - fetchedPostings, closeFns, err := r.fetchPostings(ctx, keys, bytesLimiter) - defer func() { - for _, closeFn := range closeFns { - closeFn() - } - }() + ps, err := fetchLazyExpandedPostings(ctx, postingGroups, r, bytesLimiter, addAllPostings, lazyExpandedPostingEnabled, lazyExpandedPostingSizeBytes) if err != nil { - return nil, errors.Wrap(err, "get postings") - } - - // Get "add" and "remove" postings from groups. We iterate over postingGroups and their keys - // again, and this is exactly the same order as before (when building the groups), so we can simply - // use one incrementing index to fetch postings from returned slice. - postingIndex := 0 - - var groupAdds, groupRemovals []index.Postings - for _, g := range postingGroups { - // We cannot add empty set to groupAdds, since they are intersected. 
- if len(g.addKeys) > 0 { - toMerge := make([]index.Postings, 0, len(g.addKeys)) - for _, l := range g.addKeys { - toMerge = append(toMerge, checkNilPosting(g.name, l, fetchedPostings[postingIndex])) - postingIndex++ - } - - groupAdds = append(groupAdds, index.Merge(toMerge...)) - } - - for _, l := range g.removeKeys { - groupRemovals = append(groupRemovals, checkNilPosting(g.name, l, fetchedPostings[postingIndex])) - postingIndex++ - } + return nil, errors.Wrap(err, "fetch and expand postings") } - - result := index.Without(index.Intersect(groupAdds...), index.Merge(groupRemovals...)) - ps, err := ExpandPostingsWithContext(ctx, result) - if err != nil { - return nil, errors.Wrap(err, "expand") + // If postings still have matchers to be applied lazily, cache expanded postings after filtering series so skip here. + if !ps.lazyExpanded() { + r.storeExpandedPostingsToCache(ms, index.NewListPostings(ps.postings), len(ps.postings)) } - r.storeExpandedPostingsToCache(ms, index.NewListPostings(ps), len(ps)) - if len(ps) > 0 { + if len(ps.postings) > 0 { // As of version two all series entries are 16 byte padded. All references // we get have to account for that to get the correct offset. - version, err := r.block.indexHeaderReader.IndexVersion() + version, err := r.IndexVersion() if err != nil { return nil, errors.Wrap(err, "get index version") } if version >= 2 { - for i, id := range ps { - ps[i] = id * 16 + for i, id := range ps.postings { + ps.postings[i] = id * 16 } } } @@ -2428,22 +2509,26 @@ func ExpandPostingsWithContext(ctx context.Context, p index.Postings) (res []sto // If addAll is not set: Merge of postings for "addKeys" labels minus postings for removeKeys labels // This computation happens in ExpandedPostings. type postingGroup struct { - addAll bool - name string - addKeys []string - removeKeys []string + addAll bool + name string + matchers []*labels.Matcher + addKeys []string + removeKeys []string + cardinality int64 + lazy bool } func newPostingGroup(addAll bool, name string, addKeys, removeKeys []string) *postingGroup { return &postingGroup{ - addAll: addAll, name: name, + addAll: addAll, addKeys: addKeys, removeKeys: removeKeys, } } -func (pg postingGroup) merge(other *postingGroup) *postingGroup { +// mergeKeys merges keys from two posting groups and ignores other fields. +func (pg postingGroup) mergeKeys(other *postingGroup) *postingGroup { if other == nil { return &pg } @@ -2539,12 +2624,16 @@ func checkNilPosting(name, value string, p index.Postings) index.Postings { } func matchersToPostingGroups(ctx context.Context, lvalsFn func(name string) ([]string, error), ms []*labels.Matcher) ([]*postingGroup, error) { - matchersMap := make(map[string][]*labels.Matcher) + matchersMap := make(map[string]map[string]*labels.Matcher) for _, m := range ms { - matchersMap[m.Name] = append(matchersMap[m.Name], m) + m := m + if _, ok := matchersMap[m.Name]; !ok { + matchersMap[m.Name] = make(map[string]*labels.Matcher) + } + matchersMap[m.Name][m.String()] = m } - pgs := make([]*postingGroup, 0) + pgs := make([]*postingGroup, 0, len(matchersMap)) // NOTE: Derived from tsdb.PostingsForMatchers. for _, values := range matchersMap { var ( @@ -2555,8 +2644,9 @@ func matchersToPostingGroups(ctx context.Context, lvalsFn func(name string) ([]s valuesCached bool ) lvalsFunc := lvalsFn + matchers := make([]*labels.Matcher, 0, len(vals)) // Merge PostingGroups with the same matcher into 1 to - // avoid fetching duplicate postings. + // avoid fetching duplicate postings. 
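+ // matchersMap is keyed by matcher.String(), so identical matchers for the same label
+ // name have already been collapsed into a single entry before this loop runs.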
for _, val := range values { pg, vals, err = toPostingGroup(ctx, lvalsFunc, val) if err != nil { @@ -2579,7 +2669,7 @@ func matchersToPostingGroups(ctx context.Context, lvalsFn func(name string) ([]s if mergedPG == nil { mergedPG = pg } else { - mergedPG = mergedPG.merge(pg) + mergedPG = mergedPG.mergeKeys(pg) } // If this groups adds nothing, it's an empty group. We can shortcut this, since intersection with empty @@ -2588,7 +2678,10 @@ func matchersToPostingGroups(ctx context.Context, lvalsFn func(name string) ([]s if !mergedPG.addAll && len(mergedPG.addKeys) == 0 { return nil, nil } + matchers = append(matchers, val) } + // Set and sort matchers to be used when picking up posting fetch strategy. + mergedPG.matchers = newSortedMatchers(matchers) pgs = append(pgs, mergedPG) } slices.SortFunc(pgs, func(a, b *postingGroup) bool { diff --git a/pkg/store/bucket_test.go b/pkg/store/bucket_test.go index e13afd65183..82d42dd9d71 100644 --- a/pkg/store/bucket_test.go +++ b/pkg/store/bucket_test.go @@ -31,6 +31,7 @@ import ( "github.com/leanovate/gopter/prop" "github.com/oklog/ulid" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" promtest "github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/model/relabel" @@ -1099,20 +1100,27 @@ func uploadTestBlock(t testing.TB, tmpDir string, bkt objstore.Bucket, series in }() logger := log.NewNopLogger() + ctx := context.Background() - appendTestData(t, h.Appender(context.Background()), series) + appendTestData(t, h.Appender(ctx), series) - testutil.Ok(t, os.MkdirAll(filepath.Join(tmpDir, "tmp"), os.ModePerm)) - id := createBlockFromHead(t, filepath.Join(tmpDir, "tmp"), h) + dir := filepath.Join(tmpDir, "tmp") + testutil.Ok(t, os.MkdirAll(dir, os.ModePerm)) + id := createBlockFromHead(t, dir, h) + bdir := filepath.Join(dir, id.String()) + meta, err := metadata.ReadFromDir(bdir) + testutil.Ok(t, err) + stats, err := block.GatherIndexHealthStats(logger, filepath.Join(bdir, block.IndexFilename), meta.MinTime, meta.MaxTime) + testutil.Ok(t, err) _, err = metadata.InjectThanos(log.NewNopLogger(), filepath.Join(tmpDir, "tmp", id.String()), metadata.Thanos{ Labels: labels.Labels{{Name: "ext1", Value: "1"}}.Map(), Downsample: metadata.ThanosDownsample{Resolution: 0}, Source: metadata.TestSource, + IndexStats: metadata.IndexStats{SeriesMaxSize: stats.SeriesMaxSize, ChunkMaxSize: stats.ChunkMaxSize}, }, nil) testutil.Ok(t, err) - testutil.Ok(t, block.Upload(context.Background(), logger, bkt, filepath.Join(tmpDir, "tmp", id.String()), metadata.NoneFunc)) - testutil.Ok(t, block.Upload(context.Background(), logger, bkt, filepath.Join(tmpDir, "tmp", id.String()), metadata.NoneFunc)) + testutil.Ok(t, block.Upload(ctx, logger, bkt, bdir, metadata.NoneFunc)) return id } @@ -1213,6 +1221,7 @@ func benchmarkExpandedPostings( {`uniq=~"9|random-shuffled-values|1"`, []*labels.Matcher{iRegexBigValueSet}, bigValueSetSize}, } + dummyCounter := promauto.NewCounter(prometheus.CounterOpts{Name: "test"}) for _, c := range cases { t.Run(c.name, func(t testutil.TB) { b := &bucketBlock{ @@ -1229,9 +1238,9 @@ func benchmarkExpandedPostings( t.ResetTimer() for i := 0; i < t.N(); i++ { - p, err := indexr.ExpandedPostings(context.Background(), newSortedMatchers(c.matchers), NewBytesLimiterFactory(0)(nil)) + p, err := indexr.ExpandedPostings(context.Background(), newSortedMatchers(c.matchers), NewBytesLimiterFactory(0)(nil), false, dummyCounter) 
testutil.Ok(t, err) - testutil.Equals(t, c.expectedLen, len(p)) + testutil.Equals(t, c.expectedLen, len(p.postings)) } }) } @@ -1262,9 +1271,11 @@ func TestExpandedPostingsEmptyPostings(t *testing.T) { matcher1 := labels.MustNewMatcher(labels.MatchEqual, "j", "foo") // Match nothing. matcher2 := labels.MustNewMatcher(labels.MatchRegexp, "i", "500.*") - ps, err := indexr.ExpandedPostings(context.Background(), newSortedMatchers([]*labels.Matcher{matcher1, matcher2}), NewBytesLimiterFactory(0)(nil)) + ctx := context.Background() + dummyCounter := promauto.With(prometheus.NewRegistry()).NewCounter(prometheus.CounterOpts{Name: "test"}) + ps, err := indexr.ExpandedPostings(ctx, newSortedMatchers([]*labels.Matcher{matcher1, matcher2}), NewBytesLimiterFactory(0)(nil), false, dummyCounter) testutil.Ok(t, err) - testutil.Equals(t, len(ps), 0) + testutil.Equals(t, ps, (*lazyExpandedPostings)(nil)) // Make sure even if a matcher doesn't match any postings, we still cache empty expanded postings. testutil.Equals(t, 1, indexr.stats.cachedPostingsCompressions) } @@ -1272,21 +1283,28 @@ func TestExpandedPostingsEmptyPostings(t *testing.T) { func TestBucketSeries(t *testing.T) { tb := testutil.NewTB(t) storetestutil.RunSeriesInterestingCases(tb, 200e3, 200e3, func(t testutil.TB, samplesPerSeries, series int) { - benchBucketSeries(t, chunkenc.ValFloat, false, samplesPerSeries, series, 1) + benchBucketSeries(t, chunkenc.ValFloat, false, false, samplesPerSeries, series, 1) + }) +} + +func TestBucketSeriesLazyExpandedPostings(t *testing.T) { + tb := testutil.NewTB(t) + storetestutil.RunSeriesInterestingCases(tb, 200e3, 200e3, func(t testutil.TB, samplesPerSeries, series int) { + benchBucketSeries(t, chunkenc.ValFloat, false, true, samplesPerSeries, series, 1) }) } func TestBucketHistogramSeries(t *testing.T) { tb := testutil.NewTB(t) storetestutil.RunSeriesInterestingCases(tb, 200e3, 200e3, func(t testutil.TB, samplesPerSeries, series int) { - benchBucketSeries(t, chunkenc.ValHistogram, false, samplesPerSeries, series, 1) + benchBucketSeries(t, chunkenc.ValHistogram, false, false, samplesPerSeries, series, 1) }) } func TestBucketSkipChunksSeries(t *testing.T) { tb := testutil.NewTB(t) storetestutil.RunSeriesInterestingCases(tb, 200e3, 200e3, func(t testutil.TB, samplesPerSeries, series int) { - benchBucketSeries(t, chunkenc.ValFloat, true, samplesPerSeries, series, 1) + benchBucketSeries(t, chunkenc.ValFloat, true, false, samplesPerSeries, series, 1) }) } @@ -1294,7 +1312,7 @@ func BenchmarkBucketSeries(b *testing.B) { tb := testutil.NewTB(b) // 10e6 samples = ~1736 days with 15s scrape storetestutil.RunSeriesInterestingCases(tb, 10e6, 10e5, func(t testutil.TB, samplesPerSeries, series int) { - benchBucketSeries(t, chunkenc.ValFloat, false, samplesPerSeries, series, 1/100e6, 1/10e4, 1) + benchBucketSeries(t, chunkenc.ValFloat, false, false, samplesPerSeries, series, 1/100e6, 1/10e4, 1) }) } @@ -1302,11 +1320,11 @@ func BenchmarkBucketSkipChunksSeries(b *testing.B) { tb := testutil.NewTB(b) // 10e6 samples = ~1736 days with 15s scrape storetestutil.RunSeriesInterestingCases(tb, 10e6, 10e5, func(t testutil.TB, samplesPerSeries, series int) { - benchBucketSeries(t, chunkenc.ValFloat, true, samplesPerSeries, series, 1/100e6, 1/10e4, 1) + benchBucketSeries(t, chunkenc.ValFloat, true, false, samplesPerSeries, series, 1/100e6, 1/10e4, 1) }) } -func benchBucketSeries(t testutil.TB, sampleType chunkenc.ValueType, skipChunk bool, samplesPerSeries, totalSeries int, requestedRatios ...float64) { +func benchBucketSeries(t 
testutil.TB, sampleType chunkenc.ValueType, skipChunk, lazyExpandedPostings bool, samplesPerSeries, totalSeries int, requestedRatios ...float64) { const numOfBlocks = 4 tmpDir := t.TempDir() @@ -1322,12 +1340,6 @@ func benchBucketSeries(t testutil.TB, sampleType chunkenc.ValueType, skipChunk b ) extLset := labels.Labels{{Name: "ext1", Value: "1"}} - thanosMeta := metadata.Thanos{ - Labels: extLset.Map(), - Downsample: metadata.ThanosDownsample{Resolution: 0}, - Source: metadata.TestSource, - } - blockDir := filepath.Join(tmpDir, "tmp") samplesPerSeriesPerBlock := samplesPerSeries / numOfBlocks @@ -1355,19 +1367,33 @@ func benchBucketSeries(t testutil.TB, sampleType chunkenc.ValueType, skipChunk b }) id := createBlockFromHead(t, blockDir, head) testutil.Ok(t, head.Close()) + blockIDDir := filepath.Join(blockDir, id.String()) + meta, err := metadata.ReadFromDir(blockIDDir) + testutil.Ok(t, err) + stats, err := block.GatherIndexHealthStats(logger, filepath.Join(blockIDDir, block.IndexFilename), meta.MinTime, meta.MaxTime) + testutil.Ok(t, err) + thanosMeta := metadata.Thanos{ + Labels: extLset.Map(), + Downsample: metadata.ThanosDownsample{Resolution: 0}, + Source: metadata.TestSource, + IndexStats: metadata.IndexStats{ + SeriesMaxSize: stats.SeriesMaxSize, + ChunkMaxSize: stats.ChunkMaxSize, + }, + } // Histogram chunks are represented differently in memory and on disk. In order to // have a precise comparison, we need to use the on-disk representation as the expected value // instead of the in-memory one. - diskBlock, err := tsdb.OpenBlock(logger, path.Join(blockDir, id.String()), nil) + diskBlock, err := tsdb.OpenBlock(logger, blockIDDir, nil) testutil.Ok(t, err) series = append(series, storetestutil.ReadSeriesFromBlock(t, diskBlock, extLset, skipChunk)...) 
- meta, err := metadata.InjectThanos(logger, filepath.Join(blockDir, id.String()), thanosMeta, nil) + meta, err = metadata.InjectThanos(logger, blockIDDir, thanosMeta, nil) testutil.Ok(t, err) - testutil.Ok(t, meta.WriteToDir(logger, filepath.Join(blockDir, id.String()))) - testutil.Ok(t, block.Upload(context.Background(), logger, bkt, filepath.Join(blockDir, id.String()), metadata.NoneFunc)) + testutil.Ok(t, meta.WriteToDir(logger, blockIDDir)) + testutil.Ok(t, block.Upload(context.Background(), logger, bkt, blockIDDir, metadata.NoneFunc)) } ibkt := objstore.WithNoopInstr(bkt) @@ -1393,6 +1419,7 @@ func benchBucketSeries(t testutil.TB, sampleType chunkenc.ValueType, skipChunk b 0, WithLogger(logger), WithChunkPool(chunkPool), + WithLazyExpandedPostings(lazyExpandedPostings), ) testutil.Ok(t, err) @@ -2702,6 +2729,7 @@ func benchmarkBlockSeriesWithConcurrency(b *testing.B, concurrency int, blockMet wg := sync.WaitGroup{} wg.Add(concurrency) + dummyCounter := promauto.NewCounter(prometheus.CounterOpts{Name: "test"}) for w := 0; w < concurrency; w++ { go func() { defer wg.Done() @@ -2736,11 +2764,16 @@ func benchmarkBlockSeriesWithConcurrency(b *testing.B, concurrency int, blockMet req, chunksLimiter, NewBytesLimiterFactory(0)(nil), + matchers, nil, false, SeriesBatchSize, dummyHistogram, nil, + false, + dummyCounter, + dummyCounter, + dummyCounter, ) testutil.Ok(b, blockClient.ExpandPostings(sortedMatchers, seriesLimiter)) defer blockClient.Close() @@ -2797,9 +2830,10 @@ func TestMatchersToPostingGroup(t *testing.T) { }, expected: []*postingGroup{ { - name: "foo", - addAll: false, - addKeys: []string{"bar"}, + name: "foo", + addAll: false, + addKeys: []string{"bar"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")}, }, }, }, @@ -2814,9 +2848,10 @@ func TestMatchersToPostingGroup(t *testing.T) { }, expected: []*postingGroup{ { - name: "foo", - addAll: false, - addKeys: []string{"bar"}, + name: "foo", + addAll: false, + addKeys: []string{"bar"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")}, }, }, }, @@ -2834,9 +2869,10 @@ func TestMatchersToPostingGroup(t *testing.T) { }, expected: []*postingGroup{ { - name: "foo", - addAll: false, - addKeys: []string{"bar"}, + name: "foo", + addAll: false, + addKeys: []string{"bar"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")}, }, }, }, @@ -2852,14 +2888,16 @@ func TestMatchersToPostingGroup(t *testing.T) { }, expected: []*postingGroup{ { - name: "bar", - addAll: false, - addKeys: []string{"baz"}, + name: "bar", + addAll: false, + addKeys: []string{"baz"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "bar", "baz")}, }, { - name: "foo", - addAll: false, - addKeys: []string{"bar"}, + name: "foo", + addAll: false, + addKeys: []string{"bar"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")}, }, }, }, @@ -2896,9 +2934,10 @@ func TestMatchersToPostingGroup(t *testing.T) { }, expected: []*postingGroup{ { - name: "foo", - addAll: false, - addKeys: []string{"bar"}, + name: "foo", + addAll: false, + addKeys: []string{"bar"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar"), labels.MustNewMatcher(labels.MatchNotEqual, "foo", "baz")}, }, }, }, @@ -2923,9 +2962,10 @@ func TestMatchersToPostingGroup(t *testing.T) { }, expected: []*postingGroup{ { - name: "foo", - addAll: false, - addKeys: []string{"bar"}, + name: "foo", + addAll: false, + addKeys: 
[]string{"bar"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar"), labels.MustNewMatcher(labels.MatchRegexp, "foo", "b.*")}, }, }, }, @@ -2940,9 +2980,10 @@ func TestMatchersToPostingGroup(t *testing.T) { }, expected: []*postingGroup{ { - name: "foo", - addAll: false, - addKeys: []string{"bar"}, + name: "foo", + addAll: false, + addKeys: []string{"bar"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar"), labels.MustNewMatcher(labels.MatchNotEqual, "foo", "")}, }, }, }, @@ -2957,9 +2998,10 @@ func TestMatchersToPostingGroup(t *testing.T) { }, expected: []*postingGroup{ { - name: "foo", - addAll: false, - addKeys: []string{"bar"}, + name: "foo", + addAll: false, + addKeys: []string{"bar"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar"), labels.MustNewMatcher(labels.MatchRegexp, "foo", ".+")}, }, }, }, @@ -2974,9 +3016,10 @@ func TestMatchersToPostingGroup(t *testing.T) { }, expected: []*postingGroup{ { - name: "foo", - addAll: false, - addKeys: []string{"bar"}, + name: "foo", + addAll: false, + addKeys: []string{"bar"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar|baz"), labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar|buzz")}, }, }, }, @@ -2994,6 +3037,7 @@ func TestMatchersToPostingGroup(t *testing.T) { name: "foo", addAll: true, removeKeys: []string{"bar", "baz"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchNotEqual, "foo", "bar"), labels.MustNewMatcher(labels.MatchNotEqual, "foo", "baz")}, }, }, }, @@ -3011,8 +3055,9 @@ func TestMatchersToPostingGroup(t *testing.T) { }, expected: []*postingGroup{ { - name: labels.MetricName, - addAll: true, + name: labels.MetricName, + addAll: true, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchRegexp, "__name__", ".*")}, }, }, }, @@ -3030,18 +3075,21 @@ func TestMatchersToPostingGroup(t *testing.T) { }, expected: []*postingGroup{ { - name: labels.MetricName, - addAll: false, - addKeys: []string{"up"}, + name: labels.MetricName, + addAll: false, + addKeys: []string{"up"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "__name__", "up")}, }, { - name: "cluster", - addAll: false, - addKeys: []string{"us-east-1", "us-west-2"}, + name: "cluster", + addAll: false, + addKeys: []string{"us-east-1", "us-west-2"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchNotEqual, "cluster", "")}, }, { - name: "job", - addAll: true, + name: "job", + addAll: true, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchRegexp, "job", ".*")}, }, }, }, @@ -3062,19 +3110,22 @@ func TestMatchersToPostingGroup(t *testing.T) { }, expected: []*postingGroup{ { - name: labels.MetricName, - addAll: false, - addKeys: []string{"go_info", "up"}, + name: labels.MetricName, + addAll: false, + addKeys: []string{"go_info", "up"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchNotEqual, "__name__", "")}, }, { - name: "cluster", - addAll: false, - addKeys: []string{"us-east-1", "us-west-2"}, + name: "cluster", + addAll: false, + addKeys: []string{"us-east-1", "us-west-2"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchNotEqual, "cluster", "")}, }, { - name: "job", - addAll: false, - addKeys: []string{"prometheus", "thanos"}, + name: "job", + addAll: false, + addKeys: []string{"prometheus", "thanos"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchNotEqual, "job", "")}, }, }, }, @@ -3209,7 +3260,7 @@ func 
TestPostingGroupMerge(t *testing.T) { slices.Sort(tc.group2.addKeys) slices.Sort(tc.group2.removeKeys) } - res := tc.group1.merge(tc.group2) + res := tc.group1.mergeKeys(tc.group2) testutil.Equals(t, tc.expected, res) }) } @@ -3290,6 +3341,7 @@ func TestExpandedPostingsRace(t *testing.T) { l := sync.Mutex{} previousRefs := make(map[int][]storage.SeriesRef) + dummyCounter := promauto.With(prometheus.NewRegistry()).NewCounter(prometheus.CounterOpts{Name: "test"}) for { if tm.Err() != nil { @@ -3312,16 +3364,16 @@ func TestExpandedPostingsRace(t *testing.T) { i := i bb := bb go func(i int, bb *bucketBlock) { - refs, err := bb.indexReader().ExpandedPostings(context.Background(), m, NewBytesLimiterFactory(0)(nil)) + refs, err := bb.indexReader().ExpandedPostings(context.Background(), m, NewBytesLimiterFactory(0)(nil), false, dummyCounter) testutil.Ok(t, err) defer wg.Done() l.Lock() defer l.Unlock() if previousRefs[i] != nil { - testutil.Equals(t, previousRefs[i], refs) + testutil.Equals(t, previousRefs[i], refs.postings) } else { - previousRefs[i] = refs + previousRefs[i] = refs.postings } }(i, bb) } diff --git a/pkg/store/lazy_postings.go b/pkg/store/lazy_postings.go new file mode 100644 index 00000000000..2e02836c0c9 --- /dev/null +++ b/pkg/store/lazy_postings.go @@ -0,0 +1,272 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. + +package store + +import ( + "context" + "math" + + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/tsdb/index" + "golang.org/x/exp/slices" + + "github.com/thanos-io/thanos/pkg/block/indexheader" +) + +var emptyLazyPostings = &lazyExpandedPostings{postings: nil, matchers: nil} + +// lazyExpandedPostings contains expanded postings (series IDs). If lazy posting expansion is +// enabled, it might contain matchers that can be lazily applied during series filtering time. +type lazyExpandedPostings struct { + postings []storage.SeriesRef + matchers []*labels.Matcher +} + +func newLazyExpandedPostings(ps []storage.SeriesRef, matchers ...*labels.Matcher) *lazyExpandedPostings { + return &lazyExpandedPostings{ + postings: ps, + matchers: matchers, + } +} + +func (p *lazyExpandedPostings) lazyExpanded() bool { + return p != nil && len(p.matchers) > 0 +} + +func optimizePostingsFetchByDownloadedBytes(r *bucketIndexReader, postingGroups []*postingGroup, seriesMaxSize int64, seriesMatchRatio float64, lazyExpandedPostingSizeBytes prometheus.Counter) ([]*postingGroup, bool, error) { + if len(postingGroups) <= 1 { + return postingGroups, false, nil + } + // Collect posting cardinality of each posting group. + for _, pg := range postingGroups { + // A posting group can have either add keys or remove keys but not both the same time. + vals := pg.addKeys + if len(pg.removeKeys) > 0 { + vals = pg.removeKeys + } + rngs, err := r.block.indexHeaderReader.PostingsOffsets(pg.name, vals...) + if err != nil { + return nil, false, errors.Wrapf(err, "postings offsets for %s", pg.name) + } + + // No posting ranges found means empty posting. + if len(rngs) == 0 { + return nil, true, nil + } + for _, r := range rngs { + if r == indexheader.NotFoundRange { + continue + } + // Each range starts from the #entries field which is 4 bytes. + // Need to subtract it when calculating number of postings. + // https://github.com/prometheus/prometheus/blob/v2.46.0/tsdb/docs/format/index.md. 
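+ // Illustrative arithmetic (not from any real block): a 44-byte posting range holds
+ // (44 - 4) / 4 = 10 postings once the 4-byte #entries field is subtracted.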
+ pg.cardinality += (r.End - r.Start - 4) / 4 + } + } + slices.SortFunc(postingGroups, func(a, b *postingGroup) bool { + if a.cardinality == b.cardinality { + return a.name < b.name + } + return a.cardinality < b.cardinality + }) + + /* + Algorithm of choosing what postings we need to fetch right now and what + postings we expand lazily. + Sort posting groups by cardinality, so we can iterate from posting group with the smallest posting size. + The algorithm focuses on fetching fewer data, including postings and series. + + We need to fetch at least 1 posting group in order to fetch series. So if we only fetch the first posting group, + the data bytes we need to download is formula F1: P1 * 4 + P1 * S where P1 is the number of postings in group 1 + and S is the size per series. 4 is the byte size per posting. + + If we are going to fetch 2 posting groups, we can intersect the two postings to reduce series we need to download (hopefully). + Assuming for each intersection, the series matching ratio is R (0 < R < 1). Then the data bytes we need to download is + formula F2: P1 * 4 + P2 * 4 + P1 * S * R. + We can get formula F3 if we are going to fetch 3 posting groups: + F3: P1 * 4 + P2 * 4 + P3 * 4 + P1 * S * R^2. + + Let's compare formula F2 and F1 first. + P1 * 4 + P2 * 4 + P1 * S * R < P1 * 4 + P1 * S + => P2 * 4 < P1 * S * (1 - R) + Left hand side is the posting group size and right hand side is basically the series size we don't need to fetch + by having the additional intersection. In order to fetch less data for F2 than F1, we just need to ensure that + the additional postings size is smaller. + + Let's compare formula F3 and F2. + P1 * 4 + P2 * 4 + P3 * 4 + P1 * S * R^2 < P1 * 4 + P2 * 4 + P1 * S * R + => P3 * 4 < P1 * S * R * (1 - R) + Same as the previous formula. + + Compare formula F4 (Cost to fetch up to 4 posting groups) and F3. + P4 * 4 < P1 * S * R^2 * (1 - R) + + We can generalize this to formula: Pn * 4 < P1 * S * R^(n - 2) * (1 - R) + + The idea of the algorithm: + By iterating the posting group in sorted order of cardinality, we need to make sure that by fetching the current posting group, + the total data fetched is smaller than the previous posting group. If so, then we continue to next posting group, + otherwise we stop. + + This ensures that when we stop at one posting group, posting groups after it always need to fetch more data. + Based on formula Pn * 4 < P1 * S * R^(n - 2) * (1 - R), left hand side is always increasing while iterating to larger + posting groups while right hand side value is always decreasing as R < 1. + */ + seriesBytesToFetch := postingGroups[0].cardinality * seriesMaxSize + p := float64(1) + i := 1 // Start from index 1 as we always need to fetch the smallest posting group. + hasAdd := !postingGroups[0].addAll + for i < len(postingGroups) { + pg := postingGroups[i] + // Need to fetch more data on postings than series we avoid fetching, stop here and lazy expanding rest of matchers. + // If there is no posting group with add keys, don't skip any posting group until we have one. + // Fetch posting group with addAll is much more expensive due to fetch all postings. 
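+ // Worked example with assumed numbers: if the smallest group has 1000 postings,
+ // seriesMaxSize is 4096 bytes and seriesMatchRatio is 0.5, the second group gets a
+ // budget of 1000 * 4096 * 0.5 = 2,048,000 bytes (~512k postings at 4 bytes each);
+ // every later group's budget halves again because p is multiplied by seriesMatchRatio.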
+		if hasAdd && pg.cardinality*4 > int64(p*math.Ceil((1-seriesMatchRatio)*float64(seriesBytesToFetch))) {
+			break
+		}
+		hasAdd = hasAdd || !pg.addAll
+		p = p * seriesMatchRatio
+		i++
+	}
+	for i < len(postingGroups) {
+		postingGroups[i].lazy = true
+		lazyExpandedPostingSizeBytes.Add(float64(4 * postingGroups[i].cardinality))
+		i++
+	}
+	return postingGroups, false, nil
+}
+
+func fetchLazyExpandedPostings(
+	ctx context.Context,
+	postingGroups []*postingGroup,
+	r *bucketIndexReader,
+	bytesLimiter BytesLimiter,
+	addAllPostings bool,
+	lazyExpandedPostingEnabled bool,
+	lazyExpandedPostingSizeBytes prometheus.Counter,
+) (*lazyExpandedPostings, error) {
+	var (
+		err               error
+		emptyPostingGroup bool
+	)
+	/*
+		There are several cases where we skip the postings fetch optimization:
+		- Lazy expanded postings are disabled.
+		- Add all postings. This means we don't have a posting group with any add keys.
+		- The block's estimated max series size (`SeriesMaxSize`) is not set, so we have no way to estimate how many series bytes we would download.
+		- Only one effective posting group is available. We need to download postings from at least 1 posting group, so there is nothing to optimize.
+	*/
+	if lazyExpandedPostingEnabled && !addAllPostings &&
+		r.block.estimatedMaxSeriesSize > 0 && len(postingGroups) > 1 {
+		postingGroups, emptyPostingGroup, err = optimizePostingsFetchByDownloadedBytes(
+			r,
+			postingGroups,
+			int64(r.block.estimatedMaxSeriesSize),
+			0.5, // TODO(yeya24): Expose this as a flag.
+			lazyExpandedPostingSizeBytes,
+		)
+		if err != nil {
+			return nil, err
+		}
+		if emptyPostingGroup {
+			return emptyLazyPostings, nil
+		}
+	}
+
+	ps, matchers, err := fetchAndExpandPostingGroups(ctx, r, postingGroups, bytesLimiter)
+	if err != nil {
+		return nil, err
+	}
+	return &lazyExpandedPostings{postings: ps, matchers: matchers}, nil
+}
+
+// keysToFetchFromPostingGroups returns the label pairs (postings) to fetch
+// and the matchers we need to apply during lazy posting expansion.
+// Input `postingGroups` needs to be ordered by cardinality when lazy
+// expansion is enabled so that we can exit at the first lazy posting group.
+func keysToFetchFromPostingGroups(postingGroups []*postingGroup) ([]labels.Label, []*labels.Matcher) {
+	var lazyMatchers []*labels.Matcher
+	keys := make([]labels.Label, 0)
+	i := 0
+	for i < len(postingGroups) {
+		pg := postingGroups[i]
+		if pg.lazy {
+			break
+		}
+
+		// Postings returned by fetchPostings will be in the same order as keys
+		// so it's important that we iterate them in the same order later.
+		// We don't have any other way of pairing keys and fetched postings.
+		for _, key := range pg.addKeys {
+			keys = append(keys, labels.Label{Name: pg.name, Value: key})
+		}
+		for _, key := range pg.removeKeys {
+			keys = append(keys, labels.Label{Name: pg.name, Value: key})
+		}
+		i++
+	}
+	if i < len(postingGroups) {
+		lazyMatchers = make([]*labels.Matcher, 0)
+		for i < len(postingGroups) {
+			lazyMatchers = append(lazyMatchers, postingGroups[i].matchers...)
+ i++ + } + } + return keys, lazyMatchers +} + +func fetchAndExpandPostingGroups(ctx context.Context, r *bucketIndexReader, postingGroups []*postingGroup, bytesLimiter BytesLimiter) ([]storage.SeriesRef, []*labels.Matcher, error) { + keys, lazyMatchers := keysToFetchFromPostingGroups(postingGroups) + fetchedPostings, closeFns, err := r.fetchPostings(ctx, keys, bytesLimiter) + defer func() { + for _, closeFn := range closeFns { + closeFn() + } + }() + if err != nil { + return nil, nil, errors.Wrap(err, "get postings") + } + + // Get "add" and "remove" postings from groups. We iterate over postingGroups and their keys + // again, and this is exactly the same order as before (when building the groups), so we can simply + // use one incrementing index to fetch postings from returned slice. + postingIndex := 0 + + var groupAdds, groupRemovals []index.Postings + for _, g := range postingGroups { + if g.lazy { + break + } + // We cannot add empty set to groupAdds, since they are intersected. + if len(g.addKeys) > 0 { + toMerge := make([]index.Postings, 0, len(g.addKeys)) + for _, l := range g.addKeys { + toMerge = append(toMerge, checkNilPosting(g.name, l, fetchedPostings[postingIndex])) + postingIndex++ + } + + groupAdds = append(groupAdds, index.Merge(toMerge...)) + } + + for _, l := range g.removeKeys { + groupRemovals = append(groupRemovals, checkNilPosting(g.name, l, fetchedPostings[postingIndex])) + postingIndex++ + } + } + + result := index.Without(index.Intersect(groupAdds...), index.Merge(groupRemovals...)) + + if ctx.Err() != nil { + return nil, nil, ctx.Err() + } + ps, err := ExpandPostingsWithContext(ctx, result) + if err != nil { + return nil, nil, errors.Wrap(err, "expand") + } + return ps, lazyMatchers, nil +} diff --git a/pkg/store/lazy_postings_test.go b/pkg/store/lazy_postings_test.go new file mode 100644 index 00000000000..7b17a59ec6a --- /dev/null +++ b/pkg/store/lazy_postings_test.go @@ -0,0 +1,504 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. 
+ +package store + +import ( + "context" + "path" + "testing" + + "github.com/efficientgo/core/testutil" + "github.com/go-kit/log" + "github.com/oklog/ulid" + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + promtest "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/tsdb" + "github.com/prometheus/prometheus/tsdb/index" + + "github.com/thanos-io/objstore/providers/filesystem" + "github.com/thanos-io/thanos/pkg/block/indexheader" + "github.com/thanos-io/thanos/pkg/block/metadata" +) + +func TestKeysToFetchFromPostingGroups(t *testing.T) { + for _, tc := range []struct { + name string + pgs []*postingGroup + expectedLabels []labels.Label + expectedMatchers []*labels.Matcher + }{ + { + name: "empty group", + pgs: []*postingGroup{ + { + addKeys: []string{}, + removeKeys: []string{}, + }, + }, + expectedLabels: []labels.Label{}, + }, + { + name: "empty groups", + pgs: []*postingGroup{ + { + addKeys: []string{}, + removeKeys: []string{}, + }, + { + addKeys: []string{}, + removeKeys: []string{}, + }, + { + addKeys: []string{}, + removeKeys: []string{}, + }, + }, + expectedLabels: []labels.Label{}, + }, + { + name: "group with add keys", + pgs: []*postingGroup{ + { + name: "test", + addKeys: []string{"foo", "bar"}, + removeKeys: []string{}, + }, + }, + expectedLabels: []labels.Label{{Name: "test", Value: "foo"}, {Name: "test", Value: "bar"}}, + }, + { + name: "group with remove keys", + pgs: []*postingGroup{ + { + name: "test", + addKeys: []string{}, + removeKeys: []string{"foo", "bar"}, + }, + }, + expectedLabels: []labels.Label{{Name: "test", Value: "foo"}, {Name: "test", Value: "bar"}}, + }, + { + name: "group with both add and remove keys", + pgs: []*postingGroup{ + { + name: "test", + addKeys: []string{"foo", "bar"}, + removeKeys: []string{"a", "b"}, + }, + }, + expectedLabels: []labels.Label{ + {Name: "test", Value: "foo"}, {Name: "test", Value: "bar"}, + {Name: "test", Value: "a"}, {Name: "test", Value: "b"}, + }, + }, + { + name: "groups with both add keys", + pgs: []*postingGroup{ + { + name: "test", + addKeys: []string{"foo", "bar"}, + }, + { + name: "foo", + addKeys: []string{"bar"}, + }, + }, + expectedLabels: []labels.Label{ + {Name: "test", Value: "foo"}, {Name: "test", Value: "bar"}, + {Name: "foo", Value: "bar"}, + }, + }, + { + name: "groups with add and remove keys", + pgs: []*postingGroup{ + { + name: "test", + addKeys: []string{"foo", "bar"}, + }, + { + name: "foo", + removeKeys: []string{"bar"}, + }, + }, + expectedLabels: []labels.Label{ + {Name: "test", Value: "foo"}, {Name: "test", Value: "bar"}, + {Name: "foo", Value: "bar"}, + }, + }, + { + name: "lazy posting group with empty matchers", + pgs: []*postingGroup{ + { + name: "test", + addKeys: []string{"foo", "bar"}, + matchers: []*labels.Matcher{}, + lazy: true, + }, + }, + expectedLabels: []labels.Label{}, + expectedMatchers: []*labels.Matcher{}, + }, + { + name: "lazy posting group", + pgs: []*postingGroup{ + { + name: "test", + addKeys: []string{"foo", "bar"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")}, + lazy: true, + }, + }, + expectedLabels: []labels.Label{}, + expectedMatchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")}, + }, + { + name: "multiple lazy posting groups", + pgs: []*postingGroup{ + { + name: "test", + addKeys: []string{"foo", "bar"}, + matchers: 
[]*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")}, + lazy: true, + }, + { + name: "job", + addKeys: []string{"prometheus"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchRegexp, "job", "prometheus.*")}, + lazy: true, + }, + }, + expectedLabels: []labels.Label{}, + expectedMatchers: []*labels.Matcher{ + labels.MustNewMatcher(labels.MatchEqual, "foo", "bar"), + labels.MustNewMatcher(labels.MatchRegexp, "job", "prometheus.*"), + }, + }, + { + name: "multiple non lazy and lazy posting groups", + pgs: []*postingGroup{ + { + name: "test", + addKeys: []string{"foo", "bar"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")}, + }, + { + name: "test", + addKeys: []string{"foo", "bar"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")}, + lazy: true, + }, + { + name: "job", + addKeys: []string{"prometheus"}, + matchers: []*labels.Matcher{labels.MustNewMatcher(labels.MatchRegexp, "job", "prometheus.*")}, + lazy: true, + }, + }, + expectedLabels: []labels.Label{{Name: "test", Value: "foo"}, {Name: "test", Value: "bar"}}, + expectedMatchers: []*labels.Matcher{ + labels.MustNewMatcher(labels.MatchEqual, "foo", "bar"), + labels.MustNewMatcher(labels.MatchRegexp, "job", "prometheus.*"), + }, + }, + } { + t.Run(tc.name, func(t *testing.T) { + keys, matchers := keysToFetchFromPostingGroups(tc.pgs) + testutil.Equals(t, tc.expectedLabels, keys) + testutil.Equals(t, tc.expectedMatchers, matchers) + }) + } +} + +type mockIndexHeaderReader struct { + postings map[string]map[string]index.Range + err error +} + +func (h *mockIndexHeaderReader) Close() error { return nil } + +func (h *mockIndexHeaderReader) IndexVersion() (int, error) { return 0, nil } + +func (h *mockIndexHeaderReader) PostingsOffsets(name string, value ...string) ([]index.Range, error) { + ranges := make([]index.Range, 0) + if _, ok := h.postings[name]; !ok { + return nil, nil + } + for _, val := range value { + if rng, ok := h.postings[name][val]; ok { + ranges = append(ranges, rng) + } else { + ranges = append(ranges, indexheader.NotFoundRange) + } + } + return ranges, h.err +} + +func (h *mockIndexHeaderReader) PostingsOffset(name string, value string) (index.Range, error) { + return index.Range{}, nil +} + +func (h *mockIndexHeaderReader) LookupSymbol(o uint32) (string, error) { return "", nil } + +func (h *mockIndexHeaderReader) LabelValues(name string) ([]string, error) { return nil, nil } + +func (h *mockIndexHeaderReader) LabelNames() ([]string, error) { return nil, nil } + +func TestOptimizePostingsFetchByDownloadedBytes(t *testing.T) { + ctx := context.Background() + logger := log.NewNopLogger() + dir := t.TempDir() + bkt, err := filesystem.NewBucket(dir) + testutil.Ok(t, err) + defer func() { testutil.Ok(t, bkt.Close()) }() + + inputError := errors.New("random") + blockID := ulid.MustNew(1, nil) + meta := &metadata.Meta{ + BlockMeta: tsdb.BlockMeta{ULID: blockID}, + Thanos: metadata.Thanos{ + Labels: map[string]string{ + "a": "b", + "c": "d", + }, + }, + } + for _, tc := range []struct { + name string + inputPostings map[string]map[string]index.Range + inputError error + postingGroups []*postingGroup + seriesMaxSize int64 + seriesMatchRatio float64 + expectedPostingGroups []*postingGroup + expectedEmptyPosting bool + expectedError string + }{ + { + name: "empty posting group", + }, + { + name: "one posting group", + postingGroups: []*postingGroup{ + {name: "foo"}, + }, + expectedPostingGroups: []*postingGroup{ + {name: 
"foo"}, + }, + }, + { + name: "posting offsets return error", + inputPostings: map[string]map[string]index.Range{ + "foo": {"bar": index.Range{End: 8}}, + "bar": {"foo": index.Range{Start: 8, End: 16}}, + }, + inputError: inputError, + seriesMaxSize: 1000, + seriesMatchRatio: 0.5, + postingGroups: []*postingGroup{ + {name: "foo", addKeys: []string{"bar"}}, + {name: "bar", addKeys: []string{"foo"}}, + }, + expectedPostingGroups: nil, + expectedError: "postings offsets for foo: random", + }, + { + name: "posting offsets empty", + inputPostings: map[string]map[string]index.Range{ + "foo": {"bar": index.Range{End: 8}}, + }, + seriesMaxSize: 1000, + seriesMatchRatio: 0.5, + postingGroups: []*postingGroup{ + {name: "foo", addKeys: []string{"bar"}}, + {name: "bar", addKeys: []string{"foo"}}, + }, + expectedPostingGroups: nil, + expectedEmptyPosting: true, + }, + { + name: "posting group label doesn't exist", + inputPostings: map[string]map[string]index.Range{ + "foo": {"bar": index.Range{End: 8}}, + }, + seriesMaxSize: 1000, + seriesMatchRatio: 0.5, + postingGroups: []*postingGroup{ + {name: "foo", addKeys: []string{"bar"}}, + {name: "bar", addKeys: []string{"foo"}}, + }, + expectedPostingGroups: nil, + expectedEmptyPosting: true, + }, + { + name: "posting group keys partial exist", + inputPostings: map[string]map[string]index.Range{ + "foo": {"bar": index.Range{End: 8}}, + "bar": {"foo": index.Range{Start: 8, End: 16}}, + }, + seriesMaxSize: 1000, + seriesMatchRatio: 0.5, + postingGroups: []*postingGroup{ + {name: "foo", addKeys: []string{"bar"}}, + {name: "bar", addKeys: []string{"foo", "buz"}}, + }, + expectedPostingGroups: []*postingGroup{ + {name: "bar", addKeys: []string{"foo", "buz"}, cardinality: 1}, + {name: "foo", addKeys: []string{"bar"}, cardinality: 1}, + }, + }, + { + name: "two posting groups with add keys, small postings and large series size", + inputPostings: map[string]map[string]index.Range{ + "foo": {"bar": index.Range{End: 8}}, + "bar": {"foo": index.Range{Start: 8, End: 16}}, + }, + seriesMaxSize: 1000, + seriesMatchRatio: 0.5, + postingGroups: []*postingGroup{ + {name: "foo", addKeys: []string{"bar"}}, + {name: "bar", addKeys: []string{"foo"}}, + }, + expectedPostingGroups: []*postingGroup{ + {name: "bar", addKeys: []string{"foo"}, cardinality: 1}, + {name: "foo", addKeys: []string{"bar"}, cardinality: 1}, + }, + }, + { + // This test case won't be optimized in real case because it is add all + // so doesn't make sense to optimize postings fetching anyway. + name: "two posting groups with remove keys, small postings and large series size", + inputPostings: map[string]map[string]index.Range{ + "foo": {"bar": index.Range{End: 8}}, + "bar": {"foo": index.Range{Start: 8, End: 16}}, + }, + seriesMaxSize: 1000, + seriesMatchRatio: 0.5, + postingGroups: []*postingGroup{ + {addAll: true, name: "foo", removeKeys: []string{"bar"}}, + {addAll: true, name: "bar", removeKeys: []string{"foo"}}, + }, + expectedPostingGroups: []*postingGroup{ + {addAll: true, name: "bar", removeKeys: []string{"foo"}, cardinality: 1}, + {addAll: true, name: "foo", removeKeys: []string{"bar"}, cardinality: 1}, + }, + }, + { + name: "one group with remove keys and another one with add keys. 
Always add the addKeys posting group to avoid fetching all postings", + inputPostings: map[string]map[string]index.Range{ + "foo": {"bar": index.Range{End: 8}}, + "bar": {"foo": index.Range{Start: 8, End: 1000012}}, + }, + seriesMaxSize: 1000, + seriesMatchRatio: 0.5, + postingGroups: []*postingGroup{ + {addAll: true, name: "foo", removeKeys: []string{"bar"}}, + {name: "bar", addKeys: []string{"foo"}}, + }, + expectedPostingGroups: []*postingGroup{ + {addAll: true, name: "foo", removeKeys: []string{"bar"}, cardinality: 1}, + {name: "bar", addKeys: []string{"foo"}, cardinality: 250000}, + }, + }, + { + name: "two posting groups with add keys, very small series size, making one posting group lazy", + inputPostings: map[string]map[string]index.Range{ + "foo": {"bar": index.Range{End: 8}}, + "bar": {"foo": index.Range{Start: 8, End: 16}}, + }, + seriesMaxSize: 1, + seriesMatchRatio: 0.5, + postingGroups: []*postingGroup{ + {name: "foo", addKeys: []string{"bar"}}, + {name: "bar", addKeys: []string{"foo"}}, + }, + expectedPostingGroups: []*postingGroup{ + {name: "bar", addKeys: []string{"foo"}, cardinality: 1}, + {name: "foo", addKeys: []string{"bar"}, cardinality: 1, lazy: true}, + }, + }, + { + name: "two posting groups with add keys, one small posting group and a very large posting group, large one become lazy", + inputPostings: map[string]map[string]index.Range{ + "foo": {"bar": index.Range{End: 8}}, + "bar": {"foo": index.Range{Start: 8, End: 1000012}}, + }, + seriesMaxSize: 1000, + seriesMatchRatio: 0.5, + postingGroups: []*postingGroup{ + {name: "foo", addKeys: []string{"bar"}}, + {name: "bar", addKeys: []string{"foo"}}, + }, + expectedPostingGroups: []*postingGroup{ + {name: "foo", addKeys: []string{"bar"}, cardinality: 1}, + {name: "bar", addKeys: []string{"foo"}, cardinality: 250000, lazy: true}, + }, + }, + { + name: "three posting groups with add keys, two small posting group and a very large posting group, large one become lazy", + inputPostings: map[string]map[string]index.Range{ + "foo": {"bar": index.Range{End: 8}}, + "bar": {"foo": index.Range{Start: 8, End: 1000012}}, + "cluster": {"us": index.Range{Start: 1000012, End: 1000020}}, + }, + seriesMaxSize: 1000, + seriesMatchRatio: 0.5, + postingGroups: []*postingGroup{ + {name: "foo", addKeys: []string{"bar"}}, + {name: "bar", addKeys: []string{"foo"}}, + {name: "cluster", addKeys: []string{"us"}}, + }, + expectedPostingGroups: []*postingGroup{ + {name: "cluster", addKeys: []string{"us"}, cardinality: 1}, + {name: "foo", addKeys: []string{"bar"}, cardinality: 1}, + {name: "bar", addKeys: []string{"foo"}, cardinality: 250000, lazy: true}, + }, + }, + { + name: "three posting groups with either add or remove keys, two small posting group and a very large posting group, large one become lazy", + inputPostings: map[string]map[string]index.Range{ + "foo": {"bar": index.Range{End: 8}}, + "bar": {"foo": index.Range{Start: 8, End: 1000012}}, + "cluster": {"us": index.Range{Start: 1000012, End: 1000020}}, + }, + seriesMaxSize: 1000, + seriesMatchRatio: 0.5, + postingGroups: []*postingGroup{ + {addAll: true, name: "foo", removeKeys: []string{"bar"}}, + {addAll: true, name: "bar", removeKeys: []string{"foo"}}, + {name: "cluster", addKeys: []string{"us"}}, + }, + expectedPostingGroups: []*postingGroup{ + {name: "cluster", addKeys: []string{"us"}, cardinality: 1}, + {addAll: true, name: "foo", removeKeys: []string{"bar"}, cardinality: 1}, + {addAll: true, name: "bar", removeKeys: []string{"foo"}, cardinality: 250000, lazy: true}, + }, + }, + } 
{ + t.Run(tc.name, func(t *testing.T) { + headerReader := &mockIndexHeaderReader{postings: tc.inputPostings, err: tc.inputError} + registry := prometheus.NewRegistry() + block, err := newBucketBlock(ctx, logger, newBucketStoreMetrics(registry), meta, bkt, path.Join(dir, blockID.String()), nil, nil, headerReader, nil, nil, nil) + testutil.Ok(t, err) + ir := newBucketIndexReader(block) + dummyCounter := promauto.With(registry).NewCounter(prometheus.CounterOpts{Name: "test"}) + pgs, emptyPosting, err := optimizePostingsFetchByDownloadedBytes(ir, tc.postingGroups, tc.seriesMaxSize, tc.seriesMatchRatio, dummyCounter) + if err != nil { + testutil.Equals(t, tc.expectedError, err.Error()) + return + } + testutil.Equals(t, tc.expectedEmptyPosting, emptyPosting) + testutil.Equals(t, tc.expectedPostingGroups, pgs) + var c int64 + for _, pg := range pgs { + if pg.lazy { + c += pg.cardinality + } + } + testutil.Equals(t, float64(4*c), promtest.ToFloat64(dummyCounter)) + }) + } +} diff --git a/pkg/testutil/e2eutil/prometheus.go b/pkg/testutil/e2eutil/prometheus.go index bf7e900a9bc..9da879de823 100644 --- a/pkg/testutil/e2eutil/prometheus.go +++ b/pkg/testutil/e2eutil/prometheus.go @@ -24,6 +24,7 @@ import ( "testing" "time" + "github.com/efficientgo/core/testutil" "github.com/go-kit/log" "github.com/oklog/ulid" "github.com/pkg/errors" @@ -38,8 +39,6 @@ import ( "go.uber.org/atomic" "golang.org/x/sync/errgroup" - "github.com/efficientgo/core/testutil" - "github.com/thanos-io/thanos/pkg/block/metadata" "github.com/thanos-io/thanos/pkg/runutil" ) @@ -446,15 +445,16 @@ func createBlockWithDelay(ctx context.Context, dir string, series []labels.Label return ulid.ULID{}, errors.Wrap(err, "create block id") } - m, err := metadata.ReadFromDir(path.Join(dir, blockID.String())) + bdir := path.Join(dir, blockID.String()) + m, err := metadata.ReadFromDir(bdir) if err != nil { return ulid.ULID{}, errors.Wrap(err, "open meta file") } + logger := log.NewNopLogger() m.ULID = id m.Compaction.Sources = []ulid.ULID{id} - - if err := m.WriteToDir(log.NewNopLogger(), path.Join(dir, blockID.String())); err != nil { + if err := m.WriteToDir(logger, path.Join(dir, blockID.String())); err != nil { return ulid.ULID{}, errors.Wrap(err, "write meta.json file") } @@ -555,6 +555,11 @@ func createBlock( } blockDir := filepath.Join(dir, id.String()) + logger := log.NewNopLogger() + seriesSize, err := gatherMaxSeriesSize(filepath.Join(blockDir, "index")) + if err != nil { + return id, errors.Wrap(err, "gather max series size") + } files := []metadata.File{} if hashFunc != metadata.NoneFunc { @@ -581,11 +586,12 @@ func createBlock( } } - if _, err = metadata.InjectThanos(log.NewNopLogger(), blockDir, metadata.Thanos{ + if _, err = metadata.InjectThanos(logger, blockDir, metadata.Thanos{ Labels: extLset.Map(), Downsample: metadata.ThanosDownsample{Resolution: resolution}, Source: metadata.TestSource, Files: files, + IndexStats: metadata.IndexStats{SeriesMaxSize: seriesSize}, }, nil); err != nil { return id, errors.Wrap(err, "finalize block") } @@ -599,6 +605,49 @@ func createBlock( return id, nil } +func gatherMaxSeriesSize(fn string) (int64, error) { + r, err := index.NewFileReader(fn) + if err != nil { + return 0, errors.Wrap(err, "open index file") + } + defer runutil.CloseWithErrCapture(&err, r, "gather index issue file reader") + + p, err := r.Postings(index.AllPostingsKey()) + if err != nil { + return 0, errors.Wrap(err, "get all postings") + } + + // As of version two all series entries are 16 byte padded. 
All references + // we get have to account for that to get the correct offset. + offsetMultiplier := 1 + version := r.Version() + if version >= 2 { + offsetMultiplier = 16 + } + + // Per series. + var ( + prevId storage.SeriesRef + maxSeriesSize int64 + ) + for p.Next() { + id := p.At() + if prevId != 0 { + // Approximate size. + seriesSize := int64(id-prevId) * int64(offsetMultiplier) + if seriesSize > maxSeriesSize { + maxSeriesSize = seriesSize + } + } + prevId = id + } + if p.Err() != nil { + return 0, errors.Wrap(err, "walk postings") + } + + return maxSeriesSize, nil +} + var indexFilename = "index" type indexWriterSeries struct { diff --git a/test/e2e/store_gateway_test.go b/test/e2e/store_gateway_test.go index d3c3de945e3..9fec13287b4 100644 --- a/test/e2e/store_gateway_test.go +++ b/test/e2e/store_gateway_test.go @@ -11,6 +11,7 @@ import ( "os" "path" "path/filepath" + "strconv" "strings" "testing" "time" @@ -878,7 +879,7 @@ config: testutil.Ok(t, runutil.RetryWithLog(log.NewLogfmtLogger(os.Stdout), 5*time.Second, ctx.Done(), func() error { if _, _, _, err := promclient.NewDefaultClient().QueryInstant(ctx, urlParse(t, "http://"+q1.Endpoint("http")), testQuery, now, opts); err != nil { e := err.Error() - if strings.Contains(e, "expanded matching posting: get postings") && strings.Contains(e, "exceeded bytes limit while fetching postings: limit 1 violated") { + if strings.Contains(e, "expanded matching posting: fetch and expand postings") && strings.Contains(e, "exceeded bytes limit while fetching postings: limit 1 violated") { return nil } return err @@ -1047,3 +1048,146 @@ config: testutil.Ok(t, s1.WaitSumMetricsWithOptions(e2emon.Equals(1), []string{`thanos_store_index_cache_hits_total`}, e2emon.WithLabelMatchers(matchers.MustNewMatcher(matchers.MatchEqual, "item_type", "ExpandedPostings")))) }) } + +func TestStoreGatewayLazyExpandedPostingsEnabled(t *testing.T) { + t.Parallel() + + e, err := e2e.NewDockerEnvironment("memcached-exp") + testutil.Ok(t, err) + t.Cleanup(e2ethanos.CleanScenario(t, e)) + + const bucket = "store-gateway-lazy-expanded-postings-test" + m := e2edb.NewMinio(e, "thanos-minio", bucket, e2edb.WithMinioTLS()) + testutil.Ok(t, e2e.StartAndWaitReady(m)) + + // Create 2 store gateways, one with lazy expanded postings enabled and another one disabled. 
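+	// s1 runs with --store.enable-lazy-expanded-postings so its bucket store may mark expensive posting
+	// groups as lazy, while s2 keeps the default (disabled) behaviour and acts as the control for the
+	// metric assertions below.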
+ s1 := e2ethanos.NewStoreGW( + e, + "1", + client.BucketConfig{ + Type: client.S3, + Config: e2ethanos.NewS3Config(bucket, m.InternalEndpoint("http"), m.InternalDir()), + }, + "", + "", + []string{"--store.enable-lazy-expanded-postings"}, + ) + s2 := e2ethanos.NewStoreGW( + e, + "2", + client.BucketConfig{ + Type: client.S3, + Config: e2ethanos.NewS3Config(bucket, m.InternalEndpoint("http"), m.InternalDir()), + }, + "", + "", + nil, + ) + testutil.Ok(t, e2e.StartAndWaitReady(s1, s2)) + + q1 := e2ethanos.NewQuerierBuilder(e, "1", s1.InternalEndpoint("grpc")).Init() + q2 := e2ethanos.NewQuerierBuilder(e, "2", s2.InternalEndpoint("grpc")).Init() + testutil.Ok(t, e2e.StartAndWaitReady(q1, q2)) + + dir := filepath.Join(e.SharedDir(), "tmp") + testutil.Ok(t, os.MkdirAll(dir, os.ModePerm)) + + numSeries := 10000 + ss := make([]labels.Labels, 0, 10000) + for i := 0; i < numSeries; i++ { + ss = append(ss, labels.FromStrings("a", strconv.Itoa(i), "b", "1")) + } + extLset := labels.FromStrings("ext1", "value1", "replica", "1") + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) + t.Cleanup(cancel) + + now := time.Now() + id, err := e2eutil.CreateBlockWithBlockDelay(ctx, dir, ss, 10, timestamp.FromTime(now), timestamp.FromTime(now.Add(2*time.Hour)), 30*time.Minute, extLset, 0, metadata.NoneFunc) + testutil.Ok(t, err) + + l := log.NewLogfmtLogger(os.Stdout) + bkt, err := s3.NewBucketWithConfig(l, + e2ethanos.NewS3Config(bucket, m.Endpoint("http"), m.Dir()), "test-feed") + testutil.Ok(t, err) + + testutil.Ok(t, objstore.UploadDir(ctx, l, bkt, path.Join(dir, id.String()), id.String())) + + // Wait for store to sync blocks. + // thanos_blocks_meta_synced: 1x loadedMeta 0x labelExcludedMeta 0x TooFreshMeta. + testutil.Ok(t, s1.WaitSumMetrics(e2emon.Equals(1), "thanos_blocks_meta_synced")) + testutil.Ok(t, s1.WaitSumMetrics(e2emon.Equals(0), "thanos_blocks_meta_sync_failures_total")) + testutil.Ok(t, s1.WaitSumMetrics(e2emon.Equals(1), "thanos_bucket_store_blocks_loaded")) + testutil.Ok(t, s1.WaitSumMetrics(e2emon.Equals(0), "thanos_bucket_store_block_drops_total")) + testutil.Ok(t, s1.WaitSumMetrics(e2emon.Equals(0), "thanos_bucket_store_block_load_failures_total")) + + testutil.Ok(t, s2.WaitSumMetrics(e2emon.Equals(1), "thanos_blocks_meta_synced")) + testutil.Ok(t, s2.WaitSumMetrics(e2emon.Equals(0), "thanos_blocks_meta_sync_failures_total")) + testutil.Ok(t, s2.WaitSumMetrics(e2emon.Equals(1), "thanos_bucket_store_blocks_loaded")) + testutil.Ok(t, s2.WaitSumMetrics(e2emon.Equals(0), "thanos_bucket_store_block_drops_total")) + testutil.Ok(t, s2.WaitSumMetrics(e2emon.Equals(0), "thanos_bucket_store_block_load_failures_total")) + + t.Run("query with count", func(t *testing.T) { + queryAndAssert(t, ctx, q1.Endpoint("http"), func() string { return `count({b="1"})` }, + time.Now, promclient.QueryOptions{ + Deduplicate: false, + }, + model.Vector{ + { + Metric: map[model.LabelName]model.LabelValue{}, + Value: model.SampleValue(numSeries), + }, + }, + ) + + queryAndAssert(t, ctx, q2.Endpoint("http"), func() string { return `count({b="1"})` }, + time.Now, promclient.QueryOptions{ + Deduplicate: false, + }, + model.Vector{ + { + Metric: map[model.LabelName]model.LabelValue{}, + Value: model.SampleValue(numSeries), + }, + }, + ) + }) + + // We expect no lazy expanded postings as query `count({b="1"})` won't trigger the optimization. 
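+	// The query has only the single matcher b="1", which yields one posting group; the optimization
+	// requires at least two posting groups to compare, so it is skipped on both store gateways.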
+ testutil.Ok(t, s1.WaitSumMetrics(e2emon.Equals(0), "thanos_bucket_store_lazy_expanded_postings_total")) + testutil.Ok(t, s2.WaitSumMetrics(e2emon.Equals(0), "thanos_bucket_store_lazy_expanded_postings_total")) + + t.Run("query specific series will trigger lazy posting", func(t *testing.T) { + queryAndAssertSeries(t, ctx, q1.Endpoint("http"), func() string { return `{a="1", b="1"}` }, + time.Now, promclient.QueryOptions{ + Deduplicate: false, + }, + []model.Metric{ + { + "a": "1", + "b": "1", + "ext1": "value1", + "replica": "1", + }, + }, + ) + + queryAndAssertSeries(t, ctx, q2.Endpoint("http"), func() string { return `{a="1", b="1"}` }, + time.Now, promclient.QueryOptions{ + Deduplicate: false, + }, + []model.Metric{ + { + "a": "1", + "b": "1", + "ext1": "value1", + "replica": "1", + }, + }, + ) + }) + + // Use greater or equal to handle flakiness. + testutil.Ok(t, s1.WaitSumMetrics(e2emon.GreaterOrEqual(1), "thanos_bucket_store_lazy_expanded_postings_total")) + testutil.Ok(t, s2.WaitSumMetrics(e2emon.Equals(0), "thanos_bucket_store_lazy_expanded_postings_total")) +}
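
For readers who want to experiment with the heuristic outside of Thanos, below is a minimal, self-contained Go sketch of the decision rule that optimizePostingsFetchByDownloadedBytes applies. It is not part of the patch: the group names, cardinalities, series max size and the 0.5 match ratio are illustrative assumptions only, and the sketch omits the add/remove-key and addAll handling that the real code performs.

package main

import (
	"fmt"
	"math"
	"sort"
)

// group is a stripped-down stand-in for a posting group: just a name and a posting count.
type group struct {
	name        string
	cardinality int64 // number of postings in the group
	lazy        bool
}

// markLazyGroups sorts groups by cardinality and marks as lazy the first group n for which
// Pn * 4 > P1 * S * R^(n - 2) * (1 - R), plus every group after it.
func markLazyGroups(groups []group, seriesMaxSize int64, matchRatio float64) []group {
	if len(groups) <= 1 {
		return groups
	}
	sort.Slice(groups, func(i, j int) bool { return groups[i].cardinality < groups[j].cardinality })

	seriesBytes := float64(groups[0].cardinality * seriesMaxSize)
	p := 1.0 // R^(n - 2) factor, starting at R^0 for the second group.
	i := 1   // The smallest group is always fetched.
	for i < len(groups) {
		if float64(groups[i].cardinality*4) > p*math.Ceil((1-matchRatio)*seriesBytes) {
			break
		}
		p *= matchRatio
		i++
	}
	for ; i < len(groups); i++ {
		groups[i].lazy = true
	}
	return groups
}

func main() {
	groups := []group{
		{name: `job="prometheus"`, cardinality: 500},
		{name: `__name__="up"`, cardinality: 100},
		{name: `pod=~".+"`, cardinality: 5_000_000},
	}
	for _, g := range markLazyGroups(groups, 4096, 0.5) {
		fmt.Printf("%-20s cardinality=%-8d lazy=%v\n", g.name, g.cardinality, g.lazy)
	}
}

With these numbers, the 5,000,000-posting group is marked lazy: its 20 MB of postings would cost far more than the roughly 100 KB of series bytes the extra intersection could save.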