This repository has been archived by the owner on Jun 24, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
format_major_version.go
678 lines (625 loc) · 27.6 KB
/
format_major_version.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.
package kvstore
import (
"fmt"
"strconv"
"github.com/cockroachdb/errors"
"github.com/edgelesssys/ego-kvstore/internal/base"
"github.com/edgelesssys/ego-kvstore/internal/manifest"
"github.com/edgelesssys/ego-kvstore/sstable"
"github.com/edgelesssys/ego-kvstore/vfs"
"github.com/edgelesssys/ego-kvstore/vfs/atomicfs"
)
// FormatMajorVersion identifies the generation of the durable, on-disk
// format used by a database. Every backwards incompatible change to a
// persisted format is gated behind a new, higher format major version.
//
// A database's format major version may be raised at any point. Once it
// has been raised, previous versions of Pebble will refuse to open the
// database.
//
// The zero value is the FormatDefault constant. The concrete version that
// FormatDefault corresponds to may change with time.
type FormatMajorVersion uint64

// SafeValue implements redact.SafeValue.
func (v FormatMajorVersion) SafeValue() {}

// String implements fmt.Stringer.
//
// NB: The output must not change, because it is used as the value of the
// on-disk version marker file. In particular it must always parse as a
// base 10 integer that fits in a uint64. Today it is rendered as a
// zero-padded number of at least 3 digits; the amount of padding may change.
func (v FormatMajorVersion) String() string {
	return fmt.Sprintf("%03d", uint64(v))
}
// The format major versions, listed in the order in which they were
// introduced. Each constant's numeric value is assigned by iota, so its
// position within this block is significant: the value is what
// FormatMajorVersion.String renders for the on-disk version marker, and
// orderingInvariants and the ratcheting logic compare versions numerically.
const (
// 21.2 versions.
// FormatDefault leaves the format version unspecified. The
// FormatDefault constant may be ratcheted upwards over time.
FormatDefault FormatMajorVersion = iota
// FormatMostCompatible maintains the most backwards compatibility,
// maintaining bi-directional compatibility with RocksDB 6.2.1 in
// the particular configuration described in the Pebble README.
FormatMostCompatible
// formatVersionedManifestMarker is the first
// backwards-incompatible change made to Pebble, introducing the
// format-version marker file for handling backwards-incompatible
// changes more broadly, and replacing the `CURRENT` file with a
// marker file.
//
// This format version is intended as an intermediary version state.
// It is deliberately unexported to discourage direct use of this
// format major version. Clients should use FormatVersioned which
// also ensures earlier versions of Pebble fail to open a database
// written in a future format major version.
formatVersionedManifestMarker
// FormatVersioned is a new format major version that replaces the
// old `CURRENT` file with a new 'marker' file scheme. Previous
// Pebble versions will be unable to open the database unless
// they're aware of format versions.
FormatVersioned
// FormatSetWithDelete is a format major version that introduces a new key
// kind, base.InternalKeyKindSetWithDelete. Previous Pebble versions will be
// unable to open this database.
FormatSetWithDelete
// 22.1 versions.
// FormatBlockPropertyCollector is a format major version that introduces
// BlockPropertyCollectors.
FormatBlockPropertyCollector
// FormatSplitUserKeysMarked is a format major version that guarantees that
// all files that share user keys with neighbors are marked for compaction
// in the manifest. Ratcheting to FormatSplitUserKeysMarked will block
// (without holding mutexes) until the scan of the LSM is complete and the
// manifest has been rotated.
FormatSplitUserKeysMarked
// 22.2 versions.
// FormatSplitUserKeysMarkedCompacted is a format major version that
// guarantees that all files explicitly marked for compaction in the manifest
// have been compacted. Combined with the FormatSplitUserKeysMarked format
// major version, this version guarantees that there are no user keys split
// across multiple files within a level L1+. Ratcheting to this format version
// will block (without holding mutexes) until all necessary compactions for
// files marked for compaction are complete.
FormatSplitUserKeysMarkedCompacted
// FormatRangeKeys is a format major version that introduces range keys.
FormatRangeKeys
// FormatMinTableFormatPebblev1 is a format major version that guarantees that
// tables created by or ingested into the DB at or above this format major
// version will have a table format version of at least Pebblev1 (Block
// Properties).
FormatMinTableFormatPebblev1
// FormatPrePebblev1Marked is a format major version that guarantees that all
// sstables with a table format version pre-Pebblev1 (i.e. those that are
// guaranteed to not contain block properties) are marked for compaction in
// the manifest. Ratcheting to FormatPrePebblev1Marked will block (without
// holding mutexes) until the scan of the LSM is complete and the manifest has
// been rotated.
FormatPrePebblev1Marked
// 23.1 versions.
// formatUnusedPrePebblev1MarkedCompacted is an unused format major version.
// This format major version was originally intended to ship in the 23.1
// release. It was later decided that this should be deferred until a
// subsequent release. The original ordering is preserved so as not to
// introduce breaking changes in Cockroach.
formatUnusedPrePebblev1MarkedCompacted
// FormatSSTableValueBlocks is a format major version that adds support for
// storing values in value blocks in the sstable. Value block support is not
// necessarily enabled when writing sstables, when running with this format
// major version.
//
// WARNING: In development, so no production code should upgrade to this
// format, since a DB with this format major version will not actually
// interoperate correctly with another DB with the same format major
// version. This format major version is introduced so that tests can start
// being executed up to this version. Note that these tests succeed despite
// the incomplete support since they do not enable value blocks and use
// TableFormatPebblev2.
FormatSSTableValueBlocks
// FormatFlushableIngest is a format major version that enables lazy
// addition of ingested sstables into the LSM structure. When an ingest
// overlaps with a memtable, a record of the ingest is written to the WAL
// without waiting for a flush. Subsequent reads treat the ingested files as
// a level above the overlapping memtable. Once the memtable is flushed, the
// ingested files are moved into the lowest possible levels.
//
// This feature is behind a format major version because it required
// breaking changes to the WAL format.
FormatFlushableIngest
// 23.2 versions.
// FormatPrePebblev1MarkedCompacted is a format major version that guarantees
// that all sstables explicitly marked for compaction in the manifest (see
// FormatPrePebblev1Marked) have been compacted. Ratcheting to this format
// version will block (without holding mutexes) until all necessary
// compactions for files marked for compaction are complete.
FormatPrePebblev1MarkedCompacted
// FormatDeleteSizedAndObsolete is a format major version that adds support
// for deletion tombstones that encode the size of the value they're
// expected to delete. This format major version is required before the
// associated key kind may be committed through batch applications or
// ingests. It also adds support for keys that are marked obsolete (see
// sstable/format.go for details).
FormatDeleteSizedAndObsolete
// FormatVirtualSSTables is a format major version that adds support for
// virtual sstables that can reference a sub-range of keys in an underlying
// physical sstable. This information is persisted through new,
// backward-incompatible fields in the Manifest, and therefore requires
// a format major version.
FormatVirtualSSTables
// internalFormatNewest holds the newest format major version, including
// experimental ones excluded from the exported FormatNewest constant until
// they've stabilized. Used in tests.
//
// At this point in the block iota is one past the value of the last
// version declared above, hence the `- 1`. New versions must therefore be
// added above this line.
internalFormatNewest FormatMajorVersion = iota - 1
// FormatNewest always contains the most recent format major version.
// It is currently identical to internalFormatNewest.
FormatNewest FormatMajorVersion = internalFormatNewest
)
// MaxTableFormat reports the newest sstable.TableFormat that may be used
// while the database is at format major version v. It panics when v is not
// one of the known format major versions.
func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat {
	// The format major versions are consecutive integers starting at
	// FormatDefault (zero), so every group of versions that shares a table
	// format is a contiguous range identified by its last member.
	switch {
	case v <= FormatSetWithDelete:
		return sstable.TableFormatRocksDBv2
	case v <= FormatSplitUserKeysMarkedCompacted:
		return sstable.TableFormatPebblev1
	case v <= formatUnusedPrePebblev1MarkedCompacted:
		return sstable.TableFormatPebblev2
	case v <= FormatPrePebblev1MarkedCompacted:
		return sstable.TableFormatPebblev3
	case v <= FormatVirtualSSTables:
		return sstable.TableFormatPebblev4
	}
	// Anything newer than FormatVirtualSSTables has no mapping yet.
	panic(fmt.Sprintf("pebble: unsupported format major version: %s", v))
}
// MinTableFormat reports the oldest sstable.TableFormat that may be used
// while the database is at format major version v. It panics when v is not
// one of the known format major versions.
func (v FormatMajorVersion) MinTableFormat() sstable.TableFormat {
	// The format major versions are consecutive integers starting at
	// FormatDefault (zero), so the two groups below are contiguous ranges
	// identified by their last member.
	switch {
	case v <= FormatRangeKeys:
		return sstable.TableFormatLevelDB
	case v <= FormatVirtualSSTables:
		return sstable.TableFormatPebblev1
	}
	// Anything newer than FormatVirtualSSTables has no mapping yet.
	panic(fmt.Sprintf("pebble: unsupported format major version: %s", v))
}
// orderingInvariants reports which manifest ordering invariants must hold at
// format major version v: split user keys are prohibited from
// FormatSplitUserKeysMarkedCompacted onwards and allowed before it.
//
// Invariants only become stricter as the format major version advances, so
// it is fine for a caller to read them from the current version and enforce
// them later, even if the version has advanced in the meantime.
func (v FormatMajorVersion) orderingInvariants() manifest.OrderingInvariants {
	if v >= FormatSplitUserKeysMarkedCompacted {
		return manifest.ProhibitSplitUserKeys
	}
	return manifest.AllowSplitUserKeys
}
// formatMajorVersionMigrations defines the migrations from one format
// major version to the next. Each migration is defined as a closure
// which will be invoked on the database before the new format major
// version is committed. Migrations must be idempotent. Migrations are
// invoked with d.mu locked.
//
// The map is keyed by the version being migrated *to*;
// ratchetFormatMajorVersionLocked looks up and runs one entry per version
// step, in increasing order.
//
// Each migration is responsible for invoking finalizeFormatVersUpgrade
// to set the new format major version. RatchetFormatMajorVersion will
// log a fatal error if a migration returns a nil error but fails to
// finalize the new format major version.
var formatMajorVersionMigrations = map[FormatMajorVersion]func(*DB) error{
// NOTE(review): this entry returns nil without calling
// finalizeFormatVersUpgrade, which ratchetFormatMajorVersionLocked treats
// as fatal. Presumably it is never invoked because
// lookupFormatMajorVersion never reports a version below
// FormatMostCompatible - confirm.
FormatMostCompatible: func(d *DB) error { return nil },
formatVersionedManifestMarker: func(d *DB) error {
// formatVersionedManifestMarker introduces the use of a marker
// file for pointing to the current MANIFEST file.
// Lock the manifest.
d.mu.versions.logLock()
defer d.mu.versions.logUnlock()
// Construct the filename of the currently active manifest and
// move the manifest marker to that filename. The marker is
// guaranteed to exist, because we unconditionally locate it
// during Open.
manifestFileNum := d.mu.versions.manifestFileNum
filename := base.MakeFilename(fileTypeManifest, manifestFileNum.DiskFileNum())
if err := d.mu.versions.manifestMarker.Move(filename); err != nil {
return errors.Wrap(err, "moving manifest marker")
}
// Now that we have a manifest marker file in place and pointing
// to the current MANIFEST, finalize the upgrade. If we fail for
// some reason, a retry of this migration is guaranteed to again
// move the manifest marker file to the latest manifest. If
// we're unable to finalize the upgrade, a subsequent call to
// Open will ignore the manifest marker.
if err := d.finalizeFormatVersUpgrade(formatVersionedManifestMarker); err != nil {
return err
}
// We've finalized the upgrade. All subsequent Open calls will
// ignore the CURRENT file and instead read the manifest marker.
// Before we unlock the manifest, we need to update versionSet
// to use the manifest marker on future rotations.
d.mu.versions.setCurrent = setCurrentFuncMarker(
d.mu.versions.manifestMarker,
d.mu.versions.fs,
d.mu.versions.dirname)
return nil
},
// The FormatVersioned version is split into two, each with their
// own migration to ensure the post-migration cleanup happens even
// if there's a crash immediately after finalizing the version. Once
// a new format major version is finalized, its migration will never
// run again. Post-migration cleanup like the one in the migration
// below must be performed in a separate migration or every time the
// database opens.
FormatVersioned: func(d *DB) error {
// Replace the `CURRENT` file with one that points to the
// nonexistent `MANIFEST-000000` file. If an earlier Pebble
// version that does not know about format major versions
// attempts to open the database, it will error avoiding
// accidental corruption.
if err := setCurrentFile(d.mu.versions.dirname, d.mu.versions.fs, base.FileNum(0).DiskFileNum()); err != nil {
return err
}
return d.finalizeFormatVersUpgrade(FormatVersioned)
},
// As SetWithDelete is a new key kind, there is nothing to migrate. We can
// simply finalize the format version and we're done.
FormatSetWithDelete: func(d *DB) error {
return d.finalizeFormatVersUpgrade(FormatSetWithDelete)
},
// No migration work is performed for FormatBlockPropertyCollector; the
// closure only finalizes the version.
FormatBlockPropertyCollector: func(d *DB) error {
return d.finalizeFormatVersUpgrade(FormatBlockPropertyCollector)
},
FormatSplitUserKeysMarked: func(d *DB) error {
// Mark any unmarked files with split-user keys. Note all format major
// versions migrations are invoked with DB.mu locked.
if err := d.markFilesLocked(markFilesWithSplitUserKeys(d.opts.Comparer.Equal)); err != nil {
return err
}
return d.finalizeFormatVersUpgrade(FormatSplitUserKeysMarked)
},
FormatSplitUserKeysMarkedCompacted: func(d *DB) error {
// Before finalizing the format major version, rewrite any sstables
// still marked for compaction. Note all format major versions
// migrations are invoked with DB.mu locked.
if err := d.compactMarkedFilesLocked(); err != nil {
return err
}
return d.finalizeFormatVersUpgrade(FormatSplitUserKeysMarkedCompacted)
},
// No migration work is performed for FormatRangeKeys; the closure only
// finalizes the version.
FormatRangeKeys: func(d *DB) error {
return d.finalizeFormatVersUpgrade(FormatRangeKeys)
},
// No migration work is performed for FormatMinTableFormatPebblev1; the
// closure only finalizes the version.
FormatMinTableFormatPebblev1: func(d *DB) error {
return d.finalizeFormatVersUpgrade(FormatMinTableFormatPebblev1)
},
FormatPrePebblev1Marked: func(d *DB) error {
// Mark any unmarked files whose table format predates Pebblev1 (see
// markFilesPrePebblev1). Note all format major versions migrations are
// invoked with DB.mu locked.
if err := d.markFilesLocked(markFilesPrePebblev1(d.tableCache)); err != nil {
return err
}
return d.finalizeFormatVersUpgrade(FormatPrePebblev1Marked)
},
formatUnusedPrePebblev1MarkedCompacted: func(d *DB) error {
// Intentional no-op.
return d.finalizeFormatVersUpgrade(formatUnusedPrePebblev1MarkedCompacted)
},
// No migration work is performed for FormatSSTableValueBlocks; the
// closure only finalizes the version.
FormatSSTableValueBlocks: func(d *DB) error {
return d.finalizeFormatVersUpgrade(FormatSSTableValueBlocks)
},
// No migration work is performed for FormatFlushableIngest; the closure
// only finalizes the version.
FormatFlushableIngest: func(d *DB) error {
return d.finalizeFormatVersUpgrade(FormatFlushableIngest)
},
FormatPrePebblev1MarkedCompacted: func(d *DB) error {
// Before finalizing the format major version, rewrite any sstables
// still marked for compaction. Note all format major versions
// migrations are invoked with DB.mu locked.
if err := d.compactMarkedFilesLocked(); err != nil {
return err
}
return d.finalizeFormatVersUpgrade(FormatPrePebblev1MarkedCompacted)
},
// No migration work is performed for FormatDeleteSizedAndObsolete; the
// closure only finalizes the version.
FormatDeleteSizedAndObsolete: func(d *DB) error {
return d.finalizeFormatVersUpgrade(FormatDeleteSizedAndObsolete)
},
// No migration work is performed for FormatVirtualSSTables; the closure
// only finalizes the version.
FormatVirtualSSTables: func(d *DB) error {
return d.finalizeFormatVersUpgrade(FormatVirtualSSTables)
},
}
// formatVersionMarkerName is the marker name that lookupFormatMajorVersion
// passes to atomicfs.LocateMarker to find the marker recording the
// database's format major version.
const formatVersionMarkerName = `format-version`
// lookupFormatMajorVersion locates the format-version marker within dirname
// and decodes the format major version recorded in it.
//
// On success it returns the decoded version together with the located
// marker. If the marker records no value, the database is reported as being
// at FormatMostCompatible.
//
// An error (with a zero version and a nil marker) is returned if the marker
// cannot be located, if its value does not parse as a base 10 uint64, if it
// records FormatDefault (which must never be persisted), or if it records a
// version newer than internalFormatNewest, i.e. one this code does not know.
func lookupFormatMajorVersion(
	fs vfs.FS, dirname string,
) (FormatMajorVersion, *atomicfs.Marker, error) {
	m, versString, err := atomicfs.LocateMarker(fs, dirname, formatVersionMarkerName)
	if err != nil {
		return 0, nil, err
	}
	// NOTE(review): the error returns below discard m without releasing it.
	// If the located marker holds resources that need closing, they would be
	// leaked on those paths - confirm against the atomicfs.Marker API.
	if versString == "" {
		return FormatMostCompatible, m, nil
	}
	v, err := strconv.ParseUint(versString, 10, 64)
	if err != nil {
		return 0, nil, errors.Wrap(err, "parsing format major version")
	}
	vers := FormatMajorVersion(v)
	if vers == FormatDefault {
		// The message has no formatting verbs, so no arguments are passed;
		// supplying one would append a spurious "%!(EXTRA ...)" suffix.
		return 0, nil, errors.New("pebble: default format major version should not be persisted")
	}
	if vers > internalFormatNewest {
		return 0, nil, errors.Newf("pebble: database %q written in format major version %d", dirname, vers)
	}
	return vers, m, nil
}
// FormatMajorVersion reports the format major version the database is
// currently operating at. This may be higher than the version supplied in
// Options when the database was opened, if the existing database had
// already been written with a higher format version.
func (d *DB) FormatMajorVersion() FormatMajorVersion {
	raw := d.mu.formatVers.vers.Load()
	return FormatMajorVersion(raw)
}
// RatchetFormatMajorVersion raises the opened database's format major
// version to fmv, running the migration for every intermediate version. It
// returns an error if fmv is lower than the database's current version.
// After a database's format major version has been upgraded, earlier Pebble
// versions that do not know of that format version can no longer open it.
//
// It panics if the database has already been closed. d.mu is acquired for
// the duration of the call.
func (d *DB) RatchetFormatMajorVersion(fmv FormatMajorVersion) error {
	closedErr := d.closed.Load()
	if closedErr != nil {
		panic(closedErr)
	}

	d.mu.Lock()
	defer d.mu.Unlock()
	return d.ratchetFormatMajorVersionLocked(fmv)
}
// ratchetFormatMajorVersionLocked steps the database's format major version
// up to formatVers, one version at a time, by running the corresponding
// entry of formatMajorVersionMigrations for each step. It is called with
// d.mu held (see RatchetFormatMajorVersion).
//
// It returns an error if the database is read-only, if formatVers is newer
// than internalFormatNewest, if formatVers is lower than the current
// version, if another ratchet is already in progress, or if a migration
// fails. Requesting the current version is a successful no-op.
func (d *DB) ratchetFormatMajorVersionLocked(formatVers FormatMajorVersion) error {
	switch {
	case d.opts.ReadOnly:
		return ErrReadOnly
	case formatVers > internalFormatNewest:
		// Guard against accidentally forgetting to update internalFormatNewest.
		return errors.Errorf("pebble: unknown format version %d", formatVers)
	}
	if have := d.FormatMajorVersion(); have > formatVers {
		return errors.Newf("pebble: database already at format major version %d; cannot reduce to %d",
			have, formatVers)
	}
	if d.mu.formatVers.ratcheting {
		return errors.Newf("pebble: database format major version upgrade is in-progress")
	}

	// Flag the ratchet as in progress for the duration of the migrations so
	// that an overlapping call is rejected by the check above.
	d.mu.formatVers.ratcheting = true
	defer func() { d.mu.formatVers.ratcheting = false }()

	step := d.FormatMajorVersion() + 1
	for step <= formatVers {
		migrate := formatMajorVersionMigrations[step]
		if err := migrate(d); err != nil {
			return errors.Wrapf(err, "migrating to version %d", step)
		}
		// NB: Calling finalizeFormatVersUpgrade is the migration's job, not
		// ours, because some migrations need to update in-memory state
		// (without ever dropping locks) after the upgrade is finalized. All
		// that is left to do here is to assert that the upgrade happened.
		if d.FormatMajorVersion() != step {
			d.opts.Logger.Fatalf("pebble: successful migration to format version %d never finalized the upgrade", step)
		}
		step++
	}
	return nil
}
// finalizeFormatVersUpgrade commits formatVers as the database's format
// major version: it moves the format-version marker, then updates the
// in-memory version, then notifies the event listener. If moving the marker
// fails, that error is returned and nothing else is changed.
//
// It is typically only called from within a format major version migration.
//
// See formatMajorVersionMigrations.
func (d *DB) finalizeFormatVersUpgrade(formatVers FormatMajorVersion) error {
	// The active format version is encoded in the marker's filename. Unlike
	// other uses of the atomic marker, no file named `formatVers.String()`
	// exists on the filesystem.
	markerValue := formatVers.String()
	if moveErr := d.mu.formatVers.marker.Move(markerValue); moveErr != nil {
		return moveErr
	}

	d.mu.formatVers.vers.Store(uint64(formatVers))
	d.opts.EventListener.FormatUpgrade(formatVers)
	return nil
}
// compactMarkedFilesLocked performs a migration that schedules rewrite
// compactions to compact away any sstables marked for compaction.
// compactMarkedFilesLocked is run while ratcheting the database's format major
// version to FormatSplitUserKeysMarkedCompacted and to
// FormatPrePebblev1MarkedCompacted.
//
// Note that while this method is called with the DB.mu held, and will not
// return until all marked files have been compacted, the mutex is dropped while
// waiting for compactions to complete (or for slots to free up).
//
// It returns nil once the current version reports no files marked for
// compaction, or the error stored in d.closed if the database is found to be
// closed. It panics if files remain marked but no compaction is in progress.
func (d *DB) compactMarkedFilesLocked() error {
curr := d.mu.versions.currentVersion()
for curr.Stats.MarkedForCompaction > 0 {
// Attempt to schedule a compaction to rewrite a file marked for
// compaction.
d.maybeScheduleCompactionPicker(func(picker compactionPicker, env compactionEnv) *pickedCompaction {
return picker.pickRewriteCompaction(env)
})
// The above attempt might succeed and schedule a rewrite compaction. Or
// there might not be available compaction concurrency to schedule the
// compaction. Or compaction of the file might have already been in
// progress. In any scenario, wait until there's some change in the
// state of active compactions.
// Before waiting, check that the database hasn't been closed. Trying to
// schedule the compaction may have dropped d.mu while waiting for a
// manifest write to complete. In that dropped interim, the database may
// have been closed.
if err := d.closed.Load(); err != nil {
// The loaded value is not statically typed as an error, so it is
// asserted to one before being returned.
return err.(error)
}
// Some flush or compaction may have scheduled or completed while we waited
// for the manifest lock in maybeScheduleCompactionPicker. Get the latest
// Version before waiting on a compaction.
curr = d.mu.versions.currentVersion()
// Only wait on compactions if there are files still marked for compaction.
// NB: Waiting on this condition variable drops d.mu while blocked.
if curr.Stats.MarkedForCompaction > 0 {
// With no compaction in flight nothing would ever signal the condition
// variable, so waiting would block forever; treat it as a bug instead.
if d.mu.compact.compactingCount == 0 {
panic("expected a compaction of marked files in progress")
}
d.mu.compact.cond.Wait()
// Refresh the current version again.
curr = d.mu.versions.currentVersion()
}
}
return nil
}
// findFilesFunc scans the LSM version v for files, returning found=true if at
// least one file was found. The returned array contains the matched files, if
// any, indexed by level. markFilesLocked invokes a findFilesFunc without
// holding DB.mu and then marks the returned files for compaction.
type findFilesFunc func(v *version) (found bool, files [numLevels][]*fileMetadata, _ error)
// markFilesWithSplitUserKeys builds a findFilesFunc that looks, in every
// level except L0 (levels 1 through numLevels-1), for pairs of adjacent files
// where the largest user key of one equals, according to equal, the smallest
// user key of the next. Such arrangements of files were permitted in RocksDB
// and in Pebble up to SHA a860bbad.
//
// The returned function never reports an error.
var markFilesWithSplitUserKeys = func(equal Equal) findFilesFunc {
	return func(v *version) (found bool, files [numLevels][]*fileMetadata, _ error) {
		// Files with split user keys are expected to be rare, and comparing
		// keys for every file in the LSM is expensive. markFilesLocked
		// therefore runs this scan without holding the database lock.
		for level := numLevels - 1; level > 0; level-- {
			var (
				lastFile *fileMetadata
				lastKey  []byte
			)
			it := v.Levels[level].Iter()
			for cur := it.First(); cur != nil; cur = it.Next() {
				if lastKey != nil && equal(lastKey, cur.Smallest.UserKey) {
					// NB: A file may be appended twice, once as lastFile and
					// once as cur. That's okay: markFilesLocked skips files
					// that are already marked.
					files[level] = append(files[level], lastFile, cur)
					found = true
				}
				// A file whose largest key is an exclusive sentinel does not
				// actually contain that user key, so it cannot be split
				// across the boundary with the next file.
				lastFile, lastKey = nil, nil
				if !cur.Largest.IsExclusiveSentinel() {
					lastFile, lastKey = cur, cur.Largest.UserKey
				}
			}
		}
		return found, files, nil
	}
}
// markFilesPrePebblev1 builds a findFilesFunc that collects, from every level
// except L0, the files whose table format is older than
// sstable.TableFormatPebblev1, i.e. files that do not support block
// properties. Each file's format is read through the table cache tc.
//
// The scan stops at the first error from the table cache or the reader and
// returns it along with whatever had been collected so far. It panics if it
// encounters a virtual sstable.
var markFilesPrePebblev1 = func(tc *tableCacheContainer) findFilesFunc {
	return func(v *version) (found bool, files [numLevels][]*fileMetadata, err error) {
		for level := numLevels - 1; level > 0; level-- {
			it := v.Levels[level].Iter()
			for meta := it.First(); meta != nil; meta = it.Next() {
				if meta.Virtual {
					// Any physical sstable which has been virtualized must
					// have already undergone this migration, and we don't
					// need to worry about the virtual sstables themselves.
					panic("pebble: unexpected virtual sstable during migration")
				}
				// recordIfOld notes the file when its table format predates
				// Pebblev1.
				recordIfOld := func(r *sstable.Reader) error {
					format, formatErr := r.TableFormat()
					if formatErr != nil {
						return formatErr
					}
					if format < sstable.TableFormatPebblev1 {
						found = true
						files[level] = append(files[level], meta)
					}
					return nil
				}
				if err = tc.withReader(meta.PhysicalMeta(), recordIfOld); err != nil {
					return found, files, err
				}
			}
		}
		return found, files, err
	}
}
// markFilesLocked durably marks the files that match the given findFilesFunc
// for compaction.
//
// It is called with DB.mu held. The mutex is released while findFn scans the
// LSM and re-acquired before any state is modified. If findFn fails, its
// error is returned; if it finds nothing, nil is returned without touching
// the manifest. Otherwise the matched files are marked in memory and the
// result of the forced manifest rotation is returned.
func (d *DB) markFilesLocked(findFn findFilesFunc) error {
jobID := d.mu.nextJobID
d.mu.nextJobID++
// Acquire a read state to have a view of the LSM and a guarantee that none
// of the referenced files will be deleted until we've unreferenced the read
// state. Some findFilesFuncs may read the files, requiring they not be
// deleted.
rs := d.loadReadState()
var (
found bool
files [numLevels][]*fileMetadata
err error
)
func() {
// Deferred calls run last-in-first-out, so on exit d.mu is re-acquired
// first and the read state is unreferenced afterwards, with d.mu held.
defer rs.unrefLocked()
// Note the unusual locking: unlock, defer Lock(). The scan of the files in
// the version does not need to block other operations that require the
// DB.mu. Drop it for the scan, before re-acquiring it.
d.mu.Unlock()
defer d.mu.Lock()
found, files, err = findFn(rs.current)
}()
if err != nil {
return err
}
// The database lock has been acquired again by the defer within the above
// anonymous function.
if !found {
// Nothing to do.
return nil
}
// After scanning, if we found files to mark, we fetch the current state of
// the LSM (which may have changed) and set MarkedForCompaction on the files,
// and update the version's Stats.MarkedForCompaction count, which are both
// protected by d.mu.
// Lock the manifest for a coherent view of the LSM. The database lock has
// been re-acquired by the defer within the above anonymous function.
d.mu.versions.logLock()
vers := d.mu.versions.currentVersion()
for l, filesToMark := range files {
if len(filesToMark) == 0 {
continue
}
for _, f := range filesToMark {
// Ignore files to be marked that have already been compacted or marked.
// This also covers a file that the findFilesFunc returned more than once.
if f.CompactionState == manifest.CompactionStateCompacted ||
f.MarkedForCompaction {
continue
}
// Else, mark the file for compaction in this version.
vers.Stats.MarkedForCompaction++
f.MarkedForCompaction = true
}
// The compaction picker uses the markedForCompactionAnnotator to
// quickly find files marked for compaction, or to quickly determine
// that there are no such files marked for compaction within a level.
// A b-tree node may be annotated with an annotation recording that
// there are no files marked for compaction within the node's subtree,
// based on the assumption that it's static.
//
// Since we're marking files for compaction, these b-tree nodes'
// annotations will be out of date. Clear the compaction-picking
// annotation, so that it's recomputed the next time the compaction
// picker looks for a file marked for compaction.
vers.Levels[l].InvalidateAnnotation(markedForCompactionAnnotator{})
}
// The 'marked-for-compaction' bit is persisted in the MANIFEST file
// metadata. We've already modified the in-memory file metadata, but the
// manifest hasn't been updated. Force rotation to a new MANIFEST file,
// which will write every file metadata to the new manifest file and ensure
// that the now marked-for-compaction file metadata are persisted as marked.
// NB: This call to logAndApply will unlock the MANIFEST, which we locked
// above before obtaining `vers`.
return d.mu.versions.logAndApply(
jobID,
&manifest.VersionEdit{},
map[int]*LevelMetrics{},
true, /* forceRotation */
func() []compactionInfo { return d.getInProgressCompactionInfoLocked(nil) })
}