// file_system.h (forked from speedb-io/speedb)
// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// A FileSystem is an interface used by the rocksdb implementation to access
// storage functionality like the filesystem etc. Callers
// may wish to provide a custom FileSystem object when opening a database to
// get fine-grained control; e.g., to rate limit file system operations.
//
// All FileSystem implementations are safe for concurrent access from
// multiple threads without any external synchronization.
//
// WARNING: Since this is a new interface, it is expected that there will be
// some changes as storage systems are ported over.
#pragma once
#include <stdint.h>
#include <chrono>
#include <cstdarg>
#include <functional>
#include <limits>
#include <memory>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
#include "rocksdb/customizable.h"
#include "rocksdb/env.h"
#include "rocksdb/io_status.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"
#include "rocksdb/thread_status.h"
namespace ROCKSDB_NAMESPACE {
// Forward declarations.
class FileLock;
class FSDirectory;
class FSRandomAccessFile;
class FSRandomRWFile;
class FSSequentialFile;
class FSWritableFile;
class Logger;
class Slice;
struct ImmutableDBOptions;
struct MutableDBOptions;
class RateLimiter;
struct ConfigOptions;
// Short aliases for types nested in RandomAccessFile and Env.
// FileAttributes is the element type filled in by
// FileSystem::GetChildrenFileAttributes (its `name` and `size_bytes` members
// are written there).
using AccessPattern = RandomAccessFile::AccessPattern;
using FileAttributes = Env::FileAttributes;
// Priority of an IO request. This is a hint and does not guarantee any
// particular QoS.
enum class IOPriority : uint8_t {
// Typically background reads/writes such as compaction/flush
kIOLow,
// Typically user reads/synchronous WAL writes
kIOHigh,
// NOTE(review): presumably a count/sentinel rather than a real priority
// level - confirm against users of this enum.
kIOTotal,
};
// Type of the data being read/written. It can be passed down as a flag
// for the FileSystem implementation to optionally handle different types in
// different ways.
// IOOptions initializes its `type` field to kUnknown.
enum class IOType : uint8_t {
kData,
kFilter,
kIndex,
kMetadata,
kWAL,
kManifest,
kLog,
kUnknown,
kInvalid,
};
// Per-request hints handed down to the FileSystem implementation. None of
// them is guaranteed to be honored. Further hints (e.g. storage media
// HDD/SSD, replication level) may be added here in the future.
struct IOOptions {
  // How long the operation may take, in microseconds.
  std::chrono::microseconds timeout;

  // Request priority - high or low.
  IOPriority prio;

  // Priority used to charge a rate limiter configured at the file system
  // level (if any).
  // Limitation: right now RocksDB internal does not consider this
  // rate_limiter_priority
  Env::IOPriority rate_limiter_priority;

  // Kind of data being read/written.
  IOType type;

  // EXPERIMENTAL
  // Option map that is opaque to RocksDB. It lets a FileSystem user and the
  // FileSystem provider agree on a custom contract. Only useful when a
  // RocksDB user works with the FileSystem or file objects directly and wants
  // to pass extra options to APIs such as NewRandomAccessFile and
  // NewWritableFile.
  std::unordered_map<std::string, std::string> property_bag;

  // Force a directory fsync. Some file systems (e.g. btrfs) may skip the
  // directory fsync; set this to force it.
  bool force_dir_fsync;

  // Same as IOOptions(false).
  IOOptions() : IOOptions(false) {}

  // Zero timeout, low priority, Env::IO_TOTAL rate limiter priority, unknown
  // IO type, empty property bag, and the given directory-fsync flag.
  explicit IOOptions(bool force_dir_fsync_)
      : timeout{std::chrono::microseconds::zero()},
        prio{IOPriority::kIOLow},
        rate_limiter_priority{Env::IO_TOTAL},
        type{IOType::kUnknown},
        force_dir_fsync{force_dir_fsync_} {}
};
// Carries the reason for a directory fsync, plus reason-specific data.
// The constructors are only declared in this header; their definitions (and
// therefore the default value of `reason`) are not visible here.
struct DirFsyncOptions {
// The event that led to the directory fsync.
enum FsyncReason : uint8_t {
kNewFileSynced,
kFileRenamed,
kDirRenamed,
kFileDeleted,
kDefault,
} reason;
std::string renamed_new_name; // for kFileRenamed
// add other options for other FsyncReason
DirFsyncOptions();
// NOTE(review): presumably sets reason to kFileRenamed and stores the name in
// renamed_new_name - confirm against the out-of-line definition.
explicit DirFsyncOptions(std::string file_renamed_new_name);
explicit DirFsyncOptions(FsyncReason fsync_reason);
};
// File-scope options that control how a file is opened/created and how it is
// accessed while it's open. More options (redundancy level, media to use,
// etc.) may be added here in the future.
struct FileOptions : EnvOptions {
  // IOOptions applied to any IO that has to be issued while opening or
  // creating the file.
  IOOptions io_options;

  // EXPERIMENTAL
  // The feature is in development and is subject to change.
  // Temperature to assign to a newly created file, so the underlying file
  // system can place it on appropriate storage media and/or coding.
  Temperature temperature = Temperature::kUnknown;

  // Checksum type used to compute the checksum value handed off during file
  // writes. Defaults to CRC32c for every non-copy constructor.
  ChecksumType handoff_checksum_type = ChecksumType::kCRC32c;

  FileOptions() : EnvOptions() {}

  // Implicit conversions from DBOptions / EnvOptions are intentional; the
  // FileOptions-specific members keep their defaults.
  FileOptions(const DBOptions& opts) : EnvOptions(opts) {}
  FileOptions(const EnvOptions& opts) : EnvOptions(opts) {}

  // Member-wise copy of the EnvOptions base and all FileOptions members.
  FileOptions(const FileOptions& opts) = default;
  FileOptions& operator=(const FileOptions&) = default;
};
// A structure to pass back some debugging information from the FileSystem
// implementation to RocksDB in case of an IO error
struct IODebugContext {
  // file_path to be filled in by RocksDB in case of an error
  std::string file_path;

  // A map of counter names to values - set by the FileSystem implementation
  std::map<std::string, uint64_t> counters;

  // To be set by the FileSystem implementation
  std::string msg;

  // To be set by the underlying FileSystem implementation.
  std::string request_id;

  // In order to log required information in IO tracing for different
  // operations, each bit in trace_data stores which corresponding info from
  // IODebugContext will be added in the trace. For example, if
  // trace_data = 1, the bit at position 0 is set, so TraceData::kRequestID
  // (request_id) will be logged in the trace record.
  enum TraceData : char {
    // The value of each enum represents the bitwise position for
    // that information in trace_data which will be used by IOTracer for
    // tracing. Make sure to add them sequentially.
    kRequestID = 0,
  };
  uint64_t trace_data = 0;

  IODebugContext() {}

  // Records counter `name` with `value`. If a counter with the same name is
  // already present it is left unchanged (std::map::emplace semantics).
  // Takes the name by const reference so that temporaries, literals and const
  // strings can be passed.
  void AddCounter(const std::string& name, uint64_t value) {
    counters.emplace(name, value);
  }

  // Called by underlying file system to set request_id and log request_id in
  // IOTracing.
  void SetRequestId(const std::string& _request_id) {
    request_id = _request_id;
    // Shift a 64-bit one so the mask has the same width as trace_data.
    trace_data |= (uint64_t{1} << TraceData::kRequestID);
  }

  // Renders "<file_path>, " followed by "<name> = <value>," for every counter
  // (in map order, i.e. sorted by name) and finally `msg`. Does not modify
  // the context.
  std::string ToString() const {
    std::ostringstream ss;
    ss << file_path << ", ";
    for (const auto& counter : counters) {
      ss << counter.first << " = " << counter.second << ",";
    }
    ss << msg;
    return ss.str();
  }
};
// A callable type (a std::function taking a void*, not a raw function
// pointer) for custom destruction of the void pointer passed to the
// ReadAsync API. RocksDB/caller is responsible for deleting the void pointer
// allocated by FS in ReadAsync API.
using IOHandleDeleter = std::function<void(void*)>;
// The FileSystem, FSSequentialFile, FSRandomAccessFile, FSWritableFile,
// FSRandomRWFile, and FSDirectory classes define the interface between
// RocksDB and storage systems, such as Posix filesystems,
// remote filesystems etc.
// The interface allows for fine grained control of individual IO operations,
// such as setting a timeout, prioritization, hints on data placement,
// different handling based on type of IO etc.
// This is accomplished by passing an instance of IOOptions to every
// API call that can potentially perform IO. Additionally, each such API is
// passed a pointer to a IODebugContext structure that can be used by the
// storage system to include troubleshooting information. The return values
// of the APIs is of type IOStatus, which can indicate an error code/sub-code,
// as well as metadata about the error such as its scope and whether it's
// retryable.
// NewCompositeEnv can be used to create an Env with a custom FileSystem for
// DBOptions::env.
//
// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
// because RocksDB is not exception-safe. This could cause undefined behavior
// including data loss, unreported corruption, deadlocks, and more.
class FileSystem : public Customizable {
public:
FileSystem();
// No copying allowed
FileSystem(const FileSystem&) = delete;
virtual ~FileSystem();
static const char* Type() { return "FileSystem"; }
static const char* kDefaultName() { return "DefaultFileSystem"; }
// Loads the FileSystem specified by the input value into the result
// The CreateFromString alternative should be used; this method may be
// deprecated in a future release.
static Status Load(const std::string& value,
std::shared_ptr<FileSystem>* result);
// Loads the FileSystem specified by the input value into the result
// @see Customizable for a more detailed description of the parameters and
// return codes
// @param options Controls how the FileSystem is loaded
// @param value The name and optional properties describing the file system
// to load.
// @param result On success, returns the loaded FileSystem
// @return OK if the FileSystem was successfully loaded.
// @return not-OK if the load failed.
static Status CreateFromString(const ConfigOptions& options,
const std::string& value,
std::shared_ptr<FileSystem>* result);
// Return a default FileSystem suitable for the current operating
// system.
static std::shared_ptr<FileSystem> Default();
// Handles the event when a new DB or a new ColumnFamily starts using the
// specified data paths.
//
// The data paths might be shared by different DBs or ColumnFamilies,
// so RegisterDbPaths might be called with the same data paths.
// For example, when CreateColumnFamily is called multiple times with the same
// data path, RegisterDbPaths will also be called with the same data path.
//
// If the return status is ok, then the paths must be correspondingly
// called in UnregisterDbPaths;
// otherwise this method should have no side effect, and UnregisterDbPaths
// does not need to be called for the paths.
//
// Different implementations may take different actions.
// By default, it's a no-op and returns Status::OK.
virtual Status RegisterDbPaths(const std::vector<std::string>& /*paths*/) {
return Status::OK();
}
// Handles the event a DB or a ColumnFamily stops using the specified data
// paths.
//
// It should be called corresponding to each successful RegisterDbPaths.
//
// Different implementations may take different actions.
// By default, it's a no-op and returns Status::OK.
virtual Status UnregisterDbPaths(const std::vector<std::string>& /*paths*/) {
return Status::OK();
}
// Create a brand new sequentially-readable file with the specified name.
// On success, stores a pointer to the new file in *result and returns OK.
// On failure stores nullptr in *result and returns non-OK. If the file does
// not exist, returns a non-OK status.
//
// The returned file will only be accessed by one thread at a time.
virtual IOStatus NewSequentialFile(const std::string& fname,
const FileOptions& file_opts,
std::unique_ptr<FSSequentialFile>* result,
IODebugContext* dbg) = 0;
// Create a brand new random access read-only file with the
// specified name. On success, stores a pointer to the new file in
// *result and returns OK. On failure stores nullptr in *result and
// returns non-OK. If the file does not exist, returns a non-OK
// status.
//
// The returned file may be concurrently accessed by multiple threads.
virtual IOStatus NewRandomAccessFile(
const std::string& fname, const FileOptions& file_opts,
std::unique_ptr<FSRandomAccessFile>* result,
IODebugContext* dbg) = 0;
// These values match Linux definition
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
enum WriteLifeTimeHint {
kWLTHNotSet = 0, // No hint information set
kWLTHNone, // No hints about write life time
kWLTHShort, // Data written has a short life time
kWLTHMedium, // Data written has a medium life time
kWLTHLong, // Data written has a long life time
kWLTHExtreme, // Data written has an extremely long life time
};
// Create an object that writes to a new file with the specified
// name. Deletes any existing file with the same name and creates a
// new file. On success, stores a pointer to the new file in
// *result and returns OK. On failure stores nullptr in *result and
// returns non-OK.
//
// The returned file will only be accessed by one thread at a time.
virtual IOStatus NewWritableFile(const std::string& fname,
const FileOptions& file_opts,
std::unique_ptr<FSWritableFile>* result,
IODebugContext* dbg) = 0;
// Create an object that writes to a file with the specified name.
// `FSWritableFile::Append()`s will append after any existing content. If the
// file does not already exist, creates it.
//
// On success, stores a pointer to the file in *result and returns OK. On
// failure stores nullptr in *result and returns non-OK.
//
// The returned file will only be accessed by one thread at a time.
//
// Default implementation returns NotSupported.
virtual IOStatus ReopenWritableFile(
const std::string& /*fname*/, const FileOptions& /*options*/,
std::unique_ptr<FSWritableFile>* /*result*/, IODebugContext* /*dbg*/) {
return IOStatus::NotSupported("ReopenWritableFile");
}
// Reuse an existing file by renaming it and opening it as writable.
// Not pure virtual: a default implementation is defined outside this header.
virtual IOStatus ReuseWritableFile(const std::string& fname,
const std::string& old_fname,
const FileOptions& file_opts,
std::unique_ptr<FSWritableFile>* result,
IODebugContext* dbg);
// Open `fname` for random read and write, if file doesn't exist the file
// will be created. On success, stores a pointer to the new file in
// *result and returns OK. On failure returns non-OK.
//
// The returned file will only be accessed by one thread at a time.
//
// Default implementation returns NotSupported.
virtual IOStatus NewRandomRWFile(const std::string& /*fname*/,
const FileOptions& /*options*/,
std::unique_ptr<FSRandomRWFile>* /*result*/,
IODebugContext* /*dbg*/) {
return IOStatus::NotSupported(
"RandomRWFile is not implemented in this FileSystem");
}
// Opens `fname` as a memory-mapped file for read and write (in-place updates
// only, i.e., no appends). On success, stores a raw buffer covering the whole
// file in `*result`. The file must exist prior to this call.
//
// Default implementation returns NotSupported.
virtual IOStatus NewMemoryMappedFileBuffer(
const std::string& /*fname*/,
std::unique_ptr<MemoryMappedFileBuffer>* /*result*/) {
return IOStatus::NotSupported(
"MemoryMappedFileBuffer is not implemented in this FileSystem");
}
// Create an object that represents a directory. Will fail if directory
// doesn't exist. If the directory exists, it will open the directory
// and create a new Directory object.
//
// On success, stores a pointer to the new Directory in
// *result and returns OK. On failure stores nullptr in *result and
// returns non-OK.
virtual IOStatus NewDirectory(const std::string& name,
const IOOptions& io_opts,
std::unique_ptr<FSDirectory>* result,
IODebugContext* dbg) = 0;
// Returns OK if the named file exists.
// NotFound if the named file does not exist,
// the calling process does not have permission to determine
// whether this file exists, or if the path is invalid.
// IOError if an IO Error was encountered
virtual IOStatus FileExists(const std::string& fname,
const IOOptions& options,
IODebugContext* dbg) = 0;
// Store in *result the names of the children of the specified directory.
// The names are relative to "dir".
// Original contents of *result are dropped.
// Returns OK if "dir" exists and "*result" contains its children.
// NotFound if "dir" does not exist, the calling process does not have
// permission to access "dir", or if "dir" is invalid.
// IOError if an IO Error was encountered
virtual IOStatus GetChildren(const std::string& dir, const IOOptions& options,
std::vector<std::string>* result,
IODebugContext* dbg) = 0;
// Store in *result the attributes of the children of the specified directory.
// In case the implementation lists the directory prior to iterating the files
// and files are concurrently deleted, the deleted files will be omitted from
// result.
// The name attributes are relative to "dir".
// Original contents of *result are dropped.
// Returns OK if "dir" exists and "*result" contains its children.
// NotFound if "dir" does not exist, the calling process does not have
// permission to access "dir", or if "dir" is invalid.
// IOError if an IO Error was encountered
//
// The default implementation below is built on GetChildren, GetFileSize and
// FileExists.
virtual IOStatus GetChildrenFileAttributes(
const std::string& dir, const IOOptions& options,
std::vector<FileAttributes>* result, IODebugContext* dbg) {
assert(result != nullptr);
std::vector<std::string> child_fnames;
IOStatus s = GetChildren(dir, options, &child_fnames, dbg);
if (!s.ok()) {
return s;
}
// Size the output for the worst case (every child still exists); it is
// shrunk to the number of entries actually filled in at the end.
result->resize(child_fnames.size());
// Index of the next slot of *result to fill.
size_t result_size = 0;
for (size_t i = 0; i < child_fnames.size(); ++i) {
const std::string path = dir + "/" + child_fnames[i];
if (!(s = GetFileSize(path, options, &(*result)[result_size].size_bytes,
dbg))
.ok()) {
if (FileExists(path, options, dbg).IsNotFound()) {
// The file may have been deleted since we listed the directory
continue;
}
// Any other failure is reported to the caller as the GetFileSize status.
return s;
}
(*result)[result_size].name = std::move(child_fnames[i]);
result_size++;
}
result->resize(result_size);
return IOStatus::OK();
}
// This seems to clash with a macro on Windows, so #undef it here
#ifdef DeleteFile
#undef DeleteFile
#endif
// Delete the named file.
virtual IOStatus DeleteFile(const std::string& fname,
const IOOptions& options,
IODebugContext* dbg) = 0;
// Truncate the named file to the specified size.
// Default implementation returns NotSupported.
virtual IOStatus Truncate(const std::string& /*fname*/, size_t /*size*/,
const IOOptions& /*options*/,
IODebugContext* /*dbg*/) {
return IOStatus::NotSupported("Truncate is not supported for this FileSystem");
}
// Create the specified directory. Returns error if directory exists.
virtual IOStatus CreateDir(const std::string& dirname,
const IOOptions& options, IODebugContext* dbg) = 0;
// Creates the directory if missing. Returns OK if it already exists or was
// successfully created.
virtual IOStatus CreateDirIfMissing(const std::string& dirname,
const IOOptions& options,
IODebugContext* dbg) = 0;
// Delete the specified directory.
virtual IOStatus DeleteDir(const std::string& dirname,
const IOOptions& options, IODebugContext* dbg) = 0;
// Store the size of fname in *file_size.
virtual IOStatus GetFileSize(const std::string& fname,
const IOOptions& options, uint64_t* file_size,
IODebugContext* dbg) = 0;
// Store the last modification time of fname in *file_mtime.
virtual IOStatus GetFileModificationTime(const std::string& fname,
const IOOptions& options,
uint64_t* file_mtime,
IODebugContext* dbg) = 0;
// Rename file src to target.
virtual IOStatus RenameFile(const std::string& src, const std::string& target,
const IOOptions& options,
IODebugContext* dbg) = 0;
// Hard Link file src to target.
// Default implementation returns NotSupported.
virtual IOStatus LinkFile(const std::string& /*src*/,
const std::string& /*target*/,
const IOOptions& /*options*/,
IODebugContext* /*dbg*/) {
return IOStatus::NotSupported("LinkFile is not supported for this FileSystem");
}
// Get the number of links of fname (output parameter `count`).
// Default implementation returns NotSupported.
virtual IOStatus NumFileLinks(const std::string& /*fname*/,
const IOOptions& /*options*/,
uint64_t* /*count*/, IODebugContext* /*dbg*/) {
return IOStatus::NotSupported(
"Getting number of file links is not supported for this FileSystem");
}
// NOTE(review): presumably reports through `res` whether `first` and `second`
// refer to the same underlying file - confirm against an implementation.
// Default implementation returns NotSupported.
virtual IOStatus AreFilesSame(const std::string& /*first*/,
const std::string& /*second*/,
const IOOptions& /*options*/, bool* /*res*/,
IODebugContext* /*dbg*/) {
return IOStatus::NotSupported("AreFilesSame is not supported for this FileSystem");
}
// Lock the specified file. Used to prevent concurrent access to
// the same db by multiple processes. On failure, stores nullptr in
// *lock and returns non-OK.
//
// On success, stores a pointer to the object that represents the
// acquired lock in *lock and returns OK. The caller should call
// UnlockFile(*lock) to release the lock. If the process exits,
// the lock will be automatically released.
//
// If somebody else already holds the lock, finishes immediately
// with a failure. I.e., this call does not wait for existing locks
// to go away.
//
// May create the named file if it does not already exist.
virtual IOStatus LockFile(const std::string& fname, const IOOptions& options,
FileLock** lock, IODebugContext* dbg) = 0;
// Release the lock acquired by a previous successful call to LockFile.
// REQUIRES: lock was returned by a successful LockFile() call
// REQUIRES: lock has not already been unlocked.
virtual IOStatus UnlockFile(FileLock* lock, const IOOptions& options,
IODebugContext* dbg) = 0;
// *path is set to a temporary directory that can be used for testing. It may
// or may not have just been created. The directory may or may not differ
// between runs of the same process, but subsequent calls will return the
// same directory.
virtual IOStatus GetTestDirectory(const IOOptions& options, std::string* path,
IODebugContext* dbg) = 0;
// Creates and returns a default logger (an instance of EnvLogger) for storing
// informational messages. Derived classes can override to provide custom
// logger.
virtual IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts,
std::shared_ptr<Logger>* result,
IODebugContext* dbg) = 0;
// Get full directory name for this db.
virtual IOStatus GetAbsolutePath(const std::string& db_path,
const IOOptions& options,
std::string* output_path,
IODebugContext* dbg) = 0;
// Sanitize the FileOptions. Typically called by a FileOptions/EnvOptions
// copy constructor
// Default implementation does nothing.
virtual void SanitizeFileOptions(FileOptions* /*opts*/) const {}
// OptimizeForLogRead will create a new FileOptions object that is a copy of
// the FileOptions in the parameters, but is optimized for reading log files.
virtual FileOptions OptimizeForLogRead(const FileOptions& file_options) const;
// OptimizeForManifestRead will create a new FileOptions object that is a copy
// of the FileOptions in the parameters, but is optimized for reading manifest
// files.
virtual FileOptions OptimizeForManifestRead(
const FileOptions& file_options) const;
// OptimizeForLogWrite will create a new FileOptions object that is a copy of
// the FileOptions in the parameters, but is optimized for writing log files.
// Default implementation returns the copy of the same object.
virtual FileOptions OptimizeForLogWrite(const FileOptions& file_options,
const DBOptions& db_options) const;
// OptimizeForManifestWrite will create a new FileOptions object that is a
// copy of the FileOptions in the parameters, but is optimized for writing
// manifest files. Default implementation returns the copy of the same
// object.
virtual FileOptions OptimizeForManifestWrite(
const FileOptions& file_options) const;
// OptimizeForCompactionTableWrite will create a new FileOptions object that
// is a copy of the FileOptions in the parameters, but is optimized for
// writing table files.
virtual FileOptions OptimizeForCompactionTableWrite(
const FileOptions& file_options,
const ImmutableDBOptions& immutable_ops) const;
// OptimizeForCompactionTableRead will create a new FileOptions object that
// is a copy of the FileOptions in the parameters, but is optimized for
// reading table files.
virtual FileOptions OptimizeForCompactionTableRead(
const FileOptions& file_options,
const ImmutableDBOptions& db_options) const;
// OptimizeForBlobFileRead will create a new FileOptions object that
// is a copy of the FileOptions in the parameters, but is optimized for
// reading blob files.
virtual FileOptions OptimizeForBlobFileRead(
const FileOptions& file_options,
const ImmutableDBOptions& db_options) const;
// This seems to clash with a macro on Windows, so #undef it here
#ifdef GetFreeSpace
#undef GetFreeSpace
#endif
// Get the amount of free disk space
// Default implementation returns NotSupported.
virtual IOStatus GetFreeSpace(const std::string& /*path*/,
const IOOptions& /*options*/,
uint64_t* /*diskfree*/,
IODebugContext* /*dbg*/) {
return IOStatus::NotSupported("GetFreeSpace");
}
// NOTE(review): presumably stores in *is_dir whether `path` is a directory -
// confirm against an implementation. Must be implemented by every FileSystem.
virtual IOStatus IsDirectory(const std::string& /*path*/,
const IOOptions& options, bool* is_dir,
IODebugContext* /*dbg*/) = 0;
// EXPERIMENTAL
// Poll for completion of read IO requests. The Poll() method should call the
// callback functions to indicate completion of read requests.
// Underlying FS is required to support Poll API. Poll implementation should
// ensure that the callback gets called at IO completion, and return only
// after the callback has been called.
// If Poll returns partial results for any reads, it's the caller's
// responsibility to call Read or ReadAsync in order to get the remaining
// bytes.
//
// Default implementation is to return IOStatus::OK.
virtual IOStatus Poll(std::vector<void*>& /*io_handles*/,
size_t /*min_completions*/) {
return IOStatus::OK();
}
// EXPERIMENTAL
// Abort the read IO requests submitted asynchronously. Underlying FS is
// required to support AbortIO API. AbortIO implementation should ensure that
// all the read requests related to io_handles are aborted and that
// it doesn't call the callback for these io_handles.
//
// Default implementation is to return IOStatus::OK.
virtual IOStatus AbortIO(std::vector<void*>& /*io_handles*/) {
return IOStatus::OK();
}
// If you're adding methods here, remember to add them to EnvWrapper too.
private:
// Copy assignment is declared private (no definition appears in this header),
// so, like the deleted copy constructor above, it is unavailable to users of
// the class.
void operator=(const FileSystem&);
};
// A file abstraction for reading sequentially through a file
class FSSequentialFile {
public:
FSSequentialFile() {}
virtual ~FSSequentialFile() {}
// Read up to "n" bytes from the file. "scratch[0..n-1]" may be
// written by this routine. Sets "*result" to the data that was
// read (including if fewer than "n" bytes were successfully read).
// May set "*result" to point at data in "scratch[0..n-1]", so
// "scratch[0..n-1]" must be live when "*result" is used.
// If an error was encountered, returns a non-OK status.
//
// After call, result->size() < n only if end of file has been
// reached (or non-OK status). Read might fail if called again after
// first result->size() < n.
//
// REQUIRES: External synchronization
virtual IOStatus Read(size_t n, const IOOptions& options, Slice* result,
char* scratch, IODebugContext* dbg) = 0;
// Skip "n" bytes from the file. This is guaranteed to be no
// slower than reading the same data, but may be faster.
//
// If end of file is reached, skipping will stop at the end of the
// file, and Skip will return OK.
//
// REQUIRES: External synchronization
virtual IOStatus Skip(uint64_t n) = 0;
// Indicates the upper layers if the current SequentialFile implementation
// uses direct IO.
// Default implementation returns false.
virtual bool use_direct_io() const { return false; }
// Use the returned alignment value to allocate
// aligned buffer for Direct I/O
// Default implementation returns kDefaultPageSize.
virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
// Remove any kind of caching of data from the offset to offset+length
// of this file. If the length is 0, then it refers to the end of file.
// If the system is not caching the file contents, then this is a noop.
// Default implementation returns NotSupported.
virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) {
return IOStatus::NotSupported("InvalidateCache not supported.");
}
// Positioned Read for direct I/O
// If Direct I/O enabled, offset, n, and scratch should be properly aligned
// Default implementation returns NotSupported.
virtual IOStatus PositionedRead(uint64_t /*offset*/, size_t /*n*/,
const IOOptions& /*options*/,
Slice* /*result*/, char* /*scratch*/,
IODebugContext* /*dbg*/) {
return IOStatus::NotSupported("PositionedRead");
}
// EXPERIMENTAL
// When available, returns the actual temperature for the file. This is
// useful in case some outside process moves a file from one tier to another,
// though the temperature is generally expected not to change while a file is
// open.
// Default implementation returns Temperature::kUnknown.
virtual Temperature GetTemperature() const { return Temperature::kUnknown; }
// If you're adding methods here, remember to add them to
// SequentialFileWrapper too.
};
// A read IO request structure for use in MultiRead and asynchronous Read APIs.
// Note: offset, len and scratch have no default member initializers, so the
// code constructing a request must set them.
struct FSReadRequest {
// Input parameter that represents the file offset in bytes.
uint64_t offset;
// Input parameter that represents the length to read in bytes. `result` only
// returns fewer bytes if end of file is hit (or `status` is not OK).
size_t len;
// A buffer that MultiRead() can optionally place data in. It can
// ignore this and allocate its own buffer.
// The lifecycle of scratch will be until IO is completed.
//
// In case of asynchronous reads, it's an output parameter and it will be
// maintained until callback has been called. Scratch is allocated by RocksDB
// and will be passed to underlying FileSystem.
char* scratch;
// Output parameter set by MultiRead() to point to the data buffer, and
// the number of valid bytes
//
// In case of asynchronous reads, this output parameter is set by Async Read
// APIs to point to the data buffer, and
// the number of valid bytes.
// Slice result should point to scratch i.e the data should
// always be read into scratch.
Slice result;
// Output parameter set by underlying FileSystem that represents status of
// read request.
IOStatus status;
};
// A file abstraction for randomly reading the contents of a file.
class FSRandomAccessFile {
 public:
  FSRandomAccessFile() = default;
  virtual ~FSRandomAccessFile() = default;

  // Read up to "n" bytes from the file starting at "offset".
  // "scratch[0..n-1]" may be written by this routine. Sets "*result"
  // to the data that was read (including if fewer than "n" bytes were
  // successfully read). May set "*result" to point at data in
  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
  // "*result" is used. If an error was encountered, returns a non-OK
  // status.
  //
  // After call, result->size() < n only if end of file has been
  // reached (or non-OK status). Read might fail if called again after
  // first result->size() < n.
  //
  // Safe for concurrent use by multiple threads.
  // If Direct I/O enabled, offset, n, and scratch should be aligned properly.
  virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
                        Slice* result, char* scratch,
                        IODebugContext* dbg) const = 0;

  // Readahead the file starting from offset by n bytes for caching.
  // If it's not implemented (default: `NotSupported`), RocksDB will create
  // internal prefetch buffer to improve read performance.
  virtual IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/,
                            const IOOptions& /*options*/,
                            IODebugContext* /*dbg*/) {
    return IOStatus::NotSupported("Prefetch");
  }

  // Read a bunch of blocks as described by reqs. The blocks can
  // optionally be read in parallel. This is a synchronous call, i.e. it
  // should return after all reads have completed. The reads will be
  // non-overlapping but can be in any order. If the function return Status
  // is not ok, status of individual requests will be ignored and return
  // status will be assumed for all read requests. The function return status
  // is only meant for errors that occur before processing individual read
  // requests.
  //
  // The default implementation issues one Read() per request, in array
  // order, stores each outcome in that request's `status`, and always
  // returns OK itself.
  virtual IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
                             const IOOptions& options, IODebugContext* dbg) {
    assert(reqs != nullptr);
    for (size_t i = 0; i < num_reqs; ++i) {
      FSReadRequest& req = reqs[i];
      req.status =
          Read(req.offset, req.len, options, &req.result, req.scratch, dbg);
    }
    return IOStatus::OK();
  }

  // Tries to get a unique ID for this file that will be the same each time
  // the file is opened (and will stay the same while the file is open).
  // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
  // ID can be created this function returns the length of the ID and places it
  // in "id"; otherwise, this function returns 0, in which case "id"
  // may not have been modified.
  //
  // This function guarantees, for IDs from a given environment, two unique ids
  // cannot be made equal to each other by adding arbitrary bytes to one of
  // them. That is, no unique ID is the prefix of another.
  //
  // This function guarantees that the returned ID will not be interpretable as
  // a single varint.
  //
  // Note: these IDs are only valid for the duration of the process.
  virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
    return 0;  // Default implementation to prevent issues with backwards
               // compatibility.
  }

  enum AccessPattern { kNormal, kRandom, kSequential, kWillNeed, kWontNeed };

  // Access-pattern hint; the default implementation does nothing.
  virtual void Hint(AccessPattern /*pattern*/) {}

  // Indicates the upper layers if the current RandomAccessFile implementation
  // uses direct IO.
  virtual bool use_direct_io() const { return false; }

  // Use the returned alignment value to allocate
  // aligned buffer for Direct I/O
  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }

  // Remove any kind of caching of data from the offset to offset+length
  // of this file. If the length is 0, then it refers to the end of file.
  // If the system is not caching the file contents, then this is a noop.
  virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) {
    return IOStatus::NotSupported("InvalidateCache not supported.");
  }

  // EXPERIMENTAL
  // This API reads the requested data in FSReadRequest asynchronously. This is
  // an asynchronous call, i.e. it should return after submitting the request.
  //
  // When the read request is completed, callback function specified in cb
  // should be called with arguments cb_arg and the result populated in
  // FSReadRequest with result and status fields updated by FileSystem.
  // cb_arg should be used by the callback to track the original request
  // submitted.
  //
  // This API should also populate io_handle which should be used by
  // underlying FileSystem to store the context in order to distinguish the read
  // requests at their side and provide the custom deletion function in del_fn.
  // RocksDB guarantees that the del_fn for io_handle will be called after
  // receiving the callback. Furthermore, RocksDB guarantees that if it calls
  // the Poll API for this io_handle, del_fn will be called after the Poll
  // returns. RocksDB is responsible for managing the lifetime of io_handle.
  //
  // req contains the request offset and size passed as input parameter of read
  // request and result and status fields are output parameter set by underlying
  // FileSystem. The data should always be read into scratch field.
  //
  // Default implementation is to read the data synchronously: it performs
  // the Read(), invokes cb before returning, leaves io_handle and del_fn
  // untouched, and returns OK.
  virtual IOStatus ReadAsync(
      FSReadRequest& req, const IOOptions& opts,
      std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
      void** /*io_handle*/, IOHandleDeleter* /*del_fn*/, IODebugContext* dbg) {
    req.status =
        Read(req.offset, req.len, opts, &(req.result), req.scratch, dbg);
    cb(req, cb_arg);
    return IOStatus::OK();
  }

  // EXPERIMENTAL
  // When available, returns the actual temperature for the file. This is
  // useful in case some outside process moves a file from one tier to another,
  // though the temperature is generally expected not to change while a file is
  // open.
  virtual Temperature GetTemperature() const { return Temperature::kUnknown; }

  // If you're adding methods here, remember to add them to
  // RandomAccessFileWrapper too.
};
// A data structure that carries the data verification information, which is
// used together with data being written to a file.
struct DataVerificationInfo {
// Checksum of the data being written.
Slice checksum;
};
// A file abstraction for sequential writing. The implementation
// must provide buffering since callers may append small fragments
// at a time to the file.
class FSWritableFile {
public:
// Default constructor: both preallocation fields start at 0, the IO priority
// is Env::IO_TOTAL, the write hint is Env::WLTH_NOT_SET, and
// strict_bytes_per_sync_ is false.
FSWritableFile()
: last_preallocated_block_(0),
preallocation_block_size_(0),
io_priority_(Env::IO_TOTAL),
write_hint_(Env::WLTH_NOT_SET),
strict_bytes_per_sync_(false) {}
// Constructs from FileOptions: both preallocation fields start at 0, the IO
// priority is Env::IO_TOTAL, the write hint is Env::WLTH_NOT_SET, and
// strict_bytes_per_sync_ is copied from options.strict_bytes_per_sync.
// Marked explicit, so a FileOptions is never implicitly converted.
explicit FSWritableFile(const FileOptions& options)
: last_preallocated_block_(0),
preallocation_block_size_(0),
io_priority_(Env::IO_TOTAL),
write_hint_(Env::WLTH_NOT_SET),
strict_bytes_per_sync_(options.strict_bytes_per_sync) {}
// Virtual destructor with an empty body, so that destroying an object through
// an FSWritableFile pointer runs the most-derived destructor.
virtual ~FSWritableFile() {}
// Append data to the end of the file
// Note: A WritableFile object must support either Append or
// PositionedAppend, so the users cannot mix the two.
//
// Pure virtual: every concrete writable file must provide this overload.
virtual IOStatus Append(const Slice& data, const IOOptions& options,
IODebugContext* dbg) = 0;
// Append data together with verification information.
// Note that this API change is experimental and it might be changed in
// the future. Currently, RocksDB only generates crc32c based checksum for
// the file writes when the checksum handoff option is set.
// Expected behavior: if the handoff_checksum_type in FileOptions (currently,
// ChecksumType::kCRC32C is set as default) is not supported by this
// FSWritableFile, the information in DataVerificationInfo can be ignored
// (i.e. does not perform checksum verification).
//
// The base-class default drops the verification info and forwards to the
// three-argument Append overload.
virtual IOStatus Append(const Slice& data, const IOOptions& options,
                        const DataVerificationInfo& /* verification_info */,
                        IODebugContext* dbg) {
  return this->Append(data, options, dbg);
}
// PositionedAppend data to the specified offset. The new EOF after append
// must be larger than the previous EOF. This is to be used when writes are
// not backed by OS buffers and hence has to always start from the start of
// the sector. The implementation thus needs to also rewrite the last
// partial sector.
// Note: PositionedAppend does not guarantee moving the file offset after the
// write. A WritableFile object must support either Append or
// PositionedAppend, so the users cannot mix the two.
//
// PositionedAppend() can only happen on the page/sector boundaries. For that
// reason, if the last write was an incomplete sector we still need to rewind
// back to the nearest sector/page and rewrite the portion of it with whatever
// we need to add. We need to keep where we stop writing.
//
// PositionedAppend() can only write whole sectors. For that reason we have to
// pad with zeros for the last write and trim the file when closing according
// to the position we keep in the previous step.
//
// PositionedAppend() requires aligned buffer to be passed in. The alignment
// required is queried via GetRequiredBufferAlignment()
//
// The base-class default ignores every argument and returns NotSupported.
virtual IOStatus PositionedAppend(const Slice& /* data */,
                                  uint64_t /* offset */,
                                  const IOOptions& /*options*/,
                                  IODebugContext* /*dbg*/) {
  return IOStatus::NotSupported("PositionedAppend");
}
// PositionedAppend data with verification information.
// Note that this API change is experimental and it might be changed in
// the future. Currently, RocksDB only generates crc32c based checksum for