From 60eeba8cfc0b056d66eca7f89484b146433ba93c Mon Sep 17 00:00:00 2001 From: Allison Karlitskaya Date: Mon, 16 Dec 2024 10:30:20 +0100 Subject: [PATCH] src: add internal erofs writer code This introduces experimental code for writing erofs images ourselves, instead of using the external mkcomposefs CLI. It's currently disabled by default. You can test it by setting the `COMPOSEFS_FORMAT=new` environment variable. This currently produces a different output than the output of mkcomposefs, which is why it's gated behind an environment variable. The plan is to add a compatibility mode to our internal writer code so that it produces as similar of an output as possible and then switch over to using it once we are convinced that it's equivalent. Then the `COMPOSEFS_FORMAT=` variable will disable this compatibility mode. There's also a `COMPOSEFS_DUMP_EROFS=1` environment variable (which works with both `mkcomposefs` and our internal code) which will dump the erofs layout for diffing. There's also a standalone `erofs-debug` binary that will do the same. Additionally, this introduces two new files in docs: - a detailed description of the parts of erofs that we use - a document which attempts to describe the decisions made in creating an erofs composefs image (in terms of which order the files are in, etc). The main idea here is to start a serious effort towards standardizing the composefs label we want to start adding to container images: it should be possible to define what will be in that label by way of documentation instead of saying "run this software and use the output". These two new documents, taken together with the existing "oci.md" form a rough (and still incomplete) outline for that. Many thanks to Gao Xiang for helping clarify many points about the erofs file format for the documentation. 
Closes #56 Signed-off-by: Allison Karlitskaya --- Cargo.toml | 5 +- doc/erofs.md | 431 ++++++++++++++++++++++++ doc/image-format.md | 276 +++++++++++++++ src/bin/cfsctl.rs | 2 + src/bin/erofs-debug.rs | 25 ++ src/erofs/debug.rs | 377 +++++++++++++++++++++ src/erofs/format.rs | 279 +++++++++++++++ src/erofs/mod.rs | 3 + src/erofs/reader.rs | 460 +++++++++++++++++++++++++ src/fs.rs | 2 +- src/image.rs | 27 +- src/lib.rs | 2 + src/mkfs.rs | 749 +++++++++++++++++++++++++++++++++++++++++ src/oci/image.rs | 7 +- 14 files changed, 2614 insertions(+), 31 deletions(-) create mode 100644 doc/erofs.md create mode 100644 doc/image-format.md create mode 100644 src/bin/erofs-debug.rs create mode 100644 src/erofs/debug.rs create mode 100644 src/erofs/format.rs create mode 100644 src/erofs/mod.rs create mode 100644 src/erofs/reader.rs create mode 100644 src/mkfs.rs diff --git a/Cargo.toml b/Cargo.toml index 38ff692..f6f7dc6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,8 +16,10 @@ anyhow = { version = "1.0.89", default-features = false } async-compression = { version = "0.4.17", default-features = false, features = ["tokio", "gzip"] } clap = { version = "4.5.19", default-features = false, features = ["std", "help", "usage", "derive"] } containers-image-proxy = "0.7.0" +env_logger = "0.11.5" hex = "0.4.3" indicatif = { version = "0.17.8", features = ["tokio"] } +log = "0.4.22" oci-spec = "0.7.0" regex-automata = { version = "0.4.8", default-features = false } rustix = { version = "0.38.37", features = ["fs", "mount", "process"] } @@ -26,7 +28,8 @@ tar = { version = "0.4.42", default-features = false } tempfile = "3.13.0" thiserror = "2.0.4" tokio = "1.41.0" -zerocopy = "0.8.13" +xxhash-rust = { version = "0.8.12", features = ["xxh32"] } +zerocopy = { version = "0.8.13", features = ["derive"] } zstd = "0.13.2" [dev-dependencies] diff --git a/doc/erofs.md b/doc/erofs.md new file mode 100644 index 0000000..07b5755 --- /dev/null +++ b/doc/erofs.md @@ -0,0 +1,431 @@ +# erofs: the 
missing manual + +## Introduction + +This is an attempt to document the format of erofs (or at least the subsets of +it that we use in composefs). + +It probably makes sense to have `erofs_fs.h` open when reading this. + +## Overall concepts + +All integers (including all offsets) are stored in little-endian byte order. + +The file layout is fairly free-form. You can freely mix inodes, data blocks, +and shared xattr entries. inodes are 64-bit values based on file offsets +rather than integer indexes into a fixed table, so they can be anywhere at all. +xattrs are 32-bit values based on offsets, so they're a bit more limited (but +not in filesystems of reasonable size). + +## The first 1024 bytes (pre-superblock) + +The first 1024 bytes of an erofs have no particular meaning. You can put +anything you want there, like partition tables or boot sectors or anything +else. composefs puts its own header inside of this area, at the start. + +## The superblock (at 1024 bytes, 128 bytes long) + +The superblock is defined by `struct erofs_super_block`. + +Here's some notes about some of the fields. Anything not mentioned is left as +0 by us. There's some pretty wild features in here, but we don't use them all +(and I don't understand them, either) so they're not all documented. + +* `magic`: set that to `EROFS_SUPER_MAGIC_V1` (`0xE0F5E1E2`) +* `checksum`: only meaningful if the `SB_CHKSUM` feature is enabled. This is + a crc32c over a block-sized-chunk of data starting from the superblock, + with this field set to 0. That's pretty weird. Maybe don't use this. +* `feature_compat`: a flags field. The filesystem will still mount even if + the kernel doesn't know about any features which might be present. The + flags: + - `SB_CHKSUM` (`0x0001`): set if the checksum field in the superblock is + populated. Otherwise, the checksum is ignored. + - `MTIME` (`0x0002`): at first, erofs named the timestamp fields `ctime` + instead of `mtime`. 
That got changed a long time ago, and this flag + got added to indicate filesystems that were created with the new + semantics. This flag has absolutely zero impact at run time: the kernel + ignores it. + - `XATTR_FILTER` (`0x0004`): set if the xattr bloom filter should be + used. Read about this in the inode section. +* `blkszbits`: log2 of the block size. Better set this to 12 (4096). +* `root_nid`: the reference to the root inode. See the inodes section for + what that means. Normally inodes are stored in u64, but this is somewhat + randomly a u16, which means that you're gonna need to put the root + directory near the start. +* `inos`: the total number of inodes defined. This is only used for + `statfs()` purposes. +* `build_time`, `build_time_nsec`: this is something like a compression + feature if you want all (or many) files in your filesystem to have the same + mtime. Then you can use the "compact" inode layout, which doesn't have its + own `mtime` field, and this one will be used instead. If you don't have + compact inodes then this is meaningless. +* `blocks`: total filesystem size, in blocks. This is only used for `statfs()`. +* `meta_blkaddr`: the start of the "metadata area". This is where the inodes + are. This is a block address, so it gets multiplied by the block size to + determine the actual offset. +* `xattr_blkaddr`: the start of the "shared xattr area". See the "Shared + xattr" and "Inodes" sections for more info. + +## Extended attributes + +There are two options for storing xattr data in an erofs: +* inline with the inode itself +* in a "shared xattr" struct somewhere + +The format of both of these is the same. + +The inline thing is nice and simple, but it might be space-inefficient for +cases where the same (key, value) pair appears over and over again (which might +be the case for things like security labels and acls and the like). + +### Prefix indexes + +A rudimentary form of compression is supported on xattr names. 
There are a +number of hardcoded "common prefixes" defined with the `EROFS_XATTR_INDEX_` +constants in `erofs_fs.h`. Confusingly, although `LUSTRE` is present, it's not +wired up in the kernel. Don't use that one. + +The basic idea is that you find the prefix for your xattr from the list (like +`user.` or `security.`) and then you store only the "suffix" part, along with +the prefix index. If you can't find a prefix, you use 0 (which is conceptually +a prefix of ""). If the prefix matches the entire name then the suffix is `""`. + +Note: you really need to do this "compression" step, because it's assumed +during the lookup phase. ie: if we're looking for an xattr `"user.xyz"` then +we'll only consider the entries that have the prefix index for `user.` set on +them. If you didn't properly "compress" your xattr names, they won't be found. + +There's support in the erofs format for custom prefixes. That's when the high +bit of the prefix index is set. These got added circa kernel version 6.4 with +a patch series ending with `6a318ccd7e08` ("erofs: enable long extended +attribute name prefixes") but aren't documented here because we don't use them. + +### On-disk format + +All extended attributes (both shared and inode-inline) are stored in a +simple format with a small header. That's `struct erofs_xattr_entry`. It's just 4 bytes: +* u8: the suffix length (in bytes, no nul) +* u8: the prefix index (see above) +* u16: the value length (in bytes, no nul) + +The header must start at an offset with an alignment of 4. + +Immediately following the header is the suffix (name with prefix removed), +immediately followed by the value. There's no nul after the name (which is OK, +since we know the length from the header). + +### Shared xattrs + +This is basically just an xattr stored somewhere in the filesystem image, using +the format mentioned above. It is referred to by a 32-bit identifier: +* start at the `xattr_blkaddr` mentioned in the super block. 
That's a block + address, so remember to multiply that by the block size. +* add 4 times the shared xattr identifier (since the header must be 4-aligned) +* that's the xattr header (mentioned above) + +If your filesystem image is going to be smaller than 16GB then you can probably +just leave the `xattr_blkaddr` set to 0 to make your life easier. + +### Inode-inline xattrs + +We talk about those in the Inode section. Speaking of which, let's talk about... + +## Inodes + +Here's where things get complicated. + +First, the easy part: similar to shared xattrs, inodes are just a structure +stored somewhere in the filesystem image. There's no "inode table". This +works because the way that you refer to inodes is with an "nid": +* start at the `meta_blkaddr` mentioned in the super block. That's a block + address, so remember to multiply that by the block size. +* add 32 times the nid (since inodes must be 32-aligned) +* that's the inode header + +### On-disk formats + +The very first thing in the inode is the format field. This is a mix of two +things, but the most important thing to talk about first is the low-order bit: +it's set to 0 if this is a "compact" inode and 1 if it's an "extended" inode. + +We don't use compact inodes, so I'm not going to document them, but you can get +a pretty good idea of what they're capable of by reading the headers. The rest +of this section discusses extended inodes. + +The extended inode header (`struct erofs_inode_extended`) has a size of 64 and +needs to be 32-aligned. It has these interesting fields: +* `format`: + - first bit: as mentioned above, for an extended inode the low order bit + will always be set + - the rest: the "data layout" (which is complicated enough to get its own + section) +* `xattr_icount`: this is also complicated enough that we want to talk about + it elsewhere. See the "Extended attributes" section below (not the one + above!). The main thing to know is that this will be 0 if there are none. 
+* `mode`: that's the same as you'd find in `.st_mode` from `stat()` +* `size`: ditto, except `.st_size` +* `i_u`: you'd better look at the "data layout" section about this one... +* `ino`: a compatibility shim for cases where we need to report `st_ino` in + 32-bits. For 64-bit userlands, we use the nid directly as the `.st_ino`. + You can do what you want with this (as long as it's unique), but for + filesystems smaller than 128GB you can probably just use the nid. +* `uid`, `gid`: those are fairly obvious, I guess +* `mtime`, `mtime_nsec`: those too +* `nlink`: try to set this correctly: some things might get upset if it's not + right. For non-directories, that's the number of hardlinks (ie: 1 for + non-hardlinked files). For directories, that's 2 plus the number of + subdirectories. + +Directly following the inode header is the extended attribute header (if +`xattr_icount` is non-zero). Then comes any inline data (as per the "data +layout" section). + +### Extended attributes + +If the `xattr_icount` field in the inode header is set to 0 then this section +is skipped entirely. Otherwise we write out the inode xattr header (`struct +erofs_xattr_ibody_header`). This has: +* `name_filter` (`u32`): a bloom filter for which xattrs are present. This + needs its own section. +* `shared_count` (`u8`): the number of shared xattrs +* some reserved bytes to pad things up to 12 + +Immediately following the header come the shared xattr references. They're in +the format mentioned in the "Shared xattrs" section above, simply encoded as +little-endian u32s. So: the first `4 * shared_count` bytes after the header +are those. + +Then the inline xattrs are next. Those are stored in the format mentioned in +the "On-disk format" sub-section in the "Extended attributes" section. They're +just written here one after another, with padding added so that each header is +4-aligned. 
There is also padding after the last one, which is important if +inline data is to follow (as per the "data layout" section). + +#### About `xattr_icount` + +So, if there's no xattrs then this is zero. + +Otherwise this is basically the size of the extended attributes area divided by +4, with the exception that the 12-byte header counts for only 4 bytes. Put +another way: you remove the size of the header, divide by 4, then add 1 back +again. + +A value of 1 would be pretty suspicious, since that would indicate the presence +of a header, but no xattrs (shared or inline), and in that case normally we'd +omit the header. + +The kernel basically uses this to know how many bytes it needs to skip over +before it can find the inline file data. It will remove the 1, multiply by 4, +then add 12 (the header). See `erofs_xattr_ibody_size()`. + +#### About `name_filter` + +This is a 32-bit bloom filter used to quickly determine if a given xattr is not present. + +The hash algorithm is xxh32. The thing that gets hashed is not the name, but +the "suffix" that's left after removing the prefix. The seed is +`EROFS_XATTR_FILTER_SEED` plus the prefix index. The lower 5 bits of the hash +value (0..31) are used to determine which bit is used. + +For some reason a bit value of 1 here indicates the absence of a particular +xattr, which is opposite to the usual arrangement. You'd think it was for +compatibility, but the filter is only engaged if the feature bit is present in +the superblock. + +This feature got added in kernel commits: +* `3f339920175c` ("erofs: update on-disk format for xattr name filter") +* `fd73a4395d47` ("erofs: boost negative xattr lookup with bloom filter") + +### Data layout + +erofs has a bunch of different ways to represent the actual content associated +with an inode (regular file content, directory entries, symlink target). 
+ +We describe three of them here: +* plain +* inline +* chunked + +The data layout is chosen using some of the bits of the `format` field in the +inode header. + +#### `EROFS_INODE_FLAT_PLAIN` + +In this case there's never any inline data. The inode content is stored +entirely as a series of contiguous blocks. The offset of the first block is +what goes in the `i_u` field (measured in blocks, not bytes). + +The number of blocks is determined by the `.size` field (divided by block size, +rounded up). + +If the content is not a multiple of the blocksize then the last block should be +0-padded. + +#### `EROFS_INODE_FLAT_INLINE` + +This is similar to `EROFS_INODE_FLAT_PLAIN` except if the content is not a +multiple of the blocksize. In that case, instead of 0-padding the last block +to fill up a block, the content of the last block is stored directly inline +with the inode, without padding. + +So, imagining the content is 2.5 blocks worth of data: +* the first block is the one pointed to by `i_u` +* the second block is the one immediately following it +* the last block is stored at the end of the inode + +The number of blocks is determined by the `.size` field, divided by block size, +rounded down. The remainder is the number of bytes of inline data. + +The inline data must be written in such a way that it does not cross a block +boundary. It is theoretically permitted for the inline data to be in a +separate block (ie: the block directly following the inode data). It is also +permitted for the inode data itself to cross block boundaries. There are a +couple of caveats to be aware of, however: +* the alignment of inodes is 32 bytes, but the size of an extended inode is 64 + bytes. `mkfs.erofs` tries to ensure that extended inode headers land + entirely within one disk block (for efficiency), but this isn't required by + the kernel. 
+* `mkfs.erofs` also tries to ensure that the inline data ends in the same + disk block as the last byte of the inode metadata (ie: inode header plus + xattrs). This is theoretically not required by the kernel. +* A bug present in the kernel before 6.12 meant that this was required for + inline symlink targets. This was fixed by `9ed50b8231e3` ("erofs: fix + incorrect symlink detection in fast symlink"). +* In general, when faced with the task of writing out an inode with inline + data present, you may need to add padding bytes before the start of the + inode in order to ensure that the inline data falls within a single block. + If you allow inlining of large amounts of data (approaching the block size) + then you'll almost always need to add padding to get the correct alignment + (and often a large amount of it), which is wasteful. On the other hand, if + you only inline very small amounts of data then you are wasting space by + padding out filesystem blocks with zeros. There is a balance to be struck, + and `mkcomposefs` uses a "heuristic" of half a block size as the inlining + limit. I've performed simulations which show that this value is fairly + close to ideal for a random distribution of file sizes, starting inode + alignment and xattr content sizes. + +#### `EROFS_INODE_FLAT_CHUNK_BASED` + +In this case, the `i_u` field isn't a block reference but is instead split into +sub-fields. The main gist of it, though, is that this stores the log2 of the +number of blocks per chunk (maximum of 31). + +So if you write 4 here, then there are 16 blocks in each chunk. + +The references to the chunks are then written as the inline data, 4 bytes per +chunk, as block indexes (to the starting block). I'm not sure if that's +measured in blocks or in chunks, because the only reason we use this feature is +for a special purpose: null chunks. + +If a chunk index is written as -1 (ie: 0xffffffff) then it refers to a "null" +chunk of the given size. 
This effectively gets you support for sparse files. + +For the sparse file use-case there's no benefit to choosing anything other than +the maximum chunk format of 31 for the `i_u` field. The number of chunks you +need to write is determined by the file size, but for a 4096 byte block size +and a chunk format of 31 all files less than 8TB can be handled with a single +"chunk". + +#### Character and block devices + +If the `mode` field of the inode indicates that this is a device, then the data +layout isn't relevant, and the `i_u` field gets the `rdev` of the device. Note +that this is a 32-bit field, so 32-bit rdev. `size` is zero. + + +#### Fifos and sockets + +These have no storage at all. `i_u` is ignored and there is never inline data. +`size` should always be 0. + +## Directories + +The final thing that needs describing is how a directory gets stored. erofs +directories are the classical mapping from names to inodes, with the extra +'file type' field that gets returned via the `d_type` field in `struct dirent` +(to avoid needing to `stat()` the inode). + +The dirent structure has a size of 12 (and an alignment of 4) and looks like: +* `nid` (`u64`): the inode referred to by this entry +* `nameoff` (`u16`): an offset to the name (inside of this block). See below. +* `file_type` (`u8`): the filetype field for `d_type` + +The directory needs to explicitly include the `.` and `..` entries. All +entries (including `.` and `..`) are sorted in asciibetical order. Note: the +`.` and `..` are not handled specially and are not necessarily at the start: +they're in asciibetical order too. + +The directory entries are taken in their sorted order and split into blocks. +However many entries will fit into the first block go into the first block, and +so on. All blocks except for the last one are padded with zeros. A directory +has a specific encoded size (which ends up in the `size` field of the inode). 
+It is made from a number of complete blocks, times the blocksize, plus the size +of the (possible) trailing partial block (which might be inlined, depending on +the selected data layout). + +Each block is a number of dirent structs packed at the start, plus the entry +names referred to from those structs. The entry names must immediately follow +the structs, and each entry name must immediately follow the previous (with no +nul). The reason for that will become clear with our example: + +Let's consider an example directory with entries `.`, `..`, +`someverylongfilename`, `subdir`. To keep things interesting, let's further +imagine that our filesystem block size is 32 bytes. + +We segment into blocks by taking entries until no more entries fit. Each entry +is the 12 byte dirent struct, plus the name, so: +* `.`: (12 + 1) = 13 → 13 total bytes +* `..`: (12 + 2) = 14 → 27 total bytes +* `someverylongfilename`: (12 + 20) = 32 → too big, won't fit. + +So we know that the first directory block will contain `.` and `..`. It looks like: +* offset `0`: the dirent struct for `.`, `nameoff` is `24`. +* offset `12`: the dirent struct for `..`, `nameoff` is `25`. +* offset `24`: `.` +* offset `25`: `..` +* offset `27`: padded with `nul` + +The `nameoff` fields are more important here than they seem. If we look at the +first `nameoff` field, it's `24`. That tells us that there are two entries in +this block (since the entry size is 12). We also know the length of the name +of the first entry because the name of the second entry starts right after it. + +How do we know the length of the name of the last entry? One of three ways: +* if this is the final block of the directory, then the overall size of the + directory (in the inode `size` field) will indicate where the final name + must surely terminate +* if this is a non-final block, it might be that the name fits exactly into + the block size. In that case, the end of the name is the end of the block. 
+* if this is a non-final block, and the name doesn't fit exactly into the + block size then it means we'll have added some padding. In this case the + name is `nul`-terminated. That's the case for our `..` entry here. + +Now let's do our next block: +* `someverylongfilename` (12 + 20) = 32 → 32 total bytes +* `subdir` (12 + 6) = 18 → too big, won't fit. + +So we only get one entry in this block. The layout is: +* offset `0`: the dirent struct for `someverylongfilename`, `nameoff` is `12`. +* offset `12`: `someverylongfilename` +* no padding, since we're already at 32 bytes. + +In this case we look at the `nameoff` of the first entry (`12`) and know that +there must only be one entry in this block. And in this case, the name fills +the block exactly, so we won't find a `nul` terminator, and we know the name +must have a length of `20`. + +Finally, `subdir` gets put in the last partial block: +* offset `0`: the dirent struct, `nameoff` is `12` +* offset `12`: `subdir` +* offset `18`: that's the end of the directory + +What comes at offset `18`? Nothing. The `size` field of the directory is 2 +blocks (`2 * 32` = `64`) plus the `18` bytes from this block, so a total of +`82`. + +Of course, if we're storing the directory as "flat plain" or "chunk based" then +we need to pad this out to a complete block size (and we'll do that with +`nul`s), but those padding bytes are not conceptually part of the directory +content. But what if we stored it "flat inline"? We might have the next inode +directly following. In that case, we effectively depend on the inode `size` to +know that the final filename has a length of `6`. diff --git a/doc/image-format.md b/doc/image-format.md new file mode 100644 index 0000000..40356c4 --- /dev/null +++ b/doc/image-format.md @@ -0,0 +1,276 @@ +# Canonical composefs file format + +## Prelude + +We expect the process of creating an erofs from a filesystem image to be +deterministic. 
`erofs` is very free-form and there are many ways things could +be organized. + +Here's where we try to document some of the decisions we make. This documents +the erofs images produced by the `composefs` rust crate, which are currently +different from the official `composefs` repository (ie: `libcomposefs`, in C). +It would be very desirable to try to make this implementation exactly match the +`libcomposefs` implementation so that we could check them against each other to +ensure that they produce bitwise identical output. On the other hand, we've +been discussing creating a "version 1.1" format, and this might be a good +jumping-off spot for that. + +The goal of this document is to completely and unambiguously document every +decision we made in such a way that you could use this document as a guide to +produce a new composefs erofs writer implementation, from scratch, which +produces exactly the same output. However, this document is probably currently +very incomplete, and maybe even incorrect. We should strive to cover every +possible detail here, but it's hard. Hopefully things will improve with time, +but until then, you might need to check the implementation. + +In cases of ambiguity or incorrectness, issues and patches are extremely +welcome. + +## Overall layout concept + +The composefs header and superblock are the only things that need to be at +fixed offsets. How do we organize everything else? 
+ +Generally speaking, we perform these steps: +* collect the filesystem into a flat list of inodes +* collect and "share" xattrs, as appropriate +* write the composefs header and the superblock +* write the inodes directly following the superblock +* write the shared xattrs directly following the inodes +* then the blocks (only for directories) + +## Collecting inodes + +We collect the inodes into a flat list according to the following algorithm: +* our goal is to visit each inode, collecting it into the inode list as we + visit it, in the order that we visited it +* start at the root directory +* for each directory that we visit: + - the directory is stored first, then the children + - we visit the children in asciibetical order, regardless of file type + (ie: we interleave directories and regular files) + - when visiting a child directory, we store all content of the child + directory before returning to the parent directory (ie: depth first) +* in the case of hardlinks, the inode gets added to the list at the spot that + the first link was encountered + +Consider a filesystem tree + +``` + / + bin/ + cfsctl + usr/ + lib/ + libcomposefs.so + libglib-2.0.so + libexec/ + cfsctl +``` + +where `/bin/cfsctl` and `/usr/libexec/cfsctl` are hardlinks. + +In that case, we'd collect the inodes in this order: +1. `/` +1. `/bin/` +1. `/bin/cfsctl` (aka `/usr/libexec/cfsctl`) +1. `/usr/` +1. `/usr/lib/` +1. `/usr/lib/libcomposefs.so` +1. `/usr/lib/libglib-2.0.so` +1. `/usr/libexec/` + +(skipping `/usr/libexec/cfsctl` because we already had it by the time we encountered it). + +So that's 8 inodes, in that order. + +## Special handling for overlayfs + +Ultimately, the erofs image that we produce needs to be used as a layer in an +overlayfs stack. There are a lot of cases where the thing that we write out +only makes sense to overlayfs. There are other cases where we need to avoid +writing out things that overlayfs would treat as "special". 
+ +`libcomposefs` writes 256 files named from `00` to `ff` into the root directory +as character devices with major/minor of (0, 0). Those are overlayfs whiteouts +and they are needed for older versions of overlayfs which don't support "data +only" layers. We don't target these versions, so *we don't add these files*. +We also don't mark the root directory as opaque or do anything else special +with it. + +Conversely, if we encounter a character device with major/minor (0, 0) then we +need to escape it to make sure that it appears as such in the final composed +image (and does not get handled by overlayfs as a whiteout). We do that by: +TODO (not implemented yet). + +We also need to make sure that the only `trusted.overlay.*` attributes which we +write are ones that came from us. If we encounter any `trusted.overlay.*` +attributes in the source, we escape them to `trusted.overlay.overlay.`, causing +them to lose their special meaning. + +## Extended attribute handling + +For each inode, we collect and write the extended attributes in asciibetical +order, by full name. Note: this is different than the shared xattr table which +has a more complicated sorting, but maybe we want to unify the two. + +We use the hardcoded prefix indexes (which is actually mandatory). + +We don't use "long prefixes", but we might start doing that at some point, +because it would sure be nice to not have to write `"overlay.redirect"`, +`"overlay.metacopy"` and `"selinux"` over and over again. The feature seems +complicated, though... + +## Collecting shared xattrs + +`erofs` has a facility for sharing xattrs where the name and the value are +identical, and we use it. After we've collected all of our inodes, we iterate +the list and take note of all (name, value) pairs. If any (name, value) pair +appears more than once, we share it. + +The process of "sharing" involves modifying the original inode. 
We iterate the +present xattrs, and for each attribute that we share, we remove it from the +"inline" list and add it to the "shared" list, in the same order as it appeared +in the inline list. + +NB: this operation is performed on the flattened inode list, not the directory +tree. That means that if a particular (name, value) pair appears uniquely on +an inode with multiple hardlinks, we'll count that as a single occurrence and +it won't be shared. + +Note also: the attributes that we add ourselves are considered candidates for +sharing. That means that if we had two external files which were not hardlinks +but nevertheless contained the same data, we'd end up sharing their +`trusted.overlay.` attributes. + +## The composefs header + +`erofs` leaves the first 1024 bytes of the file free to us, and we store a +32-byte header at offset 0. The kernel ignores this, and our mount code +doesn't actually do anything with it at the moment, either. We try to fill it +out in the same way as `libcomposefs`: + +* `magic` (`u32`): `0xd078629a` +* `version` (`u32`): I think this is something like the overall file format + version. If this changes, then things are possibly incompatible, and maybe + this isn't even an `erofs` anymore. Currently `1`. +* `flags`: `0` +* `composefs_version`: I think this is something like a statement about the + current strategy for layout decisions. If this changes, the algorithm for + building the file has probably decided to put things in different places + (and the checksum of the file will have changed), but the result is still + understandable as an `erofs`. Currently `1`. + +## The superblock + +* `checksum`: we don't fill that out +* `feature_compat`: we set `MTIME` and `XATTR_FILTER` +* `blkszbits`: we use 12, for a block size of 4096 +* `root_nid`: that's going to end up being 36, which follows from the fact + that we put the root inode directly following the superblock, at offset + `1024 + 128` = `1152`. `1152 / 32` = `36`. 
+* `inos`: we currently set that to the number of inodes in the filesystem. + `libcomposefs` adds some extra file content (the `00`..`ff` whiteouts) so + it gets a larger number than we do. +* `blocks`: the total filesize, divided by 4096. +* `build_time`, `build_time_nsec`: since we only use extended format inodes, + these fields are meaningless and we currently set them to 0 (which is + different from `libcomposefs`). +* `meta_blkaddr`, `xattr_blkaddr`. We currently set both of these to 0 to + keep things simple. `libcomposefs` performs a complicated calculation to + set `meta_blkaddr` to zero as well (since the first inode directly follows + the superblock, it will always be within the first 4096 byte filesystem + block), but its complicated calculation for `xattr_blkaddr` might well land + on a non-zero value, so that's different from us. + +## The inodes + +After the superblock, we write the inodes. Some notes: + +* we only use extended inodes, because mtime is important to us and we + generally expect every file to have a unique mtime. This is a difference + from `libcomposefs`. + +* we use a "chunk based" data layout for non-inline regular files: + + - the way this works in overlayfs, we want to store a correctly-sized + sparse file in the upper layer. This lets us have the correct `size` + field on the inode, so we don't need to interact with the data layer in + order to do `stat()`. + + - we set the chunk format (ie: the `i_u` field) to 31, the maximum + + - we store a single "null" chunk pointer + + - this corresponds to a chunk size of 8TB, which is then the upper limit + of files we can store + + - `libcomposefs` tries to take the smallest chunk format value which will + get the job done with a single chunk pointer, and will write multiple + chunk pointers if necessary (for extremely large files). Maybe we + should do that too.
+ + - in this case we set the `trusted.overlay.metacopy` and + `trusted.overlay.redirect` attributes (in that order) on the file. + These attributes are written first, before the other attributes that + would be present on the same file (which are otherwise in sorted + order). + + - the `trusted.overlay.metacopy` attribute is 36 bytes long, and is set to: + + the 4-byte header: [0, 36, 0, 1] + + the 32-byte SHA256 fs-verity digest + + - the `trusted.overlay.redirect` attribute is set to the string + `"/xx/yyyy..."` where `xx` is the first two lowercase hexadecimal digits + of the fs-verity digest and the `yyyy...` is the rest. That's just a + reference into the `objects/` subdirectory of the repository (which is + mounted in the overlayfs stack as the data layer). + +* we use a "flat inline" data layout for all other inodes: + + - for character and block devices, as well as fifos and sockets this is + meaningless, but we need to set something + + - for inline regular files we store the content inline. This will break + if we try to inline a file larger than 4095 characters, but our current + cut-off is 64. + + - for symlinks this means that the link target gets stored inline. + Hopefully we don't have symlinks with targets longer than 4095 + characters, or we're gonna get in trouble. + + - directories may well be larger than 4096 bytes, so we might end up + needing to store blocks for those. These follow the "shared xattrs" + area. We could probably set "flat plain" for directories that are an + exact multiple of 4096 bytes in size, and `libcomposefs` does that, but + we don't bother. + +We pad the last inode to the required alignment for inodes, even though it is +generally followed by a shared xattr (which has a less stringent alignment +requirement). + +## The shared xattrs + +There's not much left to be said about these.
We currently write them out in +the order that `collections::BTreeMap` applies to our `struct XAttr`, which I +think basically ends up sorting them by prefix index, then by suffix, then by +value. We might like to firm that up at some point. This is notably different +than the sorting applied to the attributes as they appear in the inodes, and we +also don't give any special treatment to the `trusted.overlay.` attributes that +we added: they're sorted here in the usual way. + +After we do this, and even if there were no shared xattrs, we always pad up to a +4096 byte boundary, even if there are no data blocks. That means that the +filesystem image will always be a multiple of 4096. + +## The blocks + +Now come the data blocks. These are written in sequence for each inode, +according to the sequence of the inode in the inode list. Due to our use of +"flat inline" data layout, only full blocks are stored (although they may have +included inter-block padding in directories), so we keep 4096-byte alignment +from here on out. + +## The end + +That's it. The file is over now. We'll have ended on a multiple of 4096. diff --git a/src/bin/cfsctl.rs b/src/bin/cfsctl.rs index d62a03e..1e0cc5b 100644 --- a/src/bin/cfsctl.rs +++ b/src/bin/cfsctl.rs @@ -97,6 +97,8 @@ enum Command { } fn main() -> Result<()> { + env_logger::init(); + let args = App::parse(); let repo = (if let Some(path) = args.repo { diff --git a/src/bin/erofs-debug.rs b/src/bin/erofs-debug.rs new file mode 100644 index 0000000..e8afea4 --- /dev/null +++ b/src/bin/erofs-debug.rs @@ -0,0 +1,25 @@ +use std::{fs::File, io::Read, path::PathBuf}; + +use clap::Parser; + +use composefs::erofs::debug::debug_img; + +/// Produce a detailed dump of an entire erofs image +/// +/// The output is in a diff-friendly format, such that every distinct image produces a distinct +/// output (ie: an injective mapping). This is useful for determining the exact ways in which two +/// different images are different.
+#[derive(Parser)] +struct Args { + /// The path to the image file to dump + image: PathBuf, +} + +fn main() { + let args = Args::parse(); + let mut image = File::open(args.image).expect("Opening file"); + + let mut data = vec![]; + image.read_to_end(&mut data).expect("read_to_end() failed"); + debug_img(&data); +} diff --git a/src/erofs/debug.rs b/src/erofs/debug.rs new file mode 100644 index 0000000..1cc082c --- /dev/null +++ b/src/erofs/debug.rs @@ -0,0 +1,377 @@ +use core::mem::offset_of; +use std::{ + collections::BTreeMap, + ffi::OsStr, + mem::discriminant, + os::unix::ffi::OsStrExt, + path::{Path, PathBuf}, +}; + +use zerocopy::{Immutable, KnownLayout, TryFromBytes}; + +use super::{ + format::{self, CompactInodeHeader, ComposefsHeader, ExtendedInodeHeader, Superblock}, + reader::{DirectoryBlock, Image, Inode, InodeHeader, InodeOps, InodeType, InodeXAttrs, XAttr}, +}; + +macro_rules! print_fields { + ($ty: ty, $s: expr, $f: ident) => {{ + let value = &$s.$f; + let default = if false { value } else { &Default::default() }; + if value != default { + println!(" +{:02x} {}: {:?}", offset_of!($ty, $f), stringify!($f), value); + } + }}; + ($ty: ty, $s:expr, $head: ident; $($tail: ident);+) => {{ + print_fields!($ty, $s, $head); + print_fields!($ty, $s, $($tail);+); + }}; +} + +fn utf8_or_hex(data: &[u8]) -> String { + if let Ok(str) = std::str::from_utf8(data) { + format!("\"{str}\"") + } else { + hex::encode(data) + } +} + +// This is basically just a fancy fat pointer type +enum SegmentType<'img> { + Header(&'img ComposefsHeader), + Superblock(&'img Superblock), + CompactInode(&'img Inode), + ExtendedInode(&'img Inode), + XAttr(&'img XAttr), + DataBlock(&'img [u8]), + DirectoryBlock(&'img DirectoryBlock), +} + +// TODO: Something for `enum_dispatch` would be good here, but I couldn't get it working... 
+impl SegmentType<'_> { + fn addr(&self) -> usize { + match self { + SegmentType::Header(h) => &raw const **h as usize, + SegmentType::Superblock(sb) => &raw const **sb as usize, + SegmentType::CompactInode(i) => &raw const **i as *const u8 as usize, + SegmentType::ExtendedInode(i) => &raw const **i as *const u8 as usize, + SegmentType::XAttr(x) => &raw const **x as *const u8 as usize, + SegmentType::DataBlock(b) => &raw const **b as *const u8 as usize, + SegmentType::DirectoryBlock(b) => &raw const **b as *const u8 as usize, + } + } + + fn size(&self) -> usize { + match self { + SegmentType::Header(h) => size_of_val(*h), + SegmentType::Superblock(sb) => size_of_val(*sb), + SegmentType::CompactInode(i) => size_of_val(*i), + SegmentType::ExtendedInode(i) => size_of_val(*i), + SegmentType::XAttr(x) => size_of_val(*x), + SegmentType::DataBlock(b) => size_of_val(*b), + SegmentType::DirectoryBlock(b) => size_of_val(*b), + } + } +} + +#[repr(C)] +#[derive(TryFromBytes, KnownLayout, Immutable)] +struct DataBlock([u8]); + +struct ImageVisitor<'img> { + image: &'img Image<'img>, + visited: BTreeMap, Vec>)>, +} + +impl<'img> ImageVisitor<'img> { + fn note(&mut self, segment: SegmentType<'img>, path: Option<&Path>) -> bool { + let offset = segment.addr() - self.image.image.as_ptr() as usize; + match self.visited.entry(offset) { + std::collections::btree_map::Entry::Occupied(mut e) => { + let (existing, paths) = e.get_mut(); + // TODO: figure out pointer value equality... 
+ assert_eq!(discriminant(existing), discriminant(&segment)); + assert_eq!(existing.addr(), segment.addr()); + assert_eq!(existing.size(), segment.size()); + if let Some(path) = path { + paths.push(Box::from(path)); + } + true + } + std::collections::btree_map::Entry::Vacant(e) => { + let mut paths = vec![]; + if let Some(path) = path { + paths.push(Box::from(path)); + } + e.insert((segment, paths)); + false + } + } + } + + fn visit_directory_block(&mut self, block: &DirectoryBlock, path: &Path) { + for entry in block.entries() { + if entry.name == b"." || entry.name == b".." { + // TODO: maybe we want to follow those and let deduplication happen + continue; + } + self.visit_inode(entry.inode, &path.join(OsStr::from_bytes(entry.name))); + } + } + + fn visit_inode(&mut self, id: u64, path: &Path) { + let inode = self.image.inode(id); + let segment = match inode { + InodeType::Compact(inode) => SegmentType::CompactInode(inode), + InodeType::Extended(inode) => SegmentType::ExtendedInode(inode), + }; + if self.note(segment, Some(path)) { + // TODO: maybe we want to throw an error if we detect loops + /* already processed */ + return; + } + + if let Some(xattrs) = inode.xattrs() { + for id in xattrs.shared() { + self.note( + SegmentType::XAttr(self.image.shared_xattr(id.get())), + Some(path), + ); + } + } + + if inode.mode() & format::S_IFMT == format::S_IFDIR { + let inline = inode.inline(); + if !inline.is_empty() { + let inline_block = DirectoryBlock::try_ref_from_bytes(inode.inline()).unwrap(); + self.visit_directory_block(inline_block, path); + } + + for id in inode.blocks(self.image.blkszbits) { + let block = self.image.directory_block(id); + self.visit_directory_block(block, path); + self.note(SegmentType::DirectoryBlock(block), Some(path)); + } + } else { + for id in inode.blocks(self.image.blkszbits) { + let block = self.image.data_block(id); + self.note(SegmentType::DataBlock(block), Some(path)); + } + } + } + + fn visit_image( + image: &'img Image<'img>, + ) 
-> BTreeMap, Vec>)> { + let mut this = Self { + image, + visited: BTreeMap::new(), + }; + this.note(SegmentType::Header(image.header), None); + this.note(SegmentType::Superblock(image.sb), None); + this.visit_inode(image.sb.root_nid.get() as u64, &PathBuf::from("/")); + this.visited + } +} + +pub fn print_paths(paths: &[Box]) { + match paths { + [] => {} + [one] => { + println!(" filename: {one:?}"); + } + many => { + println!(" links:"); + many.iter() + .for_each(|one| println!(" - {one:?}")); + } + } +} + +impl std::fmt::Debug for XAttr { + // Injective (ie: accounts for every byte in the input) + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + f, + "({} {} {}) {}{} = {}", + self.header.name_index, + self.header.name_len, + self.header.value_size, + std::str::from_utf8(format::XATTR_PREFIXES[self.header.name_index as usize]).unwrap(), + utf8_or_hex(self.suffix()), + utf8_or_hex(self.value()), + )?; + if self.padding().iter().any(|c| *c != 0) { + write!(f, " {:?}", self.padding())?; + } + Ok(()) + } +} + +// This accounts for every bytes of InodeXAttrs +fn print_xattrs(xattrs: Option<&InodeXAttrs>) { + let Some(xattrs) = xattrs else { + return; + }; + + if !xattrs.shared().is_empty() { + print!(" Shared xattrs:"); + for id in xattrs.shared() { + print!(" {id}"); + } + println!(); + } + println!(" Local xattrs:"); + for xattr in xattrs.local() { + println!(" - {:?}", xattr); + } +} + +fn hexdump(block: &[u8]) { + for row in 0..((block.len() + 15) / 16) { + let offset = row * 16; + print!(" +{offset:04x} "); + for idx in offset..(offset + 16) { + if idx < block.len() { + print!("{:02x} ", block[idx]); + } else { + print!(" "); + } + if idx % 8 == 7 { + print!(" "); + } + } + print!("|"); + for idx in offset..(offset + 16) { + if idx < block.len() { + let c = block[idx]; + if c.is_ascii() && !c.is_ascii_control() { + print!("{}", c as char); + } else { + print!("."); + } + } else { + print!(" "); + } + } + println!("|"); + } +} + +pub fn 
print_directory_block(block: &DirectoryBlock) { + for entry in block.entries() { + println!( + " {} {:?} -> {}", + utf8_or_hex(entry.name), + entry.file_type, + entry.inode + ); + } +} + +fn print_inode_extra(inode: impl InodeOps + InodeHeader) { + print_xattrs(inode.xattrs()); + let inline = inode.inline(); + if !inline.is_empty() { + if inode.mode() & format::S_IFMT == format::S_IFDIR { + let block = DirectoryBlock::try_ref_from_bytes(inline).unwrap(); + print_directory_block(block); + } else { + hexdump(inode.inline()); + } + } +} + +pub fn debug_img(data: &[u8]) { + let image = Image::open(data); + let visited = ImageVisitor::visit_image(&image); + + let mut offset = 0; + for (start, (segment, paths)) in visited { + if offset > start { + println!("*** Overlapping segments!"); + offset = start; + } + if offset < start { + println!("{offset:08x} Padding"); + let padding = &data[offset..start]; + if padding.iter().all(|c| *c == 0) { + println!(" {} * nul", padding.len()); + } else { + println!(" {:?}", padding); + } + println!(); + offset = start; + } + + match segment { + SegmentType::Header(header) => { + println!("{offset:08x} ComposefsHeader"); + print_fields!( + ComposefsHeader, header, + magic; flags; version; composefs_version; unused + ); + } + SegmentType::Superblock(sb) => { + println!("{offset:08x} Superblock"); + print_fields!( + Superblock, sb, + magic; checksum; feature_compat; blkszbits; extslots; root_nid; inos; build_time; + build_time_nsec; blocks; meta_blkaddr; xattr_blkaddr; uuid; volume_name; + feature_incompat; available_compr_algs; extra_devices; devt_slotoff; dirblkbits; + xattr_prefix_count; xattr_prefix_start; packed_nid; xattr_filter_reserved; reserved2 + ); + } + SegmentType::CompactInode(inode) => { + println!("{offset:08x} Inode (compact) #{}", offset / 32); // TODO: doesn't take metablk into account + print_paths(&paths); + print_fields!( + CompactInodeHeader, inode.header, + format; xattr_icount; mode; reserved; size; u; ino; uid; 
gid; nlink; reserved2; + reserved2 + ); + print_inode_extra(inode); + } + SegmentType::ExtendedInode(inode) => { + println!("{offset:08x} Inode (extended) #{}", offset / 32); // TODO: doesn't take metablk into account + print_paths(&paths); + print_fields!( + ExtendedInodeHeader, inode.header, + format; xattr_icount; mode; reserved; size; u; ino; uid; gid; mtime; mtime_nsec; nlink; + reserved2 + ); + print_inode_extra(inode); + } + SegmentType::XAttr(xattr) => { + println!("{offset:08x} XAttr #{}", offset / 4); // TODO: doesn't take xattrblk into account + print_paths(&paths); + println!(" {:?}", xattr); + } + SegmentType::DirectoryBlock(block) => { + println!("{offset:08x} Directory block"); + print_paths(&paths); + print_directory_block(block); + } + SegmentType::DataBlock(block) => { + println!("{offset:08x} Data block"); + print_paths(&paths); + hexdump(block); + } + } + println!(); + + offset = start + segment.size(); + } + if offset < data.len() { + println!("{offset:08x} Padding"); + let padding = &data[offset..data.len()]; + if padding.iter().any(|c| *c != 0) { + println!(" {:?}", padding); + } + println!(); + } + + if offset > data.len() { + println!("*** Segments past EOF!"); + } +} diff --git a/src/erofs/format.rs b/src/erofs/format.rs new file mode 100644 index 0000000..9927ca5 --- /dev/null +++ b/src/erofs/format.rs @@ -0,0 +1,279 @@ +use zerocopy::{ + little_endian::{U16, U32, U64}, + Immutable, IntoBytes, KnownLayout, TryFromBytes, +}; + +#[derive(Debug)] +pub enum FormatError { + InvalidDataLayout, +} + +pub const BLOCK_BITS: u8 = 12; +pub const BLOCK_SIZE: usize = 1 << BLOCK_BITS; + +/* composefs Header */ + +pub const COMPOSEFS_VERSION: U32 = U32::new(1); +pub const COMPOSEFS_MAGIC: U32 = U32::new(0xd078629a); + +#[derive(Debug, Immutable, IntoBytes, TryFromBytes)] +#[repr(u32)] +pub enum ComposefsFlags { + HasAcl = 1 << 0, +} + +#[derive(Debug, Default, Immutable, IntoBytes, KnownLayout, TryFromBytes)] +#[repr(C)] +pub struct ComposefsHeader { + 
pub magic: U32, + pub version: U32, + pub flags: U32, + pub composefs_version: U32, + pub unused: [U32; 4], +} + +/* Superblock */ + +pub const MAGIC_V1: U32 = U32::new(0xE0F5E1E2); +pub const FEATURE_COMPAT_MTIME: U32 = U32::new(2); +pub const FEATURE_COMPAT_XATTR_FILTER: U32 = U32::new(4); + +#[derive(Debug, Default, Immutable, IntoBytes, KnownLayout, TryFromBytes)] +#[repr(C)] +pub struct Superblock { + // vertical whitespace every 16 bytes (hexdump-friendly) + pub magic: U32, + pub checksum: U32, + pub feature_compat: U32, + pub blkszbits: u8, + pub extslots: u8, + pub root_nid: U16, + + pub inos: U64, + pub build_time: U64, + + pub build_time_nsec: U32, + pub blocks: U32, + pub meta_blkaddr: U32, + pub xattr_blkaddr: U32, + + pub uuid: [u8; 16], + + pub volume_name: [u8; 16], + + pub feature_incompat: U32, + pub available_compr_algs: U16, + pub extra_devices: U16, + pub devt_slotoff: U16, + pub dirblkbits: u8, + pub xattr_prefix_count: u8, + pub xattr_prefix_start: U32, + + pub packed_nid: U64, + pub xattr_filter_reserved: u8, + pub reserved2: [u8; 23], +} + +/* Inodes */ + +#[derive(Debug, Default, Immutable, IntoBytes, KnownLayout, TryFromBytes)] +#[repr(C)] +pub struct CompactInodeHeader { + pub format: FormatField, + pub xattr_icount: U16, + pub mode: U16, + pub nlink: U16, + + pub size: U32, + pub reserved: U32, + + pub u: U32, + pub ino: U32, // only used for 32-bit stat compatibility + + pub uid: U16, + pub gid: U16, + pub reserved2: [u8; 4], +} + +#[derive(Debug, Default, Immutable, IntoBytes, KnownLayout, TryFromBytes)] +#[repr(C)] +pub struct ExtendedInodeHeader { + pub format: FormatField, + pub xattr_icount: U16, + pub mode: U16, + pub reserved: U16, + pub size: U64, + + pub u: U32, + pub ino: U32, // only used for 32-bit stat compatibility + pub uid: U32, + pub gid: U32, + + pub mtime: U64, + + pub mtime_nsec: U32, + pub nlink: U32, + + pub reserved2: [u8; 16], +} + +#[derive(Debug, Default, Immutable, KnownLayout, IntoBytes, TryFromBytes)] 
+#[repr(C)] +pub struct InodeXAttrHeader { + pub name_filter: U32, + pub shared_count: u8, + pub reserved: [u8; 7], +} + +#[derive(Clone, Copy, Immutable, KnownLayout, IntoBytes, PartialEq, TryFromBytes)] +pub struct FormatField(U16); + +impl Default for FormatField { + fn default() -> Self { + Self(0xffff.into()) + } +} + +impl std::fmt::Debug for FormatField { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + f, + "{} = {:?} | {:?}", + self.0.get(), + InodeLayout::from(*self), + DataLayout::try_from(*self) + ) + } +} + +const INODE_LAYOUT_MASK: u16 = 0b00000001; +const INODE_LAYOUT_COMPACT: u16 = 0; +const INODE_LAYOUT_EXTENDED: u16 = 1; + +#[derive(Debug)] +#[repr(u16)] +pub enum InodeLayout { + Compact = INODE_LAYOUT_COMPACT, + Extended = INODE_LAYOUT_EXTENDED, +} + +impl From for InodeLayout { + fn from(value: FormatField) -> Self { + match value.0.get() & INODE_LAYOUT_MASK { + INODE_LAYOUT_COMPACT => InodeLayout::Compact, + INODE_LAYOUT_EXTENDED => InodeLayout::Extended, + _ => unreachable!(), + } + } +} + +const INODE_DATALAYOUT_MASK: u16 = 0b00001110; +const INODE_DATALAYOUT_FLAT_PLAIN: u16 = 0; +const INODE_DATALAYOUT_FLAT_INLINE: u16 = 4; +const INODE_DATALAYOUT_CHUNK_BASED: u16 = 8; + +#[derive(Debug)] +#[repr(u16)] +pub enum DataLayout { + FlatPlain = 0, + FlatInline = 4, + ChunkBased = 8, +} + +impl TryFrom for DataLayout { + type Error = FormatError; + + fn try_from(value: FormatField) -> Result { + match value.0.get() & INODE_DATALAYOUT_MASK { + INODE_DATALAYOUT_FLAT_PLAIN => Ok(DataLayout::FlatPlain), + INODE_DATALAYOUT_FLAT_INLINE => Ok(DataLayout::FlatInline), + INODE_DATALAYOUT_CHUNK_BASED => Ok(DataLayout::ChunkBased), + _ => Err(FormatError::InvalidDataLayout), + } + } +} + +impl From<(InodeLayout, DataLayout)> for FormatField { + fn from(value: (InodeLayout, DataLayout)) -> FormatField { + FormatField( + (match value.0 { + InodeLayout::Compact => INODE_LAYOUT_COMPACT, + InodeLayout::Extended => 
INODE_LAYOUT_EXTENDED, + } | match value.1 { + DataLayout::FlatPlain => INODE_DATALAYOUT_FLAT_PLAIN, + DataLayout::FlatInline => INODE_DATALAYOUT_FLAT_INLINE, + DataLayout::ChunkBased => INODE_DATALAYOUT_CHUNK_BASED, + }) + .into(), + ) + } +} + +/* Extended attributes */ +pub const XATTR_FILTER_SEED: u32 = 0x25BBE08F; + +#[derive(Debug, Immutable, IntoBytes, KnownLayout, TryFromBytes)] +#[repr(C)] +pub struct XAttrHeader { + pub name_len: u8, + pub name_index: u8, + pub value_size: U16, +} + +pub const XATTR_PREFIXES: [&[u8]; 7] = [ + b"", + b"user.", + b"system.posix_acl_access", + b"system.posix_acl_default", + b"trusted.", + b"lustre.", + b"security.", +]; + +/* Directories */ + +#[derive(Clone, Copy, Debug, Default, Immutable, IntoBytes, TryFromBytes)] +#[repr(u8)] +pub enum FileType { + #[default] + Unknown, + RegularFile, + Directory, + CharacterDevice, + BlockDevice, + Fifo, + Socket, + Symlink, +} +pub const S_IFMT: u16 = 0o170000; +pub const S_IFREG: u16 = 0o100000; +pub const S_IFCHR: u16 = 0o020000; +pub const S_IFDIR: u16 = 0o040000; +pub const S_IFBLK: u16 = 0o060000; +pub const S_IFIFO: u16 = 0o010000; +pub const S_IFLNK: u16 = 0o120000; +pub const S_IFSOCK: u16 = 0o140000; + +impl FileType { + pub fn to_ifmt(&self) -> u16 { + match self { + Self::RegularFile => S_IFREG, + Self::CharacterDevice => S_IFCHR, + Self::Directory => S_IFDIR, + Self::BlockDevice => S_IFBLK, + Self::Fifo => S_IFIFO, + Self::Symlink => S_IFLNK, + Self::Socket => S_IFSOCK, + Self::Unknown => unreachable!(), + } + } +} + +#[derive(Debug, Default, Immutable, IntoBytes, KnownLayout, TryFromBytes)] +#[repr(C)] +pub struct DirectoryEntryHeader { + pub inode_offset: U64, + pub name_offset: U16, + pub file_type: FileType, // TODO: change to u8 for trivial transmute? 
+ pub reserved: u8, +} diff --git a/src/erofs/mod.rs b/src/erofs/mod.rs new file mode 100644 index 0000000..8c0cc51 --- /dev/null +++ b/src/erofs/mod.rs @@ -0,0 +1,3 @@ +pub mod debug; +pub mod format; +pub mod reader; diff --git a/src/erofs/reader.rs b/src/erofs/reader.rs new file mode 100644 index 0000000..942e0e7 --- /dev/null +++ b/src/erofs/reader.rs @@ -0,0 +1,460 @@ +use core::mem::size_of; +use std::ops::Range; + +use zerocopy::{little_endian::U32, Immutable, KnownLayout, TryFromBytes}; + +use super::format::{ + CompactInodeHeader, ComposefsHeader, DataLayout, DirectoryEntryHeader, ExtendedInodeHeader, + FileType, InodeXAttrHeader, Superblock, XAttrHeader, +}; + +fn round_up(n: usize, to: usize) -> usize { + (n + to - 1) & !(to - 1) +} + +pub trait InodeHeader { + fn data_layout(&self) -> DataLayout; + fn xattr_icount(&self) -> u16; + fn mode(&self) -> u16; + fn size(&self) -> u64; + fn u(&self) -> u32; + + fn additional_bytes(&self, blkszbits: u8) -> usize { + let block_size = 1 << blkszbits; + self.xattr_size() + + match self.data_layout() { + DataLayout::FlatPlain => 0, + DataLayout::FlatInline => self.size() as usize % block_size, + DataLayout::ChunkBased => 4, + } + } + + fn xattr_size(&self) -> usize { + match self.xattr_icount() { + 0 => 0, + n => (n as usize - 1) * 4 + 12, + } + } +} + +impl InodeHeader for ExtendedInodeHeader { + fn data_layout(&self) -> DataLayout { + self.format.try_into().unwrap() + } + + fn xattr_icount(&self) -> u16 { + self.xattr_icount.get() + } + + fn mode(&self) -> u16 { + self.mode.get() + } + + fn size(&self) -> u64 { + self.size.get() + } + + fn u(&self) -> u32 { + self.u.get() + } +} + +impl InodeHeader for CompactInodeHeader { + fn data_layout(&self) -> DataLayout { + self.format.try_into().unwrap() + } + + fn xattr_icount(&self) -> u16 { + self.xattr_icount.get() + } + + fn mode(&self) -> u16 { + self.mode.get() + } + + fn size(&self) -> u64 { + self.size.get() as u64 + } + + fn u(&self) -> u32 { + self.u.get() + } 
+} + +#[repr(C)] +#[derive(TryFromBytes, KnownLayout, Immutable)] +pub struct XAttr { + pub header: XAttrHeader, + pub data: [u8], +} + +#[repr(C)] +#[derive(Debug, TryFromBytes, KnownLayout, Immutable)] +pub struct Inode { + pub header: Header, + pub data: [u8], +} + +#[repr(C)] +#[derive(Debug, TryFromBytes, KnownLayout, Immutable)] +pub struct InodeXAttrs { + pub header: InodeXAttrHeader, + pub data: [u8], +} + +impl XAttrHeader { + pub fn calculate_n_elems(&self) -> usize { + round_up(self.name_len as usize + self.value_size.get() as usize, 4) + } +} + +impl XAttr { + pub fn from_prefix(data: &[u8]) -> (&XAttr, &[u8]) { + let header = XAttrHeader::try_ref_from_bytes(&data[..4]).unwrap(); + Self::try_ref_from_prefix_with_elems(data, header.calculate_n_elems()).unwrap() + } + + pub fn suffix(&self) -> &[u8] { + &self.data[..self.header.name_len as usize] + } + + pub fn value(&self) -> &[u8] { + &self.data[self.header.name_len as usize..][..self.header.value_size.get() as usize] + } + + pub fn padding(&self) -> &[u8] { + &self.data[self.header.name_len as usize + self.header.value_size.get() as usize..] + } +} + +pub trait InodeOps { + fn xattrs(&self) -> Option<&InodeXAttrs>; + fn inline(&self) -> &[u8]; + fn blocks(&self, blkszbits: u8) -> Range; +} + +impl InodeHeader for &Inode
{ + fn data_layout(&self) -> DataLayout { + self.header.data_layout() + } + + fn xattr_icount(&self) -> u16 { + self.header.xattr_icount() + } + + fn mode(&self) -> u16 { + self.header.mode() + } + + fn size(&self) -> u64 { + self.header.size() + } + + fn u(&self) -> u32 { + self.header.u() + } +} + +impl InodeOps for &Inode
{ + fn xattrs(&self) -> Option<&InodeXAttrs> { + match self.header.xattr_size() { + 0 => None, + n => Some(InodeXAttrs::try_ref_from_bytes(&self.data[..n]).unwrap()), + } + } + + fn inline(&self) -> &[u8] { + &self.data[self.header.xattr_size()..] + } + + fn blocks(&self, blkszbits: u8) -> Range { + let size = self.header.size(); + let block_size = 1 << blkszbits; + let start = self.header.u() as u64; + + match self.header.data_layout() { + DataLayout::FlatPlain => Range { + start, + end: start + size.div_ceil(block_size), + }, + DataLayout::FlatInline => Range { + start, + end: start + size / block_size, + }, + DataLayout::ChunkBased => Range { start, end: start }, + } + } +} + +// this lets us avoid returning Box from Image.inode() +// but ... wow. +#[derive(Debug)] +pub enum InodeType<'img> { + Compact(&'img Inode), + Extended(&'img Inode), +} + +impl InodeHeader for InodeType<'_> { + fn u(&self) -> u32 { + match self { + Self::Compact(inode) => inode.u(), + Self::Extended(inode) => inode.u(), + } + } + + fn size(&self) -> u64 { + match self { + Self::Compact(inode) => inode.size(), + Self::Extended(inode) => inode.size(), + } + } + + fn xattr_icount(&self) -> u16 { + match self { + Self::Compact(inode) => inode.xattr_icount(), + Self::Extended(inode) => inode.xattr_icount(), + } + } + + fn data_layout(&self) -> DataLayout { + match self { + Self::Compact(inode) => inode.data_layout(), + Self::Extended(inode) => inode.data_layout(), + } + } + + fn mode(&self) -> u16 { + match self { + Self::Compact(inode) => inode.mode(), + Self::Extended(inode) => inode.mode(), + } + } +} + +impl InodeOps for InodeType<'_> { + fn xattrs(&self) -> Option<&InodeXAttrs> { + match self { + Self::Compact(inode) => inode.xattrs(), + Self::Extended(inode) => inode.xattrs(), + } + } + + fn inline(&self) -> &[u8] { + match self { + Self::Compact(inode) => inode.inline(), + Self::Extended(inode) => inode.inline(), + } + } + + fn blocks(&self, blkszbits: u8) -> Range { + match self { + 
Self::Compact(inode) => inode.blocks(blkszbits), + Self::Extended(inode) => inode.blocks(blkszbits), + } + } +} + +#[derive(Debug)] +pub struct Image<'i> { + pub image: &'i [u8], + pub header: &'i ComposefsHeader, + pub blkszbits: u8, + pub block_size: usize, + pub sb: &'i Superblock, + pub inodes: &'i [u8], + pub xattrs: &'i [u8], +} + +impl<'img> Image<'img> { + pub fn open(image: &'img [u8]) -> Self { + let header = ComposefsHeader::try_ref_from_prefix(image) + .expect("header err") + .0; + let sb = Superblock::try_ref_from_prefix(&image[1024..]) + .expect("superblock err") + .0; + let blkszbits = sb.blkszbits; + let block_size = 1usize << blkszbits; + assert!(block_size != 0); + let inodes = &image[sb.meta_blkaddr.get() as usize * block_size..]; + let xattrs = &image[sb.xattr_blkaddr.get() as usize * block_size..]; + Image { + image, + header, + blkszbits, + block_size, + sb, + inodes, + xattrs, + } + } + + pub fn inode(&self, id: u64) -> InodeType { + let inode_data = &self.inodes[id as usize * 32..]; + if inode_data[0] & 1 != 0 { + let header = ExtendedInodeHeader::try_ref_from_bytes(&inode_data[..64]).unwrap(); + InodeType::Extended( + Inode::::try_ref_from_prefix_with_elems( + inode_data, + header.additional_bytes(self.blkszbits), + ) + .unwrap() + .0, + ) + } else { + let header = CompactInodeHeader::try_ref_from_bytes(&inode_data[..32]).unwrap(); + InodeType::Compact( + Inode::::try_ref_from_prefix_with_elems( + inode_data, + header.additional_bytes(self.blkszbits), + ) + .unwrap() + .0, + ) + } + } + + pub fn shared_xattr(&self, id: u32) -> &XAttr { + let xattr_data = &self.xattrs[id as usize * 4..]; + let header = XAttrHeader::try_ref_from_bytes(&xattr_data[..4]).unwrap(); + XAttr::try_ref_from_prefix_with_elems(xattr_data, header.calculate_n_elems()) + .unwrap() + .0 + } + + pub fn data_block(&self, id: u64) -> &[u8] { + &self.image[id as usize * self.block_size..][..self.block_size] + } + + pub fn directory_block(&self, id: u64) -> &DirectoryBlock { + 
DirectoryBlock::try_ref_from_bytes(self.data_block(id)).unwrap() + } + + pub fn root(&self) -> InodeType { + self.inode(self.sb.root_nid.get() as u64) + } +} + +impl InodeXAttrs { + pub fn shared(&self) -> &[U32] { + // TODO: there must be an easier way... + #[derive(TryFromBytes, KnownLayout, Immutable)] + #[repr(C)] + struct U32Array([U32]); + &U32Array::try_ref_from_prefix_with_elems(&self.data, self.header.shared_count as usize) + .unwrap() + .0 + .0 + } + + pub fn local(&self) -> XAttrIter { + XAttrIter { + data: &self.data[self.header.shared_count as usize * 4..], + } + } +} + +#[derive(Debug)] +pub struct XAttrIter<'img> { + data: &'img [u8], +} + +impl<'img> Iterator for XAttrIter<'img> { + type Item = &'img XAttr; + + fn next(&mut self) -> Option { + if !self.data.is_empty() { + let (result, rest) = XAttr::from_prefix(self.data); + self.data = rest; + Some(result) + } else { + None + } + } +} + +#[repr(C)] +#[derive(Debug, Immutable, KnownLayout, TryFromBytes)] +pub struct DirectoryBlock { + pub data: [u8], +} + +impl DirectoryBlock { + pub fn get_entry_header(&self, n: usize) -> &DirectoryEntryHeader { + let entry_data = &self.data + [n * size_of::()..(n + 1) * size_of::()]; + DirectoryEntryHeader::try_ref_from_bytes(entry_data).unwrap() + } + + pub fn get_entry_headers(&self) -> &[DirectoryEntryHeader] { + // TODO: there must be an easier way... 
+ #[derive(TryFromBytes, KnownLayout, Immutable)] + #[repr(C)] + struct EntryArray([DirectoryEntryHeader]); + &EntryArray::try_ref_from_prefix_with_elems(&self.data, self.n_entries()) + .unwrap() + .0 + .0 + } + + pub fn n_entries(&self) -> usize { + let first = self.get_entry_header(0); + let offset = first.name_offset.get(); + assert!(offset != 0); + assert!(offset % 12 == 0); + offset as usize / 12 + } + + pub fn entries(&self) -> DirectoryEntries { + DirectoryEntries { + block: self, + length: self.n_entries(), + position: 0, + } + } +} + +// High-level iterator interface +#[derive(Debug)] +pub struct DirectoryEntry<'a> { + pub file_type: FileType, + pub name: &'a [u8], + pub inode: u64, +} + +#[derive(Debug)] +pub struct DirectoryEntries<'d> { + block: &'d DirectoryBlock, + length: usize, + position: usize, +} + +impl<'d> Iterator for DirectoryEntries<'d> { + type Item = DirectoryEntry<'d>; + + fn next(&mut self) -> Option { + if self.position < self.length { + let item = self.block.get_entry_header(self.position); + let name_start = item.name_offset.get() as usize; + self.position += 1; + + let name = if self.position == self.length { + let with_padding = &self.block.data[name_start..]; + let end = with_padding.partition_point(|c| *c != 0); + &with_padding[..end] + } else { + let next = self.block.get_entry_header(self.position); + let name_end = next.name_offset.get() as usize; + &self.block.data[name_start..name_end] + }; + + Some(DirectoryEntry { + name, + file_type: item.file_type, + inode: item.inode_offset.get(), + }) + } else { + None + } + } +} diff --git a/src/fs.rs b/src/fs.rs index 3430bde..ec99590 100644 --- a/src/fs.rs +++ b/src/fs.rs @@ -314,7 +314,7 @@ pub fn read_from_path(path: &Path, repo: Option<&Repository>) -> Result) -> Result { let fs = read_from_path(path, repo)?; - let image = super::image::mkcomposefs(fs)?; + let image = crate::mkfs::mkfs(&fs)?; if let Some(repo) = repo { Ok(repo.write_image(None, &image)?) 
/// Rounds `n` up to the nearest multiple of `to`.
///
/// Works for any non-zero `to`, not only powers of two; a value that is already a multiple of
/// `to` is returned unchanged.
///
/// # Panics
///
/// Panics if `to` is zero. Overflow panics in builds with overflow checks enabled.
fn round_up(n: usize, to: usize) -> usize {
    n.next_multiple_of(to)
}
+ +#[derive(Clone, Copy, Debug)] +enum Offset { + Header, + Superblock, + Inode, + XAttr, + Block, + End, +} + +trait Output { + fn note_offset(&mut self, offset_type: Offset); + fn get(&self, offset_type: Offset, idx: usize) -> usize; + fn write(&mut self, data: &[u8]); + fn pad(&mut self, alignment: usize); + fn len(&self) -> usize; + + fn get_div(&self, offset_type: Offset, idx: usize, div: usize) -> usize { + let offset = self.get(offset_type, idx); + assert_eq!(offset % div, 0); + offset / div + } + + fn get_nid(&self, idx: usize) -> u64 { + self.get_div(Offset::Inode, idx, 32) as u64 + } + + fn get_xattr(&self, idx: usize) -> u32 { + self.get_div(Offset::XAttr, idx, 4).try_into().unwrap() + } + + fn write_struct(&mut self, st: impl IntoBytes + Immutable) { + assert_eq!(self.len() % align_of_val(&st), 0); // TODO: this is less than we want + self.write(st.as_bytes()); + } +} + +#[derive(PartialOrd, PartialEq, Eq, Ord, Clone)] +struct XAttr { + prefix: u8, + suffix: Box<[u8]>, + value: Box<[u8]>, +} + +#[derive(Clone, Default)] +struct InodeXAttrs { + shared: Vec, + local: Vec, + filter: u32, +} + +struct DirEnt<'a> { + name: &'a [u8], + inode: usize, + file_type: format::FileType, +} + +#[derive(Default)] +struct Directory<'a> { + blocks: Box<[Box<[DirEnt<'a>]>]>, + inline: Box<[DirEnt<'a>]>, + size: usize, + nlink: usize, +} + +struct Leaf<'a> { + content: &'a image::LeafContent, + nlink: usize, +} + +enum InodeContent<'a> { + Directory(Directory<'a>), + Leaf(Leaf<'a>), +} + +struct Inode<'a> { + stat: &'a image::Stat, + xattrs: InodeXAttrs, + content: InodeContent<'a>, +} + +impl XAttr { + pub fn write(&self, output: &mut impl Output) { + output.write_struct(format::XAttrHeader { + name_len: self.suffix.len() as u8, + name_index: self.prefix, + value_size: (self.value.len() as u16).into(), + }); + output.write(&self.suffix); + output.write(&self.value); + output.pad(4); + } +} + +impl InodeXAttrs { + fn add(&mut self, name: &[u8], value: &[u8]) { + for (idx, 
prefix) in format::XATTR_PREFIXES.iter().enumerate().rev() { + if let Some(suffix) = name.strip_prefix(*prefix) { + self.filter |= 1 << (xxh32(suffix, format::XATTR_FILTER_SEED + idx as u32) % 32); + self.local.push(XAttr { + prefix: idx as u8, + suffix: Box::from(suffix), + value: Box::from(value), + }); + return; + } + } + unreachable!("{:?}", std::str::from_utf8(name)); // worst case: we matched the empty prefix (0) + } + + fn write(&self, output: &mut impl Output) { + if self.filter != 0 { + debug!(" write xattrs block"); + output.write_struct(format::InodeXAttrHeader { + name_filter: (!self.filter).into(), + shared_count: self.shared.len() as u8, + ..Default::default() + }); + for idx in &self.shared { + debug!(" shared {} @{}", idx, output.len()); + output.write(&output.get_xattr(*idx).to_le_bytes()); + } + for attr in &self.local { + debug!(" local @{}", output.len()); + attr.write(output); + } + } + // our alignment is equal to xattr alignment: no need to pad + } +} + +impl<'a> Directory<'a> { + pub fn from_entries(entries: Vec>) -> Self { + let mut blocks = vec![]; + let mut rest = vec![]; + + let mut n_bytes = 0; + let mut nlink = 0; + + debug!("Directory with {} items", entries.len()); + + // The content of the directory is fixed at this point so we may as well split it into + // blocks. This lets us avoid measuring and re-measuring. + for entry in entries.into_iter() { + let entry_size = size_of::() + entry.name.len(); + assert!(entry_size <= 4096); + + debug!(" {:?}", entry.file_type); + + if matches!(entry.file_type, format::FileType::Directory) { + nlink += 1; + } + + n_bytes += entry_size; + if n_bytes <= 4096 { + rest.push(entry); + } else { + // It won't fit, so we need to store the existing entries in a block. 
+ debug!(" block {}", rest.len()); + blocks.push(rest.into_boxed_slice()); + + // Start over + rest = vec![entry]; + n_bytes = entry_size; + } + } + + // Don't try to store more than 2048 bytes of tail data + if n_bytes > 2048 { + blocks.push(rest.into_boxed_slice()); + rest = vec![]; + n_bytes = 0; + } + + debug!( + " blocks {} inline {} inline_size {n_bytes}", + blocks.len(), + rest.len() + ); + + let size = format::BLOCK_SIZE * blocks.len() + n_bytes; + Self { + blocks: blocks.into_boxed_slice(), + inline: rest.into_boxed_slice(), + size, + nlink, + } + } + + fn write_block(&self, output: &mut impl Output, block: &[DirEnt]) { + debug!(" write dir block {} @{}", block.len(), output.len()); + let mut nameofs = size_of::() * block.len(); + + for entry in block { + debug!( + " entry {:?} name {} @{}", + entry.file_type, + nameofs, + output.len() + ); + output.write_struct(format::DirectoryEntryHeader { + name_offset: (nameofs as u16).into(), + inode_offset: output.get_nid(entry.inode).into(), + file_type: entry.file_type, + ..Default::default() + }); + nameofs += entry.name.len(); + } + + for entry in block { + debug!(" name @{}", output.len()); + output.write(entry.name.as_bytes()); + } + } + + fn write_inline(&self, output: &mut impl Output) { + debug!( + " write inline len {} expected size {} of {}", + self.inline.len(), + self.size % 4096, + self.size + ); + self.write_block(output, &self.inline); + } + + fn write_blocks(&self, output: &mut impl Output) { + for block in &self.blocks { + assert_eq!(output.len() % format::BLOCK_SIZE, 0); + self.write_block(output, block); + output.pad(format::BLOCK_SIZE); + } + } + + fn inode_meta(&self, block_offset: usize) -> (format::DataLayout, u32, u64, usize) { + let (layout, u) = if self.inline.len() == 0 { + (format::DataLayout::FlatPlain, block_offset as u32 / 4096) + } else if self.blocks.len() > 0 { + (format::DataLayout::FlatInline, block_offset as u32 / 4096) + } else { + (format::DataLayout::FlatInline, 0) + }; + 
(layout, u, self.size as u64, self.nlink) + } +} + +impl Leaf<'_> { + fn inode_meta(&self) -> (format::DataLayout, u32, u64, usize) { + let (layout, u, size) = match &self.content { + image::LeafContent::InlineFile(data) => { + if data.is_empty() { + (format::DataLayout::FlatPlain, 0, data.len() as u64) + } else { + (format::DataLayout::FlatInline, 0, data.len() as u64) + } + } + image::LeafContent::ExternalFile(.., size) => { + // TODO: libcomposefs tries harder here. Should we? + (format::DataLayout::ChunkBased, 31, *size) + } + image::LeafContent::CharacterDevice(rdev) | image::LeafContent::BlockDevice(rdev) => { + (format::DataLayout::FlatPlain, *rdev as u32, 0) + } + image::LeafContent::Fifo | image::LeafContent::Socket => { + (format::DataLayout::FlatPlain, 0, 0) + } + image::LeafContent::Symlink(target) => { + (format::DataLayout::FlatInline, 0, target.len() as u64) + } + }; + (layout, u, size, self.nlink) + } + + fn write_inline(&self, output: &mut impl Output) { + output.write(match self.content { + image::LeafContent::InlineFile(data) => data, + image::LeafContent::ExternalFile(..) => b"\xff\xff\xff\xff", // null chunk + image::LeafContent::Symlink(target) => target.as_bytes(), + _ => &[], + }); + } +} + +impl Inode<'_> { + fn file_type(&self) -> format::FileType { + match &self.content { + InodeContent::Directory(..) => format::FileType::Directory, + InodeContent::Leaf(leaf) => match &leaf.content { + image::LeafContent::ExternalFile(..) | image::LeafContent::InlineFile(..) => { + format::FileType::RegularFile + } + image::LeafContent::CharacterDevice(..) => format::FileType::CharacterDevice, + image::LeafContent::BlockDevice(..) => format::FileType::BlockDevice, + image::LeafContent::Fifo => format::FileType::Fifo, + image::LeafContent::Socket => format::FileType::Socket, + image::LeafContent::Symlink(..) 
=> format::FileType::Symlink, + }, + } + } + + fn write_inode(&self, output: &mut impl Output, idx: usize) { + let (layout, u, size, nlink) = match &self.content { + InodeContent::Directory(dir) => dir.inode_meta(output.get(Offset::Block, idx)), + InodeContent::Leaf(leaf) => leaf.inode_meta(), + }; + + let xattr_size = { + let mut xattr = FirstPass::default(); + self.xattrs.write(&mut xattr); + xattr.offset + }; + + // We need to make sure the inline part doesn't overlap a block boundary + if matches!(layout, format::DataLayout::FlatInline) { + let inode_and_xattr_size = size_of::() + xattr_size; + let inline_start = output.len() + inode_and_xattr_size; + let inline_end = inline_start + (size as usize % format::BLOCK_SIZE); + if inline_start / format::BLOCK_SIZE != inline_end / format::BLOCK_SIZE { + // If we proceed, then we'll violate the rule about crossing block boundaries. + // The easiest thing to do is to add padding so that the inline data starts at a + // fresh block boundary. + let pad = vec![0; 4096 - inline_start % 4096]; + debug!("added pad {}", pad.len()); + output.write(&pad); + } + } + + let format = format::FormatField::from((format::InodeLayout::Extended, layout)); + + output.pad(32); + + debug!( + "write inode {idx} nid {} {:?} {:?} xattrsize{xattr_size} icount{} inline{} @{}", + output.len() / 32, + format, + self.file_type(), + match xattr_size { + 0 => 0, + n => (1 + (n - 12) / 4) as u16, + }, + size % 4096, + output.len() + ); + + output.note_offset(Offset::Inode); + output.write_struct(format::ExtendedInodeHeader { + format, + xattr_icount: match xattr_size { + 0 => 0, + n => (1 + (n - 12) / 4) as u16, + } + .into(), + mode: (self.stat.st_mode as u16 | self.file_type().to_ifmt()).into(), + size: size.into(), + u: u.into(), + ino: ((output.len() / 32) as u32).into(), + uid: self.stat.st_uid.into(), + gid: self.stat.st_gid.into(), + mtime: (self.stat.st_mtim_sec as u64).into(), + nlink: (nlink as u32).into(), + ..Default::default() + }); + + 
self.xattrs.write(output); + + match &self.content { + InodeContent::Directory(dir) => dir.write_inline(output), + InodeContent::Leaf(leaf) => leaf.write_inline(output), + }; + + output.pad(32); + } + + fn write_blocks(&self, output: &mut impl Output) { + if let InodeContent::Directory(dir) = &self.content { + dir.write_blocks(output); + } + } +} + +struct InodeCollector<'a> { + inodes: Vec>, + hardlinks: HashMap<*const image::Leaf, usize>, +} + +impl<'a> InodeCollector<'a> { + fn push_inode(&mut self, stat: &'a image::Stat, content: InodeContent<'a>) -> usize { + let mut xattrs = InodeXAttrs::default(); + + // We need to record extra xattrs for some files. These come first. + if let InodeContent::Leaf(Leaf { + content: image::LeafContent::ExternalFile(id, ..), + .. + }) = content + { + let metacopy = [&[0, 36, 0, 1], &id[..]].concat(); + xattrs.add(b"trusted.overlay.metacopy", &metacopy); + + let redirect = format!("/{:02x}/{}", id[0], hex::encode(&id[1..])); + xattrs.add(b"trusted.overlay.redirect", redirect.as_bytes()); + } + + // Add the normal xattrs. They're already listed in sorted order. + for (name, value) in RefCell::borrow(&stat.xattrs).iter() { + let name = name.as_bytes(); + + if let Some(escapee) = name.strip_prefix(b"trusted.overlay.") { + let escaped = [b"trusted.overlay.overlay.", escapee].concat(); + xattrs.add(&escaped, value); + } else { + xattrs.add(name, value); + } + } + + // Allocate an inode for ourselves. At first we write all xattrs as local. Later (after + // we've determined which xattrs ought to be shared) we'll come and move some of them over. 
+ let inode = self.inodes.len(); + self.inodes.push(Inode { + stat, + xattrs, + content, + }); + inode + } + + fn collect_leaf(&mut self, leaf: &'a Rc) -> usize { + let nlink = Rc::strong_count(leaf); + + if nlink > 1 { + if let Some(inode) = self.hardlinks.get(&Rc::as_ptr(leaf)) { + return *inode; + } + } + + let inode = self.push_inode( + &leaf.stat, + InodeContent::Leaf(Leaf { + content: &leaf.content, + nlink, + }), + ); + + if nlink > 1 { + self.hardlinks.insert(Rc::as_ptr(leaf), inode); + } + + inode + } + + fn insert_sorted( + entries: &mut Vec>, + name: &'a [u8], + inode: usize, + file_type: format::FileType, + ) { + let entry = DirEnt { + name, + inode, + file_type, + }; + let point = entries.partition_point(|e| e.name < entry.name); + entries.insert(point, entry); + } + + fn collect_dir(&mut self, dir: &'a image::Directory, parent: usize) -> usize { + // The root inode number needs to fit in a u16. That more or less compels us to write the + // directory inode before the inode of the children of the directory. Reserve a slot. + let me = self.push_inode(&dir.stat, InodeContent::Directory(Directory::default())); + + let mut entries = vec![]; + + for entry in &dir.entries { + let child = match &entry.inode { + image::Inode::Directory(dir) => self.collect_dir(dir, me), + image::Inode::Leaf(leaf) => self.collect_leaf(leaf), + }; + entries.push(DirEnt { + name: entry.name.as_bytes(), + inode: child, + file_type: self.inodes[child].file_type(), + }); + } + + // We're expected to add those, too + Self::insert_sorted(&mut entries, b".", me, format::FileType::Directory); + Self::insert_sorted(&mut entries, b"..", parent, format::FileType::Directory); + + // Now that we know the actual content, we can write it to our reserved slot + self.inodes[me].content = InodeContent::Directory(Directory::from_entries(entries)); + me + } + + pub fn collect(fs: &'a image::FileSystem) -> Vec> { + let mut this = Self { + inodes: vec![], + hardlinks: HashMap::new(), + }; + + // '..' 
of the root directory is the root directory again + let root_inode = this.collect_dir(&fs.root, 0); + assert_eq!(root_inode, 0); + + this.inodes + } +} + +/// Takes a list of inodes where each inode contains only local xattr values, determines which +/// xattrs (key, value) pairs appear more than once, and shares them. +fn share_xattrs(inodes: &mut [Inode]) -> Vec { + let mut xattrs: BTreeMap = BTreeMap::new(); + + // Collect all xattrs from the inodes + for inode in inodes.iter() { + for attr in &inode.xattrs.local { + if let Some(count) = xattrs.get_mut(attr) { + *count += 1; + } else { + xattrs.insert(attr.clone(), 1); + } + } + } + + // Share only xattrs with more than one user + xattrs.retain(|_k, v| *v > 1); + + // Repurpose the refcount field as an index lookup + for (idx, value) in xattrs.values_mut().enumerate() { + *value = idx; + } + + // Visit each inode and change local xattrs into shared xattrs + for inode in inodes.iter_mut() { + inode.xattrs.local.retain(|attr| { + if let Some(idx) = xattrs.get(attr) { + inode.xattrs.shared.push(*idx); + false // drop the local xattr: we converted it + } else { + true // retain the local xattr: we didn't convert it + } + }); + } + + // Return the shared xattrs as a vec + xattrs.into_keys().collect() +} + +fn write_erofs(output: &mut impl Output, inodes: &[Inode], xattrs: &[XAttr]) { + // Write composefs header + output.note_offset(Offset::Header); + output.write_struct(format::ComposefsHeader { + magic: format::COMPOSEFS_MAGIC, + version: format::COMPOSEFS_VERSION, + flags: 0.into(), + composefs_version: format::COMPOSEFS_VERSION, + ..Default::default() + }); + output.pad(1024); + + // Write superblock + output.note_offset(Offset::Superblock); + output.write_struct(format::Superblock { + magic: format::MAGIC_V1, + blkszbits: format::BLOCK_BITS, + feature_compat: format::FEATURE_COMPAT_MTIME | format::FEATURE_COMPAT_XATTR_FILTER, + root_nid: (output.get_nid(0) as u16).into(), + inos: (inodes.len() as u64).into(), + 
blocks: ((output.get(Offset::End, 0) / format::BLOCK_SIZE) as u32).into(), + ..Default::default() + }); + + // Write inode table + for (idx, inode) in inodes.iter().enumerate() { + // The inode may add padding to itself, so it notes its own offset + inode.write_inode(output, idx); + } + + // Write shared xattr table + for xattr in xattrs { + output.note_offset(Offset::XAttr); + xattr.write(output); + } + + // Write blocks from inodes that have them + output.pad(4096); + for inode in inodes.iter() { + output.note_offset(Offset::Block); + inode.write_blocks(output); + } + + // That's it + output.note_offset(Offset::End); +} + +#[derive(Default)] +struct Layout { + offset_types: Vec, + offsets: Vec, +} + +#[derive(Default)] +struct FirstPass { + offset: usize, + layout: Layout, +} + +struct SecondPass { + output: Vec, + layout: Layout, +} + +impl Output for SecondPass { + fn note_offset(&mut self, _offset_type: Offset) { + /* no-op */ + } + + fn get(&self, offset_type: Offset, idx: usize) -> usize { + self.layout.offsets[self.layout.offset_types[offset_type as usize] + idx] + } + + fn write(&mut self, data: &[u8]) { + self.output.extend_from_slice(data); + } + + fn pad(&mut self, alignment: usize) { + self.output + .resize(round_up(self.output.len(), alignment), 0); + } + + fn len(&self) -> usize { + self.output.len() + } +} + +impl Output for FirstPass { + fn note_offset(&mut self, offset_type: Offset) { + if self.layout.offset_types.len() == offset_type as usize { + self.layout.offset_types.push(self.layout.offsets.len()); + } + debug!( + "{:?} #{} @{}", + offset_type, + self.layout.offsets.len() - self.layout.offset_types[offset_type as usize], + self.offset + ); + self.layout.offsets.push(self.offset); + } + + fn get(&self, _: Offset, _: usize) -> usize { + 0 // We don't know offsets in the first pass, so fake it + } + + fn write(&mut self, data: &[u8]) { + self.offset += data.len(); + } + + fn pad(&mut self, alignment: usize) { + self.offset = 
round_up(self.offset, alignment); + } + + fn len(&self) -> usize { + self.offset + } +} + +pub fn mkfs_erofs(fs: &image::FileSystem) -> Box<[u8]> { + // Create the intermediate representation: flattened inodes and shared xattrs + let mut inodes = InodeCollector::collect(fs); + let xattrs = share_xattrs(&mut inodes); + + // Do a first pass with the writer to determine the layout + let mut first_pass = FirstPass::default(); + write_erofs(&mut first_pass, &inodes, &xattrs); + + // Do a second pass with the writer to get the actual bytes + let mut second_pass = SecondPass { + output: vec![], + layout: first_pass.layout, + }; + write_erofs(&mut second_pass, &inodes, &xattrs); + + // That's it + second_pass.output.into_boxed_slice() +} + +pub fn mkfs_mkcomposefs(filesystem: &FileSystem) -> Result> { + let mut mkcomposefs = Command::new("mkcomposefs") + .args(["--from-file", "-", "-"]) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn()?; + + let mut stdin = mkcomposefs.stdin.take().unwrap(); + write_dumpfile(&mut stdin, filesystem)?; + drop(stdin); + + let mut stdout = mkcomposefs.stdout.take().unwrap(); + let mut image = vec![]; + stdout.read_to_end(&mut image)?; + drop(stdout); + + if !mkcomposefs.wait()?.success() { + bail!("mkcomposefs failed"); + }; + + Ok(image.into()) +} + +pub fn mkfs(fs: &FileSystem) -> Result> { + let image = match env::var("COMPOSEFS_FORMAT") { + Ok(s) if s == "new" => mkfs_erofs(fs), + _ => mkfs_mkcomposefs(fs)?, + }; + + if env::var("COMPOSEFS_DUMP_EROFS") == Ok("1".to_string()) { + debug_img(&image); + } + + Ok(image) +} diff --git a/src/oci/image.rs b/src/oci/image.rs index 0358f6f..b75f0c7 100644 --- a/src/oci/image.rs +++ b/src/oci/image.rs @@ -6,7 +6,8 @@ use oci_spec::image::ImageConfiguration; use crate::{ dumpfile::write_dumpfile, fsverity::Sha256HashValue, - image::{mkcomposefs, FileSystem, Inode, Leaf}, + image::{FileSystem, Inode, Leaf}, + mkfs::mkfs, oci, repository::Repository, selabel::selabel, @@ -101,8 +102,8 @@ pub 
fn create_image( selabel(&mut filesystem, repo)?; filesystem.done(); - let image = mkcomposefs(filesystem)?; - repo.write_image(name, &image) + let erofs = mkfs(&filesystem)?; + repo.write_image(name, &erofs) } #[cfg(test)]