diff --git a/Cargo.toml b/Cargo.toml index 38ff692..f6f7dc6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,8 +16,10 @@ anyhow = { version = "1.0.89", default-features = false } async-compression = { version = "0.4.17", default-features = false, features = ["tokio", "gzip"] } clap = { version = "4.5.19", default-features = false, features = ["std", "help", "usage", "derive"] } containers-image-proxy = "0.7.0" +env_logger = "0.11.5" hex = "0.4.3" indicatif = { version = "0.17.8", features = ["tokio"] } +log = "0.4.22" oci-spec = "0.7.0" regex-automata = { version = "0.4.8", default-features = false } rustix = { version = "0.38.37", features = ["fs", "mount", "process"] } @@ -26,7 +28,8 @@ tar = { version = "0.4.42", default-features = false } tempfile = "3.13.0" thiserror = "2.0.4" tokio = "1.41.0" -zerocopy = "0.8.13" +xxhash-rust = { version = "0.8.12", features = ["xxh32"] } +zerocopy = { version = "0.8.13", features = ["derive"] } zstd = "0.13.2" [dev-dependencies] diff --git a/doc/erofs.md b/doc/erofs.md new file mode 100644 index 0000000..07b5755 --- /dev/null +++ b/doc/erofs.md @@ -0,0 +1,431 @@ +# erofs: the missing manual + +## Introduction + +This is an attempt to document the format of erofs (or at least the subsets of +it that we use in composefs). + +It probably makes sense to have `erofs_fs.h` open when reading this. + +## Overall concepts + +All integers (including all offsets) are stored in little-endian byte order. + +The file layout is fairly free-form. You can freely mix inodes, data blocks, +and shared xattr entries. inodes are 64-bit values based on file offsets +rather than integer indexes into a fixed table, so they can be anywhere at all. +xattrs are 32-bit values based on offsets, so they're a bit more limited (but +not in filesystems of reasonable size). + +## The first 1024 bytes (pre-superblock) + +The first 1024 bytes of an erofs have no particular meaning. 
You can put +anything you want there, like partition tables or boot sectors or anything +else. composefs puts its own header inside of this area, at the start. + +## The superblock (at 1024 bytes, 128 bytes long) + +The superblock is defined by `struct erofs_super_block`. + +Here's some notes about some of the fields. Anything not mentioned is left as +0 by us. There's some pretty wild features in here, but we don't use them all +(and I don't understand them, either) so they're not all documented. + +* `magic`: set that to `EROFS_SUPER_MAGIC_V1` (`0xE0F5E1E2`) +* `checksum`: only meaningful if the `SB_CHKSUM` feature is enabled. This is + a crc32c over a block-sized-chunk of data starting from the superblock, + with this field set to 0. That's pretty weird. Maybe don't use this. +* `feature_compat`: a flags field. The filesystem will still mount even if + the kernel doesn't know about any features which might be present. The + flags: + - `SB_CHKSUM` (`0x0001`): set if the checksum field in the superblock is + populated. Otherwise, the checksum is ignored. + - `MTIME` (`0x0002`): at first, erofs named the timestamp fields `ctime` + instead of `mtime`. That got changed a long time ago, and this flag + got added to indicate filesystems that were created with the new + semantics. This flag has absolutely zero impact at run time: the kernel + ignores it. + - `XATTR_FILTER` (`0x0004`): set if the xattr bloom filter should be + used. Read about this in the inode section. +* `blkszbits`: log2 of the block size. Better set this to 12 (4096). +* `root_nid`: the reference to the root inode. See the inodes section for + what that means. Normally inodes are stored in u64, but this is somewhat + randomly a u16, which means that you're gonna need to put the root + directory near the start. +* `inos`: the total number of inodes defined. This is only used for + `statfs()` purposes. 
+* `build_time`, `build_time_nsec`: this is something like a compression + feature if you want all (or many) files in your filesystem to have the same + mtime. Then you can use the "compact" inode layout, which doesn't have its + own `mtime` field, and this one will be used instead. If you don't have + compact inodes then this is meaningless. +* `blocks`: total filesystem size, in blocks. This is only used for `statfs()`. +* `meta_blkaddr`: the start of the "metadata area". This is where the inodes + are. This is a block address, so it gets multiplied by the block size to + determine the actual offset. +* `xattr_blkaddr`: the start of the "shared xattr area". See the "Shared + xattr" and "Inodes" sections for more info. + +## Extended attributes + +There are two options for storing xattr data in an erofs: +* inline with the inode itself +* in a "shared xattr" struct somewhere + +The format of both of these is the same. + +The inline thing is nice and simple, but it might be space-inefficient for +cases where the same (key, value) pair appears over and over again (which might +be the case for things like security labels and acls and the like). + +### Prefix indexes + +A rudimentary form of compression is supported on xattr names. There are a +number of hardcoded "common prefixes" defined with the `EROFS_XATTR_INDEX_` +constants in `erofs_fs.h`. Confusingly, although `LUSTRE` is present, it's not +wired up in the kernel. Don't use that one. + +The basic idea is that you find the prefix for your xattr from the list (like +`user.` or `security.`) and then you store only the "suffix" part, along with +the prefix index. If you can't find a prefix, you use 0 (which is conceptually +a prefix of ""). If the prefix matches the entire name then the suffix is `""`. + +Note: you really need to do this "compression" step, because it's assumed +during the lookup phase. 
ie: if we're looking for an xattr `"user.xyz"` then +we'll only consider the entries that have the prefix index for `user.` set on +them. If you didn't properly "compress" your xattr names, they won't be found. + +There's support in the erofs format for custom prefixes. That's when the high +bit of the prefix index is set. These got added circa kernel version 6.4 with +a patch series ending with `6a318ccd7e08` ("erofs: enable long extended +attribute name prefixes") but aren't documented here because we don't use them. + +### On-disk format + +All extended attributes (both shared and inode-inline) are stored in a +simple format with a small header. That's `struct erofs_xattr_entry`. It's just 4 bytes: +* u8: the suffix length (in bytes, no nul) +* u8: the prefix index (see above) +* u16: the value length (in bytes, no nul) + +The header must start at an offset with an alignment of 4. + +Immediately following the header is the suffix (name with prefix removed), +immediately followed by the value. There's no nul after the name (which is OK, +since we know the length from the header). + +### Shared xattrs + +This is basically just an xattr stored somewhere in the filesystem image, using +the format mentioned above. It is referred to by a 32-bit identifier: +* start at the `xattr_blkaddr` mentioned in the super block. That's a block + address, so remember to multiply that by the block size. +* add 4 times the shared xattr identifier (since the header must be 4-aligned) +* that's the xattr header (mentioned above) + +If your filesystem image is going to be smaller than 16GB then you can probably +just leave the `xattr_blkaddr` set to 0 to make your life easier. + +### Inode-inline xattrs + +We talk about those in the Inode section. Speaking of which, let's talk about... + +## Inodes + +Here's where things get complicated. + +First, the easy part: similar to shared xattrs, inodes are just a structure +stored somewhere in the filesystem image. There's no "inode table". 
This +works because the way that you refer to inodes is with an "nid": +* start at the `meta_blkaddr` mentioned in the super block. That's a block + address, so remember to multiply that by the block size. +* add 32 times the nid (since inodes must be 32-aligned) +* that's the inode header + +### On-disk formats + +The very first thing in the inode is the format field. This is a mix of two +things, but the most important thing to talk about first is the low-order bit: +it's set to 0 if this is a "compact" inode and 1 if it's an "extended" inode. + +We don't use compact inodes, so I'm not going to document them, but you can get +a pretty good idea of what they're capable of by reading the headers. The rest +of this section discusses extended inodes. + +The extended inode header (`struct erofs_inode_extended`) has a size of 64 and +needs to be 32-aligned. It has these interesting fields: +* `format`: + - first bit: as mentioned above, for an extended inode the low order bit + will always be set + - the rest: the "data layout" (which is complicated enough to get its own + section) +* `xattr_icount`: this is also complicated enough that we want to talk about + it elsewhere. See the "Extended attributes" section below (not the one + above!). The main thing to know is that this will be 0 if there are none. +* `mode`: that's the same as you'd find in `.st_mode` from `stat()` +* `size`: ditto, except `.st_size` +* `i_u`: you'd better look at the "data layout" section about this one... +* `ino`: a compatibility shim for cases where we need to report `st_ino` in + 32-bits. For 64-bit userlands, we use the nid directly as the `.st_ino`. + You can do what you want with this (as long as it's unique), but for + filesystems smaller than 128GB you can probably just use the nid. +* `uid`, `gid`: those are fairly obvious, I guess +* `mtime`, `mtime_nsec`: those too +* `nlink`: try to set this correctly: some things might get upset if it's not + right. 
For non-directories, that's the number of hardlinks (ie: 1 for + non-hardlinked files). For directories, that's 2 plus the number of + subdirectories. + +Directly following the inode header is the extended attribute header (if +`xattr_icount` is non-zero). Then comes any inline data (as per the "data +layout" section). + +### Extended attributes + +If the `xattr_icount` field in the inode header is set to 0 then this section +is skipped entirely. Otherwise we write out the inode xattr header (`struct +erofs_xattr_ibody_header`). This has: +* `name_filter` (`u32`): a bloom filter for which xattrs are present. This + needs its own section. +* `shared_count` (`u8`): the number of shared xattrs +* some reserved bytes to pad things up to 12 + +Immediately following the header come the shared xattr references. They're in +the format mentioned in the "Shared xattrs" section above, simply encoded as +little-endian u32s. So: the first `4 * shared_count` bytes after the header +are those. + +Then the inline xattrs are next. Those are stored in the format mentioned in +the "On-disk format" sub-section in the "Extended attributes" section. They're +just written here one after another, with padding added so that each header is +4-aligned. There is also padding after the last one, which is important if +inline data is to follow (as per the "data layout" section). + +#### About `xattr_icount` + +So, if there's no xattrs then this is zero. + +Otherwise this is basically the size of the extended attributes area divided by +4, with the exception that the 12-byte header counts for only 4 bytes. Put +another way: you remove the size of the header, divide by 4, then add 1 back +again. + +A value of 1 would be pretty suspicious, since that would indicate the presence +of a header, but no xattrs (shared or inline), and in that case normally we'd +omit the header. + +The kernel basically uses this to know how many bytes it needs to skip over +before it can find the inline file data. 
It will remove the 1, multiply by 4, +then add 12 (the header). See `erofs_xattr_ibody_size()`. + +#### About `name_filter` + +This is a 32-bit bloom filter used to quickly determine if a given xattr is not present. + +The hash algorithm is xxh32. The thing that gets hashed is not the name, but +the "suffix" that's left after removing the prefix. The seed is +`EROFS_XATTR_FILTER_SEED` plus the prefix index. The lower 5 bits of the hash +value (0..31) are used to determine which bit is used. + +For some reason a bit value of 1 here indicates the absence of a particular +xattr, which is opposite to the usual arrangement. You'd think it was for +compatibility, but the filter is only engaged if the feature bit is present in +the superblock. + +This feature got added in kernel commits: +* `3f339920175c` ("erofs: update on-disk format for xattr name filter") +* `fd73a4395d47` ("erofs: boost negative xattr lookup with bloom filter") + +### Data layout + +erofs has a bunch of different ways to represent the actual content associated +with an inode (regular file content, directory entries, symlink target). + +We describe three of them here: +* plain +* inline +* chunked + +The data layout is chosen using some of the bits of the `format` field in the +inode header. + +#### `EROFS_INODE_FLAT_PLAIN` + +In this case there's never any inline data. The inode content is stored +entirely as a series of contiguous blocks. The offset of the first block is +what goes in the `i_u` field (measured in blocks, not bytes). + +The number of blocks is determined by the `.size` field (divided by block size, +rounded up). + +If the content is not a multiple of the blocksize then the last block should be +0-padded. + +#### `EROFS_INODE_FLAT_INLINE` + +This is similar to `EROFS_INODE_FLAT_PLAIN` except if the content is not a +multiple of the blocksize. 
In that case, instead of 0-padding the last block +to fill up a block, the content of the last block is stored directly inline +with the inode, without padding. + +So, imagining the content is 2.5 blocks worth of data: +* the first block is the one pointed to by `i_u` +* the second block is the one immediately following it +* the last block is stored at the end of the inode + +The number of blocks is determined by the `.size` field, divided by block size, +rounded down. The remainder is the number of bytes of inline data. + +The inline data must be written in such a way that it does not cross a block +boundary. It is theoretically permitted for the inline data to be in a +separate block (ie: the block directly following the inode data). It is also +permitted for the inode data itself to cross block boundaries. There are a +couple of caveats to be aware of, however: +* the alignment of inodes is 32 bytes, but the size of an extended inode is 64 + bytes. `mkfs.erofs` tries to ensure that extended inode headers land + entirely within one disk block (for efficiency), but this isn't required by + the kernel. +* `mkfs.erofs` also tries to ensure that the inline data ends in the same + disk block as the last byte of the inode metadata (ie: inode header plus + xattrs). This is theoretically not required by the kernel. +* A bug present in the kernel before 6.12 meant that this was required for + inline symlink targets. This was fixed by `9ed50b8231e3` ("erofs: fix + incorrect symlink detection in fast symlink"). +* In general, when faced with the task of writing out an inode with inline + data present, you may need to add padding bytes before the start of the + inode in order to ensure that the inline data falls within a single block. + If you allow inlining of large amounts of data (approaching the block size) + then you'll almost always need to add padding to get the correct alignment + (and often a large amount of it), which is wasteful. 
On the other hand, if + you only inline very small amounts of data then you are wasting space by + padding out filesystem blocks with zeros. There is a balance to be struck, + and `mkcomposefs` uses a "heuristic" of half a block size as the inlining + limit. I've performed simulations which show that this value is fairly + close to ideal for a random distribution of file sizes, starting inode + alignment and xattr content sizes. + +#### `EROFS_INODE_FLAT_CHUNK_BASED` + +In this case, the `i_u` field isn't a block reference but is instead split into +sub-fields. The main gist of it, though, is that this stores the log2 of the +number of blocks per chunk (maximum of 31). + +So if you write 4 here, then there are 16 blocks in each chunk. + +The references to the chunks are then written as the inline data, 4 bytes per +chunk, as block indexes (to the starting block). I'm not sure if that's +measured in blocks or in chunks, because the only reason we use this feature is +for a special purpose: null chunks. + +If a chunk index is written as -1 (ie: 0xffffffff) then it refers to a "null" +chunk of the given size. This effectively gets you support for sparse files. + +For the sparse file use-case there's no benefit to choosing anything other than +the maximum chunk format of 31 for the `i_u` field. The number of chunks you +need to write is determined by the file size, but for a 4096 byte block size +and a chunk format of 31 all files less than 8TB can be handled with a single +"chunk". + +#### Character and block devices + +If the `mode` field of the inode indicates that this is a device, then the data +layout isn't relevant, and the `i_u` field gets the `rdev` of the device. Note +that this is a 32-bit field, so 32-bit rdev. `size` is zero. + + +#### Fifos and sockets + +These have no storage at all. `i_u` is ignored and there is never inline data. +`size` should always be 0. + +## Directories + +The final thing that needs describing is how a directory gets stored. 
erofs +directories are the classical mapping from names to inodes, with the extra +'file type' field that gets returned via the `d_type` field in `struct dirent` +(to avoid needing to `stat()` the inode). + +The dirent structure has a size of 12 (and an alignment of 4) and looks like: +* `nid` (`u64`): the inode referred to by this entry +* `nameoff` (`u16`): an offset to the name (inside of this block). See below. +* `file_type` (`u8`): the filetype field for `d_type` + +The directory needs to explicitly include the `.` and `..` entries. All +entries (including `.` and `..`) are sorted in asciibetical order. Note: the +`.` and `..` are not handled specially and are not necessarily at the start: +they're in asciibetical order too. + +The directory entries are taken in their sorted order and split into blocks. +However many entries will fit into the first block go into the first block, and +so on. All blocks except for the last one are padded with zeros. A directory +has a specific encoded size (which ends up in the `size` field of the inode). +It is made from a number of complete blocks, times the blocksize, plus the size +of the (possible) trailing partial block (which might be inlined, depending on +the selected data layout). + +Each block is a number of dirent structs packed at the start, plus the entry +names referred to from those structs. The entry names must immediately follow +the structs, and each entry name must immediately follow the previous (with no +nul). The reason for that will become clear with our example: + +Let's consider an example directory with entries `.`, `..`, +`someverylongfilename`, `subdir`. To keep things interesting, let's further +imagine that our filesystem block size is 32 bytes. + +We segment into blocks by taking entries until no more entries fit. 
Each entry +is the 12 byte dirent struct, plus the name, so: +* `.`: (12 + 1) = 13 → 13 total bytes +* `..`: (12 + 2) = 14 → 27 total bytes +* `someverylongfilename`: (12 + 20) = 32 → too big, won't fit. + +So we know that the first directory block will contain `.` and `..`. It looks like: +* offset `0`: the dirent struct for `.`, `nameoff` is `24`. +* offset `12`: the dirent struct for `..`, `nameoff` is `25`. +* offset `24`: `.` +* offset `25`: `..` +* offset `27`: padded with `nul` + +The `nameoff` fields are more important here than they seem. If we look at the +first `nameoff` field, it's `24`. That tells us that there are two entries in +this block (since the entry size is 12). We also know the length of the name +of the first entry because the name of the second entry starts right after it. + +How do we know the length of the name of the last entry? One of three ways: +* if this is the final block of the directory, then the overall size of the + directory (in the inode `size` field) will indicate where the final name + must surely terminate +* if this is a non-final block, it might be that the name fits exactly into + the block size. In that case, the end of the name is the end of the block. +* if this is a non-final block, and the name doesn't fit exactly into the + block size then it means we'll have added some padding. In this case the + name is `nul`-terminated. That's the case for our `..` entry here. + +Now let's do our next block: +* `someverylongfilename` (12 + 20) = 32 → 32 total bytes +* `subdir` (12 + 6) = 18 → too big, won't fit. + +So we only get one entry in this block. The layout is: +* offset `0`: the dirent struct for `someverylongfilename`, `nameoff` is `12`. +* offset `12`: `someverylongfilename` +* no padding, since we're already at 32 bytes. + +In this case we look at the `nameoff` of the first entry (`12`) and know that +there must only be one entry in this block. 
And in this case, the name fills +the block exactly, so we won't find a `nul` terminator, and we know the name +must have a length of `20`. + +Finally, `subdir` gets put in the last partial block: +* offset `0`: the dirent struct, `nameoff` is `12` +* offset `12`: `subdir` +* offset `18`: that's the end of the directory + +What comes at offset `18`? Nothing. The `size` field of the directory is 2 +blocks (`2 * 32` = `64`) plus the `18` bytes from this block, so a total of +`82`. + +Of course, if we're storing the directory as "flat plain" or "chunk based" then +we need to pad this out to a complete block size (and we'll do that with +`nul`s), but those padding bytes are not conceptually part of the directory +content. But what if we stored it "flat inline"? We might have the next inode +directly following. In that case, we effectively depend on the inode `size` to +know that the final filename has a length of `6`. diff --git a/doc/image-format.md b/doc/image-format.md new file mode 100644 index 0000000..40356c4 --- /dev/null +++ b/doc/image-format.md @@ -0,0 +1,276 @@ +# Canonical composefs file format + +## Prelude + +We expect the process of creating an erofs from a filesystem image to be +deterministic. `erofs` is very free-form and there are many ways things could +be organized. + +Here's where we try to document some of the decisions we make. This documents +the erofs images produced by the `composefs` rust crate, which are currently +different from the official `composefs` repository (ie: `libcomposefs`, in C). +It would be very desirable to try to make this implementation exactly match the +`libcomposefs` implementation so that we could check them against each other to +ensure that they produce bitwise identical output. On the other hand, we've +been discussing creating a "version 1.1" format, and this might be a good +jumping-off spot for that. 
+ +The goal of this document is to completely and unambiguously document every +decision we made in such a way that you could use this document as a guide to +produce a new composefs erofs writer implementation, from scratch, which +produces exactly the same output. However, this document is probably currently +very incomplete, and maybe even incorrect. We should strive to cover every +possible detail here, but it's hard. Hopefully things will improve with time, +but until then, you might need to check the implementation. + +In cases of ambiguity or incorrectness, issues and patches are extremely +welcome. + +## Overall layout concept + +The composefs header and superblock are the only things that need to be at +fixed offsets. How do we organize everything else? + +Generally speaking, we perform these steps: +* collect the filesystem into a flat list of inodes +* collect and "share" xattrs, as appropriate +* write the composefs header and the superblock +* write the inodes directly following the superblock +* write the shared xattrs directly following the inodes +* then the blocks (only for directories) + +## Collecting inodes + +We collect the inodes into a flat list according to the following algorithm: +* our goal is to visit each inode, collecting it into the inode list as we + visit it, in the order that we visited it +* start at the root directory +* for each directory that we visit: + - the directory is stored first, then the children + - we visit the children in asciibetical order, regardless of file type + (ie: we interleave directories and regular files) + - when visiting a child directory, we store all content of the child + directory before returning to the parent directory (ie: depth first) +* in the case of hardlinks, the inode gets added to the list at the spot that + the first link was encountered + +Consider a filesystem tree + +``` + / + bin/ + cfsctl + usr/ + lib/ + libcomposefs.so + libglib-2.0.so + libexec/ + cfsctl +``` + +where `/bin/cfsctl` 
and `/usr/libexec/cfsctl` are hardlinks. + +In that case, we'd collect the inodes in this order: +1. `/` +1. `/bin/` +1. `/bin/cfsctl` (aka `/usr/libexec/cfsctl`) +1. `/usr/` +1. `/usr/lib/` +1. `/usr/lib/libcomposefs.so` +1. `/usr/lib/libglib-2.0.so` +1. `/usr/libexec/` + +(skipping `/usr/libexec/cfsctl` because we already had it by the time we encountered it). + +So that's 8 inodes, in that order. + +## Special handling for overlayfs + +Ultimately, the erofs image that we produce needs to be used as a layer in an +overlayfs stack. There are a lot of cases where the thing that we write out +only makes sense to overlayfs. There are other cases where we need to avoid +writing out things that overlayfs would treat as "special". + +`libcomposefs` writes 256 files named from `00` to `ff` into the root directory +as character devices with major/minor of (0, 0). Those are overlayfs whiteouts +and they are needed for older versions of overlayfs which don't support "data +only" layers. We don't target these versions, so *we don't add these files*. +We also don't mark the root directory as opaque or do anything else special +with it. + +Conversely, if we encounter a character device with major/minor (0, 0) then we +need to escape it to make sure that it appears as such in the final composed +image (and does not get handled by overlayfs as a whiteout). We do that by: +TODO (not implemented yet). + +We also need to make sure that the only `trusted.overlay.*` attributes which we +write are ones that came from us. If we encounter any `trusted.overlay.*` +attributes in the source, we escape them to `trusted.overlay.overlay.`, causing +them to lose their special meaning. + +## Extended attribute handling + +For each inode, we collect and write the extended attributes in asciibetical +order, by full name. Note: this is different than the shared xattr table which +has a more complicated sorting, but maybe we want to unify the two. 
+ +We use the hardcoded prefix indexes (which is actually mandatory). + +We don't use "long prefixes", but we might start doing that at some point, +because it would sure be nice to not have to write `"overlay.redirect"`, +`"overlay.metacopy"` and `"selinux"` over and over again. The feature seems +complicated, though... + +## Collecting shared xattrs + +`erofs` has a facility for sharing xattrs where the name and the value are +identical, and we use it. After we've collected all of our inodes, we iterate +the list and take note of all (name, value) pairs. If any (name, value) pair +appears more than once, we share it. + +The process of "sharing" involves modifying the original inode. We iterate the +present xattrs, and for each attribute that we share, we remove it from the +"inline" list and add it to the "shared" list, in the same order as it appeared +in the inline list. + +NB: this operation is performed on the flattened inode list, not the directory +tree. That means that if a particular (name, value) pair appears uniquely on +an inode with multiple hardlinks, we'll count that as a single occurrence and +it won't be shared. + +Note also: the attributes that we add ourselves are considered candidates for +sharing. That means that if we had two external files which were not hardlinks +but nevertheless contained the same data, we'd end up sharing their +`trusted.overlay.` attributes. + +## The composefs header + +`erofs` leaves the first 1024 bytes of the file free to us, and we store a +32-byte header at offset 0. The kernel ignores this, and our mount code +doesn't actually do anything with it at the moment, either. We try to fill it +out in the same way as `libcomposefs`: + +* `magic` (`u32`): `0xd078629a` +* `version` (`u32`): I think this is something like the overall file format + version. If this changes, then things are possibly incompatible, and maybe + this isn't even an `erofs` anymore. Currently `1`. 
+* `flags`: `0` +* `composefs_version`: I think this is something like a statement about the + current strategy for layout decisions. If this changes, the algorithm for + building the file has probably decided to put things in different places + (and the checksum of the file will have changed), but the result is still + understandable as an `erofs`. Currently `1`. + +## The superblock + +* `checksum`: we don't fill that out +* `feature_compat`: we set `MTIME` and `XATTR_FILTER` +* `blkszbits`: we use 12, for a block size of 4096 +* `root_nid`: that's going to end up being 36, which follows from the fact + that we put the root inode directly following the superblock, at offset + `1024 + 128` = `1152`. `1152 / 32` = `36`. +* `inos`: we currently set that to the number of inodes in the filesystem. + `libcomposefs` adds some extra file content (the `00`..`ff` whiteouts) so + it gets a larger number than we do. +* `blocks`: the total filesize, divided by 4096. +* `build_time`, `build_time_nsec`: since we only use extended format inodes, + these fields are meaningless and we currently set them to 0 (which is + different from `libcomposefs`). +* `meta_blkaddr`, `xattr_blkaddr`. We currently set both of these to 0 to + keep things simple. `libcomposefs` performs a complicated calculation to + set `meta_blkaddr` to zero as well (since the first inode directly follows + the superblock, it will always be within the first 4096 byte filesystem + block), but its complicated calculation for `xattr_blkaddr` might well land + on a non-zero value, so that's different from us. + +## The inodes + +After the superblock, we write the inodes. Some notes: + +* we only use extended inodes, because mtime is important to us and we + generally expect every file to have a unique mtime. This is a difference + from `libcomposefs`. 
+ +* we use a "chunk based" data layout for non-inline regular files: + + - the way this works in overlayfs, we want to store a correctly-sized + sparse file in the upper layer. This lets us have the correct `size` + field on the inode, so we don't need to interact with the data layer in + order to do `stat()`. + + - we set the chunk format (ie: the `i_u` field) to 31, the maximum + + - we store a single "null" chunk pointer + + - this corresponds to a chunk size of 8TB, which is then the upper limit + of files we can store + + - `libcomposefs` tries to take the smallest chunk format value which will + get the job done with a single chunk pointer, and will write multiple + chunk pointers if necessary (for extremely large files). Maybe we + should do that too. + + - in this case we set the `trusted.overlay.metacopy` and + `trusted.overlay.redirect` attributes (in that order) on the file. + These attributes are written first, before the other attributes that + would be present on the same file (which are otherwise in sorted + order). + + - the `trusted.overlay.metacopy` attribute is 36 bytes long, and is set to: + + the 4-byte header: [0, 36, 0, 1] + + the 32-byte SHA256 fs-verity digest + + - the `trusted.overlay.redirect` attribute is set to the string + `"/xx/yyyy..."` where `xx` is the first two lowercase hexadecimal digits + of the fs-verity digest and the `yyyy...` is the rest. That's just a + reference into the `objects/` subdirectory of the repository (which is + mounted in the overlayfs stack as the data layer). + +* we use a "flat inline" data layout for all other inodes: + + - for character and block devices, as well as fifos and sockets this is + meaningless, but we need to set something + + - for inline regular files we store the content inline. This will break + if we try to inline a file larger than 4095 characters, but our current + cut-off is 64. + + - for symlinks this means that the link target gets stored inline. 
+ Hopefully we don't have symlinks with targets longer than 4095 + characters, or we're gonna get in trouble. + + - directories may well be larger than 4096 bytes, so we might end up + needing to store blocks for those. These follow the "shared xattrs" + area. We could probably set "flat plain" for directories that are an + exact multiple of 4096 bytes in size, and `libcomposefs` does that, but + we don't bother. + +We pad the last inode to the required alignment for inodes, even though it is +generally followed by a shared xattr (which has a less stringent alignment +requirement). + +## The shared xattrs + +There's not much left to be said about these. We currently write them out in +the order that `collections::BTreeMap` applies to our `struct XAttr`, which I +think basically ends up sorting them by prefix index, then by suffix, then by +value. We might like to firm that up at some point. This is notably different +than the sorting applied to the attributes as they appear in the inodes, and we +also don't give any special treatment to the `trusted.overlay.` attributes that +we added: they're sorted here in the usual way. + +After we do this, and even if there were no shared xattrs, we always pad up to a +4096 byte boundary, even if there are no data blocks. That means that the +filesystem image will always be a multiple of 4096. + +## The blocks + +Now come the data blocks. These are written in sequence for each inode, +according to the sequence of the inode in the inode list. Due to our use of +"flat inline" data layout, only full blocks are stored (although they may have +included inter-block padding in directories), so we keep 4096-byte alignment +from here on out. + +## The end + +That's it. The file is over now. We'll have ended on a multiple of 4096.
diff --git a/src/bin/cfsctl.rs b/src/bin/cfsctl.rs index d62a03e..1e0cc5b 100644 --- a/src/bin/cfsctl.rs +++ b/src/bin/cfsctl.rs @@ -97,6 +97,8 @@ enum Command { } fn main() -> Result<()> { + env_logger::init(); + let args = App::parse(); let repo = (if let Some(path) = args.repo { diff --git a/src/bin/erofs-debug.rs b/src/bin/erofs-debug.rs new file mode 100644 index 0000000..e8afea4 --- /dev/null +++ b/src/bin/erofs-debug.rs @@ -0,0 +1,25 @@ +use std::{fs::File, io::Read, path::PathBuf}; + +use clap::Parser; + +use composefs::erofs::debug::debug_img; + +/// Produce a detailed dump of an entire erofs image +/// +/// The output is in a diff-friendly format, such that every distinct image produces a distinct +/// output (ie: an injective mapping). This is useful for determining the exact ways in which two +/// different images are different. +#[derive(Parser)] +struct Args { + /// The path to the image file to dump + image: PathBuf, +} + +fn main() { + let args = Args::parse(); + let mut image = File::open(args.image).expect("Opening file"); + + let mut data = vec![]; + image.read_to_end(&mut data).expect("read_to_end() failed"); + debug_img(&data); +} diff --git a/src/erofs/debug.rs b/src/erofs/debug.rs new file mode 100644 index 0000000..1cc082c --- /dev/null +++ b/src/erofs/debug.rs @@ -0,0 +1,377 @@ +use core::mem::offset_of; +use std::{ + collections::BTreeMap, + ffi::OsStr, + mem::discriminant, + os::unix::ffi::OsStrExt, + path::{Path, PathBuf}, +}; + +use zerocopy::{Immutable, KnownLayout, TryFromBytes}; + +use super::{ + format::{self, CompactInodeHeader, ComposefsHeader, ExtendedInodeHeader, Superblock}, + reader::{DirectoryBlock, Image, Inode, InodeHeader, InodeOps, InodeType, InodeXAttrs, XAttr}, +}; + +macro_rules! 
print_fields { + ($ty: ty, $s: expr, $f: ident) => {{ + let value = &$s.$f; + let default = if false { value } else { &Default::default() }; + if value != default { + println!(" +{:02x} {}: {:?}", offset_of!($ty, $f), stringify!($f), value); + } + }}; + ($ty: ty, $s:expr, $head: ident; $($tail: ident);+) => {{ + print_fields!($ty, $s, $head); + print_fields!($ty, $s, $($tail);+); + }}; +} + +fn utf8_or_hex(data: &[u8]) -> String { + if let Ok(str) = std::str::from_utf8(data) { + format!("\"{str}\"") + } else { + hex::encode(data) + } +} + +// This is basically just a fancy fat pointer type +enum SegmentType<'img> { + Header(&'img ComposefsHeader), + Superblock(&'img Superblock), + CompactInode(&'img Inode), + ExtendedInode(&'img Inode), + XAttr(&'img XAttr), + DataBlock(&'img [u8]), + DirectoryBlock(&'img DirectoryBlock), +} + +// TODO: Something for `enum_dispatch` would be good here, but I couldn't get it working... +impl SegmentType<'_> { + fn addr(&self) -> usize { + match self { + SegmentType::Header(h) => &raw const **h as usize, + SegmentType::Superblock(sb) => &raw const **sb as usize, + SegmentType::CompactInode(i) => &raw const **i as *const u8 as usize, + SegmentType::ExtendedInode(i) => &raw const **i as *const u8 as usize, + SegmentType::XAttr(x) => &raw const **x as *const u8 as usize, + SegmentType::DataBlock(b) => &raw const **b as *const u8 as usize, + SegmentType::DirectoryBlock(b) => &raw const **b as *const u8 as usize, + } + } + + fn size(&self) -> usize { + match self { + SegmentType::Header(h) => size_of_val(*h), + SegmentType::Superblock(sb) => size_of_val(*sb), + SegmentType::CompactInode(i) => size_of_val(*i), + SegmentType::ExtendedInode(i) => size_of_val(*i), + SegmentType::XAttr(x) => size_of_val(*x), + SegmentType::DataBlock(b) => size_of_val(*b), + SegmentType::DirectoryBlock(b) => size_of_val(*b), + } + } +} + +#[repr(C)] +#[derive(TryFromBytes, KnownLayout, Immutable)] +struct DataBlock([u8]); + +struct ImageVisitor<'img> { + image: 
&'img Image<'img>, + visited: BTreeMap, Vec>)>, +} + +impl<'img> ImageVisitor<'img> { + fn note(&mut self, segment: SegmentType<'img>, path: Option<&Path>) -> bool { + let offset = segment.addr() - self.image.image.as_ptr() as usize; + match self.visited.entry(offset) { + std::collections::btree_map::Entry::Occupied(mut e) => { + let (existing, paths) = e.get_mut(); + // TODO: figure out pointer value equality... + assert_eq!(discriminant(existing), discriminant(&segment)); + assert_eq!(existing.addr(), segment.addr()); + assert_eq!(existing.size(), segment.size()); + if let Some(path) = path { + paths.push(Box::from(path)); + } + true + } + std::collections::btree_map::Entry::Vacant(e) => { + let mut paths = vec![]; + if let Some(path) = path { + paths.push(Box::from(path)); + } + e.insert((segment, paths)); + false + } + } + } + + fn visit_directory_block(&mut self, block: &DirectoryBlock, path: &Path) { + for entry in block.entries() { + if entry.name == b"." || entry.name == b".." { + // TODO: maybe we want to follow those and let deduplication happen + continue; + } + self.visit_inode(entry.inode, &path.join(OsStr::from_bytes(entry.name))); + } + } + + fn visit_inode(&mut self, id: u64, path: &Path) { + let inode = self.image.inode(id); + let segment = match inode { + InodeType::Compact(inode) => SegmentType::CompactInode(inode), + InodeType::Extended(inode) => SegmentType::ExtendedInode(inode), + }; + if self.note(segment, Some(path)) { + // TODO: maybe we want to throw an error if we detect loops + /* already processed */ + return; + } + + if let Some(xattrs) = inode.xattrs() { + for id in xattrs.shared() { + self.note( + SegmentType::XAttr(self.image.shared_xattr(id.get())), + Some(path), + ); + } + } + + if inode.mode() & format::S_IFMT == format::S_IFDIR { + let inline = inode.inline(); + if !inline.is_empty() { + let inline_block = DirectoryBlock::try_ref_from_bytes(inode.inline()).unwrap(); + self.visit_directory_block(inline_block, path); + } + + for 
id in inode.blocks(self.image.blkszbits) { + let block = self.image.directory_block(id); + self.visit_directory_block(block, path); + self.note(SegmentType::DirectoryBlock(block), Some(path)); + } + } else { + for id in inode.blocks(self.image.blkszbits) { + let block = self.image.data_block(id); + self.note(SegmentType::DataBlock(block), Some(path)); + } + } + } + + fn visit_image( + image: &'img Image<'img>, + ) -> BTreeMap, Vec>)> { + let mut this = Self { + image, + visited: BTreeMap::new(), + }; + this.note(SegmentType::Header(image.header), None); + this.note(SegmentType::Superblock(image.sb), None); + this.visit_inode(image.sb.root_nid.get() as u64, &PathBuf::from("/")); + this.visited + } +} + +pub fn print_paths(paths: &[Box]) { + match paths { + [] => {} + [one] => { + println!(" filename: {one:?}"); + } + many => { + println!(" links:"); + many.iter() + .for_each(|one| println!(" - {one:?}")); + } + } +} + +impl std::fmt::Debug for XAttr { + // Injective (ie: accounts for every byte in the input) + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + f, + "({} {} {}) {}{} = {}", + self.header.name_index, + self.header.name_len, + self.header.value_size, + std::str::from_utf8(format::XATTR_PREFIXES[self.header.name_index as usize]).unwrap(), + utf8_or_hex(self.suffix()), + utf8_or_hex(self.value()), + )?; + if self.padding().iter().any(|c| *c != 0) { + write!(f, " {:?}", self.padding())?; + } + Ok(()) + } +} + +// This accounts for every bytes of InodeXAttrs +fn print_xattrs(xattrs: Option<&InodeXAttrs>) { + let Some(xattrs) = xattrs else { + return; + }; + + if !xattrs.shared().is_empty() { + print!(" Shared xattrs:"); + for id in xattrs.shared() { + print!(" {id}"); + } + println!(); + } + println!(" Local xattrs:"); + for xattr in xattrs.local() { + println!(" - {:?}", xattr); + } +} + +fn hexdump(block: &[u8]) { + for row in 0..((block.len() + 15) / 16) { + let offset = row * 16; + print!(" +{offset:04x} "); + for idx in 
offset..(offset + 16) { + if idx < block.len() { + print!("{:02x} ", block[idx]); + } else { + print!(" "); + } + if idx % 8 == 7 { + print!(" "); + } + } + print!("|"); + for idx in offset..(offset + 16) { + if idx < block.len() { + let c = block[idx]; + if c.is_ascii() && !c.is_ascii_control() { + print!("{}", c as char); + } else { + print!("."); + } + } else { + print!(" "); + } + } + println!("|"); + } +} + +pub fn print_directory_block(block: &DirectoryBlock) { + for entry in block.entries() { + println!( + " {} {:?} -> {}", + utf8_or_hex(entry.name), + entry.file_type, + entry.inode + ); + } +} + +fn print_inode_extra(inode: impl InodeOps + InodeHeader) { + print_xattrs(inode.xattrs()); + let inline = inode.inline(); + if !inline.is_empty() { + if inode.mode() & format::S_IFMT == format::S_IFDIR { + let block = DirectoryBlock::try_ref_from_bytes(inline).unwrap(); + print_directory_block(block); + } else { + hexdump(inode.inline()); + } + } +} + +pub fn debug_img(data: &[u8]) { + let image = Image::open(data); + let visited = ImageVisitor::visit_image(&image); + + let mut offset = 0; + for (start, (segment, paths)) in visited { + if offset > start { + println!("*** Overlapping segments!"); + offset = start; + } + if offset < start { + println!("{offset:08x} Padding"); + let padding = &data[offset..start]; + if padding.iter().all(|c| *c == 0) { + println!(" {} * nul", padding.len()); + } else { + println!(" {:?}", padding); + } + println!(); + offset = start; + } + + match segment { + SegmentType::Header(header) => { + println!("{offset:08x} ComposefsHeader"); + print_fields!( + ComposefsHeader, header, + magic; flags; version; composefs_version; unused + ); + } + SegmentType::Superblock(sb) => { + println!("{offset:08x} Superblock"); + print_fields!( + Superblock, sb, + magic; checksum; feature_compat; blkszbits; extslots; root_nid; inos; build_time; + build_time_nsec; blocks; meta_blkaddr; xattr_blkaddr; uuid; volume_name; + feature_incompat; 
available_compr_algs; extra_devices; devt_slotoff; dirblkbits; + xattr_prefix_count; xattr_prefix_start; packed_nid; xattr_filter_reserved; reserved2 + ); + } + SegmentType::CompactInode(inode) => { + println!("{offset:08x} Inode (compact) #{}", offset / 32); // TODO: doesn't take metablk into account + print_paths(&paths); + print_fields!( + CompactInodeHeader, inode.header, + format; xattr_icount; mode; reserved; size; u; ino; uid; gid; nlink; reserved2; + reserved2 + ); + print_inode_extra(inode); + } + SegmentType::ExtendedInode(inode) => { + println!("{offset:08x} Inode (extended) #{}", offset / 32); // TODO: doesn't take metablk into account + print_paths(&paths); + print_fields!( + ExtendedInodeHeader, inode.header, + format; xattr_icount; mode; reserved; size; u; ino; uid; gid; mtime; mtime_nsec; nlink; + reserved2 + ); + print_inode_extra(inode); + } + SegmentType::XAttr(xattr) => { + println!("{offset:08x} XAttr #{}", offset / 4); // TODO: doesn't take xattrblk into account + print_paths(&paths); + println!(" {:?}", xattr); + } + SegmentType::DirectoryBlock(block) => { + println!("{offset:08x} Directory block"); + print_paths(&paths); + print_directory_block(block); + } + SegmentType::DataBlock(block) => { + println!("{offset:08x} Data block"); + print_paths(&paths); + hexdump(block); + } + } + println!(); + + offset = start + segment.size(); + } + if offset < data.len() { + println!("{offset:08x} Padding"); + let padding = &data[offset..data.len()]; + if padding.iter().any(|c| *c != 0) { + println!(" {:?}", padding); + } + println!(); + } + + if offset > data.len() { + println!("*** Segments past EOF!"); + } +} diff --git a/src/erofs/format.rs b/src/erofs/format.rs new file mode 100644 index 0000000..9927ca5 --- /dev/null +++ b/src/erofs/format.rs @@ -0,0 +1,279 @@ +use zerocopy::{ + little_endian::{U16, U32, U64}, + Immutable, IntoBytes, KnownLayout, TryFromBytes, +}; + +#[derive(Debug)] +pub enum FormatError { + InvalidDataLayout, +} + +pub const 
BLOCK_BITS: u8 = 12; +pub const BLOCK_SIZE: usize = 1 << BLOCK_BITS; + +/* composefs Header */ + +pub const COMPOSEFS_VERSION: U32 = U32::new(1); +pub const COMPOSEFS_MAGIC: U32 = U32::new(0xd078629a); + +#[derive(Debug, Immutable, IntoBytes, TryFromBytes)] +#[repr(u32)] +pub enum ComposefsFlags { + HasAcl = 1 << 0, +} + +#[derive(Debug, Default, Immutable, IntoBytes, KnownLayout, TryFromBytes)] +#[repr(C)] +pub struct ComposefsHeader { + pub magic: U32, + pub version: U32, + pub flags: U32, + pub composefs_version: U32, + pub unused: [U32; 4], +} + +/* Superblock */ + +pub const MAGIC_V1: U32 = U32::new(0xE0F5E1E2); +pub const FEATURE_COMPAT_MTIME: U32 = U32::new(2); +pub const FEATURE_COMPAT_XATTR_FILTER: U32 = U32::new(4); + +#[derive(Debug, Default, Immutable, IntoBytes, KnownLayout, TryFromBytes)] +#[repr(C)] +pub struct Superblock { + // vertical whitespace every 16 bytes (hexdump-friendly) + pub magic: U32, + pub checksum: U32, + pub feature_compat: U32, + pub blkszbits: u8, + pub extslots: u8, + pub root_nid: U16, + + pub inos: U64, + pub build_time: U64, + + pub build_time_nsec: U32, + pub blocks: U32, + pub meta_blkaddr: U32, + pub xattr_blkaddr: U32, + + pub uuid: [u8; 16], + + pub volume_name: [u8; 16], + + pub feature_incompat: U32, + pub available_compr_algs: U16, + pub extra_devices: U16, + pub devt_slotoff: U16, + pub dirblkbits: u8, + pub xattr_prefix_count: u8, + pub xattr_prefix_start: U32, + + pub packed_nid: U64, + pub xattr_filter_reserved: u8, + pub reserved2: [u8; 23], +} + +/* Inodes */ + +#[derive(Debug, Default, Immutable, IntoBytes, KnownLayout, TryFromBytes)] +#[repr(C)] +pub struct CompactInodeHeader { + pub format: FormatField, + pub xattr_icount: U16, + pub mode: U16, + pub nlink: U16, + + pub size: U32, + pub reserved: U32, + + pub u: U32, + pub ino: U32, // only used for 32-bit stat compatibility + + pub uid: U16, + pub gid: U16, + pub reserved2: [u8; 4], +} + +#[derive(Debug, Default, Immutable, IntoBytes, KnownLayout, 
TryFromBytes)] +#[repr(C)] +pub struct ExtendedInodeHeader { + pub format: FormatField, + pub xattr_icount: U16, + pub mode: U16, + pub reserved: U16, + pub size: U64, + + pub u: U32, + pub ino: U32, // only used for 32-bit stat compatibility + pub uid: U32, + pub gid: U32, + + pub mtime: U64, + + pub mtime_nsec: U32, + pub nlink: U32, + + pub reserved2: [u8; 16], +} + +#[derive(Debug, Default, Immutable, KnownLayout, IntoBytes, TryFromBytes)] +#[repr(C)] +pub struct InodeXAttrHeader { + pub name_filter: U32, + pub shared_count: u8, + pub reserved: [u8; 7], +} + +#[derive(Clone, Copy, Immutable, KnownLayout, IntoBytes, PartialEq, TryFromBytes)] +pub struct FormatField(U16); + +impl Default for FormatField { + fn default() -> Self { + Self(0xffff.into()) + } +} + +impl std::fmt::Debug for FormatField { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + f, + "{} = {:?} | {:?}", + self.0.get(), + InodeLayout::from(*self), + DataLayout::try_from(*self) + ) + } +} + +const INODE_LAYOUT_MASK: u16 = 0b00000001; +const INODE_LAYOUT_COMPACT: u16 = 0; +const INODE_LAYOUT_EXTENDED: u16 = 1; + +#[derive(Debug)] +#[repr(u16)] +pub enum InodeLayout { + Compact = INODE_LAYOUT_COMPACT, + Extended = INODE_LAYOUT_EXTENDED, +} + +impl From for InodeLayout { + fn from(value: FormatField) -> Self { + match value.0.get() & INODE_LAYOUT_MASK { + INODE_LAYOUT_COMPACT => InodeLayout::Compact, + INODE_LAYOUT_EXTENDED => InodeLayout::Extended, + _ => unreachable!(), + } + } +} + +const INODE_DATALAYOUT_MASK: u16 = 0b00001110; +const INODE_DATALAYOUT_FLAT_PLAIN: u16 = 0; +const INODE_DATALAYOUT_FLAT_INLINE: u16 = 4; +const INODE_DATALAYOUT_CHUNK_BASED: u16 = 8; + +#[derive(Debug)] +#[repr(u16)] +pub enum DataLayout { + FlatPlain = 0, + FlatInline = 4, + ChunkBased = 8, +} + +impl TryFrom for DataLayout { + type Error = FormatError; + + fn try_from(value: FormatField) -> Result { + match value.0.get() & INODE_DATALAYOUT_MASK { + INODE_DATALAYOUT_FLAT_PLAIN => 
Ok(DataLayout::FlatPlain), + INODE_DATALAYOUT_FLAT_INLINE => Ok(DataLayout::FlatInline), + INODE_DATALAYOUT_CHUNK_BASED => Ok(DataLayout::ChunkBased), + _ => Err(FormatError::InvalidDataLayout), + } + } +} + +impl From<(InodeLayout, DataLayout)> for FormatField { + fn from(value: (InodeLayout, DataLayout)) -> FormatField { + FormatField( + (match value.0 { + InodeLayout::Compact => INODE_LAYOUT_COMPACT, + InodeLayout::Extended => INODE_LAYOUT_EXTENDED, + } | match value.1 { + DataLayout::FlatPlain => INODE_DATALAYOUT_FLAT_PLAIN, + DataLayout::FlatInline => INODE_DATALAYOUT_FLAT_INLINE, + DataLayout::ChunkBased => INODE_DATALAYOUT_CHUNK_BASED, + }) + .into(), + ) + } +} + +/* Extended attributes */ +pub const XATTR_FILTER_SEED: u32 = 0x25BBE08F; + +#[derive(Debug, Immutable, IntoBytes, KnownLayout, TryFromBytes)] +#[repr(C)] +pub struct XAttrHeader { + pub name_len: u8, + pub name_index: u8, + pub value_size: U16, +} + +pub const XATTR_PREFIXES: [&[u8]; 7] = [ + b"", + b"user.", + b"system.posix_acl_access", + b"system.posix_acl_default", + b"trusted.", + b"lustre.", + b"security.", +]; + +/* Directories */ + +#[derive(Clone, Copy, Debug, Default, Immutable, IntoBytes, TryFromBytes)] +#[repr(u8)] +pub enum FileType { + #[default] + Unknown, + RegularFile, + Directory, + CharacterDevice, + BlockDevice, + Fifo, + Socket, + Symlink, +} +pub const S_IFMT: u16 = 0o170000; +pub const S_IFREG: u16 = 0o100000; +pub const S_IFCHR: u16 = 0o020000; +pub const S_IFDIR: u16 = 0o040000; +pub const S_IFBLK: u16 = 0o060000; +pub const S_IFIFO: u16 = 0o010000; +pub const S_IFLNK: u16 = 0o120000; +pub const S_IFSOCK: u16 = 0o140000; + +impl FileType { + pub fn to_ifmt(&self) -> u16 { + match self { + Self::RegularFile => S_IFREG, + Self::CharacterDevice => S_IFCHR, + Self::Directory => S_IFDIR, + Self::BlockDevice => S_IFBLK, + Self::Fifo => S_IFIFO, + Self::Symlink => S_IFLNK, + Self::Socket => S_IFSOCK, + Self::Unknown => unreachable!(), + } + } +} + +#[derive(Debug, Default, 
Immutable, IntoBytes, KnownLayout, TryFromBytes)] +#[repr(C)] +pub struct DirectoryEntryHeader { + pub inode_offset: U64, + pub name_offset: U16, + pub file_type: FileType, // TODO: change to u8 for trivial transmute? + pub reserved: u8, +} diff --git a/src/erofs/mod.rs b/src/erofs/mod.rs new file mode 100644 index 0000000..8c0cc51 --- /dev/null +++ b/src/erofs/mod.rs @@ -0,0 +1,3 @@ +pub mod debug; +pub mod format; +pub mod reader; diff --git a/src/erofs/reader.rs b/src/erofs/reader.rs new file mode 100644 index 0000000..942e0e7 --- /dev/null +++ b/src/erofs/reader.rs @@ -0,0 +1,460 @@ +use core::mem::size_of; +use std::ops::Range; + +use zerocopy::{little_endian::U32, Immutable, KnownLayout, TryFromBytes}; + +use super::format::{ + CompactInodeHeader, ComposefsHeader, DataLayout, DirectoryEntryHeader, ExtendedInodeHeader, + FileType, InodeXAttrHeader, Superblock, XAttrHeader, +}; + +fn round_up(n: usize, to: usize) -> usize { + (n + to - 1) & !(to - 1) +} + +pub trait InodeHeader { + fn data_layout(&self) -> DataLayout; + fn xattr_icount(&self) -> u16; + fn mode(&self) -> u16; + fn size(&self) -> u64; + fn u(&self) -> u32; + + fn additional_bytes(&self, blkszbits: u8) -> usize { + let block_size = 1 << blkszbits; + self.xattr_size() + + match self.data_layout() { + DataLayout::FlatPlain => 0, + DataLayout::FlatInline => self.size() as usize % block_size, + DataLayout::ChunkBased => 4, + } + } + + fn xattr_size(&self) -> usize { + match self.xattr_icount() { + 0 => 0, + n => (n as usize - 1) * 4 + 12, + } + } +} + +impl InodeHeader for ExtendedInodeHeader { + fn data_layout(&self) -> DataLayout { + self.format.try_into().unwrap() + } + + fn xattr_icount(&self) -> u16 { + self.xattr_icount.get() + } + + fn mode(&self) -> u16 { + self.mode.get() + } + + fn size(&self) -> u64 { + self.size.get() + } + + fn u(&self) -> u32 { + self.u.get() + } +} + +impl InodeHeader for CompactInodeHeader { + fn data_layout(&self) -> DataLayout { + self.format.try_into().unwrap() + } 
+ + fn xattr_icount(&self) -> u16 { + self.xattr_icount.get() + } + + fn mode(&self) -> u16 { + self.mode.get() + } + + fn size(&self) -> u64 { + self.size.get() as u64 + } + + fn u(&self) -> u32 { + self.u.get() + } +} + +#[repr(C)] +#[derive(TryFromBytes, KnownLayout, Immutable)] +pub struct XAttr { + pub header: XAttrHeader, + pub data: [u8], +} + +#[repr(C)] +#[derive(Debug, TryFromBytes, KnownLayout, Immutable)] +pub struct Inode { + pub header: Header, + pub data: [u8], +} + +#[repr(C)] +#[derive(Debug, TryFromBytes, KnownLayout, Immutable)] +pub struct InodeXAttrs { + pub header: InodeXAttrHeader, + pub data: [u8], +} + +impl XAttrHeader { + pub fn calculate_n_elems(&self) -> usize { + round_up(self.name_len as usize + self.value_size.get() as usize, 4) + } +} + +impl XAttr { + pub fn from_prefix(data: &[u8]) -> (&XAttr, &[u8]) { + let header = XAttrHeader::try_ref_from_bytes(&data[..4]).unwrap(); + Self::try_ref_from_prefix_with_elems(data, header.calculate_n_elems()).unwrap() + } + + pub fn suffix(&self) -> &[u8] { + &self.data[..self.header.name_len as usize] + } + + pub fn value(&self) -> &[u8] { + &self.data[self.header.name_len as usize..][..self.header.value_size.get() as usize] + } + + pub fn padding(&self) -> &[u8] { + &self.data[self.header.name_len as usize + self.header.value_size.get() as usize..] + } +} + +pub trait InodeOps { + fn xattrs(&self) -> Option<&InodeXAttrs>; + fn inline(&self) -> &[u8]; + fn blocks(&self, blkszbits: u8) -> Range; +} + +impl InodeHeader for &Inode
{ + fn data_layout(&self) -> DataLayout { + self.header.data_layout() + } + + fn xattr_icount(&self) -> u16 { + self.header.xattr_icount() + } + + fn mode(&self) -> u16 { + self.header.mode() + } + + fn size(&self) -> u64 { + self.header.size() + } + + fn u(&self) -> u32 { + self.header.u() + } +} + +impl InodeOps for &Inode
{ + fn xattrs(&self) -> Option<&InodeXAttrs> { + match self.header.xattr_size() { + 0 => None, + n => Some(InodeXAttrs::try_ref_from_bytes(&self.data[..n]).unwrap()), + } + } + + fn inline(&self) -> &[u8] { + &self.data[self.header.xattr_size()..] + } + + fn blocks(&self, blkszbits: u8) -> Range { + let size = self.header.size(); + let block_size = 1 << blkszbits; + let start = self.header.u() as u64; + + match self.header.data_layout() { + DataLayout::FlatPlain => Range { + start, + end: start + size.div_ceil(block_size), + }, + DataLayout::FlatInline => Range { + start, + end: start + size / block_size, + }, + DataLayout::ChunkBased => Range { start, end: start }, + } + } +} + +// this lets us avoid returning Box from Image.inode() +// but ... wow. +#[derive(Debug)] +pub enum InodeType<'img> { + Compact(&'img Inode), + Extended(&'img Inode), +} + +impl InodeHeader for InodeType<'_> { + fn u(&self) -> u32 { + match self { + Self::Compact(inode) => inode.u(), + Self::Extended(inode) => inode.u(), + } + } + + fn size(&self) -> u64 { + match self { + Self::Compact(inode) => inode.size(), + Self::Extended(inode) => inode.size(), + } + } + + fn xattr_icount(&self) -> u16 { + match self { + Self::Compact(inode) => inode.xattr_icount(), + Self::Extended(inode) => inode.xattr_icount(), + } + } + + fn data_layout(&self) -> DataLayout { + match self { + Self::Compact(inode) => inode.data_layout(), + Self::Extended(inode) => inode.data_layout(), + } + } + + fn mode(&self) -> u16 { + match self { + Self::Compact(inode) => inode.mode(), + Self::Extended(inode) => inode.mode(), + } + } +} + +impl InodeOps for InodeType<'_> { + fn xattrs(&self) -> Option<&InodeXAttrs> { + match self { + Self::Compact(inode) => inode.xattrs(), + Self::Extended(inode) => inode.xattrs(), + } + } + + fn inline(&self) -> &[u8] { + match self { + Self::Compact(inode) => inode.inline(), + Self::Extended(inode) => inode.inline(), + } + } + + fn blocks(&self, blkszbits: u8) -> Range { + match self { + 
Self::Compact(inode) => inode.blocks(blkszbits), + Self::Extended(inode) => inode.blocks(blkszbits), + } + } +} + +#[derive(Debug)] +pub struct Image<'i> { + pub image: &'i [u8], + pub header: &'i ComposefsHeader, + pub blkszbits: u8, + pub block_size: usize, + pub sb: &'i Superblock, + pub inodes: &'i [u8], + pub xattrs: &'i [u8], +} + +impl<'img> Image<'img> { + pub fn open(image: &'img [u8]) -> Self { + let header = ComposefsHeader::try_ref_from_prefix(image) + .expect("header err") + .0; + let sb = Superblock::try_ref_from_prefix(&image[1024..]) + .expect("superblock err") + .0; + let blkszbits = sb.blkszbits; + let block_size = 1usize << blkszbits; + assert!(block_size != 0); + let inodes = &image[sb.meta_blkaddr.get() as usize * block_size..]; + let xattrs = &image[sb.xattr_blkaddr.get() as usize * block_size..]; + Image { + image, + header, + blkszbits, + block_size, + sb, + inodes, + xattrs, + } + } + + pub fn inode(&self, id: u64) -> InodeType { + let inode_data = &self.inodes[id as usize * 32..]; + if inode_data[0] & 1 != 0 { + let header = ExtendedInodeHeader::try_ref_from_bytes(&inode_data[..64]).unwrap(); + InodeType::Extended( + Inode::::try_ref_from_prefix_with_elems( + inode_data, + header.additional_bytes(self.blkszbits), + ) + .unwrap() + .0, + ) + } else { + let header = CompactInodeHeader::try_ref_from_bytes(&inode_data[..32]).unwrap(); + InodeType::Compact( + Inode::::try_ref_from_prefix_with_elems( + inode_data, + header.additional_bytes(self.blkszbits), + ) + .unwrap() + .0, + ) + } + } + + pub fn shared_xattr(&self, id: u32) -> &XAttr { + let xattr_data = &self.xattrs[id as usize * 4..]; + let header = XAttrHeader::try_ref_from_bytes(&xattr_data[..4]).unwrap(); + XAttr::try_ref_from_prefix_with_elems(xattr_data, header.calculate_n_elems()) + .unwrap() + .0 + } + + pub fn data_block(&self, id: u64) -> &[u8] { + &self.image[id as usize * self.block_size..][..self.block_size] + } + + pub fn directory_block(&self, id: u64) -> &DirectoryBlock { + 
DirectoryBlock::try_ref_from_bytes(self.data_block(id)).unwrap() + } + + pub fn root(&self) -> InodeType { + self.inode(self.sb.root_nid.get() as u64) + } +} + +impl InodeXAttrs { + pub fn shared(&self) -> &[U32] { + // TODO: there must be an easier way... + #[derive(TryFromBytes, KnownLayout, Immutable)] + #[repr(C)] + struct U32Array([U32]); + &U32Array::try_ref_from_prefix_with_elems(&self.data, self.header.shared_count as usize) + .unwrap() + .0 + .0 + } + + pub fn local(&self) -> XAttrIter { + XAttrIter { + data: &self.data[self.header.shared_count as usize * 4..], + } + } +} + +#[derive(Debug)] +pub struct XAttrIter<'img> { + data: &'img [u8], +} + +impl<'img> Iterator for XAttrIter<'img> { + type Item = &'img XAttr; + + fn next(&mut self) -> Option { + if !self.data.is_empty() { + let (result, rest) = XAttr::from_prefix(self.data); + self.data = rest; + Some(result) + } else { + None + } + } +} + +#[repr(C)] +#[derive(Debug, Immutable, KnownLayout, TryFromBytes)] +pub struct DirectoryBlock { + pub data: [u8], +} + +impl DirectoryBlock { + pub fn get_entry_header(&self, n: usize) -> &DirectoryEntryHeader { + let entry_data = &self.data + [n * size_of::()..(n + 1) * size_of::()]; + DirectoryEntryHeader::try_ref_from_bytes(entry_data).unwrap() + } + + pub fn get_entry_headers(&self) -> &[DirectoryEntryHeader] { + // TODO: there must be an easier way... 
+ #[derive(TryFromBytes, KnownLayout, Immutable)] + #[repr(C)] + struct EntryArray([DirectoryEntryHeader]); + &EntryArray::try_ref_from_prefix_with_elems(&self.data, self.n_entries()) + .unwrap() + .0 + .0 + } + + pub fn n_entries(&self) -> usize { + let first = self.get_entry_header(0); + let offset = first.name_offset.get(); + assert!(offset != 0); + assert!(offset % 12 == 0); + offset as usize / 12 + } + + pub fn entries(&self) -> DirectoryEntries { + DirectoryEntries { + block: self, + length: self.n_entries(), + position: 0, + } + } +} + +// High-level iterator interface +#[derive(Debug)] +pub struct DirectoryEntry<'a> { + pub file_type: FileType, + pub name: &'a [u8], + pub inode: u64, +} + +#[derive(Debug)] +pub struct DirectoryEntries<'d> { + block: &'d DirectoryBlock, + length: usize, + position: usize, +} + +impl<'d> Iterator for DirectoryEntries<'d> { + type Item = DirectoryEntry<'d>; + + fn next(&mut self) -> Option { + if self.position < self.length { + let item = self.block.get_entry_header(self.position); + let name_start = item.name_offset.get() as usize; + self.position += 1; + + let name = if self.position == self.length { + let with_padding = &self.block.data[name_start..]; + let end = with_padding.partition_point(|c| *c != 0); + &with_padding[..end] + } else { + let next = self.block.get_entry_header(self.position); + let name_end = next.name_offset.get() as usize; + &self.block.data[name_start..name_end] + }; + + Some(DirectoryEntry { + name, + file_type: item.file_type, + inode: item.inode_offset.get(), + }) + } else { + None + } + } +} diff --git a/src/fs.rs b/src/fs.rs index 3430bde..ec99590 100644 --- a/src/fs.rs +++ b/src/fs.rs @@ -314,7 +314,7 @@ pub fn read_from_path(path: &Path, repo: Option<&Repository>) -> Result) -> Result { let fs = read_from_path(path, repo)?; - let image = super::image::mkcomposefs(fs)?; + let image = crate::mkfs::mkfs(&fs)?; if let Some(repo) = repo { Ok(repo.write_image(None, &image)?) 
} else { diff --git a/src/image.rs b/src/image.rs index 5205b23..a210760 100644 --- a/src/image.rs +++ b/src/image.rs @@ -3,15 +3,13 @@ use std::{ cmp::{Ord, Ordering}, collections::BTreeMap, ffi::{OsStr, OsString}, - io::Read, path::Path, - process::{Command, Stdio}, rc::Rc, }; use anyhow::{bail, Context, Result}; -use crate::{dumpfile::write_dumpfile, fsverity::Sha256HashValue}; +use crate::fsverity::Sha256HashValue; #[derive(Debug)] pub struct Stat { @@ -290,26 +288,3 @@ impl FileSystem { } } } - -pub fn mkcomposefs(filesystem: FileSystem) -> Result> { - let mut mkcomposefs = Command::new("mkcomposefs") - .args(["--from-file", "-", "-"]) - .stdin(Stdio::piped()) - .stdout(Stdio::piped()) - .spawn()?; - - let mut stdin = mkcomposefs.stdin.take().unwrap(); - write_dumpfile(&mut stdin, &filesystem)?; - drop(stdin); - - let mut stdout = mkcomposefs.stdout.take().unwrap(); - let mut image = vec![]; - stdout.read_to_end(&mut image)?; - drop(stdout); - - if !mkcomposefs.wait()?.success() { - bail!("mkcomposefs failed"); - }; - - Ok(image) -} diff --git a/src/lib.rs b/src/lib.rs index ff8f4e4..61507e6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,9 +2,11 @@ pub mod dumpfile; pub mod dumpfile_parse; +pub mod erofs; pub mod fs; pub mod fsverity; pub mod image; +pub mod mkfs; pub mod mount; pub mod oci; pub mod repository; diff --git a/src/mkfs.rs b/src/mkfs.rs new file mode 100644 index 0000000..cf27589 --- /dev/null +++ b/src/mkfs.rs @@ -0,0 +1,749 @@ +use std::{ + env, + io::Read, + process::{Command, Stdio}, +}; + +use anyhow::{bail, Result}; + +use crate::{dumpfile::write_dumpfile, image::FileSystem}; + +use std::{ + cell::RefCell, + collections::{BTreeMap, HashMap}, + mem::{align_of_val, size_of}, + os::unix::ffi::OsStrExt, + rc::Rc, +}; + +use log::debug; +use xxhash_rust::xxh32::xxh32; +use zerocopy::{Immutable, IntoBytes}; + +use crate::{ + erofs::{debug::debug_img, format}, + image, +}; + +fn round_up(n: usize, to: usize) -> usize { + (n + to - 1) & !(to - 1) +} 
+
+/// Kinds of position recorded while laying out the image.
+///
+/// The discriminant is used as an index into `Layout::offset_types`, so the variants must stay
+/// in the order in which `write_erofs` first notes them.
+#[derive(Clone, Copy, Debug)]
+enum Offset {
+    Header,
+    Superblock,
+    Inode,
+    XAttr,
+    Block,
+    End,
+}
+
+/// Sink used by the two writer passes.
+///
+/// `FirstPass` only measures and records offsets; `SecondPass` produces the bytes and answers
+/// `get()` from the offsets recorded by the first pass.
+trait Output {
+    /// Records the current position as the next offset of the given type.
+    fn note_offset(&mut self, offset_type: Offset);
+    /// Returns the `idx`th recorded byte offset of the given type.
+    fn get(&self, offset_type: Offset, idx: usize) -> usize;
+    /// Appends raw bytes.
+    fn write(&mut self, data: &[u8]);
+    /// Advances the position to the next multiple of `alignment`.
+    fn pad(&mut self, alignment: usize);
+    /// Current position (number of bytes emitted so far).
+    fn len(&self) -> usize;
+
+    /// Returns a recorded offset divided by `div`, asserting that it divides evenly.
+    fn get_div(&self, offset_type: Offset, idx: usize, div: usize) -> usize {
+        let offset = self.get(offset_type, idx);
+        assert_eq!(offset % div, 0);
+        offset / div
+    }
+
+    /// Inode reference ("nid") for inode `idx`: its byte offset in units of 32 bytes.
+    fn get_nid(&self, idx: usize) -> u64 {
+        self.get_div(Offset::Inode, idx, 32) as u64
+    }
+
+    /// Shared xattr reference for entry `idx`: its byte offset in units of 4 bytes.
+    ///
+    /// Panics if the reference does not fit in a `u32`.
+    fn get_xattr(&self, idx: usize) -> u32 {
+        self.get_div(Offset::XAttr, idx, 4).try_into().unwrap()
+    }
+
+    /// Writes a plain-data struct, asserting the position satisfies the struct's alignment.
+    fn write_struct(&mut self, st: impl IntoBytes + Immutable) {
+        assert_eq!(self.len() % align_of_val(&st), 0); // TODO: this is less than we want
+        self.write(st.as_bytes());
+    }
+}
+
+/// One extended attribute, with its name split into a prefix index and the remaining suffix.
+///
+/// The derived ordering is what `share_xattrs` uses to order the shared table.
+#[derive(PartialOrd, PartialEq, Eq, Ord, Clone)]
+struct XAttr {
+    prefix: u8,
+    suffix: Box<[u8]>,
+    value: Box<[u8]>,
+}
+
+/// The xattrs of a single inode.
+#[derive(Clone, Default)]
+struct InodeXAttrs {
+    /// Indexes into the shared xattr table returned by `share_xattrs`.
+    shared: Vec<usize>,
+    /// Attributes stored directly after the inode.
+    local: Vec<XAttr>,
+    /// One bit set per attribute name hash; written inverted as the name filter.
+    filter: u32,
+}
+
+/// A directory entry referring to an inode by its index in the collected inode list.
+struct DirEnt<'a> {
+    name: &'a [u8],
+    inode: usize,
+    file_type: format::FileType,
+}
+
+/// Directory content already split into full blocks plus an inline tail.
+#[derive(Default)]
+struct Directory<'a> {
+    blocks: Box<[Box<[DirEnt<'a>]>]>,
+    inline: Box<[DirEnt<'a>]>,
+    size: usize,
+    nlink: usize,
+}
+
+/// A non-directory inode and its link count.
+struct Leaf<'a> {
+    content: &'a image::LeafContent,
+    nlink: usize,
+}
+
+enum InodeContent<'a> {
+    Directory(Directory<'a>),
+    Leaf(Leaf<'a>),
+}
+
+/// Flattened representation of one inode, ready to be written.
+struct Inode<'a> {
+    stat: &'a image::Stat,
+    xattrs: InodeXAttrs,
+    content: InodeContent<'a>,
+}
+
+impl XAttr {
+    /// Writes the entry header, the name suffix and the value, then pads to 4 bytes.
+    pub fn write(&self, output: &mut impl Output) {
+        // NOTE(review): the `as u8` / `as u16` casts silently truncate a suffix longer than 255
+        // bytes or a value longer than 65535 bytes — confirm callers can never produce those.
+        output.write_struct(format::XAttrHeader {
+            name_len: self.suffix.len() as u8,
+            name_index: self.prefix,
+            value_size: (self.value.len() as u16).into(),
+        });
+        output.write(&self.suffix);
+        output.write(&self.value);
+        output.pad(4);
+    }
+}
+
+impl InodeXAttrs {
+    /// Adds a local attribute, splitting `name` at the highest-indexed matching prefix.
+    ///
+    /// Also sets the filter bit derived from hashing the suffix with a per-prefix seed.
+    fn add(&mut self, name: &[u8], value: &[u8]) {
+        for (idx, prefix) in format::XATTR_PREFIXES.iter().enumerate().rev() {
+            if let Some(suffix) = name.strip_prefix(*prefix) {
+                self.filter |= 1 << (xxh32(suffix, format::XATTR_FILTER_SEED + idx as u32) % 32);
+                self.local.push(XAttr {
+                    prefix: idx as u8,
+                    suffix: Box::from(suffix),
+                    value: Box::from(value),
+                });
+                return;
+            }
+        }
+        unreachable!("{:?}", std::str::from_utf8(name)); // worst case: we matched the empty prefix (0)
+    }
+
+    /// Writes the per-inode xattr area: header, shared references, then local entries.
+    ///
+    /// Nothing is written when the filter is zero; `add()` always sets a bit, so a zero filter
+    /// means no attribute was ever added.
+    fn write(&self, output: &mut impl Output) {
+        if self.filter != 0 {
+            debug!(" write xattrs block");
+            // NOTE(review): `as u8` truncates if an inode references more than 255 shared xattrs.
+            output.write_struct(format::InodeXAttrHeader {
+                name_filter: (!self.filter).into(),
+                shared_count: self.shared.len() as u8,
+                ..Default::default()
+            });
+            for idx in &self.shared {
+                debug!(" shared {} @{}", idx, output.len());
+                output.write(&output.get_xattr(*idx).to_le_bytes());
+            }
+            for attr in &self.local {
+                debug!(" local @{}", output.len());
+                attr.write(output);
+            }
+        }
+        // our alignment is equal to xattr alignment: no need to pad
+    }
+}
+
+impl<'a> Directory<'a> {
+    /// Splits an already-sorted entry list into 4096-byte blocks plus an inline tail.
+    ///
+    /// A tail larger than 2048 bytes is promoted to a block of its own.  `nlink` is the number of
+    /// entries whose type is `Directory` (the caller includes `.` and `..` in `entries`).
+    pub fn from_entries(entries: Vec<DirEnt<'a>>) -> Self {
+        let mut blocks = vec![];
+        let mut rest = vec![];
+
+        let mut n_bytes = 0;
+        let mut nlink = 0;
+
+        debug!("Directory with {} items", entries.len());
+
+        // The content of the directory is fixed at this point so we may as well split it into
+        // blocks.  This lets us avoid measuring and re-measuring.
+        for entry in entries.into_iter() {
+            let entry_size = size_of::<format::DirectoryEntryHeader>() + entry.name.len();
+            assert!(entry_size <= 4096);
+
+            debug!(" {:?}", entry.file_type);
+
+            if matches!(entry.file_type, format::FileType::Directory) {
+                nlink += 1;
+            }
+
+            n_bytes += entry_size;
+            if n_bytes <= 4096 {
+                rest.push(entry);
+            } else {
+                // It won't fit, so we need to store the existing entries in a block.
+                debug!(" block {}", rest.len());
+                blocks.push(rest.into_boxed_slice());
+
+                // Start over
+                rest = vec![entry];
+                n_bytes = entry_size;
+            }
+        }
+
+        // Don't try to store more than 2048 bytes of tail data
+        if n_bytes > 2048 {
+            blocks.push(rest.into_boxed_slice());
+            rest = vec![];
+            n_bytes = 0;
+        }
+
+        debug!(
+            " blocks {} inline {} inline_size {n_bytes}",
+            blocks.len(),
+            rest.len()
+        );
+
+        let size = format::BLOCK_SIZE * blocks.len() + n_bytes;
+        Self {
+            blocks: blocks.into_boxed_slice(),
+            inline: rest.into_boxed_slice(),
+            size,
+            nlink,
+        }
+    }
+
+    /// Writes one group of entries: all entry headers first, then all names back to back.
+    fn write_block(&self, output: &mut impl Output, block: &[DirEnt]) {
+        debug!(" write dir block {} @{}", block.len(), output.len());
+        // Names start right after the last header; offsets are relative to the group start.
+        let mut nameofs = size_of::<format::DirectoryEntryHeader>() * block.len();
+
+        for entry in block {
+            debug!(
+                " entry {:?} name {} @{}",
+                entry.file_type,
+                nameofs,
+                output.len()
+            );
+            output.write_struct(format::DirectoryEntryHeader {
+                name_offset: (nameofs as u16).into(),
+                inode_offset: output.get_nid(entry.inode).into(),
+                file_type: entry.file_type,
+                ..Default::default()
+            });
+            nameofs += entry.name.len();
+        }
+
+        for entry in block {
+            debug!(" name @{}", output.len());
+            output.write(entry.name.as_bytes());
+        }
+    }
+
+    /// Writes the inline tail entries (directly after the inode and its xattrs).
+    fn write_inline(&self, output: &mut impl Output) {
+        debug!(
+            " write inline len {} expected size {} of {}",
+            self.inline.len(),
+            self.size % 4096,
+            self.size
+        );
+        self.write_block(output, &self.inline);
+    }
+
+    /// Writes every full block, each starting block-aligned and padded to the block size.
+    fn write_blocks(&self, output: &mut impl Output) {
+        for block in &self.blocks {
+            assert_eq!(output.len() % format::BLOCK_SIZE, 0);
+            self.write_block(output, block);
+            output.pad(format::BLOCK_SIZE);
+        }
+    }
+
+    /// Returns `(layout, u, size, nlink)` for the inode header.
+    ///
+    /// `block_offset` is the byte offset at which this directory's blocks start; `u` is that
+    /// offset expressed in 4096-byte blocks, or 0 when the directory has no blocks.
+    ///
+    /// Panics if the block address does not fit in a `u32`.
+    fn inode_meta(&self, block_offset: usize) -> (format::DataLayout, u32, u64, usize) {
+        // Divide before narrowing: `block_offset as u32 / 4096` would truncate the byte offset
+        // to 32 bits first and yield a wrong block address for offsets of 4 GiB or more.
+        let block_addr = || {
+            u32::try_from(block_offset / 4096).expect("directory block address must fit in a u32")
+        };
+        let (layout, u) = if self.inline.is_empty() {
+            (format::DataLayout::FlatPlain, block_addr())
+        } else if !self.blocks.is_empty() {
+            (format::DataLayout::FlatInline, block_addr())
+        } else {
+            (format::DataLayout::FlatInline, 0)
+        };
+        (layout, u, self.size as u64, self.nlink)
+    }
+}
+
+impl Leaf<'_> {
+    /// Returns `(layout, u, size, nlink)` for the inode header of a non-directory.
+    fn inode_meta(&self) -> (format::DataLayout, u32, u64, usize) {
+        let (layout, u, size) = match &self.content {
+            image::LeafContent::InlineFile(data) => {
+                if data.is_empty() {
+                    (format::DataLayout::FlatPlain, 0, data.len() as u64)
+                } else {
+                    (format::DataLayout::FlatInline, 0, data.len() as u64)
+                }
+            }
+            image::LeafContent::ExternalFile(.., size) => {
+                // TODO: libcomposefs tries harder here.  Should we?
+                (format::DataLayout::ChunkBased, 31, *size)
+            }
+            image::LeafContent::CharacterDevice(rdev) | image::LeafContent::BlockDevice(rdev) => {
+                (format::DataLayout::FlatPlain, *rdev as u32, 0)
+            }
+            image::LeafContent::Fifo | image::LeafContent::Socket => {
+                (format::DataLayout::FlatPlain, 0, 0)
+            }
+            image::LeafContent::Symlink(target) => {
+                (format::DataLayout::FlatInline, 0, target.len() as u64)
+            }
+        };
+        (layout, u, size, self.nlink)
+    }
+
+    /// Writes the data stored directly after the inode: inline file content, the symlink
+    /// target, a 4-byte all-ones chunk entry for external files, or nothing.
+    fn write_inline(&self, output: &mut impl Output) {
+        output.write(match self.content {
+            image::LeafContent::InlineFile(data) => data,
+            image::LeafContent::ExternalFile(..) => b"\xff\xff\xff\xff", // null chunk
+            image::LeafContent::Symlink(target) => target.as_bytes(),
+            _ => &[],
+        });
+    }
+}
+
+impl Inode<'_> {
+    /// Maps the inode content to the on-disk file type.
+    fn file_type(&self) -> format::FileType {
+        match &self.content {
+            InodeContent::Directory(..) => format::FileType::Directory,
+            InodeContent::Leaf(leaf) => match &leaf.content {
+                image::LeafContent::ExternalFile(..) | image::LeafContent::InlineFile(..) => {
+                    format::FileType::RegularFile
+                }
+                image::LeafContent::CharacterDevice(..) => format::FileType::CharacterDevice,
+                image::LeafContent::BlockDevice(..) => format::FileType::BlockDevice,
+                image::LeafContent::Fifo => format::FileType::Fifo,
+                image::LeafContent::Socket => format::FileType::Socket,
+                image::LeafContent::Symlink(..) => format::FileType::Symlink,
+            },
+        }
+    }
+
+    /// Writes the extended inode header, its xattr area and its inline data, padded to 32 bytes.
+    ///
+    /// `idx` is this inode's index in the inode list; it selects the matching `Offset::Block`
+    /// entry.  The inode notes its own `Offset::Inode` because it may insert padding first.
+    fn write_inode(&self, output: &mut impl Output, idx: usize) {
+        let (layout, u, size, nlink) = match &self.content {
+            InodeContent::Directory(dir) => dir.inode_meta(output.get(Offset::Block, idx)),
+            InodeContent::Leaf(leaf) => leaf.inode_meta(),
+        };
+
+        // Measure the xattr area by writing it into a throwaway measuring pass.
+        let xattr_size = {
+            let mut xattr = FirstPass::default();
+            self.xattrs.write(&mut xattr);
+            xattr.offset
+        };
+
+        // We need to make sure the inline part doesn't overlap a block boundary
+        if matches!(layout, format::DataLayout::FlatInline) {
+            let inode_and_xattr_size = size_of::<format::ExtendedInodeHeader>() + xattr_size;
+            let inline_start = output.len() + inode_and_xattr_size;
+            let inline_end = inline_start + (size as usize % format::BLOCK_SIZE);
+            if inline_start / format::BLOCK_SIZE != inline_end / format::BLOCK_SIZE {
+                // If we proceed, then we'll violate the rule about crossing block boundaries.
+                // The easiest thing to do is to add padding so that the inline data starts at a
+                // fresh block boundary.
+                //
+                // NOTE(review): the pad length is not necessarily a multiple of 32, so the
+                // `pad(32)` below may move the inode (and the inline data) a few bytes past the
+                // position computed here — confirm very large inline data still cannot cross a
+                // block boundary.
+                let pad = vec![0; 4096 - inline_start % 4096];
+                debug!("added pad {}", pad.len());
+                output.write(&pad);
+            }
+        }
+
+        let format = format::FormatField::from((format::InodeLayout::Extended, layout));
+
+        output.pad(32);
+
+        // Zero when there is no xattr area; otherwise one plus the number of 4-byte units after
+        // the first 12 bytes (presumably the size of `InodeXAttrHeader` — TODO confirm).
+        let xattr_icount: u16 = match xattr_size {
+            0 => 0,
+            n => (1 + (n - 12) / 4) as u16,
+        };
+
+        debug!(
+            "write inode {idx} nid {} {:?} {:?} xattrsize{xattr_size} icount{} inline{} @{}",
+            output.len() / 32,
+            format,
+            self.file_type(),
+            xattr_icount,
+            size % 4096,
+            output.len()
+        );
+
+        output.note_offset(Offset::Inode);
+        output.write_struct(format::ExtendedInodeHeader {
+            format,
+            xattr_icount: xattr_icount.into(),
+            mode: (self.stat.st_mode as u16 | self.file_type().to_ifmt()).into(),
+            size: size.into(),
+            u: u.into(),
+            ino: ((output.len() / 32) as u32).into(),
+            uid: self.stat.st_uid.into(),
+            gid: self.stat.st_gid.into(),
+            mtime: (self.stat.st_mtim_sec as u64).into(),
+            nlink: (nlink as u32).into(),
+            ..Default::default()
+        });
+
+        self.xattrs.write(output);
+
+        match &self.content {
+            InodeContent::Directory(dir) => dir.write_inline(output),
+            InodeContent::Leaf(leaf) => leaf.write_inline(output),
+        };
+
+        output.pad(32);
+    }
+
+    /// Writes the out-of-line blocks of this inode; only directories have any.
+    fn write_blocks(&self, output: &mut impl Output) {
+        if let InodeContent::Directory(dir) = &self.content {
+            dir.write_blocks(output);
+        }
+    }
+}
+
+/// Walks the filesystem tree and flattens it into a list of inodes.
+struct InodeCollector<'a> {
+    inodes: Vec<Inode<'a>>,
+    /// Inode index already assigned to a leaf that is referenced more than once.
+    hardlinks: HashMap<*const image::Leaf, usize>,
+}
+
+impl<'a> InodeCollector<'a> {
+    /// Appends an inode, building its (initially all-local) xattr list, and returns its index.
+    fn push_inode(&mut self, stat: &'a image::Stat, content: InodeContent<'a>) -> usize {
+        let mut xattrs = InodeXAttrs::default();
+
+        // We need to record extra xattrs for some files.  These come first.
+        if let InodeContent::Leaf(Leaf {
+            content: image::LeafContent::ExternalFile(id, ..),
+            ..
+        }) = content
+        {
+            let metacopy = [&[0, 36, 0, 1], &id[..]].concat();
+            xattrs.add(b"trusted.overlay.metacopy", &metacopy);
+
+            let redirect = format!("/{:02x}/{}", id[0], hex::encode(&id[1..]));
+            xattrs.add(b"trusted.overlay.redirect", redirect.as_bytes());
+        }
+
+        // Add the normal xattrs.  They're already listed in sorted order.
+        for (name, value) in RefCell::borrow(&stat.xattrs).iter() {
+            let name = name.as_bytes();
+
+            // Escape user-supplied overlay xattrs by inserting an extra "overlay." component.
+            if let Some(escapee) = name.strip_prefix(b"trusted.overlay.") {
+                let escaped = [b"trusted.overlay.overlay.", escapee].concat();
+                xattrs.add(&escaped, value);
+            } else {
+                xattrs.add(name, value);
+            }
+        }
+
+        // Allocate an inode for ourselves.  At first we write all xattrs as local.  Later (after
+        // we've determined which xattrs ought to be shared) we'll come and move some of them over.
+        let inode = self.inodes.len();
+        self.inodes.push(Inode {
+            stat,
+            xattrs,
+            content,
+        });
+        inode
+    }
+
+    /// Returns the inode index for a leaf, reusing the existing inode for hardlinks.
+    ///
+    /// The link count is taken from the `Rc` strong count of the leaf.
+    fn collect_leaf(&mut self, leaf: &'a Rc<image::Leaf>) -> usize {
+        let nlink = Rc::strong_count(leaf);
+
+        if nlink > 1 {
+            if let Some(inode) = self.hardlinks.get(&Rc::as_ptr(leaf)) {
+                return *inode;
+            }
+        }
+
+        let inode = self.push_inode(
+            &leaf.stat,
+            InodeContent::Leaf(Leaf {
+                content: &leaf.content,
+                nlink,
+            }),
+        );
+
+        if nlink > 1 {
+            self.hardlinks.insert(Rc::as_ptr(leaf), inode);
+        }
+
+        inode
+    }
+
+    /// Inserts an entry into a name-sorted entry list at its sorted position.
+    fn insert_sorted(
+        entries: &mut Vec<DirEnt<'a>>,
+        name: &'a [u8],
+        inode: usize,
+        file_type: format::FileType,
+    ) {
+        let entry = DirEnt {
+            name,
+            inode,
+            file_type,
+        };
+        let point = entries.partition_point(|e| e.name < entry.name);
+        entries.insert(point, entry);
+    }
+
+    /// Collects a directory and, recursively, everything below it; returns its inode index.
+    ///
+    /// `parent` is the inode index used for the `..` entry.
+    fn collect_dir(&mut self, dir: &'a image::Directory, parent: usize) -> usize {
+        // The root inode number needs to fit in a u16.  That more or less compels us to write the
+        // directory inode before the inode of the children of the directory.  Reserve a slot.
+        let me = self.push_inode(&dir.stat, InodeContent::Directory(Directory::default()));
+
+        let mut entries = vec![];
+
+        for entry in &dir.entries {
+            let child = match &entry.inode {
+                image::Inode::Directory(dir) => self.collect_dir(dir, me),
+                image::Inode::Leaf(leaf) => self.collect_leaf(leaf),
+            };
+            entries.push(DirEnt {
+                name: entry.name.as_bytes(),
+                inode: child,
+                file_type: self.inodes[child].file_type(),
+            });
+        }
+
+        // We're expected to add those, too
+        Self::insert_sorted(&mut entries, b".", me, format::FileType::Directory);
+        Self::insert_sorted(&mut entries, b"..", parent, format::FileType::Directory);
+
+        // Now that we know the actual content, we can write it to our reserved slot
+        self.inodes[me].content = InodeContent::Directory(Directory::from_entries(entries));
+        me
+    }
+
+    /// Flattens the whole filesystem; the root directory is always inode index 0.
+    pub fn collect(fs: &'a image::FileSystem) -> Vec<Inode<'a>> {
+        let mut this = Self {
+            inodes: vec![],
+            hardlinks: HashMap::new(),
+        };
+
+        // '..' of the root directory is the root directory again
+        let root_inode = this.collect_dir(&fs.root, 0);
+        assert_eq!(root_inode, 0);
+
+        this.inodes
+    }
+}
+
+/// Takes a list of inodes where each inode contains only local xattr values, determines which
+/// xattrs (key, value) pairs appear more than once, and shares them.
+///
+/// Returns the shared table in sorted order; each inode's `shared` list holds indexes into it.
+fn share_xattrs(inodes: &mut [Inode]) -> Vec<XAttr> {
+    let mut xattrs: BTreeMap<XAttr, usize> = BTreeMap::new();
+
+    // Collect all xattrs from the inodes
+    for inode in inodes.iter() {
+        for attr in &inode.xattrs.local {
+            // Look up first so that an attribute is only cloned the first time it is seen.
+            if let Some(count) = xattrs.get_mut(attr) {
+                *count += 1;
+            } else {
+                xattrs.insert(attr.clone(), 1);
+            }
+        }
+    }
+
+    // Share only xattrs with more than one user
+    xattrs.retain(|_k, v| *v > 1);
+
+    // Repurpose the refcount field as an index lookup
+    for (idx, value) in xattrs.values_mut().enumerate() {
+        *value = idx;
+    }
+
+    // Visit each inode and change local xattrs into shared xattrs
+    for inode in inodes.iter_mut() {
+        inode.xattrs.local.retain(|attr| {
+            if let Some(idx) = xattrs.get(attr) {
+                inode.xattrs.shared.push(*idx);
+                false // drop the local xattr: we converted it
+            } else {
+                true // retain the local xattr: we didn't convert it
+            }
+        });
+    }
+
+    // Return the shared xattrs as a vec
+    xattrs.into_keys().collect()
+}
+
+/// Emits the whole image into `output`: composefs header, superblock, inode table, shared
+/// xattr table, then directory blocks.
+///
+/// Run once against `FirstPass` to record offsets and once against `SecondPass` to get bytes;
+/// both runs must emit exactly the same sequence of writes.
+fn write_erofs(output: &mut impl Output, inodes: &[Inode], xattrs: &[XAttr]) {
+    // Write composefs header
+    output.note_offset(Offset::Header);
+    output.write_struct(format::ComposefsHeader {
+        magic: format::COMPOSEFS_MAGIC,
+        version: format::COMPOSEFS_VERSION,
+        flags: 0.into(),
+        composefs_version: format::COMPOSEFS_VERSION,
+        ..Default::default()
+    });
+    output.pad(1024);
+
+    // Write superblock
+    output.note_offset(Offset::Superblock);
+    output.write_struct(format::Superblock {
+        magic: format::MAGIC_V1,
+        blkszbits: format::BLOCK_BITS,
+        feature_compat: format::FEATURE_COMPAT_MTIME | format::FEATURE_COMPAT_XATTR_FILTER,
+        root_nid: (output.get_nid(0) as u16).into(),
+        inos: (inodes.len() as u64).into(),
+        blocks: ((output.get(Offset::End, 0) / format::BLOCK_SIZE) as u32).into(),
+        ..Default::default()
+    });
+
+    // Write inode table
+    for (idx, inode) in inodes.iter().enumerate() {
+        // The inode may add padding to itself, so it notes its own offset
+        inode.write_inode(output, idx);
+    }
+
+    // Write shared xattr table
+    for xattr in xattrs {
+        output.note_offset(Offset::XAttr);
+        xattr.write(output);
+    }
+
+    // Write blocks from inodes that have them
+    output.pad(4096);
+    for inode in inodes.iter() {
+        // Noted for every inode (even those without blocks) so it can be indexed by inode index
+        output.note_offset(Offset::Block);
+        inode.write_blocks(output);
+    }
+
+    // That's it
+    output.note_offset(Offset::End);
+}
+
+/// Offsets recorded by the first pass.
+///
+/// `offsets` holds every noted position in call order; `offset_types[t]` is the index in
+/// `offsets` of the first position of type `t`.
+#[derive(Default)]
+struct Layout {
+    offset_types: Vec<usize>,
+    offsets: Vec<usize>,
+}
+
+/// Measuring pass: tracks the position and records noted offsets, but stores no bytes.
+#[derive(Default)]
+struct FirstPass {
+    offset: usize,
+    layout: Layout,
+}
+
+/// Emitting pass: collects the bytes and resolves references using the first-pass layout.
+struct SecondPass {
+    output: Vec<u8>,
+    layout: Layout,
+}
+
+impl Output for SecondPass {
+    fn note_offset(&mut self, _offset_type: Offset) {
+        /* no-op */
+    }
+
+    fn get(&self, offset_type: Offset, idx: usize) -> usize {
+        self.layout.offsets[self.layout.offset_types[offset_type as usize] + idx]
+    }
+
+    fn write(&mut self, data: &[u8]) {
+        self.output.extend_from_slice(data);
+    }
+
+    fn pad(&mut self, alignment: usize) {
+        self.output
+            .resize(round_up(self.output.len(), alignment), 0);
+    }
+
+    fn len(&self) -> usize {
+        self.output.len()
+    }
+}
+
+impl Output for FirstPass {
+    fn note_offset(&mut self, offset_type: Offset) {
+        // Register this type and any lower-numbered type that was never noted (for example
+        // `XAttr` when no xattr ended up shared).  Registering only on an exact length match
+        // left `Block` and `End` unregistered in that case, and the second pass then indexed
+        // `offset_types` out of bounds.  A skipped type gets an empty range.
+        while self.layout.offset_types.len() <= offset_type as usize {
+            self.layout.offset_types.push(self.layout.offsets.len());
+        }
+        debug!(
+            "{:?} #{} @{}",
+            offset_type,
+            self.layout.offsets.len() - self.layout.offset_types[offset_type as usize],
+            self.offset
+        );
+        self.layout.offsets.push(self.offset);
+    }
+
+    fn get(&self, _: Offset, _: usize) -> usize {
+        0 // We don't know offsets in the first pass, so fake it
+    }
+
+    fn write(&mut self, data: &[u8]) {
+        self.offset += data.len();
+    }
+
+    fn pad(&mut self, alignment: usize) {
+        self.offset = round_up(self.offset, alignment);
+    }
+
+    fn len(&self) -> usize {
+        self.offset
+    }
+}
+
+/// Builds an erofs image for `fs` in memory using the native writer.
+pub fn mkfs_erofs(fs: &image::FileSystem) -> Box<[u8]> {
+    // Create the intermediate representation: flattened inodes and shared xattrs
+    let mut inodes = InodeCollector::collect(fs);
+    let xattrs = share_xattrs(&mut inodes);
+
+    // Do a first pass with the writer to determine the layout
+    let mut first_pass = FirstPass::default();
+    write_erofs(&mut first_pass, &inodes, &xattrs);
+
+    // Do a second pass with the writer to get the actual bytes.  The first pass already told us
+    // the final length, so allocate the buffer once.
+    let expected_len = first_pass.offset;
+    let mut second_pass = SecondPass {
+        output: Vec::with_capacity(expected_len),
+        layout: first_pass.layout,
+    };
+    write_erofs(&mut second_pass, &inodes, &xattrs);
+
+    // That's it
+    second_pass.output.into_boxed_slice()
+}
+
+/// Builds the image by piping a dumpfile through the external `mkcomposefs` command.
+///
+/// # Errors
+///
+/// Fails if the command cannot be spawned, if writing the dumpfile or reading the image fails,
+/// or if the command exits unsuccessfully.
+pub fn mkfs_mkcomposefs(filesystem: &FileSystem) -> Result<Box<[u8]>> {
+    let mut mkcomposefs = Command::new("mkcomposefs")
+        .args(["--from-file", "-", "-"])
+        .stdin(Stdio::piped())
+        .stdout(Stdio::piped())
+        .spawn()?;
+
+    let mut stdin = mkcomposefs.stdin.take().unwrap();
+    write_dumpfile(&mut stdin, filesystem)?;
+    // Close the pipe so the child sees end-of-input
+    drop(stdin);
+
+    let mut stdout = mkcomposefs.stdout.take().unwrap();
+    let mut image = vec![];
+    stdout.read_to_end(&mut image)?;
+    drop(stdout);
+
+    if !mkcomposefs.wait()?.success() {
+        bail!("mkcomposefs failed");
+    };
+
+    Ok(image.into())
+}
+
+/// Builds the image for `fs`.
+///
+/// Uses the native writer when `COMPOSEFS_FORMAT` is `new`, and `mkcomposefs` otherwise.  When
+/// `COMPOSEFS_DUMP_EROFS` is `1` the resulting image is passed to `debug_img`.
+pub fn mkfs(fs: &FileSystem) -> Result<Box<[u8]>> {
+    let image = match env::var("COMPOSEFS_FORMAT") {
+        Ok(s) if s == "new" => mkfs_erofs(fs),
+        _ => mkfs_mkcomposefs(fs)?,
+    };
+
+    if matches!(env::var("COMPOSEFS_DUMP_EROFS").as_deref(), Ok("1")) {
+        debug_img(&image);
+    }
+
+    Ok(image)
+}
diff --git a/src/oci/image.rs b/src/oci/image.rs
index 0358f6f..b75f0c7 100644
--- a/src/oci/image.rs
+++ b/src/oci/image.rs
@@ -6,7 +6,8 @@ use oci_spec::image::ImageConfiguration;
 use crate::{
     dumpfile::write_dumpfile,
     fsverity::Sha256HashValue,
-    image::{mkcomposefs, FileSystem, Inode, Leaf},
+    image::{FileSystem, Inode, Leaf},
+    mkfs::mkfs,
     oci,
     repository::Repository,
     selabel::selabel,
@@ -101,8 +102,8 @@ pub fn create_image(
     selabel(&mut filesystem, repo)?;
     filesystem.done();
 
-    let image = mkcomposefs(filesystem)?;
-    repo.write_image(name, &image)
+    let erofs = mkfs(&filesystem)?;
+    repo.write_image(name, &erofs)
 }
 
 #[cfg(test)]