Skip to content

Commit

Permalink
Start implementing something like a repo format
Browse files Browse the repository at this point in the history
Also: some kind of support for reading tar files into the repo in a way
that allows byte-for-byte reproducing them later.
  • Loading branch information
allisonkarlitskaya committed Oct 4, 2024
1 parent c03acf7 commit 58b32d6
Show file tree
Hide file tree
Showing 8 changed files with 329 additions and 7 deletions.
5 changes: 1 addition & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ version = "0.1.0"
edition = "2021"

[dependencies]
clap = { version = "4.5.19", features = ["derive"] }
hex = "0.4.3"
rand = "0.8.5"
rustix = { version = "0.38.37", features = ["fs", "mount"] }
Expand All @@ -16,7 +17,3 @@ opt-level = 3
[lib]
name = "composefs_experiments"
path = "src/lib.rs"

[[bin]]
name = "mount"
path = "src/bin/mount.rs"
29 changes: 26 additions & 3 deletions src/bin/mount.rs
Original file line number Diff line number Diff line change
Expand Up @@ -126,12 +126,35 @@ impl<'a> MountOptions<'a> {
}
}

use clap::Parser;

/// mount a composefs
#[derive(Parser, Debug)]
#[command(version, about, long_about = None)]
struct Args {
    /// Path of the composefs image to mount
    #[arg()]
    image: String,

    /// Directory to mount the image on
    #[arg()]
    mountpoint: String,

    /// Base directory of the object store backing the image
    #[arg(short, long)]
    basedir: String,

    /// Expected fs-verity digest of the image (hex); mounting fails on mismatch
    #[arg(short, long)]
    digest: Option<String>,
}

fn main() {
// NOTE(review): this span interleaves pre- and post-change lines from a diff
// rendering without +/- markers.  The two hard-coded lines below are the
// removed (old) version; the argument-driven setup that follows replaces
// them — confirm against the actual post-commit file.
let mut options = MountOptions::new("/home/lis/src/mountcfs/cfs", "/home/lis/src/mountcfs/digest");
options.set_digest("77fc256436a40bc31088f212935130724e039e401e5ffc7936c6bdb750b1dfdb");
// Parse CLI arguments (image, mountpoint, --basedir, --digest).
let args = Args::parse();

let mut options = MountOptions::new(&args.image, &args.basedir);
// Only request digest verification when the caller supplied one.
if let Some(expected) = &args.digest {
options.set_digest(expected);
}
options.set_require_verity();

// NOTE(review): the first mount() line below is the removed (old) version;
// the second is its replacement using the CLI-provided mountpoint.
if let Err(x) = options.mount("mnt") {
if let Err(x) = options.mount(&args.mountpoint) {
println!("err {}", x);
}
}
18 changes: 18 additions & 0 deletions src/bin/splitstream.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
use composefs_experiments::{
fsverity::Sha256HashValue,
tar::split,
repository::Repository,
};

// produce a splitstream from a tar
//
// Reads a tar archive on stdin, writes the corresponding splitstream to
// stdout, and stores each regular file's content in the composefs repository.
fn main() {
    // Repository location is overridable via $COMPOSEFS_REPO so the tool is
    // not tied to one developer's home directory; the previously hard-coded
    // path remains the default, keeping old behavior when the variable is
    // unset.
    let repo_path = std::env::var("COMPOSEFS_REPO")
        .unwrap_or_else(|_| String::from("/home/lis/.var/lib/composefs"));
    let repo = Repository::open(&repo_path).expect("open cfs-repo");

    split(
        &mut std::io::stdin(),
        &mut std::io::stdout(),
        // Each regular file's payload becomes an fs-verity object; the
        // returned digest is what gets recorded in the splitstream.
        |data: Vec<u8>| -> std::io::Result<Sha256HashValue> {
            repo.ensure_data(&data)
        }
    ).expect("split");
}
4 changes: 4 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
mod util;
pub mod repository;
pub mod fsverity;
pub mod splitstream;
pub mod tar;
pub mod tmpdir;
86 changes: 86 additions & 0 deletions src/repository.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
use std::path::PathBuf;
use std::io::ErrorKind;
use std::os::fd::OwnedFd;

use rustix::fs::{
CWD,
linkat,
Access,
AtFlags,
Mode,
OFlags,
accessat,
mkdirat,
open,
openat,
fdatasync,
};

use crate::{
fsverity::{
Sha256HashValue,
digest::FsVerityHasher,
ioctl::{
fs_ioc_enable_verity,
fs_ioc_measure_verity,
},
},
util::proc_self_fd,
};

// A composefs object repository rooted at some directory on disk.
pub struct Repository {
// O_PATH fd for the repository root directory (see open())
repository: OwnedFd,
// O_PATH fd for the "objects" subdirectory; all object access goes via this
objects: OwnedFd,
}

impl Repository {
    /// Opens an existing repository at `path`.
    ///
    /// Takes O_PATH fds for the repository root and its `objects`
    /// subdirectory; fails if either is missing.
    pub fn open(path: &str) -> std::io::Result<Repository> {
        let repository = open(path, OFlags::PATH, Mode::empty())?;
        let objects = openat(&repository, "objects", OFlags::PATH, Mode::empty())?;

        Ok(Repository { repository, objects })
    }

    /// Ensures `data` is stored as an fs-verity-enabled object, returning its
    /// fs-verity digest.  Idempotent: an already-present object is left
    /// untouched, including when another writer races us on the final link.
    pub fn ensure_data(&self, data: &[u8]) -> std::io::Result<Sha256HashValue> {
        let digest = FsVerityHasher::hash(data);
        // Objects are sharded by first digest byte: objects/xx/yyyy…
        let dir = PathBuf::from(format!("{:02x}", digest[0]));
        let file = dir.join(hex::encode(&digest[1..]));

        // Fast path: the object already exists.
        if accessat(&self.objects, &file, Access::READ_OK, AtFlags::empty()).is_ok() {
            return Ok(digest);
        }

        if let Err(err) = mkdirat(&self.objects, &dir, 0o777.into()) {
            if err.kind() != ErrorKind::AlreadyExists {
                return Err(err.into());
            }
        }

        // O_TMPFILE: write the object anonymously and only link it into place
        // once it is fully written and verity-enabled.
        let fd = openat(&self.objects, &dir,
            OFlags::RDWR | OFlags::CLOEXEC | OFlags::TMPFILE, 0o666.into()
        )?;

        // rustix::io::write() may perform a short write; loop until the whole
        // buffer has been written (this replaces the earlier single-call TODO).
        let mut remaining = data;
        while !remaining.is_empty() {
            match rustix::io::write(&fd, remaining)? {
                0 => return Err(ErrorKind::WriteZero.into()),
                n => remaining = &remaining[n..],
            }
        }

        fdatasync(&fd)?;

        // We can't enable verity with an open writable fd, so re-open and close the old one.
        let ro_fd = open(proc_self_fd(&fd), OFlags::RDONLY, Mode::empty())?;
        drop(fd);

        fs_ioc_enable_verity::<&OwnedFd, Sha256HashValue>(&ro_fd)?;

        // double-check: the kernel's measurement must match our own digest
        let measured_digest: Sha256HashValue = fs_ioc_measure_verity(&ro_fd)?;
        assert!(measured_digest == digest);

        // Give the object its final name.  Losing a race to another writer is
        // fine: identical content yields an identical object.
        if let Err(err) = linkat(CWD, proc_self_fd(&ro_fd), &self.objects, file, AtFlags::SYMLINK_FOLLOW) {
            if err.kind() != ErrorKind::AlreadyExists {
                return Err(err.into());
            }
        }

        drop(ro_fd);
        Ok(digest)
    }
}
81 changes: 81 additions & 0 deletions src/splitstream.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/* Implementation of the Split Stream file format
*
* Split Stream is a trivial way of storing file formats (like tar) with the "data blocks" stored
* in the composefs object tree. It's something like tar-split, but is based on content-addressed
* storage of the file data and is implemented using a trivial binary-ish format.
*
* It is expected that the splitstream will be compressed before being stored on disk.
*
* The file format consists of a number of data blocks.
*
*
* Each block starts with a u64 le "size" field followed by some amount of data.
*
* 64bit variable
* +--------+---------------....
* | size | data...
* +--------+---------------....
*
* There are two kinds of blocks.
*
* - size != 0: in this case the length of the data is equal to the size. This is "inline data".
* There is no padding, which implies that the size fields after the first may be unaligned.
*
* - size == 0: in this case the length of the data is 32 bytes. This is the binary form of a
* sha256 hash value and is a reference to an object in the composefs repository.
*
* That's it, really. There's no header. The file ends when there are no more blocks.
*/

use std::io::Write;
use crate::fsverity::Sha256HashValue;

// utility struct to help write splitstreams
pub struct SplitStreamWriter<'w, W: Write> {
// inline bytes buffered until the next flush (reference write or done())
inline_content: Vec<u8>,
// borrowed destination stream receiving the encoded splitstream
writer: &'w mut W

}

impl<'w, W: Write> SplitStreamWriter<'w, W> {
    /// Wraps `writer` with an empty inline buffer.
    pub fn new(writer: &'w mut W) -> SplitStreamWriter<'w, W> {
        SplitStreamWriter { inline_content: vec![], writer }
    }

    /// Writes one block: a little-endian u64 `size` field followed by `data`.
    fn write_fragment(writer: &mut W, size: usize, data: &[u8]) -> std::io::Result<()> {
        writer.write_all(&(size as u64).to_le_bytes())?;
        writer.write_all(data)
    }

    /// flush any buffered inline data, taking new_value as the new value of the buffer
    fn flush_inline(&mut self, new_value: Vec<u8>) -> std::io::Result<()> {
        if !self.inline_content.is_empty() {
            SplitStreamWriter::write_fragment(self.writer, self.inline_content.len(), &self.inline_content)?;
        }
        // Fix: install new_value unconditionally.  Previously it was only
        // assigned inside the branch above, so padding handed to
        // write_reference() while the buffer was empty was silently dropped.
        self.inline_content = new_value;
        Ok(())
    }

    /// really, "add inline content to the buffer"
    /// you need to call .flush_inline() later
    pub fn write_inline(&mut self, data: &[u8]) {
        self.inline_content.extend(data);
    }

    /// write a reference to external data to the stream. If the external data had padding in the
    /// stream which is not stored in the object then pass it here as well and it will be stored
    /// inline after the reference.
    pub fn write_reference(&mut self, reference: Sha256HashValue, padding: Vec<u8>) -> std::io::Result<()> {
        // Flush the inline data before we store the external reference. Any padding from the
        // external data becomes the start of a new inline block.
        self.flush_inline(padding)?;

        // size == 0 marks the 32-byte payload as an external object reference.
        SplitStreamWriter::write_fragment(self.writer, 0, &reference)
    }

    /// Flushes any remaining buffered inline data; call once, at end of stream.
    pub fn done(&mut self) -> std::io::Result<()> {
        self.flush_inline(vec![])
    }
}

// TODO: reader side...
105 changes: 105 additions & 0 deletions src/tar.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
use std::io::{Read, Write};
use crate::fsverity::Sha256HashValue;
use crate::splitstream::SplitStreamWriter;

/// One raw 512-byte tar header block.
struct TarHeader {
    data: [u8; 512],
}

impl TarHeader {
    // we can't use Read::read_exact() because we need to be able to detect EOF
    /// Reads one header block.  Returns Ok(None) on clean EOF (zero bytes
    /// available) and UnexpectedEof if the stream ends mid-header.
    fn read<R: Read>(reader: &mut R) -> std::io::Result<Option<TarHeader>> {
        let mut header = TarHeader { data: [0u8; 512] };
        let mut todo: &mut [u8] = &mut header.data;

        while !todo.is_empty() {
            match reader.read(todo) {
                Ok(0) => match todo.len() {
                    // nothing read at all: clean end of stream
                    512 => return Ok(None),
                    _ => return Err(std::io::ErrorKind::UnexpectedEof.into()),
                },
                Ok(n) => {
                    todo = &mut todo[n..];
                }
                Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {
                    // EINTR: just retry
                }
                Err(e) => {
                    return Err(e);
                }
            }
        }

        Ok(Some(header))
    }

    /// Decodes the 12-byte size field at offset 124.
    fn get_size(&self) -> usize {
        let size_field = &self.data[124..124 + 12];
        let mut value = 0usize;

        if size_field[0] & 0x80 != 0 {
            // GNU base-256 (binary) representation, big-endian
            for byte in &size_field[4..12] {
                value <<= 8;
                value += *byte as usize;
            }
        } else {
            // Octal representation.  POSIX permits leading spaces as padding
            // and a space or NUL terminator, so skip leading spaces and stop
            // at the first non-octal byte.  (The old code did `byte - b'0'`
            // on a space, which underflows.)
            for byte in size_field {
                match *byte {
                    b' ' if value == 0 => continue, // leading padding
                    b'0'..=b'7' => value = (value << 3) + (*byte - b'0') as usize,
                    _ => break, // NUL or space terminator
                }
            }
        }

        // TODO: not too big, I hope...
        value
    }

    /// Size the entry's payload occupies in the archive (padded to blocks).
    fn get_storage_size(&self) -> usize {
        // round up to nearest multiple of 512
        (self.get_size() + 511) & !511
    }

    /// Whether this entry is a regular file.
    fn is_reg(&self) -> bool {
        // typeflag '0' is a regular file; old (v7) archives use NUL instead
        matches!(self.data[156], b'0' | b'\0')
    }
}

/// Splits the tar file from tar_stream into a Split Stream. The store_data function is
/// responsible for ensuring that "external data" is in the composefs repository and returns the
/// fsverity hash value of that data.
pub fn split<R: Read, W: Write, F: FnMut(Vec<u8>) -> std::io::Result<Sha256HashValue>>(
    tar_stream: &mut R,
    split_stream: &mut W,
    mut store_data: F,
) -> std::io::Result<()> {
    let mut writer = SplitStreamWriter::new(split_stream);

    loop {
        let header = match TarHeader::read(tar_stream)? {
            Some(header) => header,
            None => break,
        };

        // Every header is recorded inline, verbatim.
        writer.write_inline(&header.data);

        // Slurp the (512-byte-padded) payload following the header.
        let padded_len = header.get_storage_size();
        let mut payload = vec![0u8; padded_len];
        tar_stream.read_exact(&mut payload)?;

        if !header.is_reg() || padded_len == 0 {
            // Non-regular entries (and empty files) stay inline in the stream.
            writer.write_inline(&payload);
        } else {
            // Regular file: the real bytes go to the object store; the
            // trailing block padding rides along inline after the reference.
            let padding = payload.split_off(header.get_size());
            writer.write_reference(store_data(payload)?, padding)?;
        }
    }

    // flush out any remaining inline data
    writer.done()
}
8 changes: 8 additions & 0 deletions src/util.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
use std::os::fd::{
AsFd,
AsRawFd,
};

/// Builds the magic /proc/self/fd/N path naming the given file descriptor.
pub fn proc_self_fd<A: AsFd>(fd: &A) -> String {
    let raw = fd.as_fd().as_raw_fd();
    let mut path = String::from("/proc/self/fd/");
    path.push_str(&raw.to_string());
    path
}

0 comments on commit 58b32d6

Please sign in to comment.