Skip to content

Commit

Permalink
Up our tar game a bit
Browse files Browse the repository at this point in the history
Drop our lame hacked up tar header implementation for the real thing
from the 'tar' crate.  Reading data directly into it is actually pretty
easy.

Add a reader-side utility class for splitstream.  Also: add a new
SplitStreamData type for inline/external data and port our existing
helper functions to use it.

Add a 'cfsctl ls' command which lists the contents of a tar
splitstream.  So far it just lists the entries and whether each one is
stored inline or as an external reference.  This is the start of support
for creating a dumpfile...

This is some dreadfully ugly code, but it seems to be working.  Probably
some tests would be nice at some point...
  • Loading branch information
allisonkarlitskaya committed Oct 8, 2024
1 parent 716331d commit f4f9af0
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 84 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ hex = "0.4.3"
rand = "0.8.5"
rustix = { version = "0.38.37", features = ["fs", "mount", "process"] }
sha2 = "0.10.8"
tar = "0.4.42"
zstd = "0.13.2"

[profile.dev.package.sha2]
Expand Down
8 changes: 8 additions & 0 deletions src/bin/cfsctl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ enum Command {
reference: String,
tarfile: Option<PathBuf>,
},
/// Lists the contents of a tar stream
Ls {
/// the name of the stream
name: String,
},
/// Mounts a composefs, possibly enforcing fsverity of the image
Mount {
/// the name of the image to mount, either a sha256 digest or prefixed with 'ref/'
Expand Down Expand Up @@ -84,6 +89,9 @@ fn main() -> Result<()> {
Command::ImportTar { reference, tarfile: _ } => {
repo.import_tar(&reference, &mut std::io::stdin())
},
Command::Ls { name } => {
repo.ls(&name)
},
Command::Mount { name, mountpoint } => {
repo.mount(&name, &mountpoint)
},
Expand Down
6 changes: 6 additions & 0 deletions src/repository.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,12 @@ impl Repository {
self.link_ref(name, "images", object_id)
}

/// Lists the contents of the tar split stream stored under `name`.
///
/// Opens `name` in the "streams" category, wraps the resulting file in a zstd
/// decoder and passes the decompressed split stream to `crate::tar::ls`.
///
/// # Errors
///
/// Propagates any error from opening the stream, from creating the zstd
/// decoder, or from `crate::tar::ls`.
pub fn ls(self, name: &str) -> Result<()> {
    let stream = self.open_in_category("streams", name)?;
    let mut decoder = zstd::stream::read::Decoder::new(File::from(stream))?;
    crate::tar::ls(&mut decoder)
}

pub fn mount(self, name: &str, mountpoint: &str) -> Result<()> {
let image = self.open_in_category("images", name)?;
let object_path = format!("{}/objects", self.path);
Expand Down
122 changes: 94 additions & 28 deletions src/splitstream.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,18 @@
* That's it, really. There's no header. The file is over when there are no more blocks.
*/

use std::io::{
Read,
Write,
use std::{
collections::VecDeque,
io::{
Read,
Write,
},
};

use anyhow::Result;
use anyhow::{
Result,
bail,
};

use crate::{
fsverity::{
Expand All @@ -46,7 +52,6 @@ use crate::{
pub struct SplitStreamWriter<'w, W: Write> {
inline_content: Vec<u8>,
writer: &'w mut W

}

impl<'w, W: Write> SplitStreamWriter<'w, W> {
Expand Down Expand Up @@ -90,28 +95,93 @@ impl<'w, W: Write> SplitStreamWriter<'w, W> {
}
}

fn read_u64_le<R: Read>(reader: &mut R) -> Result<Option<u64>> {
/// One chunk of a split stream: either data carried in the stream itself or a
/// reference to data kept elsewhere.
pub enum SplitStreamData {
/// Bytes stored directly inside the split stream.
Inline(Vec<u8>),
/// Hash value identifying content that is not stored in the stream itself.
External(Sha256HashValue),
}

pub fn read_splitstream_chunk<R: Read>(reader: &mut R) -> Result<Option<SplitStreamData>> {
let mut buf = [0u8; 8];
if read_exactish(reader, &mut buf)? {
Ok(Some(u64::from_le_bytes(buf)))
} else {
Ok(None)
match read_exactish(reader, &mut buf)? {
false => Ok(None),
true => match u64::from_le_bytes(buf) as usize {
0 => {
let mut data = Sha256HashValue::EMPTY;
reader.read_exact(&mut data)?;
Ok(Some(SplitStreamData::External(data)))
},
size => {
let mut data = vec![0u8; size];
reader.read_exact(&mut data)?;
Ok(Some(SplitStreamData::Inline(data)))
}
}
}
}

/// Utility type to help read split streams.
///
/// Borrows an underlying reader and keeps the not-yet-consumed part of the most
/// recently read inline chunk, so callers can take inline data in pieces.
pub struct SplitStreamReader<'w, R: Read> {
// bytes of the current inline chunk that have not been handed out yet
inline_content: VecDeque<u8>,
// the split stream that chunks are read from
reader: &'w mut R
}

impl<'r, R: Read> SplitStreamReader<'r, R> {
    /// Creates a reader over `reader` with an empty inline buffer.
    pub fn new(reader: &'r mut R) -> SplitStreamReader<'r, R> {
        SplitStreamReader { inline_content: VecDeque::new(), reader }
    }

    /// Fills `data` completely from inline content.
    ///
    /// If no inline bytes are buffered, the next chunk is read from the
    /// underlying stream first. Returns `Ok(false)` when there is no further
    /// chunk to read, and `Ok(true)` once `data` has been filled.
    ///
    /// Assumes that the requested data is never split across chunks: only the
    /// currently buffered chunk is consulted.
    ///
    /// # Errors
    ///
    /// Fails if the next chunk is an external reference, if reading the chunk
    /// fails, or if the buffered chunk holds fewer than `data.len()` bytes.
    pub fn read_inline_exact(&mut self, data: &mut [u8]) -> Result<bool> {
        if self.inline_content.is_empty() {
            match read_splitstream_chunk(&mut self.reader)? {
                None => return Ok(false),
                Some(SplitStreamData::Inline(chunk)) => self.inline_content = chunk.into(),
                Some(SplitStreamData::External(_)) => {
                    bail!("Expecting inline data but found external chunk")
                }
            }
        }

        self.inline_content.read_exact(data)?;
        Ok(true)
    }

    /// Reads the payload of one entry that occupies `stored_size` bytes in the
    /// original stream, of which the first `actual_size` bytes are real data.
    ///
    /// Returns `SplitStreamData::Inline` with exactly `actual_size` bytes when
    /// the payload is stored inline, or the `SplitStreamData::External` chunk
    /// when it is stored by reference. In the external case the
    /// `stored_size - actual_size` padding bytes are consumed from the
    /// following inline chunk.
    ///
    /// A `stored_size` of zero yields an empty inline payload without touching
    /// the underlying stream.
    ///
    /// # Errors
    ///
    /// Fails if `stored_size < actual_size`, if the stream ends where a chunk
    /// is required, if the padding after an external chunk is not inline, or
    /// if the buffered inline chunk is shorter than the requested size.
    pub fn read_exact(&mut self, actual_size: usize, stored_size: usize) -> Result<SplitStreamData> {
        let padding_size = match stored_size.checked_sub(actual_size) {
            Some(n) => n,
            None => bail!(
                "Stored size {} is smaller than actual size {}",
                stored_size,
                actual_size
            ),
        };

        // An entry without payload never has a chunk of its own. Pulling the
        // next chunk here would fail at a clean end of stream, or hand back
        // the *following* entry's external reference.
        if stored_size == 0 {
            return Ok(SplitStreamData::Inline(Vec::new()));
        }

        if self.inline_content.is_empty() {
            match read_splitstream_chunk(&mut self.reader)? {
                None => bail!("Unexpected EOF"),
                Some(SplitStreamData::Inline(chunk)) => self.inline_content = chunk.into(),
                Some(ext) => {
                    if padding_size != 0 {
                        // need to eat the padding, which lives at the start of
                        // the next inline chunk
                        match read_splitstream_chunk(&mut self.reader)? {
                            None => bail!("bad eof"),
                            Some(SplitStreamData::Inline(chunk)) => {
                                self.inline_content = chunk.into()
                            }
                            Some(SplitStreamData::External(_)) => {
                                bail!("Expecting inline data but found external chunk")
                            }
                        }
                        // TODO: make this suck less
                        let mut padding = vec![0u8; padding_size];
                        self.inline_content.read_exact(&mut padding)?;
                    }

                    return Ok(ext);
                }
            }
        }

        // must be inline; check the length up front so a corrupt size field
        // cannot trigger a huge allocation
        if self.inline_content.len() < stored_size {
            bail!(
                "Inline chunk is too short: need {} bytes but only {} are buffered",
                stored_size,
                self.inline_content.len()
            );
        }
        let mut data = vec![0u8; stored_size];
        self.inline_content.read_exact(&mut data)?;
        data.truncate(actual_size);
        Ok(SplitStreamData::Inline(data))
    }
}

pub fn splitstream_merge<R: Read, W: Write, F: FnMut(Sha256HashValue) -> Result<Vec<u8>>>(
split_stream: &mut R, result: &mut W, mut load_data: F,
) -> Result<()> {
while let Some(size) = read_u64_le(split_stream)? {
if size == 0 {
let mut hash = Sha256HashValue::EMPTY;
split_stream.read_exact(&mut hash)?;
let data = load_data(hash)?;
result.write_all(&data)?;
} else {
let mut data = vec![0u8; size as usize]; // TODO: bzzt bzzt
split_stream.read_exact(&mut data)?;
result.write_all(&data)?;
while let Some(data) = read_splitstream_chunk(split_stream)? {
match data {
SplitStreamData::Inline(data) => result.write_all(&data)?,
SplitStreamData::External(id) => result.write_all(&load_data(id)?)?,
}
}

Expand All @@ -121,14 +191,10 @@ pub fn splitstream_merge<R: Read, W: Write, F: FnMut(Sha256HashValue) -> Result<
pub fn splitstream_objects<R: Read, F: FnMut(Sha256HashValue)>(
split_stream: &mut R, mut callback: F
) -> Result<()> {
while let Some(size) = read_u64_le(split_stream)? {
if size == 0 {
let mut hash = Sha256HashValue::EMPTY;
split_stream.read_exact(&mut hash)?;
callback(hash);
} else {
let mut discard = vec![0u8; size as usize]; // TODO: bzzt bzzt
split_stream.read_exact(&mut discard)?;
while let Some(data) = read_splitstream_chunk(split_stream)? {
match data {
SplitStreamData::Inline(_) => { /* no op */ },
SplitStreamData::External(id) => callback(id)
}
}

Expand Down
106 changes: 50 additions & 56 deletions src/tar.rs
Original file line number Diff line number Diff line change
@@ -1,64 +1,29 @@
use std::io::{Read, Write};

use anyhow::Result;
use tar::{
EntryType,
Header,
};

use crate::{
fsverity::Sha256HashValue,
splitstream::SplitStreamWriter,
splitstream::{
SplitStreamData,
SplitStreamReader,
SplitStreamWriter,
},
util::read_exactish,
};

struct TarHeader {
data: [u8; 512],
}

impl TarHeader {
// we can't use Read::read_exact() because we need to be able to detect EOF
fn read<R: Read>(reader: &mut R) -> Result<Option<TarHeader>> {
let mut header = TarHeader { data: [0u8; 512] };
if read_exactish(reader, &mut header.data)? {
Ok(Some(header))
} else {
Ok(None)
}
}

fn get_size(&self) -> usize {
let size_field = &self.data[124..124 + 12];
let mut value = 0usize;

if size_field[0] & 0x80 != 0 {
// binary representation
for byte in &size_field[4..12] {
value <<= 8;
value += *byte as usize;
}
} else {
// octal representation with nul terminator
for byte in size_field {
if *byte == b'\0' {
break;
} else {
// add octal digit value (no error checking)
value <<= 3;
value += (*byte - b'0') as usize;
}
}
}

// TODO: not too big, I hope...
value
}

fn get_storage_size(&self) -> usize {
// round up to nearest multiple of 512
(self.get_size() + 511) & !511
}

fn is_reg(&self) -> bool {
self.data[156] == b'0'
/// Reads one 512-byte tar header block from `reader`.
///
/// The header starts out as `Header::new_gnu()` and its raw bytes are then
/// filled in place by `read_exactish`. Returns `Ok(None)` when `read_exactish`
/// reports `false` (presumably a clean end of input; see `util::read_exactish`).
fn read_header<R: Read>(reader: &mut R) -> Result<Option<Header>> {
    let mut header = Header::new_gnu();
    let filled = read_exactish(reader, header.as_mut_bytes())?;
    Ok(if filled { Some(header) } else { None })
}
}

/// Splits the tar file from tar_stream into a Split Stream. The store_data function is
/// responsible for ensuring that "external data" is in the composefs repository and returns the
Expand All @@ -70,18 +35,22 @@ pub fn split<R: Read, W: Write, F: FnMut(&[u8]) -> Result<Sha256HashValue>>(
) -> Result<()> {
let mut writer = SplitStreamWriter::new(split_stream);

while let Some(header) = TarHeader::read(tar_stream)? {
while let Some(header) = read_header(tar_stream)? {
// the header always gets stored as inline data
writer.write_inline(&header.data);
writer.write_inline(header.as_bytes());

if header.as_bytes() == &[0u8; 512] {
continue;
}

// read the corresponding data, if there is any
let storage_size = header.get_storage_size();
let actual_size = header.entry_size()? as usize;
let storage_size = (actual_size + 511) & !511;
let mut buffer = vec![0u8; storage_size];
tar_stream.read_exact(&mut buffer)?;

if header.is_reg() && storage_size > 0 {
if header.entry_type() == EntryType::Regular && storage_size > 0 {
// non-empty regular file: store the data in the object store
let actual_size = header.get_size();
let padding = buffer.split_off(actual_size);
let reference = store_data(&buffer)?;
writer.write_reference(reference, padding)?;
Expand All @@ -94,3 +63,28 @@ pub fn split<R: Read, W: Write, F: FnMut(&[u8]) -> Result<Sha256HashValue>>(
// flush out any remaining inline data
writer.done()
}

pub fn ls<R: Read>(split_stream: &mut R) -> Result<()> {
let mut reader = SplitStreamReader::new(split_stream);

loop {
let mut buf = [0u8; 512];
if !reader.read_inline_exact(&mut buf)? {
return Ok(());
}

if buf == [0u8; 512] {
return Ok(());
}

let header = tar::Header::from_byte_slice(&buf);
let actual_size = header.size()? as usize;
let stored_size = (actual_size + 511) & !511;
println!("{:?}", header.path()?);
match reader.read_exact(actual_size, stored_size)? {
SplitStreamData::Inline(data) => println!("{} data bytes inline", data.len()),
SplitStreamData::External(id) => println!("ext {}", hex::encode(id))
}
println!();
}
}

0 comments on commit f4f9af0

Please sign in to comment.