Skip to content

Commit

Permalink
chunking: Bin packing algorithm which allows to minimize
Browse files Browse the repository at this point in the history
layer deltas using historical builds

Revamp basic_packing to follow the prior packing structure
if the --prior-build flag exists. This simply modifies existing
layers with upgrades/downgrades/removal of packages. The last layer
contains any new addition to packages.
In the case where --prior-build flag does not exist, the frequency
of updates of the packages (frequencyinfo) and size is utilized to
segment packages into different partitions (all combinations of
low, medium, high frequency and low, medium, high size). The partition
that each package falls into is decided by its deviation from mean.
Then the packages are allotted to different layers to ensure that
1) low frequency packages don't mix with high frequency packages
2) High sized packages are allotted separate bins
3) Low sized packages can be put together in the same bin
This problem is also known as the multi-objective bin packing problem with
constraints, or the multiple knapsack problem. The objectives conflict given our
constraints, and hence a compromise is made to minimize layer deltas
while respecting the hard limit of overlayfs that the kernel can handle.
  • Loading branch information
RishabhSaini committed May 15, 2023
1 parent 95f2366 commit 5159164
Show file tree
Hide file tree
Showing 11 changed files with 745 additions and 88 deletions.
630 changes: 564 additions & 66 deletions lib/src/chunking.rs

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lib/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -584,7 +584,7 @@ async fn container_export(
..Default::default()
};
let pushed =
crate::container::encapsulate(repo, rev, &config, Some(opts), None, imgref).await?;
crate::container::encapsulate(repo, rev, &config, None, Some(opts), None, imgref).await?;
println!("{}", pushed);
Ok(())
}
Expand Down
48 changes: 40 additions & 8 deletions lib/src/container/encapsulate.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//! APIs for creating container images from OSTree commits
use super::ocidir::{Layer, OciDir};
use super::{ocidir, OstreeImageReference, Transport};
use super::{ocidir, OstreeImageReference, Transport, CONTENT_ANNOTATION};
use super::{ImageReference, SignatureSource, OSTREE_COMMIT_LABEL};
use crate::chunking::{Chunk, Chunking, ObjectMetaSized};
use crate::container::skopeo;
Expand Down Expand Up @@ -104,7 +104,7 @@ fn export_chunks(
ociw: &mut OciDir,
chunks: Vec<Chunk>,
opts: &ExportOpts,
) -> Result<Vec<(Layer, String)>> {
) -> Result<Vec<(Layer, String, Vec<String>)>> {
chunks
.into_iter()
.enumerate()
Expand All @@ -113,7 +113,7 @@ fn export_chunks(
ostree_tar::export_chunk(repo, commit, chunk.content, &mut w)
.with_context(|| format!("Exporting chunk {i}"))?;
let w = w.into_inner()?;
Ok((w.complete()?, chunk.name))
Ok((w.complete()?, chunk.name, chunk.packages))
})
.collect()
}
Expand Down Expand Up @@ -151,11 +151,20 @@ fn export_chunked(
.clone();

// Add the ostree layer
ociw.push_layer(manifest, imgcfg, ostree_layer, description);
ociw.push_layer(manifest, imgcfg, ostree_layer, description, None);
// Add the component/content layers
for (layer, name) in layers {
ociw.push_layer(manifest, imgcfg, layer, name.as_str());
for (layer, name, packages) in layers {
let mut annotation_component_layer = HashMap::new();
annotation_component_layer.insert(CONTENT_ANNOTATION.to_string(), packages.join(","));
ociw.push_layer(
manifest,
imgcfg,
layer,
name.as_str(),
Some(annotation_component_layer),
);
}

// This label (mentioned above) points to the last layer that is part of
// the ostree commit.
labels.insert(
Expand All @@ -167,13 +176,15 @@ fn export_chunked(

/// Generate an OCI image from a given ostree root
#[context("Building oci")]
#[allow(clippy::too_many_arguments)]
fn build_oci(
repo: &ostree::Repo,
rev: &str,
ocidir_path: &Path,
tag: Option<&str>,
config: &Config,
opts: ExportOpts,
prior_build: Option<&oci_image::ImageManifest>,
contentmeta: Option<crate::chunking::ObjectMetaSized>,
) -> Result<ImageReference> {
if !ocidir_path.exists() {
Expand Down Expand Up @@ -209,7 +220,15 @@ fn build_oci(
let mut manifest = ocidir::new_empty_manifest().build().unwrap();

let chunking = contentmeta
.map(|meta| crate::chunking::Chunking::from_mapping(repo, commit, meta, opts.max_layers))
.map(|meta| {
crate::chunking::Chunking::from_mapping(
repo,
commit,
meta,
&opts.max_layers,
prior_build,
)
})
.transpose()?;
// If no chunking was provided, create a logical single chunk.
let chunking = chunking
Expand Down Expand Up @@ -291,6 +310,7 @@ async fn build_impl(
repo: &ostree::Repo,
ostree_ref: &str,
config: &Config,
prior_build: Option<&oci_image::ImageManifest>,
opts: Option<ExportOpts>,
contentmeta: Option<ObjectMetaSized>,
dest: &ImageReference,
Expand All @@ -308,6 +328,7 @@ async fn build_impl(
tag,
config,
opts,
prior_build,
contentmeta,
)?;
None
Expand All @@ -323,6 +344,7 @@ async fn build_impl(
None,
config,
opts,
prior_build,
contentmeta,
)?;

Expand Down Expand Up @@ -377,9 +399,19 @@ pub async fn encapsulate<S: AsRef<str>>(
repo: &ostree::Repo,
ostree_ref: S,
config: &Config,
prior_build: Option<&oci_image::ImageManifest>,
opts: Option<ExportOpts>,
contentmeta: Option<ObjectMetaSized>,
dest: &ImageReference,
) -> Result<String> {
build_impl(repo, ostree_ref.as_ref(), config, opts, contentmeta, dest).await
build_impl(
repo,
ostree_ref.as_ref(),
config,
prior_build,
opts,
contentmeta,
dest,
)
.await
}
4 changes: 4 additions & 0 deletions lib/src/container/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ use std::str::FromStr;
/// The label injected into a container image that contains the ostree commit SHA-256.
pub const OSTREE_COMMIT_LABEL: &str = "ostree.commit";

/// The name of an annotation attached to a layer which names the packages/components
/// which are part of it.
pub(crate) const CONTENT_ANNOTATION: &str = "ostree.components";

/// Our generic catchall fatal error, expected to be converted
/// to a string to output to a terminal or logs.
type Result<T> = anyhow::Result<T>;
Expand Down
5 changes: 3 additions & 2 deletions lib/src/container/ocidir.rs
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,8 @@ impl OciDir {
config: &mut oci_image::ImageConfiguration,
layer: Layer,
description: &str,
annotations: Option<HashMap<String, String>>,
) {
let annotations: Option<HashMap<String, String>> = None;
self.push_layer_annotated(manifest, config, layer, annotations, description);
}

Expand Down Expand Up @@ -531,7 +531,8 @@ mod tests {
let mut config = oci_image::ImageConfigurationBuilder::default()
.build()
.unwrap();
w.push_layer(&mut manifest, &mut config, root_layer, "root");
let annotations: Option<HashMap<String, String>> = None;
w.push_layer(&mut manifest, &mut config, root_layer, "root", annotations);
let config = w.write_config(config)?;
manifest.set_config(config);
w.replace_with_single_manifest(manifest.clone(), oci_image::Platform::default())?;
Expand Down
11 changes: 9 additions & 2 deletions lib/src/fixture.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,9 @@ d tmp
"## };
pub const CONTENTS_CHECKSUM_V0: &str =
"5e41de82f9f861fa51e53ce6dd640a260e4fb29b7657f5a3f14157e93d2c0659";
pub static CONTENTS_V0_LEN: Lazy<usize> = Lazy::new(|| OWNERS.len().checked_sub(1).unwrap());
// 1 for ostree commit, 2 for max frequency packages, 3 as empty layer
pub const LAYERS_V0_LEN: usize = 3usize;
pub const PKGS_V0_LEN: usize = 7usize;

#[derive(Debug, PartialEq, Eq)]
enum SeLabel {
Expand Down Expand Up @@ -317,6 +319,7 @@ fn build_mapping_recurse(
name: Rc::clone(&owner),
srcid: Rc::clone(&owner),
change_time_offset: u32::MAX,
change_frequency: u32::MAX,
});
}

Expand Down Expand Up @@ -661,11 +664,15 @@ impl Fixture {
let contentmeta = self.get_object_meta().context("Computing object meta")?;
let contentmeta = ObjectMetaSized::compute_sizes(self.srcrepo(), contentmeta)
.context("Computing sizes")?;
let opts = ExportOpts::default();
let opts = ExportOpts {
max_layers: std::num::NonZeroU32::new(PKGS_V0_LEN as u32),
..Default::default()
};
let digest = crate::container::encapsulate(
self.srcrepo(),
self.testref(),
&config,
None,
Some(opts),
Some(contentmeta),
&imgref,
Expand Down
Binary file modified lib/src/fixtures/fedora-coreos-contentmeta.json.gz
Binary file not shown.
1 change: 1 addition & 0 deletions lib/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ pub mod objectsource;
pub(crate) mod objgv;
#[cfg(feature = "internal-testing-api")]
pub mod ostree_manual;
pub(crate) mod statistics;

mod utils;

Expand Down
6 changes: 3 additions & 3 deletions lib/src/objectsource.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,7 @@ pub struct ObjectSourceMeta {
/// Unique identifier, does not need to be human readable, but can be.
#[serde(with = "rcstr_serialize")]
pub identifier: ContentID,
/// Identifier for this source (e.g. package name-version, git repo).
/// Unlike the [`ContentID`], this should be human readable. It likely comes from an external source,
/// and may be re-serialized.
/// Just the name of the package (no version), needs to be human readable.
#[serde(with = "rcstr_serialize")]
pub name: Rc<str>,
/// Identifier for the *source* of this content; for example, if multiple binary
Expand All @@ -54,6 +52,8 @@ pub struct ObjectSourceMeta {
/// One suggested way to generate this number is to have it be in units of hours or days
/// since the earliest changed item.
pub change_time_offset: u32,
/// Change frequency
pub change_frequency: u32,
}

impl PartialEq for ObjectSourceMeta {
Expand Down
109 changes: 109 additions & 0 deletions lib/src/statistics.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
//! This module holds implementations of some basic statistical properties, such as mean and standard deviation.
/// Returns the arithmetic mean of `data`, or `None` when `data` is empty.
///
/// The sum is accumulated in `u128`, so adding up many large `u64` values
/// cannot overflow; the result is only converted to `f64` at the very end.
pub(crate) fn mean(data: &[u64]) -> Option<f64> {
    if data.is_empty() {
        return None;
    }
    // Widening before summing avoids the overflow a plain `u64` sum would hit
    // (a panic in debug builds, a silent wrap-around in release builds).
    let total: u128 = data.iter().map(|&value| u128::from(value)).sum();
    Some(total as f64 / data.len() as f64)
}

/// Returns the population standard deviation of `data` (the squared
/// differences from the mean are divided by the element count), or `None`
/// when `data` is empty.
pub(crate) fn std_deviation(data: &[u64]) -> Option<f64> {
    // `mean` returns `None` exactly when `data` is empty, so `?` covers the
    // empty case and guarantees a non-zero divisor below.
    let center = mean(data)?;
    let squared_error_sum: f64 = data
        .iter()
        .map(|&sample| {
            let delta = center - sample as f64;
            delta * delta
        })
        .sum();
    Some((squared_error_sum / data.len() as f64).sqrt())
}

/// Returns `(median, MAD)` for `data`, where MAD is the median of the absolute
/// deviations of each element from the median. Returns `None` when `data` is
/// empty.
///
/// `data` is assumed to be already sorted: the median is read directly from
/// the middle position(s) and this function does not sort or otherwise modify
/// the slice.
pub(crate) fn median_absolute_deviation(data: &mut [u64]) -> Option<(f64, f64)> {
    if data.is_empty() {
        return None;
    }

    let len = data.len();
    let mid = len / 2;
    // Median of the (pre-sorted) input. For an even length the two middle
    // values are added in `u128` so that large `u64` values cannot overflow.
    let median: f64 = if len % 2 == 1 {
        data[mid] as f64
    } else {
        0.5 * (u128::from(data[mid - 1]) + u128::from(data[mid])) as f64
    };

    // Absolute distance of every element from the median, sorted ascending so
    // that its own median can be read from the middle position(s).
    let mut deviations: Vec<f64> = data
        .iter()
        .map(|&value| (value as f64 - median).abs())
        .collect();
    deviations.sort_unstable_by(|a, b| {
        a.partial_cmp(b)
            .expect("deviations are derived from integers and are never NaN")
    });

    let mad: f64 = if len % 2 == 1 {
        deviations[mid]
    } else {
        0.5 * (deviations[mid - 1] + deviations[mid])
    };

    Some((median, mad))
}

#[test]
fn test_mean() {
    // An empty slice has no mean.
    assert_eq!(mean(&[]), None);

    // The mean of a single element is that element.
    for single in [0u64, 1, 5, 100] {
        assert_eq!(mean(&[single]), Some(single as f64));
    }

    // Multi-element inputs paired with their expected mean.
    let cases: [(&[u64], f64); 3] = [
        (&[0, 1], 0.5),
        (&[0, 5, 100], 35.0),
        (&[7, 4, 30, 14], 13.75),
    ];
    for (input, expected) in cases {
        assert_eq!(mean(input), Some(expected));
    }
}

#[test]
fn test_std_deviation() {
    // An empty slice has no standard deviation.
    assert_eq!(std_deviation(&[]), None);

    // A single element never deviates from its own mean.
    for single in [0u64, 1, 5, 100] {
        assert_eq!(std_deviation(&[single]), Some(0.0));
    }

    // Multi-element inputs paired with their expected population deviation.
    let cases: [(&[u64], f64); 3] = [
        (&[1, 4], 1.5),
        (&[2, 2, 2, 2], 0.0),
        (
            &[1, 20, 300, 4000, 50000, 600000, 7000000, 80000000],
            26193874.56387471,
        ),
    ];
    for (input, expected) in cases {
        assert_eq!(std_deviation(input), Some(expected));
    }
}

#[test]
fn test_median_absolute_deviation() {
    // The function under test assumes its input is sorted.
    assert_eq!(median_absolute_deviation(&mut []), None);

    // A single element is its own median and has zero deviation.
    for single in [0u64, 1, 5, 100] {
        let mut input = [single];
        assert_eq!(
            median_absolute_deviation(&mut input),
            Some((single as f64, 0.0))
        );
    }

    let mut pair = [1, 4];
    assert_eq!(median_absolute_deviation(&mut pair), Some((2.5, 1.5)));

    let mut constant = [2, 2, 2, 2];
    assert_eq!(median_absolute_deviation(&mut constant), Some((2.0, 0.0)));

    let mut with_outliers = [
        1, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 9, 12, 52, 90,
    ];
    assert_eq!(
        median_absolute_deviation(&mut with_outliers),
        Some((6.0, 2.0))
    );

    // If more than half of the data has the same value, MAD = 0, thus any
    // value different from the residual median is classified as an outlier.
    let mut mostly_ones = [0, 1, 1, 1, 1, 1, 1, 1, 0];
    assert_eq!(
        median_absolute_deviation(&mut mostly_ones),
        Some((1.0, 0.0))
    );
}
Loading

0 comments on commit 5159164

Please sign in to comment.