Skip to content

Commit

Permalink
feat: support deduplication in flatten
Browse files Browse the repository at this point in the history
  • Loading branch information
thesayyn committed Nov 27, 2024
1 parent 7b342e2 commit 883bea8
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 3 deletions.
1 change: 1 addition & 0 deletions distroless/private/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ load("@rules_java//java:defs.bzl", "java_binary")
# Scripts made visible outside this package. flatten.sh is the executable
# referenced by the `_flatten_sh` attribute of the flatten rule (flatten.bzl).
exports_files([
"cacerts.sh",
"locale.sh",
"flatten.sh",
])

java_binary(
Expand Down
20 changes: 18 additions & 2 deletions distroless/private/flatten.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,17 @@ def _flatten_impl(ctx):
output = ctx.actions.declare_file(ctx.attr.name + ext)

args = ctx.actions.args()
args.add(bsdtar.tarinfo.binary)
args.add(output if ctx.attr.deduplicate else "-")
args.add_all(tar_lib.DEFAULT_ARGS)
args.add("--create")
tar_lib.common.add_compression_args(ctx.attr.compress, args)
tar_lib.add_default_compression_args(ctx.attr.compress, args)
args.add("--file", output)
args.add("--file", "-" if ctx.attr.deduplicate else output)
args.add_all(ctx.files.tars, format_each = "@%s")

ctx.actions.run(
executable = bsdtar.tarinfo.binary,
executable = ctx.executable._flatten_sh,
inputs = ctx.files.tars,
outputs = [output],
tools = bsdtar.default.files,
Expand All @@ -39,10 +41,24 @@ flatten = rule(
allow_empty = False,
doc = "List of tars to flatten",
),
"deduplicate": attr.bool(doc = """\
EXPERIMENTAL: Remove duplicate entries from the archives after flattening.
This requires `awk`, `sort` and `tar` to be available in the PATH.
To support macOS, `gtar` is used when it is available, falling back to `tar` otherwise.
The selected binary must support the `--delete` mode, which GNU tar provides.
On macOS: `brew install gnu-tar` can be run to install gnutar.
See: https://formulae.brew.sh/formula/gnu-tar
NOTE: You may also need to run `sudo ln -s /opt/homebrew/bin/gtar /usr/local/bin/gtar` to make it available to Bazel.
""", default = False),
"compress": attr.string(
doc = "Compress the archive file with a supported algorithm.",
values = tar_lib.common.accepted_compression_types,
),
"_flatten_sh": attr.label(default = "//distroless/private:flatten.sh", executable = True, cfg = "exec", allow_single_file = True),
},
implementation = _flatten_impl,
toolchains = [tar_lib.TOOLCHAIN_TYPE],
Expand Down
49 changes: 49 additions & 0 deletions distroless/private/flatten.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env bash
# Wrapper executed by the flatten rule.
#
# Arguments:
#   $1   path to the bsdtar binary
#   $2   output archive path, or "-" when no deduplication is requested
#   $3+  arguments passed on to bsdtar
#
# Stop at the first failing command, including one inside a pipeline.
set -o errexit -o pipefail

bsdtar="$1"
output="$2"
shift 2

# Run tar with the given arguments.
#
# Prefers `gtar` (the name GNU tar is installed under by Homebrew on macOS)
# and falls back to `tar`. All arguments are forwarded unchanged.
# Exits with status 1 when neither command is on the PATH.
function run_gtar() {
    local tar_bin
    # `command -v` reports availability through its exit status; its output
    # (the resolved path) is not needed.
    if command -v gtar >/dev/null 2>&1; then
        tar_bin="gtar"
    elif command -v tar >/dev/null 2>&1; then
        tar_bin="tar"
    else
        # Diagnostics must go to stderr: the caller redirects this function's
        # stdout into the output archive, so a message printed on stdout would
        # be written into the tarball instead of being shown to the user.
        echo "Neither 'tar' nor 'gtar' command is available." >&2
        exit 1
    fi
    "$tar_bin" "$@"
}


# Deduplication requested, use this complex pipeline to deduplicate.
if [[ "$output" != "-" ]]; then
    mtree=$(mktemp)
    duplicates=$(mktemp)
    # Remove both scratch files on every exit path, including early exits
    # triggered by errexit/pipefail.
    trap 'rm -f "$mtree" "$duplicates"' EXIT

    # Collect an mtree listing (path and type only) of every input archive.
    # Inputs are the arguments of the form "@<archive>".
    for arg in "$@"; do
        if [[ "$arg" == "@"* ]]; then
            "$bsdtar" -cf - --format=mtree --options "mtree:!all,mtree:type" "$arg" >> "$mtree"
        fi
    done

    # Emit every path that is listed more than once, with its leading "./"
    # removed. Comment lines such as the "#mtree" header are skipped with a
    # pattern match; awk strings are 1-indexed, so probing index 0 with
    # substr() is not portable across awk implementations.
    # The list is de-duplicated and written in reverse sorted order.
    awk '
        /^#/ { next }
        {
            line_count[$1]++
            if (line_count[$1] > 1) {
                print substr($1, 3, length($1))
            }
        }
    ' "$mtree" | sort | uniq | sort -r > "$duplicates"

    # Stream the flattened archive from bsdtar and let tar drop the first
    # occurrence of each duplicated path. Quoting keeps arguments that contain
    # whitespace or glob characters intact.
    "$bsdtar" "$@" | run_gtar --delete --file - --occurrence=1 --files-from="$duplicates" > "$output"
else
    # No deduplication, business as usual
    "$bsdtar" "$@"
fi
3 changes: 2 additions & 1 deletion docs/rules.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

43 changes: 43 additions & 0 deletions examples/flatten/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,46 @@ assert_tar_listing(
./root time=0.0 mode=700 gid=0 uid=0 type=dir
""",
)

# Flatten with deduplication
#
# Three xz-compressed input archives with overlapping contents. source2 and
# source3 use the same glob, so every entry of one also appears in the other.
# source1 globs only the direct children of dir/ (presumably a subset of the
# other two; confirm against the dir/ tree).
tar(
name = "source1",
srcs = glob(["dir/*"]),
compress = "xz",
)

tar(
name = "source2",
srcs = glob(["dir/**/*"]),
compress = "xz",
)

tar(
name = "source3",
srcs = glob(["dir/**/*"]),
compress = "xz",
)

# Merge the three archives with the deduplicate attribute enabled.
flatten(
name = "flatten_dedup",
deduplicate = True,
tars = [
":source2",
":source1",
":source3",
],
)

# The expected listing contains each path exactly once, even though the
# inputs above overlap.
assert_tar_listing(
name = "test_flatten_dedup",
actual = "flatten_dedup",
expected = """\
#mtree
./examples time=1672560000.0 mode=755 gid=0 uid=0 type=dir
./examples/flatten time=1672560000.0 mode=755 gid=0 uid=0 type=dir
./examples/flatten/dir time=1672560000.0 mode=755 gid=0 uid=0 type=dir
./examples/flatten/dir/changelog time=1672560000.0 mode=755 gid=0 uid=0 type=file size=0
./examples/flatten/dir/sub time=1672560000.0 mode=755 gid=0 uid=0 type=dir
./examples/flatten/dir/sub/content.txt time=1672560000.0 mode=755 gid=0 uid=0 type=file size=0
""",
)

0 comments on commit 883bea8

Please sign in to comment.