Skip to content

Commit

Permalink
Improve generation of base images SBOMs
Browse files Browse the repository at this point in the history
The process how the base image SBOM is generated was slightly changed.

The most important change is due to requiring different inputs. The
script now requires the full json of parsed Dockerfile, so we can
extract the base images better. This allows us to extract base images
data even for stage aliases.

Another advantage is that we are counting
the stages/layers more carefully, because we have more information about
which stage was from scratch or from oci-archive.

KFLUXBUGS-1718

Signed-off-by: mkosiarc <[email protected]>
  • Loading branch information
mkosiarc committed Dec 4, 2024
1 parent 377a181 commit 2177875
Show file tree
Hide file tree
Showing 3 changed files with 543 additions and 86 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@ This is a script that creates sbom data for base images. It is used in

It takes several inputs:
1. path to the sbom file, that will be updated in place with the base image data
2. path to a file containing base images as taken from from the dockerfile (with preserved order)
3. path to a file containing base images with digests, generated from the output of **buildah images --format '{{ .Name }}:{{ .Tag }}@{{ .Digest }}'**. The dockerfile order must be preserved as well
2. path to a json file containing parsed Dockerfile via dockerfile-json
3. path to a file containing base images references as used during in the Dockerfile mapped to the full image references
with digests. This mapping is expected to be in the format **<image-reference-used-in-dockerfile>|<full-image-reference-with-digest>**.
The full image reference with digest is generated from the output of **buildah images --format '{{ .Name }}:{{ .Tag }}@{{ .Digest }}'**.



Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,29 +33,56 @@ def parse_image_reference_to_parts(image):
return ParsedImage(repository=repository, digest=digest, name=name)


def get_base_images_sbom_components(base_images_digests, is_last_from_scratch):
def get_base_images_sbom_components(base_images, base_images_digests):
"""
Creates the base images sbom data
:param base_images_digests: (List) - list of base images digests, same as BASE_IMAGE_DIGESTS tekton result
:param is_last_from_scratch: (Boolean) - Is the last stage/base image from scratch?
:param base_images: (List) - List of base images used during build, in the order they were used. The values here
are the keys in the base_images_digests dict.
For example:
["registry.access.redhat.com/ubi8/ubi:latest"]
:param base_images_digests: (Dict) - Dict of base images references, where the key is the image reference as
used in the original Dockerfile (The elements of base_images param)
and the values are the full image reference with digests that was
actually used by buildah during build time.
For example:
{
"registry.access.redhat.com/ubi8/ubi:latest":
"registry.access.redhat.com/ubi8/ubi:latest@sha256:627867e53ad6846afba2dfbf5cef1d54c868a9025633ef0afd546278d4654eac"
}
:return: components (List) - List of dict items in which each item contains sbom data about each base image
"""

components = []
already_used_base_images = set()

# property_name shows whether the image was used only in the building process
# or if it is the final base image. If the final base image is scratch, then
# this is omitted, because we aren't including scratch in the sbom.
for index, image in enumerate(base_images_digests):
for index, image in enumerate(base_images):
# flatpak archive and scratch are not real base images. So we skip them, but
# in a way that allows us to keep the correct track of index variable that
# refers to stage number.
if image.startswith("oci-archive") or image == "scratch":
continue

# property_name shows whether the image was used only in the building process
# or if it is the final base image.
property_name = "konflux:container:is_builder_image:for_stage"
property_value = str(index)
if index == len(base_images_digests) - 1 and not is_last_from_scratch:

# This is not reached if the last "image" was scratch or oci-archive.
# That is because we don't consider them base images, and we aren't putting
# them in SBOM
if index == len(base_images) - 1:
property_name = "konflux:container:is_base_image"
property_value = "true"

parsed_image = parse_image_reference_to_parts(image)
# It could happen that we have a base image from the parsed Dockerfile, but we don't have
# a digest reference for it. This could happen when buildah skipped the stage, due to optimization
# when it is unreachable, or redundant. Since in this case, it was not used in the actual build,
# it is ok to just skip these stages
base_image_digest = base_images_digests.get(image)
if not base_image_digest:
continue
parsed_image = parse_image_reference_to_parts(base_image_digest)

purl = PackageURL(
type="oci",
Expand Down Expand Up @@ -87,23 +114,77 @@ def get_base_images_sbom_components(base_images_digests, is_last_from_scratch):
return components


def get_base_images_from_dockerfile(parsed_dockerfile):
"""
Reads the base images from provided parsed dockerfile
:param parsed_dockerfile: (Dict) - Contents of the parsed dockerfile
:return: base_images (List) - List of base images used during build as extracted
from the dockerfile in the order they were used.
Example:
If the Dockerfile looks like
FROM registry.access.redhat.com/ubi8/ubi:latest as builder
...
FROM builder
...
Then the relevant part of parsed_dockerfile look like
{
"Stages": [
{
"BaseName": "registry.access.redhat.com/ubi8/ubi:latest",
"As": "builder",
"From": {"Image": "registry.access.redhat.com/ubi8/ubi:latest"},
},
{
"BaseName": "builder",
"From": {"Stage": {"Named": "builder", "Index": 0}},
},
]
},
"""
base_images = []

# this part of the json is the relevant one that contains the
# info about base images
stages = parsed_dockerfile["Stages"]

for stage in stages:
if "Image" in stage["From"]:
base_images.append(stage["From"]["Image"])
elif "Scratch" in stage["From"]:
base_images.append("scratch")
elif "Stage" in stage["From"]:
stage_index = stage["From"]["Stage"]["Index"]
# Find the original stage/image. Named stage can refer to another named stage,
# so continue looking until we find the image those stages refer to.
while stage_index is not None and stage_index != {}:
refered_stage = stages[stage_index]
stage_index = refered_stage.get("From").get("Stage", {}).get("Index", None)
if stage_index is None or stage_index == {}:
base_images.append(refered_stage["From"]["Image"])

return base_images


def parse_args():
parser = argparse.ArgumentParser(
description="Updates the sbom file with base images data based on the provided files"
)
parser.add_argument("--sbom", type=pathlib.Path, help="Path to the sbom file", required=True)
parser.add_argument(
"--base-images-from-dockerfile",
"--parsed-dockerfile",
type=pathlib.Path,
help="Path to the file containing base images extracted from Dockerfile via grep, sed and awk in the buildah "
"task",
help="Path to the file containing parsed Dockerfile in json format extracted "
"from dockerfile-json in buildah task",
required=True,
)
parser.add_argument(
"--base-images-digests",
type=pathlib.Path,
help="Path to the file containing base images digests."
" This is taken from the BASE_IMAGES_DIGEST tekton result that was generated from"
" This is taken from the base_images_digests file that was generated from"
"the output of 'buildah images'",
required=True,
)
Expand All @@ -116,21 +197,25 @@ def main():

args = parse_args()

base_images_from_dockerfile = args.base_images_from_dockerfile.read_text().splitlines()
base_images_digests = args.base_images_digests.read_text().splitlines()
with args.parsed_dockerfile.open("r") as f:
parsed_dockerfile = json.load(f)

base_images = get_base_images_from_dockerfile(parsed_dockerfile)

is_last_from_scratch = False
if base_images_from_dockerfile[-1] == "scratch":
is_last_from_scratch = True
base_images_digests_raw = args.base_images_digests.read_text().splitlines()
base_images_digests = dict(item.split() for item in base_images_digests_raw)

with args.sbom.open("r") as f:
sbom = json.load(f)

base_images_sbom_components = get_base_images_sbom_components(base_images_digests, is_last_from_scratch)
if "formulation" in sbom:
sbom["formulation"].append({"components": base_images_sbom_components})
else:
sbom.update({"formulation": [{"components": base_images_sbom_components}]})
base_images_sbom_components = get_base_images_sbom_components(base_images, base_images_digests)

# base_images_sbom_components could be empty, when having just one stage FROM scratch
if base_images_sbom_components:
if "formulation" in sbom:
sbom["formulation"].append({"components": base_images_sbom_components})
else:
sbom.update({"formulation": [{"components": base_images_sbom_components}]})

with args.sbom.open("w") as f:
json.dump(sbom, f, indent=4)
Expand Down
Loading

0 comments on commit 2177875

Please sign in to comment.