Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve generation of base images SBOMs #191

Merged
merged 1 commit into from
Dec 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@ This is a script that creates sbom data for base images. It is used in

It takes several inputs:
1. path to the sbom file, that will be updated in place with the base image data
2. path to a file containing base images as taken from from the dockerfile (with preserved order)
3. path to a file containing base images with digests, generated from the output of **buildah images --format '{{ .Name }}:{{ .Tag }}@{{ .Digest }}'**. The dockerfile order must be preserved as well
2. path to a json file containing parsed Dockerfile via dockerfile-json
3. path to a file containing base images references as used during in the Dockerfile mapped to the full image references
with digests. This mapping is expected to be in the format **<image-reference-used-in-dockerfile>|<full-image-reference-with-digest>**.
The full image reference with digest is generated from the output of **buildah images --format '{{ .Name }}:{{ .Tag }}@{{ .Digest }}'**.



Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,29 +33,56 @@ def parse_image_reference_to_parts(image):
return ParsedImage(repository=repository, digest=digest, name=name)


def get_base_images_sbom_components(base_images_digests, is_last_from_scratch):
def get_base_images_sbom_components(base_images, base_images_digests):
"""
Creates the base images sbom data

:param base_images_digests: (List) - list of base images digests, same as BASE_IMAGE_DIGESTS tekton result
:param is_last_from_scratch: (Boolean) - Is the last stage/base image from scratch?
:param base_images: (List) - List of base images used during build, in the order they were used. The values here
are the keys in the base_images_digests dict.
For example:
["registry.access.redhat.com/ubi8/ubi:latest"]
:param base_images_digests: (Dict) - Dict of base images references, where the key is the image reference as
used in the original Dockerfile (The elements of base_images param)
and the values are the full image reference with digests that was
actually used by buildah during build time.
For example:
{
"registry.access.redhat.com/ubi8/ubi:latest":
"registry.access.redhat.com/ubi8/ubi:latest@sha256:627867e53ad6846afba2dfbf5cef1d54c868a9025633ef0afd546278d4654eac"
}
:return: components (List) - List of dict items in which each item contains sbom data about each base image
"""

components = []
already_used_base_images = set()

# property_name shows whether the image was used only in the building process
# or if it is the final base image. If the final base image is scratch, then
# this is omitted, because we aren't including scratch in the sbom.
for index, image in enumerate(base_images_digests):
for index, image in enumerate(base_images):
# flatpak archive and scratch are not real base images. So we skip them, but
# in a way that allows us to keep the correct track of index variable that
# refers to stage number.
if image.startswith("oci-archive") or image == "scratch":
chmeliik marked this conversation as resolved.
Show resolved Hide resolved
continue

# property_name shows whether the image was used only in the building process
# or if it is the final base image.
property_name = "konflux:container:is_builder_image:for_stage"
property_value = str(index)
if index == len(base_images_digests) - 1 and not is_last_from_scratch:

# This is not reached if the last "image" was scratch or oci-archive.
# That is because we don't consider them base images, and we aren't putting
# them in SBOM
if index == len(base_images) - 1:
property_name = "konflux:container:is_base_image"
property_value = "true"

parsed_image = parse_image_reference_to_parts(image)
# It could happen that we have a base image from the parsed Dockerfile, but we don't have
# a digest reference for it. This could happen when buildah skipped the stage, due to optimization
# when it is unreachable, or redundant. Since in this case, it was not used in the actual build,
# it is ok to just skip these stages
base_image_digest = base_images_digests.get(image)
if not base_image_digest:
continue
parsed_image = parse_image_reference_to_parts(base_image_digest)

purl = PackageURL(
type="oci",
Expand Down Expand Up @@ -87,23 +114,77 @@ def get_base_images_sbom_components(base_images_digests, is_last_from_scratch):
return components


def get_base_images_from_dockerfile(parsed_dockerfile):
"""
Reads the base images from provided parsed dockerfile

:param parsed_dockerfile: (Dict) - Contents of the parsed dockerfile
:return: base_images (List) - List of base images used during build as extracted
from the dockerfile in the order they were used.

Example:
If the Dockerfile looks like
FROM registry.access.redhat.com/ubi8/ubi:latest as builder
...
FROM builder
...

Then the relevant part of parsed_dockerfile look like
{
"Stages": [
{
"BaseName": "registry.access.redhat.com/ubi8/ubi:latest",
"As": "builder",
"From": {"Image": "registry.access.redhat.com/ubi8/ubi:latest"},
},
{
"BaseName": "builder",
"From": {"Stage": {"Named": "builder", "Index": 0}},
},
]
},
"""
base_images = []

# this part of the json is the relevant one that contains the
# info about base images
stages = parsed_dockerfile["Stages"]

for stage in stages:
if "Image" in stage["From"]:
base_images.append(stage["From"]["Image"])
elif "Scratch" in stage["From"]:
base_images.append("scratch")
elif "Stage" in stage["From"]:
stage_index = stage["From"]["Stage"]["Index"]
# Find the original stage/image. Named stage can refer to another named stage,
# so continue looking until we find the image those stages refer to.
while stage_index is not None:
refered_stage = stages[stage_index]
stage_index = refered_stage.get("From").get("Stage", {}).get("Index", None)
if stage_index is None:
base_images.append(refered_stage["From"]["Image"])

return base_images


def parse_args():
parser = argparse.ArgumentParser(
description="Updates the sbom file with base images data based on the provided files"
)
parser.add_argument("--sbom", type=pathlib.Path, help="Path to the sbom file", required=True)
parser.add_argument(
"--base-images-from-dockerfile",
"--parsed-dockerfile",
type=pathlib.Path,
help="Path to the file containing base images extracted from Dockerfile via grep, sed and awk in the buildah "
"task",
help="Path to the file containing parsed Dockerfile in json format extracted "
"from dockerfile-json in buildah task",
required=True,
)
parser.add_argument(
"--base-images-digests",
type=pathlib.Path,
help="Path to the file containing base images digests."
" This is taken from the BASE_IMAGES_DIGEST tekton result that was generated from"
" This is taken from the base_images_digests file that was generated from"
"the output of 'buildah images'",
required=True,
)
Expand All @@ -116,21 +197,25 @@ def main():

args = parse_args()

base_images_from_dockerfile = args.base_images_from_dockerfile.read_text().splitlines()
base_images_digests = args.base_images_digests.read_text().splitlines()
with args.parsed_dockerfile.open("r") as f:
parsed_dockerfile = json.load(f)

base_images = get_base_images_from_dockerfile(parsed_dockerfile)

is_last_from_scratch = False
if base_images_from_dockerfile[-1] == "scratch":
is_last_from_scratch = True
base_images_digests_raw = args.base_images_digests.read_text().splitlines()
base_images_digests = dict(item.split() for item in base_images_digests_raw)

with args.sbom.open("r") as f:
sbom = json.load(f)

base_images_sbom_components = get_base_images_sbom_components(base_images_digests, is_last_from_scratch)
if "formulation" in sbom:
sbom["formulation"].append({"components": base_images_sbom_components})
else:
sbom.update({"formulation": [{"components": base_images_sbom_components}]})
base_images_sbom_components = get_base_images_sbom_components(base_images, base_images_digests)

# base_images_sbom_components could be empty, when having just one stage FROM scratch
if base_images_sbom_components:
if "formulation" in sbom:
sbom["formulation"].append({"components": base_images_sbom_components})
else:
sbom.update({"formulation": [{"components": base_images_sbom_components}]})

with args.sbom.open("w") as f:
json.dump(sbom, f, indent=4)
Expand Down
Loading
Loading