diff --git a/pyxis/test_upload_rpm_data.py b/pyxis/test_upload_rpm_data.py index 56c7aae..ed7e1e4 100644 --- a/pyxis/test_upload_rpm_data.py +++ b/pyxis/test_upload_rpm_data.py @@ -10,6 +10,7 @@ update_container_content_sets, load_sbom_packages, construct_rpm_items_and_content_sets, + get_purl_type, ) GRAPHQL_API = "myapiurl" @@ -458,3 +459,25 @@ def test_construct_rpm_items_and_content_sets__no_packages_result_in_empty_list( assert rpms == [] assert content_sets == [] + + +def test_get_purl_type__rpm(): + purl = ( + "pkg:rpm/rhel/acl@2.3.1-4.el9?arch=x86_64&upstream=acl-2.3.1-4.el9.src.rpm" + "&distro=rhel-9.4&repository_id=myrepo3" + ) + + type = get_purl_type(purl) + + assert type == "rpm" + + +def test_get_purl_type__invalid_docker(): + """This is an invalid purl that packageurl.PackageURL.from_string() would fail on, + but we can still get the type successfully. + """ + purl = "pkg:github/docker:/#docker.mirror.hashicorp.services/rhysd/actionlint:latest" + + type = get_purl_type(purl) + + assert type == "github" diff --git a/pyxis/test_upload_rpm_data_cyclonedx.py b/pyxis/test_upload_rpm_data_cyclonedx.py index bdc4de4..a670aed 100644 --- a/pyxis/test_upload_rpm_data_cyclonedx.py +++ b/pyxis/test_upload_rpm_data_cyclonedx.py @@ -11,6 +11,7 @@ load_sbom_components, check_bom_ref_duplicates, construct_rpm_items_and_content_sets, + get_purl_type, ) GRAPHQL_API = "myapiurl" @@ -435,3 +436,25 @@ def test_construct_rpm_items_and_content_sets__no_components_result_in_empty_lis assert rpms == [] assert content_sets == [] + + +def test_get_purl_type__rpm(): + purl = ( + "pkg:rpm/rhel/acl@2.3.1-4.el9?arch=x86_64&upstream=acl-2.3.1-4.el9.src.rpm" + "&distro=rhel-9.4&repository_id=myrepo3" + ) + + type = get_purl_type(purl) + + assert type == "rpm" + + +def test_get_purl_type__invalid_docker(): + """This is an invalid purl that packageurl.PackageURL.from_string() would fail on, + but we can still get the type successfully. + """ + purl = "pkg:github/docker:/#docker.mirror.hashicorp.services/rhysd/actionlint:latest" + + type = get_purl_type(purl) + + assert type == "github" diff --git a/pyxis/upload_rpm_data.py b/pyxis/upload_rpm_data.py index 6a4e216..e246163 100755 --- a/pyxis/upload_rpm_data.py +++ b/pyxis/upload_rpm_data.py @@ -235,9 +235,10 @@ def construct_rpm_items_and_content_sets( for externalRef in package.get("externalRefs", []): if externalRef.get("referenceType") != "purl": continue - purl_dict = PackageURL.from_string(externalRef["referenceLocator"]).to_dict() - if purl_dict["type"] != "rpm": + type = get_purl_type(externalRef["referenceLocator"]) + if type != "rpm": continue + purl_dict = PackageURL.from_string(externalRef["referenceLocator"]).to_dict() if purl_dict["name"] in IGNORED_PACKAGES: continue rpm_item = { @@ -271,6 +272,34 @@ def construct_rpm_items_and_content_sets( return rpms_items, sorted(content_sets) +def get_purl_type(purl: str): + """ + Return purl type parsed from a purl string. + + Copied and adapted from packageurl package. The reason we need this function + and cannot simply use the type component of + packageurl.PackageURL.from_string(purl) is that there can be invalid non-rpm + purls generated by syft. By getting just the type first and skipping those + purls, we avoid failing on those invalid purls. + + Raise ValueError on errors. + """ + scheme, sep, remainder = purl.partition(":") + if not sep or scheme != "pkg": + raise ValueError(f'purl is missing the required "pkg" scheme component: {repr(purl)}.') + + # this strip '/, // and /// as possible in :// or :/// + remainder = remainder.strip().lstrip("/") + + type, sep, remainder = remainder.partition("/") # NOQA + if not type or not sep: + raise ValueError(f"purl is missing the required type component: {repr(purl)}.") + + type = type.lower() + + return type + + def main(): # pragma: no cover """Main func""" args = parse_arguments() diff --git a/pyxis/upload_rpm_data_cyclonedx.py b/pyxis/upload_rpm_data_cyclonedx.py index 5c33fc0..9edca68 100755 --- a/pyxis/upload_rpm_data_cyclonedx.py +++ b/pyxis/upload_rpm_data_cyclonedx.py @@ -257,8 +257,9 @@ def construct_rpm_items_and_content_sets( content_sets = set() for component in components: if "purl" in component: - purl_dict = PackageURL.from_string(component["purl"]).to_dict() - if purl_dict["type"] == "rpm": + type = get_purl_type(component["purl"]) + if type == "rpm": + purl_dict = PackageURL.from_string(component["purl"]).to_dict() if purl_dict["name"] in IGNORED_PACKAGES: continue rpm_item = { @@ -292,6 +293,34 @@ def construct_rpm_items_and_content_sets( return rpms_items, sorted(content_sets) +def get_purl_type(purl: str): + """ + Return purl type parsed from a purl string. + + Copied and adapted from packageurl package. The reason we need this function + and cannot simply use the type component of + packageurl.PackageURL.from_string(purl) is that there can be invalid non-rpm + purls generated by syft. By getting just the type first and skipping those + purls, we avoid failing on those invalid purls. + + Raise ValueError on errors. + """ + scheme, sep, remainder = purl.partition(":") + if not sep or scheme != "pkg": + raise ValueError(f'purl is missing the required "pkg" scheme component: {repr(purl)}.') + + # this strip '/, // and /// as possible in :// or :/// + remainder = remainder.strip().lstrip("/") + + type, sep, remainder = remainder.partition("/") # NOQA + if not type or not sep: + raise ValueError(f"purl is missing the required type component: {repr(purl)}.") + + type = type.lower() + + return type + + def main(): # pragma: no cover """Main func""" args = parse_arguments()