gh-113257: Automatically generate pip SBOM metadata from wheel (#113295)

Co-authored-by: Hugo van Kemenade <hugovk@users.noreply.github.com>
This commit is contained in:
Seth Michael Larson 2023-12-20 11:28:20 -06:00 committed by GitHub
parent 11ee912327
commit b221e03010
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 107 additions and 11 deletions

2
Misc/sbom.spdx.json generated
View File

@ -1700,7 +1700,7 @@
"checksums": [
{
"algorithm": "SHA256",
"checksumValue": "7ccf472345f20d35bdc9d1841ff5f313260c2c33fe417f48c30ac46cccabf5be"
"checksumValue": "5052d7889c1f9d05224cd41741acb7c5d6fa735ab34e339624a614eaaa7e7d76"
}
],
"downloadLocation": "https://files.pythonhosted.org/packages/15/aa/3f4c7bcee2057a76562a5b33ecbd199be08cdb4443a02e26bd2c3cf6fc39/pip-23.3.2-py3-none-any.whl",

View File

@ -1,12 +1,16 @@
"""Tool for generating Software Bill of Materials (SBOM) for Python's dependencies"""
import os
import re
import hashlib
import json
import glob
import pathlib
import subprocess
import sys
import typing
from urllib.request import urlopen
CPYTHON_ROOT_DIR = pathlib.Path(__file__).parent.parent.parent
# Before adding a new entry to this list, double check that
# the license expression is a valid SPDX license expression:
@ -43,15 +47,14 @@ class PackageFiles(typing.NamedTuple):
# values to 'exclude' if we create new files within tracked
# directories that aren't sourced from third-party packages.
PACKAGE_TO_FILES = {
# NOTE: pip's entry in this structure is automatically generated in
# the 'discover_pip_sbom_package()' function below.
"mpdecimal": PackageFiles(
include=["Modules/_decimal/libmpdec/**"]
),
"expat": PackageFiles(
include=["Modules/expat/**"]
),
"pip": PackageFiles(
include=["Lib/ensurepip/_bundled/pip-23.3.2-py3-none-any.whl"]
),
"macholib": PackageFiles(
include=["Lib/ctypes/macholib/**"],
exclude=[
@ -106,13 +109,106 @@ def filter_gitignored_paths(paths: list[str]) -> list[str]:
return sorted([line.split()[-1] for line in git_check_ignore_lines if line.startswith("::")])
def discover_pip_sbom_package(sbom_data: dict[str, typing.Any]) -> None:
    """Regenerate pip's SBOM package entry from the bundled pip wheel.

    pip is a part of a packaging ecosystem (Python, surprise!) so it's actually
    automatable to discover the metadata we need like the version and checksums
    so let's do that on behalf of our friends at the PyPA.

    The function:

    * locates the single ``pip-*`` file in ``Lib/ensurepip/_bundled``,
    * registers that file under ``PACKAGE_TO_FILES["pip"]``,
    * computes the wheel's SHA-256 and compares it against the digest PyPI
      publishes for the artifact with the same filename,
    * replaces any existing ``"pip"`` entry in ``sbom_data["packages"]`` with
      a freshly generated one.

    ``sbom_data`` is modified in place; nothing is returned.

    The process exits with status 1 (after printing a message) when zero or
    several pip wheels are found, when PyPI can't be reached, when no artifact
    on PyPI has a matching filename, or when the checksums differ.
    """
    ensurepip_bundled_dir = CPYTHON_ROOT_DIR / "Lib/ensurepip/_bundled"

    # Find the hopefully one pip wheel in the bundled directory.
    pip_wheels = [
        wheel_filename
        for wheel_filename in os.listdir(ensurepip_bundled_dir)
        if wheel_filename.startswith("pip-")
    ]
    if len(pip_wheels) != 1:
        print("Zero or multiple pip wheels detected in 'Lib/ensurepip/_bundled'")
        sys.exit(1)
    pip_wheel_filename = pip_wheels[0]

    # Add the wheel filename to the list of files so the SBOM file
    # and relationship generator can work its magic on the wheel too.
    # This mutates the module-level dict in place, so no 'global'
    # declaration is required.
    PACKAGE_TO_FILES["pip"] = PackageFiles(
        include=[f"Lib/ensurepip/_bundled/{pip_wheel_filename}"]
    )

    # Wheel filename format puts the version right after the project name.
    pip_version = pip_wheel_filename.split("-")[1]
    pip_checksum_sha256 = hashlib.sha256(
        (ensurepip_bundled_dir / pip_wheel_filename).read_bytes()
    ).hexdigest()

    # Get pip's download location from PyPI. Check that the checksum is correct too.
    try:
        # Use the response as a context manager so the underlying
        # connection is closed even if reading or parsing fails.
        with urlopen(f"https://pypi.org/pypi/pip/{pip_version}/json") as response:
            raw_text = response.read()
        pip_release_metadata = json.loads(raw_text)
        url: dict[str, typing.Any]

        # Look for a matching artifact filename and then check
        # its remote checksum to the local one.
        for url in pip_release_metadata["urls"]:
            if url["filename"] == pip_wheel_filename:
                break
        else:
            raise ValueError(f"No matching filename on PyPI for '{pip_wheel_filename}'")
        if url["digests"]["sha256"] != pip_checksum_sha256:
            raise ValueError("Local pip checksum doesn't match artifact on PyPI")

        # Successfully found the download URL for the matching artifact.
        pip_download_url = url["url"]

    # Network failures surface as OSError subclasses; invalid JSON and the
    # mismatches raised above are ValueErrors.
    except (OSError, ValueError) as e:
        print(f"Couldn't fetch pip's metadata from PyPI: {e}")
        sys.exit(1)

    # Remove pip from the existing SBOM packages if it's there
    # and then overwrite its entry with our own generated one.
    sbom_data["packages"] = [
        sbom_package
        for sbom_package in sbom_data["packages"]
        if sbom_package["name"] != "pip"
    ]
    sbom_data["packages"].append(
        {
            "SPDXID": spdx_id("SPDXRef-PACKAGE-pip"),
            "name": "pip",
            "versionInfo": pip_version,
            "originator": "Organization: Python Packaging Authority",
            "licenseConcluded": "MIT",
            "downloadLocation": pip_download_url,
            "checksums": [
                {"algorithm": "SHA256", "checksumValue": pip_checksum_sha256}
            ],
            "externalRefs": [
                {
                    "referenceCategory": "SECURITY",
                    "referenceLocator": f"cpe:2.3:a:pypa:pip:{pip_version}:*:*:*:*:*:*:*",
                    "referenceType": "cpe23Type",
                },
                {
                    "referenceCategory": "PACKAGE_MANAGER",
                    "referenceLocator": f"pkg:pypi/pip@{pip_version}",
                    "referenceType": "purl",
                },
            ],
            "primaryPackagePurpose": "SOURCE",
        }
    )
def main() -> None:
root_dir = pathlib.Path(__file__).parent.parent.parent
sbom_path = root_dir / "Misc/sbom.spdx.json"
sbom_path = CPYTHON_ROOT_DIR / "Misc/sbom.spdx.json"
sbom_data = json.loads(sbom_path.read_bytes())
# Make a bunch of assertions about the SBOM data to ensure it's consistent.
# Insert pip's SBOM metadata from the wheel.
discover_pip_sbom_package(sbom_data)
# Ensure all packages in this tool are represented also in the SBOM file.
assert {package["name"] for package in sbom_data["packages"]} == set(PACKAGE_TO_FILES)
# Make a bunch of assertions about the SBOM data to ensure it's consistent.
for package in sbom_data["packages"]:
# Properties and ID must be properly formed.
@ -138,17 +234,17 @@ def main() -> None:
for include in sorted(files.include):
# Find all the paths and then filter them through .gitignore.
paths = glob.glob(include, root_dir=root_dir, recursive=True)
paths = glob.glob(include, root_dir=CPYTHON_ROOT_DIR, recursive=True)
paths = filter_gitignored_paths(paths)
assert paths, include # Make sure that every value returns something!
for path in paths:
# Skip directories and excluded files
if not (root_dir / path).is_file() or path in exclude:
if not (CPYTHON_ROOT_DIR / path).is_file() or path in exclude:
continue
# SPDX requires SHA1 to be used for files, but we provide SHA256 too.
data = (root_dir / path).read_bytes()
data = (CPYTHON_ROOT_DIR / path).read_bytes()
checksum_sha1 = hashlib.sha1(data).hexdigest()
checksum_sha256 = hashlib.sha256(data).hexdigest()