From dfa0b3d4bebf19464d4ad33f12f5539122345eed Mon Sep 17 00:00:00 2001 From: dennisvang <29799340+dennisvang@users.noreply.github.com> Date: Tue, 21 Nov 2023 16:48:48 +0100 Subject: [PATCH] add tests for gzip header and reproducibility work in progres... --- src/tufup/common.py | 18 ++++++------ tests/test_common.py | 66 ++++++++++++++++++++++++++++++++++++-------- 2 files changed, 64 insertions(+), 20 deletions(-) diff --git a/src/tufup/common.py b/src/tufup/common.py index 1d95e0d..ac6c48d 100644 --- a/src/tufup/common.py +++ b/src/tufup/common.py @@ -2,7 +2,6 @@ import logging import pathlib import re -import shutil from tempfile import TemporaryDirectory from typing import Optional, Union @@ -147,19 +146,22 @@ def gzip( Supported kwargs, i.e. `compresslevel` and/or `mtime`, are passed on to `gzip.compress()` [5]. - Note that gzip includes filename and timestamp by default, which makes the - resulting file unreproducible. To fix this we need to do the equivalent of - `gzip --no-name` from GNU gzip [1]. Python's gzip package supports the - `mtime` argument to set the timestamp [2]. Also see SOURCE_DATE_EPOCH env - setting [3], [4] (not supported by Python's gzip, afaik). In addition, - we need to make sure the same algorithm is used, with the same compression - setting. + Note that gzip includes both *filename* and *timestamp* by default, + which makes the resulting files unreproducible. To fix this we need to do the + equivalent of `gzip --no-name` from GNU gzip [1]. Using `gzip.open()` or + using the `gzip.GzipFile` class will add the filename to the header, but this + can be prevented by using `gzip.compress()`, which also supports an `mtime` + argument to set the timestamp [2]. Also see SOURCE_DATE_EPOCH env setting [ + 3], [4] (not supported by Python's gzip, afaik). In addition, we need to make + sure the same algorithm is used, with the same compression setting. Also see + GZIP header definition in rfc1952 spec [6]. [1]: https://www.gnu.org/software/gzip/manual/gzip.html#Invoking-gzip [2]: https://docs.python.org/3/library/gzip.html#examples-of-usage [3]: https://reproducible-builds.org/docs/source-date-epoch/ [4]: https://www.gnu.org/software/gzip/manual/gzip.html#Environment [5]: https://docs.python.org/3/library/gzip.html#gzip.compress + [6]: https://datatracker.ietf.org/doc/html/rfc1952#page-5 """ if src_path.suffix == SUFFIX_GZIP: gzip_function = gzip.decompress diff --git a/tests/test_common.py b/tests/test_common.py index 728f09d..fe731c5 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -1,8 +1,9 @@ import gzip import logging import pathlib -import shutil +import struct import tarfile +import time import bsdiff4 from packaging.version import Version @@ -141,7 +142,19 @@ def setUp(self) -> None: self.file_paths = dict() self.tar_paths = dict() self.gz_paths = dict() - for key in ['old', 'new']: + # The gzip header contains an mtime field [1], and we need to make sure we + # can set this field properly. However, the resolution of os.stat.mtime + # depends on the operating system and file system, e.g. Windows/FAT32 has a 2 + # sec resolution [2], so to check for inequality of the *default* mtime in + # the gzip header, we would need to force a delay on the order of seconds in + # our tests. To work around this, we override the mtime field for test files. + # [1]: https://datatracker.ietf.org/doc/html/rfc1952#page-5 + # [2]: https://docs.python.org/3.12/library/os.html#os.stat_result + mtimes = dict( + old=time.time() - 100, # some arbitrary time in the past [seconds] + new=None, # i.e. just use the default mtime (current time) + ) + for key, mtime in mtimes.items(): # create dummy file file_path = self.temp_dir_path / key file_path.write_text(key) @@ -149,11 +162,9 @@ def setUp(self) -> None: tar_path = file_path.with_suffix('.tar') with tarfile.open(tar_path, 'w') as tar: tar.add(file_path) - # compress .tar file using gzip + # compress .tar file using gzip (without filename in header) gz_path = tar_path.with_suffix('.tar.gz') - with tar_path.open(mode='rb') as tar_file: - with gzip.open(gz_path, mode='wb') as gz_file: - shutil.copyfileobj(tar_file, gz_file) + gz_path.write_bytes(gzip.compress(data=tar_path.read_bytes(), mtime=mtime)) # keep reference self.file_paths[key] = file_path self.tar_paths[key] = tar_path @@ -164,6 +175,41 @@ def setUp(self) -> None: dst_bytes=self.tar_paths['new'].read_bytes(), ) + def test_gzip_header(self): + # see gzip header definition in RFC 1952 + # byte order: little endian + # https://datatracker.ietf.org/doc/html/rfc1952#page-4 + gzip_header_bytes = 10 # "basic" header size + # make dummy data + expected_mtime = 123 + gz_bytes = gzip.compress(data=b'dummy', mtime=expected_mtime) + # read basic header (variable names from RFC 1952) + (ID1, ID2, CM, FLG, MTIME, XFL, OS) = struct.unpack( + '