Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[23.2] Use python-isal for fast zip deflate compression in rocrate export #17342

Merged
merged 3 commits into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lib/galaxy/dependencies/pinned-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ idna==3.5 ; python_version >= "3.7" and python_version < "3.12"
importlib-metadata==4.13.0 ; python_version >= "3.7" and python_version < "3.12"
importlib-resources==5.12.0 ; python_version >= "3.7" and python_version < "3.12"
isa-rwval==0.10.10 ; python_version >= "3.7" and python_version < "3.12"
isal==1.3.0 ; python_version >= "3.7" and python_version < "3.12"
isodate==0.6.1 ; python_version >= "3.7" and python_version < "3.12"
jinja2==3.1.2 ; python_version >= "3.7" and python_version < "3.12"
jmespath==1.0.1 ; python_version >= "3.7" and python_version < "3.12"
Expand Down
2 changes: 1 addition & 1 deletion lib/galaxy/model/store/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2753,7 +2753,7 @@ def _finalize(self) -> None:
out_file = out_file_name[: -len(".zip")]
else:
out_file = out_file_name
rval = shutil.make_archive(out_file, "zip", self.export_directory)
rval = shutil.make_archive(out_file, "fastzip", self.export_directory)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this works only because this files also imports from galaxy.util.compression_utils import CompressedFile. Should we re-export shutil (maybe as gx_shutil) in lib/galaxy/util/compression_utils.py and import it in this file from there?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this works only because this files also imports from galaxy.util.compression_utils import CompressedFile

yes

Should we re-export shutil (maybe as gx_shutil) in lib/galaxy/util/compression_utils.py and import it in this file from there?

makes sense, sure

if not self.file_source_uri:
shutil.move(rval, self.out_file)
else:
Expand Down
88 changes: 88 additions & 0 deletions lib/galaxy/util/compression_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import io
import logging
import os
import shutil
import tarfile
import tempfile
import zipfile
Expand Down Expand Up @@ -30,6 +31,12 @@
is_gzip,
)

try:
from isal import isal_zlib
except ImportError:
isal_zlib = None # type: ignore[assignment]


log = logging.getLogger(__name__)

FileObjTypeStr = Union[IO[str], io.TextIOWrapper]
Expand Down Expand Up @@ -345,3 +352,84 @@ def zipfile_ok(self, path_to_archive: StrPath) -> bool:
if not member_path.startswith(basename):
return False
return True


class FastZipFile(zipfile.ZipFile):
"""
Simple wrapper around ZipFile that uses the default compression strategy of ISA-L
to write zip files. Ignores compresslevel and compresstype arguments, and is
3 to 4 times faster than the zlib implementation at the default compression level.
"""

def _open_to_write(self, *args, **kwargs): # type: ignore[no-untyped-def]
zwf = super()._open_to_write(*args, **kwargs) # type: ignore[misc]
if isal_zlib:
zwf._compressor = isal_zlib.compressobj(isal_zlib.ISAL_DEFAULT_COMPRESSION, isal_zlib.DEFLATED, -15, 9)
return zwf


# modified from shutil._make_zipfile
def make_fast_zipfile(
base_name: str,
base_dir: str,
verbose: int = 0,
dry_run: int = 0,
logger: Optional[logging.Logger] = None,
owner: Optional[str] = None,
group: Optional[str] = None,
root_dir: Optional[str] = None,
) -> str:
"""Create a zip file from all the files under 'base_dir'.

The output zip file will be named 'base_name' + ".zip". Returns the
name of the output zip file.
"""

zip_filename = base_name + ".zip"
archive_dir = os.path.dirname(base_name)

if archive_dir and not os.path.exists(archive_dir):
if logger is not None:
logger.info("creating %s", archive_dir)
if not dry_run:
os.makedirs(archive_dir)

if logger is not None:
logger.info("creating '%s' and adding '%s' to it", zip_filename, base_dir)

if not dry_run:
with FastZipFile(zip_filename, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
arcname = os.path.normpath(base_dir)
if root_dir is not None:
base_dir = os.path.join(root_dir, base_dir)
base_dir = os.path.normpath(base_dir)
if arcname != os.curdir:
zf.write(base_dir, arcname)
if logger is not None:
logger.info("adding '%s'", base_dir)
for dirpath, dirnames, filenames in os.walk(base_dir):
arcdirpath = dirpath
if root_dir is not None:
arcdirpath = os.path.relpath(arcdirpath, root_dir)
arcdirpath = os.path.normpath(arcdirpath)
for name in sorted(dirnames):
path = os.path.join(dirpath, name)
arcname = os.path.join(arcdirpath, name)
zf.write(path, arcname)
if logger is not None:
logger.info("adding '%s'", path)
for name in filenames:
path = os.path.join(dirpath, name)
path = os.path.normpath(path)
if os.path.isfile(path):
arcname = os.path.join(arcdirpath, name)
zf.write(path, arcname)
if logger is not None:
logger.info("adding '%s'", path)

if root_dir is not None:
zip_filename = os.path.abspath(zip_filename)
return zip_filename


shutil.register_archive_format("fastzip", make_fast_zipfile)
1 change: 1 addition & 0 deletions packages/data/setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ install_requires =
h5grove
h5py
isa-rwval
isal
MarkupSafe
msal
mrcfile
Expand Down
1 change: 1 addition & 0 deletions packages/files/setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ include_package_data = True
install_requires =
galaxy-util
fs
isal
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mvdbeek Do you remember why you added the isal dependency here as well?
I don't think it's currently used?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, why it's used in the files package ? I don't know, it should be enough to just have it be a dependency of data I thuok ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, exactly. I'll drop it in #18449 .

typing-extensions
packages = find:
python_requires = >=3.7
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ h5py = "*"
importlib-metadata = "<5" # Work around https://github.com/celery/kombu/issues/1600
importlib-resources = "*"
isa-rwval = ">=0.10.10"
isal = "*"
kombu = "*"
lagom = "*"
Mako = "*"
Expand Down
Loading