From 0b01c9a6e876e87dbd1847903353f9ccb3cdca11 Mon Sep 17 00:00:00 2001 From: mxmlnkn Date: Sun, 22 Sep 2024 21:43:02 +0200 Subject: [PATCH] [feature] Add support for fsspec backends --- .github/workflows/tests.yml | 48 ++- AppImage/build-ratarmount-appimage.sh | 13 + README.md | 119 ++++-- core/pyproject.toml | 44 ++ core/ratarmountcore/FSSpecMountSource.py | 261 ++++++++++++ core/ratarmountcore/SQLiteIndex.py | 16 +- core/ratarmountcore/SQLiteIndexedTar.py | 16 +- core/ratarmountcore/compressions.py | 23 +- core/ratarmountcore/factory.py | 125 ++++++ pyproject.toml | 1 + ratarmount.py | 35 +- tests/.pylintrc | 4 +- tests/ratarmount-help.txt | 6 + tests/requirements-tests.txt | 34 +- tests/runtests.sh | 496 ++++++++++++++++++++++- tests/start-asyncssh-server.py | 23 ++ 16 files changed, 1191 insertions(+), 73 deletions(-) create mode 100644 core/ratarmountcore/FSSpecMountSource.py create mode 100644 tests/start-asyncssh-server.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 92d37a7a..85eed169 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -21,7 +21,7 @@ jobs: - name: Install pip Dependencies run: | python3 -m pip install --upgrade pip - python3 -m pip install --user fusepy pytest lz4 PySquashfsImage + python3 -m pip install --user fusepy pytest lz4 PySquashfsImage asyncssh - name: Style Check With Black run: | @@ -35,7 +35,8 @@ jobs: - name: Lint With Codespell run: | python3 -m pip install codespell - codespell --ignore-words-list fo,Nd,unx $( git ls-tree -r --name-only HEAD | 'grep' -E '[.](py|md|txt|sh|yml)' ) + # fsspec uses cachable instead of cacheable ... + codespell --ignore-words-list fo,Nd,unx,cachable $( git ls-tree -r --name-only HEAD | 'grep' -E '[.](py|md|txt|sh|yml)' ) - name: Lint With Flake8 run: | @@ -98,6 +99,14 @@ jobs: steps: - uses: actions/checkout@v4 + with: + # We need one tag for testing the git mount. + # This is BROKEN! God damn it. Is anything working at all... 
+ # https://github.com/actions/checkout/issues/1781 + fetch-tags: true + + - name: Fetch tag for tests + run: git fetch origin refs/tags/v0.15.2:refs/tags/v0.15.2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 @@ -123,8 +132,21 @@ jobs: # zstd, may also call external binaries depending on how libarchive was compiled! # https://github.com/libarchive/libarchive/blob/ad5a0b542c027883d7069f6844045e6788c7d70c/libarchive/ # archive_read_support_filter_lrzip.c#L68 - sudo apt-get -y install libfuse2 fuse3 bzip2 pbzip2 pixz zstd unar lrzip lzop gcc liblzo2-dev - set -x + sudo apt-get -y install libfuse2 fuse3 bzip2 pbzip2 pixz zstd unar lrzip lzop gcc liblzo2-dev ruby-webrick + + - name: Install Dependencies For Unreleased Python Versions (Linux) + if: > + startsWith( matrix.os, 'ubuntu' ) && ( + matrix.python-version == '3.13.0-rc.3' || + matrix.python-version == '3.14.0-alpha.0') + run: | + #libgit2-dev is too old on Ubuntu 22.04. Leads to error about missing git2/sys/errors.h + #sudo apt-get -y install libgit2-dev + sudo apt-get -y install cmake + git clone --branch v1.7.2 --depth 1 https://github.com/libgit2/libgit2.git + ( cd libgit2 && mkdir build && cd build && cmake .. && cmake --build . && sudo cmake --build . 
-- install ) + echo "PATH=$PATH:/usr/local/bin" >> "$GITHUB_ENV" + echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib" >> "$GITHUB_ENV" - name: Install Dependencies (MacOS) if: startsWith( matrix.os, 'macos' ) @@ -137,7 +159,16 @@ jobs: # TypeError: 'NoneType' object is not iterable brew install macfuse coreutils pixz pbzip2 zstd unar libarchive lrzip lzop lzo # Add brew installation binary folder to PATH so that command line tools like zstd can be found - export PATH="$PATH:/usr/local/bin" + echo PATH="$PATH:/usr/local/bin" >> "$GITHUB_ENV" + + - name: Install Dependencies For Unreleased Python Versions (MacOS) + if: > + startsWith( matrix.os, 'macos' ) && ( + matrix.python-version == '3.13.0-rc.3' || + matrix.python-version == '3.14.0-alpha.0') + run: | + brew install libgit2@1.7 + brew link libgit2@1.7 --force - name: Install pip Dependencies run: | @@ -203,6 +234,13 @@ jobs: # Segfaults (139) are not allowed but other exit codes are valid! python3 ratarmount.py tests/simple.bz2 || [ $? != 139 ] + - name: Install pip Test Dependencies + run: | + python3 -m pip install -r tests/requirements-tests.txt + # Explicitly install pygit2 even on Python 3.13+ because we have set up libgit2 manually. + python3 -m pip install pygit2 + python3 -c 'import pygit2' + - name: Unit Tests run: | python3 -m pip install pytest pytest-xdist diff --git a/AppImage/build-ratarmount-appimage.sh b/AppImage/build-ratarmount-appimage.sh index 362fe2a4..d30d43c7 100644 --- a/AppImage/build-ratarmount-appimage.sh +++ b/AppImage/build-ratarmount-appimage.sh @@ -62,6 +62,19 @@ function installAppImagePythonPackages() fi "$APP_PYTHON_BIN" -I -m pip install --no-cache-dir ../core "$APP_PYTHON_BIN" -I -m pip install --no-cache-dir ..[full] + + # These lines are only to document the individual package sizes. They are all installed with [full] above. + # ratarmount-0.10.0-manylinux2014_x86_64.AppImage (the first one!) 
was 13.6 MB + # ratarmount-v0.11.3-manylinux2014_x86_64.AppImage was 13.6 MB + # ratarmount-0.12.0-manylinux2014_x86_64.AppImage was 26.3 MB thanks to an error with the trim-down script. + # ratarmount-0.15.0-x86_64.AppImage was 14.8 MB + # ratarmount-0.15.1-x86_64.AppImage was 13.3 MB (manylinux_2014) + # ratarmount-0.15.2-x86_64.AppImage was 11.7 MB (manylinux_2_28) + # At this point, with pyfatfs, the AppImage is/was 13.0 MB. Extracts to 45.1 MB + # This bloats the AppImage to 23.7 MB, which is still ok, I guess. Extracts to 83.1 MB + # "$APP_PYTHON_BIN" -I -m pip install --no-cache-dir requests aiohttp sshfs smbprotocol pygit2<1.15 fsspec + # This bloats the AppImage to 38.5 MB :/. Extracts to 121.0 MB + # "$APP_PYTHON_BIN" -I -m pip install --no-cache-dir s3fs gcsfs adlfs dropboxdrivefs } function installAppImageSystemLibraries() diff --git a/README.md b/README.md index 04da0242..9d958732 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ And in contrast to [tarindexer](https://github.com/devsnd/tarindexer), which als *Capabilities:* + - **Random Access:** Care was taken to achieve fast random access inside compressed streams for bzip2, gzip, xz, and zstd and inside TAR files by building indices containing seek points. - **Highly Parallelized:** By default, all cores are used for parallelized algorithms like for the gzip, bzip2, and xz decoders. This can yield huge speedups on most modern processors but requires more main memory. It can be controlled or completely turned off using the `-P ` option. @@ -34,42 +35,11 @@ And in contrast to [tarindexer](https://github.com/devsnd/tarindexer), which als - **Union Mounting:** Multiple TARs, compressed files, and bind mounted folders can be mounted under the same mountpoint. - **Write Overlay:** A folder can be specified as write overlay. All changes below the mountpoint will be redirected to this folder and deletions are tracked so that all changes can be applied back to the archive. 
+ - **Remote Files and Folders:** A remote archive or whole folder structure can be mounted similar to tools like [sshfs](https://github.com/libfuse/sshfs) thanks to the [filesystem_spec](https://github.com/fsspec/filesystem_spec) project. + These can be specified with URIs as explained in the section ["Remote Files"](#remote-files). + Supported remote protocols include: FTP, HTTP, HTTPS, SFTP, [SSH](https://github.com/fsspec/sshfs), Git, Github, [S3](https://github.com/fsspec/s3fs), Samba [v2 and v3](https://github.com/jborean93/smbprotocol), Dropbox, ... Many of these are very experimental and may be slow. Please open a feature request if further backends are desired. -*TAR compressions supported for random access:* - - - **BZip2** as provided by [indexed_bzip2](https://github.com/mxmlnkn/indexed_bzip2) as a backend, which is a refactored and extended version of [bzcat](https://github.com/landley/toybox/blob/c77b66455762f42bb824c1aa8cc60e7f4d44bdab/toys/other/bzcat.c) from [toybox](https://landley.net/code/toybox/). See also the [reverse engineered specification](https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf). - - **Gzip** and **Zlib** as provided by [rapidgzip](https://github.com/mxmlnkn/rapidgzip) or [indexed_gzip](https://github.com/pauldmccarthy/indexed_gzip) by Paul McCarthy. See also [RFC1952](https://tools.ietf.org/html/rfc1952) and [RFC1950](https://tools.ietf.org/html/rfc1950). - - **Xz** as provided by [python-xz](https://github.com/Rogdham/python-xz) by Rogdham or [lzmaffi](https://github.com/r3m0t/backports.lzma) by Tomer Chachamu. See also [The .xz File Format](https://tukaani.org/xz/xz-file-format.txt). - - **Zstd** as provided by [indexed_zstd](https://github.com/martinellimarco/indexed_zstd) by Marco Martinelli. See also [Zstandard Compression Format](https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md). 
- -*Other supported archive formats:* - - - **Rar** as provided by [rarfile](https://github.com/markokr/rarfile) by Marko Kreen. See also the [RAR 5.0 archive format](https://www.rarlab.com/technote.htm). - - **SquashFS, AppImage, Snap** as provided by [PySquashfsImage](https://github.com/matteomattei/PySquashfsImage) by Matteo Mattei. There seems to be no authoritative, open format specification, only [this nicely-done reverse-engineered description](https://dr-emann.github.io/squashfs/squashfs.html), I assume based on the [source code](https://github.com/plougher/squashfs-tools). Note that [Snaps](https://snapcraft.io/docs/the-snap-format) and [Appimages](https://github.com/AppImage/AppImageSpec/blob/master/draft.md#type-2-image-format) are both SquashFS images, with an executable prepended for AppImages. - - **Zip** as provided by [zipfile](https://docs.python.org/3/library/zipfile.html), which is distributed with Python itself. See also the [ZIP File Format Specification](https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT). - - **Many Others** as provided by [libarchive](https://github.com/libarchive/libarchive) via [python-libarchive-c](https://github.com/Changaco/python-libarchive-c). - - Formats with tests: - [7z](https://github.com/ip7z/7zip/blob/main/DOC/7zFormat.txt), - ar, - [cab](https://download.microsoft.com/download/4/d/a/4da14f27-b4ef-4170-a6e6-5b1ef85b1baa/[ms-cab].pdf), - compress, cpio, - [iso](http://www.brankin.com/main/technotes/Notes_ISO9660.htm), - [lrzip](https://github.com/ckolivas/lrzip), - [lzma](https://www.7-zip.org/a/lzma-specification.7z), - [lz4](https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md), - [lzip](https://www.ietf.org/archive/id/draft-diaz-lzip-09.txt), - lzo, - [warc](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.0/), - xar. 
- - Untested formats that might work or not: deb, grzip, - [rpm](https://refspecs.linuxbase.org/LSB_4.1.0/LSB-Core-generic/LSB-Core-generic/pkgformat.html), - [uuencoding](https://en.wikipedia.org/wiki/Uuencoding). - - Beware that libarchive has no performant random access to files and to file contents. - In order to seek or open a file, in general, it needs to be assumed that the archive has to be parsed from the beginning. - If you have a performance-critical use case for a format only supported via libarchive, - then please open a feature request for a faster customized archive format implementation. - The hope would be to add suitable stream compressors such as "short"-distance LZ-based compressions to [rapidgzip](https://github.com/mxmlnkn/rapidgzip). - +A complete list of supported formats can be found [here](#supported-formats). # Examples @@ -79,6 +49,11 @@ And in contrast to [tarindexer](https://github.com/devsnd/tarindexer), which als - `ratarmount folder1 folder2 mountpoint` to bind-mount a merged view of two (or more) folders under `mountpoint`. - `ratarmount folder archive.zip folder` to mount a merged view of a folder on top of archive contents. - `ratarmount -o modules=subdir,subdir=squashfs-root archive.squashfs mountpoint` to mount an archive subfolder `squashfs-root` under `mountpoint`. + - `ratarmount http://server.org:80/archive.rar folder folder` Mount an archive that is accessible via HTTP range requests. + - `ratarmount ssh://hostname:22/relativefolder/ mountpoint` Mount a folder hierarchy via SSH. + - `ratarmount ssh://hostname:22//tmp/tmp-abcdef/ mountpoint` + - `ratarmount github://mxmlnkn:ratarmount@v0.15.2/tests/ mountpoint` Mount a GitHub repo as if it was checked out at the given tag or SHA or branch. 
+ - `AWS_ACCESS_KEY_ID=01234567890123456789 AWS_SECRET_ACCESS_KEY=0123456789012345678901234567890123456789 ratarmount s3://127.0.0.1/bucket/single-file.tar mounted` Mount an archive inside an S3 bucket reachable via a custom endpoint with the given credentials. Bogus credentials may be necessary for unsecured endpoints. # Table of Contents @@ -89,6 +64,9 @@ And in contrast to [tarindexer](https://github.com/devsnd/tarindexer), which als 1. [Arch Linux](#arch-linux) 3. [System Dependencies for PIP Installation (Rarely Necessary)](#system-dependencies-for-pip-installation-rarely-necessary) 4. [PIP Package Installation](#pip-package-installation) +2. [Supported Formats](#supported-formats) + 1. [TAR compressions supported for random access](#tar-compressions-supported-for-random-access) + 2. [Other supported archive formats](#other-supported-archive-formats) 2. [Benchmarks](#benchmarks) 3. [The Problem](#the-problem) 4. [The Solution](#the-solution) @@ -99,7 +77,9 @@ And in contrast to [tarindexer](https://github.com/devsnd/tarindexer), which als 4. [File versions](#file-versions) 5. [Compressed non-TAR files](#compressed-non-tar-files) 6. [Xz and Zst Files](#xz-and-zst-files) - 7. [As a Library](#as-a-library) + 7. [Remote Files](#remote-files) + 8. [Writable Mounting](#writable-mounting) + 9. [As a Library](#as-a-library) # Installation @@ -132,6 +112,9 @@ chmod u+x -- "$appImageName" sudo cp -- "$appImageName" /usr/local/bin/ratarmount # Example installation ``` +
+Other Installation Methods + ## Installation via Package Manager [![Packaging status](https://repology.org/badge/vertical-allrepos/ratarmount.svg)](https://repology.org/project/ratarmount/versions) @@ -199,6 +182,45 @@ If there are troubles with the compression backend dependencies, you can try the Ratarmount will work without the compression backends. The hard requirements are `fusepy` and for Python versions older than 3.7.0 `dataclasses`. +
+ +# Supported Formats + +## TAR compressions supported for random access + + - **BZip2** as provided by [indexed_bzip2](https://github.com/mxmlnkn/indexed_bzip2) as a backend, which is a refactored and extended version of [bzcat](https://github.com/landley/toybox/blob/c77b66455762f42bb824c1aa8cc60e7f4d44bdab/toys/other/bzcat.c) from [toybox](https://landley.net/code/toybox/). See also the [reverse engineered specification](https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf). + - **Gzip** and **Zlib** as provided by [rapidgzip](https://github.com/mxmlnkn/rapidgzip) or [indexed_gzip](https://github.com/pauldmccarthy/indexed_gzip) by Paul McCarthy. See also [RFC1952](https://tools.ietf.org/html/rfc1952) and [RFC1950](https://tools.ietf.org/html/rfc1950). + - **Xz** as provided by [python-xz](https://github.com/Rogdham/python-xz) by Rogdham or [lzmaffi](https://github.com/r3m0t/backports.lzma) by Tomer Chachamu. See also [The .xz File Format](https://tukaani.org/xz/xz-file-format.txt). + - **Zstd** as provided by [indexed_zstd](https://github.com/martinellimarco/indexed_zstd) by Marco Martinelli. See also [Zstandard Compression Format](https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md). + +## Other supported archive formats + + - **Rar** as provided by [rarfile](https://github.com/markokr/rarfile) by Marko Kreen. See also the [RAR 5.0 archive format](https://www.rarlab.com/technote.htm). + - **SquashFS, AppImage, Snap** as provided by [PySquashfsImage](https://github.com/matteomattei/PySquashfsImage) by Matteo Mattei. There seems to be no authoritative, open format specification, only [this nicely-done reverse-engineered description](https://dr-emann.github.io/squashfs/squashfs.html), I assume based on the [source code](https://github.com/plougher/squashfs-tools). 
Note that [Snaps](https://snapcraft.io/docs/the-snap-format) and [Appimages](https://github.com/AppImage/AppImageSpec/blob/master/draft.md#type-2-image-format) are both SquashFS images, with an executable prepended for AppImages. + - **Zip** as provided by [zipfile](https://docs.python.org/3/library/zipfile.html), which is distributed with Python itself. See also the [ZIP File Format Specification](https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT). + - **Many Others** as provided by [libarchive](https://github.com/libarchive/libarchive) via [python-libarchive-c](https://github.com/Changaco/python-libarchive-c). + - Formats with tests: + [7z](https://github.com/ip7z/7zip/blob/main/DOC/7zFormat.txt), + ar, + [cab](https://download.microsoft.com/download/4/d/a/4da14f27-b4ef-4170-a6e6-5b1ef85b1baa/[ms-cab].pdf), + compress, cpio, + [iso](http://www.brankin.com/main/technotes/Notes_ISO9660.htm), + [lrzip](https://github.com/ckolivas/lrzip), + [lzma](https://www.7-zip.org/a/lzma-specification.7z), + [lz4](https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md), + [lzip](https://www.ietf.org/archive/id/draft-diaz-lzip-09.txt), + lzo, + [warc](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.0/), + xar. + - Untested formats that might work or not: deb, grzip, + [rpm](https://refspecs.linuxbase.org/LSB_4.1.0/LSB-Core-generic/LSB-Core-generic/pkgformat.html), + [uuencoding](https://en.wikipedia.org/wiki/Uuencoding). + - Beware that libarchive has no performant random access to files and to file contents. + In order to seek or open a file, in general, it needs to be assumed that the archive has to be parsed from the beginning. + If you have a performance-critical use case for a format only supported via libarchive, + then please open a feature request for a faster customized archive format implementation. 
+ The hope would be to add suitable stream compressors such as "short"-distance LZ-based compressions to [rapidgzip](https://github.com/mxmlnkn/rapidgzip). + # Benchmarks @@ -503,6 +525,31 @@ lbzip2 -cd well-compressed-file.bz2 | createMultiFrameZstd $(( 4*1024*1024 )) > +# Remote Files + +The [fsspec](https://github.com/fsspec/filesystem_spec) API backend adds support for mounting many remote archives or folders: + + - `git://[path-to-repo:][ref@]path/to/file` + Uses the current path if no repository path is specified. + - `github://org:repo@[sha]/path-to/file-or-folder` + E.g. github://mxmlnkn:ratarmount@v0.15.2/tests/single-file.tar + - `http[s]://hostname[:port]/path-to/archive.rar` + - `s3://[endpoint-hostname[:port]]/bucket[/single-file.tar[?versionId=some_version_id]]` + Will default to AWS according to the Boto3 library defaults when no endpoint is specified. + Boto3 will check, among others, [these environment variables](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html), for credentials: + - `AWS_ACCESS_KEY_ID` + - `AWS_SECRET_ACCESS_KEY` + - `AWS_SESSION_TOKEN` + - `AWS_DEFAULT_REGION`, e.g., `us-west-1` + fsspec/s3fs furthermore supports these environment variables: + - [`FSSPEC_S3_ENDPOINT_URL`](https://github.com/fsspec/s3fs/pull/704), e.g., `http://127.0.0.1:8053` + - `[s]ftp://[user[:password]@]hostname[:port]/path-to/archive.rar` + - `ssh://[user[:password]@]hostname[:port]/path-to/archive.rar` + - `smb://[workgroup;][user:password@]server[:port]/share/folder/file.tar` + +Many other fsspec-based projects may also work when installed. + + # Writable Mounting The `--write-overlay ` option can be used to create a writable mount point. 
diff --git a/core/pyproject.toml b/core/pyproject.toml index 821bd38c..95a75113 100644 --- a/core/pyproject.toml +++ b/core/pyproject.toml @@ -72,9 +72,53 @@ full = [ # With Python 3.14, when building the wheel, I get: # /usr/bin/ld: cannot find /tmp/tmpcuw21d78/bin/isa-l.a: No such file or directory 'isal ~= 1.0; python_version < "3.14.0"', + # Pin to < 3.12 because of https://github.com/nathanhi/pyfatfs/issues/41 + 'pyfatfs ~= 1.0; python_version < "3.12.0"', + # fsspec: + "requests", + "aiohttp", + "sshfs", # For performance, asyncssh > 2.17 would be recommended: https://github.com/ronf/asyncssh/issues/691 + # Need newer pyopenssl than comes with Ubuntu 22.04. + # https://github.com/ronf/asyncssh/issues/690 + "pyopenssl>=23", + "smbprotocol", + # pygit2 1.15 introduced many breaking changes! + # https://github.com/libgit2/pygit2/issues/1316 + # https://github.com/fsspec/filesystem_spec/pull/1703 + # build error in Python 3.13 because it requires libgit2 1.8.1 and there are no wheels + "pygit2<1.15", + "fsspec", + "s3fs", + #"gcsfs", # untested + #"adlfs", # untested. build error in Python 3.13 + "dropboxdrivefs", ] bzip2 = ["rapidgzip >= 0.13.1"] gzip = ["indexed_gzip >= 1.6.3, < 2.0"] +fsspec = [ + # Copy-pasted from fsspec[full] list. Some were excluded because they are disproportionately large. + "requests", + "aiohttp", + "sshfs", # For performance, asyncssh > 2.17 would be recommended: https://github.com/ronf/asyncssh/issues/691 + # Need newer pyopenssl than comes with Ubuntu 22.04. + # https://github.com/ronf/asyncssh/issues/690 + "pyopenssl>=23", + "smbprotocol", # build error in Python 3.13 + # pygit2 1.15 introduced many breaking changes! + # https://github.com/libgit2/pygit2/issues/1316 + # https://github.com/fsspec/filesystem_spec/pull/1703 + # build error in Python 3.13 because it requires libgit2 1.8.1 and there are no wheels + "pygit2<1.15", + "fsspec", + "s3fs", + #"gcsfs", # untested + #"adlfs", # untested. 
build error in Python 3.13 + "dropboxdrivefs", + # "dask", "distributed" : ~34 MB, ~10 MB gzip-compressed + # "pyarrow >= 1" : ~196 MB, ~60 MB gzip-compressed, build error in Python 3.13 + # "ocifs" : ~350 MB + # "panel" : only for fsspec GUI +] # Need >= 4.1 because of https://github.com/markokr/rarfile/issues/73 rar = ["rarfile ~= 4.1"] # For now, only optional (and installed in the AppImage) because it is unstable and depends on many other packages diff --git a/core/ratarmountcore/FSSpecMountSource.py b/core/ratarmountcore/FSSpecMountSource.py new file mode 100644 index 00000000..a014ecd2 --- /dev/null +++ b/core/ratarmountcore/FSSpecMountSource.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import stat +import time +import urllib +from typing import Dict, IO, Iterable, Optional, Union + +from .MountSource import FileInfo, MountSource, createRootFileInfo +from .utils import overrides + +try: + import fsspec + import fsspec.core + import fsspec.implementations.http +except ImportError: + fsspec = None # type: ignore + + +class FSSpecMountSource(MountSource): + """ + Generic wrapper around fsspec-based filesystems. + At least as "generic" as it gets given that many details are inconsistent between the implementations. + Note also that many implementations are rather experimental, utterly slow, or unstable. + """ + + # TODO implement some of the most rudimentarily implemented filesystems myself instead of via fsspec. 
+ # wc -l 'fsspec/implementations/'*.py | sort -n + # 0 fsspec/implementations/__init__.py + # 58 fsspec/implementations/data.py + # 75 fsspec/implementations/cache_mapper.py + # 124 fsspec/implementations/jupyter.py + # 124 fsspec/implementations/tar.py -> SQLiteIndexedTar + # 127 fsspec/implementations/git.py -> TODO + # 152 fsspec/implementations/dask.py + # 176 fsspec/implementations/zip.py -> ZipMountSource + # 180 fsspec/implementations/sftp.py -> fsspec/sshfs + # 213 fsspec/implementations/libarchive.py -> LibarchiveMountSource + # 232 fsspec/implementations/cache_metadata.py + # 239 fsspec/implementations/github.py + # 303 fsspec/implementations/memory.py + # 304 fsspec/implementations/arrow.py + # 372 fsspec/implementations/dirfs.py -> FolderMountSource + chdir + # 395 fsspec/implementations/ftp.py + # 416 fsspec/implementations/smb.py + # 467 fsspec/implementations/dbfs.py + # 471 fsspec/implementations/local.py -> FolderMountSource + # 484 fsspec/implementations/webhdfs.py + # 872 fsspec/implementations/http.py + # 929 fsspec/implementations/cached.py + # 1173 fsspec/implementations/reference.py + # I guess git is the most obvious candidate because it is the most interesting and most barebone implementation. + + # pylint: disable=unused-argument + def __init__(self, urlOrOpenFile, **options) -> None: + """ + urlOrOpenFile : Take a URL or an already opened fsspec Filesystem object. + Note that this might take an AbstractFileSystem-derived object in the future. + """ + # Note that fsspec.implementations.ssh did not use ~/.ssh/config! + # That's one of the many reasons why fsspec/sshfs based on asyncssh instead of paramiko is used. 
+ assert isinstance(urlOrOpenFile, (str, fsspec.core.OpenFile)) + self.openFile: fsspec.core.OpenFile = ( + fsspec.open(urlOrOpenFile) if isinstance(urlOrOpenFile, str) else urlOrOpenFile + ) + self.fileSystem: fsspec.AbstractFileSystem = self.openFile.fs + self.rootFileInfo = createRootFileInfo(userdata=["/"]) + + # The fsspec filesystems are not uniform! http:// expects the arguments to isdir with prefixed + # protocol while other filesystem implementations are fine with only the path. + # https://github.com/ray-project/ray/issues/26423#issuecomment-1179561181 + self._isHTTP = isinstance(self.fileSystem, fsspec.implementations.http.HTTPFileSystem) + prefix = self.openFile.path + self.prefix = prefix.rstrip("/") if prefix.strip("/") and self.fileSystem.isdir(prefix) else "" + + def _getPath(self, path: str) -> str: + if self._isHTTP: + path = urllib.parse.quote(path) + if self.prefix: + if not path or path == "/": + return self.prefix + return self.prefix.rstrip("/") + "/" + path.lstrip("/") + return path + + @staticmethod + def _getMode(entry) -> int: + return 0o555 | (stat.S_IFDIR if entry.get('type', '') == 'directory' else stat.S_IFREG) + + @staticmethod + def _getModificationTime(entry) -> Union[int, float]: + # There is no standardized API for the modification time: + # https://github.com/fsspec/filesystem_spec/issues/1680#issuecomment-2368750882 + # + # sshfs.SSHF: 'mtime': datetime.datetime(2020, 3, 23, 20, 15, 34) + # fsspec.implementations.git.GitFileSystem: Nothing with listdir(details=True)! 
+ # fsspec.implementations.ftp.FTPFileSystem: 'modify': '20241004165129' + mtime = entry.get('mtime', None) + if mtime is not None: + return mtime.timestamp() if hasattr(mtime, 'timestamp') else mtime + modify = entry.get('modify', None) + if isinstance(modify, str): + return time.mktime(time.strptime(modify, "%Y%m%d%H%M%S")) + return 0 + + @staticmethod + def _convertToFileInfo(entry, path) -> FileInfo: + # TODO fsspec does not have an API to get symbolic link targets! + # They kinda work only like hardlinks. + # https://github.com/fsspec/filesystem_spec/issues/1679 + # https://github.com/fsspec/filesystem_spec/issues/1680 + return FileInfo( + # fmt: off + size = entry.get('size', 0), + mtime = FSSpecMountSource._getModificationTime(entry), + mode = FSSpecMountSource._getMode(entry), + linkname = "", + uid = os.getuid(), + gid = os.getgid(), + userdata = [path], + # fmt: on + ) + + @overrides(MountSource) + def isImmutable(self) -> bool: + return True + + @overrides(MountSource) + def exists(self, path: str) -> bool: + return self.fileSystem.lexists(self._getPath(path)) + + def _listDir(self, path: str, onlyMode: bool) -> Optional[Union[Iterable[str], Dict[str, FileInfo]]]: + path = self._getPath(path) + + result = self.fileSystem.listdir(path, detail=True) + if not result: + return [] + if isinstance(result[0], str): + return result + + # Examples for listdir return values: + # + # sshfs.SSHF: [ + # {'size': 8, 'type': 'link', 'gid': 0, 'uid': 0, 'time': datetime.datetime(2024, 10, 3, 19, 32, 42), + # 'mtime': datetime.datetime(2020, 3, 23, 20, 15, 34), 'permissions': 41471, 'name': '/sbin'}, + # {'size': 4096, 'type': 'directory', 'gid': 0, 'uid': 0, 'time': datetime.datetime(2024, 9, 25, 19, 45, 31), + # 'mtime': datetime.datetime(2023, 7, 22, 11, 32, 1), 'permissions': 16877, 'name': '/var'} + # {'size': 134217728, 'type': 'file', 'gid': 0, 'uid': 0, 'time': datetime.datetime(2024, 9, 25, 19, 45, 30), + # 'mtime': datetime.datetime(2021, 6, 16, 19, 26, 38), 
'permissions': 33188, 'name': '/swapfile'} + # -> "name" contains the absolute path to each file (also tested with subfolders)! + # fsspec.implementations.git.GitFileSystem: [ + # {'type': 'file', 'name': '.gitattributes', 'hex': '2a396079050e5847b7c995642ed07a7c8591bde9', + # 'mode': '100644', 'size': 363}, + # {'type': 'directory', 'name': '.github', 'hex': 'c8ab28a6ded46c96fa33a96a9d6d0b53dfe815de', + # 'mode': '40000', 'size': 0}, + # [{'type': 'directory', 'name': '.github/workflows', 'hex': 'b1b9b9b0d1ca1210f823195238e8fe71829fae42', + # 'mode': '40000', 'size': 0}] + # -> "name" is absolute path but without leading slash + # fsspec.implementations.ftp.FTPFileSystem: [ + # {'modify': '20241004165129', 'perm': 'el', 'size': 0, 'type': 'directory', + # 'unique': 'fd01ga9f7f6', 'name': '/.git'}, + # {'modify': '20240602192724', 'perm': 'r', 'size': 363, 'type': 'file', + # 'unique': 'fd01g2de4e2', 'name': '/.gitattributes'}, + # fsspec.implementations.http.HTTPFileSystem: [ + # {'name': 'http://127.0.0.1:8000/?S=D', 'size': None, 'type': 'file'}, + # {'name': 'http://127.0.0.1:8000/benchmarks/', 'size': None, 'type': 'directory'}, + # {'name': 'http://127.0.0.1:8000/benchmark-sshfs-block_size.py', 'size': None, 'type': 'file'}, + # -> For some reason, the name always has to include the full URL for the request and result. + # -> There are some HTTP server artifacts such as "?S=D", which are links for changing the sorting... 
+ prefixToStrip = path.lstrip('/') + result = { + ( + entry['name'].strip('/')[len(prefixToStrip) :].strip('/') + if entry['name'].strip('/').startswith(prefixToStrip) + else entry['name'] + ): ( + FSSpecMountSource._getMode(entry) + if onlyMode + else FSSpecMountSource._convertToFileInfo(entry, entry['name']) + ) + for entry in result + } + if self._isHTTP: + return { + urllib.parse.unquote(name): info for name, info in result.items() if not name.startswith(('?', '#')) + } + return result + + @overrides(MountSource) + def listDir(self, path: str) -> Optional[Union[Iterable[str], Dict[str, FileInfo]]]: + return self._listDir(path, onlyMode=False) + + @overrides(MountSource) + def listDirModeOnly(self, path: str) -> Optional[Union[Iterable[str], Dict[str, int]]]: + return self._listDir(path, onlyMode=True) + + def _getFileInfoHTTP(self, path: str) -> Optional[FileInfo]: + path = self._getPath(path) + + # Avoid aiohttp.client_exceptions.ClientResponseError: 404, message='Not Found' + if not self.fileSystem.lexists(path): + return None + + # fs.info will always return the given path to be file because it counts it as an HTML file ... + # isdir works somewhat better, but it downloads the whole file! + # https://github.com/fsspec/filesystem_spec/issues/1707 + # Therefore, only call it if the mimetype indicates an HTML file. + # In the future it might be best to call listdir on the parent path to detect whether it is a folder or file. 
+ info = self.fileSystem.info(path) + if info.get('mimetype', None) == 'text/html' and self.fileSystem.isdir(path): + return FSSpecMountSource._convertToFileInfo({'type': 'directory'}, path) + return FSSpecMountSource._convertToFileInfo(info, path) + + @overrides(MountSource) + def getFileInfo(self, path: str, fileVersion: int = 0) -> Optional[FileInfo]: + if self._isHTTP: + return self._getFileInfoHTTP(path) + + path = self._getPath(path) + if path == '/' or not path: + # We need to handle this specially because some filesystems, at least ssshfs.SSHFileSystem, + # do not support 'info' on '/' and will cause an exception: + # + # Traceback (most recent call last): + # sshfs/utils.py", line 27, in wrapper + # return await func(*args, **kwargs) + # sshfs/spec.py", line 145, in _info + # attributes = await channel.stat(path) + # asyncssh/sftp.py", line 4616, in stat + # return await self._handler.stat(path, flags, + # asyncssh/sftp.py", line 2713, in stat + # return cast(SFTPAttrs, await self._make_request( + # asyncssh/sftp.py", line 2468, in _make_request + # result = self._packet_handlers[resptype](self, resp) + # asyncssh/sftp.py", line 2484, in _process_status + # raise exc + # asyncssh.sftp.SFTPNoSuchFile: No such file + return self.rootFileInfo.clone() + + if not self.fileSystem.lexists(path): + return None + return FSSpecMountSource._convertToFileInfo(self.fileSystem.info(path), path) + + @overrides(MountSource) + def fileVersions(self, path: str) -> int: + return 1 + + @overrides(MountSource) + def open(self, fileInfo: FileInfo, buffering=-1) -> IO[bytes]: + path = fileInfo.userdata[-1] + assert isinstance(path, str) + return self.fileSystem.open(path, block_size=buffering if buffering >= 0 else None) + + @overrides(MountSource) + def __exit__(self, exception_type, exception_value, exception_traceback): + if hasattr(self.openFile, 'close'): + self.openFile.close() + + def __del__(self): + if hasattr(self.openFile, 'close'): + self.openFile.close() diff --git 
a/core/ratarmountcore/SQLiteIndex.py b/core/ratarmountcore/SQLiteIndex.py index c94ad1e4..40e7f3e3 100644 --- a/core/ratarmountcore/SQLiteIndex.py +++ b/core/ratarmountcore/SQLiteIndex.py @@ -183,6 +183,7 @@ def __init__( preferMemory: bool = False, indexMinimumFileCount: int = 0, backendName: str = '', + ignoreCurrentFolder: bool = False, ): """ indexFilePath @@ -206,6 +207,9 @@ def __init__( exceeded. It may also be written to a file if a gzip index is stored. backendName The backend name to be stored as metadata and to determine compatibility of found indexes. + ignoreCurrentFolder + If true, then do not store the index into the current path. This was introduced for URLs + opened as file objects but may be useful for any archive given via a file object. """ if not backendName: @@ -217,7 +221,7 @@ def __init__( self.indexFilePath: Optional[str] = None self.encoding = encoding self.possibleIndexFilePaths = SQLiteIndex.getPossibleIndexFilePaths( - indexFilePath, indexFolders, archiveFilePath + indexFilePath, indexFolders, archiveFilePath, ignoreCurrentFolder ) # stores which parent folders were last tried to add to database and therefore do exist self.parentFolderCache: List[Tuple[str, str]] = [] @@ -247,7 +251,10 @@ def __init__( @staticmethod def getPossibleIndexFilePaths( - indexFilePath: Optional[str], indexFolders: Optional[List[str]] = None, archiveFilePath: Optional[str] = None + indexFilePath: Optional[str], + indexFolders: Optional[List[str]] = None, + archiveFilePath: Optional[str] = None, + ignoreCurrentFolder: bool = False, ) -> List[str]: if indexFilePath: return [] if indexFilePath == ':memory:' else [os.path.abspath(os.path.expanduser(indexFilePath))] @@ -265,7 +272,7 @@ def getPossibleIndexFilePaths( if folder: indexPath = os.path.join(folder, indexPathAsName) possibleIndexFilePaths.append(os.path.abspath(os.path.expanduser(indexPath))) - else: + elif not ignoreCurrentFolder: possibleIndexFilePaths.append(defaultIndexFilePath) return 
possibleIndexFilePaths @@ -563,6 +570,9 @@ def reloadIndexReadOnly(self): self.sqlConnection = SQLiteIndex._openSqlDb(f"file:{uriPath}?mode=ro", uri=True, check_same_thread=False) def _reloadIndexOnDisk(self): + if self.printDebug >= 2: + print("[Info] Try to reopen SQLite database on disk at:", self.indexFilePath) + print("other index paths:", self.possibleIndexFilePaths) if not self.indexFilePath or self.indexFilePath != ':memory:' or not self.sqlConnection: return diff --git a/core/ratarmountcore/SQLiteIndexedTar.py b/core/ratarmountcore/SQLiteIndexedTar.py index 45bb2dc9..3a776893 100644 --- a/core/ratarmountcore/SQLiteIndexedTar.py +++ b/core/ratarmountcore/SQLiteIndexedTar.py @@ -704,6 +704,7 @@ def __init__( self.tarFileName = tarFileName else: raise RatarmountError("At least one of tarFileName and fileObject arguments should be set!") + self._fileNameIsURL = re.match('[A-Za-z0-9]*://', self.tarFileName) is not None # If no fileObject given, then self.tarFileName is the path to the archive to open. if not fileObject: @@ -771,16 +772,19 @@ def __init__( if indexFolders and isinstance(indexFolders, str): indexFolders = [indexFolders] + archiveFilePath = self.tarFileName if not self.isFileObject or self._fileNameIsURL else None + super().__init__( SQLiteIndex( indexFilePath, indexFolders=indexFolders, - archiveFilePath=None if self.isFileObject else self.tarFileName, + archiveFilePath=archiveFilePath, encoding=self.encoding, checkMetadata=self._checkMetadata, printDebug=self.printDebug, indexMinimumFileCount=indexMinimumFileCount, backendName='SQLiteIndexedTar', + ignoreCurrentFolder=self.isFileObject and self._fileNameIsURL, ), clearIndexCache=clearIndexCache, ) @@ -829,9 +833,9 @@ def __init__( # Open new database when we didn't find an existing one. if not self.index.indexIsLoaded(): - # Simply open in memory without an error even if writeIndex is True but when not indication - # for a index file location has been given. 
- if writeIndex and (indexFilePath or not self.isFileObject): + # Simply open in memory without an error even if writeIndex is True but when no indication + # for an index file location has been given. + if writeIndex and (indexFilePath or self._getArchivePath() or not self.isFileObject): self.index.openWritable() else: self.index.openInMemory() @@ -890,6 +894,9 @@ def __exit__(self, exception_type, exception_value, exception_traceback): if not self.isFileObject and self.rawFileObject: self.rawFileObject.close() + def _getArchivePath(self) -> Optional[str]: + return None if self.tarFileName == '' else self.tarFileName + def _storeMetadata(self) -> None: argumentsToSave = [ 'mountRecursively', @@ -902,6 +909,7 @@ def _storeMetadata(self) -> None: ] argumentsMetadata = json.dumps({argument: getattr(self, argument) for argument in argumentsToSave}) + # The second argument must be a path to a file to call os.stat with, not simply a file name. self.index.storeMetadata(argumentsMetadata, None if self.isFileObject else self.tarFileName) self.index.storeMetadataKeyValue('isGnuIncremental', '1' if self._isGnuIncremental else '0') diff --git a/core/ratarmountcore/compressions.py b/core/ratarmountcore/compressions.py index dbbfb44a..63aba74f 100644 --- a/core/ratarmountcore/compressions.py +++ b/core/ratarmountcore/compressions.py @@ -99,7 +99,12 @@ def checkZlibHeader(file): 'bz2': CompressionInfo( ['bz2', 'bzip2'], ['tb2', 'tbz', 'tbz2', 'tz2'], - [CompressionModuleInfo('rapidgzip', lambda x, parallelization=0: rapidgzip.IndexedBzip2File(x, parallelization=parallelization))], # type: ignore + [ + CompressionModuleInfo( + 'rapidgzip', + (lambda x, parallelization=0: rapidgzip.IndexedBzip2File(x, parallelization=parallelization)), + ) + ], lambda x: (x.read(4)[:3] == b'BZh' and x.read(6) == (0x314159265359).to_bytes(6, 'big')), ), 'gz': CompressionInfo( @@ -532,9 +537,21 @@ def detectCompression( ) -> Optional[str]: # isinstance(fileobj, io.IOBase) does not work for 
everything, e.g., for paramiko.sftp_file.SFTPFile # because it does not inherit from io.IOBase. Therefore, do duck-typing and test for required methods. - if any(not hasattr(fileobj, method) for method in ['seekable', 'seek', 'read', 'tell']) or not fileobj.seekable(): + expectedMethods = ['seekable', 'seek', 'read', 'tell'] + isFileObject = any(not hasattr(fileobj, method) for method in expectedMethods) + if isFileObject or not fileobj.seekable(): + if printDebug >= 2: + seekable = fileobj.seekable() if isFileObject else None + print( + f"[Warning] Cannot detect compression for given Python object {fileobj} " + f"because it does not look like a file object or is not seekable ({seekable})." + ) if printDebug >= 3: - print("[Warning] Cannot detect compression for give Python object that does not look like a file object.") + print(dir(fileobj)) + for name in ['readable', 'seekable', 'writable', 'closed', 'tell']: + method = getattr(fileobj, name, None) + if method is not None: + print(f" fileobj.{name}:", method() if callable(method) else method) traceback.print_exc() return None diff --git a/core/ratarmountcore/factory.py b/core/ratarmountcore/factory.py index 81d65d97..f72cbb66 100644 --- a/core/ratarmountcore/factory.py +++ b/core/ratarmountcore/factory.py @@ -1,8 +1,13 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- +# pylint: disable=no-member,abstract-method +# Disable pylint errors. 
See https://github.com/fsspec/filesystem_spec/issues/1678 + import os +import sys import traceback +import warnings from typing import IO, Optional, Union @@ -10,6 +15,7 @@ from .utils import CompressionError, RatarmountError from .MountSource import MountSource from .FolderMountSource import FolderMountSource +from .FSSpecMountSource import FSSpecMountSource from .RarMountSource import RarMountSource from .SingleFileMountSource import SingleFileMountSource from .SQLiteIndexedTar import SQLiteIndexedTar @@ -18,6 +24,18 @@ from .ZipMountSource import ZipMountSource from .LibarchiveMountSource import LibarchiveMountSource +try: + import fsspec + import fsspec.utils + import fsspec.implementations.http +except ImportError: + fsspec = None # type: ignore + +try: + from sshfs import SSHFileSystem +except ImportError: + SSHFileSystem = None # type: ignore + def _openRarMountSource(fileOrPath: Union[str, IO[bytes]], **options) -> Optional[MountSource]: try: @@ -103,9 +121,116 @@ def _openPySquashfsImage(fileOrPath: Union[str, IO[bytes]], **options) -> Option } +class FixedSSHFileSystem(SSHFileSystem): + protocols = ["sftp", "ssh", "scp"] + cachable = False + + +def openFsspec(url, options, printDebug: int) -> Optional[Union[MountSource, IO[bytes], str]]: + splitURI = url.split('://', 1) + protocol = splitURI[0] if len(splitURI) > 1 else '' + if not protocol: + return None + + if protocol == 'file': + return splitURI[1] + + if not fsspec: + print("[Warning] An URL was detected but fsspec is not installed. You may want to install it with:") + print("[Warning] python3 -m pip install ratarmount[fsspec]") + return None + + result = None + try: + if printDebug >= 3: + print("[Info] Try to open with fsspec") + + # Suppress warning about (default!) encoding not being supported for Python<3.9 -.-. 
+ if sys.version_info < (3, 9) and protocol == 'ftp': + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + openFile = fsspec.open(url) + elif protocol in FixedSSHFileSystem.protocols: + fs = FixedSSHFileSystem(**FixedSSHFileSystem._get_kwargs_from_urls(url)) # pytype: disable=attribute-error + + # Remove one leading / in order to add support for relative paths. E.g.: + # ssh://127.0.0.1/relative/path + # ssh://127.0.0.1//home/user/relative/path + path = fsspec.utils.infer_storage_options(url)['path'] + if path.startswith("/"): + path = path[1:] + if not path: + path = "." + openFile = fsspec.core.OpenFile(fs, path) + else: + openFile = fsspec.open(url) + assert isinstance(openFile, fsspec.core.OpenFile) + + if printDebug >= 3: + print("[Info] Opened file fsspec:", openFile, "filesystem:", openFile.fs) + + # Note that http:// URLs are always files. Folders are only regex-parsed HTML files! + # By checking with isdir instead of isfile, we give isdir a higher precedence. + # Also note that isdir downloads the whole file! + # https://github.com/fsspec/filesystem_spec/issues/1707 + if isinstance(openFile.fs, fsspec.implementations.http.HTTPFileSystem): + info = openFile.fs.info(openFile.path) + if info.get('mimetype', None) == 'text/html' and openFile.fs.isdir(openFile.path): + return FSSpecMountSource(openFile) + elif openFile.fs.isdir(openFile.path): + return FSSpecMountSource(openFile) + + # This open call can fail with FileNotFoundError, IsADirectoryError, and probably others. + result = openFile.open() # pylint: disable=no-member + + # Avoid resource leaks, e.g., when the seek check fails. + oldDel = getattr(result, '__del__', None) + + def newDel(): + if callable(oldDel): + oldDel() + result.close() + + result.__del__ = newDel + + # Check that seeking works. May fail when, e.g., the HTTP server does not support range requests. 
+ # Use https://github.com/danvk/RangeHTTPServer for testing purposes because + # "python3 -m http.server 9000" does not have range support. Use "python3 -m RangeHTTPServer 9000". + result.seek(1) + result.read(1) + result.seek(0) + + # Add tarFileName argument so that mounting a TAR file via SSH can create a properly named index + # file inside ~/.cache/ratarmount. + if 'tarFileName' not in options: + options['tarFileName'] = url + + # Note that asyncssh SSHFile does/did not implement seekable correctly! + # https://github.com/fsspec/sshfs/pull/50 + if 'sshfs.file.SSHFile' in str(type(result)): + result.seekable = lambda: True # type:ignore + except Exception as exception: + if result and hasattr(result, 'close'): + result.close() + if printDebug >= 1: + print("[Warning] Trying to open with fsspec raised an exception:", exception) + if printDebug >= 3: + traceback.print_exc() + return result + + def openMountSource(fileOrPath: Union[str, IO[bytes]], **options) -> MountSource: printDebug = int(options.get("printDebug", 0)) if isinstance(options.get("printDebug", 0), int) else 0 + if isinstance(fileOrPath, str): + result = openFsspec(fileOrPath, options, printDebug=printDebug) + if isinstance(result, MountSource): + return result + if isinstance(result, str) or result is not None: + fileOrPath = result + if not isinstance(fileOrPath, str) and printDebug >= 3: + print("[Info] Opened remote file with fsspec.") + joinedFileName = '' if isinstance(fileOrPath, str): if not os.path.exists(fileOrPath): diff --git a/pyproject.toml b/pyproject.toml index 86559937..aec31e89 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,7 @@ xz = ["ratarmountcore[xz]"] zip = ["ratarmountcore[zip]"] zstd = ["ratarmountcore[zstd]"] squashfs = ["ratarmountcore[squashfs]"] +fsspec = ["ratarmountcore[fsspec]"] [project.scripts] ratarmount = "ratarmount:cli" diff --git a/ratarmount.py b/ratarmount.py index 3a37541a..b6946e2c 100755 --- a/ratarmount.py +++ b/ratarmount.py @@ -52,6 
+52,12 @@ except ImportError: pass +try: + import fsspec +except ImportError: + fsspec = None # type: ignore + + import ratarmountcore as core from ratarmountcore import ( AutoMountLayer, @@ -536,7 +542,7 @@ def __init__(self, pathToMount: Union[str, List[str]], mountPoint: str, foregrou pass hadPathsToMount = bool(pathToMount) - pathToMount = list(filter(os.path.exists, pathToMount)) + pathToMount = list(filter(lambda x: os.path.exists(x) or '://' in x, pathToMount)) if hadPathsToMount and not pathToMount: raise ValueError("No paths to mount left over after filtering!") @@ -931,6 +937,18 @@ def checkInputFileType( ) -> Tuple[str, Optional[str]]: """Raises an exception if it is not an accepted archive format else returns the real path and compression type.""" + splitURI = tarFile.split('://') + if len(splitURI) > 1: + protocol = splitURI[0] + if fsspec is None: + raise argparse.ArgumentTypeError("Detected an URI, but fsspec was not found. Try: pip install fsspec.") + if protocol not in fsspec.available_protocols(): + raise argparse.ArgumentTypeError( + f"URI: {tarFile} uses an unknown protocol. 
Protocols known by fsspec are: " + + ', '.join(fsspec.available_protocols()) + ) + return tarFile, None + if not os.path.isfile(tarFile): raise argparse.ArgumentTypeError(f"File '{tarFile}' is not a file!") tarFile = os.path.realpath(tarFile) @@ -1140,6 +1158,12 @@ def _parseArgs(rawArgs: Optional[List[str]] = None): - ratarmount folder1 folder2 mountpoint - ratarmount folder archive.zip folder - ratarmount -o modules=subdir,subdir=squashfs-root archive.squashfs mountpoint + - ratarmount http://server.org:80/archive.rar folder folder + - ratarmount ssh://hostname:22/relativefolder/ mountpoint + - ratarmount ssh://hostname:22//tmp/tmp-abcdef/ mountpoint + - ratarmount github://mxmlnkn:ratarmount@v0.15.2/tests/single-file.tar mountpoint + - AWS_ACCESS_KEY_ID=aaaaaaaaaaaaaaaaaaaa AWS_SECRET_ACCESS_KEY=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb \\ + ratarmount s3://127.0.0.1/bucket/single-file.tar mounted For further information, see the ReadMe on the project's homepage: @@ -1436,8 +1460,9 @@ def _parseArgs(rawArgs: Optional[List[str]] = None): # This is a hack but because we have two positional arguments (and want that reflected in the auto-generated help), # all positional arguments, including the mountpath will be parsed into the tar file path's namespace and we have to # manually separate them depending on the type. 
- if os.path.isdir(args.mount_source[-1]) or not os.path.exists(args.mount_source[-1]): - args.mount_point = args.mount_source[-1] + lastArgument = args.mount_source[-1] + if '://' not in lastArgument and (os.path.isdir(lastArgument) or not os.path.exists(lastArgument)): + args.mount_point = lastArgument args.mount_source = args.mount_source[:-1] if not args.mount_source and not args.write_overlay: raise argparse.ArgumentTypeError( @@ -1491,6 +1516,8 @@ def checkMountSource(path): args.mount_point = os.path.splitext(args.mount_source[0])[0] else: args.mount_point = autoMountPoint + if '://' in args.mount_point: + args.mount_point = "ratarmount.mounted" args.mount_point = os.path.abspath(args.mount_point) # Preprocess the --index-folders list as a string argument @@ -1830,7 +1857,7 @@ def main(): try: cli(args) - except (FileNotFoundError, RatarmountError, argparse.ArgumentTypeError) as exception: + except (FileNotFoundError, RatarmountError, argparse.ArgumentTypeError, ValueError) as exception: print("[Error]", exception) if debug >= 3: traceback.print_exc() diff --git a/tests/.pylintrc b/tests/.pylintrc index 4cb7bdd8..177eaf53 100644 --- a/tests/.pylintrc +++ b/tests/.pylintrc @@ -4,7 +4,8 @@ init-hook='import sys; sys.path.append("./core")' # be loaded. Extensions are loading into the active Python interpreter and may # run arbitrary code. extension-pkg-whitelist=indexed_gzip,indexed_bzip2,indexed_zstd,libarchive,libarchive.ffi,lzmaffi,rapidgzip,isal, - PySquashfsImage,PySquashfsImage.compressor,zstandard,lz4,deflate,pyminizip,fast_zip_decryption + PySquashfsImage,PySquashfsImage.compressor,zstandard,lz4,deflate,pyminizip,fast_zip_decryption, + asyncssh,sshfs,fsspec # Specify a score threshold to be exceeded before program exits with error. 
fail-under=10.0 @@ -69,6 +70,7 @@ disable=invalid-name, too-many-instance-attributes, too-many-locals, too-many-lines, + too-many-positional-arguments, unnecessary-lambda, # I don't need the style checker to bother me with missing docstrings and todos. missing-class-docstring, diff --git a/tests/ratarmount-help.txt b/tests/ratarmount-help.txt index f9738a53..6b75f319 100644 --- a/tests/ratarmount-help.txt +++ b/tests/ratarmount-help.txt @@ -205,6 +205,12 @@ Examples: - ratarmount folder1 folder2 mountpoint - ratarmount folder archive.zip folder - ratarmount -o modules=subdir,subdir=squashfs-root archive.squashfs mountpoint + - ratarmount http://server.org:80/archive.rar folder folder + - ratarmount ssh://hostname:22/relativefolder/ mountpoint + - ratarmount ssh://hostname:22//tmp/tmp-abcdef/ mountpoint + - ratarmount github://mxmlnkn:ratarmount@v0.15.2/tests/single-file.tar mountpoint + - AWS_ACCESS_KEY_ID=aaaaaaaaaaaaaaaaaaaa AWS_SECRET_ACCESS_KEY=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb \ + ratarmount s3://127.0.0.1/bucket/single-file.tar mounted For further information, see the ReadMe on the project's homepage: diff --git a/tests/requirements-tests.txt b/tests/requirements-tests.txt index efa4ecae..5149275d 100644 --- a/tests/requirements-tests.txt +++ b/tests/requirements-tests.txt @@ -1,19 +1,35 @@ +# Code checking black -build codespell -fusepy flake8 -indexed_bzip2 -indexed_gzip -indexed_zstd mypy -pandas pylint -pyminizip pytest>=8 # pytest 7 did not work with pytest-xdist 3.5.0 pytest-xdist +# Depends on msgspec, which has no wheels for Python 3.13 and results in build errors. 
+# msgspec/_core.c:11254:15: error: too few arguments to function ‘_PyLong_AsByteArray’ +pytype; python_version < '3.13' + +# Dependencies +fusepy +indexed_bzip2 +indexed_gzip +indexed_zstd python-xz -pytype +zstandard + +# Build tools +build twine wheel -zstandard + +# Plotting +pandas + +# Tools to set up tests, e.g., servers +impacket +pyftpdlib +pyminizip +pyopenssl>=23 +rangehttpserver +boto3 diff --git a/tests/runtests.sh b/tests/runtests.sh index 4132befe..ffdec76d 100755 --- a/tests/runtests.sh +++ b/tests/runtests.sh @@ -180,6 +180,17 @@ funmount() done } + +waitForMountpoint() +{ + for (( i=0; i<10; ++i )); do + if mountpoint -q -- "$1"; then break; fi + sleep 1s + done + if ! mountpoint -q -- "$1"; then return 1; fi +} + + returnError() { local lineNumber message @@ -240,21 +251,26 @@ checkFileInTAR() mountFolder="$( mktemp -d )" || returnError "$LINENO" 'Failed to create temporary directory' MOUNT_POINTS_TO_CLEANUP+=( "$mountFolder" ) + local args=() + if [[ "$archive" != *"://"* ]]; then args+=( '--recursive' ); fi + # try with index recreation - local args=( -P "$parallelization" -c --detect-gnu-incremental --ignore-zeros --recursive "$archive" "$mountFolder" ) + args+=( -P "$parallelization" -c --detect-gnu-incremental --ignore-zeros "$archive" "$mountFolder" ) { runAndCheckRatarmount "${args[@]}" && checkStat "$mountFolder/$fileInTar" && verifyCheckSum "$mountFolder" "$fileInTar" "$archive" "$correctChecksum" } || returnError "$LINENO" "$RATARMOUNT_CMD ${args[*]}" funmount "$mountFolder" - if [[ "$archive" =~ .tar ]]; then + if [[ "$archive" =~ [.]tar ]]; then 'grep' -q 'Creating offset dictionary' ratarmount.stdout.log ratarmount.stderr.log || returnError "$LINENO" "Looks like index was not created while executing: $RATARMOUNT_CMD ${args[*]}" fi # retry without forcing index recreation - local args=( -P "$parallelization" --detect-gnu-incremental --ignore-zeros --recursive "$archive" "$mountFolder" ) + args=() + if [[ "$archive" != *"://"* ]]; then 
args+=( '--recursive' ); fi + args+=( -P "$parallelization" --detect-gnu-incremental --ignore-zeros "$archive" "$mountFolder" ) { runAndCheckRatarmount "${args[@]}" && checkStat "$mountFolder/$fileInTar" && @@ -264,7 +280,7 @@ checkFileInTAR() # The libarchive backend does not create indexes for now because it doesn't help the poor performance much and # introduces complexity with index compatibility to other backends. - if [[ "$archive" =~ .tar && ! "$archive" =~ .7z$ ]]; then + if [[ "$archive" =~ [.]tar && ! "$archive" =~ [.]7z$ ]]; then 'grep' -q 'Successfully loaded offset dictionary' ratarmount.stdout.log ratarmount.stderr.log || returnError "$LINENO" "Looks like index was not loaded for '$archive' while executing: $RATARMOUNT_CMD ${args[*]}" fi @@ -443,7 +459,7 @@ testLargeTar() memoryUsage "$ratarmountPid" "$timeSeriesFile" & local memoryUsagePid="$!" - while ! mountpoint -- "$mountFolder"; do sleep 1s; done + waitForMountpoint "$mountFolder" || returnError 'Waiting for mountpoint timed out!' $RATARMOUNT_CMD -u "$mountFolder" wait "$memoryUsagePid" wait "$ratarmountPid" @@ -457,7 +473,7 @@ testLargeTar() memoryUsage "$ratarmountPid" "$timeSeriesFile" & local memoryUsagePid="$!" - while ! mountpoint -- "$mountFolder"; do sleep 1s; done + waitForMountpoint "$mountFolder" || returnError 'Waiting for mountpoint timed out!' 
$RATARMOUNT_CMD -u "$mountFolder" wait "$memoryUsagePid" wait "$ratarmountPid" @@ -1778,6 +1794,468 @@ checkStatfsWriteOverlay() } +checkURLProtocolFile() +{ + checkFileInTAR 'file://tests/single-file.tar' bar d3b07384d113edec49eaa6238ad5ff00 || + returnError "$LINENO" 'Failed to read via file:// protocol' + echo checkFileInTAR 'file://tests/' single-file.tar 1a28538854d1884e4415cb9bfb7a2ad8 + checkFileInTAR 'file://tests/' single-file.tar 1a28538854d1884e4415cb9bfb7a2ad8 || + returnError "$LINENO" 'Failed to read via file:// protocol' + checkFileInTAR 'file://tests' single-file.tar 1a28538854d1884e4415cb9bfb7a2ad8 || + returnError "$LINENO" 'Failed to read via file:// protocol' +} + + +checkFileInTARForeground() +{ + # Similar to checkFileInTAR but calls ratarmount with -f as is necessary for some threaded fsspec backends. + # TODO make those fsspec backends work without -f, e.g., by only mounting them in FuseMount.init, maybe + # trying to open in __init__ and close them at the end of __init__ and reopen them in init for better + # error reporting, or even better, somehow find out how to close only those threads and restart them + # in FuseMount.init. + local archive="$1"; shift + local fileInTar="$1"; shift + local correctChecksum="$1" + + local startTime + startTime=$( date +%s ) + + rm -f ratarmount.{stdout,stderr}.log + + local mountFolder + mountFolder="$( mktemp -d )" || returnError "$LINENO" 'Failed to create temporary directory' + MOUNT_POINTS_TO_CLEANUP+=( "$mountFolder" ) + + $RATARMOUNT_CMD -c -f -d 3 "$archive" "$mountFolder" >ratarmount.stdout.log 2>ratarmount.stderr.log & + waitForMountpoint "$mountFolder" || returnError 'Waiting for mountpoint timed out!' + ! 
'grep' -C 5 -Ei '(warn|error)' ratarmount.stdout.log ratarmount.stderr.log || + returnError "$LINENO" "Found warnings while executing: $RATARMOUNT_CMD $*" + + echo "Check access to $archive" + verifyCheckSum "$mountFolder" "$fileInTar" "$archive" "$correctChecksum" || returnError "$LINENO" 'Checksum mismatches!' + funmount "$mountFolder" + + safeRmdir "$mountFolder" +} + + +checkURLProtocolHTTP() +{ + local pid mountPoint protocol port + mountPoint=$( mktemp -d ) + protocol='http' + port=8000 + + # Failed alternatives to set up a test HTTP server: + # python3 -m http.server -b 127.0.0.1 8000 & # Does not support range requests + # python3 -m RangeHTTPServer -b 127.0.0.1 8000 & # Client has spurious errors every 5th test or so with this. + # TODO Debug this... Bug could be in fsspec/implementations/http.py, aiohttp, RangeHTTPServer, ... + # sudo apt install busybox-static + # busybox httpd -f -p 8000 & # Does not support range requests. + # sudo apt install ruby-webrick + if ! command -v ruby &>/dev/null; then + echo "Ruby not found. Please install ruby-webrick." + return 0 + fi + ruby -run -e httpd --version || returnError "$LINENO" 'Failed to start up ruby HTTP test server!' + ruby -run -e httpd . --port $port --bind-address=127.0.0.1 1>'httpd-ruby-webrick.log' 2>&1 & + pid=$! 
+ sleep 5 + wget 127.0.0.1:$port + + + checkFileInTARForeground "$protocol://127.0.0.1:$port/tests/single-file.tar" 'bar' d3b07384d113edec49eaa6238ad5ff00 || + returnError "$LINENO" 'Failed to read from HTTP server' + checkFileInTARForeground "$protocol://127.0.0.1:$port/tests/" 'single-file.tar' 1a28538854d1884e4415cb9bfb7a2ad8 || + returnError "$LINENO" 'Failed to read from HTTP server' + checkFileInTARForeground "$protocol://127.0.0.1:$port/tests" 'single-file.tar' 1a28538854d1884e4415cb9bfb7a2ad8 || + returnError "$LINENO" 'Failed to read from HTTP server' + + kill $pid &>/dev/null + rmdir "$mountPoint" +} + + +checkURLProtocolFTP() +{ + local pid user password + # python3 -m pip install pyftpdlib pyopenssl>=23 + user='pqvFUMqbqp' + password='ioweb123GUIweb' + port=8021 + echo "Starting FTP server..." + python3 -m pyftpdlib --user="$user" --password="$password" --port "$port" --interface 127.0.0.1 2>/dev/null & + pid=$! + sleep 1 + + checkFileInTAR "ftp://$user:$password@127.0.0.1:8021/tests/single-file.tar" bar d3b07384d113edec49eaa6238ad5ff00 || + returnError "$LINENO" 'Failed to read from FTP server' + checkFileInTAR "ftp://$user:$password@127.0.0.1:8021/tests/" single-file.tar 1a28538854d1884e4415cb9bfb7a2ad8 || + returnError "$LINENO" 'Failed to read from FTP server' + checkFileInTAR "ftp://$user:$password@127.0.0.1:8021/tests" single-file.tar 1a28538854d1884e4415cb9bfb7a2ad8 || + returnError "$LINENO" 'Failed to read from FTP server' + + kill $pid +} + + +killRogueSSH() +{ + local pid + for pid in $( pgrep -f start-asyncssh-server ) $( pgrep -f ssh:// ); do + kill "$pid" + sleep 1 + kill -9 "$pid" + done + sleep 1 +} + + +checkURLProtocolSSHErrorOnPython314() +{ + cat </dev/null +Traceback (most recent call last): + File ".../python3.14/site-packages/ratarmountcore/factory.py", line 180, in openFsspec + elif openFile.fs.isdir(openFile.path): + ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^ + File ".../python3.14/site-packages/fsspec/asyn.py", line 118, in wrapper + 
return sync(self.loop, func, *args, **kwargs) + File ".../python3.14/site-packages/fsspec/asyn.py", line 103, in sync + raise return_result + File ".../python3.14/site-packages/fsspec/asyn.py", line 56, in _runner + result[0] = await coro + ^^^^^^^^^^ + File ".../python3.14/site-packages/fsspec/asyn.py", line 677, in _isdir + return (await self._info(path))["type"] == "directory" + ^^^^^^^^^^^^^^^^^^^^^^ + File ".../python3.14/site-packages/sshfs/utils.py", line 27, in wrapper + return await func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File ".../python3.14/site-packages/sshfs/spec.py", line 142, in _info + attributes = await channel.stat(path) + ^^^^^^^^^^^^^^^^^^^^^^^^ + File ".../python3.14/site-packages/asyncssh/sftp.py", line 4616, in stat + return await self._handler.stat(path, flags, + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + follow_symlinks=follow_symlinks) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File ".../python3.14/site-packages/asyncssh/sftp.py", line 2713, in stat + return cast(SFTPAttrs, await self._make_request( + ^^^^^^^^^^^^^^^^^^^^^^^^^ + FXP_STAT, String(path), flag_bytes)) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File ".../python3.14/site-packages/asyncssh/sftp.py", line 2468, in _make_request + result = self._packet_handlers[resptype](self, resp) + File ".../python3.14/site-packages/asyncssh/sftp.py", line 2484, in _process_status + raise exc +asyncssh.sftp.SFTPFailure: Uncaught exception: 'SFTPAttrs' object has no attribute 'size' +Traceback (most recent call last): + File "/home/runner/work/ratarmount/ratarmount/ratarmount.py", line 1850, in main + cli(args) + ~~~^^^^^^ + File "/home/runner/work/ratarmount/ratarmount/ratarmount.py", line 1794, in cli + with FuseMount( + ~~~~~~~~~^ + # fmt: off + ^^^^^^^^^^ + ...<27 lines>... 
+        # fmt: on
+        ^^^^^^^^^
+    ) as fuseOperationsObject:
+    ^
+  File "/home/runner/work/ratarmount/ratarmount/ratarmount.py", line 570, in __init__
+    mountSources.append((os.path.basename(path), openMountSource(path, **options)))
+                                                 ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
+  File ".../python3.14/site-packages/ratarmountcore/factory.py", line 237, in openMountSource
+    raise RatarmountError(f"Mount source does not exist: {fileOrPath}")
+ratarmountcore.utils.RatarmountError: Mount source does not exist: ssh://127.0.0.1:8022/tests/single-file.tar
+EOF
+}
+
+
+checkURLProtocolSSH()
+{
+    # Starts tests/start-asyncssh-server.py in the background and mounts a TAR file and its
+    # parent folder (with and without trailing slash) via ssh:// URLs.
+    local python3MinorVersion pid fingerprint publicKey port file
+
+    # Skipped on Python 3.14+, presumably because of the failure recorded in the log above. TODO confirm
+    python3MinorVersion=$( python3 -c 'import sys; print(sys.version_info.minor)' )
+    if [[ -n "$python3MinorVersion" && "$python3MinorVersion" -ge 14 ]]; then
+        return 0
+    fi
+
+    # Must match the port that tests/start-asyncssh-server.py listens on.
+    port=8022
+
+    # rm -f ssh_host_key; ssh-keygen -q -N "" -C "" -t ed25519 -f ssh_host_key
+    cat <<EOF > ssh_host_key
+-----BEGIN OPENSSH PRIVATE KEY-----
+b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW
+QyNTUxOQAAACA6luxe0F9n0zBbFW6DExxYAMz2tinaHPb9IwLmreJMzgAAAIhe3ftsXt37
+bAAAAAtzc2gtZWQyNTUxOQAAACA6luxe0F9n0zBbFW6DExxYAMz2tinaHPb9IwLmreJMzg
+AAAECRurZq3m4qFnBUpJG3+SwhdL410zFoUODgRIU4aLTbpjqW7F7QX2fTMFsVboMTHFgA
+zPa2Kdoc9v0jAuat4kzOAAAAAAECAwQF
+-----END OPENSSH PRIVATE KEY-----
+EOF
+
+    # The client key and known_hosts live in ~/.ssh, which may not exist yet (e.g., in a fresh CI home).
+    mkdir -p "$HOME/.ssh"
+
+    # The server reads ssh_user_ca when it starts, so the client key has to be registered beforehand.
+    [[ -f ~/.ssh/id_ed25519 ]] || ssh-keygen -q -N "" -t ed25519 -f ~/.ssh/id_ed25519
+    publicKey=$( cat ~/.ssh/id_ed25519.pub )
+    file='ssh_user_ca'
+    if [[ ! -f "$file" ]] || ! 'grep' -q -F "$publicKey" "$file"; then
+        echo "$publicKey" >> "$file"
+    fi
+
+    killRogueSSH
+    python3 tests/start-asyncssh-server.py &
+    pid=$!
+    echo "Started SSH server with process ID $pid"
+    sleep 2
+
+    # ssh-keyscan only returns the host key while the server is listening, so it must run after the start-up.
+    # ssh-keygen -lf only works on the server. Its output is also not hashed and not in known_hosts format.
+    #fingerprint=$( ssh-keygen -lf ssh_host_key )
+    fingerprint=$( ssh-keyscan -H -p "$port" 127.0.0.1 2>/dev/null )
+    file="$HOME/.ssh/known_hosts"
+    # An empty scan result must not be appended, and an empty grep pattern would match every line.
+    if [[ -n "$fingerprint" ]] && { [[ ! -f "$file" ]] || ! 'grep' -q -F "$fingerprint" "$file"; }; then
+        echo "$fingerprint" >> "$file"
+    fi
+
+    checkFileInTARForeground "ssh://127.0.0.1:$port/tests/single-file.tar" 'bar' d3b07384d113edec49eaa6238ad5ff00 ||
+        returnError "$LINENO" 'Failed to read from SSH server'
+    checkFileInTARForeground "ssh://127.0.0.1:$port/tests/" 'single-file.tar' 1a28538854d1884e4415cb9bfb7a2ad8 ||
+        returnError "$LINENO" 'Failed to read from SSH server'
+    checkFileInTARForeground "ssh://127.0.0.1:$port/tests" 'single-file.tar' 1a28538854d1884e4415cb9bfb7a2ad8 ||
+        returnError "$LINENO" 'Failed to read from SSH server'
+
+    kill "$pid"
+    killRogueSSH
+}
+
+
+checkURLProtocolGit()
+{
+    # Mounts a TAR file and its parent folder from the tag v0.15.2 of the local repository via git:// URLs.
+    #
+    # Pygit2 is missing wheels for Python 3.13
+    # The manual installation from source fails because of:
+    #     File "/opt/hostedtoolcache/Python/3.13.0-rc.3/x64/lib/python3.13/site-packages/pygit2/__init__.py",
+    #     line 32, in <module>
+    #       from ._pygit2 import *
+    #     ImportError: libgit2.so.1.7: cannot open shared object file: No such file or directory
+    # Even though the compilation was fine and the installation also looks fine:
+    #     Install the project...
+    #     -- Install configuration: "Debug"
+    #     -- Installing: /usr/local/lib/pkgconfig/libgit2.pc
+    #     -- Installing: /usr/local/lib/libgit2.so.1.7.2
+    #     -- Installing: /usr/local/lib/libgit2.so.1.7
+    #     -- Installing: /usr/local/lib/libgit2.so
+    #     -- Installing: /usr/local/include/git2
+    local python3MinorVersion
+    python3MinorVersion=$( python3 -c 'import sys; print(sys.version_info.minor)' )
+    if [[ -n "$python3MinorVersion" && "$python3MinorVersion" -ge 13 ]] && ! python3 -c 'import pygit2'; then
+        return 0
+    fi
+
+    # https://github.com/fsspec/filesystem_spec/blob/360e46d13069b0426565429f9f610bf704cfa062/
+    #   fsspec/implementations/git.py#L28C14-L28C58
+    # > "git://[path-to-repo[:]][ref@]path/to/file" (but the actual
+    # > file path should not contain "@" or ":").
+    checkFileInTAR 'git://v0.15.2@tests/single-file.tar' bar d3b07384d113edec49eaa6238ad5ff00 ||
+        returnError "$LINENO" 'Failed to read from git repository'
+    checkFileInTAR 'git://v0.15.2@tests/' single-file.tar 1a28538854d1884e4415cb9bfb7a2ad8 ||
+        returnError "$LINENO" 'Failed to read from git repository'
+    checkFileInTAR 'git://v0.15.2@tests' single-file.tar 1a28538854d1884e4415cb9bfb7a2ad8 ||
+        returnError "$LINENO" 'Failed to read from git repository'
+}
+
+
+checkURLProtocolGithub()
+{
+    # Mounts a TAR file and its parent folder from the tag v0.15.2 on GitHub via github:// URLs.
+    #
+    # Cannot do automated tests because of Github rate limit...
+    #   Trying to open with fsspec raised an exception: 403 Client Error: rate limit exceeded for url
+    if [[ -n "$CI" ]]; then return 0; fi
+
+    # https://github.com/fsspec/filesystem_spec/blob/360e46d13069b0426565429f9f610bf704cfa062/
+    #   fsspec/implementations/github.py#L26
+    # https://github.com/fsspec/filesystem_spec/blob/360e46d13069b0426565429f9f610bf704cfa062/
+    #   fsspec/implementations/github.py#L202
+    # https://github.com/fsspec/filesystem_spec/blob/360e46d13069b0426565429f9f610bf704cfa062/fsspec/utils.py#L37
+    #
+    # - "github://path/file", in which case you must specify org, repo and
+    #   may specify sha in the extra args
+    # - 'github://org:repo@/precip/catalog.yml', where the org and repo are
+    #   part of the URI
+    # - 'github://org:repo@sha/precip/catalog.yml', where the sha is also included
+    #
+    # ``sha`` can be the full or abbreviated hex of the commit you want to fetch
+    # from, or a branch or tag name (so long as it doesn't contain special characters
+    # like "/", "?", which would have to be HTTP-encoded).
+
+    checkFileInTAR 'github://mxmlnkn:ratarmount@v0.15.2/tests/single-file.tar' bar d3b07384d113edec49eaa6238ad5ff00 ||
+        returnError "$LINENO" 'Failed to read from GitHub'
+    checkFileInTAR 'github://mxmlnkn:ratarmount@v0.15.2/tests/' single-file.tar 1a28538854d1884e4415cb9bfb7a2ad8 ||
+        returnError "$LINENO" 'Failed to read from GitHub'
+    checkFileInTAR 'github://mxmlnkn:ratarmount@v0.15.2/tests' single-file.tar 1a28538854d1884e4415cb9bfb7a2ad8 ||
+        returnError "$LINENO" 'Failed to read from GitHub'
+}
+
+
+checkURLProtocolS3()
+{
+    # Starts a local seaweedfs server with S3 gateway, uploads tests/single-file.tar into a bucket,
+    # and mounts the uploaded file and the bucket itself via s3:// URLs.
+    local pid weedFolder port
+    port=8053
+
+    if [[ ! -f weed ]]; then
+        wget -q 'https://github.com/seaweedfs/seaweedfs/releases/download/3.74/linux_amd64_large_disk.tar.gz'
+        tar -xf 'linux_amd64_large_disk.tar.gz'
+    fi
+    [[ -x weed ]] || chmod u+x weed
+
+    weedFolder=$( mktemp -d )
+    TMP_FILES_TO_CLEANUP+=( "$weedFolder" )
+    ./weed server -dir="$weedFolder" -s3 -s3.port "$port" -idleTimeout=30 -ip 127.0.0.1 2>weed.log &
+    pid=$!
+
+    # Wait for port to open
+    echo "Waiting for seaweedfs to start up and port $port to open..."
+    python3 -c '
+import socket
+import sys
+import time
+from contextlib import closing
+
+t0 = time.time()
+for i in range(10):
+    # A socket should not be reused after a failed connect, so create a fresh one for each attempt.
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
+        if sock.connect_ex(("127.0.0.1", int(sys.argv[1]))) == 0:
+            print(f"Weed port opened after {time.time() - t0:.1f} s.")
+            break
+    time.sleep(5)
+else:
+    sys.exit(f"Weed port did not open within {time.time() - t0:.1f} s.")
+' "$port" ||
+        returnError "$LINENO" 'Failed to start S3 server'
+
+    # Create bucket and upload test file
+    python3 -c "
+import os
+import sys
+import boto3
+
+def list_buckets(client):
+    result = client.list_buckets()
+    return [x['Name'] for x in result['Buckets']] if 'Buckets' in result else []
+
+def list_bucket_files(client, bucket_name):
+    result = client.list_objects_v2(Bucket=bucket_name)
+    return [x['Key'] for x in result['Contents']] if 'Contents' in result else []
+
+endpoint_url = 'http://127.0.0.1:' + sys.argv[1]
+print('Connect to:', endpoint_url)
+
+client = boto3.client(
+    's3', endpoint_url=endpoint_url,
+    aws_access_key_id = '01234567890123456789',
+    aws_secret_access_key = '0123456789012345678901234567890123456789'
+)
+
+bucket_name = 'bucket'
+buckets = list_buckets(client)
+print('Existing buckets:', buckets)
+if bucket_name not in buckets:
+    print(f'Create new bucket: {bucket_name} ...')
+    client.create_bucket(Bucket=bucket_name)
+
+path = 'tests/single-file.tar'
+if not os.path.isfile(path):
+    sys.exit('Failed to find file to upload: ' + path)
+print(f'Upload file {path} to bucket.')
+client.upload_file(path, bucket_name, 'single-file.tar')
+" "$port" ||
+        returnError "$LINENO" 'Failed to upload test file to S3 server'
+
+    export FSSPEC_S3_ENDPOINT_URL="http://127.0.0.1:$port"
+    # Even though no credentials are configured for the seaweedfs server, we need dummy credentials for boto3 -.-
+    export AWS_ACCESS_KEY_ID=01234567890123456789
+    export AWS_SECRET_ACCESS_KEY=0123456789012345678901234567890123456789
+
+    # At last, test ratarmount.
+    checkFileInTARForeground "s3://bucket/single-file.tar" 'bar' d3b07384d113edec49eaa6238ad5ff00 ||
+        returnError "$LINENO" 'Failed to read from S3 server'
+    checkFileInTARForeground "s3://bucket/" 'single-file.tar' 1a28538854d1884e4415cb9bfb7a2ad8 ||
+        returnError "$LINENO" 'Failed to read from S3 server'
+    checkFileInTARForeground "s3://bucket" 'single-file.tar' 1a28538854d1884e4415cb9bfb7a2ad8 ||
+        returnError "$LINENO" 'Failed to read from S3 server'
+
+    kill "$pid" &>/dev/null
+
+    'rm' -rf "$weedFolder"
+}
+
+
+checkURLProtocolSamba()
+{
+    # Would mount a TAR file via an smb:// URL. Disabled by the unconditional return below.
+    return 0  # Cannot automate because of the myriad of bugs and issues explained below.
+
+    local pid user password
+
+    user='pqvFUMqbqp'
+    password='ioweb123GUIweb'
+
+    # NOTE(review): The here-document, the closing "fi", and the server start-up that belong to the following
+    # lines appear to be truncated in this copy of the patch. Restore them before enabling this test.
+    # Unusable because tests should not be run as root.
+    if false; then
+        sudo apt install samba
+        cat <&dev/null &
+    pid=$!
+
+    checkFileInTAR "smb://$user:$password@127.0.0.1:8445/test-share/single-file.tar" bar d3b07384d113edec49eaa6238ad5ff00 ||
+        returnError "$LINENO" 'Failed to read from Samba server'
+}
+
+
+checkRemoteSupport()
+{
+    # Runs the checks for all remote URL protocols one after another.
+    #
+    # Some implementations of fsspec. See e.g. this list:
+    # https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
+
+    checkURLProtocolFile || returnError "$LINENO" 'Failed file:// check'
+    checkURLProtocolGit || returnError "$LINENO" 'Failed git:// check'
+    checkURLProtocolGithub || returnError "$LINENO" 'Failed github:// check'
+    checkURLProtocolFTP || returnError "$LINENO" 'Failed ftp:// check'
+
+    checkURLProtocolHTTP || returnError "$LINENO" 'Failed http:// check'
+    checkURLProtocolS3 || returnError "$LINENO" 'Failed s3:// check'
+    checkURLProtocolSSH || returnError "$LINENO" 'Failed ssh:// check'
+
+    checkURLProtocolSamba || returnError "$LINENO" 'Failed smb:// check'
+    # TODO Add and test IPFS
+    # TODO look for other fsspec implementations in an automated manner
+}
+
+
 rm -f ratarmount.{stdout,stderr}.log
 
 # Linting only to be done locally because in CI it is in separate steps
@@ -1824,7 +2302,8 @@ if [[ -z "$CI" ]]; then
     while read -r file; do
         filesToSpellCheck+=( "$file" )
     done < <( git ls-tree -r --name-only HEAD | 'grep' -E '[.](py|md|txt|sh|yml)' )
-    codespell --ignore-words-list fo,Nd,unx "${filesToSpellCheck[@]}"
+    # fsspec uses cachable instead of cacheable ...
+    codespell --ignore-words-list fo,Nd,unx,cachable "${filesToSpellCheck[@]}"
 
     flake8 --config tests/.flake8 "${files[@]}" "${testFiles[@]}" || returnError "$LINENO" 'Flake8 failed!'
 
@@ -1885,7 +2364,7 @@ fi
 # We need to run these tests without pytest because, for some reason,
 # pytest slows the zip decryption fix down from 0.1 s to 1.1 s?!
 
-python3 tests/test_ZipMountSource.py
+python3 core/tests/test_ZipMountSource.py
 
 rm -f tests/*.index.*
 
@@ -2105,6 +2584,7 @@ if [[ ! -f tests/2k-recursive-tars.tar ]]; then
     bzip2 -q -d -k tests/2k-recursive-tars.tar.bz2
 fi
 
+checkRemoteSupport
 checkStatfs || returnError "$LINENO" 'Statfs failed!'
 checkStatfsWriteOverlay || returnError "$LINENO" 'Statfs with write overlay failed!'
 checkSymbolicLinkRecursion || returnError "$LINENO" 'Symbolic link recursion failed!'
diff --git a/tests/start-asyncssh-server.py b/tests/start-asyncssh-server.py
new file mode 100644
index 00000000..f3feeafd
--- /dev/null
+++ b/tests/start-asyncssh-server.py
@@ -0,0 +1,36 @@
+"""Serve SFTP and SCP on 127.0.0.1:8022 for the ssh:// mount tests.
+
+The host key is read from "ssh_host_key" and the authorized client keys from
+"ssh_user_ca". Both names are relative, so the script has to be started from
+the folder that contains these files.
+"""
+
+import asyncio
+
+import asyncssh
+
+# Leftover server processes can be killed manually with:
+# for pid in $( ps aux | grep start-asyncssh-server | grep -v grep | awk '{ print $2; }' ); do
+#     kill "$pid"; sleep 0.1; kill -9 "$pid"; done
+
+
+async def start_server():
+    """Start listening for SSH connections with SFTP and SCP enabled."""
+    listen_options = {
+        "server_host_keys": ["ssh_host_key"],
+        "authorized_client_keys": "ssh_user_ca",
+        "sftp_factory": True,
+        "allow_scp": True,
+    }
+    await asyncssh.listen("127.0.0.1", 8022, **listen_options)
+
+
+def main():
+    """Start the server and keep the event loop running until the process is killed."""
+    event_loop = asyncio.new_event_loop()
+    event_loop.run_until_complete(start_server())
+    event_loop.run_forever()
+
+
+if __name__ == "__main__":
+    main()