diff --git a/core/ratarmountcore/SQLiteIndex.py b/core/ratarmountcore/SQLiteIndex.py index 5960eeab..f59f0424 100644 --- a/core/ratarmountcore/SQLiteIndex.py +++ b/core/ratarmountcore/SQLiteIndex.py @@ -4,10 +4,12 @@ import json import os import re +import shutil import sqlite3 import stat import sys import tarfile +import tempfile import time import traceback import urllib.parse @@ -15,6 +17,11 @@ from typing import Any, AnyStr, Callable, Dict, IO, List, Optional, Tuple, Union from dataclasses import dataclass +try: + import fsspec +except ImportError: + fsspec = None # type: ignore + try: import indexed_gzip except ImportError: @@ -27,9 +34,22 @@ from .version import __version__ from .MountSource import FileInfo, createRootFileInfo -from .compressions import TAR_COMPRESSION_FORMATS +from .compressions import ( + CompressionInfo, + LIBARCHIVE_FILTER_FORMATS, + TAR_COMPRESSION_FORMATS, + detectCompression, + findAvailableOpen, +) from .SQLiteBlobFile import SQLiteBlobsFile, WriteSQLiteBlobs -from .utils import RatarmountError, IndexNotOpenError, InvalidIndexError, findModuleVersion, MismatchingIndexError +from .utils import ( + CompressionError, + IndexNotOpenError, + InvalidIndexError, + RatarmountError, + MismatchingIndexError, + findModuleVersion, +) def getSqliteTables(connection: sqlite3.Connection): @@ -214,6 +234,9 @@ def __init__( self.sqlConnection: Optional[sqlite3.Connection] = None # Will hold the actually opened valid path to an index file self.indexFilePath: Optional[str] = None + # This is true if the index file found was compressed or an URL and had to be downloaded + # and/or extracted into a temporary folder. 
+ self.indexFilePathDeleteOnClose: bool = False self.encoding = encoding self.possibleIndexFilePaths = SQLiteIndex.getPossibleIndexFilePaths( indexFilePath, indexFolders, archiveFilePath, ignoreCurrentFolder @@ -224,6 +247,7 @@ def __init__( self.indexMinimumFileCount = indexMinimumFileCount self.backendName = backendName self._insertedRowCount = 0 + self._temporaryIndexFile: Optional[Any] = None assert self.backendName @@ -251,6 +275,8 @@ def getPossibleIndexFilePaths( ignoreCurrentFolder: bool = False, ) -> List[str]: if indexFilePath: + if '://' in indexFilePath: + return [indexFilePath] return [] if indexFilePath == ':memory:' else [os.path.abspath(os.path.expanduser(indexFilePath))] if not archiveFilePath: @@ -279,7 +305,6 @@ def openExisting(self, checkMetadata: Optional[Callable[[Dict[str, Any]], None]] """Tries to find an already existing index.""" for indexPath in self.possibleIndexFilePaths: if self._tryLoadIndex(indexPath, checkMetadata=checkMetadata): - self.indexFilePath = indexPath break def openInMemory(self): @@ -327,6 +352,8 @@ def close(self): pass self.sqlConnection = None + self._setIndexFilePath(None) + def getConnection(self) -> sqlite3.Connection: if self.sqlConnection: return self.sqlConnection @@ -459,6 +486,11 @@ def getIndexVersion(self): @staticmethod def _pathIsWritable(path: str, printDebug: int = 0) -> bool: + # Writing indexes to remote filesystems is currently not supported and we need to take care that URLs + # are not interpreted as local file paths, i.e., creating an ftp: folder with a user:password@host subfolder.
+ if '://' in path: + return False + try: folder = os.path.dirname(path) if folder: @@ -952,7 +984,31 @@ def indexIsLoaded(self) -> bool: return True - def _loadIndex(self, indexFilePath: AnyStr, checkMetadata: Optional[Callable[[Dict[str, Any]], None]]) -> None: + def _setIndexFilePath(self, indexFilePath: Optional[str], deleteOnClose: bool = False): + # This is called from __del__, so we need to account for this being called when something + # in the constructor raises an exception and not all members of self exist. + if ( + getattr(self, 'indexFilePath', None) + and getattr(self, 'indexFilePathDeleteOnClose', False) + and self.indexFilePath + and os.path.isfile(self.indexFilePath) + ): + try: + os.remove(self.indexFilePath) + except Exception as exception: + if self.printDebug >= 1: + print( + "[Warning] Failed to remove temporarily downloaded and/or extracted index file at:", + self.indexFilePath, + "because of:", + exception, + ) + + if hasattr(self, 'indexFilePath') and hasattr(self, 'indexFilePathDeleteOnClose'): + self.indexFilePath = indexFilePath + self.indexFilePathDeleteOnClose = deleteOnClose + + def _loadIndex(self, indexFilePath: str, checkMetadata: Optional[Callable[[Dict[str, Any]], None]]) -> None: """ Loads the given index SQLite database and checks it for validity raising an exception if it is invalid. @@ -964,7 +1020,68 @@ def _loadIndex(self, indexFilePath: AnyStr, checkMetadata: Optional[Callable[[Di if self.indexIsLoaded(): return - self.sqlConnection = self._openSqlDb(indexFilePath) + # Download and/or extract the file to a temporary file if necessary. + + # Strip file:// prefix to avoid useless copies to the temporary directory. + # TODO What about operator chaining?! It would be a valid use case for starting with file://!
+ fileURLPrefix = 'file://' + while indexFilePath.startswith(fileURLPrefix): + indexFilePath = indexFilePath[len(fileURLPrefix) :] + + temporaryFolder = os.environ.get("RATARMOUNT_INDEX_TMPDIR", None) + + def _undoCompression(file) -> Optional[Tuple[str, CompressionInfo]]: + compressionsToTest = TAR_COMPRESSION_FORMATS.copy() + compressionsToTest.update(LIBARCHIVE_FILTER_FORMATS) + compression = detectCompression(file, printDebug=self.printDebug, compressionsToTest=compressionsToTest) + if not compression or compression not in compressionsToTest: + return None + + if self.printDebug >= 2: + print(f"[Info] Detected {compression}-compressed index.") + + formatOpen = findAvailableOpen(compression) + if not formatOpen: + moduleNames = [module.name for module in TAR_COMPRESSION_FORMATS[compression].modules] + raise CompressionError( + f"Cannot open a {compression} compressed index file {indexFilePath} " + f"without any of these modules: {moduleNames}" + ) + + return formatOpen(file) + + def _copyToTemp(file): + self._temporaryIndexFile = tempfile.NamedTemporaryFile(suffix=".tmp.sqlite.index", dir=temporaryFolder) + # TODO add progress bar / output? + with open(self._temporaryIndexFile.name, 'wb') as targetFile: + shutil.copyfileobj(file, targetFile) + + if '://' in indexFilePath: + if fsspec is None: + raise RatarmountError( + "Detected an URL for the index path but fsspec could not be imported!\n" + "Try installing it with 'pip install fsspec' or 'pip install ratarmount[full]'." + ) + + # TODO Maybe manual deletion not even necessary when using tempfile correctly? 
+ with fsspec.open(indexFilePath) as file: + decompressedFile = _undoCompression(file) + with decompressedFile if decompressedFile else file as fileToCopy: + _copyToTemp(fileToCopy) + else: + with open(indexFilePath, 'rb') as file: + decompressedFile = _undoCompression(file) + if decompressedFile: + with decompressedFile: + _copyToTemp(decompressedFile) + else: + temporaryIndexFilePath = indexFilePath + + temporaryIndexFilePath = self._temporaryIndexFile.name if self._temporaryIndexFile else indexFilePath + + # Done downloading and/or extracting the SQLite index. + + self.sqlConnection = self._openSqlDb(temporaryIndexFilePath) tables = getSqliteTables(self.sqlConnection) versions = None try: @@ -1036,10 +1153,15 @@ def _loadIndex(self, indexFilePath: AnyStr, checkMetadata: Optional[Callable[[Di pass if self.printDebug >= 1: - print(f"Successfully loaded offset dictionary from {str(indexFilePath)}") + message = "Successfully loaded offset dictionary from " + str(indexFilePath) + if temporaryIndexFilePath != indexFilePath: + message += " temporarily downloaded/decompressed into: " + str(temporaryIndexFilePath) + print(message) + + self._setIndexFilePath(temporaryIndexFilePath) def _tryLoadIndex( - self, indexFilePath: AnyStr, checkMetadata: Optional[Callable[[Dict[str, Any]], None]] = None + self, indexFilePath: str, checkMetadata: Optional[Callable[[Dict[str, Any]], None]] = None ) -> bool: """Calls loadIndex if index is not loaded already and provides extensive error handling.""" diff --git a/core/ratarmountcore/compressions.py b/core/ratarmountcore/compressions.py index 52815e81..9a3a7f52 100644 --- a/core/ratarmountcore/compressions.py +++ b/core/ratarmountcore/compressions.py @@ -571,7 +571,10 @@ def getGzipInfo(fileobj: IO[bytes]) -> Optional[Tuple[str, int]]: def detectCompression( - fileobj: IO[bytes], prioritizedBackends: Optional[List[str]], printDebug: int = 0 + fileobj: IO[bytes], + prioritizedBackends: Optional[List[str]] = None, + printDebug: int = 0, + 
compressionsToTest: Dict[str, CompressionInfo] = TAR_COMPRESSION_FORMATS, ) -> Optional[str]: # isinstance(fileobj, io.IOBase) does not work for everything, e.g., for paramiko.sftp_file.SFTPFile # because it does not inherit from io.IOBase. Therefore, do duck-typing and test for required methods. @@ -594,7 +597,7 @@ def detectCompression( return None oldOffset = fileobj.tell() - for compressionId, compression in TAR_COMPRESSION_FORMATS.items(): + for compressionId, compression in compressionsToTest.items(): # The header check is a necessary condition not a sufficient condition. # Especially for gzip, which only has 2 magic bytes, false positives might happen. # Therefore, only use the magic bytes based check if the module could not be found diff --git a/ratarmount.py b/ratarmount.py index 84ee1bb0..d8dc1911 100755 --- a/ratarmount.py +++ b/ratarmount.py @@ -591,7 +591,15 @@ def pointsIntoMountPoint(pathToTest): hasIndexPath = False if 'indexFilePath' in options and isinstance(options['indexFilePath'], str): - indexFilePath = os.path.realpath(options['indexFilePath']) + indexFilePath = options['indexFilePath'] + # TODO What about operator chaining?! It would be a valid use case for starting with file://! + if '://' in options['indexFilePath']: + fileURLPrefix = 'file://' + while indexFilePath.startswith(fileURLPrefix): + indexFilePath = indexFilePath[len(fileURLPrefix) :] + if '://' not in indexFilePath: + indexFilePath = os.path.realpath(options['indexFilePath']) + if pointsIntoMountPoint(indexFilePath): del options['indexFilePath'] else: @@ -1265,7 +1273,13 @@ def _parseArgs(rawArgs: Optional[List[str]] = None): indexGroup.add_argument( '--index-file', type=str, help='Specify a path to the .index.sqlite file. Setting this will disable fallback index folders. ' - 'If the given path is ":memory:", then the index will not be written out to disk.') + 'If the given path is ":memory:", then the index will not be written out to disk.
' + 'If the specified path is a remote URL, such as "https://host.org/file.tar.index.sqlite", or ' + 'a compressed index, such as "file.tar.index.sqlite.gz", then the index file will be downloaded ' + f'and/or extracted into the default temporary folder ({tempfile.gettempdir()}). This path can be ' + 'changed with the environment variable RATARMOUNT_INDEX_TMPDIR. The temporary folder in general ' + 'can also be changed with these environment variables in decreasing priority: TMPDIR, TEMP, TMP ' + 'as described in the Python tempfile standard library documentation.') indexFolders = ['', os.path.join( "~", ".ratarmount")] xdgCacheHome = getXdgCacheHome() diff --git a/tests/ratarmount-help.txt b/tests/ratarmount-help.txt index a6330596..73094146 100644 --- a/tests/ratarmount-help.txt +++ b/tests/ratarmount-help.txt @@ -49,7 +49,15 @@ Index Options: --index-file INDEX_FILE Specify a path to the .index.sqlite file. Setting this will disable fallback index folders. If the given path is ":memory:", then the index - will not be written out to disk. (default: None) + will not be written out to disk. If the specified path is a remote URL, + such as "https://host.org/file.tar.index.sqlite", or a compressed index, + such as "file.tar.index.sqlite.gz", then the index file will be + downloaded and/or extracted into the default temporary folder (/tmp). + This path can be changed with the environment variable + RATARMOUNT_INDEX_TMPDIR. The temporary folder in general can also be + changed with these environment variables in decreasing priority: TMPDIR, + TEMP, TMP as described in the Python tempfile standard library + documentation. (default: None) --index-folders INDEX_FOLDERS Specify one or multiple paths for storing .index.sqlite files. Paths will be tested for suitability in the given order. An empty path will be