Skip to content

Commit

Permalink
[feature] Implement a faster git backend that works with pygit2 >= 1.15
Browse files Browse the repository at this point in the history
  • Loading branch information
mxmlnkn committed Oct 6, 2024
1 parent 0b01c9a commit e23c68a
Show file tree
Hide file tree
Showing 3 changed files with 171 additions and 1 deletion.
130 changes: 130 additions & 0 deletions core/ratarmountcore/GitMountSource.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import io
import os
import stat
from typing import Dict, Iterable, IO, Optional, Union

try:
import pygit2
except ImportError:
pygit2 = None # type: ignore

from .MountSource import FileInfo, MountSource
from .utils import overrides


class GitMountSource(MountSource):
    """
    Read-only mount source that exposes the file tree of one commit of a local git repository.

    Reimplementation from scratch of the very barebones implementation inside fsspec
    because it is slow and "older" versions did not work with pygit2 1.15.
    https://github.com/fsspec/filesystem_spec/blob/master/fsspec/implementations/git.py
    https://github.com/fsspec/filesystem_spec/issues/1708
    """

    # False when pygit2 could not be imported. Callers are expected to check this before instantiating.
    enabled = pygit2 is not None

    # pylint: disable=unused-argument
    def __init__(self, path: Optional[str] = None, reference: Optional[str] = None, **kwargs):
        """
        path: Path to the git repository. The current working directory is used if empty or None.
        reference: Anything accepted by pygit2.Repository.resolve_refish. If empty or None, a default
                   is chosen by _getDefaultReference.
        kwargs: Ignored.
        """
        self.repository = pygit2.Repository(path if path else os.getcwd())
        self.reference = reference if reference else self._getDefaultReference(self.repository)
        commit, _ = self.repository.resolve_refish(self.reference)
        self.tree = commit.tree
        # Reported as the modification time of every entry. It must belong to the commit whose tree
        # is exposed, which is not necessarily the commit HEAD points to.
        self.commitTime = commit.commit_time
        # Path inside the tree that is prepended to all looked up paths. May be set by the caller.
        self.prefix = ""

    @staticmethod
    def _getDefaultReference(repository):
        """
        Returns the name of the branch to show when the caller did not specify a reference.
        Candidates in order of precedence: the configured init.defaultBranch (only if such a branch
        exists), the checked out branch, 'master', 'main'. Falls back to 'master' if nothing matches.
        """
        branches = repository.branches

        # init.defaultBranch only names the branch for newly created repositories and is often set
        # globally, so it may name a branch that does not exist in this particular repository.
        if 'init.defaultBranch' in repository.config:
            configuredBranch = repository.config['init.defaultBranch']
            if configuredBranch and configuredBranch in branches:
                return configuredBranch

        # Try to find checked out branch.
        for branch in branches:
            if branches[branch].is_head():
                return branch

        for branch in ['master', 'main']:
            if branch in branches:
                return branch

        return 'master'

    def _lookUpPath(self, path: str):
        """
        Returns the git object found at self.prefix + '/' + path or None if it does not exist.
        Empty path components, e.g., resulting from leading or doubled slashes, are ignored.
        """
        obj = self.tree
        for name in self.prefix.split("/") + path.split("/"):
            if not name:
                continue
            # Descending into anything that is not a tree, e.g., "file.txt/child", cannot succeed.
            if not isinstance(obj, pygit2.Tree) or name not in obj:
                return None
            obj = obj[name]
        return obj

    @staticmethod
    def _convertToFileMode(obj):
        """Returns read-and-execute-only permissions combined with the file type bits matching obj."""
        if obj.filemode == pygit2.enums.FileMode.LINK:
            return 0o555 | stat.S_IFLNK
        return 0o555 | (stat.S_IFDIR if isinstance(obj, pygit2.Tree) else stat.S_IFREG)

    def _convertToFileInfo(self, obj, path: str):
        """
        Builds the FileInfo for the git object obj. The given path is stored in userdata
        so that open can look the object up again.
        """
        isLink = obj.filemode == pygit2.enums.FileMode.LINK
        return FileInfo(
            # fmt: off
            size     = getattr(obj, 'size', 0),  # Objects without a size attribute are reported as empty.
            mtime    = self.commitTime,
            mode     = GitMountSource._convertToFileMode(obj),
            linkname = obj.data.decode() if isLink else "",  # The link target is stored as the object data.
            uid      = os.getuid(),
            gid      = os.getgid(),
            userdata = [path],
            # fmt: on
        )

    @overrides(MountSource)
    def isImmutable(self) -> bool:
        return True

    @overrides(MountSource)
    def exists(self, path: str) -> bool:
        return self._lookUpPath(path) is not None

    def _listDir(self, path: str, onlyMode: bool) -> Optional[Union[Iterable[str], Dict[str, FileInfo]]]:
        """
        Returns a dictionary mapping each entry name of the folder at path to either its mode
        (onlyMode == True) or its full FileInfo. Returns None if path is not an existing folder.
        """
        tree = self._lookUpPath(path)
        if not isinstance(tree, pygit2.Tree):
            return None
        return {
            obj.name: (
                GitMountSource._convertToFileMode(obj)
                if onlyMode
                else self._convertToFileInfo(obj, path + '/' + obj.name)
            )
            for obj in tree
        }

    @overrides(MountSource)
    def listDir(self, path: str) -> Optional[Union[Iterable[str], Dict[str, FileInfo]]]:
        return self._listDir(path, onlyMode=False)

    @overrides(MountSource)
    def listDirModeOnly(self, path: str) -> Optional[Union[Iterable[str], Dict[str, int]]]:
        return self._listDir(path, onlyMode=True)

    @overrides(MountSource)
    def getFileInfo(self, path: str, fileVersion: int = 0) -> Optional[FileInfo]:
        obj = self._lookUpPath(path)
        return None if obj is None else self._convertToFileInfo(obj, path)

    @overrides(MountSource)
    def fileVersions(self, path: str) -> int:
        return 1

    @overrides(MountSource)
    def open(self, fileInfo: FileInfo, buffering=-1) -> IO[bytes]:
        """
        Returns an in-memory file object with the contents of the file described by fileInfo,
        which must have been created by this mount source. The buffering argument is ignored.
        Raises FileNotFoundError if the stored path does not refer to a file.
        """
        path = fileInfo.userdata[-1]
        assert isinstance(path, str)
        blob = self._lookUpPath(path)
        if not isinstance(blob, pygit2.Blob):
            raise FileNotFoundError(f"The path '{path}' does not refer to a file in the git tree!")
        # TODO Avoid high memory usage for very large files.
        #      Check whether pygit2 even has a kind of streaming API for file contents.
        return io.BytesIO(blob.data)

    @overrides(MountSource)
    def __exit__(self, exception_type, exception_value, exception_traceback):
        pass
40 changes: 40 additions & 0 deletions core/ratarmountcore/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# Disable pylint errors. See https://github.com/fsspec/filesystem_spec/issues/1678

import os
import stat
import sys
import traceback
import warnings
Expand All @@ -16,6 +17,7 @@
from .MountSource import MountSource
from .FolderMountSource import FolderMountSource
from .FSSpecMountSource import FSSpecMountSource
from .GitMountSource import GitMountSource
from .RarMountSource import RarMountSource
from .SingleFileMountSource import SingleFileMountSource
from .SQLiteIndexedTar import SQLiteIndexedTar
Expand Down Expand Up @@ -135,6 +137,44 @@ def openFsspec(url, options, printDebug: int) -> Optional[Union[MountSource, IO[
if protocol == 'file':
return splitURI[1]

if protocol == 'git':
if not GitMountSource.enabled:
raise ValueError(
"Detected git:// URL but GitMountSource could not be loaded. Please ensure that pygit2 is installed."
)

remainder = splitURI[1]

splitRepositoryPath = remainder.split(':', 1)
repositoryPath = splitRepositoryPath[0] if len(splitRepositoryPath) > 1 else None
remainder = splitRepositoryPath[-1]

splitReference = remainder.split('@', 1)
reference = splitReference[0] if len(splitReference) > 1 else None
pathInsideRepository = splitReference[-1]

mountSource = GitMountSource(repositoryPath, reference=reference)
if pathInsideRepository:
fileInfo = mountSource.getFileInfo(pathInsideRepository)
if not fileInfo:
raise ValueError(
f"The path {pathInsideRepository} in the git repository specified via '{url}' does not exist!"
)

if stat.S_ISDIR(fileInfo.mode):
mountSource.prefix = pathInsideRepository
else:
# Add tarFileName argument so that mounting a TAR file stored inside a git repository can create
# a properly named index file inside ~/.cache/ratarmount.
if 'tarFileName' not in options:
options['tarFileName'] = url

# In the future it might be necessary to extend the lifetime of mountSource by adding it as
# a member of the opened file, but not right now.
return mountSource.open(fileInfo)

return mountSource

if not fsspec:
print("[Warning] An URL was detected but fsspec is not installed. You may want to install it with:")
print("[Warning] python3 -m pip install ratarmount[fsspec]")
Expand Down
2 changes: 1 addition & 1 deletion tests/.pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ init-hook='import sys; sys.path.append("./core")'
# run arbitrary code.
extension-pkg-whitelist=indexed_gzip,indexed_bzip2,indexed_zstd,libarchive,libarchive.ffi,lzmaffi,rapidgzip,isal,
PySquashfsImage,PySquashfsImage.compressor,zstandard,lz4,deflate,pyminizip,fast_zip_decryption,
asyncssh,sshfs,fsspec
asyncssh,sshfs,fsspec,pygit2

# Specify a score threshold to be exceeded before program exits with error.
fail-under=10.0
Expand Down

0 comments on commit e23c68a

Please sign in to comment.