From dbad68401b1bb9b65d01dc3794832cf7ac6ccad0 Mon Sep 17 00:00:00 2001 From: rgaudin Date: Mon, 23 Sep 2024 13:59:20 +0000 Subject: [PATCH 1/3] Use a Storage Interface to abstract S3 and WebDAV - new WEBDAV_URL_WITH_CREDENTIALS environ - new WEBDAV_REQUEST_TIMEOUT_SEC environ (must be long enough for all files to be uploaded in one request) - new `constants.storage_type`: WEBDAV if webdav environ is set, otherwise S3 - rename all things `s3` to `storage` - new abstract StorageInterface which includes Key/Path provider - `s3` module to implement the StorageInterface - new `webdav` module implementing the StorageInterface - New `webdav_path` property on Project to store the part of URL upload to - config entry point - collections.json remote parsing - mimetype from suffix - add/delete/update via .dav endpoint - auto create remote folder - display public URL on UI - recursive listing of files from WeBDAV - auto filtering of known garbage (macos files) - UI replaceProject - UI get /config - UI conditional display of setter or updater --- backend/api/constants.py | 22 +- backend/api/database/models.py | 2 + backend/api/entrypoint.py | 6 +- backend/api/routes/archives.py | 58 +++--- backend/api/routes/files.py | 57 ++--- backend/api/routes/projects.py | 195 +++++++++++++++++- backend/api/routes/utils.py | 7 + backend/api/s3.py | 46 ----- backend/api/storage/__init__.py | 70 +++++++ backend/api/storage/s3.py | 109 ++++++++++ backend/api/storage/webdav.py | 159 ++++++++++++++ .../versions/b9fb42d00930_webdav_path.py | 28 +++ backend/tests/conftest.py | 17 +- backend/tests/routes/test_archives.py | 2 +- dev/reload-compose.yaml | 6 +- frontend/Dockerfile | 1 + .../src/components/FileTableRowComponent.vue | 2 +- .../src/components/ProjectColumnComponent.vue | 2 +- .../components/ReloadWebDAVPathComponent.vue | 48 +++++ .../components/SaveWebDAVPathComponent.vue | 55 +++++ frontend/src/constants.ts | 9 +- frontend/src/stores/stores.ts | 31 ++- frontend/src/utils.ts | 18 ++ frontend/src/views/ProjectView.vue | 18 +- 24 files changed, 836 insertions(+), 132 deletions(-) delete mode 100644 backend/api/s3.py create mode 100644 backend/api/storage/__init__.py create mode 100644 backend/api/storage/s3.py create mode 100644 backend/api/storage/webdav.py create mode 100644 backend/migrations/versions/b9fb42d00930_webdav_path.py create mode 100644 frontend/src/components/ReloadWebDAVPathComponent.vue create mode 100644 frontend/src/components/SaveWebDAVPathComponent.vue diff --git a/backend/api/constants.py b/backend/api/constants.py index 02ffa541..c746c86e 100644 --- a/backend/api/constants.py +++ b/backend/api/constants.py @@ -4,6 +4,7 @@ import tempfile import uuid from dataclasses import dataclass, field +from enum import StrEnum from pathlib import Path from uuid import UUID @@ -19,6 +20,11 @@ def determine_mandatory_environment_variables(): raise OSError(f"Please set the {variable} environment variable") +class StorageType(StrEnum): + WEBDAV = "webdav" + S3 = "s3" + + @dataclass(kw_only=True) class BackendConf: """ @@ -40,7 +46,7 @@ class BackendConf: # Scheduler process redis_uri: str = os.getenv("REDIS_URI") or "redis://localhost:6379/0" - channel_name: str = os.getenv("CHANNEL_NAME") or "s3_upload" + channel_name: str = os.getenv("CHANNEL_NAME") or "storage_upload" # Transient (on host disk) Storage transient_storage_path: Path = Path() @@ -58,6 +64,14 @@ class BackendConf: "PRIVATE_SALT", uuid.uuid4().hex ) # used to make S3 keys unguessable + # WebDAV Storage + webdav_url_with_credentials: str = 
os.getenv("WEBDAV_URL_WITH_CREDENTIALS") or "" + # webdav4 uses a single timeout mechanism for all requests, + # including the upload (PUT) ones which can be large + webdav_request_timeout_sec: int = int( + os.getenv("WEBDAV_REQUEST_TIMEOUT_SEC") or "900" + ) + # Cookies cookie_domain = os.getenv("COOKIE_DOMAIN", None) cookie_expiration_days = int(os.getenv("COOKIE_EXPIRATION_DAYS", "30")) @@ -138,6 +152,12 @@ def __post_init__(self): def single_user(self) -> UUID: return UUID(self.single_user_id) + @property + def storage_type(self) -> StorageType: + if self.webdav_url_with_credentials: + return StorageType.WEBDAV + return StorageType.S3 + constants = BackendConf() logger = constants.logger diff --git a/backend/api/database/models.py b/backend/api/database/models.py index 5bafe225..f7f9b98d 100644 --- a/backend/api/database/models.py +++ b/backend/api/database/models.py @@ -165,6 +165,8 @@ class Project(Base): user: Mapped[User] = relationship(back_populates="projects", init=False) + webdav_path: Mapped[str | None] + files: Mapped[list["File"]] = relationship(cascade="all, delete-orphan") archives: Mapped[list["Archive"]] = relationship(cascade="all, delete-orphan") diff --git a/backend/api/entrypoint.py b/backend/api/entrypoint.py index 9b1965dd..058187c0 100755 --- a/backend/api/entrypoint.py +++ b/backend/api/entrypoint.py @@ -2,13 +2,13 @@ import uvicorn +from api import storage from api.main import create_app # pragma: no cover -from api.s3 import s3_storage # dont check S3 credentials URL in tests if "pytest" not in sys.modules: - # raises should S3 credentials URL be malformed - _ = s3_storage.storage + # raises should S3 credentials URL be malformed (attempts to use) + _ = storage.storage app = create_app() # pragma: no cover diff --git a/backend/api/routes/archives.py b/backend/api/routes/archives.py index 0eb06cd4..6cbd3997 100644 --- a/backend/api/routes/archives.py +++ b/backend/api/routes/archives.py @@ -26,7 +26,7 @@ read_file_in_chunks, ) from api.routes import userless_validated_project, validated_project -from api.s3 import s3_file_key, s3_storage +from api.storage import storage from api.zimfarm import RequestSchema, WebhookPayload, request_task router = APIRouter() @@ -289,7 +289,8 @@ def gen_collection_for(project: Project) -> tuple[list[dict[str, Any]], BinaryIO entry["authors"] = ", ".join(file.authors) entry["files"] = [ { - "url": f"{constants.download_url}/{s3_file_key(project.id, file.hash)}", + "url": f"{constants.download_url}/" + f"{storage.get_file_path(file=file)}", "filename": file.filename, } ] @@ -304,22 +305,17 @@ def gen_collection_for(project: Project) -> tuple[list[dict[str, Any]], BinaryIO return collection, file, digest -def get_file_key(project_id: UUID, file_hash: str, suffix: str) -> str: - # suffix useful to debug live URLs in-browser - return f"{s3_file_key(project_id=project_id, file_hash=file_hash)}{suffix}" - - -def upload_file_to_s3(project: Project, file: BinaryIO, s3_key: str): +def upload_file_to_storage(project: Project, file: BinaryIO, storage_path: str): try: - if s3_storage.storage.has_object(s3_key): - logger.debug(f"Object `{s3_key}` already in S3… weird but OK") + if storage.has(storage_path): + logger.debug(f"Object `{storage_path}` already in Storage… weird but OK") return - logger.debug(f"Uploading file to `{s3_key}`") - s3_storage.storage.upload_fileobj(fileobj=file, key=s3_key) - s3_storage.storage.set_object_autodelete_on(s3_key, project.expire_on) + logger.debug(f"Uploading file to `{storage_path}`") + 
storage.upload_fileobj(fileobj=file, path=storage_path) + storage.set_autodelete_on(storage_path, project.expire_on) except Exception as exc: - logger.error(f"File failed to upload to s3 `{s3_key}`: {exc}") + logger.error(f"File failed to upload to Storage `{storage_path}`: {exc}") raise exc @@ -357,37 +353,41 @@ async def request_archive( # upload illustration illustration = io.BytesIO(base64.b64decode(archive.config.illustration)) - illus_key = get_file_key( - project_id=archive.project_id, + illus_key = storage.get_companion_file_path( + project=project, file_hash=generate_file_hash(illustration), - suffix=".png", + suffix="illustration.png", ) illustration.seek(0) - # upload it to S3 - upload_file_to_s3(project=project, file=illustration, s3_key=illus_key) + # upload it to Storage + upload_file_to_storage(project=project, file=illustration, storage_path=illus_key) # upload main-logo if archive.config.main_logo: main_logo = io.BytesIO(base64.b64decode(archive.config.main_logo)) - main_logo_key = get_file_key( - project_id=archive.project_id, + main_logo_key = storage.get_companion_file_path( + project=project, file_hash=generate_file_hash(main_logo), - suffix=".png", + suffix="main-logo.png", ) main_logo.seek(0) - # upload it to S3 - upload_file_to_s3(project=project, file=main_logo, s3_key=main_logo_key) + # upload it to Storage + upload_file_to_storage( + project=project, file=main_logo, storage_path=main_logo_key + ) # gen collection and stream collection, collection_file, collection_hash = gen_collection_for(project=project) - collection_key = get_file_key( - project_id=archive.project_id, file_hash=collection_hash, suffix=".json" + collection_key = storage.get_companion_file_path( + project=project, file_hash=collection_hash, suffix="collection.json" ) - # upload it to S3 - upload_file_to_s3(project=project, file=collection_file, s3_key=collection_key) + # upload it to Storage + upload_file_to_storage( + project=project, file=collection_file, storage_path=collection_key + ) - # Everything's on S3, prepare and submit a ZF request + # Everything's on Storage, prepare and submit a ZF request request_def = RequestSchema( collection_url=f"{constants.download_url}/{collection_key}", name=archive.config.name, diff --git a/backend/api/routes/files.py b/backend/api/routes/files.py index a3a6a535..9d43310b 100644 --- a/backend/api/routes/files.py +++ b/backend/api/routes/files.py @@ -16,7 +16,7 @@ from api.database.utils import get_file_by_id, get_project_by_id from api.files import calculate_file_size, generate_file_hash, save_file from api.routes import validated_project -from api.s3 import s3_file_key, s3_storage +from api.storage import storage from api.store import task_queue router = APIRouter() @@ -48,7 +48,7 @@ class FileModel(BaseModel): class FileStatus(str, Enum): LOCAL = "LOCAL" - S3 = "S3" + STORAGE = "STORAGE" FAILURE = "FAILURE" PROCESSING = "PROCESSING" @@ -130,8 +130,8 @@ def update_file_path(file: File, path: str): update_file_status_and_path(file, file.status, path) -def upload_file_to_s3(new_file_id: UUID): - """Update local file to S3 storage and update file status""" +def upload_file_to_storage(new_file_id: UUID): + """Update local file to Storage and update file status""" new_file = get_file_by_id(new_file_id) project = get_project_by_id(new_file.project_id) if not project.expire_on: @@ -142,35 +142,38 @@ def upload_file_to_s3(new_file_id: UUID): else: update_file_status(new_file, FileStatus.PROCESSING) - s3_key = s3_file_key(new_file.project_id, new_file.hash) - try: 
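        # (new Storage-based flow in this hunk) resolve the storage path for the
        # File, short-circuit if the object already exists remotely, otherwise
        # upload the local file, drop the transient copy, schedule auto-deletion
        # for the project's expiry date, and mark the File as STORAGE; any
        # failure flips the File to FAILURE and re-raises so rq can retry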
- if s3_storage.storage.has_object(s3_key): - logger.debug(f"Object `{s3_key}` for File {new_file_id} already in S3") - update_file_status_and_path(new_file, FileStatus.S3, s3_key) + storage_path = storage.get_file_path(file=new_file) + if storage.has(storage_path): + logger.debug( + f"Object `{storage_path}` for File {new_file_id} already in Storage" + ) + update_file_status_and_path(new_file, FileStatus.STORAGE, storage_path) return - logger.debug(f"Uploading {new_file_id}: `{new_file.local_fpath}` to `{s3_key}`") - s3_storage.storage.upload_file(fpath=new_file.local_fpath, key=s3_key) + logger.debug( + f"Uploading {new_file_id}: `{new_file.local_fpath}` to `{storage_path}`" + ) + storage.upload_file(fpath=new_file.local_fpath, path=storage_path) logger.debug(f"Uploaded {new_file_id}. Removing `{new_file.local_fpath}`…") new_file.local_fpath.unlink(missing_ok=True) - s3_storage.storage.set_object_autodelete_on(s3_key, project.expire_on) - update_file_status_and_path(new_file, FileStatus.S3, s3_key) + storage.set_autodelete_on(storage_path, project.expire_on) + update_file_status_and_path(new_file, FileStatus.STORAGE, storage_path) except Exception as exc: - logger.error(f"File: {new_file_id} failed to upload to s3: {exc}") + logger.error(f"File: {new_file_id} failed to upload to Storage: {exc}") update_file_status(new_file, FileStatus.FAILURE) raise exc -def delete_key_from_s3(s3_file_key: str): - """Delete files from S3.""" - logger.warning(f"File: {s3_file_key} starts deletion from S3") - if not s3_storage.storage.has_object(s3_file_key): - logger.debug(f"{s3_file_key} does not exist in S3") +def delete_from_storage(storage_path: str): + """Delete files from Storage.""" + logger.warning(f"File: {storage_path} starts deletion from Storage") + if not storage.has(storage_path): + logger.debug(f"{storage_path} does not exist in Storage") return try: - s3_storage.storage.delete_object(s3_file_key) + storage.delete(storage_path) except Exception as exc: - logger.error(f"File: {s3_file_key} failed to be deleted from S3") + logger.error(f"File: {storage_path} failed to be deleted from Storage") logger.exception(exc) @@ -239,7 +242,7 @@ async def create_file( indep_session.refresh(new_file) file_id = str(new_file.id) # request file upload by rq-worker - task_queue.enqueue(upload_file_to_s3, file_id, retry=constants.job_retry) + task_queue.enqueue(upload_file_to_storage, file_id, retry=constants.job_retry) # fetch File from DB in this session to return it file = session.execute(select(File).filter_by(id=file_id)).scalar() @@ -295,14 +298,12 @@ async def delete_file( if number_of_duplicate_files == 1: if file.status == FileStatus.LOCAL: file.local_fpath.unlink(missing_ok=True) - if file.status == FileStatus.S3: - task_queue.enqueue( - delete_key_from_s3, s3_file_key(file.project_id, file.hash) - ) + if file.status == FileStatus.STORAGE: + task_queue.enqueue(delete_from_storage, storage.get_file_path(file=file)) if file.status == FileStatus.PROCESSING: task_queue.enqueue_at( datetime.datetime.now(tz=datetime.UTC) + constants.s3_deletion_delay, - delete_key_from_s3, - s3_file_key(file.project_id, file.hash), + delete_from_storage, + storage.get_file_path(file=file), ) session.delete(file) diff --git a/backend/api/routes/projects.py b/backend/api/routes/projects.py index 2eb306c6..78f8273e 100644 --- a/backend/api/routes/projects.py +++ b/backend/api/routes/projects.py @@ -1,15 +1,29 @@ import datetime from http import HTTPStatus -from uuid import UUID +from pathlib import Path +from urllib.parse 
import urljoin +from uuid import UUID, uuid4 +import requests from fastapi import APIRouter, Depends from pydantic import BaseModel, ConfigDict, TypeAdapter -from sqlalchemy import update +from sqlalchemy import select, update from sqlalchemy.orm import Session +from api.constants import constants, logger +from api.database import Session as DBSession from api.database import gen_session -from api.database.models import Archive, ArchiveConfig, ArchiveStatus, Project, User +from api.database.models import ( + Archive, + ArchiveConfig, + ArchiveStatus, + File, + Project, + User, +) from api.routes import validated_project, validated_user +from api.routes.files import FileStatus, validate_project_quota +from api.storage import storage router = APIRouter(prefix="/projects") @@ -18,11 +32,16 @@ class ProjectRequest(BaseModel): name: str +class ProjectWebdavRequest(BaseModel): + webdav_path: str + + class ProjectModel(BaseModel): name: str id: UUID created_on: datetime.datetime expire_on: datetime.datetime | None + webdav_path: str | None model_config = ConfigDict(from_attributes=True) @@ -39,6 +58,7 @@ async def create_project( name=project.name, created_on=now, expire_on=None, + webdav_path=None, files=[], archives=[], ) @@ -95,3 +115,172 @@ async def update_project( """Update a specific project by its id.""" stmt = update(Project).filter_by(id=project.id).values(name=project_request.name) session.execute(stmt) + + +@router.post("/{project_id}.dav", response_model=ProjectModel) +async def update_project_webdav( + project_request: ProjectWebdavRequest, + project: Project = Depends(validated_project), + session: Session = Depends(gen_session), +): + """Update a specific project by its id.""" + stmt = ( + update(Project) + .filter_by(id=project.id) + .values(webdav_path=project_request.webdav_path) + ) + session.execute(stmt) + session.refresh(project) + + # update Files from WebDAV folder + await update_project_files_from_webdav(session, project) + + session.refresh(project) + return ProjectModel.model_validate(project) + + +class NautilusCollection: + def __init__(self, data: list[dict[str, str | list[str]]]): + self.data = data + self.files_indexes: dict[str, int] = {} + for index, entry in enumerate(self.data): + for filename in entry.get("files", []): + self.files_indexes[filename] = index + + def __getitem__(self, key: str): + return self.data[self.files_indexes[key]] + + def __len__(self): + return len(self.files_indexes) + + def __contains__(self, item: str): + return item in self.files_indexes + + def __iter__(self): + return iter(self.data) + + def get(self, key: str, default=None): + try: + return self[key] + except KeyError: + return default + + +async def read_remote_collection(url: str): + resp = requests.get(url, timeout=constants.webdav_request_timeout_sec) + resp.raise_for_status() + return NautilusCollection(resp.json()) + + +async def update_project_files_from_webdav(session: Session, project: Project): + if project.webdav_path is None: + logger.warning( + f"[project #{project.id}] requested webdav update " + "but project has no webdav_path" + ) + return + + logger.debug(f"[project #{project.id}] refreshing from {project.webdav_path}") + + now = datetime.datetime.now(tz=datetime.UTC) + prefix = Path(project.webdav_path) + + # create a folder if this prefix does not exists + if not storage.has(path=project.webdav_path): + logger.debug(f"[project #{project.id}] mkdir {project.webdav_path}") + storage.mkdir(path=project.webdav_path) + + entries = { + 
str(Path(entry.path).relative_to(prefix)): entry + for entry in list(storage.list(prefix=project.webdav_path)) + } + + remote_paths = list(entries.keys()) + db_paths = [file.path for file in project.files] + to_add = [path for path in remote_paths if path not in db_paths] + to_remove = [path for path in db_paths if path not in remote_paths] + + collection = None + if entries.get("collection.json"): + logger.debug(f"[project #{project.id}] there is a remote collection.json") + try: + collection = await read_remote_collection( + urljoin(storage.public_url, f"{prefix}/collection.json") + ) + except Exception as exc: + logger.warning("Failed to read collection.json") + logger.exception(exc) + else: + logger.debug(f"[project #{project.id}] collection: {len(collection)} files") + + # update existing Files without removing metadata + for path, entry in entries.items(): + if path in to_add or path in to_remove: + continue + + logger.debug(f"[project #{project.id}] deleting {path}") + stmt = select(File).filter_by(project_id=project.id).filter_by(path=str(path)) + file = session.execute(stmt).scalar_one() + file.filesize = entry.size + file.uploaded_on = entry.modified_on + file.type = entry.mimetype + file.status = FileStatus.STORAGE.value + session.add(file) + + # add new files + for path, entry in entries.items(): + if path not in to_add: + continue + + logger.debug(f"[project #{project.id}] adding {path}") + filepath = Path(entry.path).relative_to(prefix) + filename = filepath.name + + # TODO + validate_project_quota(entry.size, project) + + title, authors, description = filename, None, None + if collection and path in collection: + authors = str(collection[path].get("authors", "")) or authors + if isinstance(authors, str): + authors = [authors] + description = str(collection[path].get("description", "")) or description + title = str(collection[path].get("title", "")) or title + + file_hash = f"unknown:{uuid4().hex}" + if len(project.files) == 0: + project.expire_on = now + constants.project_expire_after + + # adding file in an independant session that gets commited before enquing + # so its visible by other processes (rq-worker) + with DBSession.begin() as indep_session: + # get project again but from this session + project_: Project | None = indep_session.execute( + select(Project).filter_by(id=str(project.id)) + ).scalar() + if not project_: + raise OSError("Failed to re-fetch Project") + new_file = File( + filename=filename, + filesize=entry.size, + title=title, + authors=authors, + description=description, + uploaded_on=entry.modified_on, + hash=file_hash, + path=str(filepath), + type=entry.mimetype, + status=FileStatus.STORAGE.value, + ) + project_.files.append(new_file) + indep_session.add(new_file) + indep_session.flush() + indep_session.refresh(new_file) + + # delete those that dont exist anymore + for path in to_remove: + logger.debug(f"[project #{project.id}] deleting {path}") + stmt = select(File).filter_by(path=path).filter_by(project_id=project.id) + file = session.execute(stmt).scalar() + if file: + session.delete(file) diff --git a/backend/api/routes/utils.py b/backend/api/routes/utils.py index 6a33de92..42e42ef1 100644 --- a/backend/api/routes/utils.py +++ b/backend/api/routes/utils.py @@ -1,8 +1,15 @@ from fastapi import APIRouter +from api.storage import storage + router = APIRouter() @router.get("/ping") async def pong(): return {"message": "pong"} + + +@router.get("/config") +async def info(): + return {"NAUTILUS_STORAGE_URL": storage.public_url} diff --git 
a/backend/api/s3.py b/backend/api/s3.py deleted file mode 100644 index a91fe6e2..00000000 --- a/backend/api/s3.py +++ /dev/null @@ -1,46 +0,0 @@ -import hashlib -from uuid import UUID - -from kiwixstorage import KiwixStorage - -from api.constants import constants, logger - - -class S3Storage: - def __init__(self) -> None: - self._storage = None - - def _setup_s3_and_check_credentials(self, s3_url_with_credentials): - logger.info("testing S3 Optimization Cache credentials") - s3_storage = KiwixStorage(s3_url_with_credentials) - - if not s3_storage.check_credentials( - list_buckets=True, bucket=True, write=True, read=True, failsafe=True - ): - logger.error("S3 cache connection error testing permissions.") - logger.error(f" Server: {s3_storage.url.netloc}") - logger.error(f" Bucket: {s3_storage.bucket_name}") - logger.error(f" Key ID: {s3_storage.params.get('keyid')}") - raise ValueError("Unable to connect to Optimization Cache. Check its URL.") - - return s3_storage - - @property - def storage(self): - if not self._storage: - self._storage = self._setup_s3_and_check_credentials( - constants.s3_url_with_credentials - ) - return self._storage - - -s3_storage = S3Storage() - - -def s3_file_key(project_id: UUID, file_hash: str) -> str: - """S3 key for a Project's File""" - digest = hashlib.sha256( - bytes(f"{project_id}-{file_hash}-{constants.private_salt}", "utf-8") - ).hexdigest() - # using project_id/ pattern to ease browsing bucket for objects - return f"{project_id}/{digest}" diff --git a/backend/api/storage/__init__.py b/backend/api/storage/__init__.py new file mode 100644 index 00000000..77fdca56 --- /dev/null +++ b/backend/api/storage/__init__.py @@ -0,0 +1,70 @@ +import datetime +from abc import ABC, abstractmethod, abstractproperty +from collections.abc import Generator +from dataclasses import dataclass +from pathlib import Path +from typing import BinaryIO + +from api.constants import StorageType, constants +from api.database.models import File, Project + + +@dataclass(kw_only=True) +class StorageEntry: + path: str + size: int + mimetype: str + modified_on: datetime.datetime + etag: str | None + + +class StorageInterface(ABC): + + @abstractproperty + def storage(self): ... + + @abstractproperty + def public_url(self) -> str: ... + + @abstractmethod + def has(self, path: str) -> bool: ... + + @abstractmethod + def upload_fileobj(self, fileobj: BinaryIO, path: str): ... + + @abstractmethod + def set_autodelete_on(self, path: str, on: datetime.datetime | None): ... + + @abstractmethod + def upload_file(self, fpath: Path, path: str): ... + + @abstractmethod + def delete(self, path: str): ... + + @abstractmethod + def list(self, prefix: str) -> Generator[StorageEntry, None, None]: ... + + @abstractmethod + def mkdir(self, path: str, *, parents: bool = True, exists_ok: bool = True): ... + + @abstractmethod + def get_file_path(self, file: File) -> str: ... + + @abstractmethod + def get_companion_file_path( + self, project: Project, file_hash: str, suffix: str + ) -> str: ... 
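
The abstract interface above is the only surface the rest of the backend touches; the `get_storage()` factory defined just below selects the concrete backend from `constants.storage_type` and exposes it as the module-level `storage` singleton. As an illustration only (not part of this patch), adding another backend amounts to subclassing `StorageInterface`; a minimal sketch of a hypothetical local-disk backend, assuming the names introduced above:

```python
# Hypothetical sketch; not part of this patch. It assumes the StorageInterface,
# StorageEntry, File and Project names introduced by this commit.
import datetime
import mimetypes
import shutil
from collections.abc import Generator
from pathlib import Path
from typing import BinaryIO

from api.database.models import File, Project
from api.storage import StorageEntry, StorageInterface


class LocalDiskStorage(StorageInterface):
    """Keeps objects under a local directory; handy for development."""

    def __init__(self, root: Path = Path("/tmp/nautilus-storage")) -> None:
        self.root = root
        self.root.mkdir(parents=True, exist_ok=True)

    @property
    def storage(self):
        return self.root

    @property
    def public_url(self) -> str:
        return self.root.as_uri()

    def has(self, path: str) -> bool:
        return (self.root / path).exists()

    def upload_fileobj(self, fileobj: BinaryIO, path: str):
        dest = self.root / path
        dest.parent.mkdir(parents=True, exist_ok=True)
        with open(dest, "wb") as fh:
            shutil.copyfileobj(fileobj, fh)

    def upload_file(self, fpath: Path, path: str):
        dest = self.root / path
        dest.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(fpath, dest)

    def set_autodelete_on(self, path: str, on: datetime.datetime | None):
        ...  # a plain filesystem has no object-expiry feature

    def delete(self, path: str):
        (self.root / path).unlink(missing_ok=True)

    def list(self, prefix: str) -> Generator[StorageEntry, None, None]:
        for fpath in (self.root / prefix).rglob("*"):
            if not fpath.is_file():
                continue
            stat = fpath.stat()
            yield StorageEntry(
                path=str(fpath.relative_to(self.root)),
                size=stat.st_size,
                mimetype=mimetypes.types_map.get(fpath.suffix)
                or "binary/octet-stream",
                modified_on=datetime.datetime.fromtimestamp(
                    stat.st_mtime, tz=datetime.UTC
                ),
                etag=None,
            )

    def mkdir(self, path: str, *, parents: bool = True, exists_ok: bool = True):
        (self.root / path).mkdir(parents=parents, exist_ok=exists_ok)

    def get_file_path(self, file: File) -> str:
        return f"{file.project_id!s}/{file.hash}"

    def get_companion_file_path(
        self, project: Project, file_hash: str, suffix: str
    ) -> str:
        return f"{project.id!s}/{file_hash}_{suffix}"
```
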
+ + +def get_storage() -> StorageInterface: + if constants.storage_type == StorageType.WEBDAV: + from api.storage.webdav import WebDAVStorage + + return WebDAVStorage() + else: + from api.storage.s3 import S3Storage + + return S3Storage() + + +storage = get_storage() diff --git a/backend/api/storage/s3.py b/backend/api/storage/s3.py new file mode 100644 index 00000000..32cbb831 --- /dev/null +++ b/backend/api/storage/s3.py @@ -0,0 +1,109 @@ +import datetime +import hashlib +from collections.abc import Generator +from pathlib import Path +from typing import BinaryIO + +from kiwixstorage import KiwixStorage + +from api.constants import constants, logger +from api.database.models import File, Project +from api.storage import StorageEntry, StorageInterface + + +class S3Storage(StorageInterface): + def __init__(self) -> None: + self._storage = None + + def _setup_s3_and_check_credentials(self, s3_url_with_credentials): + logger.info("testing S3 Storage credentials") + s3_storage = KiwixStorage(s3_url_with_credentials) + + if not s3_storage.check_credentials( + list_buckets=True, bucket=True, write=True, read=True, failsafe=True + ): + logger.error("S3 Storage connection error testing permissions.") + logger.error(f" Server: {s3_storage.url.netloc}") + logger.error(f" Bucket: {s3_storage.bucket_name}") + logger.error(f" Key ID: {s3_storage.params.get('keyid')}") + raise ValueError("Unable to connect to S3 Storage. Check its URL.") + + return s3_storage + + @property + def storage(self): + if not self._storage: + self._storage = self._setup_s3_and_check_credentials( + constants.s3_url_with_credentials + ) + return self._storage + + @property + def public_url(self) -> str: + uri = self.storage.url + query_bucket = uri.query.get("bucketName") + # TODO: not working + uri = ( + uri._replace(username=None) + ._replace(password=None) + ._replace(query=None) + ._replace(params=None) + ) + # we know its wasabi, use subdomain + if query_bucket: + if uri.hostname.endswith("wasabisys.com") and uri.hostname.startswith( + "s3." + ): + uri._replace(hostname=f"{query_bucket}.{uri.hostname}") + else: + uri._replace(path=f"/{query_bucket}{uri.path}") + return uri.geturl() + + def has(self, path: str) -> bool: + return self.storage.has_object(key=path) + + def upload_fileobj(self, fileobj: BinaryIO, path: str): + self.storage.upload_fileobj(fileobj=fileobj, key=path) + + def set_autodelete_on(self, path: str, on: datetime.datetime): + if on is not None: + self.storage.set_object_autodelete_on(key=path, on=on) + + def upload_file(self, fpath: Path, path: str): + self.storage.upload_file(fpath=fpath, key=path) + + def delete(self, path: str): + self.storage.delete_object(key=path) + + def list(self, prefix: str) -> Generator[StorageEntry, None, None]: + for ( + summary + ) in self.storage.resource.Bucket( # pyright: ignore [reportAttributeAccessIssue] + self.storage.bucket_name + ).objects.filter( + Prefix=prefix + ): + yield StorageEntry( + path=summary.key, + size=summary.size, + mimetype="binary/octet-stream", + modified_on=summary.last_modified, + etag=summary.e_tag, + ) + + def mkdir(self, path: str, *, parents: bool = True, exists_ok: bool = True): ... 
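
Both backends expose listing through the same `StorageEntry` generator (the S3 variant above iterates a bucket prefix, while the WebDAV implementation further down walks the remote collection), and the no-op `mkdir` above reflects that S3 has no real directories, only key prefixes. A small consumption sketch; the prefix values in the trailing comment are illustrative assumptions:

```python
# Illustrative only; assumes the module-level `storage` singleton from
# api.storage introduced by this commit.
from api.storage import storage


def total_remote_size(prefix: str) -> int:
    """Sum the sizes of every object stored under the given prefix."""
    total = 0
    for entry in storage.list(prefix=prefix):
        # each StorageEntry carries path, size, mimetype, modified_on and etag
        total += entry.size
    return total


# e.g. total_remote_size(f"{project.id}/") against S3,
# or total_remote_size(project.webdav_path) against WebDAV
```
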
+ + def get_file_path(self, file: File) -> str: + """S3 key for a Project's File""" + digest = hashlib.sha256( + bytes(f"{file.project_id!s}-{file.hash}-{constants.private_salt}", "utf-8") + ).hexdigest() + # using project_id/ pattern to ease browsing bucket for objects + return f"{file.project_id!s}/{digest}" + + def get_companion_file_path( + self, project: Project, file_hash: str, suffix: str + ) -> str: + """S3 key for a Project's companion file (not a File)""" + # using project_id/ pattern to ease browsing bucket for objects + return f"{project.id!s}/{file_hash}_{suffix}" diff --git a/backend/api/storage/webdav.py b/backend/api/storage/webdav.py new file mode 100644 index 00000000..e4b46ae3 --- /dev/null +++ b/backend/api/storage/webdav.py @@ -0,0 +1,159 @@ +import datetime +import mimetypes +from collections.abc import Generator +from pathlib import Path +from typing import BinaryIO +from urllib.parse import ParseResult, urlparse + +from webdav4.client import Client as DAVClient +from webdav4.client import ResourceAlreadyExists + +from api.constants import constants, logger +from api.database.models import File, Project +from api.database.utils import get_project_by_id +from api.storage import StorageEntry, StorageInterface + + +class WebDAVUrl: + _uri: ParseResult + + def __init__(self, webdav_url_with_credentials: str): + self._uri = urlparse(webdav_url_with_credentials) + + @property + def public_url(self) -> str: + port_suffix = f":{self._uri.port}" if self._uri.port else "" + return self._uri._replace(netloc=f"{self._uri.hostname}{port_suffix}").geturl() + + @property + def auth(self) -> tuple[str, str] | None: + return ( + None + if not self._uri.username + else (self._uri.username or "", self._uri.password or "") + ) + + @property + def path(self) -> str: + return self._uri.path + + +class WebDAVStorage(StorageInterface): + def __init__(self) -> None: + self._storage = None + + def _setup_webdav_and_check_credentials(self, webdav_url_with_credentials): + logger.info("testing WebDAV credentials") + try: + public_url, auth = explode_webdav_credentials(webdav_url_with_credentials) + + dav_url = WebDAVUrl(webdav_url_with_credentials) + client = DAVClient( + dav_url.public_url, + auth=dav_url.auth, + timeout=constants.webdav_request_timeout_sec, + ) + client.ls(dav_url.path) + except Exception as exc: + logger.error(f"WebDAV error: {exc!s}") + raise ValueError("Unable to connect to WebDAV. 
Check its URL.") from exc + + return client + + @property + def storage(self): + if not self._storage: + self._storage = self._setup_webdav_and_check_credentials( + constants.webdav_url_with_credentials + ) + return self._storage + + @property + def public_url(self) -> str: + return str(self.storage.base_url) + + def has(self, path: str) -> bool: + return self.storage.exists(path) + + def upload_fileobj(self, fileobj: BinaryIO, path: str): + self.storage.upload_fileobj(file_obj=fileobj, to_path=path, overwrite=True) + + def set_autodelete_on( + self, path: str, on: datetime.datetime | None # noqa: ARG002 + ): + logger.warning( + f"requested autodelete for {path} while storage doesnt support it" + ) + return + + def upload_file(self, fpath: Path, path: str): + self.storage.upload_file(from_path=fpath, to_path=path, overwrite=True) + + def delete(self, path: str): + self.storage.remove(path) + + def list(self, prefix: str) -> Generator[StorageEntry, None, None]: + for entry in self.storage.ls(prefix, detail=True, allow_listing_resource=True): + # we should not get there (detail=True) but type checker doesnt know + if not isinstance(entry, dict): + continue + if entry["type"] == "directory": + yield from self.list(entry["name"]) + # we only want files + if entry["type"] != "file": + continue + if not isinstance(entry, dict): + continue + fpath = Path(entry["name"]) + if "__MACOSX" in fpath.parts: + continue + if "DS_Store" in fpath.parts: + continue + yield StorageEntry( + path=entry["name"], + size=entry["content_length"], + mimetype=entry["content_type"] + or mimetypes.types_map.get(fpath.suffix) + or "binary/octet-stream", + modified_on=entry["modified"], + etag=entry["etag"], + ) + + def mkdir(self, path: str, *, parents: bool = True, exists_ok: bool = True): + ppath = Path(path) + if parents: + for parent in list(reversed(ppath.parents))[1:]: + try: + self.storage.mkdir(path=str(parent)) + except ResourceAlreadyExists: + ... + try: + self.storage.mkdir(path) + except ResourceAlreadyExists as exc: + if not exists_ok: + raise exc + + def get_file_path(self, file: File) -> str: + """WebDAV path for a Project's File""" + project = get_project_by_id(file.project_id) + if project.webdav_path is None: + raise ValueError("project unconfigured: no webdav_path") + return f"{project.webdav_path}/{file.filename}" + + def get_companion_file_path( + self, project: Project, file_hash: str, suffix: str + ) -> str: + """S3 key for a Project's companion file (not a File)""" + # using project_id/ pattern to ease browsing bucket for objects + if project.webdav_path is None: + raise ValueError("project unconfigured: no webdav_path") + return f"{project.webdav_path!s}/{file_hash}_{suffix}" + + +def explode_webdav_credentials(url: str) -> tuple[str, tuple[str, str] | None]: + """ """ + uri = urlparse(url) + port_suffix = f":{uri.port}" if uri.port else "" + auth = None if not uri.username else (uri.username or "", uri.password or "") + public = uri._replace(netloc=f"{uri.hostname}{port_suffix}").geturl() + return public, auth diff --git a/backend/migrations/versions/b9fb42d00930_webdav_path.py b/backend/migrations/versions/b9fb42d00930_webdav_path.py new file mode 100644 index 00000000..07bf3716 --- /dev/null +++ b/backend/migrations/versions/b9fb42d00930_webdav_path.py @@ -0,0 +1,28 @@ +"""unnamed + +Revision ID: b9fb42d00930 +Revises: be9763d49e5f +Create Date: 2024-09-23 13:50:21.160514 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "b9fb42d00930" +down_revision = "be9763d49e5f" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column("project", sa.Column("webdav_path", sa.String(), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column("project", "webdav_path") + # ### end Alembic commands ### diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py index 724604ff..2c8154a0 100644 --- a/backend/tests/conftest.py +++ b/backend/tests/conftest.py @@ -13,6 +13,7 @@ from httpx import AsyncClient from starlette.testclient import TestClient +from api import storage from api.database import Session from api.database.models import ( Archive, @@ -24,7 +25,6 @@ ) from api.entrypoint import app from api.files import save_file -from api.s3 import s3_storage pytestmark = pytest.mark.asyncio(scope="package") @@ -168,6 +168,7 @@ def project_id(test_project_name, user_id): name=test_project_name, created_on=now, expire_on=None, + webdav_path=None, files=[], archives=[], ) @@ -193,6 +194,7 @@ def expiring_project_id(test_expiring_project_name, user_id): name=test_expiring_project_name, created_on=now, expire_on=now + datetime.timedelta(minutes=30), + webdav_path=None, files=[], archives=[], ) @@ -295,22 +297,19 @@ def upload_file(*args, **kwargs): ... def upload_fileobj(*args, **kwargs): ... - def set_object_autodelete_on(*args, **kwargs): ... + def set_autodelete_on(*args, **kwargs): ... - def has_object(*args, **kwargs): + def has(*args, **kwargs): return True - def check_credentials(*args, **kwargs): - return True - - def delete_object(*args, **kwargs): ... + def delete(*args, **kwargs): ... 
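
The `SuccessStorage` stub above now mirrors the `StorageInterface` surface (`has`, `delete`, `set_autodelete_on`, ...) rather than the old KiwixStorage method names, and the fixture below swaps it in for the module-level singleton. A hedged sketch of an additional test that could sit alongside it (not part of this patch), checking that S3 paths stay salted and deterministic:

```python
# Hypothetical test sketch; not part of this patch. It assumes the S3Storage
# helper from backend/api/storage/s3.py and an importable test configuration.
import hashlib
from types import SimpleNamespace
from uuid import uuid4

from api.constants import constants
from api.storage.s3 import S3Storage


def test_s3_file_path_is_salted_and_deterministic():
    # S3Storage() is cheap to construct: credentials are only checked lazily,
    # the first time the `.storage` property is accessed.
    fake_file = SimpleNamespace(project_id=uuid4(), hash="abcd" * 16)
    path = S3Storage().get_file_path(file=fake_file)  # type: ignore[arg-type]
    expected = hashlib.sha256(
        f"{fake_file.project_id!s}-{fake_file.hash}-{constants.private_salt}".encode()
    ).hexdigest()
    assert path == f"{fake_file.project_id!s}/{expected}"
    # calling it again yields the same unguessable key
    assert S3Storage().get_file_path(file=fake_file) == path  # type: ignore[arg-type]
```
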
@pytest.fixture -def successful_s3_upload_file(monkeypatch): +def successful_storage_upload_file(monkeypatch): """Requests.get() mocked to return {'mock_key':'mock_response'}.""" - monkeypatch.setattr(s3_storage, "_storage", SuccessStorage()) + monkeypatch.setattr(storage, "storage", SuccessStorage()) yield True diff --git a/backend/tests/routes/test_archives.py b/backend/tests/routes/test_archives.py index 984e5c40..becb8dd5 100644 --- a/backend/tests/routes/test_archives.py +++ b/backend/tests/routes/test_archives.py @@ -237,7 +237,7 @@ async def test_request_archive_ready( project_id, expiring_project_id, expiring_archive_id, - successful_s3_upload_file, + successful_storage_upload_file, successful_zimfarm_request_task, ): diff --git a/dev/reload-compose.yaml b/dev/reload-compose.yaml index ab43a846..0e58c4da 100644 --- a/dev/reload-compose.yaml +++ b/dev/reload-compose.yaml @@ -31,7 +31,7 @@ services: retries: 10 worker: build: - dockerfile: ../dev/backend-tools-tests/Dockerfile + dockerfile: ../backend/Dockerfile context: ../backend volumes: - nautilus-reload-storage:/storage @@ -41,6 +41,7 @@ services: - POSTGRES_URI=${POSTGRES_URI} - REDIS_URI=${REDIS_URI} - S3_URL_WITH_CREDENTIALS=${S3_URL_WITH_CREDENTIALS} + - WEBDAV_URL_WITH_CREDENTIALS=${WEBDAV_URL_WITH_CREDENTIALS} - PRIVATE_SALT=secrectkey - TRANSIENT_STORAGE_PATH=/storage - CHANNEL_NAME=${CHANNEL_NAME} @@ -70,7 +71,7 @@ services: - backend backend: build: - dockerfile: ../dev/backend-tools-tests/Dockerfile + dockerfile: ../backend/Dockerfile context: ../backend volumes: - nautilus-reload-storage:/storage @@ -83,6 +84,7 @@ services: - POSTGRES_URI=${POSTGRES_URI} - REDIS_URI=${REDIS_URI} - S3_URL_WITH_CREDENTIALS=${S3_URL_WITH_CREDENTIALS} + - WEBDAV_URL_WITH_CREDENTIALS=${WEBDAV_URL_WITH_CREDENTIALS} - PRIVATE_SALT=secrectkey - TRANSIENT_STORAGE_PATH=/storage - CHANNEL_NAME=${CHANNEL_NAME} diff --git a/frontend/Dockerfile b/frontend/Dockerfile index f0ccfca7..d005642d 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -16,6 +16,7 @@ RUN ls /app ENV NAUTILUS_WEB_API http://localhost:8080/v1 ENV NAUTILUS_IS_SINGLE_USER "" +ENV NAUTILUS_STORAGE_URL "unknown" EXPOSE 80 ENTRYPOINT [ "/app/entrypoint.sh" ] diff --git a/frontend/src/components/FileTableRowComponent.vue b/frontend/src/components/FileTableRowComponent.vue index 4b606ea4..d9b96a14 100644 --- a/frontend/src/components/FileTableRowComponent.vue +++ b/frontend/src/components/FileTableRowComponent.vue @@ -34,7 +34,7 @@ diff --git a/frontend/src/components/ProjectColumnComponent.vue b/frontend/src/components/ProjectColumnComponent.vue index 73842554..b883b589 100644 --- a/frontend/src/components/ProjectColumnComponent.vue +++ b/frontend/src/components/ProjectColumnComponent.vue @@ -94,7 +94,7 @@ async function updateProjectName(projectId: string, newName: string) { try { await storeApp.axiosInstance.patch(`/projects/${projectId}`, projectRequestData) } catch (error: any) { - console.log('Unable to update project name.', error, projectId) + console.error('Unable to update project name.', error, projectId) storeApp.alertsError(`Unable to update project name, project id: ${projectId}`) editingProjectName.value = projectName.value } diff --git a/frontend/src/components/ReloadWebDAVPathComponent.vue b/frontend/src/components/ReloadWebDAVPathComponent.vue new file mode 100644 index 00000000..5be3ade0 --- /dev/null +++ b/frontend/src/components/ReloadWebDAVPathComponent.vue @@ -0,0 +1,48 @@ + + + + + diff --git a/frontend/src/components/SaveWebDAVPathComponent.vue 
b/frontend/src/components/SaveWebDAVPathComponent.vue new file mode 100644 index 00000000..33c8cbd3 --- /dev/null +++ b/frontend/src/components/SaveWebDAVPathComponent.vue @@ -0,0 +1,55 @@ + + + + + diff --git a/frontend/src/constants.ts b/frontend/src/constants.ts index adec04ef..52fa654c 100644 --- a/frontend/src/constants.ts +++ b/frontend/src/constants.ts @@ -13,6 +13,7 @@ export interface Project { name: string created_on: string expire_on?: string + webdav_path?: string } export interface Archive { @@ -116,7 +117,7 @@ export class NautilusFile implements File { } get isEditable(): boolean { - return this.status == FileStatus.S3 + return this.status == FileStatus.STORAGE } } @@ -124,7 +125,7 @@ export enum FileStatus { UPLOADING = 'UPLOADING', PROCESSING = 'PROCESSING', LOCAL = 'LOCAL', - S3 = 'S3', + STORAGE = 'STORAGE', FAILURE = 'FAILURE' } @@ -134,6 +135,7 @@ export interface Environ { NAUTILUS_PROJECT_QUOTA: number NAUTILUS_FILE_REFRESH_EVERY_MS: number NAUTILUS_IS_SINGLE_USER: boolean + NAUTILUS_STORAGE_URL: string } export interface AlertMessage { @@ -167,7 +169,8 @@ export const EmptyConstants = new Constants({ NAUTILUS_FILE_QUOTA: 100000000, NAUTILUS_PROJECT_QUOTA: 100000000, NAUTILUS_FILE_REFRESH_EVERY_MS: 1000, - NAUTILUS_IS_SINGLE_USER: false + NAUTILUS_IS_SINGLE_USER: false, + NAUTILUS_STORAGE_URL: 'notset', }) // using iec to be consistent accross tools (MiB): jedec renders MiB as MB diff --git a/frontend/src/stores/stores.ts b/frontend/src/stores/stores.ts index 12fc2d27..5d55fc72 100644 --- a/frontend/src/stores/stores.ts +++ b/frontend/src/stores/stores.ts @@ -24,15 +24,25 @@ export const useProjectStore = defineStore( function setProjects(newIds: Array) { projects.value = newIds if (lastProjectId.value) { - lastProject.value = - projects.value.filter((project) => project.id == lastProjectId.value).at(0) || null + setLastProjectId(lastProjectId.value) } } function setLastProjectId(newId: string) { + console.debug(`Switching to project #${newId} from`, lastProject.value ? 
lastProject.value.id : null) lastProjectId.value = newId - lastProject.value = - projects.value.filter((project) => project.id == lastProjectId.value).at(0) || null + lastProject.value = projects.value.filter((project) => project.id == lastProjectId.value).at(0) || null + } + + function replaceProject(project: Project) { + for (let idx: int = 0; idx <= projects.value.length; idx++) { + if (projects.value[idx].id == project.id) { + projects.value[idx] = project + return + } + } + projects.value.push(project) + setLastProjectId(project.id) } function clearLastProjectId() { @@ -58,6 +68,7 @@ export const useProjectStore = defineStore( lastProjectArchives, lastProjectPendingArchive, setLastProjectId, + replaceProject, clearLastProjectId, setProjects, setLastProjectArchives, @@ -106,6 +117,18 @@ export const useAppStore = defineStore('app', () => { console.log('Unable to retrieve the environ.json file', error) alertsError('Unable to retrieve the environ.json file') } + + try { + await axiosInstance.value.get(`/config`) + .then((response) => { + constants.value.env.NAUTILUS_STORAGE_URL = response.data.NAUTILUS_STORAGE_URL + }) + .catch((error) => { + console.error(`Unable to get STORAGE URL`, error) + }) + } catch (error: unknown) { + console.log('Error retrieving storage URL', error) + } } return { diff --git a/frontend/src/utils.ts b/frontend/src/utils.ts index 57d3198f..631a24d8 100644 --- a/frontend/src/utils.ts +++ b/frontend/src/utils.ts @@ -77,3 +77,21 @@ export async function refreshArchives() { storeApp.alertsError('Unable to retrieve archives info') } } + + +export async function set_project_webdav_path(webdav_path: string) { + console.debug(`Setting WebDAV path to ${webdav_path}`) + const storeProject = useProjectStore() + const storeApp = useAppStore() + const requestData = {'webdav_path': webdav_path} + storeApp.axiosInstance + .post(`/projects/${storeProject.lastProjectId}.dav`, requestData) + .then((response) => { + storeProject.replaceProject(response.data) + storeApp.alertsSuccess(`Successfuly retrieved WebDAV info.`) + }) + .catch((error) => { + console.error(error) + storeApp.alertsError(`Unable to get/set WebDAV info`) + }) +} diff --git a/frontend/src/views/ProjectView.vue b/frontend/src/views/ProjectView.vue index 604bae14..38080a92 100644 --- a/frontend/src/views/ProjectView.vue +++ b/frontend/src/views/ProjectView.vue @@ -1,7 +1,10 @@