Skip to content

Commit

Permalink
chore: next try
Browse files Browse the repository at this point in the history
  • Loading branch information
phil65 committed Nov 30, 2024
1 parent 34e1183 commit 11d9ca6
Showing 1 changed file with 41 additions and 23 deletions.
64 changes: 41 additions & 23 deletions src/llmling/resources/loaders/path.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import re
from typing import TYPE_CHECKING, ClassVar
import urllib.parse

from upath import UPath

Expand Down Expand Up @@ -38,34 +38,49 @@ def supports_uri(cls, uri: str) -> bool:
else:
return True

@staticmethod
def _normalize_path_components(parts: list[str]) -> list[str]:
"""Normalize path components, resolving . and .. entries."""
result: list[str] = []
for part in parts:
match part:
case "." | "":
continue
case "..":
if result:
result.pop()
case _:
if not PathResourceLoader._is_ignorable_part(part):
result.append(part)
return result

@classmethod
def get_name_from_uri(cls, uri: str) -> str:
"""Extract the normalized path from a URI."""
try:
if not cls.supports_uri(uri):
msg = f"Unsupported URI: {uri}"
if not uri.startswith("file:///"):
msg = f"Invalid file URI format: {uri}"
raise exceptions.LoaderError(msg) # noqa: TRY301

path = UPath(uri)
# Remove the file:/// prefix
path = uri[8:]

# Get parts excluding protocol info
parts = [part for part in path.parts if not cls._is_ignorable_part(str(part))]
# Split into components and decode, including empty parts
parts = [urllib.parse.unquote(part) for part in path.split("/")]

if not parts:
# Remove empty parts and normalize
normalized_parts = cls._normalize_path_components(parts)
if not normalized_parts:
msg = "Empty path after normalization"
raise exceptions.LoaderError(msg) # noqa: TRY301

# Validate path components
for part in parts:
if cls.invalid_chars_pattern.search(str(part)):
# Validate normalized components
for part in normalized_parts:
if cls.invalid_chars_pattern.search(part):
msg = f"Invalid characters in path component: {part}"
raise exceptions.LoaderError(msg) # noqa: TRY301

# Join with forward slashes and normalize consecutive slashes
joined = "/".join(str(p) for p in parts)
return re.sub(
r"/+", "/", joined
) # Replace multiple slashes with single slash
return "/".join(normalized_parts)

except Exception as exc:
if isinstance(exc, exceptions.LoaderError):
Expand All @@ -75,22 +90,25 @@ def get_name_from_uri(cls, uri: str) -> str:

@classmethod
def create_uri(cls, *, name: str) -> str:
"""Create a URI from a path or URL."""
"""Create a URI from a path."""
try:
# Validate path
if cls.invalid_chars_pattern.search(name):
msg = f"Invalid characters in path: {name}"
raise exceptions.LoaderError(msg) # noqa: TRY301

# Use UPath for handling
path = UPath(name)
# Normalize path separators and split
parts = name.replace("\\", "/").split("/")

# If it already has a protocol, use it as-is
if path.protocol:
return str(path)
# Normalize components
normalized_parts = cls._normalize_path_components(parts)
if not normalized_parts:
msg = "Empty path"
raise exceptions.LoaderError(msg) # noqa: TRY301

# Otherwise, treat as local file
return f"file:///{str(path).lstrip('/')}"
# Encode components and create URI
encoded_parts = [urllib.parse.quote(part) for part in normalized_parts]
return f"file:///{'/'.join(encoded_parts)}"

except Exception as exc:
if isinstance(exc, exceptions.LoaderError):
Expand Down

0 comments on commit 11d9ca6

Please sign in to comment.