Skip to content

Commit

Permalink
Add "data" filesystem
Browse files Browse the repository at this point in the history
  • Loading branch information
martindurant committed Nov 8, 2023
1 parent 0883bf5 commit 1017e12
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 1 deletion.
4 changes: 4 additions & 0 deletions docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ Built-in Implementations
fsspec.implementations.cached.SimpleCacheFileSystem
fsspec.implementations.cached.WholeFileCacheFileSystem
fsspec.implementations.dask.DaskWorkerFileSystem
fsspec.implementations.data.DataFileSystem
fsspec.implementations.dbfs.DatabricksFileSystem
fsspec.implementations.dirfs.DirFileSystem
fsspec.implementations.ftp.FTPFileSystem
Expand Down Expand Up @@ -149,6 +150,9 @@ Built-in Implementations
.. autoclass:: fsspec.implementations.dask.DaskWorkerFileSystem
:members: __init__

.. autoclass:: fsspec.implementations.data.DataFileSystem
:members: __init__

.. autoclass:: fsspec.implementations.dbfs.DatabricksFileSystem
:members: __init__

Expand Down
2 changes: 2 additions & 0 deletions fsspec/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,8 @@ def split_protocol(urlpath):
if len(protocol) > 1:
# excludes Windows paths
return protocol, path
if ":" in urlpath and urlpath.find(":") > 1:
return urlpath.split(":", 1)
return None, urlpath


Expand Down
38 changes: 38 additions & 0 deletions fsspec/implementations/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import base64
import io
from urllib.parse import unquote

from fsspec import AbstractFileSystem


class DataFileSystem(AbstractFileSystem):
    """A handy read-only decoder for data-URLs.

    The path handed to this filesystem is the data-URL itself. Everything
    after the first comma is the payload: it is base64-decoded when the part
    before the comma ends in ``base64`` and percent-decoded otherwise.

    Example
    -------
    >>> with fsspec.open("data:,Hello%2C%20World%21") as f:
    ...     print(f.read())
    b'Hello, World!'
    """

    protocol = "data"

    def cat_file(self, path, start=None, end=None, **kwargs):
        """Return the decoded payload of the data-URL ``path`` as bytes.

        Parameters
        ----------
        path: str
            The data-URL; must contain a comma separating the prefix
            (media type / encoding marker) from the payload.
        start, end: int or None
            Optional slice bounds applied to the decoded bytes.

        Raises
        ------
        ValueError
            If ``path`` contains no comma and so has no payload part.
        """
        # Imported locally so this block does not rely on a module-level
        # import that the original code did not have.
        from urllib.parse import unquote_to_bytes

        pref, sep, data = path.partition(",")
        if not sep:
            raise ValueError(f"Not a valid data-URL (no ',' separator): {path!r}")
        if pref.endswith("base64"):
            return base64.b64decode(data)[start:end]
        # Decode percent-escapes straight to bytes: going through str first
        # (unquote(...).encode()) replaces byte sequences that are not valid
        # UTF-8, e.g. "%FF", with U+FFFD and so corrupts binary payloads.
        return unquote_to_bytes(data)[start:end]

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Return an in-memory binary file holding the decoded payload.

        Only read modes are accepted; any mode without ``"r"`` raises
        ``ValueError``. The remaining arguments are accepted but not used.
        """
        if "r" not in mode:
            raise ValueError("Read only filesystem")
        return io.BytesIO(self.cat_file(path))
4 changes: 3 additions & 1 deletion fsspec/implementations/reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ def open_refs(field, record):
@staticmethod
def create(record_size, root, fs, **kwargs):
    """Initialise a reference store under ``root`` and return a mapper for it.

    Ensures the ``root`` directory exists on ``fs``, writes a ``.zmetadata``
    file containing empty metadata plus the given ``record_size``, and then
    builds a ``LazyReferenceMapper`` on that location. Extra keyword
    arguments are forwarded to the mapper constructor.
    """
    fs.makedirs(root, exist_ok=True)
    initial_meta = {"metadata": {}, "record_size": record_size}
    zmetadata_path = "/".join((root, ".zmetadata"))
    payload = json.dumps(initial_meta).encode()
    fs.pipe(zmetadata_path, payload)
    return LazyReferenceMapper(root, fs, **kwargs)

Expand Down Expand Up @@ -292,7 +293,7 @@ def _get_chunk_sizes(self, field):
def _generate_record(self, field, record):
"""The references for a given parquet file of a given field"""
refs = self.open_refs(field, record)
it = iter(zip(refs.values()))
it = iter(zip(*refs.values()))
if len(refs) == 3:
# All urls
return (list(t) for t in it)
Expand Down Expand Up @@ -650,6 +651,7 @@ def __init__(
self.fss[protocol] = fs
if remote_protocol is None:
# get single protocol from references
# TODO: warning here, since this can be very expensive?
for ref in self.references.values():
if callable(ref):
ref = ref()
Expand Down
9 changes: 9 additions & 0 deletions fsspec/implementations/tests/.benchmarks/test_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import fsspec


def test_1():
    """Base64 and percent-encoded data-URLs both decode to the same bytes."""
    expected = b"Hello, World!"
    urls = (
        "data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==",
        "data:,Hello%2C%20World%21",
    )
    for url in urls:
        with fsspec.open(url) as f:
            assert f.read() == expected
1 change: 1 addition & 0 deletions fsspec/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def register_implementation(name, cls, clobber=False, errtxt=None):
# protocols mapped to the class which implements them. This dict can be
# updated with register_implementation
known_implementations = {
"data": {"class": "fsspec.implementations.data.DataFileSystem"},
"file": {"class": "fsspec.implementations.local.LocalFileSystem"},
"local": {"class": "fsspec.implementations.local.LocalFileSystem"},
"memory": {"class": "fsspec.implementations.memory.MemoryFileSystem"},
Expand Down

0 comments on commit 1017e12

Please sign in to comment.