Skip to content

Commit

Permalink
Test that verifies we can read repos generated with prev versions
Browse files Browse the repository at this point in the history
  • Loading branch information
paraseba committed Oct 11, 2024
1 parent 610dc9a commit ed5fcba
Show file tree
Hide file tree
Showing 25 changed files with 174 additions and 0 deletions.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"snapshot":"8AEWDWJRTMECASF516SG"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"snapshot":"3KP6E7F3C2PE2HNGCNM0"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"snapshot":"C1ZKMGE3ESPJ24YKN9MG"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"snapshot":"H01K0XJPGVW4HFX470AG"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"snapshot":"AG1HZQ5SWS8DM8DNC670"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"snapshot":"VNKSCC59M58V0MSJ01RG"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"snapshot":"8AEWDWJRTMECASF516SG"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"snapshot":"AG1HZQ5SWS8DM8DNC670"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"snapshot":"VNKSCC59M58V0MSJ01RG"}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
165 changes: 165 additions & 0 deletions icechunk-python/tests/test_can_read_old.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
"""This test reads a repository generated with an older version of icechunk.
In this way, we check that we maintain read compatibility. The repository lives
in the git repository, as a filesystem store, in the directory icechunk-python/tests/data/test-repo.
If something changes in the on-disk format, we probably won't be able to read the repo, the
test will fail, and we can avoid breaking user data.
When new features that impact the stored info are added, or when the on-disk format is
intentionally changed, the repository files must be regenerated. For that, run the current
file as a python script: `python ./tests/test_can_read_old.py`.
"""

import icechunk as ic
import zarr
import pytest
from object_store import ClientOptions, ObjectStore
from numpy.testing import assert_array_equal


def write_chunks_to_minio(chunks: list[tuple[str, bytes]]):
    """Upload each ``(key, data)`` pair to the ``testbucket`` bucket on localhost MinIO.

    The bucket is reached over plain HTTP at ``http://localhost:9000`` with the
    hard-coded ``minio123`` test credentials, so a MinIO instance must be
    listening there for this call to succeed.
    """
    # The endpoint is http://, so plain HTTP has to be explicitly allowed.
    http_options = ClientOptions(
        allow_http=True,  # type: ignore
    )
    minio_settings = {
        "access_key_id": "minio123",
        "secret_access_key": "minio123",
        "aws_region": "us-east-1",
        "aws_endpoint": "http://localhost:9000",
    }
    bucket = ObjectStore(
        "s3://testbucket",
        minio_settings,
        client_options=http_options,
    )

    for chunk_key, payload in chunks:
        bucket.put(chunk_key, payload)


async def mk_store(mode):
    """Open the store at ./tests/data/test-repo with access to virtual chunks in localhost MinIO.

    ``mode`` is passed unchanged to ``ic.IcechunkStore.open``; this file calls it
    with ``"w"`` to generate the repository and ``"r"`` to read it back.
    Returns the opened store.
    """
    repo_storage = ic.StorageConfig.filesystem("./tests/data/test-repo")

    # Virtual chunk references are resolved against the local MinIO endpoint.
    minio_credentials = ic.S3Credentials(
        access_key_id="minio123",
        secret_access_key="minio123",
    )
    virtual_refs = ic.VirtualRefConfig.s3_from_config(
        credentials=minio_credentials,
        endpoint_url="http://localhost:9000",
        allow_http=True,
        region="us-east-1",
    )
    store_config = ic.StoreConfig(
        inline_chunk_threshold_bytes=10,
        virtual_ref_config=virtual_refs,
    )

    return await ic.IcechunkStore.open(
        storage=repo_storage,
        config=store_config,
        mode=mode,
    )

async def write_a_test_repo():
    """Generate the on-disk test repository.

    The function deliberately touches as many icechunk features as it can, so
    that the repository on disk is as rich as possible: several commits, a
    group hierarchy, virtual, inline and materialized chunks, a branch and tags.
    PLEASE: keep adding more actions to this function as we add more features to Icechunk.
    """

    print("Writing repository to ./tests/data/test-repo")
    store = await mk_store("w")

    def group_attrs():
        # A fresh dict per call so no two nodes share the same attributes object.
        return {"this": "is a nice group", "icechunk": 1, "size": 42.0}

    def array_attrs():
        return {"this": "is a nice array", "icechunk": 1, "size": 42.0}

    root = zarr.group(store=store)
    group1 = root.create_group("group1", attributes=group_attrs())

    # these chunks will be materialized
    big_chunks = group1.create_array(
        "big_chunks",
        shape=(10, 10),
        chunk_shape=(5, 5),
        dtype="float32",
        fill_value=float("nan"),
        attributes=array_attrs(),
    )

    # these chunks will be inline
    small_chunks = group1.create_array(
        "small_chunks",
        shape=5,
        chunk_shape=1,
        dtype="int8",
        fill_value=8,
        attributes=array_attrs(),
    )
    await store.commit("empty structure")

    big_chunks[:] = 42.0
    small_chunks[:] = 84
    await store.commit("fill data")

    # Replace chunk (0, 0) of big_chunks with a virtual reference; the length
    # is one 5x5 chunk of float32 values (4 bytes each).
    await store.set_virtual_ref(
        "group1/big_chunks/c/0/0",
        "s3://testbucket/path/to/python/chunk-1",
        offset=0,
        length=5 * 5 * 4,
    )
    await store.commit("set virtual chunk")

    await store.new_branch("my-branch")
    await store.delete("group1/small_chunks/c/4")
    delete_snapshot = await store.commit("delete a chunk")

    await store.tag("it works!", delete_snapshot)

    # Build the nested chain group2/group3/group4/group5, one level at a time.
    innermost = root
    for group_name in ("group2", "group3", "group4", "group5"):
        innermost = innermost.create_group(group_name, attributes=group_attrs())
    innermost.create_array(
        "inner",
        shape=(10, 10),
        chunk_shape=(5, 5),
        dtype="float32",
        fill_value=float("nan"),
        attributes=array_attrs(),
    )
    structure_snapshot = await store.commit("some more structure")
    await store.tag("it also works!", structure_snapshot)

    store.close()

async def test_icechunk_can_read_old_repo():
    """Read the checked-in repository and verify its history, hierarchy and array contents.

    Reading ``big_chunks`` goes through a virtual chunk, so the test first
    uploads the referenced object to the local MinIO instance.
    """

    async def commit_messages(opened_store):
        # Commit messages as yielded by ancestry() for the current checkout.
        return [snapshot.message async for snapshot in opened_store.ancestry()]

    store = await mk_store("r")

    main_history = [
        "set virtual chunk",
        "fill data",
        "empty structure",
        "Repository initialized",
    ]
    assert await commit_messages(store) == main_history

    branch_history = ["some more structure", "delete a chunk"] + main_history
    await store.checkout(branch="my-branch")
    assert await commit_messages(store) == branch_history

    await store.checkout(tag="it also works!")
    assert await commit_messages(store) == branch_history

    # This tag points at the "delete a chunk" commit, one step behind the branch tip.
    await store.checkout(tag="it works!")
    assert await commit_messages(store) == branch_history[1:]

    store = await mk_store("r")
    await store.checkout(branch="my-branch")

    expected_listings = {
        "": ["group1", "group2", "zarr.json"],
        "group1": ["big_chunks", "small_chunks", "zarr.json"],
        "group2": ["group3", "zarr.json"],
        "group2/group3": ["group4", "zarr.json"],
        "group2/group3/group4": ["group5", "zarr.json"],
        "group2/group3/group4/group5": ["inner", "zarr.json"],
        "group2/group3/group4/group5/inner": ["zarr.json"],
    }
    for directory, expected_entries in expected_listings.items():
        found_entries = sorted([entry async for entry in store.list_dir(directory)])
        assert found_entries == expected_entries

    root = zarr.group(store=store)

    # inner was never written to, so every element is the fill value
    inner = root["group2/group3/group4/group5/inner"]
    assert_array_equal(inner[:], float("nan"))

    # 5 elements; the last chunk (of size 1) was deleted, and the fill value is 8
    small_chunks = root["group1/small_chunks"]
    assert_array_equal(small_chunks[:], [84, 84, 84, 84, 8])

    # big_chunks has a virtual chunk, so its bytes must exist in local MinIO.
    # We take the bytes from one of the materialized chunks.
    prototype = zarr.core.buffer.default_buffer_prototype()
    materialized = await store.get("group1/big_chunks/c/0/1", prototype=prototype)
    chunk_bytes = materialized.to_bytes()

    # the virtual chunk of big_chunks points at this key
    write_chunks_to_minio([("path/to/python/chunk-1", chunk_bytes)])

    big_chunks = root["group1/big_chunks"]
    assert_array_equal(big_chunks[:], 42.0)


if __name__ == "__main__":
    # Running this file as a script regenerates the repository under
    # ./tests/data/test-repo instead of running the read test.
    import asyncio

    regeneration = write_a_test_repo()
    asyncio.run(regeneration)



0 comments on commit ed5fcba

Please sign in to comment.