Skip to content

Commit

Permalink
Add rows (#115)
Browse files Browse the repository at this point in the history
## changes

- [x] loc indexer should disallow adding new rows to a DF
- [x] ability to modify existing rows using loc
- [x] from_deeporigin now works with completely empty DBs
- [x] from_deeporigin now works with DBs with no rows (but some columns)
- [x] tests: ability to modify 1 row using .loc
- [x] tests: ability to modify many rows using .loc
- [x] tests: prevention of adding new rows to a DF
- [x] support for appending a square chunk (similar to a DB) directly to
a DB. combination of create new rows + write data
- [x] support for writing 1 new row to a DB with all columns with data
- [x] support for writing many rows + many columns
- [x] documentation explaining how to add new rows to a DB
  • Loading branch information
sg-s authored Nov 22, 2024
1 parent fb3e218 commit b546456
Show file tree
Hide file tree
Showing 11 changed files with 417 additions and 136 deletions.
10 changes: 9 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,18 @@ else
endif
@source $(CURDIR)/venv/bin/activate && \
interrogate -c pyproject.toml -v . -f 100 && \
python3 -m coverage run -m pytest -x -n $(n_workers) --failed-first -k $(chosen_tests) --client $(client) --responses $(responses) && \
python3 -m coverage run --source="src" -m pytest -x -n $(n_workers) --failed-first -k $(chosen_tests) --client $(client) --responses $(responses) && \
python3 -m coverage html && \
deactivate


coverage:
@source $(CURDIR)/venv/bin/activate && \
python3 -m coverage run -m pytest -x --client $(client) && \
python3 -m coverage html && \
open htmlcov/index.html && \
deactivate

# set up jupyter dev kernel
jupyter:
-deactivate
Expand Down
47 changes: 47 additions & 0 deletions docs/how-to/data-hub/add-data.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Add data to a Deep Origin Database

This document describes how to add data to a Deep Origin Database.

Consider the following dataframe constructed from a database using:

```python
from deeporigin.data_hub import api
df = api.get_dataframe("xy")
df
```

![](../../images/df-xy.png)

## Add new rows

To add new rows to the underlying database, use the `add_database_rows` function:

```python
data = dict(X=[1, 2], Y=[2, 3])
api.add_database_rows(database_id="xy", data=data)
```

`data` should be a dictionary where the keys are column names and the values are lists of values to be written to the corresponding columns. `add_database_rows` will add this data to the database, creating as many new rows as needed.

`add_database_rows` returns a list of the row IDs created during this process.


## Add fragments of new rows

Similarly, fragments of rows (subsets of columns) can be written to the database:

```python
data = dict(X=[10, 20]) # note Y is not specified
api.add_database_rows(database_id="xy", data=data)
```

`add_database_rows` returns a list of the row IDs created during this process, for example:

```python
["row-1", "row-2"]
```


## Reference

The reference documentation for [add_database_rows](../../ref/data-hub/high-level-api.md#src.data_hub.api.add_database_rows)
Binary file added docs/images/df-xy.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
3 changes: 2 additions & 1 deletion mkdocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ nav:
- Upload files: how-to/data-hub/upload-files.md
- Download files: how-to/data-hub/download-files.md
- Write data: how-to/data-hub/write-data.md
- Add data to database: how-to/data-hub/add-data.md
- API reference:
- High-level API: ref/data-hub/high-level-api.md
- Low-level API: ref/data-hub/low-level-api.md
Expand Down Expand Up @@ -112,7 +113,7 @@ plugins:
handlers:
python:
paths: ["."]
options:
options:
annotations_path: brief
show_source: false
docstring_options:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ dependencies = [
"tabulate",
"filetype",
"httpx",
"deeporigin-data-sdk==0.1.0a21",
"deeporigin-data-sdk==0.1.0a27",
"humanize",
"packaging",
"diskcache",
Expand Down
194 changes: 162 additions & 32 deletions src/data_hub/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,11 +358,81 @@ def upload_file(
return response.file


@beartype
def add_database_rows(
    *,
    database_id: str,
    data: dict,
    client=None,
    _stash: bool = False,
) -> list[str]:
    """Add new data to a database.

    Use this function to add new rows, or fragments of rows, to a Deep Origin database.

    Args:
        database_id: Human ID or System ID of the database
        data: A dictionary where each key is a column name and each value is a list of values. All values should have the same length. Key names should match column names in the database.
        client: Optional API client to use for requests.
        _stash: Internal flag, passed through to the low-level API.

    Returns:
        A list of row IDs (human IDs) created during this process.

    Raises:
        DeepOriginException: if `data` is empty, if a key does not match a
            column name in the database, or if the value lists have
            unequal lengths.
    """
    # guard: an empty dict would otherwise fail later with an
    # opaque IndexError when reading the first value length
    if not data:
        raise DeepOriginException(
            message="`data` must contain at least one column."
        )

    # fetch the database schema so we can validate the requested columns
    db = _api.describe_database(
        database_id=database_id,
        client=client,
        _stash=_stash,
    )

    col_names = [col.name for col in db.cols]

    for col in data.keys():
        if col not in col_names:
            raise DeepOriginException(
                message=f"Column `{col}` does not exist in database `{database_id}`."
            )

    # every column must supply the same number of values,
    # because each index position maps to one new row
    value_lengths = {len(values) for values in data.values()}

    if len(value_lengths) > 1:
        raise DeepOriginException(
            message="All rows must have the same number of values."
        )

    n_rows = value_lengths.pop()

    # create the new (empty) rows first ...
    response = make_database_rows(
        database_id=database_id,
        n_rows=n_rows,
        client=client,
        _stash=_stash,
    )

    row_ids = [row.id for row in response.rows]
    row_hids = [row.hid for row in response.rows]

    # ... then fill them in, one column at a time
    for col in data.keys():
        set_data_in_cells(
            values=data[col],
            row_ids=row_ids,
            column_id=col,
            database_id=database_id,
            columns=db.cols,
            client=client,
            _stash=_stash,
        )

    return row_hids


@beartype
@ensure_client
def make_database_rows(
database_id: str,
n_rows: int = 1,
*,
client=None,
_stash: bool = False,
) -> dict:
Expand Down Expand Up @@ -1005,6 +1075,23 @@ def get_dataframe(
f"Expected database_id: {database_id} to resolve to a database, but instead, it resolved to a {db_row.type}"
)

# early exit for empty DB
if "cols" not in db_row.keys() or db_row.cols is None:
data = dict()
if return_type == "dataframe":
# this import is here because we don't want to
# import pandas unless we actually use this function
df = _make_deeporigin_dataframe(
data=data,
reference_ids=None,
db_row=db_row,
rows=None,
columns=None,
)
return df
else:
return dict()

columns = db_row.cols
database_id = db_row.id

Expand All @@ -1017,9 +1104,6 @@ def get_dataframe(
reference_ids = []
file_ids = []

if columns is None:
return None

# remove notebook columns because they are not
# shown in the UI as columns
columns = [
Expand All @@ -1033,7 +1117,7 @@ def get_dataframe(
data[column["id"]] = []

for row in rows:
# warning: add_row_to_data mutates file_ids
# warning: add_row_to_data mutates data, file_ids
# and reference_ids
add_row_to_data(
data=data,
Expand Down Expand Up @@ -1082,22 +1166,13 @@ def get_dataframe(
if return_type == "dataframe":
# make the dataframe

# this import is here because we don't want to
# import pandas unless we actually use this function
from deeporigin.data_hub.dataframe import DataFrame

df = DataFrame(data)
df.attrs["reference_ids"] = list(set(reference_ids))
df.attrs["id"] = database_id
df.attrs["metadata"] = dict(db_row)

df = _type_and_cleanup_dataframe(df, columns)

# find last updated row for pretty printing
df.attrs["last_updated_row"] = find_last_updated_row(rows)

df._deep_origin_out_of_sync = False
df._modified_columns = dict()
df = _make_deeporigin_dataframe(
data=data,
reference_ids=reference_ids,
db_row=db_row,
rows=rows,
columns=columns,
)
return df

else:
Expand All @@ -1113,6 +1188,38 @@ def get_dataframe(
return renamed_data


def _make_deeporigin_dataframe(
    *,
    data: dict,
    reference_ids: Optional[list],
    db_row: dict,
    columns: Optional[list],
    rows: Optional[list],
):
    """Construct a Deep Origin DataFrame from raw column data and attach
    database metadata to it via ``attrs``.

    Args:
        data: mapping of column ID to the list of cell values for that column
        reference_ids: reference IDs to record in ``attrs`` (skipped when None)
        db_row: database descriptor; supplies the database ID and metadata
        columns: column descriptors used for typing/cleanup (skipped when None)
        rows: row objects used to determine the most recently updated row

    Returns:
        A DataFrame carrying metadata attrs and sync-tracking fields.
    """
    # deferred import: avoid paying the pandas import cost unless a
    # dataframe is actually constructed
    from deeporigin.data_hub.dataframe import DataFrame

    frame = DataFrame(data)

    if reference_ids is not None:
        frame.attrs["reference_ids"] = list(set(reference_ids))

    frame.attrs["id"] = db_row.id
    frame.attrs["metadata"] = dict(db_row)

    if columns is not None:
        frame = _type_and_cleanup_dataframe(frame, columns)

    # record the last updated row for pretty printing; an empty frame
    # falls back to the database descriptor itself
    frame.attrs["last_updated_row"] = (
        find_last_updated_row(rows) if len(frame) > 0 else db_row
    )

    frame._deep_origin_out_of_sync = False
    frame._modified_columns = dict()
    return frame


@beartype
@ensure_client
def download_files(
Expand Down Expand Up @@ -1173,21 +1280,26 @@ def download_files(
pass


@beartype
def add_row_to_data(
*,
data: dict,
row,
row: dict,
columns: list,
file_ids: list,
reference_ids: list,
):
"""utility function to combine data from a row into a dataframe"""
row_data = _row_to_dict(
row_data = row_to_dict(
row,
file_ids=file_ids,
reference_ids=reference_ids,
)
if row_data is None:
for column in columns:
col_id = column["id"]
data[col_id].append(None)

return

data["ID"].append(row_data["ID"])
Expand All @@ -1204,21 +1316,39 @@ def add_row_to_data(
data[col_id].append(None)


def _row_to_dict(
row,
@beartype
def row_to_dict(
row: dict,
*,
file_ids: list,
reference_ids: list,
):
"""utility function to convert a row to a dictionary"""
if "fields" not in row.keys():
return None
file_ids: Optional[list] = None,
reference_ids: Optional[list] = None,
) -> dict:
"""convert a database row (as returned by api.list_database_rows) to a dictionary where keys are column IDs and values are the values in the row
fields = row.fields
Danger: This function mutates inputs
This function mutates file_ids and reference_ids
Args:
row: database row (as returned by api.list_database_rows)
file_ids: list of file IDs, will be mutated in-place
reference_ids: list of reference IDs, will be mutated in-place
Returns:
dict
"""

if file_ids is None:
file_ids = []
if reference_ids is None:
reference_ids = []

values = {"ID": row.hid, "Validation Status": row.validationStatus}
if fields is None:

if "fields" not in row.keys() or row.fields is None:
return values

fields = row.fields

for field in fields:
if "systemType" in field.keys() and field.systemType == "bodyDocument":
continue
Expand Down
Loading

0 comments on commit b546456

Please sign in to comment.