Skip to content

Commit

Permalink
Add rows (#115)
Browse files Browse the repository at this point in the history
## changes

- [x] loc indexer should disallow adding new rows to a DF
- [x] ability to modify existing rows using loc
- [x] from_deeporigin now works with completely empty DBs
- [x] from_deeporigin now works with DBs with no rows (but some columns)
- [x] tests: ability to modify 1 row using .loc
- [x] tests: ability to modify many rows using .loc
- [x] tests: prevention of adding new rows to a DF
- [x] support for appending a square chunk (similar to a DB) directly to
a DB. combination of create new rows + write data
- [x] support for writing 1 new row to a DB with all columns with data
- [x] support for writing many rows + many columns
- [x] documentation explaining how to add new rows to a DB
  • Loading branch information
sg-s authored Nov 22, 2024
1 parent fb3e218 commit b546456
Show file tree
Hide file tree
Showing 11 changed files with 417 additions and 136 deletions.
10 changes: 9 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,18 @@ else
endif
@source $(CURDIR)/venv/bin/activate && \
interrogate -c pyproject.toml -v . -f 100 && \
python3 -m coverage run -m pytest -x -n $(n_workers) --failed-first -k $(chosen_tests) --client $(client) --responses $(responses) && \
python3 -m coverage run --source="src" -m pytest -x -n $(n_workers) --failed-first -k $(chosen_tests) --client $(client) --responses $(responses) && \
python3 -m coverage html && \
deactivate


coverage:
@source $(CURDIR)/venv/bin/activate && \
python3 -m coverage run -m pytest -x --client $(client) && \
python3 -m coverage html && \
open htmlcov/index.html && \
deactivate

# set up jupyter dev kernel
jupyter:
-deactivate
Expand Down
47 changes: 47 additions & 0 deletions docs/how-to/data-hub/add-data.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Add data to a Deep Origin Database

This document describes how to add data to a Deep Origin Database.

Consider the following dataframe constructed from a database using:

```python
from deeporigin.data_hub import api
df = api.get_dataframe("xy")
df
```

![](../../images/df-xy.png)

## Add new rows

To add new rows to the underlying database, use the `add_database_rows` function:

```python
data = dict(X=[1, 2], Y=[2, 3])
api.add_database_rows(database_id="xy", data=data)
```

`data` should be a dictionary where the keys are column names and the values are lists of values to be written to the corresponding columns. `add_database_rows` will add this data to the database, creating as many new rows as needed.

`add_database_rows` returns a list of the row IDs created during this process.


## Add fragments of new rows

Similarly, fragments of rows (subsets of columns) can be written to the database:

```python
data = dict(X=[10, 20]) # note Y is not specified
api.add_database_rows(database_id="xy", data=data)
```

`add_database_rows` returns a list of the row IDs created during this process, for example:

```python
["row-1", "row-2"]
```


## Reference

The reference documentation for [add_database_rows](../../ref/data-hub/high-level-api.md#src.data_hub.api.add_database_rows)
Binary file added docs/images/df-xy.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
3 changes: 2 additions & 1 deletion mkdocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ nav:
- Upload files: how-to/data-hub/upload-files.md
- Download files: how-to/data-hub/download-files.md
- Write data: how-to/data-hub/write-data.md
- Add data to database: how-to/data-hub/add-data.md
- API reference:
- High-level API: ref/data-hub/high-level-api.md
- Low-level API: ref/data-hub/low-level-api.md
Expand Down Expand Up @@ -112,7 +113,7 @@ plugins:
handlers:
python:
paths: ["."]
options:
options:
annotations_path: brief
show_source: false
docstring_options:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ dependencies = [
"tabulate",
"filetype",
"httpx",
"deeporigin-data-sdk==0.1.0a21",
"deeporigin-data-sdk==0.1.0a27",
"humanize",
"packaging",
"diskcache",
Expand Down
194 changes: 162 additions & 32 deletions src/data_hub/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,11 +358,81 @@ def upload_file(
return response.file


@beartype
def add_database_rows(
    *,
    database_id: str,
    data: dict,
    client=None,
    _stash: bool = False,
) -> list[str]:
    """Add new data to a database.

    Use this function to add new rows, or fragments of rows, to a Deep Origin database.

    Args:
        database_id: Human ID or System ID of the database
        data: A dictionary where each key is a column name and each value is a list of values. All values should have the same length. Key names should match column names in the database.
        client: Optional API client to use for requests.
        _stash: Internal flag, passed through to the low-level API.

    Returns:
        A list of row IDs (human IDs) created during this process.

    Raises:
        DeepOriginException: if `data` is empty, if a key does not match a
            column name in the database, or if the value lists have
            unequal lengths.
    """
    # guard: an empty dict would otherwise fail later with an
    # opaque IndexError when reading the first value length
    if not data:
        raise DeepOriginException(
            message="`data` must contain at least one column."
        )

    # fetch the database schema so we can validate the requested columns
    db = _api.describe_database(
        database_id=database_id,
        client=client,
        _stash=_stash,
    )

    col_names = [col.name for col in db.cols]

    for col in data.keys():
        if col not in col_names:
            raise DeepOriginException(
                message=f"Column `{col}` does not exist in database `{database_id}`."
            )

    # every column must supply the same number of values,
    # because each index position maps to one new row
    value_lengths = {len(values) for values in data.values()}

    if len(value_lengths) > 1:
        raise DeepOriginException(
            message="All rows must have the same number of values."
        )

    n_rows = value_lengths.pop()

    # create the new (empty) rows first ...
    response = make_database_rows(
        database_id=database_id,
        n_rows=n_rows,
        client=client,
        _stash=_stash,
    )

    row_ids = [row.id for row in response.rows]
    row_hids = [row.hid for row in response.rows]

    # ... then fill them in, one column at a time
    for col in data.keys():
        set_data_in_cells(
            values=data[col],
            row_ids=row_ids,
            column_id=col,
            database_id=database_id,
            columns=db.cols,
            client=client,
            _stash=_stash,
        )

    return row_hids


@beartype
@ensure_client
def make_database_rows(
database_id: str,
n_rows: int = 1,
*,
client=None,
_stash: bool = False,
) -> dict:
Expand Down Expand Up @@ -1005,6 +1075,23 @@ def get_dataframe(
f"Expected database_id: {database_id} to resolve to a database, but instead, it resolved to a {db_row.type}"
)

# early exit for empty DB
if "cols" not in db_row.keys() or db_row.cols is None:
data = dict()
if return_type == "dataframe":
# this import is here because we don't want to
# import pandas unless we actually use this function
df = _make_deeporigin_dataframe(
data=data,
reference_ids=None,
db_row=db_row,
rows=None,
columns=None,
)
return df
else:
return dict()

columns = db_row.cols
database_id = db_row.id

Expand All @@ -1017,9 +1104,6 @@ def get_dataframe(
reference_ids = []
file_ids = []

if columns is None:
return None

# remove notebook columns because they are not
# shown in the UI as columns
columns = [
Expand All @@ -1033,7 +1117,7 @@ def get_dataframe(
data[column["id"]] = []

for row in rows:
# warning: add_row_to_data mutates file_ids
# warning: add_row_to_data mutates data, file_ids
# and reference_ids
add_row_to_data(
data=data,
Expand Down Expand Up @@ -1082,22 +1166,13 @@ def get_dataframe(
if return_type == "dataframe":
# make the dataframe

# this import is here because we don't want to
# import pandas unless we actually use this function
from deeporigin.data_hub.dataframe import DataFrame

df = DataFrame(data)
df.attrs["reference_ids"] = list(set(reference_ids))
df.attrs["id"] = database_id
df.attrs["metadata"] = dict(db_row)

df = _type_and_cleanup_dataframe(df, columns)

# find last updated row for pretty printing
df.attrs["last_updated_row"] = find_last_updated_row(rows)

df._deep_origin_out_of_sync = False
df._modified_columns = dict()
df = _make_deeporigin_dataframe(
data=data,
reference_ids=reference_ids,
db_row=db_row,
rows=rows,
columns=columns,
)
return df

else:
Expand All @@ -1113,6 +1188,38 @@ def get_dataframe(
return renamed_data


def _make_deeporigin_dataframe(
    *,
    data: dict,
    reference_ids: Optional[list],
    db_row: dict,
    columns: Optional[list],
    rows: Optional[list],
):
    """Construct a Deep Origin DataFrame from raw column data and attach
    database metadata to it via ``attrs``.

    Args:
        data: mapping of column ID to the list of cell values for that column
        reference_ids: reference IDs to record in ``attrs`` (skipped when None)
        db_row: database descriptor; supplies the database ID and metadata
        columns: column descriptors used for typing/cleanup (skipped when None)
        rows: row objects used to determine the most recently updated row

    Returns:
        A DataFrame carrying metadata attrs and sync-tracking fields.
    """
    # deferred import: avoid paying the pandas import cost unless a
    # dataframe is actually constructed
    from deeporigin.data_hub.dataframe import DataFrame

    frame = DataFrame(data)

    if reference_ids is not None:
        frame.attrs["reference_ids"] = list(set(reference_ids))

    frame.attrs["id"] = db_row.id
    frame.attrs["metadata"] = dict(db_row)

    if columns is not None:
        frame = _type_and_cleanup_dataframe(frame, columns)

    # record the last updated row for pretty printing; an empty frame
    # falls back to the database descriptor itself
    frame.attrs["last_updated_row"] = (
        find_last_updated_row(rows) if len(frame) > 0 else db_row
    )

    frame._deep_origin_out_of_sync = False
    frame._modified_columns = dict()
    return frame


@beartype
@ensure_client
def download_files(
Expand Down Expand Up @@ -1173,21 +1280,26 @@ def download_files(
pass


@beartype
def add_row_to_data(
*,
data: dict,
row,
row: dict,
columns: list,
file_ids: list,
reference_ids: list,
):
"""utility function to combine data from a row into a dataframe"""
row_data = _row_to_dict(
row_data = row_to_dict(
row,
file_ids=file_ids,
reference_ids=reference_ids,
)
if row_data is None:
for column in columns:
col_id = column["id"]
data[col_id].append(None)

return

data["ID"].append(row_data["ID"])
Expand All @@ -1204,21 +1316,39 @@ def add_row_to_data(
data[col_id].append(None)


def _row_to_dict(
row,
@beartype
def row_to_dict(
row: dict,
*,
file_ids: list,
reference_ids: list,
):
"""utility function to convert a row to a dictionary"""
if "fields" not in row.keys():
return None
file_ids: Optional[list] = None,
reference_ids: Optional[list] = None,
) -> dict:
"""convert a database row (as returned by api.list_database_rows) to a dictionary where keys are column IDs and values are the values in the row
fields = row.fields
Danger: This function mutates inputs
This function mutates file_ids and reference_ids
Args:
row: database row (as returned by api.list_database_rows)
file_ids: list of file IDs, will be mutated in-place
reference_ids: list of reference IDs, will be mutated in-place
Returns:
dict
"""

if file_ids is None:
file_ids = []
if reference_ids is None:
reference_ids = []

values = {"ID": row.hid, "Validation Status": row.validationStatus}
if fields is None:

if "fields" not in row.keys() or row.fields is None:
return values

fields = row.fields

for field in fields:
if "systemType" in field.keys() and field.systemType == "bodyDocument":
continue
Expand Down
Loading

0 comments on commit b546456

Please sign in to comment.