From c55a6547217a53bd6758273e553182512813dde8 Mon Sep 17 00:00:00 2001 From: lcard Date: Fri, 6 Oct 2023 16:26:26 +0100 Subject: [PATCH 1/2] Fix sdk dataset patterns --- Makefile | 2 +- docs/changelog.md | 7 +- docs/sdk/api/patterns/data.md | 2 +- docs/sdk/useful_patterns.md | 4 +- infrastructure/modules/rapid/variables.tf | 4 +- sdk/rapid/patterns/data.py | 68 -------- sdk/rapid/patterns/dataset.py | 69 ++++++++ sdk/rapid/rapid.py | 21 ++- sdk/setup.py | 2 +- sdk/tests/test_patterns/test_dataset.py | 186 ++++++++++++++++++++++ sdk/tests/test_patterns/test_patterns.py | 154 ------------------ 11 files changed, 283 insertions(+), 236 deletions(-) delete mode 100644 sdk/rapid/patterns/data.py create mode 100644 sdk/rapid/patterns/dataset.py create mode 100644 sdk/tests/test_patterns/test_dataset.py delete mode 100644 sdk/tests/test_patterns/test_patterns.py diff --git a/Makefile b/Makefile index c19d87c..5dbbb77 100644 --- a/Makefile +++ b/Makefile @@ -138,7 +138,7 @@ sdk-setup: ## Setup Python required for the sdk # SDK Testing -------------------- ## sdk-test: ## Run sdk unit tests - @cd sdk/; pytest -vv -s + @cd sdk/; . .venv/bin/activate && pytest -vv -s # SDK Release -------------------- ## diff --git a/docs/changelog.md b/docs/changelog.md index d1371a2..8b0cea4 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,11 @@ # Changelog +## v7.0.5 / v0.1.3 (sdk) - _2023-09-20_ + +### Fixes + +- Fix the behaviour of the dataset pattern functions in the SDK. 
+ ## v7.0.4 / v0.1.2 (sdk) - _2023-09-20_ ### Features @@ -18,7 +24,6 @@ - Fixes issue where permissions were not being correctly read and causing api functionality to fail - ## v7.0.2 / v0.1.2 (sdk) - _2023-09-14_ ### Fixes diff --git a/docs/sdk/api/patterns/data.md b/docs/sdk/api/patterns/data.md index 9519e89..b6ed6cb 100644 --- a/docs/sdk/api/patterns/data.md +++ b/docs/sdk/api/patterns/data.md @@ -1 +1 @@ -::: rapid.patterns.data +::: rapid.patterns.dataset diff --git a/docs/sdk/useful_patterns.md b/docs/sdk/useful_patterns.md index ab0f56a..e417137 100644 --- a/docs/sdk/useful_patterns.md +++ b/docs/sdk/useful_patterns.md @@ -25,7 +25,7 @@ metadata = SchemaMetadata( ) try: - data.upload_and_create_dataframe( + data.upload_and_create_dataset( rapid=rapid, df=df, metadata=metadata, upgrade_schema_on_fail=False ) except DataFrameUploadValidationException: @@ -57,7 +57,7 @@ metadata = SchemaMetadata( ) try: - data.update_schema_dataframe( + data.update_schema_to_dataframe( rapid=rapid, df=df, metadata=metadata, diff --git a/infrastructure/modules/rapid/variables.tf b/infrastructure/modules/rapid/variables.tf index 41f889d..f28f729 100644 --- a/infrastructure/modules/rapid/variables.tf +++ b/infrastructure/modules/rapid/variables.tf @@ -13,13 +13,13 @@ variable "app-replica-count-max" { variable "application_version" { type = string description = "The version number for the application image (e.g.: v1.0.4, v1.0.x-latest, etc.)" - default = "v7.0.4" + default = "v7.0.5" } variable "ui_version" { type = string description = "The version number for the static ui (e.g.: v1.0.0, etc.)" - default = "v7.0.4" + default = "v7.0.5" } variable "catalog_disabled" { diff --git a/sdk/rapid/patterns/data.py b/sdk/rapid/patterns/data.py deleted file mode 100644 index 5f29df3..0000000 --- a/sdk/rapid/patterns/data.py +++ /dev/null @@ -1,68 +0,0 @@ -from typing import Union, List -from pandas import DataFrame -from rapid.exceptions import ( - ColumnNotDifferentException, - 
DataFrameUploadValidationException, -) -from rapid.items.schema import Schema, SchemaMetadata, Column -from rapid import Rapid - - -def upload_and_create_dataframe( - rapid: Rapid, metadata: SchemaMetadata, df: DataFrame, upgrade_schema_on_fail=False -): - """ - Generates a schema and dataset from a pandas Dataframe. The function first creates the schema - using the API and the uploads the DataFrame to this schema, uploading the data to rAPId. - - Args: - rapid (Rapid): An instance of the rAPId SDK's main class. - metadata (SchemaMetadata): The metadata for the schema to be created and the dataset to upload the DataFrame to.ß - df (DataFrame): The pandas DataFrame to generate a schema for and upload to the dataset. - upgrade_schema_on_fail (bool, optional): Whether to upgrade the schema if the DataFrame's schema is incorrect. Defaults to False. - - Raises: - rapid.exceptions.DataFrameUploadValidationException: If the DataFrame's schema is incorrect and upgrade_schema_on_fail is False. - Exception: If an error occurs while generating the schema, creating the schema, or uploading the DataFrame. - """ - schema = rapid.generate_schema( - df, metadata.layer, metadata.domain, metadata.dataset, metadata.sensitivity - ) - try: - rapid.create_schema(schema) - rapid.upload_dataframe(metadata.layer, metadata.domain, metadata.dataset, df) - except DataFrameUploadValidationException as exception: - if upgrade_schema_on_fail: - update_schema_dataframe(rapid, metadata, df, schema.columns) - else: - raise exception - except Exception as exception: - raise exception - - -def update_schema_dataframe( - rapid: Rapid, - metadata: SchemaMetadata, - new_columns: Union[List[Column], List[dict]], -): - """ - Updates a schema for a specified dataset in the API based on a pandas DataFrame. - - Args: - rapid (Rapid): An instance of the rAPId SDK's main class. - metadata (SchemaMetadata): The metadata for the schema to be updated and the dataset the DataFrame belongs to. 
- new_columns (Union[List[Column], List[dict]]): The new schema columns to update the schema with. - Raises: - rapid.exceptions.ColumnNotDifferentException: If the new schema columns are the same as the existing schema columns. - Exception: If an error occurs while generating the schema information, updating the schema, or comparing the schema columns. - """ - info = rapid.fetch_dataset_info(metadata.layer, metadata.domain, metadata.dataset) - try: - schema = Schema(metadata=metadata, columns=info["columns"]) - if schema.are_columns_the_same(new_columns=new_columns): - raise ColumnNotDifferentException - - schema.columns = new_columns - rapid.update_schema(schema) - except Exception as e: - raise e diff --git a/sdk/rapid/patterns/dataset.py b/sdk/rapid/patterns/dataset.py new file mode 100644 index 0000000..24134f1 --- /dev/null +++ b/sdk/rapid/patterns/dataset.py @@ -0,0 +1,69 @@ +from pandas import DataFrame +from rapid.exceptions import ( + DataFrameUploadValidationException, + DatasetNotFoundException, +) +from rapid.items.schema import Schema, SchemaMetadata +from rapid import Rapid + + +def upload_and_create_dataset( + rapid: Rapid, metadata: SchemaMetadata, df: DataFrame, upgrade_schema_on_fail=False +): + """ + Uploads a dataframe to a dataset in the API, creating schema first if necessary. + + Args: + rapid (Rapid): An instance of the rAPId SDK's main class. + metadata (SchemaMetadata): The metadata for the schema to be created and the dataset to upload the DataFrame to. + df (DataFrame): The pandas DataFrame to generate a schema for and upload to the dataset. + upgrade_schema_on_fail (bool, optional): Whether to upgrade the schema if the DataFrame's schema is incorrect. Defaults to False. + + Raises: + rapid.exceptions.DataFrameUploadValidationException: If the DataFrame's schema is incorrect and upgrade_schema_on_fail is False. + Exception: If an error occurs while generating the schema, creating the schema, or uploading the DataFrame.
+ """ + try: + rapid.upload_dataframe( + metadata.layer, metadata.domain, metadata.dataset, df, wait_to_complete=True + ) + except DatasetNotFoundException: + schema = rapid.generate_schema( + df, metadata.layer, metadata.domain, metadata.dataset, metadata.sensitivity + ) + rapid.create_schema(schema) + rapid.upload_dataframe( + metadata.layer, metadata.domain, metadata.dataset, df, wait_to_complete=True + ) + except DataFrameUploadValidationException as validation_exception: + if upgrade_schema_on_fail: + update_schema_to_dataframe(rapid, metadata, df) + rapid.upload_dataframe( + metadata.layer, + metadata.domain, + metadata.dataset, + df, + wait_to_complete=True, + ) + else: + raise validation_exception + + +def update_schema_to_dataframe( + rapid: Rapid, + metadata: SchemaMetadata, + df: DataFrame, +): + """ + Updates a schema for a specified dataset in the API to match the given DataFrame. + + Args: + rapid (Rapid): An instance of the rAPId SDK's main class. + metadata (SchemaMetadata): The metadata for the schema to be updated and the dataset the DataFrame belongs to. + df (DataFrame): The DataFrame that the schema should be updated to match.
+ """ + schema_response = rapid.generate_schema( + df, metadata.layer, metadata.domain, metadata.dataset, metadata.sensitivity + ) + schema = Schema(metadata=metadata, columns=schema_response.columns) + rapid.update_schema(schema) diff --git a/sdk/rapid/rapid.py b/sdk/rapid/rapid.py index 2c4a8dc..3ac98e5 100644 --- a/sdk/rapid/rapid.py +++ b/sdk/rapid/rapid.py @@ -137,7 +137,7 @@ def download_dataframe( return pd.read_json(json.dumps(data), orient="index") raise DatasetNotFoundException( - f"Could not find dataset, {domain}/{dataset} to download", data + f"Could not find dataset, {layer}/{domain}/{dataset} to download", data ) def upload_dataframe( @@ -184,11 +184,15 @@ def upload_dataframe( raise DataFrameUploadValidationException( "Could not upload dataframe due to an incorrect schema definition" ) - - raise DataFrameUploadFailedException( - "Encountered an unexpected error, could not upload dataframe", - data["details"], - ) + elif response.status_code == 404: + raise DatasetNotFoundException( + f"Could not find dataset: {layer}/{domain}/{dataset}", data + ) + else: + raise DataFrameUploadFailedException( + "Encountered an unexpected error, could not upload dataframe", + data["details"], + ) def fetch_dataset_info(self, layer: str, domain: str, dataset: str): """ @@ -215,6 +219,11 @@ def fetch_dataset_info(self, layer: str, domain: str, dataset: str): if response.status_code == 200: return data + if response.status_code == 404: + raise DatasetNotFoundException( + f"Could not find dataset, {layer}/{domain}/{dataset} to get info", data + ) + raise DatasetInfoFailedException( "Failed to gather the dataset info", data["details"] ) diff --git a/sdk/setup.py b/sdk/setup.py index 195d690..88d646d 100644 --- a/sdk/setup.py +++ b/sdk/setup.py @@ -2,7 +2,7 @@ setup( name="rapid-sdk", - version="0.1.2", + version="0.1.3", description="A python sdk for the rAPId API", url="https://github.com/no10ds/rapid-sdk", author="Lewis Card", diff --git
a/sdk/tests/test_patterns/test_dataset.py b/sdk/tests/test_patterns/test_dataset.py new file mode 100644 index 0000000..e053329 --- /dev/null +++ b/sdk/tests/test_patterns/test_dataset.py @@ -0,0 +1,186 @@ +from mock import MagicMock, Mock, patch, call +import pytest +from pandas import DataFrame +from requests_mock import Mocker + +from rapid.items.schema import Owner, SchemaMetadata, Schema, SensitivityLevel, Column +from rapid.patterns.dataset import ( + upload_and_create_dataset, + update_schema_to_dataframe, +) +from rapid.exceptions import ( + DataFrameUploadValidationException, + DatasetNotFoundException, +) +from rapid import Rapid + + +class TestData: + def setup_method(self): + self.metadata = SchemaMetadata( + layer="raw", + domain="test", + dataset="rapid_sdk", + sensitivity=SensitivityLevel.PUBLIC, + owners=[Owner(name="test", email="test@email.com")], + ) + + self.mock_schema = Schema( + metadata=self.metadata, + columns=[ + Column( + name="column_a", + partition_index=None, + data_type="object", + allow_null=True, + format=None, + ), + Column( + name="column_b", + partition_index=None, + data_type="object", + allow_null=True, + format=None, + ), + Column( + name="column_c", + partition_index=None, + data_type="object", + allow_null=True, + format=None, + ), + ], + ) + + self.df = DataFrame( + { + "column_a": ["one", "two", "three"], + "column_b": ["one", "two", "three"], + "column_c": ["one", "two", "three"], + } + ) + + def test_upload_and_create_dataset_success(self, rapid: Rapid): + rapid.upload_dataframe = Mock() + rapid.create_schema = Mock() + rapid.update_schema = Mock() + + upload_and_create_dataset(rapid, self.metadata, self.df) + rapid.upload_dataframe.assert_called_once_with( + self.metadata.layer, + self.metadata.domain, + self.metadata.dataset, + self.df, + wait_to_complete=True, + ) + rapid.create_schema.assert_not_called() + rapid.update_schema.assert_not_called() + + def test_upload_and_create_dataset_dataset_not_found(self, rapid: 
Rapid): + rapid.upload_dataframe = Mock( + side_effect=[DatasetNotFoundException("dummy", "data"), None] + ) + rapid.generate_schema = Mock(return_value=self.mock_schema) + rapid.create_schema = Mock() + upload_and_create_dataset(rapid, self.metadata, self.df) + + rapid.generate_schema.assert_called_once_with( + self.df, + self.metadata.layer, + self.metadata.domain, + self.metadata.dataset, + self.metadata.sensitivity, + ) + rapid.create_schema.assert_called_once_with(self.mock_schema) + + expected_call = call( + self.metadata.layer, + self.metadata.domain, + self.metadata.dataset, + self.df, + wait_to_complete=True, + ) + rapid.upload_dataframe.assert_has_calls([expected_call, expected_call]) + + def test_upload_and_create_dataset_do_not_upgrade_schema_on_fail( + self, rapid: Rapid + ): + rapid.upload_dataframe = Mock( + side_effect=[DataFrameUploadValidationException("dummy", "data"), None] + ) + + with pytest.raises(DataFrameUploadValidationException): + upload_and_create_dataset( + rapid, self.metadata, self.df, upgrade_schema_on_fail=False + ) + rapid.upload_dataframe.assert_called_once_with( + self.metadata.layer, + self.metadata.domain, + self.metadata.dataset, + self.df, + wait_to_complete=True, + ) + + @patch("rapid.patterns.dataset.update_schema_to_dataframe") + def test_upload_and_create_dataset_upgrade_schema_on_fail( + self, mocked_update_schema_to_dataframe: MagicMock, rapid: Rapid + ): + rapid.upload_dataframe = Mock( + side_effect=[DataFrameUploadValidationException("dummy", "data"), None] + ) + + upload_and_create_dataset( + rapid, self.metadata, self.df, upgrade_schema_on_fail=True + ) + mocked_update_schema_to_dataframe.assert_called_once_with( + rapid, self.metadata, self.df + ) + + expected_call = call( + self.metadata.layer, + self.metadata.domain, + self.metadata.dataset, + self.df, + wait_to_complete=True, + ) + rapid.upload_dataframe.assert_has_calls([expected_call, expected_call]) + + def test_update_schema_to_dataframe(self, 
requests_mock: Mocker, rapid: Rapid): + new_columns = [ + Column( + name="column_a", + partition_index=None, + data_type="float64", # NOTE: Change in data type for column + allow_null=True, + format=None, + ), + Column( + name="column_b", + partition_index=None, + data_type="object", + allow_null=True, + format=None, + ), + Column( + name="column_c", + partition_index=None, + data_type="object", + allow_null=True, + format=None, + ), + ] + + new_schema = Schema(metadata=self.mock_schema.metadata, columns=new_columns) + rapid.generate_schema = Mock(return_value=new_schema) + rapid.update_schema = Mock() + + update_schema_to_dataframe(rapid, self.metadata, self.df) + + rapid.generate_schema.assert_called_once_with( + self.df, + self.metadata.layer, + self.metadata.domain, + self.metadata.dataset, + self.metadata.sensitivity, + ) + rapid.update_schema.assert_called_once_with(new_schema) diff --git a/sdk/tests/test_patterns/test_patterns.py b/sdk/tests/test_patterns/test_patterns.py deleted file mode 100644 index b5a0e95..0000000 --- a/sdk/tests/test_patterns/test_patterns.py +++ /dev/null @@ -1,154 +0,0 @@ -from mock import Mock, patch -import pytest -from pandas import DataFrame -from requests_mock import Mocker - -from rapid.items.schema import Owner, SchemaMetadata, SensitivityLevel, Column -from rapid.patterns.data import upload_and_create_dataframe, update_schema_dataframe -from rapid.exceptions import ( - ColumnNotDifferentException, - DataFrameUploadValidationException, -) -from rapid import Rapid -from tests.conftest import RAPID_URL - -metadata = SchemaMetadata( - layer="raw", - domain="test", - dataset="rapid_sdk", - sensitivity=SensitivityLevel.PUBLIC, - owners=[Owner(name="test", email="test@email.com")], -) - -mock_response = { - "metadata": { - "layer": "raw", - "domain": "test", - "dataset": "rapid_sdk", - "sensitivity": "PUBLIC", - "key_value_tags": {}, - "key_only_tags": [], - "owners": [{"name": "change_me", "email": "change_me@email.com"}], - 
"update_behaviour": "APPEND", - }, - "columns": [ - { - "name": "column_a", - "partition_index": None, - "data_type": "object", - "allow_null": True, - "format": None, - }, - { - "name": "column_b", - "partition_index": None, - "data_type": "object", - "allow_null": True, - "format": None, - }, - { - "name": "column_c", - "partition_index": None, - "data_type": "object", - "allow_null": True, - "format": None, - }, - ], -} - -mock_failed_response = {"details": "dummy"} - -df = DataFrame( - { - "column_a": ["one", "two", "three"], - "column_b": ["one", "two", "three"], - "column_c": ["one", "two", "three"], - } -) - - -class TestUtils: - def test_upload_and_create_dataframe(self, requests_mock: Mocker, rapid: Rapid): - requests_mock.post( - f"{RAPID_URL}/schema/{metadata.layer}/{metadata.sensitivity}/{metadata.domain}" - + f"/{metadata.dataset}/generate", - json=mock_response, - ) - requests_mock.post( - f"{RAPID_URL}/schema", - ) - rapid.upload_dataframe = Mock() - upload_and_create_dataframe(rapid, metadata, df) - - rapid.upload_dataframe.assert_called_once_with( - metadata.layer, metadata.domain, metadata.dataset, df - ) - - def test_upload_and_create_dataframe_fails( - self, requests_mock: Mocker, rapid: Rapid - ): - requests_mock.post( - f"{RAPID_URL}/schema/{metadata.layer}/{metadata.sensitivity}/{metadata.domain}" - + f"/{metadata.dataset}/generate", - json=mock_response, - ) - requests_mock.post(f"{RAPID_URL}/schema") - rapid.upload_dataframe = Mock(side_effect=DataFrameUploadValidationException) - - with pytest.raises(DataFrameUploadValidationException): - upload_and_create_dataframe(rapid, metadata, df) - - @patch("rapid.patterns.data.update_schema_dataframe") - def test_upload_and_create_dataframe_upgrade_schema_on_fail( - self, mocked_update_schema_dataframe, requests_mock: Mocker, rapid: Rapid - ): - requests_mock.post( - f"{RAPID_URL}/schema/{metadata.layer}/{metadata.sensitivity}/{metadata.domain}" - + f"/{metadata.dataset}/generate", - 
json=mock_response, - ) - requests_mock.post(f"{RAPID_URL}/schema") - rapid.upload_dataframe = Mock(side_effect=DataFrameUploadValidationException) - - upload_and_create_dataframe(rapid, metadata, df, upgrade_schema_on_fail=True) - mocked_update_schema_dataframe.assert_called_once() - - def test_update_schema_dataframe(self, requests_mock: Mocker, rapid: Rapid): - new_columns = [ - Column( - name="column_a", - partition_index=None, - data_type="float64", # NOTE: Change in data type for column - allow_null=True, - format=None, - ), - Column( - name="column_b", - partition_index=None, - data_type="object", - allow_null=True, - format=None, - ), - Column( - name="column_c", - partition_index=None, - data_type="object", - allow_null=True, - format=None, - ), - ] - requests_mock.get( - f"{RAPID_URL}/datasets/{metadata.layer}/{metadata.domain}/{metadata.dataset}/info", - json=mock_response, - ) - requests_mock.put(f"{RAPID_URL}/schema", json={"dummy": "data"}) - update_schema_dataframe(rapid, metadata, new_columns) - - def test_update_schema_dataframe_fail(self, requests_mock: Mocker, rapid: Rapid): - requests_mock.get( - f"{RAPID_URL}/datasets/{metadata.layer}/{metadata.domain}/{metadata.dataset}/info", - json=mock_response, - ) - requests_mock.put(f"{RAPID_URL}/schema", json={"dummy": "data"}) - with pytest.raises(ColumnNotDifferentException): - update_schema_dataframe(rapid, metadata, mock_response["columns"]) From f1ff07c65bf647c1871918c1f0b91a8bf8725213 Mon Sep 17 00:00:00 2001 From: lcard Date: Wed, 11 Oct 2023 13:43:54 +0100 Subject: [PATCH 2/2] Fix docstrings --- sdk/rapid/patterns/dataset.py | 1 - sdk/rapid/rapid.py | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sdk/rapid/patterns/dataset.py b/sdk/rapid/patterns/dataset.py index 24134f1..8acf708 100644 --- a/sdk/rapid/patterns/dataset.py +++ b/sdk/rapid/patterns/dataset.py @@ -21,7 +21,6 @@ def upload_and_create_dataset( Raises: rapid.exceptions.DataFrameUploadValidationException: If 
the DataFrame's schema is incorrect and upgrade_schema_on_fail is False. - Exception: If an error occurs while generating the schema, creating the schema, or uploading the DataFrame. """ try: rapid.upload_dataframe( diff --git a/sdk/rapid/rapid.py b/sdk/rapid/rapid.py index 3ac98e5..2c9157d 100644 --- a/sdk/rapid/rapid.py +++ b/sdk/rapid/rapid.py @@ -159,8 +159,9 @@ def upload_dataframe( wait_to_complete (bool, optional): Whether to wait for the upload job to complete before returning. Defaults to True. Raises: - rapid.exceptions.DataFrameUploadValidationException: If the DataFrame's schema is incorrect. - rapid.exceptions.DataFrameUploadFailedException: If an unexpected error occurs while uploading the DataFrame. + rapid.exceptions.DataFrameUploadValidationException: If the DataFrame's schema is incorrect. + rapid.exceptions.DataFrameUploadFailedException: If an unexpected error occurs while uploading the DataFrame. + rapid.exceptions.DatasetNotFoundException: If the specified dataset does not exist. Returns: If wait_to_complete is True, returns "Success" if the upload is successful. @@ -205,6 +206,7 @@ def fetch_dataset_info(self, layer: str, domain: str, dataset: str): Raises: rapid.exceptions.DatasetInfoFailedException: If an error occurs while fetching the dataset information. + rapid.exceptions.DatasetNotFoundException: If the specified dataset does not exist. Returns: A dictionary containing the metadata information for the dataset.