From 9d781a02681f59e74103ab8ff9506f0fe1248070 Mon Sep 17 00:00:00 2001
From: Samuel Tonks <60216815+Tonks684@users.noreply.github.com>
Date: Thu, 25 May 2023 15:00:44 +0100
Subject: [PATCH 1/9] Update catalog_entry_form.js

---
 frontend/src/catalog_entry_form.js | 1 +
 1 file changed, 1 insertion(+)

diff --git a/frontend/src/catalog_entry_form.js b/frontend/src/catalog_entry_form.js
index dce14dd5..a6939980 100644
--- a/frontend/src/catalog_entry_form.js
+++ b/frontend/src/catalog_entry_form.js
@@ -99,6 +99,7 @@ export function CatalogEntryForm({ gh_logged_in, schema, uiSchema, catalog_kind,
       branch: user_scivision_fork.default_branch,
     });
   }
+  // ///////////

From 2cf40db07e3ce72e19f9a7f354f88f7dbca83670 Mon Sep 17 00:00:00 2001
From: Samuel Tonks <60216815+Tonks684@users.noreply.github.com>
Date: Thu, 8 Jun 2023 14:48:39 +0100
Subject: [PATCH 2/9] automated checks file version 1

---
 dev/automated_checks.py | 57 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 dev/automated_checks.py

diff --git a/dev/automated_checks.py b/dev/automated_checks.py
new file mode 100644
index 00000000..7994bbdf
--- /dev/null
+++ b/dev/automated_checks.py
@@ -0,0 +1,57 @@
+'''
+Automated Checks
+
+Iterate through data catalog via scivision.load_dataset function and log responses
+
+'''
+
+import pandas as pd
+from scivision import default_catalog, load_dataset, load_pretrained_model
+from tqdm import tqdm
+import logging
+
+# Create Logger
+logger = logging.getLogger(__name__)
+# Set log level
+logger.setLevel(logging.INFO)
+file_handler = logging.FileHandler('logfile.log')
+formatter = logging.Formatter('%(asctime)s : %(levelname)s : %(name)s : %(message)s')
+file_handler.setFormatter(formatter)
+logger.addHandler(file_handler)
+
+## Datasource Checks
+
+# Load dataset catalog
+datasources_catalog = default_catalog.datasources.to_dataframe()
+# Load dataset using load_dataset and record response
+rows = []
+for index in tqdm(range(datasources_catalog.shape[0])):
+    name = datasources_catalog.loc[index]['name']
+    print(f'\nValidating: {name}')
+    data_url = datasources_catalog.loc[index]['url'][:]
+    try:
+        load_dataset(data_url)
+        check_result = "Pass"
+        response = None
+    except Exception as e:
+        print(e)
+        logger.exception("Automated Dataset Check has failed!")
+        check_result = "Fail"
+        response = logger.error(e,exc_info=True)
+        pass
+
+    new_row = {
+        'dataset_name':datasources_catalog.loc[index]['name'],
+        'url': data_url,
+        'check_result': check_result,
+        'response': response,
+
+    }
+
+    rows.append(new_row)
+
+automated_checks_report = pd.DataFrame.from_dict(rows, orient='columns')
+automated_checks_report.to_csv('automated_dataset_checks.csv',index=False)
+
+
+
+

From b638b6e0b922143a359b618c16798a6ab9fbbdc2 Mon Sep 17 00:00:00 2001
From: Oliver Strickson
Date: Thu, 13 Jul 2023 15:08:13 +0100
Subject: [PATCH 3/9] Lint

---
 dev/automated_checks.py | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/dev/automated_checks.py b/dev/automated_checks.py
index 7994bbdf..a669d010 100644
--- a/dev/automated_checks.py
+++ b/dev/automated_checks.py
@@ -6,7 +6,7 @@
 '''

 import pandas as pd
-from scivision import default_catalog, load_dataset, load_pretrained_model
+from scivision import default_catalog, load_dataset
 from tqdm import tqdm
 import logging

@@ -19,7 +19,7 @@
 file_handler.setFormatter(formatter)
 logger.addHandler(file_handler)

-## Datasource Checks
+# Datasource Checks

 # Load dataset catalog
 datasources_catalog = default_catalog.datasources.to_dataframe()
@@ -37,21 +37,16 @@
     print(e)
     logger.exception("Automated Dataset Check has failed!")
     check_result = "Fail"
-    response = logger.error(e,exc_info=True)
-    pass
+    response = logger.error(e, exc_info=True)

     new_row = {
-        'dataset_name':datasources_catalog.loc[index]['name'],
+        'dataset_name': datasources_catalog.loc[index]['name'],
         'url': data_url,
         'check_result': check_result,
         'response': response,
+    }

-    }
-
     rows.append(new_row)
-automated_checks_report = pd.DataFrame.from_dict(rows, orient='columns')
-automated_checks_report.to_csv('automated_dataset_checks.csv',index=False)
-
-
-
+automated_checks_report = pd.DataFrame.from_dict(rows, orient='columns')
+automated_checks_report.to_csv('automated_dataset_checks.csv', index=False)
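[Editor's note, not part of the patch series] One behaviour worth flagging in the script as linted above: logging.Logger.error returns None, so the response recorded for a failing dataset is always None and the 'response' column of the report stays empty. A minimal sketch of an alternative that keeps the traceback text for the report, assuming the same try/except structure as the script (try_load is a hypothetical helper, not part of scivision or of these patches):

import logging
import traceback

logger = logging.getLogger(__name__)

def try_load(load_fn, url):
    # Mirrors the script's try/except and returns the (check_result, response)
    # pair, but stores the formatted traceback instead of the None that
    # logger.error returns.
    try:
        load_fn(url)
        return "Pass", None
    except Exception as exc:
        logger.error(exc, exc_info=True)  # still write the full traceback to the log file
        return "Fail", traceback.format_exc()  # keep the traceback text for the report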
From bf8e86b147d89507d4146b464a43d858d0255ab0 Mon Sep 17 00:00:00 2001
From: Oliver Strickson
Date: Thu, 13 Jul 2023 15:13:30 +0100
Subject: [PATCH 4/9] Add tqdm to requirements for automated checks

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 7336e9a6..86beb0a9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,3 +12,4 @@ pydantic~=1.9
 exifread~=3.0
 distinctipy~=1.2
 ipython~=8.3
+tqdm~=4

From e35c3189dd6da0b397abea76a45c048e9d81de19 Mon Sep 17 00:00:00 2001
From: Oliver Strickson
Date: Thu, 13 Jul 2023 15:55:57 +0100
Subject: [PATCH 5/9] Rename dataset check script

---
 dev/{automated_checks.py => check_datasets.py} | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 rename dev/{automated_checks.py => check_datasets.py} (88%)

diff --git a/dev/automated_checks.py b/dev/check_datasets.py
similarity index 88%
rename from dev/automated_checks.py
rename to dev/check_datasets.py
index a669d010..338a4439 100644
--- a/dev/automated_checks.py
+++ b/dev/check_datasets.py
@@ -14,7 +14,7 @@
 logger = logging.getLogger(__name__)
 # Set log level
 logger.setLevel(logging.INFO)
-file_handler = logging.FileHandler('logfile.log')
+file_handler = logging.FileHandler('check_datasets.log')
 formatter = logging.Formatter('%(asctime)s : %(levelname)s : %(name)s : %(message)s')
 file_handler.setFormatter(formatter)
 logger.addHandler(file_handler)
@@ -49,4 +49,4 @@
     rows.append(new_row)

 automated_checks_report = pd.DataFrame.from_dict(rows, orient='columns')
-automated_checks_report.to_csv('automated_dataset_checks.csv', index=False)
+automated_checks_report.to_csv('check_datasets.csv', index=False)

From 8801ad4eb3f91b16a73f2854856b0ab739f28730 Mon Sep 17 00:00:00 2001
From: Oliver Strickson
Date: Thu, 13 Jul 2023 16:00:30 +0100
Subject: [PATCH 6/9] Fix version specifier

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 86beb0a9..51b552d5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,4 +12,4 @@ pydantic~=1.9
 exifread~=3.0
 distinctipy~=1.2
 ipython~=8.3
-tqdm~=4
+tqdm~=4.65

From cd9e8ddc78522cd91e3cd7f3a6773a78dfb8f421 Mon Sep 17 00:00:00 2001
From: ots22
Date: Fri, 14 Jul 2023 12:47:52 +0100
Subject: [PATCH 7/9] Output dataset checks report as json

---
 dev/check_datasets.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dev/check_datasets.py b/dev/check_datasets.py
index 338a4439..a01eec88 100644
--- a/dev/check_datasets.py
+++ b/dev/check_datasets.py
@@ -50,3 +50,4 @@
 automated_checks_report = pd.DataFrame.from_dict(rows, orient='columns')
 automated_checks_report.to_csv('check_datasets.csv', index=False)
+automated_checks_report.to_json('check_datasets.json', orient="records")
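[Editor's note, not part of the patch series] With orient="records", pandas writes the report as a JSON array containing one object per row. A small self-contained illustration of the shape this produces, using made-up example data rather than the real catalog:

import pandas as pd

report = pd.DataFrame([
    {'dataset_name': 'example-data', 'check_result': 'Pass', 'response': None},
])
print(report.to_json(orient='records'))
# prints: [{"dataset_name":"example-data","check_result":"Pass","response":null}]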
From ae608080a7d09baae37f570ed4561c50227bf067 Mon Sep 17 00:00:00 2001
From: ots22
Date: Fri, 14 Jul 2023 14:52:18 +0100
Subject: [PATCH 8/9] Update check_datasets.py

---
 dev/check_datasets.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dev/check_datasets.py b/dev/check_datasets.py
index a01eec88..9f133168 100644
--- a/dev/check_datasets.py
+++ b/dev/check_datasets.py
@@ -50,4 +50,6 @@
 automated_checks_report = pd.DataFrame.from_dict(rows, orient='columns')
 automated_checks_report.to_csv('check_datasets.csv', index=False)
+
+automated_checks_report.set_index('dataset_name')
 automated_checks_report.to_json('check_datasets.json', orient="records")

From 2ca198e5a0eb32b345b41aee7cd43c5989adb94a Mon Sep 17 00:00:00 2001
From: ots22
Date: Fri, 14 Jul 2023 14:58:15 +0100
Subject: [PATCH 9/9] Change JSON orientation for automated checks report

---
 dev/check_datasets.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dev/check_datasets.py b/dev/check_datasets.py
index 9f133168..5f56c0c2 100644
--- a/dev/check_datasets.py
+++ b/dev/check_datasets.py
@@ -51,5 +51,5 @@ automated_checks_report = pd.DataFrame.from_dict(rows, orient='columns')
 automated_checks_report.to_csv('check_datasets.csv', index=False)

-automated_checks_report.set_index('dataset_name')
-automated_checks_report.to_json('check_datasets.json', orient="records")
+automated_checks_report = automated_checks_report.set_index('dataset_name')
+automated_checks_report.to_json('check_datasets.json', orient="index")
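[Editor's note, not part of the patch series] Two things happen in this final patch: the result of set_index is now assigned back (DataFrame.set_index returns a new frame by default, so the bare call added in patch 8 had no effect), and the JSON orientation changes so that the output is a single object keyed by dataset name rather than an array of rows. A small illustration of the resulting shape, again with made-up data:

import pandas as pd

report = pd.DataFrame([
    {'dataset_name': 'example-data', 'check_result': 'Pass'},
])
report = report.set_index('dataset_name')  # must be assigned back: set_index is not in-place
print(report.to_json(orient='index'))
# prints: {"example-data":{"check_result":"Pass"}}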