diff --git a/.gitignore b/.gitignore index 5333a294..adbcb9d4 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,7 @@ docker/search-api/BUILD # Portal cache src/hubmap_translation/addl_index_transformations/portal/cache/*.jsonld + + +# python virtual environments +**/.venv \ No newline at end of file diff --git a/src/hubmap_translation/addl_index_transformations/portal/__init__.py b/src/hubmap_translation/addl_index_transformations/portal/__init__.py index ad799179..43484b48 100644 --- a/src/hubmap_translation/addl_index_transformations/portal/__init__.py +++ b/src/hubmap_translation/addl_index_transformations/portal/__init__.py @@ -28,10 +28,6 @@ add_assay_details ) -from hubmap_translation.addl_index_transformations.portal.add_dataset_categories import ( - add_dataset_categories -) - from hubmap_translation.addl_index_transformations.portal.lift_dataset_metadata_fields import ( lift_dataset_metadata_fields ) @@ -72,7 +68,6 @@ def transform(doc, transformation_resources, batch_id='unspecified'): doc_copy['transformation_errors'] = [] try: add_assay_details(doc_copy, transformation_resources) - add_dataset_categories(doc_copy) lift_dataset_metadata_fields(doc_copy) translate(doc_copy) except TranslationException as e: diff --git a/src/hubmap_translation/addl_index_transformations/portal/add_assay_details.py b/src/hubmap_translation/addl_index_transformations/portal/add_assay_details.py index e9f05eee..a442245a 100644 --- a/src/hubmap_translation/addl_index_transformations/portal/add_assay_details.py +++ b/src/hubmap_translation/addl_index_transformations/portal/add_assay_details.py @@ -1,6 +1,7 @@ import requests import logging import re +from enum import Enum from portal_visualization.builder_factory import has_visualization @@ -13,6 +14,20 @@ logger = logging.getLogger(__name__) +class CreationAction(str, Enum): + CREATE_DATASET = 'Create Dataset Activity' + MULTI_ASSAY_SPLIT = 'Multi-Assay Split' + CENTRAL_PROCESS = 'Central Process' + LAB_PROCESS = 'Lab Process' + CREATE_PUBLICATION = 'Create Publication Activity' + + +processing_type_map = { + CreationAction.CENTRAL_PROCESS: 'hubmap', + CreationAction.LAB_PROCESS: 'lab', +} + + def _get_assay_details(doc, transformation_resources): soft_assay_url = transformation_resources.get('ingest_api_soft_assay_url') token = transformation_resources.get('token') @@ -33,6 +48,56 @@ def _get_assay_details(doc, transformation_resources): raise +def _add_dataset_processing_fields(doc): + if processing_type := processing_type_map.get(doc['creation_action']): + doc['processing'] = 'processed' + doc['processing_type'] = processing_type + else: + doc['processing'] = 'raw' + + +def _is_component_dataset(doc): + if 'creation_action' in doc and doc['creation_action'] == CreationAction.MULTI_ASSAY_SPLIT: + return True + return False + + +def _add_multi_assay_fields(doc, assay_details): + if _is_component_dataset(doc): + doc['assay_modality'] = 'multiple' + doc['is_component'] = True + return + + if assay_details.get('is-multi-assay', False): + doc['assay_modality'] = 'multiple' + creation_action = doc.get('creation_action', None) + if creation_action in [CreationAction.CREATE_DATASET, CreationAction.CENTRAL_PROCESS]: + doc['is_component'] = False + else: + error_msg = f"Unexpected creation_action={creation_action}. is_component will not be set." + _log_transformation_error(doc, error_msg) + return + + doc['assay_modality'] = 'single' + + +def _add_dataset_categories(doc, assay_details): + if doc['entity_type'] == 'Dataset': + creation_action = doc.get('creation_action') + if not creation_action: + error_msg = "Creation action undefined." + _log_transformation_error(doc, error_msg) + return + + if creation_action not in {enum.value for enum in CreationAction}: + error_msg = f"Unrecognized creation action, {creation_action}." + _log_transformation_error(doc, error_msg) + return + + _add_dataset_processing_fields(doc) + _add_multi_assay_fields(doc, assay_details) + + def add_assay_details(doc, transformation_resources): if 'dataset_type' in doc: assay_details = _get_assay_details(doc, transformation_resources) @@ -48,6 +113,8 @@ def add_assay_details(doc, transformation_resources): doc['mapped_data_types'] = [assay_details.get('description')] doc['vitessce-hints'] = assay_details.get('vitessce-hints') + _add_dataset_categories(doc, assay_details) + error_msg = assay_details.get('error') if error_msg: _log_transformation_error(doc, error_msg) diff --git a/src/hubmap_translation/addl_index_transformations/portal/add_dataset_categories.py b/src/hubmap_translation/addl_index_transformations/portal/add_dataset_categories.py deleted file mode 100644 index a14c69dd..00000000 --- a/src/hubmap_translation/addl_index_transformations/portal/add_dataset_categories.py +++ /dev/null @@ -1,67 +0,0 @@ -from enum import Enum - -from hubmap_translation.addl_index_transformations.portal.utils import ( - _log_transformation_error -) - - -class CreationAction(str, Enum): - CREATE_DATASET = 'Create Dataset Activity' - MULTI_ASSAY_SPLIT = 'Multi-Assay Split' - CENTRAL_PROCESS = 'Central Process' - LAB_PROCESS = 'Lab Process' - CREATE_PUBLICATION = 'Create Publication Activity' - - -processing_type_map = { - CreationAction.CENTRAL_PROCESS: 'hubmap', - CreationAction.LAB_PROCESS: 'lab', -} - - -def _add_dataset_processing_fields(doc): - if processing_type := processing_type_map.get(doc['creation_action']): - doc['processing'] = 'processed' - doc['processing_type'] = processing_type - else: - doc['processing'] = 'raw' - - -def _is_component_dataset(doc): - if 'creation_action' in doc and doc['creation_action'] == CreationAction.MULTI_ASSAY_SPLIT: - return True - return False - -# Currently only handles primary and component datasets. -# As multi-assay datasets begin to be processed, we will transition to getting an is_multi_assay bool from soft assay. - - -def _add_multi_assay_fields(doc): - if _is_component_dataset(doc): - doc['assay_modality'] = 'multiple' - doc['multi_assay_category'] = 'component' - return - - for descendant_doc in doc['descendants']: - if _is_component_dataset(descendant_doc): - doc['assay_modality'] = 'multiple' - doc['multi_assay_category'] = 'primary' - return - doc['assay_modality'] = 'single' - - -def add_dataset_categories(doc): - if doc['entity_type'] == 'Dataset': - creation_action = doc.get('creation_action') - if not creation_action: - error_msg = "Creation action undefined." - _log_transformation_error(doc, error_msg) - return - - if creation_action not in {enum.value for enum in CreationAction}: - error_msg = f"Unrecognized creation action, {creation_action}." - _log_transformation_error(doc, error_msg) - return - - _add_dataset_processing_fields(doc) - _add_multi_assay_fields(doc) diff --git a/src/hubmap_translation/addl_index_transformations/portal/test.sh b/src/hubmap_translation/addl_index_transformations/portal/test.sh index fe0ff5a2..20ad85ac 100755 --- a/src/hubmap_translation/addl_index_transformations/portal/test.sh +++ b/src/hubmap_translation/addl_index_transformations/portal/test.sh @@ -22,6 +22,6 @@ end portal/doctests start portal/pytest cd ../../../ -PYTHONPATH="src:$PYTHONPATH" pytest --verbose --log-cli-level WARN +PYTHONPATH="src:$PYTHONPATH" pytest -vv --log-cli-level WARN cd - end portal/pytest \ No newline at end of file diff --git a/src/hubmap_translation/addl_index_transformations/portal/tests/test_add_dataset_categories.py b/src/hubmap_translation/addl_index_transformations/portal/tests/test_add_dataset_categories.py deleted file mode 100644 index 6025f19c..00000000 --- a/src/hubmap_translation/addl_index_transformations/portal/tests/test_add_dataset_categories.py +++ /dev/null @@ -1,138 +0,0 @@ -from hubmap_translation.addl_index_transformations.portal.add_dataset_categories import ( - add_dataset_categories -) - - -def test_hubmap_processing(): - hubmap_processed_input_doc = { - 'creation_action': 'Central Process', - 'descendants': [], - 'entity_type': 'Dataset', - } - - hubmap_processed_ouput_doc = { - 'assay_modality': 'single', - 'creation_action': 'Central Process', - 'descendants': [], - 'entity_type': 'Dataset', - 'processing': 'processed', - 'processing_type': 'hubmap', - } - - add_dataset_categories(hubmap_processed_input_doc) - assert hubmap_processed_input_doc == hubmap_processed_ouput_doc - - -def test_lab_processing(): - lab_processed_input_doc = { - 'creation_action': 'Lab Process', - 'descendants': [], - 'entity_type': 'Dataset', - } - - lab_processed_ouput_doc = { - 'assay_modality': 'single', - 'creation_action': 'Lab Process', - 'descendants': [], - 'entity_type': 'Dataset', - 'processing': 'processed', - 'processing_type': 'lab', - } - add_dataset_categories(lab_processed_input_doc) - assert lab_processed_input_doc == lab_processed_ouput_doc - - -def test_raw(): - raw_input_doc = { - 'entity_type': 'Dataset', - 'creation_action': 'Create Dataset Activity', - 'descendants': [] - } - - raw_ouput_doc = { - 'assay_modality': 'single', - 'creation_action': 'Create Dataset Activity', - 'descendants': [], - 'entity_type': 'Dataset', - 'processing': 'raw', - } - add_dataset_categories(raw_input_doc) - assert raw_input_doc == raw_ouput_doc - - -def test_component(): - component_input_doc = { - 'creation_action': 'Multi-Assay Split', - 'descendants': [], - 'entity_type': 'Dataset', - } - - component_output_doc = { - 'assay_modality': 'multiple', - 'creation_action': 'Multi-Assay Split', - 'descendants': [], - 'entity_type': 'Dataset', - 'multi_assay_category': 'component', - 'processing': 'raw', - } - add_dataset_categories(component_input_doc) - assert component_input_doc == component_output_doc - - -def test_primary(): - primary_input_doc = { - 'creation_action': 'Create Dataset Activity', - 'descendants': [ - { - 'creation_action': 'Multi-Assay Split', - } - ], - 'entity_type': 'Dataset', - } - - primary_output_doc = { - 'assay_modality': 'multiple', - 'creation_action': 'Create Dataset Activity', - 'descendants': [ - { - 'creation_action': 'Multi-Assay Split', - } - ], - 'entity_type': 'Dataset', - 'multi_assay_category': 'primary', - 'processing': 'raw', - } - add_dataset_categories(primary_input_doc) - assert primary_input_doc == primary_output_doc - - -def test_undefined_creation_action(): - undefined_creation_action_input_doc = { - 'entity_type': 'Dataset', - 'transformation_errors': [], - } - - undefined_creation_action_output_doc = { - 'entity_type': 'Dataset', - 'transformation_errors': ['Creation action undefined.'], - } - - add_dataset_categories(undefined_creation_action_input_doc) - assert undefined_creation_action_input_doc == undefined_creation_action_output_doc - - -def test_unknown_creation_action(): - unknown_creation_action_input_doc = { - 'creation_action': "Conjure Dataset", - 'entity_type': 'Dataset', - 'transformation_errors': [], - } - - unknown_creation_action_output_doc = { - 'creation_action': "Conjure Dataset", - 'entity_type': 'Dataset', - 'transformation_errors': ['Unrecognized creation action, Conjure Dataset.'], - } - - add_dataset_categories(unknown_creation_action_input_doc) - assert unknown_creation_action_input_doc == unknown_creation_action_output_doc diff --git a/src/hubmap_translation/addl_index_transformations/portal/tests/test_assay_details.py b/src/hubmap_translation/addl_index_transformations/portal/tests/test_assay_details.py index 7a347fb0..a3bce94d 100644 --- a/src/hubmap_translation/addl_index_transformations/portal/tests/test_assay_details.py +++ b/src/hubmap_translation/addl_index_transformations/portal/tests/test_assay_details.py @@ -1,5 +1,8 @@ +import pytest + from hubmap_translation.addl_index_transformations.portal.add_assay_details import ( - add_assay_details + add_assay_details, + _add_dataset_categories ) transformation_resources = { @@ -33,6 +36,8 @@ def test_raw_dataset_type(mocker): input_raw_doc = { 'uuid': '421007293469db7b528ce6478c00348d', 'dataset_type': 'RNAseq', + 'entity_type': 'Dataset', + 'creation_action': 'Create Dataset Activity', } expected_raw_output_doc = { @@ -43,6 +48,10 @@ def test_raw_dataset_type(mocker): 'uuid': '421007293469db7b528ce6478c00348d', 'vitessce-hints': [], 'visualization': False, + 'entity_type': 'Dataset', + 'assay_modality': 'single', + 'creation_action': 'Create Dataset Activity', + 'processing': 'raw' } add_assay_details(input_raw_doc, transformation_resources) assert input_raw_doc == expected_raw_output_doc @@ -76,14 +85,21 @@ def test_processed_dataset_type(mocker): input_processed_doc = { 'uuid': '22684b9011fc5aea5cb3f89670a461e8', 'dataset_type': 'RNAseq [Salmon]', + 'entity_type': 'Dataset', + 'creation_action': 'Central Process' } output_processed_doc = { 'assay_display_name': ['sciRNA-seq [Salmon]'], 'dataset_type': 'RNAseq [Salmon]', + 'entity_type': 'Dataset', 'mapped_data_types': ['sciRNA-seq [Salmon]'], 'pipeline': 'Salmon', 'raw_dataset_type': 'RNAseq', + 'assay_modality': 'single', + 'creation_action': 'Central Process', + 'processing': 'processed', + 'processing_type': 'hubmap', 'uuid': '22684b9011fc5aea5cb3f89670a461e8', 'vitessce-hints': [ "is_sc", @@ -115,19 +131,145 @@ def test_transform_unknown_assay(mocker): unknown_assay_input_doc = { 'uuid': '69c70762689b20308bb049ac49653342', 'dataset_type': 'RNAseq [Salmon]', - 'transformation_errors': [] + 'transformation_errors': [], + 'entity_type': 'Dataset', + 'creation_action': 'Central Process' } unknown_assay_output_doc = { 'assay_display_name': ['RNAseq [Salmon]'], + 'assay_modality': 'single', + 'creation_action': 'Central Process', 'dataset_type': 'RNAseq [Salmon]', 'mapped_data_types': ['RNAseq [Salmon]'], 'pipeline': 'Salmon', + "processing": "processed", + "processing_type": "hubmap", 'raw_dataset_type': 'RNAseq', 'transformation_errors': ['No soft assay information returned.'], 'uuid': '69c70762689b20308bb049ac49653342', 'vitessce-hints': ['unknown-assay'], 'visualization': False, + 'entity_type': 'Dataset', } add_assay_details(unknown_assay_input_doc, transformation_resources) assert unknown_assay_input_doc == unknown_assay_output_doc + + +def test_hubmap_processing(): + hubmap_processed_input_doc = { + 'creation_action': 'Central Process', + 'descendants': [], + 'entity_type': 'Dataset', + } + + hubmap_processed_ouput_doc = { + 'assay_modality': 'single', + 'creation_action': 'Central Process', + 'descendants': [], + 'entity_type': 'Dataset', + 'processing': 'processed', + 'processing_type': 'hubmap', + } + + _add_dataset_categories(hubmap_processed_input_doc, {}) + assert hubmap_processed_input_doc == hubmap_processed_ouput_doc + + +def test_lab_processing(): + lab_processed_input_doc = { + 'creation_action': 'Lab Process', + 'descendants': [], + 'entity_type': 'Dataset', + } + + lab_processed_ouput_doc = { + 'assay_modality': 'single', + 'creation_action': 'Lab Process', + 'descendants': [], + 'entity_type': 'Dataset', + 'processing': 'processed', + 'processing_type': 'lab', + } + _add_dataset_categories(lab_processed_input_doc, {}) + assert lab_processed_input_doc == lab_processed_ouput_doc + + +@pytest.mark.parametrize( + "creation_action, expected_error", + [ + pytest.param( + "Conjure Dataset", ['Unrecognized creation action, Conjure Dataset.'], id="unknown" + ), + pytest.param( + None, ['Creation action undefined.'], id="undefined" + ), + ] +) +def test_creation_action(creation_action, expected_error): + creation_action_input_doc = { + 'entity_type': 'Dataset', + 'transformation_errors': [], + } + + creation_action_output_doc = { + 'entity_type': 'Dataset', + 'transformation_errors': expected_error, + } + + if creation_action: + creation_action_input_doc['creation_action'] = creation_action + creation_action_output_doc['creation_action'] = creation_action + + _add_dataset_categories(creation_action_input_doc, {}) + assert creation_action_input_doc == creation_action_output_doc + + +@pytest.mark.parametrize( + "creation_action,is_multi_assay,expected_component_bool,expected_modality,expected_processing", + [ + pytest.param( + "Create Dataset Activity", None, None, "single", "raw", id="primary single assay" + ), + pytest.param( + "Create Dataset Activity", True, False, "multiple", "raw", id="primary multiassay" + ), + pytest.param( + "Multi-Assay Split", None, True, "multiple", "raw", id="component" + ), + pytest.param( + "Central Process", True, False, "multiple", "processed", id="processed multiassay" + ), + pytest.param( + "Central Process", None, None, "single", "processed", id="processed single assay" + ), + ] +) +def test_assay_modality_fields(creation_action, is_multi_assay, expected_component_bool, expected_modality, expected_processing): + input_doc = { + 'creation_action': creation_action, + 'entity_type': 'Dataset', + } + + output_doc = { + 'assay_modality': expected_modality, + 'creation_action': creation_action, + 'entity_type': 'Dataset', + 'processing': expected_processing, + } + + assay_details = { + "vitessce-hints": [] + } + + if is_multi_assay: + assay_details['is-multi-assay'] = is_multi_assay + + if expected_processing == "processed": + output_doc['processing_type'] = "hubmap" + + if expected_modality == 'multiple': + output_doc['is_component'] = expected_component_bool + + _add_dataset_categories(input_doc, assay_details) + assert input_doc == output_doc diff --git a/src/hubmap_translation/addl_index_transformations/portal/utils.py b/src/hubmap_translation/addl_index_transformations/portal/utils.py index 1d479701..11b7c740 100644 --- a/src/hubmap_translation/addl_index_transformations/portal/utils.py +++ b/src/hubmap_translation/addl_index_transformations/portal/utils.py @@ -6,5 +6,6 @@ def _log_transformation_error(doc, msg): + doc['transformation_errors'] = doc.get('transformation_errors', []) doc['transformation_errors'].append(msg) logger.info(f"doc={doc.get('uuid')}:, {msg}") diff --git a/src/libs/assay_type.py b/src/libs/assay_type.py index 04850045..4ff17faf 100644 --- a/src/libs/assay_type.py +++ b/src/libs/assay_type.py @@ -50,7 +50,7 @@ class AssayType(object): >>> a_t = AssayType('scRNA-Seq-10x') # This one does contain PII >>> [a_t.name, a_t.description, a_t.vis_only, a_t.contains_pii] - ['scRNA-Seq-10x', 'scRNA-seq (10x Genomics)', False, True] + ['scRNAseq-10xGenomics-v3', 'scRNA-seq (10x Genomics v3)', False, True] >>> a_t = AssayType(['PAS', 'Image Pyramid']) # complex alt name >>> [a_t.name, a_t.description, a_t.vis_only, a_t.contains_pii]