From 201962d6b7610863543f94543559494cfdd53d9c Mon Sep 17 00:00:00 2001 From: mccalluc Date: Wed, 15 Dec 2021 14:46:47 -0500 Subject: [PATCH 1/7] make sure metadata numbers come through --- .../portal/__init__.py | 30 ++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/src/elasticsearch/addl_index_transformations/portal/__init__.py b/src/elasticsearch/addl_index_transformations/portal/__init__.py index cf923a94..a1c34b14 100644 --- a/src/elasticsearch/addl_index_transformations/portal/__init__.py +++ b/src/elasticsearch/addl_index_transformations/portal/__init__.py @@ -88,6 +88,9 @@ def transform(doc, batch_id='unspecified'): ... 'metadata_path': 'No!', ... 'tissue_id': 'No!', ... 'donor_id': 'No!', + ... 'cell_barcode_size': '123', + ... 'should_be_int': '123', + ... 'should_be_float': '123.456', ... 'keep_this_field': 'Yes!', ... 'is_boolean': '1' ... } @@ -121,7 +124,11 @@ def transform(doc, batch_id='unspecified'): 'mapped_external_group_name': 'Outside HuBMAP', 'mapped_metadata': {}, 'mapped_status': 'New', - 'metadata': {'metadata': {'is_boolean': 'TRUE', 'keep_this_field': 'Yes!'}}, + 'metadata': {'metadata': {'cell_barcode_size': '123', + 'is_boolean': 'TRUE', + 'keep_this_field': 'Yes!', + 'should_be_float': 123.456, + 'should_be_int': 123}}, 'origin_sample': {'mapped_organ': 'Lymph Node', 'organ': 'LY'}, 'rui_location': '{"ccf_annotations": ' '["http://purl.obolibrary.org/obo/UBERON_0001157"]}', @@ -205,6 +212,11 @@ def _simple_clean(doc): 'donor_id', 'tissue_id' # For internal use only. ] + # Ideally, we'd pull from https://github.com/hubmapconsortium/ingest-validation-tools/blob/main/docs/field-types.yaml + # here, or make the TSV parsing upstream schema aware, + # instead of trying to guess, but I think the number of special cases will be relatively small. + not_really_a_number = ['cell_barcode_size', 'cell_barcode_offset'] + # Explicitly convert items to list, # so we can remove keys from the metadata dict: for k, v in list(metadata.items()): @@ -218,9 +230,19 @@ def _simple_clean(doc): metadata[k] = 'FALSE' if v in ['1', 'true', 'True']: metadata[k] = 'TRUE' - # Other converstions are handled by ES numeric detection. - # See: portal/config.yaml - # https://www.elastic.co/guide/en/elasticsearch/reference/current/dynamic-field-mapping.html + continue + + if k not in not_really_a_number: + try: + as_number = int(v) + except ValueError: + try: + as_number = float(v) + except ValueError: + as_number = None + if as_number is not None: + metadata[k] = as_number + # TODO: Reenable this when we have time, and can make sure we don't need these fields. # From f8f14f4c27616035ec0804774e0fd98b9f6ab5af Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Thu, 16 Dec 2021 19:05:42 -0500 Subject: [PATCH 2/7] Update assay_types.yaml The entry for "bulk-RNA" contained two elements for alt-names. The second, incorrectly empty element was overriding the first. --- src/search-schema/data/definitions/enums/assay_types.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/src/search-schema/data/definitions/enums/assay_types.yaml b/src/search-schema/data/definitions/enums/assay_types.yaml index b7300ecc..8f50ce27 100644 --- a/src/search-schema/data/definitions/enums/assay_types.yaml +++ b/src/search-schema/data/definitions/enums/assay_types.yaml @@ -238,7 +238,6 @@ PAS_pyramid: bulk-RNA: description: Bulk RNA-seq alt-names: ['bulk RNA'] - alt-names: [] primary: true contains-pii: true vitessce-hints: [] From c775da3f768fcf867403ad4fa2a83049583b8fc2 Mon Sep 17 00:00:00 2001 From: "Zhou (Joe) Yuan" Date: Fri, 17 Dec 2021 10:36:46 -0500 Subject: [PATCH 3/7] Bump version to 2.2.2 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index c043eea7..b1b25a5f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.2.1 +2.2.2 From ca0ca27f13ce3b008f6a32ecc878b2927229b628 Mon Sep 17 00:00:00 2001 From: mccalluc Date: Fri, 17 Dec 2021 16:24:23 -0500 Subject: [PATCH 4/7] Add mapped_consortium --- .../portal/__init__.py | 1 + .../portal/translate.py | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/elasticsearch/addl_index_transformations/portal/__init__.py b/src/elasticsearch/addl_index_transformations/portal/__init__.py index cf923a94..ea5f11e3 100644 --- a/src/elasticsearch/addl_index_transformations/portal/__init__.py +++ b/src/elasticsearch/addl_index_transformations/portal/__init__.py @@ -115,6 +115,7 @@ def transform(doc, batch_id='unspecified'): 'preferred_term': 'Male'}]}}, 'entity_type': 'dataset', 'group_name': 'EXT - Outside HuBMAP', + 'mapped_consortium': 'Outside HuBMAP', 'mapped_create_timestamp': '2019-12-04 19:58:29', 'mapped_data_access_level': 'Consortium', 'mapped_data_types': ['snRNA-seq [Salmon]'], diff --git a/src/elasticsearch/addl_index_transformations/portal/translate.py b/src/elasticsearch/addl_index_transformations/portal/translate.py index 963e815c..57525ea6 100644 --- a/src/elasticsearch/addl_index_transformations/portal/translate.py +++ b/src/elasticsearch/addl_index_transformations/portal/translate.py @@ -100,19 +100,26 @@ def _access_level_map(access_level): def _translate_external_consortium(doc): ''' + >>> doc = {} + >>> _translate_external_consortium(doc); doc + {'mapped_consortium': 'HuBMAP'} + >>> doc = {'group_name': 'Inside HuBMAP'} >>> _translate_external_consortium(doc); doc - {'group_name': 'Inside HuBMAP'} + {'group_name': 'Inside HuBMAP', 'mapped_consortium': 'HuBMAP'} + >>> doc = {'group_name': 'EXT - Outside HuBMAP'} >>> _translate_external_consortium(doc); doc - {'group_name': 'EXT - Outside HuBMAP', 'mapped_external_group_name': 'Outside HuBMAP'} + {'group_name': 'EXT - Outside HuBMAP', 'mapped_external_group_name': 'Outside HuBMAP', 'mapped_consortium': 'Outside HuBMAP'} ''' group_name = doc.get('group_name') - if group_name is None: - return - if 'EXT' in group_name: - doc['mapped_external_group_name'] = group_name.replace('EXT - ', '') + if group_name is not None and 'EXT' in group_name: + mapped_consortium = group_name.replace('EXT - ', '') + doc['mapped_external_group_name'] = mapped_consortium + else: + mapped_consortium = 'HuBMAP' + doc['mapped_consortium'] = mapped_consortium # Timestamp: From 68cd665a28aa1c0f004c2b68d0406217abe301f6 Mon Sep 17 00:00:00 2001 From: mccalluc Date: Wed, 5 Jan 2022 17:34:02 -0500 Subject: [PATCH 5/7] Add "null" to the list of known bad fields --- .../addl_index_transformations/portal/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/elasticsearch/addl_index_transformations/portal/__init__.py b/src/elasticsearch/addl_index_transformations/portal/__init__.py index be8e2092..63c38087 100644 --- a/src/elasticsearch/addl_index_transformations/portal/__init__.py +++ b/src/elasticsearch/addl_index_transformations/portal/__init__.py @@ -208,7 +208,7 @@ def _simple_clean(doc): metadata = doc['metadata']['metadata'] bad_fields = [ - 'collectiontype', # Inserted by IEC. + 'collectiontype', 'null', # Inserted by IEC. 'data_path', 'metadata_path', 'version', # Only meaningful at submission time. 'donor_id', 'tissue_id' # For internal use only. ] @@ -223,6 +223,8 @@ def _simple_clean(doc): for k, v in list(metadata.items()): if k in bad_fields or k.startswith('_'): del metadata[k] + continue + # Normalize booleans to all-caps, the Excel default. # (There is no guaratee that boolean fields with be prefixed this way, # but at the moment it is the case.) From 4085a2d5d0ffbe52d1bbc3657a12d2da6ce0b5b6 Mon Sep 17 00:00:00 2001 From: Bill Shirey Date: Fri, 7 Jan 2022 14:59:07 -0500 Subject: [PATCH 6/7] ignore eclipse project files --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 1adfb4db..4087f9e3 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,7 @@ __pycache__/ Pipfile Pipfile.lock + +#Eclipse project files +.project +.pydevproject \ No newline at end of file From 56d89b0cb80d6c7999ea124a1796c58c39745d1e Mon Sep 17 00:00:00 2001 From: Bill Shirey Date: Fri, 7 Jan 2022 15:04:48 -0500 Subject: [PATCH 7/7] add left and right knee organ types --- src/search-schema/data/definitions/enums/organ_types.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/search-schema/data/definitions/enums/organ_types.yaml b/src/search-schema/data/definitions/enums/organ_types.yaml index d941f560..b9ca9ce9 100644 --- a/src/search-schema/data/definitions/enums/organ_types.yaml +++ b/src/search-schema/data/definitions/enums/organ_types.yaml @@ -39,6 +39,8 @@ LK: LL: description: Lung (Left) iri: http://purl.obolibrary.org/obo/UBERON_0002168 +LN: + description: Knee (Left) LV: description: Liver LY: @@ -65,6 +67,8 @@ RK: RL: description: Lung (Right) iri: http://purl.obolibrary.org/obo/UBERON_0002167 +RN: + description: Knee (Right) SI: description: Small Intestine SK: