-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #106 from x-atlas-consortia/jas_assayclassifier
Jas assayclassifier
- Loading branch information
Showing
23 changed files
with
1,244 additions
and
524 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -39,6 +39,6 @@ BUILD | |
|
||
**/__pycache__ | ||
|
||
/tests/*/*.out | ||
/test/*/*.out | ||
/src/cells_index/*.csv | ||
/src/cells_index/*.tsv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
2.0.4 | ||
2.1.0 |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,215 @@ | ||
// Called by the assayclassifier endpoint. | ||
|
||
// Return information on rule-based datasets--i.e., the datasets specified in the Rule Engine's testing rule chain. | ||
|
||
// Obtain identifiers for rule-based datasets (assay classes) for the application context. | ||
// The assayclass_filter allows filtering by either the UBKG code or term (rule_description) | ||
// for the assay class. | ||
WITH '$context' AS context | ||
CALL | ||
{ | ||
WITH context | ||
MATCH (p:Concept)<-[:isa]-(pRBD:Concept)-[:CODE]->(cRBD:Code)-[r:PT]->(tRBD:Term) | ||
WHERE p.CUI = context+':C000004 CUI' | ||
AND r.CUI=pRBD.CUI | ||
$assayclass_filter | ||
RETURN pRBD.CUI AS CUIRBD,cRBD.CODE AS CodeRBD,tRBD.name AS NameRBD | ||
ORDER BY pRBD.CUI | ||
} | ||
// assaytype | ||
CALL | ||
{ | ||
WITH CUIRBD, context | ||
MATCH (pRBD:Concept)-[:has_assaytype]->(passaytype:Concept)-[:CODE]->(cassaytype:Code)-[r:PT]->(tassaytype:Term) | ||
WHERE pRBD.CUI=CUIRBD | ||
$assaytype_filter | ||
AND r.CUI=passaytype.CUI and cassaytype.SAB=context | ||
RETURN DISTINCT REPLACE(tassaytype.name,'_assaytype','') AS assaytype | ||
} | ||
// dir-schema | ||
CALL | ||
{ | ||
WITH CUIRBD,context | ||
// Directory schema term linked to the assay class through has_dir_schema; null when there is no match (OPTIONAL MATCH). | ||
// The CODE relationship is traversed Concept->Code, matching the direction used by the sibling subqueries. | ||
// NOTE(review): assumes CODE edges are only ever stored Concept->Code -- confirm against the graph model. | ||
OPTIONAL MATCH (pRBD:Concept)-[:has_dir_schema]->(pdir_schema:Concept)-[:CODE]->(cdir_schema:Code)-[r:PT]->(tdir_schema:Term) | ||
WHERE pRBD.CUI=CUIRBD AND r.CUI=pdir_schema.CUI AND cdir_schema.SAB=context | ||
RETURN DISTINCT tdir_schema.name AS dir_schema | ||
} | ||
// tbl-schema | ||
CALL | ||
{ | ||
WITH CUIRBD,context | ||
OPTIONAL MATCH (pRBD:Concept)-[:has_tbl_schema]->(ptbl_schema:Concept)-[:CODE]->(ctbl_schema:Code)-[r:PT]->(ttbl_schema:Term) | ||
WHERE pRBD.CUI=CUIRBD AND r.CUI=ptbl_schema.CUI AND ctbl_schema.SAB=context | ||
RETURN DISTINCT ttbl_schema.name AS tbl_schema | ||
} | ||
// vitessce_hints | ||
// Strip the optional suffix '_vitessce_hint' from terms such as 'rna_vitessce_hint'. | ||
CALL | ||
{ | ||
WITH CUIRBD,context | ||
OPTIONAL MATCH (pRBD:Concept)-[:has_vitessce_hint]->(pvitessce_hint:Concept)-[:CODE]->(cvitessce_hint:Code)-[r:PT]->(tvitessce_hint:Term) | ||
WHERE pRBD.CUI=CUIRBD AND r.CUI=pvitessce_hint.CUI AND cvitessce_hint.SAB=context | ||
RETURN COLLECT(DISTINCT REPLACE(tvitessce_hint.name,'_vitessce_hint','')) AS vitessce_hints | ||
} | ||
// process state. The process_state_filter allows for filtering to just primary or derived assay classes. | ||
CALL | ||
{ | ||
WITH CUIRBD,context | ||
MATCH (pRBD:Concept)-[:has_process_state]->(pdsProcess:Concept)-[:isa]->(pProcessParent:Concept), | ||
(pdsProcess:Concept)-[:CODE]->(cdsProcess:Code)-[r:PT]->(tdsProcess:Term) | ||
WHERE pRBD.CUI=CUIRBD | ||
AND pProcessParent.CUI = context+':C004002 CUI' | ||
AND r.CUI=pdsProcess.CUI | ||
AND cdsProcess.SAB=context | ||
$process_state_filter | ||
RETURN tdsProcess.name as process_state | ||
} | ||
// dataset_type | ||
// The dataset_type concepts in HUBMAP are cross-referenced to HRAVS concepts; however, the terms for the HRAVS concepts | ||
// are enclosed in a list, so use the HUBMAP terms. | ||
CALL | ||
{ | ||
WITH CUIRBD,context | ||
OPTIONAL MATCH (pRBD:Concept)-[:has_dataset_type]->(pdataset_type:Concept)-[:CODE]->(cdataset_type:Code)-[r:PT]->(tdataset_type:Term) | ||
WHERE pRBD.CUI=CUIRBD AND r.CUI=pdataset_type.CUI AND cdataset_type.SAB=context | ||
RETURN DISTINCT tdataset_type.name AS dataset_type, pdataset_type.CUI AS CUIDatasetType | ||
} | ||
// Pipeline Decision Rules category | ||
CALL | ||
{ | ||
WITH CUIDatasetType,context | ||
OPTIONAL MATCH (pDatasetType:Concept)-[:has_pdr_category]->(pPDRCategory:Concept)-[:CODE]->(cPDRCategory:Code)-[r:PT]->(tPDRCategory:Term) | ||
WHERE pDatasetType.CUI=CUIDatasetType AND r.CUI=pPDRCategory.CUI AND cPDRCategory.SAB=context | ||
RETURN DISTINCT tPDRCategory.name AS pdr_category | ||
} | ||
// Fig 2 aggregated assay type | ||
CALL | ||
{ | ||
WITH CUIDatasetType,context | ||
// Fig 2 aggregated assay type term linked to the dataset type concept; null when there is no match (OPTIONAL MATCH). | ||
// The CODE relationship is traversed Concept->Code, matching the direction used by the sibling subqueries. | ||
// NOTE(review): assumes CODE edges are only ever stored Concept->Code -- confirm against the graph model. | ||
OPTIONAL MATCH (pDatasetType:Concept)-[:has_fig2_agg_assay_type]->(pFig2agg:Concept)-[:CODE]->(cFig2agg:Code)-[r:PT]->(tFig2agg:Term) | ||
WHERE pDatasetType.CUI=CUIDatasetType AND r.CUI=pFig2agg.CUI AND cFig2agg.SAB=context | ||
RETURN DISTINCT tFig2agg.name AS fig2_aggregated_assaytype | ||
} | ||
// Fig2 modality | ||
CALL | ||
{ | ||
WITH CUIDatasetType,context | ||
// Fig 2 modality term linked to the dataset type concept; null when there is no match (OPTIONAL MATCH). | ||
// The CODE relationship is traversed Concept->Code, matching the direction used by the sibling subqueries. | ||
// NOTE(review): assumes CODE edges are only ever stored Concept->Code -- confirm against the graph model. | ||
OPTIONAL MATCH (pDatasetType:Concept)-[:has_fig2_modality]->(pFig2modality:Concept)-[:CODE]->(cFig2modality:Code)-[r:PT]->(tFig2modality:Term) | ||
WHERE pDatasetType.CUI=CUIDatasetType AND r.CUI=pFig2modality.CUI AND cFig2modality.SAB=context | ||
RETURN DISTINCT tFig2modality.name AS fig2_modality | ||
} | ||
// Fig2 category | ||
CALL | ||
{ | ||
WITH CUIDatasetType,context | ||
OPTIONAL MATCH (pDatasetType:Concept)-[:has_fig2_category]->(pFig2category:Concept)-[:CODE]->(cFig2category:Code)-[r:PT]->(tFig2category:Term) | ||
WHERE pDatasetType.CUI=CUIDatasetType AND r.CUI=pFig2category.CUI AND cFig2category.SAB=context | ||
RETURN DISTINCT tFig2category.name AS fig2_category | ||
} | ||
// description | ||
CALL | ||
{ | ||
WITH CUIRBD,context | ||
OPTIONAL MATCH (pRBD:Concept)-[:has_description]->(pdescription:Concept)-[:CODE]->(cdescription:Code)-[r:PT]->(tdescription:Term) | ||
WHERE pRBD.CUI=CUIRBD AND r.CUI=pdescription.CUI AND cdescription.SAB=context | ||
RETURN DISTINCT REPLACE(tdescription.name,'_description','') AS description | ||
} | ||
// pipeline-shorthand | ||
CALL | ||
{ | ||
WITH CUIRBD,context | ||
OPTIONAL MATCH (pRBD:Concept)-[:has_pipeline_shorthand]->(pshorthand:Concept)-[:CODE]->(cshorthand:Code)-[r:PT]->(tshorthand:Term) | ||
WHERE pRBD.CUI=CUIRBD and r.CUI=pshorthand.CUI AND cshorthand.SAB=context | ||
RETURN DISTINCT tshorthand.name AS pipeline_shorthand | ||
} | ||
// is multi-assay | ||
CALL | ||
{ | ||
WITH CUIRBD, context | ||
OPTIONAL MATCH (pRBD:Concept)-[:isa]->(pMulti:Concept) | ||
WHERE pRBD.CUI=CUIRBD | ||
AND pMulti.CUI = context+':C004033 CUI' | ||
RETURN DISTINCT CASE WHEN pMulti.CUI IS NOT NULL THEN True ELSE False END AS is_multiassay | ||
} | ||
// must_contain | ||
CALL | ||
{ | ||
WITH CUIRBD,context | ||
// Collects the distinct terms reached through must_contain; COLLECT skips nulls, so the list is empty when nothing matches. | ||
// The CODE relationship is traversed Concept->Code, matching the direction used by the sibling subqueries. | ||
// NOTE(review): assumes CODE edges are only ever stored Concept->Code -- confirm against the graph model. | ||
OPTIONAL MATCH (pRBD:Concept)-[:must_contain]->(pDT:Concept)-[:CODE]->(cDT:Code)-[r:PT]->(tDT:Term) | ||
WHERE pRBD.CUI=CUIRBD AND r.CUI=pDT.CUI AND cDT.SAB=context | ||
RETURN COLLECT(DISTINCT tDT.name) AS must_contain | ||
} | ||
// measurement assay CUI | ||
CALL | ||
{ | ||
WITH CUIRBD | ||
OPTIONAL MATCH (pRBD:Concept)-[:has_measurement_assay]->(pMeas:Concept) | ||
WHERE pRBD.CUI=CUIRBD | ||
RETURN DISTINCT pMeas.CUI as CUIMeas | ||
} | ||
// Optional measurement codes | ||
CALL | ||
{ | ||
WITH CUIMeas | ||
MATCH (pMeas:Concept)-[:CODE]->(cMeas:Code)-[:PT]->(tMeas:Term) | ||
WHERE pMeas.CUI = CUIMeas | ||
RETURN COLLECT(DISTINCT {code:cMeas.CodeID,term:tMeas.name}) AS MeasCodes | ||
} | ||
// Whether the measurement assay contains full genetic sequences (contains_full_genetic_sequences) | ||
CALL | ||
{ | ||
WITH CUIMeas,context | ||
// The start node is the measurement-assay concept (matched on CUIMeas), not the rule-based dataset. | ||
// The flag is true only when that concept has a contains edge to the concept with CUI context+':C004009 CUI'. | ||
OPTIONAL MATCH (pMeas:Concept)-[:contains]->(pFullSeq:Concept) | ||
WHERE pMeas.CUI=CUIMeas | ||
AND pFullSeq.CUI = context+':C004009 CUI' | ||
RETURN DISTINCT CASE WHEN pFullSeq.CUI IS NOT NULL THEN true ELSE false END AS contains_full_genetic_sequences | ||
} | ||
// provider | ||
CALL | ||
{ | ||
WITH CUIRBD,context | ||
// Provider term linked to the assay class through has_provider; null when there is no match (OPTIONAL MATCH). | ||
// The CODE relationship is traversed Concept->Code, matching the direction used by the sibling subqueries. | ||
// NOTE(review): assumes CODE edges are only ever stored Concept->Code -- confirm against the graph model. | ||
OPTIONAL MATCH (pRBD:Concept)-[:has_provider]->(pProvider:Concept)-[:CODE]->(cProvider:Code)-[r:PT]->(tProvider:Term) | ||
WHERE pRBD.CUI=CUIRBD AND r.CUI=pProvider.CUI AND cProvider.SAB=context | ||
RETURN DISTINCT tProvider.name AS provider | ||
} | ||
// active status | ||
CALL | ||
{ | ||
WITH CUIRBD,context | ||
OPTIONAL MATCH (pRBD:Concept)-[:has_active_status]->(pStatus:Concept)-[:CODE]->(cStatus:Code)-[r:PT]->(tStatus:Term) | ||
WHERE pRBD.CUI=CUIRBD AND r.CUI=pStatus.CUI and cStatus.SAB=context | ||
RETURN DISTINCT tStatus.name AS active_status | ||
} | ||
CALL | ||
{ | ||
WITH context, CodeRBD, NameRBD, assaytype, dir_schema, tbl_schema, vitessce_hints,process_state,pipeline_shorthand,description,dataset_type,pdr_category,fig2_aggregated_assaytype,fig2_modality,fig2_category,is_multiassay,must_contain,MeasCodes,contains_full_genetic_sequences,provider,active_status | ||
RETURN | ||
{ | ||
rule_description: | ||
{ code:CodeRBD,application_context:context, name:NameRBD | ||
}, | ||
value: | ||
{ | ||
assaytype:assaytype, dir_schema:dir_schema, tbl_schema:tbl_schema, vitessce_hints:vitessce_hints, | ||
process_state:process_state, | ||
pipeline_shorthand:pipeline_shorthand, description:description, | ||
is_multiassay:is_multiassay, must_contain:must_contain, | ||
provider:provider, | ||
active_status:active_status, | ||
dataset_type: | ||
{ | ||
dataset_type:dataset_type, PDR_category:pdr_category, | ||
fig2: | ||
{ | ||
aggregated_assaytype:fig2_aggregated_assaytype, modality:fig2_modality, category:fig2_category | ||
} | ||
}, | ||
measurement_assay:{ | ||
codes:MeasCodes, | ||
contains_full_genetic_sequences:contains_full_genetic_sequences | ||
} | ||
} | ||
} AS rule_based_dataset | ||
} | ||
WITH rule_based_dataset | ||
RETURN rule_based_dataset AS rule_based_datasets |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,96 +1,39 @@ | ||
// Obtains associations between ingest metadata fields and assay dataset types, both for legacy (HMFIELD) and CEDAR. | ||
// Obtains associations between ingest metadata fields and assaytypes, for legacy metadata only (HMFIELD) | ||
// Used by the field-assays endpoint. | ||
|
||
// NOTE: With the deployment of the assay classifier (Rules Engine, or "soft assay types"), the UBKG is no longer the | ||
// source of truth for assay type. This endpoint is primarily for legacy datasets. | ||
|
||
// Identify all metadata fields, from both: | ||
// - legacy sources (the field_*.yaml files in ingest-validation-tools, and modeled in HMFIELD), child codes of HMFIELD:1000 | ||
// - current sources (CEDAR templates, modeled in CEDAR), child codes of CEDAR:TemplateField | ||
// Fields that are in the intersection of HMFIELD and CEDAR share CUIs. | ||
|
||
// Collect the HMFIELD and CEDAR codes for each metadata field to flatten to level of field name. | ||
// Identify all metadata fields from legacy sources (the field_*.yaml files in ingest-validation-tools, modeled in HMFIELD): child codes of HMFIELD:1000 | ||
|
||
// The function that calls this query will replace the variable field_filter. | ||
|
||
CALL | ||
{ | ||
MATCH (cFieldParent:Code)<-[:CODE]-(pFieldParent:Concept)-[:inverse_isa]->(pField:Concept)-[:CODE]->(cField:Code)-[rField:PT]->(tField:Term) | ||
WHERE rField.CUI=pField.CUI | ||
AND cFieldParent.CodeID IN ['HMFIELD:1000','CEDAR:TemplateField'] | ||
AND cFieldParent.CodeID IN ['HMFIELD:1000'] | ||
$field_filter | ||
RETURN tField.name AS field_name, pField.CUI as CUIField, apoc.text.join(COLLECT(DISTINCT cField.CodeID),'|') AS code_ids | ||
RETURN tField.name AS field_name, pField.CUI as CUIField, cField.CodeID as field_code_id | ||
ORDER BY tField.name | ||
} | ||
|
||
// For each field, find the associated assay identifier (originally from field_assays.yaml). | ||
// These identifiers can be one of three types: | ||
// - description | ||
// - data_type | ||
// - alt_name | ||
// These identifiers are cross-referenced to CUIs for codes in the HUBMAP Dataset hierarchy. | ||
// | ||
// - assaytype | ||
// - alt-name | ||
// However, only assaytype is relevant: alt-names were deprecated, and descriptions are no longer current. | ||
|
||
// The function that calls this query will replace the variable assay_type_filter. | ||
// HMFIELD originally mapped fields to the "dataset type" in the older HuBMAP dataset hierarchy, but now maps directly to the HUBMAP | ||
// code for the assaytype. | ||
CALL | ||
{ | ||
WITH CUIField | ||
OPTIONAL MATCH (pField:Concept)-[:used_in_dataset]->(pAssay:Concept)-[:CODE]->(cAssay:Code)-[r:PT]->(tAssay:Term) | ||
WHERE pField.CUI=CUIField AND cAssay.SAB='HMFIELD' | ||
MATCH (pField:Concept)-[:used_in_dataset]->(pAssayType:Concept)-[:CODE]->(cAssayType:Code)-[r:PT]->(tAssayType:Term) | ||
WHERE pField.CUI=CUIField AND cAssayType.SAB='HUBMAP' AND r.CUI=pAssayType.CUI | ||
$assay_type_filter | ||
RETURN DISTINCT cAssay.CodeID AS assay_code_id, | ||
CASE WHEN tAssay.name IS NULL THEN 'none' ELSE tAssay.name END AS assay_identifier | ||
} | ||
|
||
// In HMFIELD, assay identifiers are cross-referenced to CUIs for codes in the HUBMAP "hard" Dataset hierarchy. | ||
// Duplicate assignments are possible--e.g., a HuBMAP dataset is assigned to both a data_type and an alt-name. | ||
// Because a CUI can be the "preferred" CUI for only one code in an ontology, this results in some assay identifiers | ||
// being associated with multiple CUIs--i.e., one to the CUI shared by the HUBMAP dataset code, | ||
// one to a new CUI for the HMFIELD code. | ||
// We want the HUBMAP CUI for each HMFIELD code, regardless of whether it is the "preferred" CUI. | ||
|
||
// SPECIAL CASE: The identifiers for 10x Multiome have so many variants of case and delimiter for | ||
// 10x<delimiter>multiome, so hard-code the CUI mapping. | ||
|
||
CALL | ||
{ | ||
WITH assay_code_id, assay_identifier | ||
OPTIONAL MATCH (pAssay:Concept)-[:CODE]->(cAssay:Code) | ||
WHERE cAssay.CodeID = assay_code_id | ||
AND pAssay.CUI STARTS WITH 'HUBMAP' | ||
RETURN distinct CASE WHEN assay_identifier='10x Multiome' THEN 'HUBMAP:C014002 CUI' ELSE pAssay.CUI END AS CUIHMDataset | ||
RETURN COLLECT(DISTINCT tAssayType.name) AS assaytypes | ||
} | ||
WITH field_name, assaytypes | ||
WHERE assaytypes <>[] | ||
|
||
// For each HuBMAP Dataset, obtain the data_type. | ||
|
||
CALL | ||
{ | ||
WITH CUIHMDataset | ||
OPTIONAL MATCH (pAssay:Concept)-[:has_data_type]->(pDataType:Concept)-[:CODE]->(cDataType:Code)-[r:PT]->(tDataType:Term) | ||
WHERE pAssay.CUI=CUIHMDataset | ||
AND cDataType.SAB ='HUBMAP' | ||
AND r.CUI=pDataType.CUI | ||
RETURN CASE WHEN tDataType.name IS NULL THEN 'none' ELSE tDataType.name END AS data_type | ||
} | ||
|
||
// For each HuBMAP Dataset, obtain the "soft assay" dataset type. | ||
// The "soft assay" dataset type is a member of the Soft Assay Dataset Type hierarchy in HUBMAP, with parent code | ||
// HUBMAP:C003041 | ||
CALL | ||
{ | ||
WITH CUIHMDataset | ||
OPTIONAL MATCH (pAssay:Concept)-[:isa]->(pSoftAssayDatasetType:Concept)-[:isa]->(pSoftAssayDatasetTypeRoot:Concept)-[:CODE]->(cSoftAssayDatasetTypeRoot:Code)-[r:PT]->(tSoftAssayDatasetTypeRoot:Term), | ||
(pSoftAssayDatasetType:Concept)-[:CODE]->(cSoftAssayDatasetType:Code)-[r2:PT]->(tSoftAssayDatasetType:Term) | ||
WHERE pAssay.CUI = CUIHMDataset | ||
AND cSoftAssayDatasetTypeRoot.CodeID='HUBMAP:C003041' | ||
AND r.CUI=pSoftAssayDatasetTypeRoot.CUI | ||
AND r2.CUI=pSoftAssayDatasetType.CUI | ||
RETURN CASE WHEN tSoftAssayDatasetType.name IS NULL THEN 'none' ELSE tSoftAssayDatasetType.name END as dataset_type | ||
} | ||
|
||
// Collect assay_identifier, data_type, and dataset_type into a delimited list to flatten to level of field name. | ||
// The function that calls this query will replace the variable data_type_dataset_type_filters. | ||
|
||
WITH field_name, code_ids, assay_identifier, data_type, dataset_type | ||
$data_type_dataset_type_filters | ||
RETURN field_name, code_ids, COLLECT(DISTINCT assay_identifier + '|' + data_type + '|' + dataset_type) AS assays | ||
ORDER BY field_name | ||
RETURN {fields:COLLECT(DISTINCT {field_name: field_name, assaytypes:assaytypes})} AS fieldassays |
Oops, something went wrong.