Skip to content

Commit

Permalink
Merge pull request #106 from x-atlas-consortia/jas_assayclassifier
Browse files Browse the repository at this point in the history
Jas assayclassifier
  • Loading branch information
yuanzhou authored Aug 1, 2024
2 parents 3e86e82 + 96a3df6 commit b8aa504
Show file tree
Hide file tree
Showing 23 changed files with 1,244 additions and 524 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,6 @@ BUILD

**/__pycache__

/tests/*/*.out
/test/*/*.out
/src/cells_index/*.csv
/src/cells_index/*.tsv
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.0.4
2.1.0
513 changes: 245 additions & 268 deletions hs-ontology-api-spec.yaml

Large diffs are not rendered by default.

215 changes: 215 additions & 0 deletions src/hs_ontology_api/cypher/assayclass.cypher
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
// Called by the assayclassifier endpoint.

// Return information on rule-based datasets--i.e., the datasets specified in the Rule Engine's testing rule chain.

// Obtain identifiers for rule-based datasets (assay classes) for the application context.
// The assayclass_filter allows filtering by either the UBKG code or term (rule_description)
// for the assay class.
// The context value is supplied by the caller as a quoted string literal; concept CUIs for the
// context are built below by concatenation as '<context>:<code> CUI'.
// Rule-based datasets are the concepts with an isa relationship to the concept <context>:C000004.
// Only the PT relationship whose CUI property equals the concept's own CUI is used to obtain the term.
// NOTE(review): the assayclass_filter placeholder sits inside the WHERE clause and is replaced by the
// caller before execution; presumably the injected text refers to cRBD / tRBD by name -- confirm
// before renaming any variable in this subquery.
// NOTE(review): the ORDER BY applies inside this subquery; whether the order survives the later
// subqueries is not guaranteed by this query text -- confirm if callers rely on it.
WITH '$context' AS context
CALL
{
WITH context
MATCH (p:Concept)<-[:isa]-(pRBD:Concept)-[:CODE]->(cRBD:Code)-[r:PT]->(tRBD:Term)
WHERE p.CUI = context+':C000004 CUI'
AND r.CUI=pRBD.CUI
$assayclass_filter
RETURN pRBD.CUI AS CUIRBD,cRBD.CODE AS CodeRBD,tRBD.name AS NameRBD
ORDER BY pRBD.CUI
}
// assaytype
// Required (non-optional) match: a rule-based dataset that has no has_assaytype term in this
// context, or whose assaytype is excluded by the assaytype_filter placeholder, produces no row here
// and is therefore dropped from the overall result.
// Every occurrence of the substring '_assaytype' is removed from the term name before it is returned.
// NOTE(review): the assaytype_filter placeholder is replaced by the caller before execution;
// presumably the injected text refers to tassaytype (or another variable of this subquery) by
// name -- confirm before renaming.
CALL
{
WITH CUIRBD, context
MATCH (pRBD:Concept)-[:has_assaytype]->(passaytype:Concept)-[:CODE]->(cassaytype:Code)-[r:PT]->(tassaytype:Term)
WHERE pRBD.CUI=CUIRBD
$assaytype_filter
AND r.CUI=passaytype.CUI and cassaytype.SAB=context
RETURN DISTINCT REPLACE(tassaytype.name,'_assaytype','') AS assaytype
}
// dir-schema
// Optional: when the rule-based dataset has no has_dir_schema term in this context, the subquery
// still returns one row with dir_schema = null, so the dataset is kept.
CALL
{
    WITH CUIRBD, context
    // CODE is traversed Concept->Code, matching the direction used by the other subqueries of
    // this query (the pattern previously left this relationship undirected).
    OPTIONAL MATCH (pRBD:Concept)-[:has_dir_schema]->(pdir_schema:Concept)-[:CODE]->(cdir_schema:Code)-[r:PT]->(tdir_schema:Term)
    // r.CUI = pdir_schema.CUI keeps only the PT relationship recorded for this concept.
    WHERE pRBD.CUI=CUIRBD AND r.CUI=pdir_schema.CUI AND cdir_schema.SAB=context
    RETURN DISTINCT tdir_schema.name AS dir_schema
}
// tbl-schema
// Term name reached from the rule-based dataset via has_tbl_schema, restricted to codes whose SAB
// is the application context. Returns null (one row) when nothing matches.
CALL
{
    WITH CUIRBD, context
    OPTIONAL MATCH (pRule:Concept {CUI: CUIRBD})
                   -[:has_tbl_schema]->(pSchema:Concept)
                   -[:CODE]->(cSchema:Code {SAB: context})
                   -[rTerm:PT]->(tSchema:Term)
    // Keep only the PT relationship recorded for the schema concept itself.
    WHERE rTerm.CUI = pSchema.CUI
    RETURN DISTINCT tSchema.name AS tbl_schema
}
// vitessce_hints
// Collects the distinct hint term names for the rule-based dataset into a list (empty list when
// there are none). The substring '_vitessce_hint' is removed from each name, so a term such as
// 'rna_vitessce_hint' is returned as 'rna'.
CALL
{
    WITH CUIRBD, context
    OPTIONAL MATCH (pRule:Concept {CUI: CUIRBD})
                   -[:has_vitessce_hint]->(pHint:Concept)
                   -[:CODE]->(cHint:Code {SAB: context})
                   -[rTerm:PT]->(tHint:Term)
    // Keep only the PT relationship recorded for the hint concept itself.
    WHERE rTerm.CUI = pHint.CUI
    RETURN COLLECT(DISTINCT REPLACE(tHint.name, '_vitessce_hint', '')) AS vitessce_hints
}
// process state. The process_state_filter allows for filtering to just primary or derived assay classes.
// Required (non-optional) match: a rule-based dataset with no has_process_state concept under the
// parent concept <context>:C004002, or one excluded by the process_state_filter placeholder,
// produces no row here and is therefore dropped from the overall result.
// NOTE(review): the process_state_filter placeholder is replaced by the caller before execution;
// presumably the injected text refers to tdsProcess (or another variable of this subquery) by
// name -- confirm before renaming.
CALL
{
WITH CUIRBD,context
MATCH (pRBD:Concept)-[:has_process_state]->(pdsProcess:Concept)-[:isa]->(pProcessParent:Concept),
(pdsProcess:Concept)-[:CODE]->(cdsProcess:Code)-[r:PT]->(tdsProcess:Term)
WHERE pRBD.CUI=CUIRBD
AND pProcessParent.CUI = context+':C004002 CUI'
AND r.CUI=pdsProcess.CUI
AND cdsProcess.SAB=context
$process_state_filter
RETURN tdsProcess.name as process_state
}
// dataset_type
// The dataset_type concepts in HUBMAP are cross-referenced to HRAVS concepts, but the HRAVS terms
// are enclosed in a list, so only terms from codes whose SAB is the application context are used.
// Also returns the CUI of the dataset type concept (CUIDatasetType), which the subqueries for the
// PDR category and the Fig2 properties use as their starting point.
CALL
{
    WITH CUIRBD, context
    OPTIONAL MATCH (pRule:Concept {CUI: CUIRBD})
                   -[:has_dataset_type]->(pType:Concept)
                   -[:CODE]->(cType:Code {SAB: context})
                   -[rTerm:PT]->(tType:Term)
    // Keep only the PT relationship recorded for the dataset type concept itself.
    WHERE rTerm.CUI = pType.CUI
    RETURN DISTINCT tType.name AS dataset_type,
                    pType.CUI AS CUIDatasetType
}
// Pipeline Decision Rules category
// Looked up from the dataset type concept (not from the rule-based dataset). Returns null when the
// dataset type is null or has no has_pdr_category term in this context.
CALL
{
    WITH CUIDatasetType, context
    OPTIONAL MATCH (pType:Concept {CUI: CUIDatasetType})
                   -[:has_pdr_category]->(pCategory:Concept)
                   -[:CODE]->(cCategory:Code {SAB: context})
                   -[rTerm:PT]->(tCategory:Term)
    // Keep only the PT relationship recorded for the category concept itself.
    WHERE rTerm.CUI = pCategory.CUI
    RETURN DISTINCT tCategory.name AS pdr_category
}
// Fig 2 aggregated assay type
// Looked up from the dataset type concept. Returns null (one row) when the dataset type is null or
// has no has_fig2_agg_assay_type term in this context.
CALL
{
    WITH CUIDatasetType, context
    // CODE is traversed Concept->Code, matching the direction used by the other subqueries of
    // this query (the pattern previously left this relationship undirected).
    OPTIONAL MATCH (pDatasetType:Concept)-[:has_fig2_agg_assay_type]->(pFig2agg:Concept)-[:CODE]->(cFig2agg:Code)-[r:PT]->(tFig2agg:Term)
    // r.CUI = pFig2agg.CUI keeps only the PT relationship recorded for this concept.
    WHERE pDatasetType.CUI=CUIDatasetType AND r.CUI=pFig2agg.CUI AND cFig2agg.SAB=context
    RETURN DISTINCT tFig2agg.name AS fig2_aggregated_assaytype
}
// Fig2 modality
// Looked up from the dataset type concept. Returns null (one row) when the dataset type is null or
// has no has_fig2_modality term in this context.
CALL
{
    WITH CUIDatasetType, context
    // CODE is traversed Concept->Code, matching the direction used by the other subqueries of
    // this query (the pattern previously left this relationship undirected).
    OPTIONAL MATCH (pDatasetType:Concept)-[:has_fig2_modality]->(pFig2modality:Concept)-[:CODE]->(cFig2modality:Code)-[r:PT]->(tFig2modality:Term)
    // r.CUI = pFig2modality.CUI keeps only the PT relationship recorded for this concept.
    WHERE pDatasetType.CUI=CUIDatasetType AND r.CUI=pFig2modality.CUI AND cFig2modality.SAB=context
    RETURN DISTINCT tFig2modality.name AS fig2_modality
}
// Fig2 category
// Looked up from the dataset type concept. Returns null when the dataset type is null or has no
// has_fig2_category term in this context.
CALL
{
    WITH CUIDatasetType, context
    OPTIONAL MATCH (pType:Concept {CUI: CUIDatasetType})
                   -[:has_fig2_category]->(pCategory:Concept)
                   -[:CODE]->(cCategory:Code {SAB: context})
                   -[rTerm:PT]->(tCategory:Term)
    // Keep only the PT relationship recorded for the category concept itself.
    WHERE rTerm.CUI = pCategory.CUI
    RETURN DISTINCT tCategory.name AS fig2_category
}
// description
// Term name reached from the rule-based dataset via has_description, with every occurrence of the
// substring '_description' removed. Returns null when nothing matches.
CALL
{
    WITH CUIRBD, context
    OPTIONAL MATCH (pRule:Concept {CUI: CUIRBD})
                   -[:has_description]->(pDesc:Concept)
                   -[:CODE]->(cDesc:Code {SAB: context})
                   -[rTerm:PT]->(tDesc:Term)
    // Keep only the PT relationship recorded for the description concept itself.
    WHERE rTerm.CUI = pDesc.CUI
    RETURN DISTINCT REPLACE(tDesc.name, '_description', '') AS description
}
// pipeline-shorthand
// Term name reached from the rule-based dataset via has_pipeline_shorthand, restricted to codes
// whose SAB is the application context. Returns null when nothing matches.
CALL
{
    WITH CUIRBD, context
    OPTIONAL MATCH (pRule:Concept {CUI: CUIRBD})
                   -[:has_pipeline_shorthand]->(pShort:Concept)
                   -[:CODE]->(cShort:Code {SAB: context})
                   -[rTerm:PT]->(tShort:Term)
    // Keep only the PT relationship recorded for the shorthand concept itself.
    WHERE rTerm.CUI = pShort.CUI
    RETURN DISTINCT tShort.name AS pipeline_shorthand
}
// is multi-assay
// True when the rule-based dataset has an isa relationship to the concept <context>:C004033,
// false otherwise.
CALL
{
    WITH CUIRBD, context
    OPTIONAL MATCH (pRule:Concept {CUI: CUIRBD})
                   -[:isa]->(pMultiParent:Concept {CUI: context + ':C004033 CUI'})
    // The optional match leaves pMultiParent null when there is no such relationship, so the
    // null test alone yields the boolean.
    RETURN DISTINCT (pMultiParent.CUI IS NOT NULL) AS is_multiassay
}
// must_contain
// Collects the distinct term names of the concepts linked from the rule-based dataset by
// must_contain into a list; the list is empty when there are none.
CALL
{
    WITH CUIRBD, context
    // CODE is traversed Concept->Code, matching the direction used by the other subqueries of
    // this query (the pattern previously left this relationship undirected).
    OPTIONAL MATCH (pRBD:Concept)-[:must_contain]->(pDT:Concept)-[:CODE]->(cDT:Code)-[r:PT]->(tDT:Term)
    // r.CUI = pDT.CUI keeps only the PT relationship recorded for this concept.
    WHERE pRBD.CUI=CUIRBD AND r.CUI=pDT.CUI AND cDT.SAB=context
    RETURN COLLECT(DISTINCT tDT.name) AS must_contain
}
// measurement assay CUI
// CUI of the concept linked from the rule-based dataset by has_measurement_assay, or null when
// there is none. The two measurement subqueries that follow start from this CUI.
CALL
{
    WITH CUIRBD
    OPTIONAL MATCH (pRule:Concept {CUI: CUIRBD})
                   -[:has_measurement_assay]->(pMeasurement:Concept)
    RETURN DISTINCT pMeasurement.CUI AS CUIMeas
}
// Optional measurement codes
// Collects the distinct {code, term} pairs for every code of the measurement assay concept.
// The match itself is not optional, but because the only returned column is an aggregate the
// subquery still yields one row (with an empty list) when CUIMeas is null or nothing matches.
// Unlike the other subqueries, codes are not restricted to the application context and the PT
// relationship is not restricted to the concept's own CUI.
CALL
{
    WITH CUIMeas
    MATCH (pMeasurement:Concept {CUI: CUIMeas})
          -[:CODE]->(cMeasurement:Code)
          -[:PT]->(tMeasurement:Term)
    RETURN COLLECT(DISTINCT {code: cMeasurement.CodeID, term: tMeasurement.name}) AS MeasCodes
}
// whether the measurement assay contains full genetic sequences
// True when the measurement assay concept has a contains relationship to the concept
// <context>:C004009, false otherwise (including when CUIMeas is null).
CALL
{
    WITH CUIMeas, context
    OPTIONAL MATCH (pMeasurement:Concept {CUI: CUIMeas})
                   -[:contains]->(pSequences:Concept {CUI: context + ':C004009 CUI'})
    // The optional match leaves pSequences null when there is no such relationship, so the
    // null test alone yields the boolean.
    RETURN DISTINCT (pSequences.CUI IS NOT NULL) AS contains_full_genetic_sequences
}
// provider
// Optional: when the rule-based dataset has no has_provider term in this context, the subquery
// still returns one row with provider = null, so the dataset is kept.
CALL
{
    WITH CUIRBD, context
    // CODE is traversed Concept->Code, matching the direction used by the other subqueries of
    // this query (the pattern previously left this relationship undirected).
    OPTIONAL MATCH (pRBD:Concept)-[:has_provider]->(pProvider:Concept)-[:CODE]->(cProvider:Code)-[r:PT]->(tProvider:Term)
    // r.CUI = pProvider.CUI keeps only the PT relationship recorded for this concept.
    WHERE pRBD.CUI=CUIRBD AND r.CUI=pProvider.CUI AND cProvider.SAB=context
    RETURN DISTINCT tProvider.name AS provider
}
// active status
// Term name reached from the rule-based dataset via has_active_status, restricted to codes whose
// SAB is the application context. Returns null when nothing matches.
CALL
{
    WITH CUIRBD, context
    OPTIONAL MATCH (pRule:Concept {CUI: CUIRBD})
                   -[:has_active_status]->(pActive:Concept)
                   -[:CODE]->(cActive:Code {SAB: context})
                   -[rTerm:PT]->(tActive:Term)
    // Keep only the PT relationship recorded for the status concept itself.
    WHERE rTerm.CUI = pActive.CUI
    RETURN DISTINCT tActive.name AS active_status
}
// Assemble the response: one map per row, returned in the column rule_based_datasets.
//   rule_description - code, application context and name of the rule-based dataset
//   value            - the properties gathered by the subqueries above, with the dataset type
//                      properties and the measurement assay properties grouped in nested maps
WITH
{
    rule_description:
    {
        code: CodeRBD,
        application_context: context,
        name: NameRBD
    },
    value:
    {
        assaytype: assaytype,
        dir_schema: dir_schema,
        tbl_schema: tbl_schema,
        vitessce_hints: vitessce_hints,
        process_state: process_state,
        pipeline_shorthand: pipeline_shorthand,
        description: description,
        is_multiassay: is_multiassay,
        must_contain: must_contain,
        provider: provider,
        active_status: active_status,
        dataset_type:
        {
            dataset_type: dataset_type,
            PDR_category: pdr_category,
            fig2:
            {
                aggregated_assaytype: fig2_aggregated_assaytype,
                modality: fig2_modality,
                category: fig2_category
            }
        },
        measurement_assay:
        {
            codes: MeasCodes,
            contains_full_genetic_sequences: contains_full_genetic_sequences
        }
    }
} AS rule_based_dataset
RETURN rule_based_dataset AS rule_based_datasets
89 changes: 16 additions & 73 deletions src/hs_ontology_api/cypher/fieldassays.cypher
Original file line number Diff line number Diff line change
@@ -1,96 +1,39 @@
// Obtains associations between ingest metadata fields and assay dataset types, both for legacy (HMFIELD) and CEDAR.
// Obtains associations between ingest metadata fields and assaytypes, for legacy metadata only (HMFIELD)
// Used by the field-assays endpoint.

// NOTE: With the deployment of the assay classifier (Rules Engine, or "soft assay types"), the UBKG is no longer the
// source of truth for assay type. This endpoint is primarily for legacy datasets.

// Identify all metadata fields, from both:
// - legacy sources (the field_*.yaml files in ingest-validation-tools, and modeled in HMFIELD), child codes of HMFIELD:1000
// - current sources (CEDAR templates, modeled in CEDAR), child codes of CEDAR:TemplateField
// Fields that are in the intersection of HMFIELD and CEDAR share CUIs.

// Collect the HMFIELD and CEDAR codes for each metadata field to flatten to level of field name.
// Identify all metadata fields, from legacy sources (the field_*.yaml files in ingest-validation-tools, and modeled in HMFIELD), child codes of HMFIELD:1000

// The function that calls this query will replace the variable field_filter.

CALL
{
MATCH (cFieldParent:Code)<-[:CODE]-(pFieldParent:Concept)-[:inverse_isa]->(pField:Concept)-[:CODE]->(cField:Code)-[rField:PT]->(tField:Term)
WHERE rField.CUI=pField.CUI
AND cFieldParent.CodeID IN ['HMFIELD:1000','CEDAR:TemplateField']
AND cFieldParent.CodeID IN ['HMFIELD:1000']
$field_filter
RETURN tField.name AS field_name, pField.CUI as CUIField, apoc.text.join(COLLECT(DISTINCT cField.CodeID),'|') AS code_ids
RETURN tField.name AS field_name, pField.CUI as CUIField, cField.CodeID as field_code_id
ORDER BY tField.name
}

// For each field, find the associated assay identifier (originally from field_assays.yaml).
// These identifiers can be one of three types:
// - description
// - data_type
// - alt_name
// These identifiers are cross-referenced to CUIs for codes in the HUBMAP Dataset hierarchy.
//
// - assaytype
// - alt-name
// However, only assaytype is relevant: alt-names were deprecated, and descriptions are no longer current.

// The function that calls this query will replace the variable assay_type_filter.
// HMFIELD originally mapped fields to the "dataset type" in the older HuBMAP dataset hierarchy, but now maps directly to the HUBMAP
// code for the assaytype.
CALL
{
WITH CUIField
OPTIONAL MATCH (pField:Concept)-[:used_in_dataset]->(pAssay:Concept)-[:CODE]->(cAssay:Code)-[r:PT]->(tAssay:Term)
WHERE pField.CUI=CUIField AND cAssay.SAB='HMFIELD'
MATCH (pField:Concept)-[:used_in_dataset]->(pAssayType:Concept)-[:CODE]->(cAssayType:Code)-[r:PT]->(tAssayType:Term)
WHERE pField.CUI=CUIField AND cAssayType.SAB='HUBMAP' AND r.CUI=pAssayType.CUI
$assay_type_filter
RETURN DISTINCT cAssay.CodeID AS assay_code_id,
CASE WHEN tAssay.name IS NULL THEN 'none' ELSE tAssay.name END AS assay_identifier
}

// In HMFIELD, assay identifiers are cross-referenced to CUIs for codes in the HUBMAP "hard" Dataset hierarchy.
// Duplicate assignments are possible--e.g., a HuBMAP dataset is assigned to both a data_type and an alt-name.
// Because a CUI can be the "preferred" CUI for only one code in an ontology, this results in some assay identifiers
// being associated with multiple CUIs--i.e., one to the CUI shared by the HUBMAP dataset code,
// one to a new CUI for the HMFIELD code.
// We want the HUBMAP CUI for each HMFIELD code, regardless of whether it is the "preferred" CUI.

// SPECIAL CASE: The identifiers for 10x Multiome have so many variants of case and delimiter for
// 10x<delimiter>multiome, so hard-code the CUI mapping.

CALL
{
WITH assay_code_id, assay_identifier
OPTIONAL MATCH (pAssay:Concept)-[:CODE]->(cAssay:Code)
WHERE cAssay.CodeID = assay_code_id
AND pAssay.CUI STARTS WITH 'HUBMAP'
RETURN distinct CASE WHEN assay_identifier='10x Multiome' THEN 'HUBMAP:C014002 CUI' ELSE pAssay.CUI END AS CUIHMDataset
RETURN COLLECT(DISTINCT tAssayType.name) AS assaytypes
}
WITH field_name, assaytypes
WHERE assaytypes <>[]

// For each HuBMAP Dataset, obtain the data_type.

CALL
{
WITH CUIHMDataset
OPTIONAL MATCH (pAssay:Concept)-[:has_data_type]->(pDataType:Concept)-[:CODE]->(cDataType:Code)-[r:PT]->(tDataType:Term)
WHERE pAssay.CUI=CUIHMDataset
AND cDataType.SAB ='HUBMAP'
AND r.CUI=pDataType.CUI
RETURN CASE WHEN tDataType.name IS NULL THEN 'none' ELSE tDataType.name END AS data_type
}

// For each HuBMAP Dataset, obtain the "soft assay" dataset type.
// The "soft assay" dataset type is a member of the Soft Assay Dataset Type hierarchy in HUBMAP, with parent code
// HUBMAP:C003041
CALL
{
WITH CUIHMDataset
OPTIONAL MATCH (pAssay:Concept)-[:isa]->(pSoftAssayDatasetType:Concept)-[:isa]->(pSoftAssayDatasetTypeRoot:Concept)-[:CODE]->(cSoftAssayDatasetTypeRoot:Code)-[r:PT]->(tSoftAssayDatasetTypeRoot:Term),
(pSoftAssayDatasetType:Concept)-[:CODE]->(cSoftAssayDatasetType:Code)-[r2:PT]->(tSoftAssayDatasetType:Term)
WHERE pAssay.CUI = CUIHMDataset
AND cSoftAssayDatasetTypeRoot.CodeID='HUBMAP:C003041'
AND r.CUI=pSoftAssayDatasetTypeRoot.CUI
AND r2.CUI=pSoftAssayDatasetType.CUI
RETURN CASE WHEN tSoftAssayDatasetType.name IS NULL THEN 'none' ELSE tSoftAssayDatasetType.name END as dataset_type
}

// Collect assay_identifier, data_type, and dataset_type into a delimited list to flatten to level of field name.
// The function that calls this query will replace the variable data_type_dataset_type_filters.

WITH field_name, code_ids, assay_identifier, data_type, dataset_type
$data_type_dataset_type_filters
RETURN field_name, code_ids, COLLECT(DISTINCT assay_identifier + '|' + data_type + '|' + dataset_type) AS assays
ORDER BY field_name
RETURN {fields:COLLECT(DISTINCT {field_name: field_name, assaytypes:assaytypes})} AS fieldassays
Loading

0 comments on commit b8aa504

Please sign in to comment.