-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
gene_list endpoint that obtains genes from UBKG instead of the Cells API; utility to extract index information from the Cells API.
- Loading branch information
1 parent
3952eed
commit a3a58c9
Showing
12 changed files
with
430 additions
and
94 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
*.csv | ||
*.tsv |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
# coding: utf-8 | ||
|
||
# Prototype utility that builds a tab-separated (TSV) file of information extracted from the Cells API. | ||
|
||
import logging | ||
import csv | ||
import os | ||
|
||
from hubmap_api_py_client import Client | ||
from hubmap_api_py_client.errors import ClientError | ||
|
||
# Script body: queries the Cells API for datasets, cells and genes, then writes
# one tab-separated row per distinct (gene, dataset, organ, cell type)
# combination found for each gene of interest.

logging.basicConfig(format='[%(asctime)s] %(levelname)s in %(module)s:%(lineno)d: %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info('Getting information from Cells API...')

# Instantiate hubmap-api-py-client.
client_url = 'https://cells.dev.hubmapconsortium.org/api/'
client = Client(client_url)

# Output location: <parent of current working directory>/cells_index/cells.tsv.
fpath = os.path.dirname(os.getcwd())
fpath = os.path.join(fpath, 'cells_index/cells.tsv')

# Load dataset and cell information using Cells API.

# 1. All datasets
datasets = client.select_datasets()
logger.info(f'{len(datasets)} datasets')

# 2. All cells in datasets.
datasets = datasets.get_list()
dataset_uuids = [d['uuid'] for d in datasets]

cells_in_datasets = client.select_cells(where='dataset', has=dataset_uuids)
logger.info(f'{len(cells_in_datasets)} cells in datasets')

# 3. All genes
genes = client.select_genes().get_list()
logger.info(f'{len(genes)} genes')

# DEBUG - use set of test genes for prototype.
# NOTE(review): this override discards the full gene list loaded above; remove
# it to index every gene.
genes = [{'gene_symbol': 'MMRN1'}]

# Open the TSV only once the API data has been loaded, so that a failure above
# does not truncate an existing output file. The context manager guarantees the
# file is flushed and closed even if an unexpected error interrupts the loop.
with open(fpath, 'w', newline='') as csvfile:
    cellwriter = csv.writer(csvfile, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    cellwriter.writerow(['gene_symbol', 'dataset_uuid', 'organ', 'cell_type'])

    # Check every gene for presence in cells in datasets.
    for gene in genes:
        gene_symbol = gene['gene_symbol']
        try:
            # Find cells with the gene, and intersect with cells from datasets to find
            # cells with the gene in datasets.
            logger.info(f'Looking for cells with gene {gene_symbol}, rna')
            cells_with_gene_rna = client.select_cells(
                where='gene', has=[f'{gene_symbol} > 1'], genomic_modality='rna')
            logger.info(f'Looking for cells with gene {gene_symbol}, atac')
            cells_with_gene_atac = client.select_cells(
                where='gene', has=[f'{gene_symbol} > 1'], genomic_modality='atac')

            # Cells from all modalities
            cells_with_gene = cells_with_gene_rna | cells_with_gene_atac

            # Cells with gene in datasets
            cells_with_gene_in_datasets = cells_with_gene & cells_in_datasets
            cells_list = cells_with_gene_in_datasets.get_list()

            # Write each distinct combination of cell type, dataset and organ once
            # for this gene. Keying on the full combination (rather than on cell
            # type alone) keeps a cell type that occurs in several datasets from
            # being reported for only the first dataset encountered.
            # Assumes the three values are hashable (e.g. strings or None).
            seen_combinations = set()
            for c in cells_list:
                cell_type = c['cell_type']
                dataset_uuid = c['dataset']
                organ = c['organ']
                combination = (cell_type, dataset_uuid, organ)
                if combination in seen_combinations:
                    continue
                seen_combinations.add(combination)
                cellwriter.writerow([gene_symbol, dataset_uuid, organ, cell_type])

        except ClientError as err:
            # The genes list contains elements that are not actually genes, and that
            # result in errors from the client that are meaningless in this context.
            # Skip the element, but leave a trace at DEBUG level instead of
            # discarding the error silently.
            logger.debug('Skipping %s: %s', gene_symbol, err)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
// GENES LIST
// Returns one row per HGNC gene (and RefSeq definition) in the UBKG with its
// approved symbol, approved name and description.
// Used by the genes_list endpoint.

CALL
{
    // HGNC codes whose concept has a REFSEQ definition, together with their
    // preferred-term (PT) and acronym (ACR) terms. Each term is emitted as a
    // key/value pair so that it can be pivoted into columns below.
    OPTIONAL MATCH (tGene:Term)<-[r]-(cGene:Code)<-[:CODE]-(pGene:Concept)-[:DEF]->(d:Definition)
    WHERE r.CUI=pGene.CUI
      AND cGene.SAB='HGNC'
      AND d.SAB='REFSEQ'
      AND type(r) IN ['PT','ACR']
    RETURN toInteger(cGene.CODE) AS hgnc_id,
           CASE type(r)
               WHEN 'PT' THEN 'approved_name'
               WHEN 'ACR' THEN 'approved_symbol'
               ELSE type(r)
           END AS ret_key,
           tGene.name AS ret_value,
           d.DEF AS description
    ORDER BY hgnc_id, ret_key
}
// Pivot approved_name and approved_symbol.
// Values are collected per key first, so each entry of the resulting map is a list.
WITH hgnc_id, ret_key, COLLECT(ret_value) AS ret_values, description
WITH hgnc_id, apoc.map.fromLists(COLLECT(ret_key), COLLECT(ret_values)) AS gene_map, description
// The OPTIONAL MATCH yields a single all-null row when nothing matches; drop it.
WHERE hgnc_id IS NOT NULL
RETURN hgnc_id,
       gene_map['approved_symbol'] AS approved_symbol,
       gene_map['approved_name'] AS approved_name,
       description
ORDER BY approved_symbol
// Pagination parameters to be added by calling function.
SKIP $skiprows
LIMIT $limitrows
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
// Returns count of HGNC genes with RefSeq definitions in UBKG.
// The definition is restricted to SAB 'REFSEQ', the same filter the genes list
// query applies; without it, genes whose concept only has definitions from
// other sources would be counted but never listed.
// NOTE(review): presumably this count drives pagination of the genes list - confirm with the caller.
MATCH (cGene:Code)<-[:CODE]-(pGene:Concept)-[:DEF]->(d:Definition)
WHERE cGene.SAB='HGNC'
  AND d.SAB='REFSEQ'
RETURN COUNT(DISTINCT cGene) AS genelistcount
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
|
Oops, something went wrong.