-
Notifications
You must be signed in to change notification settings - Fork 0
/
processEMTAB.py
161 lines (106 loc) · 5.44 KB
/
processEMTAB.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import argparse
import os
import numpy as np
import pandas as pd
import scanpy as sc
from pathlib import Path
# Heavily inspired by steps detailed in:
# https://scanpy-tutorials.readthedocs.io/en/latest/pbmc3k.html#Preprocessing
def get_parser() -> argparse.ArgumentParser:
'''
:return: an argparse ArgumentParser object for parsing command
line parameters
'''
parser = argparse.ArgumentParser(
description='Run pathway reconstruction pipeline.')
parser.add_argument('--emtab', default='E-MTAB-6819', help='E-MTAB id number. Has to be one from: http://www.nxn.se/single-cell-studies/gui')
parser.add_argument('--org', default='hsapiens', help='Organism name. Has to be either hsapiens or mmusculus.')
parser.add_argument('--outPath', default='inputs/datasets/', help='Expression file name for writing output.')
parser.add_argument('--hvg', type=int, default=500, help='Number of highly varying genes.')
parser.add_argument('--minGenes', type=int, default=200, help='Cutoff for cells. Keeps only cells with at least minGenes expressed.')
parser.add_argument('--minCells', type=int, default=3, help='Cutoff for genes. Keeps only genes which are expressed in at least minCells.')
return parser
def parse_arguments():
'''
Initialize a parser and use it to parse the command line arguments
:return: parsed dictionary of command line arguments
'''
parser = get_parser()
opts = parser.parse_args()
return opts
def main():
opts = parse_arguments()
# Make a temporary data/ directory where scanpy
# stores the downloaded data
# TODO: see if this can be passed as as argument
Path.mkdir(Path('data/'), exist_ok=True)
eData = sc.datasets.ebi_expression_atlas(opts.emtab)
# once this step is complete, scanpy will read in the data from
fName = f'data/{opts.emtab}/{opts.emtab}.h5ad'
try:
adata = sc.read_h5ad(fName)
except:
print(f'Unknown error. Check if {fName} exists.\n')
# Main preprocessing check//
# check if gene names used are ensembl id: i.e., ENSG or ENSMUSG
# else, I'm not sure if it is safe to assume anything about it
# quit, if not ensembl id.
if opts.org == 'hsapiens':
if not 'ENSG' in adata.var.index[0]:
print("Gene names in annData are not ensembl ids. Quitting.\n")
sys.exit()
elif opts.org == 'mmusculus':
if not 'ENSMUSG' in adata.var.index[0]:
print("Gene names in annData are not ensembl ids. Quitting.\n")
sys.exit()
else:
print("Organism name should either be hsapiens or mmusculus. Quitting.\n")
sys.exit()
# Keep only unique gene names
adata.var_names_make_unique()
# Get annotations from biomart
annot = sc.queries.biomart_annotations(
opts.org,
["ensembl_gene_id", "external_gene_name","start_position", "end_position", "chromosome_name"],
use_cache=True).set_index("ensembl_gene_id")
print(f"\nInput raw data has: {str(adata.shape[0])} cells and {str(adata.shape[1])} genes\n")
# Copy annotations
# Necessary for writing gene names at the end
adata.var[annot.columns] = annot
# Some minimal filtering.
sc.pp.filter_cells(adata, min_genes=opts.minGenes)
sc.pp.filter_genes(adata, min_cells=opts.minCells)
print(f"\nNumber of cells and genes after initial filtering: {str(adata.shape[0])} cells and {str(adata.shape[1])} genes\n")
# Step needed for filtering out MT genes
adata.var['mt'] = adata.var.chromosome_name == 'MT' # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
# Normalization
sc.pp.normalize_total(adata, target_sum=1e4)
# Convert to log counts
sc.pp.log1p(adata)
# Compute highly varying genes
sc.pp.highly_variable_genes(adata, n_top_genes = opts.hvg)
# Keep only highly varying genes
adata = adata[:, adata.var.highly_variable]
print(f"\nAfter applying highly varying genes filter: {str(adata.shape[0])} cells and {str(adata.shape[1])} genes\n")
# Regress out effects of total counts per cell and the percentage of
# mitochondrial genes expressed. Scale the data to unit variance.
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
sc.pp.scale(adata, max_value=10)
aDF = adata.to_df()
# Convert gene names to ensembl ids
presentGenes = annot.loc[annot.index.intersection(aDF.columns)].external_gene_name.dropna().to_dict()
aDFNew = aDF.rename(columns=presentGenes, inplace=False)
aDFNew = aDFNew.T.drop_duplicates()
aDFNew = aDFNew[~aDFNew.index.duplicated(keep='first')]
print(f"\nNumber of cells and genes after converting from Ensembl to gene IDs: {str(aDFNew.shape[1])} cells and {str(aDFNew.shape[0])} genes\n")
outPath = os.path.join(opts.outPath, opts.emtab+'-'+str(opts.hvg), opts.emtab+'-'+str(opts.hvg)+'.csv')
print(f"Writing output to: {outPath}")
os.makedirs(Path('/'.join(outPath.split('/')[:-1])), exist_ok=True)
aDFNew.to_csv(outPath)
print("\nDone\n")
if __name__ == '__main__':
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')
main()