Skip to content

Commit

Permalink
Finetuned GTEx query
Browse files Browse the repository at this point in the history
  • Loading branch information
tayaza committed Oct 17, 2017
1 parent 02ded34 commit 95bedd5
Show file tree
Hide file tree
Showing 2 changed files with 157 additions and 34 deletions.
179 changes: 150 additions & 29 deletions codes3d/codes3d.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from sets import Set
from wikipathways_api_client import WikipathwaysApiClient
import argparse,ast,bisect,configparser,csv,json,multiprocessing,os,pandas,pybedtools,re,requests,shutil,sqlite3,time
from operator import itemgetter

from Bio import SeqIO
from Bio import Restriction
Expand Down Expand Up @@ -166,19 +167,21 @@ def find_eqtls(snps,genes,eqtl_data_dir,gene_database_fp,fdr_threshold,local_dat
p_values = [] #A sorted list of all p-values for use computing FDR
num_sig = {} #Number of eQTLs deemed significant under the given threshold
num_tests = 0 #Total number of tests done
num_tests += query_local_databases(eqtl_data_dir,genes,eqtls,p_values)
#num_tests += query_local_databases(eqtl_data_dir,genes,eqtls,p_values) #Suspend local database query until it is updated
if not local_databases_only:
num_tests += query_GTEx_service(snps,genes,eqtls,p_values,num_processes,output_dir)
process_eqtls(snps,genes,eqtls,gene_database_fp)
eqtlfile = None
p_values,adj_p_values = compute_adj_pvalues(p_values)
if not suppress_intermediate_files:
eqtlfile = open(output_dir + '/eqtls.txt','w')
for snp in eqtls.keys():
num_sig[snp] = 0
for gene in eqtls[snp].keys():
if not gene == "snp_info":
for tissue in eqtls[snp][gene]["tissues"].keys():
eqtls[snp][gene]["tissues"][tissue]["qvalue"] = compute_fdr(eqtls[snp][gene]["tissues"][tissue]["pvalue"],p_values,num_tests)
eqtls[snp][gene]["tissues"][tissue]["qvalue"] = adj_p_values[p_values.index(eqtls[snp][gene]["tissues"][tissue]["pvalue"])]
#eqtls[snp][gene]["tissues"][tissue]["qvalue"] = compute_fdr(eqtls[snp][gene]["tissues"][tissue]["pvalue"],p_values,num_tests)
if eqtls[snp][gene]["tissues"][tissue]["qvalue"] < fdr_threshold:
num_sig[snp] += 1
if not suppress_intermediate_files:
Expand Down Expand Up @@ -207,7 +210,53 @@ def query_local_databases(eqtl_data_dir,genes,eqtls,p_values):
return num_tests

def query_GTEx_service(snps,genes,eqtls,p_values,num_processes,output_dir):
tissues = Set(["Adipose_Subcutaneous","Adipose_Visceral_Omentum","Adrenal_Gland","Artery_Aorta","Artery_Coronary","Artery_Tibial","Brain_Amygdala","Brain_Anterior_cingulate_cortex_BA24","Brain_Caudate_basal_ganglia","Brain_Cerebellar_Hemisphere","Brain_Cerebellum","Brain_Cortex","Brain_Frontal_Cortex_BA9","Brain_Hippocampus","Brain_Hypothalamus","Brain_Nucleus_accumbens_basal_ganglia","Brain_Putamen_basal_ganglia","Brain_Spinal_cord_cervical_c-1","Brain_Substantia_nigra","Breast_Mammary_Tissue","Cells_EBV-transformed_lymphocytes","Cells_Transformed_fibroblasts","Colon_Sigmoid","Esophagus_Gastroesophageal_Junction","Esophagus_Mucosa","Esophagus_Muscularis","Heart_Atrial_Appendage","Heart_Left_Ventricle","Liver","Lung","Minor_Salivary_Gland","Muscle_Skeletal","Nerve_Tibial","Ovary","Pancreas","Pituitary","Prostate","Skin_Not_Sun_Exposed_Suprapubic","Skin_Sun_Exposed_Lower_leg","Small_Intestine_Terminal_Ileum","Spleen","Stomach","Testis","Thyroid","Uterus","Vagina","Whole_Blood"])
tissues = Set(["Adipose_Subcutaneous",
"Adipose_Visceral_Omentum",
"Adrenal_Gland",
"Artery_Aorta",
"Artery_Coronary",
"Artery_Tibial",
"Brain_Amygdala",
"Brain_Anterior_cingulate_cortex_BA24",
"Brain_Caudate_basal_ganglia",
"Brain_Cerebellar_Hemisphere",
"Brain_Cerebellum",
"Brain_Cortex",
"Brain_Frontal_Cortex_BA9",
"Brain_Hippocampus",
"Brain_Hypothalamus",
"Brain_Nucleus_accumbens_basal_ganglia",
"Brain_Putamen_basal_ganglia",
"Brain_Spinal_cord_cervical_c-1",
"Brain_Substantia_nigra",
"Breast_Mammary_Tissue",
"Cells_EBV-transformed_lymphocytes",
"Cells_Transformed_fibroblasts",
"Colon_Sigmoid",
"Esophagus_Gastroesophageal_Junction",
"Esophagus_Mucosa",
"Esophagus_Muscularis",
"Heart_Atrial_Appendage",
"Heart_Left_Ventricle",
"Liver",
"Lung",
"Minor_Salivary_Gland",
"Muscle_Skeletal",
"Nerve_Tibial",
"Ovary",
"Pancreas",
"Pituitary",
"Prostate",
"Skin_Not_Sun_Exposed_Suprapubic",
"Skin_Sun_Exposed_Lower_leg",
"Small_Intestine_Terminal_Ileum",
"Spleen",
"Stomach",
"Testis",
"Thyroid",
"Uterus",
"Vagina",
"Whole_Blood"])
if num_processes > 10:
num_processes = 10
manager = multiprocessing.Manager()
Expand All @@ -218,7 +267,11 @@ def query_GTEx_service(snps,genes,eqtls,p_values,num_processes,output_dir):
for tissue in tissues:
#We aren't interested in eQTLs already discovered from local databases
if (not eqtls.has_key(snp) or (eqtls.has_key(snp) and not eqtls[snp].has_key(gene))) or (eqtls[snp].has_key(gene) and not eqtls[snp][gene]["tissues"].has_key(tissue)):
if len(reqLists[-1]) < 1000:
#if len(reqLists[-1]) < 1000:
#Make each request set contain tissues from only one SNP-gene pair
if not reqLists[-1]:
reqLists[-1].append({"snpId":snp,"gencodeId":gene,"tissueName":tissue})
elif reqLists[-1][-1]["snpId"]==snp and reqLists[-1][-1]["gencodeId"]==gene:
reqLists[-1].append({"snpId":snp,"gencodeId":gene,"tissueName":tissue})
else:
reqLists.append([{"snpId":snp,"gencodeId":gene,"tissueName":tissue}])
Expand Down Expand Up @@ -289,7 +342,8 @@ def process_eqtls(snps,genes,eqtls,gene_database_fp):
else:
p_thresh = "NA"
max_length = gene_stat[2] - gene_stat[1]
cis = gene_chr == eqtls[snp]["snp_info"]["chr"] and (eqtls[snp]["snp_info"]["locus"] > gene_start - 1000000 and eqtls[snp]["snp_info"]["locus"] < gene_end + 1000000) #eQTL is cis if the SNP is within 1Mbp of the gene
cis = gene_chr == eqtls[snp]["snp_info"]["chr"] and (eqtls[snp]["snp_info"]["locus"] > gene_start - 1000000 \
and eqtls[snp]["snp_info"]["locus"] < gene_end + 1000000) #eQTL is cis if the SNP is within 1Mbp of the gene

eqtls[snp][gene]["gene_chr"] = gene_chr
eqtls[snp][gene]["gene_start"] = gene_start
Expand All @@ -305,29 +359,32 @@ def process_eqtls(snps,genes,eqtls,gene_database_fp):
eqtls[snp][gene]["cis?"] = False

def send_GTEx_query(num,num_reqLists,reqList,gtexResponses):
s = requests.Session()
s.verify = gtex_cert
try:
while True:
print "\t\tSending request %s of %s" % (num,num_reqLists)
res = requests.post("http://gtexportal.org/rest/v1/association/dyneqtl?v=clversion", json=reqList)
gtex_url = "https://gtexportal.org/rest/v1/association/dyneqtl?v=clversion"
res = s.post(gtex_url, json=reqList)
if res.status_code == 200:
gtexResponses.append((reqList,res))
time.sleep(30)
time.sleep(1)
return
elif res.status_code == 500 or res.status_code == 400:
print "\t\tThere was an error processing request %s. Writing to failed request log and continuing." % num
gtexResponses.append((reqList,"Processing error"))
time.sleep(30)
time.sleep(2)
return
else:
print "\t\tRequest %s received response with status %s. Trying again in five minutes." % (num,res.status_code)
time.sleep(300)
time.sleep(10)
except requests.exceptions.ConnectionError:
try:
print "\t\tWarning: Request %s experienced a connection error. Retrying in five minutes." % num
time.sleep(300)
while True:
print "\t\tSending request %s of %s" % (num,num_reqLists)
res = requests.post("http://gtexportal.org/api/v6p/dyneqtl?v=clversion", json=reqList)
res = s.post("http://gtexportal.org/api/v6p/dyneqtl?v=clversion", json=reqList)
if res.status_code == 200:
gtexResponses.append((reqList,res))
time.sleep(30)
Expand All @@ -345,6 +402,7 @@ def send_GTEx_query(num,num_reqLists,reqList,gtexResponses):
gtexResponses.append((reqList,"Connection failure"))
time.sleep(300)
return
s.close()

def get_gene_expression_information(eqtls,expression_table_fp,output_dir):
print "Getting gene expression information..."
Expand All @@ -360,18 +418,29 @@ def get_gene_expression_information(eqtls,expression_table_fp,output_dir):
gene_exp.to_csv(path_or_buf=output_dir+"/gene_expression_table.txt",sep='\t')

def compute_fdr(p,p_values,num_tests):
positives = bisect.bisect_right(p_values, p)
return float(num_tests) * p / float(positives)
k = bisect.bisect_right(p_values, p)
return(k / float(num_tests) * args.fdr_threshold)

def produce_summary(eqtls,expression_table_fp,output_dir):
#TODO: Tidy up this method using pandas
print "Producing output..."
if not os.path.isdir(output_dir):
os.mkdir(output_dir)
summary = open(output_dir + "/summary.txt",'w')
summary.write("SNP\tSNP_Chromosome\tSNP_Locus\tGene_Name\tGene_Chromosome\tGene_Start\tGene_End\tTissue\tp-value\tq-value\tEffect_Size\tCell_Lines\tGTEx_cis_p_Threshold\tcis_SNP-gene_interaction\tSNP-gene_Distance\tExpression_Level_In_eQTL_Tissue\tMax_Expressed_Tissue\tMaximum_Expression_Level\tMin_Expressed_Tissue\tMin_Expression_Level\n")
summ_writer = csv.writer(summary, delimiter='\t')
summ_header = ['SNP', 'SNP_Chromosome', 'SNP_Locus',
'Gene_Name', 'Gene_Chromosome',
'Gene_Start', 'Gene_End', 'Tissue',
'p-value', 'Adj_p-value', 'Effect_Size',
'Cell_Lines', 'GTEx_cis_p_Threshold',
'cis_SNP-gene_interaction', 'SNP-gene_Distance',
'Expression_Level_In_eQTL_Tissue', 'Max_Expressed_Tissue',
'Maximum_Expression_Level', 'Min_Expressed_Tissue',
'Min_Expression_Level']
summ_writer.writerow(summ_header)
gene_df = pandas.read_table(expression_table_fp,index_col="Description")
gene_exp = {} #Cache of already-accessed gene expression data
all_summary_rows = [] #To produce significant eQTL interactions
for snp in eqtls.keys():
for gene in eqtls[snp].keys():
if not gene == "snp_info":
Expand All @@ -384,7 +453,7 @@ def produce_summary(eqtls,expression_table_fp,output_dir):
distance_from_snp = eqtls[snp][gene]["gene_start"] - eqtls[snp]["snp_info"]["locus"]
elif(eqtls[snp]["snp_info"]["locus"] > eqtls[snp][gene]["gene_end"]):
distance_from_snp = eqtls[snp]["snp_info"]["locus"] - eqtls[snp][gene]["gene_end"]

for tissue in eqtls[snp][gene]["tissues"].keys():
if not gene_exp.has_key(gene):
gene_exp[gene] = {}
Expand All @@ -409,28 +478,77 @@ def produce_summary(eqtls,expression_table_fp,output_dir):
except KeyError:
print "\t\tWarning: No expression information for %s in %s" % (gene,tissue)
gene_exp[gene][tissue] = "NA"

summary.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t" % (snp,eqtls[snp]["snp_info"]["chr"],eqtls[snp]["snp_info"]["locus"],gene,eqtls[snp][gene]["gene_chr"],eqtls[snp][gene]["gene_start"],eqtls[snp][gene]["gene_end"],tissue,eqtls[snp][gene]["tissues"][tissue]["pvalue"],eqtls[snp][gene]["tissues"][tissue]["qvalue"],eqtls[snp][gene]["tissues"][tissue]["effect_size"]))
to_cell_line = ''
if eqtls[snp][gene]["cell_lines"] == "NA":
summary.write("NA")
to_cell_line = "NA"
else:
for i,cell_line in enumerate(eqtls[snp][gene]["cell_lines"],start=1):
if not i == len(eqtls[snp][gene]["cell_lines"]):
summary.write("%s," % cell_line)
to_cell_line = cell_line
else:
summary.write(cell_line)
summary.write("\t%s\t%s\t%s\t%s\t%s\t" % (eqtls[snp][gene]["p_thresh"],eqtls[snp][gene]["cis?"],distance_from_snp,gene_exp[gene][tissue],gene_exp[gene]["max"]))
to_cell_line = cell_line
max_gene_exp = ''
if gene_exp[gene]["max"] == "NA":
summary.write("NA\t")
max_gene_exp = 'NA'
else:
summary.write("%s\t" % gene_exp[gene][gene_exp[gene]["max"]])
summary.write("%s\t" % gene_exp[gene]["min"])
max_gene_exp = gene_exp[gene][gene_exp[gene]["max"]]
min_gene_exp = ''
if gene_exp[gene]["min"] == "NA":
summary.write("NA\n")
min_gene_exp = 'NA'
else:
summary.write("%s\n" % gene_exp[gene][gene_exp[gene]["min"]])
min_gene_exp = gene_exp[gene][gene_exp[gene]["min"]]

to_summary = [snp,eqtls[snp]["snp_info"]["chr"],
eqtls[snp]["snp_info"]["locus"],
gene,eqtls[snp][gene]["gene_chr"],
eqtls[snp][gene]["gene_start"],
eqtls[snp][gene]["gene_end"],tissue,
eqtls[snp][gene]["tissues"][tissue]["pvalue"],
eqtls[snp][gene]["tissues"][tissue]["qvalue"],
eqtls[snp][gene]["tissues"][tissue]["effect_size"],
to_cell_line, eqtls[snp][gene]["p_thresh"],
eqtls[snp][gene]["cis?"],distance_from_snp,
gene_exp[gene][tissue],gene_exp[gene]["max"],
max_gene_exp, gene_exp[gene]["min"],
min_gene_exp]
all_summary_rows.append(to_summary)
summ_writer.writerows(all_summary_rows)
summary.close()
produce_sig_eqtls(all_summary_rows, summ_header, output_dir)

def compute_adj_pvalues(p_values):
p_values = sorted(p_values)
min_function = []
adj_pvalues = []
for k, pval in enumerate(p_values,start=1):
mfunc = (len(p_values) / float(k)) * pval
if mfunc > 1:
mfunc = 1
min_function.append(mfunc)
for i in range(len(min_function)):
adj_pval = min(min_function[i:len(min_function)])
adj_pvalues.append(adj_pval)
return(p_values,adj_pvalues)

def produce_sig_eqtls(all_summary_rows, summ_header,output_dir):
fdr = 0.05
try:
fdr = args.fdr_threshold
except:
fdr = 0.05
sig_eqtls = []
all_summary = sorted(all_summary_rows, key = itemgetter(9))
for row in all_summary_rows:
if row[9] <= fdr:
sig_eqtls.append(row)

sig_file = open(os.path.join(output_dir,'significant_eqtls.txt'),'w')
sigwriter = csv.writer(sig_file, delimiter = '\t')
sigwriter.writerow(summ_header)
sigwriter.writerows(sig_eqtls)
sig_file.close()

def produce_overview(genes,eqtls,num_sig,output_dir):
print "Producing overview..."
stat_table = open(output_dir + "/overview.txt",'w')
Expand Down Expand Up @@ -603,13 +721,13 @@ def parse_genes_files(genes_files):
genes[snp][gene[1]].add(gene[2])
return genes

def parse_eqtls_files(eqtls_files,fdr_threshold=None):
def parse_eqtls_files(eqtls_files,fdr_threshold=0.05):
print "Parsing eQTLs..."
eqtls = {}
p_values = [] #A sorted list of all p-values for use computing FDR
num_tests = 0 #Total number of tests done
num_sig = {} #Number of eQTLs deemed significant under the given threshold
eqtls_files = eqtls_files.split(' ') #@Tayaza: Separate individual eqtl files
#eqtls_files = eqtls_files.split(' ') #@Tayaza: Separate individual eqtl files
for eqtls_file in eqtls_files:
lines = 0
with open(eqtls_file,'r') as eqtlfile:
Expand Down Expand Up @@ -649,7 +767,7 @@ def parse_eqtls_files(eqtls_files,fdr_threshold=None):
q = float(eqtl[12])
effect_size = float(eqtl[13])
except IndexError:
q = 0.0 #@Tayaza: Handle previous versions without qvalue
q = 2.0 #@Tayaza: Handle previous versions without qvalue
effect_size = 0.0 #@Tayaza: Handle previous versions
if not eqtls.has_key(snp):
eqtls[snp] = {}
Expand All @@ -672,12 +790,14 @@ def parse_eqtls_files(eqtls_files,fdr_threshold=None):
else:
eqtls[snp][gene]["tissues"][tissue]["qvalue"] = q

p_values,adj_p_values = compute_adj_pvalues(p_values)
for snp in eqtls.keys():
for gene in eqtls[snp].keys():
if not gene == "snp_info":
for tissue in eqtls[snp][gene]["tissues"].keys():
if fdr_threshold:
eqtls[snp][gene]["tissues"][tissue]["qvalue"] = compute_fdr(eqtls[snp][gene]["tissues"][tissue]["pvalue"],p_values,num_tests)
eqtls[snp][gene]["tissues"][tissue]["qvalue"] = adj_p_values[p_values.index(eqtls[snp][gene]["tissues"][tissue]["pvalue"])]
#eqtls[snp][gene]["tissues"][tissue]["qvalue"] = compute_fdr(eqtls[snp][gene]["tissues"][tissue]["pvalue"],p_values,num_tests)
if eqtls[snp][gene]["tissues"][tissue]["qvalue"] < fdr_threshold:
num_sig[snp] += 1

Expand Down Expand Up @@ -1103,6 +1223,7 @@ def build_eqtl_index(table_fp,output_fp=None,snp_col=23,gene_symbol_col=27,gene_
gene_database_fp = config.get("Defaults","GENE_DATABASE_FP")
eqtl_data_dir = config.get("Defaults","EQTL_DATA_DIR")
expression_table_fp = config.get("Defaults","EXPRESSION_TABLE_FP")
gtex_cert = config.get("Defaults", "GTEX_CERT")
if not os.path.isdir(args.output_dir):
os.makedirs(args.output_dir)

Expand Down
12 changes: 7 additions & 5 deletions docs/codes3d.conf
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
#Config file for the hiC_query pipeline
[Defaults]

libdir=lib/

LIB_DIR: %(libdir)s
SNP_DATABASE_FP: %(libdir)ssnp_index_dbSNP_b146.db
ALT_DIR: %(altdir)s
SNP_DATABASE_FP: %(libdir)ssnp_index_dbSNP_b147.db
HIC_DATA_DIR: %(libdir)shic_data
FRAGMENT_BED_FP: %(libdir)sHomo_sapiens.GRCh37.75.dna.MboI.fragments.bed
FRAGMENT_DATABASE_FP: %(libdir)sHomo_sapiens.GRCh37.75.dna.fragments.db
GENE_BED_FP: %(libdir)sgencode.v19.genes.v6p_model.patched_contigs.gtf.bed
GENE_DATABASE_FP: %(libdir)sgencode.v19.genes.v6p_model.patched_contigs.gtf.db
#GENE_BED_FP: %(libdir)sgencode.v19.genes.v6p_model.patched_contigs.gtf.bed
#GENE_DATABASE_FP: %(libdir)sgencode.v19.genes.v6p_model.patched_contigs.gtf.db
GENE_BED_FP: %(libdir)sgene_reference.bed
GENE_DATABASE_FP: %(libdir)sgene_reference.db
EQTL_DATA_DIR: %(libdir)seQTLs
EXPRESSION_TABLE_FP: %(libdir)sGTEx_Analysis_v6p_RNA-seq_gene_median_rpkm.gct
GTEX_CERT: %(libdir)sgtex_cert.pem

0 comments on commit 95bedd5

Please sign in to comment.