Skip to content

Commit

Permalink
update models
Browse files: browse the repository at this point in the history
  • Loading branch information
jefferyUstc committed Jan 15, 2020
1 parent febb2fd commit 9cdf000
Show file tree
Hide file tree
Showing 16 changed files with 4,602 additions and 4,069 deletions.
37 changes: 16 additions & 21 deletions MiScan/core.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
# -*- coding: utf-8 -*-
import operator
import re
import shutil
import warnings
from collections import Counter
from os import makedirs, system, popen, environ
from os.path import exists, join
from os import environ, makedirs
from os import system, popen
from os.path import join, exists
import re

import keras.backend as K
import matplotlib as mpl
import numpy as np
import pandas as pd
from scipy.io import mmread
import shutil

from .utils import get_data, exists_or_mkdir
from .model import build_model
from .logging import get_logger

from .model import build_dense_model as build_model
from .utils import get_data, exists_or_mkdir

mpl.use('Agg')
import matplotlib.pyplot as plt
Expand All @@ -34,8 +34,6 @@
def vcf_to_sparse(outDir, inFeaID, inFeaBed, inVcf, featureID):
"""
convert the vcf to a feature matrix, `matrix.mtx`
:param outDir: output directory
:param inFeaID: `13885fea_exon_cut_100bp_2sample.txt` in dependency_data
:param inFeaBed: `tcga_13885fea_exon_cut_100bp.bed` in dependency_data
Expand All @@ -50,21 +48,18 @@ def vcf_to_sparse(outDir, inFeaID, inFeaBed, inVcf, featureID):
vcf_list = []
with open(inVcf) as f:
for line in f:
# TODO need to optimize the regex
if re.match('#', line):
pass
else:
data = line.strip().split('\t')
# TODO need to optimize the regex
chrom = data[0] if re.match('chr', data[0]) else 'chr' + data[0]
start = int(data[1])
end = start + 1
qual = data[5]
alt = data[3] + ',' + data[4]
vcf_list.append([chrom, start, end, 1, qual, alt])
vcf_df = pd.DataFrame(vcf_list, columns=['chrom', 'start', 'end', 'sample_name', 'qual', 'alt'])
vcf_df.sort_values(by=['chrom', 'start'], ascending=True, inplace=True) # 'snp_sampleID.bed'
# TODO check this
vcf_df.sort_values(by=['chrom', 'start'], ascending=True, inplace=True)
outSnpBedFile = join(outDir, 'snp_sampleID.bed')
vcf_df.to_csv(outSnpBedFile, sep='\t', header=None, index=None)
# --------------------------------------------------
Expand Down Expand Up @@ -123,7 +118,8 @@ def generate_report(inDir, outDir, y_pred_pat):
"""
generate report for single sample, including Cancer Risk Prediction | Top Gene Mutation Sites
:param inDir: for historical reason, actually, it's the path of `MISCAN.norm.trainPred.txt` and 'MISCAN.pat.trainPred.txt' in dependency_data
:param inDir: for historical reason, actually, it's the path of `MISCAN.norm.trainPred.txt` and
'MISCAN.pat.trainPred.txt' in dependency_data
:param outDir: output directory
:param y_pred_pat: risk_to_be_patient from func `prediction`
:return:
Expand All @@ -147,14 +143,13 @@ def generate_report(inDir, outDir, y_pred_pat):

axes[2].set_position(Bbox([[0.09, 0.57], [0.95, 0.83]]))

trainPat_file = inDir[0]
trainNorm_file = inDir[1]
trainPat_ay = np.loadtxt(trainPat_file)
trainNorm_ay = np.loadtxt(trainNorm_file)

g = sns.kdeplot(list(trainPat_ay), label='trainPat', ax=axes[2],
train_pat = pd.read_csv(inDir[0], header=None).values
train_norm = pd.read_csv(inDir[1], header=None).values
train_pat = np.squeeze(train_pat, axis=1)
train_norm = np.squeeze(train_norm, axis=1)
g = sns.kdeplot(train_pat, label='trainPat', ax=axes[2],
shade=True, color='#ffb7ce')
g = sns.kdeplot(list(trainNorm_ay), label='trainNorm', ax=axes[2],
g = sns.kdeplot(train_norm, label='trainNorm', ax=axes[2],
shade=True, color='#95d0fc')
axes[2].set_xlabel('Cancer risk', size=15)
axes[2].set_ylabel('Density', size=15)
Expand Down
Loading

0 comments on commit 9cdf000

Please sign in to comment.