Merge pull request #1 from wtsi-hgi/main
Modified yascp keras
Tobi1kenobi authored Oct 10, 2023
2 parents ccfb649 + 7c96d19 commit 8f88b37
Showing 7 changed files with 1,021 additions and 69 deletions.
20 changes: 2 additions & 18 deletions CITATIONS.md
@@ -1,12 +1,5 @@
-# nf-core/yascp: Citations
-
-## [nf-core](https://pubmed.ncbi.nlm.nih.gov/32055031/)
-
-> Ewels PA, Peltzer A, Fillinger S, Patel H, Alneberg J, Wilm A, Garcia MU, Di Tommaso P, Nahnsen S. The nf-core framework for community-curated bioinformatics pipelines. Nat Biotechnol. 2020 Mar;38(3):276-278. doi: 10.1038/s41587-020-0439-x. PubMed PMID: 32055031.
-
-## [Nextflow](https://pubmed.ncbi.nlm.nih.gov/28398311/)
-
-> Di Tommaso P, Chatzou M, Floden EW, Barja PP, Palumbo E, Notredame C. Nextflow enables reproducible computational workflows. Nat Biotechnol. 2017 Apr 11;35(4):316-319. doi: 10.1038/nbt.3820. PubMed PMID: 28398311.
+# yascp: Citations
+Please mention the use of this repository in your manuscript.
 
 ## Pipeline tools
 
@@ -34,15 +27,6 @@
 
 ## Software packaging/containerisation tools
 
-* [Anaconda](https://anaconda.com)
-  > Anaconda Software Distribution. Computer software. Vers. 2-2.4.0. Anaconda, Nov. 2016. Web.
-
-* [Bioconda](https://pubmed.ncbi.nlm.nih.gov/29967506/)
-  > Grüning B, Dale R, Sjödin A, Chapman BA, Rowe J, Tomkins-Tinch CH, Valieris R, Köster J; Bioconda Team. Bioconda: sustainable and comprehensive software distribution for the life sciences. Nat Methods. 2018 Jul;15(7):475-476. doi: 10.1038/s41592-018-0046-7. PubMed PMID: 29967506.
-
-* [BioContainers](https://pubmed.ncbi.nlm.nih.gov/28379341/)
-  > da Veiga Leprevost F, Grüning B, Aflitos SA, Röst HL, Uszkoreit J, Barsnes H, Vaudel M, Moreno P, Gatto L, Weber J, Bai M, Jimenez RC, Sachsenberg T, Pfeuffer J, Alvarez RV, Griss J, Nesvizhskii AI, Perez-Riverol Y. BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics. 2017 Aug 15;33(16):2580-2582. doi: 10.1093/bioinformatics/btx192. PubMed PMID: 28379341; PubMed Central PMCID: PMC5870671.
-
 * [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241)
 
 * [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/)
124 changes: 95 additions & 29 deletions bin/concordance_calculations_donor_exclusive_read_level_noA2G.py

Large diffs are not rendered by default.

59 changes: 42 additions & 17 deletions bin/dynamic_donor_exclusive_snp_selection.py
@@ -13,7 +13,9 @@
 import multiprocessing as mp
 from multiprocessing import Lock
 import os
 
+add_noninformative=True # when True, constant (non-informative) sites are also added to the final cellsnp panel
+
 
 use_only_informative_snps=True # When True, ignore sites whose genotypes show no difference across the individuals.
 class VCF_Loader:
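For orientation, here is a minimal sketch (not pipeline code) of what these two new module-level switches appear to control, judging from the rest of this commit; `select_sites` and the `status` field are illustrative names only:

```python
# Assumed semantics, inferred from this commit; the flag names mirror the diff above.
use_only_informative_snps = True  # keep only sites whose genotypes differ between donors
add_noninformative = True         # still append constant sites to the final cellsnp panel

def select_sites(sites):
    """Toy filter illustrating how the two flags are assumed to combine."""
    if use_only_informative_snps:
        kept = [s for s in sites if s['status'] == 'informative']
    else:
        kept = list(sites)
    if add_noninformative:
        kept += [s for s in sites if s['status'] == 'constant']
    return kept

print(select_sites([{'status': 'informative'}, {'status': 'constant'}]))
```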
@@ -68,15 +70,20 @@ def load_sample_mp(self,line,obs_ids,count,format_list):
         random.shuffle(c)
         obs_with_gt, list_val_with_gt = zip(*c)
         test_if_there_is_difference = set(pd.DataFrame(list_val_with_gt)[0].str.split(":").str[0])
+        test_if_there_is_difference.discard('./.')
+        test_if_there_is_difference.discard('.|.')
+        test_if_there_is_difference.discard('.')
 
         nr_genotypes = len(test_if_there_is_difference)
         if use_only_informative_snps==True:
             if nr_genotypes==0:
                 _='here we only have missing genotypes' # only missing genotypes at this site; skip it
             else:
                 if nr_genotypes>1:
-                    return [obs_with_gt,list_val_with_gt,idx,list_val,count,coment_fields]
+                    return [obs_with_gt,list_val_with_gt,idx,list_val,count,coment_fields,'informative']
+                else:
+                    _='Site is not informative'
                     # print('Site is not informative')
         else:
-            return [obs_with_gt,list_val_with_gt,idx,list_val,count,coment_fields]
+            return [obs_with_gt,list_val_with_gt,idx,list_val,count,coment_fields,'constant']
             # print('Site is not informative')
 
     def set_results(self,to_set,id):
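The hunk above is the core of the change: a site's genotype set is reduced to its distinct non-missing values before counting. A standalone sketch of that classification rule, with `classify_site` as a hypothetical name:

```python
# Minimal sketch (not part of the pipeline): classify a site the way
# load_sample_mp appears to, from a list of per-donor "GT:..." strings.
def classify_site(gt_fields):
    genotypes = {f.split(':')[0] for f in gt_fields}
    # drop the three encodings of a missing genotype, as in the diff above
    for missing in ('./.', '.|.', '.'):
        genotypes.discard(missing)
    if len(genotypes) == 0:
        return None            # only missing genotypes at this site
    if len(genotypes) > 1:
        return 'informative'   # donors differ, so the site separates them
    return 'constant'          # every genotyped donor is identical here

print(classify_site(['0/1:12', '0/0:9', './.:0']))  # informative
print(classify_site(['0/0:12', '0/0:9']))           # constant
```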
@@ -116,7 +123,7 @@ def append_results(self,result):
         alleles = list_val_with_gt[donor_loc_in_list].split(':')[idx]
         if alleles!='.':
             ids = "_".join([list_val[x] for x in [0, 1, 3, 4]])
-            donor_var = f"{ids}_{alleles}"
+            donor_var = f"{result[6]}_{ids}_{alleles}"
             while ob_id in self.curently_pushing:
                 time.sleep(r*0.01)
             self.curently_pushing.append(ob_id)
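The only change here is that `donor_var` now carries the informativeness tag from `result[6]` in front of the `CHROM_POS_REF_ALT` key. A toy illustration of the new key format and how `donor_exclusive_sites()` later splits it positionally (the example values are made up):

```python
# Illustration only: the new donor_var key prepends the tag to the variant ID.
status = 'informative'                       # result[6] in the pipeline
ids = '_'.join(['chr1', '12345', 'A', 'G'])  # CHROM, POS, REF, ALT
alleles = '0/1'
donor_var = f"{status}_{ids}_{alleles}"
print(donor_var)  # informative_chr1_12345_A_G_0/1

# donor_exclusive_sites() recovers the fields by position after split('_'):
parts = donor_var.split('_')
status, chrom, pos, ref, alt = parts[0], parts[1], parts[2], parts[3], parts[4]
```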
@@ -314,32 +321,50 @@ def donor_exclusive_sites(exclusive_don_variants2):
     for key1 in iteration_dataframe.keys():
         all_sites = pd.DataFrame(iteration_dataframe[key1],columns=['f1'])
         splits = all_sites['f1'].str.split('_')
-        all_sites['#CHROM']=splits.str[0]
-        all_sites['POS']=splits.str[1]
+        all_sites['informative']=splits.str[0]
+        all_sites['#CHROM']=splits.str[1]
+        all_sites['POS']=splits.str[2]
         all_sites['ID']=f'.'
-        all_sites['REF']=splits.str[2]
-        all_sites['ALT']=splits.str[3]
+        all_sites['REF']=splits.str[3]
+        all_sites['ALT']=splits.str[4]
         all_sites['QUAL']=f'.'
         all_sites['FILTER']=f'.'
         all_sites['INFO']=f'.'
         all_sites = all_sites.drop(columns=['f1'])
         exta_snps=pd.concat([exta_snps,all_sites])
+    # All_Extra_informative_Sites = exta_snps.drop_duplicates()
+    informative_sites = exta_snps[exta_snps['informative']=='informative'].drop('informative',axis=1)
+    constant_sites = exta_snps[exta_snps['informative']=='constant'].drop('informative',axis=1)
+    constant_sites = constant_sites.drop_duplicates(subset=['#CHROM', 'POS'])
+    constant_sites.index=constant_sites['#CHROM']+'_'+constant_sites['POS']
+
+    exta_snps_back = exta_snps.copy()
+    exta_snps = informative_sites
     exta_snps.columns = cellsnp.columns
     exta_snps=exta_snps.drop_duplicates(subset=[0, 1])
-    cellsnp_exta_snps=pd.concat([cellsnp,exta_snps])
 
     exta_snps.index=exta_snps[0]+'_'+exta_snps[1]
     cellsnp.index=cellsnp[0]+'_'+cellsnp[1]
     set1_uninformative_sites=set(cellsnp.index)-set(exta_snps.index)
 
     informative_sites_covered_in_default_panel = set(exta_snps.index)-set(cellsnp.index)
+    constant_sites_covered_in_default_panel = set(constant_sites.index)-set(cellsnp.index)
+
+    if add_noninformative:
+        constant_sites.columns = cellsnp.columns
+        cellsnp_exta_snps=pd.concat([cellsnp,exta_snps,constant_sites])
+    else:
+        cellsnp_exta_snps=pd.concat([cellsnp,exta_snps])
+
+    cellsnp_exta_snps = cellsnp_exta_snps.drop_duplicates(subset=[0, 1])
-    set1_uninformative_sites = cellsnp.loc[set1_uninformative_sites]
 
     set2_informative_sites =exta_snps
-    description = pd.DataFrame([{'total sites':len(cellsnp_exta_snps),'informative sites':len(set2_informative_sites),'uninformative sites':len(set1_uninformative_sites),'informative sites covered in initial panel':len(informative_sites_covered_in_default_panel)}])
+    set1_uninformative_sites=constant_sites
+    description = pd.DataFrame([{'total sites':len(cellsnp_exta_snps),'informative sites':len(set2_informative_sites),'uninformative sites':len(set1_uninformative_sites),'informative sites covered in initial panel':len(informative_sites_covered_in_default_panel), 'constant_sites_covered_in_default_panel':len(constant_sites_covered_in_default_panel)}])
     description.to_csv('variants_description.tsv',sep='\t',index=False)
     cellsnp_exta_snps.to_csv('cellsnp_variants.tsv',sep='\t',index=False,header=False)
-    set1_uninformative_sites.to_csv('set1_uninformative_sites.tsv',sep='\t',index=False,header=False)
+    constant_sites.to_csv('set1_uninformative_sites.tsv',sep='\t',index=False,header=False)
     set2_informative_sites.to_csv('set2_informative_sites.tsv',sep='\t',index=False,header=False)
     print('Done')
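To summarise the bookkeeping in this hunk: the extra informative sites are merged into the default cellsnp panel, constant sites are optionally appended, duplicates are dropped on chromosome and position, and the category counts are written to `variants_description.tsv`. A toy version with made-up coordinates:

```python
import pandas as pd

# Toy illustration of the panel assembly above; all coordinates are invented.
cellsnp = pd.DataFrame([['chr1', '100'], ['chr1', '200']])      # default panel
informative = pd.DataFrame([['chr1', '200'], ['chr2', '50']])   # extra informative sites
constant = pd.DataFrame([['chr3', '10']])                       # constant sites

add_noninformative = True
panel = pd.concat([cellsnp, informative, constant] if add_noninformative
                  else [cellsnp, informative])
panel = panel.drop_duplicates(subset=[0, 1])  # dedupe on CHROM, POS

description = pd.DataFrame([{
    'total sites': len(panel),
    'informative sites': len(informative),
    'uninformative sites': len(constant),
}])
print(description.to_string(index=False))
```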

