Skip to content

Commit

Permalink
Merge pull request #150 from UPHL-BioNGS/erin-update20231115
Browse files Browse the repository at this point in the history
Update 20231115
  • Loading branch information
erinyoung authored Nov 17, 2023
2 parents 9a775bc + 698a125 commit c8dd6bc
Show file tree
Hide file tree
Showing 25 changed files with 761 additions and 468 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/just_msa.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
- name: Check MSA files
run: |
for file in grandeur/roary/summary_statistics.txt grandeur/iqtree2/iqtree.treefile.nwk grandeur/snp-dists/snp_matrix_with_qc.txt
for file in grandeur/*/summary_statistics.txt grandeur/iqtree2/iqtree.treefile.nwk grandeur/snp-dists/snp_matrix.txt
do
head $file
wc -l $file
Expand Down
8 changes: 8 additions & 0 deletions .github/workflows/phylogenetic_workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,12 @@ jobs:
cat grandeur/grandeur_summary.tsv
nextflow run . -profile docker,msa --maxcpus 2 --medcpus 2 -resume
- name: Check MSA files
run: |
for file in grandeur/*/summary_statistics.txt grandeur/iqtree2/iqtree.treefile.nwk grandeur/snp-dists/snp_matrix.txt
do
head $file
wc -l $file
done
19 changes: 10 additions & 9 deletions bin/.tests.sh
Original file line number Diff line number Diff line change
@@ -1,34 +1,34 @@
#/bin/bash

# just a bunch of tests with local directories
# /home/eriny/sandbox/Grandeur/bin/.tests.sh
# /Volumes/IDGenomics_NAS/Bioinformatics/eriny/Grandeur/bin/.tests.sh

# default with input
nextflow run /home/eriny/sandbox/Grandeur \
nextflow run /Volumes/IDGenomics_NAS/Bioinformatics/eriny/Grandeur \
-profile singularity \
--sample_sheet /home/eriny/sandbox/Grandeur/bin/sample_sheet.csv \
--sample_sheet /Volumes/IDGenomics_NAS/Bioinformatics/eriny/Grandeur/bin/sample_sheet.csv \
--outdir grandeur_sample_sheet \
-resume \
-with-tower

# default with reads
nextflow run /home/eriny/sandbox/Grandeur \
nextflow run /Volumes/IDGenomics_NAS/Bioinformatics/eriny/Grandeur \
-profile singularity \
--reads /home/eriny/sandbox/test_files/grandeur/reads \
--outdir grandeur_fastq_channel \
-resume \
-with-tower

# default with fastas
nextflow run /home/eriny/sandbox/Grandeur \
nextflow run /Volumes/IDGenomics_NAS/Bioinformatics/eriny/Grandeur \
-profile singularity \
--fastas /home/eriny/sandbox/test_files/grandeur/fastas \
--outdir grandeur_fasta_channel \
-resume \
-with-tower

# default with reads and fastas
nextflow run /home/eriny/sandbox/Grandeur \
nextflow run /Volumes/IDGenomics_NAS/Bioinformatics/eriny/Grandeur \
-profile singularity \
--reads /home/eriny/sandbox/test_files/grandeur/reads \
--fastas /home/eriny/sandbox/test_files/grandeur/fastas \
Expand All @@ -37,20 +37,21 @@ nextflow run /home/eriny/sandbox/Grandeur \
-with-tower

# multiple sequence alignment
nextflow run /home/eriny/sandbox/Grandeur \
nextflow run /Volumes/IDGenomics_NAS/Bioinformatics/eriny/Grandeur \
-profile singularity,msa \
--gff /home/eriny/sandbox/test_files/grandeur/msa \
--fastas /home/eriny/sandbox/test_files/grandeur/msa \
--reads /home/eriny/sandbox/test_files/grandeur/msa \
--outdir grandeur_msa_fastani \
--min_core_genes 50 \
-resume \
-with-tower

for profile in "singularity" "uphl"
do
for ver in "test0" "test1" "test2" "test3" "test4" "test5" "test6"
do
nextflow run /home/eriny/sandbox/Grandeur \
nextflow run /Volumes/IDGenomics_NAS/Bioinformatics/eriny/Grandeur \
-profile $profile,$ver \
--outdir grandeur_${ver}_$profile \
-resume \
Expand All @@ -59,7 +60,7 @@ do
done

# with nothing
nextflow run /home/eriny/sandbox/Grandeur \
nextflow run /Volumes/IDGenomics_NAS/Bioinformatics/eriny/Grandeur \
-profile singularity \
--gff wontexist \
--fastas shouldntexit \
Expand Down
139 changes: 0 additions & 139 deletions bin/HeatCluster.py

This file was deleted.

81 changes: 81 additions & 0 deletions bin/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env python3

'''
Author: Erin Young

Description:
    Evaluate a roary/panaroo pangenome result in the current directory.
    Reads 'gene_presence_absence.Rtab' to classify genes into core/soft/
    shell/cloud fractions per sample, then scans the core genome alignment
    ('core_gene_alignment_filtered.aln' if present, otherwise
    'core_gene_alignment.aln') for per-sample length and ambiguous-base
    counts. Writes 'core_genome_evaluation.csv'.

EXAMPLE:
    python3 evaluate.py
'''

import pandas as pd
from pathlib import Path

# Gene-class thresholds, expressed as a fraction of samples carrying a gene
# (same cutoffs roary uses: core >=99%, soft 95-99%, shell 15-95%, cloud <15%).
CORE_FRAC = 0.99
SOFT_FRAC = 0.95
SHELL_FRAC = 0.15


def _alignment_stats(aln_file):
    """Parse a FASTA alignment; return {sample: (length, num_ambiguous)}.

    'num_ambiguous' counts every character that is not A/G/C/T (either case),
    e.g. N's and gaps.
    """
    stats = {}
    sample = None
    # Initialize counters to 0 (not "") so a malformed file that begins with
    # a sequence line cannot raise a TypeError on 'length += len(line)'.
    length = 0
    ambiguous = 0
    with open(aln_file) as fh:
        for line in fh:
            line = line.strip()
            if line.startswith(">"):
                if sample is not None:
                    stats[sample] = (length, ambiguous)
                sample = line[1:].strip()
                length = 0
                ambiguous = 0
            elif sample is not None:
                length += len(line)
                # Anything that is not an unambiguous base counts as ambiguous.
                ambiguous += len(line) - sum(line.count(b) for b in "aAgGcCtT")
    if sample is not None:
        stats[sample] = (length, ambiguous)
    return stats


def main():
    """Build core_genome_evaluation.csv from files in the working directory."""
    genepresence_df = pd.read_table("gene_presence_absence.Rtab")
    # First column is 'Gene'; the rest are one column per sample (0/1 flags).
    num_samples = len(genepresence_df.columns) - 1

    # Per-gene count of how many samples carry the gene.
    genepresence_df['sum'] = genepresence_df.drop('Gene', axis=1).sum(axis=1)

    gene_sum = genepresence_df['sum']
    core_df = genepresence_df[gene_sum >= num_samples * CORE_FRAC]
    soft_df = genepresence_df[(gene_sum >= num_samples * SOFT_FRAC) & (gene_sum < num_samples * CORE_FRAC)]
    shel_df = genepresence_df[(gene_sum >= num_samples * SHELL_FRAC) & (gene_sum < num_samples * SOFT_FRAC)]
    clud_df = genepresence_df[gene_sum < num_samples * SHELL_FRAC]

    samples = genepresence_df.drop(['Gene', 'sum'], axis=1).columns
    percent_df = pd.DataFrame({'sample': samples})

    # Per-sample gene counts in each class.
    for sample in samples:
        row = percent_df.index[percent_df['sample'] == sample]
        percent_df.loc[row, 'total'] = genepresence_df[sample].sum()
        percent_df.loc[row, 'core'] = core_df[sample].sum()
        percent_df.loc[row, 'soft'] = soft_df[sample].sum()
        percent_df.loc[row, 'shell'] = shel_df[sample].sum()
        percent_df.loc[row, 'cloud'] = clud_df[sample].sum()

    percent_df["per_core"] = percent_df['core'] / percent_df['total']
    percent_df["per_soft"] = percent_df['soft'] / percent_df['total']
    percent_df["per_shell"] = percent_df['shell'] / percent_df['total']
    # Fixed typo in the output column name (was 'per_clouc').
    percent_df["per_cloud"] = percent_df['cloud'] / percent_df['total']
    percent_df = percent_df.sort_values('per_core', ascending=False)

    # Prefer the filtered alignment when it exists.
    core_genome_file = Path("core_gene_alignment_filtered.aln")
    if not core_genome_file.is_file():
        core_genome_file = Path("core_gene_alignment.aln")

    for sample, (length, ambiguous) in _alignment_stats(core_genome_file).items():
        row = percent_df.index[percent_df['sample'] == sample]
        percent_df.loc[row, 'length'] = length
        percent_df.loc[row, 'num_ambiguous'] = ambiguous

    percent_df["per_ambiguous"] = percent_df['num_ambiguous'] / percent_df['length']

    percent_df.to_csv('core_genome_evaluation.csv', index=False)


if __name__ == "__main__":
    main()
Loading

0 comments on commit c8dd6bc

Please sign in to comment.