Merge pull request #150 from UPHL-BioNGS/erin-update20231115

Update 20231115
UPHL-BioNGS · Nov 17, 2023 · c8dd6bc · c8dd6bc
2 parents 9a775bc + 698a125
commit c8dd6bc
Show file tree

Hide file tree

Showing 25 changed files with 761 additions and 468 deletions.
diff --git a/.github/workflows/just_msa.yml b/.github/workflows/just_msa.yml
@@ -37,7 +37,7 @@ jobs:
 
       - name: Check MSA files
         run: |
-          for file in grandeur/roary/summary_statistics.txt grandeur/iqtree2/iqtree.treefile.nwk grandeur/snp-dists/snp_matrix_with_qc.txt
+          for file in grandeur/*/summary_statistics.txt grandeur/iqtree2/iqtree.treefile.nwk grandeur/snp-dists/snp_matrix.txt
           do
             head $file
             wc -l $file

diff --git a/.github/workflows/phylogenetic_workflow.yml b/.github/workflows/phylogenetic_workflow.yml
@@ -38,4 +38,12 @@ jobs:
           cat grandeur/grandeur_summary.tsv
 
           nextflow run . -profile docker,msa --maxcpus 2 --medcpus 2 -resume
+
+      - name: Check MSA files
+        run: |
+          for file in grandeur/*/summary_statistics.txt grandeur/iqtree2/iqtree.treefile.nwk grandeur/snp-dists/snp_matrix.txt
+          do
+            head $file
+            wc -l $file
+          done
           
diff --git a/bin/.tests.sh b/bin/.tests.sh
@@ -1,34 +1,34 @@
 #/bin/bash
 
 # just a bunch of tests with local directories
-# /home/eriny/sandbox/Grandeur/bin/.tests.sh
+# /Volumes/IDGenomics_NAS/Bioinformatics/eriny/Grandeur/bin/.tests.sh
 
 # default with input
-nextflow run /home/eriny/sandbox/Grandeur \
+nextflow run /Volumes/IDGenomics_NAS/Bioinformatics/eriny/Grandeur \
   -profile singularity \
-  --sample_sheet /home/eriny/sandbox/Grandeur/bin/sample_sheet.csv \
+  --sample_sheet /Volumes/IDGenomics_NAS/Bioinformatics/eriny/Grandeur/bin/sample_sheet.csv \
   --outdir grandeur_sample_sheet \
   -resume  \
   -with-tower
 
 # default with reads
-nextflow run /home/eriny/sandbox/Grandeur \
+nextflow run /Volumes/IDGenomics_NAS/Bioinformatics/eriny/Grandeur \
   -profile singularity \
   --reads  /home/eriny/sandbox/test_files/grandeur/reads \
   --outdir grandeur_fastq_channel \
   -resume  \
   -with-tower
 
 # default with fastas
-nextflow run /home/eriny/sandbox/Grandeur \
+nextflow run /Volumes/IDGenomics_NAS/Bioinformatics/eriny/Grandeur \
   -profile singularity \
   --fastas /home/eriny/sandbox/test_files/grandeur/fastas \
   --outdir grandeur_fasta_channel \
   -resume  \
   -with-tower
 
 # default with reads and fastas
-nextflow run /home/eriny/sandbox/Grandeur \
+nextflow run /Volumes/IDGenomics_NAS/Bioinformatics/eriny/Grandeur \
   -profile singularity \
   --reads  /home/eriny/sandbox/test_files/grandeur/reads \
   --fastas /home/eriny/sandbox/test_files/grandeur/fastas \
@@ -37,20 +37,21 @@ nextflow run /home/eriny/sandbox/Grandeur \
   -with-tower
 
 # multiple sequence alignment
-nextflow run /home/eriny/sandbox/Grandeur \
+nextflow run /Volumes/IDGenomics_NAS/Bioinformatics/eriny/Grandeur \
   -profile singularity,msa \
   --gff    /home/eriny/sandbox/test_files/grandeur/msa \
   --fastas /home/eriny/sandbox/test_files/grandeur/msa \
   --reads  /home/eriny/sandbox/test_files/grandeur/msa \
   --outdir grandeur_msa_fastani \
+  --min_core_genes 50 \
   -resume  \
   -with-tower
 
 for profile in "singularity" "uphl"
 do
   for ver in "test0" "test1" "test2" "test3" "test4" "test5" "test6"
   do
-    nextflow run /home/eriny/sandbox/Grandeur \
+    nextflow run /Volumes/IDGenomics_NAS/Bioinformatics/eriny/Grandeur \
       -profile $profile,$ver \
       --outdir grandeur_${ver}_$profile \
       -resume  \
@@ -59,7 +60,7 @@ do
 done
 
 # with nothing
-nextflow run /home/eriny/sandbox/Grandeur \
+nextflow run /Volumes/IDGenomics_NAS/Bioinformatics/eriny/Grandeur \
   -profile singularity \
   --gff    wontexist \
   --fastas shouldntexit \

diff --git a/bin/HeatCluster.py b/bin/HeatCluster.py
diff --git a/bin/evaluate.py b/bin/evaluate.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+
+'''
+Author: Erin Young
+
+Description:
+
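+This script evaluates a pangenome result: it reads gene_presence_absence.Rtab, bins genes into core (>=99% of samples), soft core (95-99%), shell (15-95%), and cloud (<15%), tallies each sample's genes per bin, counts the length and non-ACGT characters of each sequence in the core gene alignment, and writes core_genome_evaluation.csv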
+
+EXAMPLE:
+python3 evaluate.py
+'''
+
+import pandas as pd
+from pathlib import Path
+
+genepresence_df = pd.read_table("gene_presence_absence.Rtab")
+num_samples = len(genepresence_df.columns) - 1
+
+genepresence_df['sum'] = genepresence_df.drop('Gene', axis=1).sum(axis=1)
+
+core_df = genepresence_df[ genepresence_df['sum'] >= num_samples * .99 ]
+soft_df = genepresence_df[(genepresence_df['sum'] >= num_samples * .95 ) & (genepresence_df['sum'] < num_samples * .99 )]
+shel_df = genepresence_df[(genepresence_df['sum'] >= num_samples * .15 ) & (genepresence_df['sum'] < num_samples * .95 )]
+clud_df = genepresence_df[ genepresence_df['sum'] <  num_samples * .15 ]
+
+samples = genepresence_df.drop('Gene', axis=1).drop('sum', axis=1).columns
+percent_df = pd.DataFrame([])
+percent_df['sample'] = samples
+
+for sample in samples:
+    bamindex = percent_df.index[percent_df['sample'] == sample]
+    total = genepresence_df[sample].sum(axis=0)
+    core  = core_df[sample].sum(axis=0)
+    soft  = soft_df[sample].sum(axis=0)
+    shell = shel_df[sample].sum(axis=0)
+    cloud = clud_df[sample].sum(axis=0)
+
+    percent_df.loc[bamindex, 'total'] = total
+    percent_df.loc[bamindex, 'core']  = core
+    percent_df.loc[bamindex, 'soft']  = soft
+    percent_df.loc[bamindex, 'shell'] = shell
+    percent_df.loc[bamindex, 'cloud'] = cloud
+
+
+percent_df["per_core"]  = percent_df['core']  / percent_df['total']
+percent_df["per_soft"]  = percent_df['soft']  / percent_df['total']
+percent_df["per_shell"] = percent_df['shell'] / percent_df['total']
+percent_df["per_clouc"] = percent_df['cloud'] / percent_df['total']
+percent_df = percent_df.sort_values('per_core', ascending=False)
+
+core_genome_file = Path("core_gene_alignment_filtered.aln")
+if not core_genome_file.is_file():
+    core_genome_file = Path("core_gene_alignment.aln")
+
+sample    = ""
+length    = ""
+ambiguous = ""
+with open(core_genome_file) as file:
+    for line in file:
+        if ">" in line:
+            if sample:
+                bamindex = percent_df.index[percent_df['sample'] == sample]
+                percent_df.loc[bamindex, 'length']        = length
+                percent_df.loc[bamindex, 'num_ambiguous'] = ambiguous               
+            sample     = line.replace(">","").strip()
+            ambiguous  = 0
+            length     = 0
+        else:
+            line       = line.strip()
+            length    += len(line)
+            nonagct    = len(line) - line.count('a') - line.count('A') - line.count('g') - line.count('G') - line.count('c') - line.count('C') - line.count('t') - line.count('T')
+            ambiguous += nonagct
+
+bamindex = percent_df.index[percent_df['sample'] == sample]
+percent_df.loc[bamindex, 'length']        = length
+percent_df.loc[bamindex, 'num_ambiguous'] = ambiguous
+
+percent_df["per_ambiguous"] = percent_df['num_ambiguous'] / percent_df['length']
+
+percent_df.to_csv('core_genome_evaluation.csv', index=False)