diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b7c0b09..2bc5487 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,6 +10,8 @@ on: env: NXF_ANSI_LOG: false + NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity + NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity concurrency: group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" @@ -24,9 +26,15 @@ jobs: strategy: matrix: NXF_VER: - - "23.04.0" + - "22.10.1" - "latest-everything" steps: + - name: Get branch names + # Pulls the names of current branches in repo + # steps.branch-names.outputs.current_branch is used later and returns the name of the branch the PR is made FROM not to + id: branch-names + uses: tj-actions/branch-names@v8 + - name: Check out pipeline code uses: actions/checkout@v3 @@ -35,10 +43,34 @@ jobs: with: version: "${{ matrix.NXF_VER }}" + - name: Set up Singularity + run: | + mkdir -p $NXF_SINGULARITY_CACHEDIR + mkdir -p $NXF_SINGULARITY_LIBRARYDIR + + - name: Setup apptainer + uses: eWaterCycle/setup-apptainer@main + + - name: Install Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install nf-core + run: | + pip install nf-core + + - name: NF-Core Download - download singularity containers + # Forcibly download repo on active branch and download SINGULARITY containers into the CACHE dir if not found + # Must occur after singularity install or will crash trying to dl containers + # Zip up this fresh download and run the checked out version + run: | + nf-core download sanger-tol/ascc --revision ${{ steps.branch-names.outputs.current_branch }} --compress none -d --force --outdir sanger-ascc --container-cache-utilisation amend --container-system singularity + - name: Download test data # Download A fungal test data set that is full enough to show some real output. 
run: | - curl https://tolit.cog.sanger.ac.uk/test-data/resources/ascc/asccTinyTest.tar.gz | tar xzf - + curl https://tolit.cog.sanger.ac.uk/test-data/resources/ascc/asccTinyTest_V2.tar.gz | tar xzf - - name: Download the NCBI taxdump database run: | @@ -48,11 +80,11 @@ jobs: - name: Download the FCS-gx database run: | mkdir FCS_gx - wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.taxa.tsv -O FCS_gx/all.taxa.tsv - wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.gxi -O FCS_gx/all.gxi - wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.gxs -O FCS_gx/all.gxs - wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.meta.jsonl -O FCS_gx/all.meta.jsonl - wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.blast_div.tsv.gz -O FCS_gx/all.blast_div.tsv.gz + wget -cq https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.taxa.tsv -O FCS_gx/all.taxa.tsv + wget -cq https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.gxi -O FCS_gx/all.gxi + wget -cq https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.gxs -O FCS_gx/all.gxs + wget -cq https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.meta.jsonl -O FCS_gx/all.meta.jsonl + wget -cq https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.blast_div.tsv.gz -O FCS_gx/all.blast_div.tsv.gz - name: Download the BUSCO lineage database run: | @@ -72,7 +104,12 @@ jobs: - name: Download the pacbio barcode run: | mkdir pacbio_barcode - wget -O pacbio_barcode/SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip -c https://www.pacb.com/wp-content/uploads/SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip && cd pacbio_barcode && unzip SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip && mv SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta pacbio_adaptors.fa && rm -rf SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip __MACOSX && cd .. 
+ wget -O pacbio_barcode/SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip -c https://www.pacb.com/wp-content/uploads/SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip + cd pacbio_barcode + unzip SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip + mv SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta pacbio_adaptors.fa + rm -rf SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip __MACOSX + cd ../ - name: Download the subset of Diamond database run: | @@ -84,9 +121,9 @@ jobs: mkdir vecscreen curl -L https://ftp.ncbi.nlm.nih.gov/blast/db/v4/16SMicrobial_v4.tar.gz | tar -C vecscreen -xzf - - - name: Run pipeline with test data + - name: Singularity - Run FULL pipeline with test data # TODO nf-core: You can customise CI pipeline run tests as required # For example: adding multiple test runs with different parameters # Remember that you can parallelise this by using strategy.matrix run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results + nextflow run ${GITHUB_WORKSPACE} -profile test,singularity --outdir ./results --steps ALL diff --git a/.nf-core.yml b/.nf-core.yml index f98122b..d142279 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,20 +1,18 @@ repository_type: pipeline lint: - files_exist: - - CODE_OF_CONDUCT.md - - assets/nf-core-ascc_logo_light.png - - docs/images/nf-core-ascc_logo_light.png - - docs/images/nf-core-ascc_logo_dark.png - - .github/ISSUE_TEMPLATE/config.yml - - .github/workflows/awstest.yml - - .github/workflows/awsfulltest.yml - - conf/igenomes.config + files_exist: false files_unchanged: - CODE_OF_CONDUCT.md - assets/nf-core-ascc_logo_light.png - docs/images/nf-core-ascc_logo_light.png - docs/images/nf-core-ascc_logo_dark.png - .github/ISSUE_TEMPLATE/bug_report.yml + - .github/workflows/branch.yml + - .github/CONTRIBUTING.md + - .github/PULL_REQUEST_TEMPLATE.md + - .github/workflows/linting_comment.yml + - assets/email_template.html + - pyproject.toml - LICENSE - .github/workflows/linting.yml - lib/NfcoreTemplate.groovy diff --git a/README.md b/README.md index d1784ea..e4e9621 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ [![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml index c5a8bbf..fde2e2e 100755 --- a/assets/github_testing/test.yaml +++ b/assets/github_testing/test.yaml @@ -1,15 +1,17 @@ -assembly_path: /home/runner/work/ascc/ascc/asccTinyTest/assembly/Pyoeliiyoelii17XNL_assembly.fa -assembly_title: asccTinyTest -pacbio_barcodes: /home/runner/work/ascc/ascc/pacbio_barcode/pacbio_adaptors.fa -pacbio_multiplexing_barcode_names: "bc2008,bc2009" -reads_path: /home/runner/work/ascc/ascc/asccTinyTest/pacbio +assembly_path: 
/home/runner/work/ascc/ascc/asccTinyTest_V2/assembly/pyoelii_tiny_testfile_with_adapters.fa +assembly_title: asccTinyTest_V2 +reads_path: /home/runner/work/ascc/ascc/asccTinyTest_V2/pacbio/ reads_type: "hifi" +pacbio_barcodes: /home/runner/work/ascc/ascc/pacbio_barcode/pacbio_adaptors.fa +pacbio_multiplexing_barcode_names: "bc2001,bc2009" sci_name: "Plasmodium yoelii yoelii 17XNL" taxid: 352914 -mito_fasta_path: /home/runner/work/ascc/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_mitochondrion_ncbi.fa -plastid_fasta_path: /home/runner/work/ascc/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_apicoplast_ncbi.fa +mito_fasta_path: /home/runner/work/ascc/ascc/asccTinyTest_V2/organellar/Pyoeliiyoelii17XNL_mitochondrion_ncbi.fa +plastid_fasta_path: /home/runner/work/ascc/ascc/asccTinyTest_V2/organellar/Pyoeliiyoelii17XNL_apicoplast_ncbi.fa kmer_len: 7 -## Below this point will need updating as more subworkflows are built +dimensionality_reduction_methods: "pca,random_trees" +# all available methods +# "pca,umap,t-sne,isomap,lle_standard,lle_hessian,lle_modified,mds,se,random_trees,kernel_pca,pca_svd,autoencoder_sigmoid,autoencoder_linear,autoencoder_selu,autoencoder_relu,nmf" nt_database: /home/runner/work/ascc/ascc/NT_database/ nt_database_prefix: 18S_fungal_sequences nt_kraken_db_path: /home/runner/work/ascc/ascc/kraken2/kraken2 @@ -20,7 +22,8 @@ busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/ diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd -vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen +vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen/ seqkit: sliding: 6000 window: 100000 +n_neighbours: 13 diff --git a/assets/static-args.yaml b/assets/static-args.yaml new file mode 100755 index 0000000..6ae120e --- /dev/null +++ b/assets/static-args.yaml @@ -0,0 +1,3 @@ +kmer_size: 7 +n_neighbors_setting: 13 +autoencoder_epochs_count: -1 diff --git a/assets/test.yaml b/assets/test.yaml index e9eafa2..122db32 100755 --- a/assets/test.yaml +++ b/assets/test.yaml @@ -1,14 +1,17 @@ -assembly_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20231114_pyoelii_vecscreen/ref/PlasmoDB-58_Pyoeliiyoelii17XNL_Genome_with_adapters2_fh2.fasta -assembly_title: asccTinyTest -reads_path: /lustre/scratch123/tol/resources/treeval/treeval-testdata/asccTinyTest/pacbio/ +assembly_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest_V2/assembly/pyoelii_tiny_testfile_with_adapters.fa +assembly_title: asccTinyTest_V2 +reads_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest_V2/pacbio/ reads_type: "hifi" pacbio_barcodes: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/assets/pacbio_adaptors.fa pacbio_multiplexing_barcode_names: "bc2008,bc2009" sci_name: "Plasmodium yoelii yoelii 17XNL" taxid: 352914 -mito_fasta_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_mitochondrion_ncbi.fa -plastid_fasta_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_apicoplast_ncbi.fa +mito_fasta_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest_V2/organellar/Pyoeliiyoelii17XNL_mitochondrion_ncbi.fa +plastid_fasta_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest_V2/organellar/Pyoeliiyoelii17XNL_apicoplast_ncbi.fa kmer_len: 7 
+dimensionality_reduction_methods: "pca,random_trees" +# all available methods +# "pca,umap,t-sne,isomap,lle_standard,lle_hessian,lle_modified,mds,se,random_trees,kernel_pca,pca_svd,autoencoder_sigmoid,autoencoder_linear,autoencoder_selu,autoencoder_relu,nmf" nt_database: /data/blastdb/Supported/NT/202308/dbv4/ nt_database_prefix: nt nt_kraken_db_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/nt/nt @@ -17,9 +20,10 @@ ncbi_taxonomy_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdu ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-08-27/lineages fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb -vecscreen_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/vecscreen_database +vecscreen_database_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/vecscreen/ diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/uniprot/uniprot_reference_proteomes_with_taxonnames.dmnd diamond_nr_database_path: /lustre/scratch123/tol/resources/nr/latest/nr.dmnd seqkit: sliding: 100000 window: 6000 +n_neighbours: 13 diff --git a/bin/VSlistTo1HitPerLine.py b/bin/VSlistTo1HitPerLine.py index 87ea39d..42d45a3 100755 --- a/bin/VSlistTo1HitPerLine.py +++ b/bin/VSlistTo1HitPerLine.py @@ -5,8 +5,8 @@ This script converts the VecScreen text list output to one line giving the coordinates for each vector segment in the format: VecScreen_Category ID_string start_position end_position -The default is to report Strong, Moderate, and Weak matches and also segments of Suspect Origin. Reporting of any category can be suppressed by including ---skip_reporting_suspect_hits, --skip_reporting_weak_hits, --skip_reporting_moderate_hits or --skip_reporting_strong_hits on the command line. +The default is to report Strong, Moderate, and Weak matches and also segments of Suspect Origin. Reporting of any category can be suppressed by including +--skip_reporting_suspect_hits, --skip_reporting_weak_hits, --skip_reporting_moderate_hits or --skip_reporting_strong_hits on the command line. "No hits" will be reported for any Query sequence that had no matches in any of the selected categories, unless --skip_reporting_no_hits is included on the command line. VecScreen errors will be reported unless --skip_reporting_errors is included on the command line. 
Usage: diff --git a/bin/get_kmers_counts.py b/bin/get_kmers_counts.py new file mode 100755 index 0000000..808f578 --- /dev/null +++ b/bin/get_kmers_counts.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +""" +Script for counting kmer frequencies per sequence in a FASTA file +Output (STDOUT): kmer counts as a CSV table +Developed by Eerik Aunin (ea10@sanger.ac.uk) +""" + +import argparse +import general_purpose_functions as gpf +import kcounter +from collections import OrderedDict +import pandas as pd + + +def main(fasta_path, out_path, kmer_size): + fasta_data = gpf.read_fasta_in_chunks(fasta_path) + nucleotides_collection = list() + for header, seq in fasta_data: + seq = seq.upper() + seq_len = len(seq) + nucleotides_dict = kcounter.count_kmers(seq, kmer_size, canonical_kmers=True) + relative_counts_dict = OrderedDict() + relative_counts_dict["header"] = header + relative_counts_dict["seq_len"] = seq_len + for kmer in nucleotides_dict: + kmer_relative_count = nucleotides_dict[kmer] / seq_len + relative_counts_dict[kmer] = kmer_relative_count + nucleotides_collection.append(relative_counts_dict) + df = pd.DataFrame(nucleotides_collection) + df = df.fillna(0) + df.to_csv(out_path, index=False) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("-v", "--version", action="version", version="1.0") + parser.add_argument("fasta_path", type=str, help="Path to input FASTA file") + parser.add_argument("out_path", type=str, help="Path for output CSV file") + parser.add_argument("--kmer_size", type=int, help="kmer size (bp). Default: 7", default=7) + args = parser.parse_args() + main(args.fasta_path, args.out_path, args.kmer_size) diff --git a/bin/kmer_count_dim_reduction.py b/bin/kmer_count_dim_reduction.py new file mode 100755 index 0000000..b72d5ba --- /dev/null +++ b/bin/kmer_count_dim_reduction.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python3 + +""" +Script for kmer count dimensionality reduction, using multiple methods. 
+ +Developed by Eerik Aunin (ea10@sanger.ac.uk) + +Modified by Damon-Lee Pointon (dp24) +""" + +# https://machinelearningmastery.com/dimensionality-reduction-algorithms-with-python/ +# https://scikit-learn.org/stable/auto_examples/manifold/plot_compare_methods.html#sphx-glr-auto-examples-manifold-plot-compare-methods-py + +import os + +matplot_config_path = os.getcwd() + "/matplotconfig/" +fonts_config_path = os.getcwd() + "/fontconfig/" +numba_config_path = os.getcwd() + "/numbaconfig/" + +for i in [fonts_config_path, matplot_config_path, numba_config_path]: + if not os.path.exists(i): + os.makedirs(i) + +os.environ["OSFONTDIR"] = fonts_config_path +os.environ["MPLCONFIGDIR"] = matplot_config_path +os.environ["NUMBA_CACHE_DIR"] = numba_config_path + +os.chmod(fonts_config_path, 0o0755) +os.chmod(matplot_config_path, 0o0755) +os.chmod(numba_config_path, 0o0755) + +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import numpy as np + +from sklearn import preprocessing +import pandas as pd + +import umap +from sklearn import decomposition +from sklearn import manifold +from sklearn import ensemble + +import tensorflow as tf +import random + +import sys +import argparse +from pathlib import Path +from datetime import datetime +from functools import reduce + + +# plt.style.use("ggplot") + + +def reset_random_seeds(): + # https://stackoverflow.com/questions/32419510/how-to-get-reproducible-results-in-keras + os.environ["PYTHONHASHSEED"] = str(1) + tf.random.set_seed(1) + np.random.seed(1) + random.seed(1) + + +def print_timestamp(process_name): + sys.stderr.write("{}: running {}\n".format(datetime.now(), process_name)) + + +def embedding_to_dataframe(embedding, seq_names, plot_title): + """ + Function for taking a dimensionality reduction embedding and converting it into a pandas dataframe + """ + plot_title_underscores = plot_title.replace(" ", "_") + embedding_df = pd.DataFrame(embedding) + embedding_df.columns = ["embedding_x_" + plot_title_underscores, "embedding_y_" + plot_title_underscores] + embedding_df["scaff"] = seq_names + return embedding_df + + +def load_data(kmer_counts_file): + """ + Loads kmer counts from an input file, returns the counts as a pandas dataframe and sequence names as a list + """ + if os.path.isfile(kmer_counts_file) is False: + sys.stderr.write("kmer counts file ({}) was not found\n".format(kmer_counts_file)) + sys.exit(1) + if os.stat(kmer_counts_file).st_size == 0: + sys.stderr.write("kmer counts file ({}) is empty\n".format(kmer_counts_file)) + sys.exit(1) + data = pd.read_csv(kmer_counts_file, sep=",", index_col=0) + df_row_count = data.shape[0] + if df_row_count == 0: + sys.stderr.write("The kmer counts table ({}) has no rows\n".format(kmer_counts_file)) + sys.exit(1) + data_to_cluster = data.drop(["seq_len"], axis=1) + df = data_to_cluster + + seq_names = df.index.values.tolist() + scalar = preprocessing.MinMaxScaler() + scalar.fit(data_to_cluster) + df = scalar.transform(data_to_cluster) + return df, seq_names + + +def make_loss_plot(model_history, out_folder, activation_mode): + """ + Creates the loss vs epoch plot and saves it as a file + """ + plt.figure(figsize=(12, 6)) + plt.plot(model_history.history["loss"]) + plt.title("Autoencoder {}: Loss vs. 
Epoch".format(activation_mode)) + plt.ylabel("Loss") + plt.xlabel("Epoch") + plt.grid(True) + plt.savefig(out_folder + "/autoencoder_{}_loss_vs_epoch_plot.png".format(activation_mode)) + plt.close() + + +def run_autoencoder(df, nr_of_epochs, activation_mode, out_folder): + """ + Function for running autoencoder for dimensionality reduction of kmer counts + """ + input_dim = df.shape[1] + # This is the dimension of the latent space (encoding space) + latent_dim = 2 + + encoder = tf.keras.models.Sequential( + [ + tf.keras.layers.Dense(128, activation=activation_mode, input_shape=(input_dim,)), + tf.keras.layers.Dense(64, activation=activation_mode), + tf.keras.layers.Dense(32, activation=activation_mode), + tf.keras.layers.Dense(latent_dim, activation=activation_mode), + ] + ) + + decoder = tf.keras.models.Sequential( + [ + tf.keras.layers.Dense(64, activation=activation_mode, input_shape=(latent_dim,)), + tf.keras.layers.Dense(128, activation=activation_mode), + tf.keras.layers.Dense(256, activation=activation_mode), + tf.keras.layers.Dense(input_dim, activation=None), + ] + ) + + autoencoder = tf.keras.models.Model(inputs=encoder.input, outputs=decoder(encoder.output)) + autoencoder.compile(loss="mse", optimizer="adam") + model_history = autoencoder.fit(df, df, epochs=nr_of_epochs, batch_size=32, verbose=0) + if out_folder is not None: + make_loss_plot(model_history, out_folder, activation_mode) + embedding = encoder(df) + embedding = embedding.numpy() + return embedding + + +def run_tsne(df): + """ + Function for running t-SNE for dimensionality reduction of kmer counts + """ + print_timestamp("t-SNE") + t_sne_perplexity = 30 + df_row_count = df.shape[0] + if df_row_count < 2: + sys.stderr.write("Cannot run t-SNE because the kmer counts dataframe has only {} rows\n".format(df_row_count)) + else: + if t_sne_perplexity >= df_row_count: + t_sne_perplexity = round(df_row_count / 2) # t-SNE perplexity must be less than n_samples + if t_sne_perplexity < 1: + t_sne_perplexity = 1 + sys.stderr.write("Set t-SNE perplexity value to {}\n".format(t_sne_perplexity)) + + embedding = manifold.TSNE(n_components=2).fit_transform(df) + return embedding + + +def run_dim_reduction(df, selected_method, n_neighbors_setting=-1, autoencoder_epochs_count=-1, out_folder=None): + embedding = None + embedding_title = None + if selected_method == "pca": + embedding_title = "PCA" + print_timestamp(embedding_title) + pca = decomposition.PCA(n_components=2) + embedding = pca.fit_transform(df) + + elif selected_method == "umap": + embedding_title = "UMAP" + print_timestamp(embedding_title) + reducer = umap.UMAP( + random_state=123, n_neighbors=n_neighbors_setting, min_dist=0.1, n_components=2, metric="euclidean" + ) + embedding = reducer.fit_transform(df) + + elif selected_method == "t-sne": + embedding_title = "t-SNE" + embedding = run_tsne(df) + + elif selected_method == "isomap": + embedding_title = "isomap" + print_timestamp(embedding_title) + embedding = manifold.Isomap(n_neighbors=n_neighbors_setting, n_components=2).fit_transform(df) + + elif selected_method == "lle_standard": + embedding_title = "LLE standard" + print_timestamp(embedding_title) + embedding = manifold.LocallyLinearEmbedding( + n_neighbors=n_neighbors_setting, n_components=2, eigen_solver="dense", method="standard" + ).fit_transform(df) + + elif selected_method == "lle_hessian": + embedding_title = "LLE standard" + print_timestamp(embedding_title) + embedding = manifold.LocallyLinearEmbedding( + n_neighbors=n_neighbors_setting, n_components=2, 
eigen_solver="dense", method="hessian" + ).fit_transform(df) + + elif selected_method == "lle_modified": + embedding_title = "LLE modified" + print_timestamp(embedding_title) + embedding = manifold.LocallyLinearEmbedding( + n_neighbors=n_neighbors_setting, n_components=2, eigen_solver="dense", method="modified" + ).fit_transform(df) + + elif selected_method == "mds": + embedding_title = "MDS" + print_timestamp(embedding_title) + embedding = manifold.MDS(max_iter=100, n_init=1, n_components=2).fit_transform(df) + + elif selected_method == "se": + embedding_title = "SE" + print_timestamp(embedding_title) + embedding = manifold.SpectralEmbedding(n_neighbors=n_neighbors_setting, n_components=2).fit_transform(df) + + elif selected_method == "random_trees": + # https://scikit-learn.org/stable/auto_examples/manifold/plot_lle_digits.html#sphx-glr-auto-examples-manifold-plot-lle-digits-py + embedding_title = "random_trees" + print_timestamp(embedding_title) + hasher = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0, max_depth=5) + x_transformed = hasher.fit_transform(df) + pca = decomposition.TruncatedSVD(n_components=2) + embedding = pca.fit_transform(x_transformed) + + elif selected_method == "kernel_pca": + # https://www.tutorialspoint.com/scikit_learn/scikit_learn_dimensionality_reduction_using_pca.htm + embedding_title = "KernelPCA" + print_timestamp(embedding_title) + kernel_pca = decomposition.KernelPCA(n_components=2, kernel="sigmoid") + embedding = kernel_pca.fit_transform(df) + + elif selected_method == "pca_svd": + # https://www.tutorialspoint.com/scikit_learn/scikit_learn_dimensionality_reduction_using_pca.htm + embedding_title = "PCA with SVD solver" + print_timestamp(embedding_title) + pca = decomposition.PCA(n_components=2, svd_solver="randomized") + embedding = pca.fit_transform(df) + + elif selected_method == "autoencoder_sigmoid": + print_timestamp("Autoencoder sigmoid") + # https://ekamperi.github.io/machine%20learning/2021/01/21/encoder-decoder-model.html + embedding = run_autoencoder(df, autoencoder_epochs_count, "sigmoid", out_folder) + + elif selected_method == "autoencoder_linear": + # https://ekamperi.github.io/machine%20learning/2021/01/21/encoder-decoder-model.html + embedding_title = "Autoencoder linear" + print_timestamp(embedding_title) + embedding = run_autoencoder(df, autoencoder_epochs_count, "linear", out_folder) + + elif selected_method == "autoencoder_tanh": + # https://ekamperi.github.io/machine%20learning/2021/01/21/encoder-decoder-model.html + embedding_title = "Autoencoder tanh" + print_timestamp(embedding_title) + embedding = run_autoencoder(df, autoencoder_epochs_count, "tanh", out_folder) + + elif selected_method == "autoencoder_selu": + # https://ekamperi.github.io/machine%20learning/2021/01/21/encoder-decoder-model.html + embedding_title = "Autoencoder selu" + print_timestamp(embedding_title) + embedding = run_autoencoder(df, autoencoder_epochs_count, "selu", out_folder) + + elif selected_method == "autoencoder_relu": + # https://ekamperi.github.io/machine%20learning/2021/01/21/encoder-decoder-model.html + embedding_title = "Autoencoder relu" + print_timestamp(embedding_title) + embedding = run_autoencoder(df, autoencoder_epochs_count, "relu", out_folder) + + elif selected_method == "nmf": + # https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html + embedding_title = "Non-Negative Matrix Factorization" + print_timestamp(embedding_title) + nmf_model = decomposition.NMF(n_components=2, init="random", random_state=0) 
+        embedding = nmf_model.fit_transform(df)
+    return embedding, embedding_title
+
+
+def main(kmer_counts_file, out_folder, selected_methods, n_neighbors_setting, autoencoder_epochs_count):
+    reset_random_seeds()
+
+    df, seq_names = load_data(kmer_counts_file)
+    df_row_count = df.shape[0]
+    if df_row_count == 1:
+        sys.stderr.write(
+            "Skipping the dimensionality reduction of kmer counts, as the kmer counts table has only one row\n"
+        )
+        # Generate a placeholder file so that Nextflow still finds an output file when the input is too small for analysis
+        with open("EMPTY_kmers_dim_reduction_embeddings.csv", "w") as empty_file:
+            empty_file.write("FILE TOO SMALL FOR ANALYSIS")
+        sys.exit(0)
+
+    Path(out_folder).mkdir(parents=True, exist_ok=True)
+    selected_methods = selected_methods.split(",")
+
+    if autoencoder_epochs_count == -1:
+        autoencoder_epochs_count = df_row_count
+
+    embeddings_list = list()
+    for selected_method in selected_methods:
+        embedding, embedding_title = run_dim_reduction(
+            df,
+            selected_method,
+            n_neighbors_setting=n_neighbors_setting,
+            autoencoder_epochs_count=autoencoder_epochs_count,
+            out_folder=out_folder,
+        )
+        embedding_df = embedding_to_dataframe(embedding, seq_names, embedding_title)
+        embeddings_list.append(embedding_df)
+
+    out_df = reduce(
+        lambda left, right: pd.merge(left, right, on=["scaff"], how="outer"),  # Merge DataFrames in list
+        embeddings_list,
+    )
+    out_path = out_folder + "/kmers_dim_reduction_embeddings.csv"
+    out_df.to_csv(out_path, index=False)
+
+    os._exit(0)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("-v", "--version", action="version", version="1.0")
+    parser.add_argument("kmer_counts_file", type=str, help="Path to input CSV file with kmer counts")
+    parser.add_argument("out_folder", type=str, help="Path to folder where output files will be written")
+    parser.add_argument(
+        "--selected_methods",
+        type=str,
+        help="Comma separated string with the selected dimensionality reduction methods",
+        default="pca",
+    )
+    parser.add_argument(
+        "--n_neighbors_setting",
+        type=int,
+        help="n_neighbors parameter value for the methods that have this parameter (default: 13)",
+        default=13,
+    )
+    parser.add_argument(
+        "--autoencoder_epochs_count",
+        type=int,
+        help="Autoencoder epochs count (default: assign automatically as 3x number of sequences in input)",
+        default=-1,
+    )
+    args = parser.parse_args()
+    main(
+        args.kmer_counts_file,
+        args.out_folder,
+        args.selected_methods,
+        args.n_neighbors_setting,
+        args.autoencoder_epochs_count,
+    )
diff --git a/bin/kmer_count_dim_reduction_combine_csv.py b/bin/kmer_count_dim_reduction_combine_csv.py
new file mode 100755
index 0000000..e7a912d
--- /dev/null
+++ b/bin/kmer_count_dim_reduction_combine_csv.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+
+"""
+Script to combine kmer count dimensionality reduction embedding csv files.
+Adapted from Eerik Aunin (ea10@sanger.ac.uk), by William Eagles (we3@sanger.ac.uk)
+"""
+
+import pandas as pd
+import argparse
+from functools import reduce
+
+
+def main(embedding_paths, output_name):
+    # Load csv as dataframes.
+    dataframe_list = list()
+    for embedding_path in embedding_paths:
+        df = pd.read_csv(f"{embedding_path}/kmers_dim_reduction_embeddings.csv")
+        dataframe_list.append(df)
+
+    # Merge DataFrames in list
+    out_df = reduce(lambda left, right: pd.merge(left, right, on=["scaff"], how="outer"), dataframe_list)
+
+    # Output as csv.
+ out_df.to_csv(output_name, index=False) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("-v", "--version", action="version", version="1.0") + parser.add_argument( + "-i", "--kmer_counts_file", nargs="+", type=str, help="List of paths to input CSV file with kmer counts" + ) + parser.add_argument("-o", "--output_name", type=str, help="Path to folder where output file will be written") + args = parser.parse_args() + main(args.kmer_counts_file, args.output_name) diff --git a/bin/reformat_diamond_outfmt6.py b/bin/reformat_diamond_outfmt6.py index 8d88cf4..91c916f 100755 --- a/bin/reformat_diamond_outfmt6.py +++ b/bin/reformat_diamond_outfmt6.py @@ -15,7 +15,7 @@ import general_purpose_functions as gpf if sys.argv[1] == "-v": - print("1.0.0") + print("1.0") sys.exit() else: in_path = sys.argv[1] diff --git a/conf/test.config b/conf/test.config index 5ae71a6..ce291c2 100644 --- a/conf/test.config +++ b/conf/test.config @@ -15,8 +15,8 @@ params { config_profile_description = 'Minimal test dataset to check pipeline function' // Limit resources so that this can run on GitHub Actions - max_cpus = 2 - max_memory = '6.GB' + max_cpus = 4 + max_memory = '10.GB' max_time = '6.h' input = "${projectDir}/assets/github_testing/test.yaml" diff --git a/modules/local/get_kmer_counts.nf b/modules/local/get_kmer_counts.nf new file mode 100755 index 0000000..b0742fe --- /dev/null +++ b/modules/local/get_kmer_counts.nf @@ -0,0 +1,53 @@ +process GET_KMER_COUNTS { + tag "$meta.id" + label 'process_low' + + conda "conda-forge::python=3.9 conda-forge::kcounter=0.1.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-d9b9334c4a0777c7722cbcc301a10ddc8684a85f:796f1cea66b7720fa29583d1f6b90404f90dde2f-0' : + 'biocontainers/mulled-v2-d9b9334c4a0777c7722cbcc301a10ddc8684a85f:796f1cea66b7720fa29583d1f6b90404f90dde2f-0' }" + + input: + tuple val(meta), path(input_fasta) + val kmer_size + + output: + tuple val(meta), path( "*_kmer_counts.csv" ) , emit: csv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def KCOUNTER_VERSION = "0.1.1" + def prefix = args.ext.prefix ?: "${meta.id}" + """ + get_kmers_counts.py \\ + $input_fasta \\ + ${prefix}_kmer_counts.csv \\ + --kmer_size $kmer_size + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + kcounter: $KCOUNTER_VERSION + general_purpose_functions.py: \$(general_purpose_functions.py --version | cut -d' ' -f2) + get_kmers_counts.py: \$(get_kmers_counts.py --version | cut -d' ' -f2) + END_VERSIONS + """ + + stub: + def KCOUNTER_VERSION = "0.1.1" + def prefix = args.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_kmer_counts.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + kcounter: $KCOUNTER_VERSION + general_purpose_functions.py: \$(general_purpose_functions.py --version | cut -d' ' -f2) + get_kmers_counts.py: \$(get_kmers_counts.py --version | cut -d' ' -f2) + END_VERSIONS + """ +} diff --git a/modules/local/kmer_count_dim_reduction.nf b/modules/local/kmer_count_dim_reduction.nf new file mode 100755 index 0000000..f80b06b --- /dev/null +++ b/modules/local/kmer_count_dim_reduction.nf @@ -0,0 +1,65 @@ +process KMER_COUNT_DIM_REDUCTION { + + tag "$meta.id" + label 'process_medium' + + conda "conda-forge::python=3.9 
conda-forge::pandas=2.2.1 conda-forge::tensorflow=2.15.0 conda-forge::scikit-learn=1.4.1 conda-forge::umap-learn=0.5.5 conda-forge::matplotlib=3.8.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-ac95cc1cb32439236d915b38af3e056ce8eb0375:34bd58763b84a3ea2f6c60b87b4858b2f80c070e-0' :
+        'biocontainers/mulled-v2-ac95cc1cb32439236d915b38af3e056ce8eb0375:34bd58763b84a3ea2f6c60b87b4858b2f80c070e-0' }"
+
+    input:
+    tuple val(meta), path(kmer_counts_file)
+    val dimensionality_reduction_method
+    val n_neighbors_setting
+    val autoencoder_epochs_count
+
+    output:
+    path '*_kmers_dim_reduction_embeddings.csv', emit: csv
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def UMAP_VERSION = "0.5.5"
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+
+    kmer_count_dim_reduction.py \\
+        $kmer_counts_file \\
+        ${prefix}_${dimensionality_reduction_method}_kmers_dim_reduction_embeddings.csv \\
+        --selected_methods $dimensionality_reduction_method \\
+        --n_neighbors_setting $n_neighbors_setting \\
+        --autoencoder_epochs_count $autoencoder_epochs_count
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+        pandas: \$(python3 -c 'import pandas; print(pandas.__version__)')
+        tensorflow: \$(python3 -c 'import tensorflow; print(tensorflow.__version__)')
+        scikit-learn: \$(python3 -c "import sklearn; print(sklearn.__version__)")
+        umap-learn: $UMAP_VERSION
+        matplotlib: \$(python3 -c 'import matplotlib; print(matplotlib.__version__)')
+        kmer_count_dim_reduction.py: \$(kmer_count_dim_reduction.py --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+
+    stub:
+    def UMAP_VERSION = "0.5.5"
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}_kmers_dim_reduction_embeddings.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python3 --version | sed 's/Python //g')
+        pandas: \$(python3 -c 'import pandas; print(pandas.__version__)')
+        tensorflow: \$(python3 -c 'import tensorflow; print(tensorflow.__version__)')
+        scikit-learn: \$(python3 -c "import sklearn; print(sklearn.__version__)")
+        umap-learn: $UMAP_VERSION
+        matplotlib: \$(python3 -c 'import matplotlib; print(matplotlib.__version__)')
+        kmer_count_dim_reduction.py: \$(kmer_count_dim_reduction.py --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/kmer_count_dim_reduction_combine_csv.nf b/modules/local/kmer_count_dim_reduction_combine_csv.nf
new file mode 100755
index 0000000..cdd2d00
--- /dev/null
+++ b/modules/local/kmer_count_dim_reduction_combine_csv.nf
@@ -0,0 +1,47 @@
+process KMER_COUNT_DIM_REDUCTION_COMBINE_CSV {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "conda-forge::python=3.9 conda-forge::pandas=1.5.2"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/pandas:1.5.2' :
+        'quay.io/biocontainers/pandas:1.5.2' }"
+
+    input:
+    tuple val(meta), path(input_files)
+
+    output:
+    path '*_kmers_dim_reduction_embeddings_combined.csv', emit: csv
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    kmer_count_dim_reduction_combine_csv.py \\
+        -o ${prefix}_kmers_dim_reduction_embeddings_combined.csv \\
+        -i $input_files
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+        pandas: \$(python3 -c 'import pandas; print(pandas.__version__)')
+        kmer_count_dim_reduction_combine_csv.py: \$(kmer_count_dim_reduction_combine_csv.py --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}_kmers_dim_reduction_embeddings_combined.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+        pandas: \$(python3 -c 'import pandas; print(pandas.__version__)')
+        kmer_count_dim_reduction_combine_csv.py: \$(kmer_count_dim_reduction_combine_csv.py --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+}
diff --git a/nextflow.config b/nextflow.config
index eb78343..422de16 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -12,6 +12,7 @@ params {
     // TODO nf-core: Specify your pipeline's command line flags
     // Input options
     input                      = null
+    outdir                     = "results"
     tracedir                   = "${outdir}/pipeline_info/"

     // MultiQC options
@@ -218,7 +219,7 @@ manifest {
     homePage        = 'https://github.com/sanger-tol/ascc'
     description     = """Pipeline for detecting cobionts and contaminants in genome assemblies"""
     mainScript      = 'main.nf'
-    nextflowVersion = '!>=23.04.0'
+    nextflowVersion = '!>=22.10.1'
     version         = '1.0dev'
     doi             = ''
 }
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 68ba8f1..f93d416 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -195,7 +195,7 @@
         "tracedir": {
             "type": "string",
             "description": "Directory to keep pipeline Nextflow logs and reports.",
-            "default": "${params.outdir}/treeval_info",
+            "default": "results/pipeline_info/",
             "fa_icon": "fas fa-cogs",
             "hidden": true
         },
diff --git a/subworkflows/local/get_kmers_profile.nf b/subworkflows/local/get_kmers_profile.nf
new file mode 100755
index 0000000..de11712
--- /dev/null
+++ b/subworkflows/local/get_kmers_profile.nf
@@ -0,0 +1,96 @@
+#!/usr/bin/env nextflow
+
+//
+// MODULE IMPORT BLOCK
+//
+include { GET_KMER_COUNTS } from '../../modules/local/get_kmer_counts'
+include { KMER_COUNT_DIM_REDUCTION } from '../../modules/local/kmer_count_dim_reduction'
+include { KMER_COUNT_DIM_REDUCTION_COMBINE_CSV } from '../../modules/local/kmer_count_dim_reduction_combine_csv'
+
+workflow GET_KMERS_PROFILE {
+    take:
+    assembly_fasta // Channel [ val(meta), path(file) ]
+    kmer_size // Channel [ val(integer) ]
+    dimensionality_reduction_methods // Channel [ val(string) ]
+    n_neighbors_setting // Channel [ val(string) ]
+    autoencoder_epochs_count // Channel [ val(integer) ]
+
+    main:
+    ch_versions = Channel.empty()
+
+    //
+    // LOGIC: REFACTORING REFERENCE TUPLE
+    //
+    assembly_fasta
+        .map{ meta, file ->
+            tuple([id: meta.id,
+                single_end: true],
+                file
+            )
+        }
+        .set { modified_input }
+
+    //
+    // MODULE: PRODUCE KMER COUNTS (USING KCOUNTER)
+    //
+    GET_KMER_COUNTS (
+        modified_input, // val(meta), path(reads)
+        kmer_size // val kmer_size
+    )
+    ch_versions = ch_versions.mix(GET_KMER_COUNTS.out.versions)
+
+    //
+    // LOGIC: CREATE CHANNEL OF
LIST OF SELECTED METHODS + // + dimensionality_reduction_methods + .splitCsv(sep: ',') + .flatten() + .combine( GET_KMER_COUNTS.out.csv ) + .combine( n_neighbors_setting ) + .combine( autoencoder_epochs_count ) + .multiMap { dim_red_method, csv_meta, csv_val, nn_setting, ae_setting -> + kmer_csv : tuple ([ id: csv_meta.id, single_end: true], csv_val) + dimensionality_reduction_method : dim_red_method + n_neighbors_setting : nn_setting + autoencoder_epochs_count : ae_setting + } + .set{ ch_methods } + + // + // MODULE: DIMENSIONALITY REDUCTION OF KMER COUNTS, USING SPECIFIED METHODS + // + KMER_COUNT_DIM_REDUCTION ( + ch_methods.kmer_csv, // val(meta), path(kmer_counts) + ch_methods.dimensionality_reduction_method, // val dimensionality_reduction_method + ch_methods.n_neighbors_setting, // val n_neighbors_setting + ch_methods.autoencoder_epochs_count // val autoencoder_epochs_count + ) + ch_versions = ch_versions.mix(KMER_COUNT_DIM_REDUCTION.out.versions) + + // + // LOGIC: PREPARING INPUT TO COMBINE OUTPUT CSV FOR EACH METHOD + // + KMER_COUNT_DIM_REDUCTION.out.csv + .collect() + .map { file -> + tuple ( + [ id : file[0].toString().split('/')[-1].split('_')[0] ], // Change sample ID + file + ) + } + .set { collected_files_for_combine } + + collected_files_for_combine.view() + + // + // MODULE: COMBINE OUTPUTS OF MULTIPLE METHODS + // + KMER_COUNT_DIM_REDUCTION_COMBINE_CSV ( + collected_files_for_combine + ) + ch_versions = ch_versions.mix(KMER_COUNT_DIM_REDUCTION_COMBINE_CSV.out.versions) + + emit: + combined_csv = KMER_COUNT_DIM_REDUCTION_COMBINE_CSV.out.csv + versions = ch_versions.ifEmpty(null) +} diff --git a/subworkflows/local/run_fcsgx.nf b/subworkflows/local/run_fcsgx.nf index a748f67..3e8c6d4 100644 --- a/subworkflows/local/run_fcsgx.nf +++ b/subworkflows/local/run_fcsgx.nf @@ -25,7 +25,8 @@ workflow RUN_FCSGX { reference .combine( taxid ) .map { it -> - tuple ([taxid: it[2]], + tuple ([ id: it[0].id, + taxid: it[2] ], it[1]) } .set { reference_with_taxid } diff --git a/subworkflows/local/run_read_coverage.nf b/subworkflows/local/run_read_coverage.nf index 377d17d..001d2f4 100644 --- a/subworkflows/local/run_read_coverage.nf +++ b/subworkflows/local/run_read_coverage.nf @@ -12,17 +12,17 @@ workflow RUN_READ_COVERAGE { reference_tuple // Channel [ val(meta), path(file) ] assembly_path // Channel path(file) pacbio_tuple // Channel [ val(meta), val( str ) ] - platform // Channel val( str ) + platform // Channel val( str ) main: ch_versions = Channel.empty() ch_align_bam = Channel.empty() - + // // LOGIC: CHECK IF THE INPUT READ FILE IS PAIRED END OR SINGLE END BASED ON THE READ PLATFORM, THEN RUN MINIMAP // - if ( platform.filter { it == "hifi" } || platform.filter { it == "clr" } || platform.filter { it == "ont" } ) { + if ( platform.filter { it == "hifi" } || platform.filter { it == "clr" } || platform.filter { it == "ont" } ) { SE_MAPPING ( reference_tuple, assembly_path, @@ -34,7 +34,7 @@ workflow RUN_READ_COVERAGE { .mix( SE_MAPPING.out.mapped_bam ) .set { merged_bam } } - else if ( platform.filter { it == "illumina" } ) { + else if ( platform.filter { it == "illumina" } ) { PE_MAPPING ( reference_tuple, @@ -73,7 +73,7 @@ workflow RUN_READ_COVERAGE { ) ch_versions = ch_versions.mix( SAMTOOLS_DEPTH.out.versions ) - + // // MODULE: COMPUTE THE AVERAGE COVERAGE // diff --git a/subworkflows/local/run_vecscreen.nf b/subworkflows/local/run_vecscreen.nf index 27a38b1..fce4395 100644 --- a/subworkflows/local/run_vecscreen.nf +++ b/subworkflows/local/run_vecscreen.nf @@ -24,6 +24,8 @@ 
workflow RUN_VECSCREEN { // // MODULE: RUNS NCBI VECSCREEN // + vecscreen_database_tuple.view() + NCBITOOLS_VECSCREEN( CHUNK_ASSEMBLY_FOR_VECSCREEN.out.chunked_assembly, vecscreen_database_tuple diff --git a/subworkflows/local/se_mapping.nf b/subworkflows/local/se_mapping.nf index 1c3f69c..7ce3657 100644 --- a/subworkflows/local/se_mapping.nf +++ b/subworkflows/local/se_mapping.nf @@ -72,7 +72,6 @@ workflow SE_MAPPING { se_input.bool_cigar_bam ) ch_bams = MINIMAP2_ALIGN_SE.out.bam - ch_bams .map { meta, file -> @@ -110,7 +109,7 @@ process GrabFiles { tuple val(meta), path("in") output: - tuple val(meta), path("in/*.{fa,fasta}.{gz}") + tuple val(meta), path("in/*.{fa,fasta,fna}.{gz}") "true" } \ No newline at end of file diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf index 964f29d..6ed9ea4 100644 --- a/subworkflows/local/yaml_input.nf +++ b/subworkflows/local/yaml_input.nf @@ -21,7 +21,7 @@ workflow YAML_INPUT { .multiMap { data -> assembly_title: ( data.assembly_title ) reads_path: ( data.reads_path ) - reads_type: ( data.reads_type ) + reads_type: ( data.reads_type ) assembly_path: ( file(data.assembly_path) ) pacbio_barcodes: ( file(data.pacbio_barcodes) ) pacbio_multiplexing_barcode_names: ( data.pacbio_multiplexing_barcode_names) @@ -33,16 +33,17 @@ workflow YAML_INPUT { reference_proteomes: ( data.reference_proteomes ) nt_kraken_db_path: ( data.nt_kraken_db_path ) kmer_len: ( data.kmer_len ) + dimensionality_reduction_methods: ( data.dimensionality_reduction_methods ) fcs_gx_database_path: ( data.fcs_gx_database_path ) ncbi_taxonomy_path: ( data.ncbi_taxonomy_path ) ncbi_rankedlineage_path: ( data.ncbi_rankedlineage_path ) - ncbi_accessionids: ( data.ncbi_accessionids_folder ) + ncbi_accessionids: ( data.ncbi_accessionids_folder ) busco_lineages_folder: ( data.busco_lineages_folder ) seqkit_values: ( data.seqkit ) diamond_uniprot_database_path: ( data.diamond_uniprot_database_path ) diamond_nr_database_path: ( data.diamond_nr_database_path ) vecscreen_database_path: ( data.vecscreen_database_path ) - + neighbours: ( data.n_neighbours ) } .set{ group } @@ -127,10 +128,13 @@ workflow YAML_INPUT { vecscreen_database_path = ch_vecscreen seqkit_sliding = seqkit.sliding_value seqkit_window = seqkit.window_value + dimensionality_reduction_methods = group.dimensionality_reduction_methods mito_tuple = ch_mito mito_var = "mitochondrial_genome" plastid_tuple = ch_plastid plastid_var = "plastid_genome" + kmer_len = group.kmer_len + n_neighbours = group.neighbours versions = ch_versions.ifEmpty(null) } diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 014b34a..a80033d 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -32,6 +32,7 @@ include { RUN_FCSADAPTOR } from '../subworkflows/ include { RUN_NT_KRAKEN } from '../subworkflows/local/run_nt_kraken' include { RUN_FCSGX } from '../subworkflows/local/run_fcsgx' include { PACBIO_BARCODE_CHECK } from '../subworkflows/local/pacbio_barcode_check' +include { GET_KMERS_PROFILE } from '../subworkflows/local/get_kmers_profile' include { RUN_READ_COVERAGE } from '../subworkflows/local/run_read_coverage' include { RUN_VECSCREEN } from '../subworkflows/local/run_vecscreen' include { ORGANELLAR_BLAST as PLASTID_ORGANELLAR_BLAST } from '../subworkflows/local/organellar_blast' @@ -63,6 +64,9 @@ workflow ASCC { main: ch_versions = Channel.empty() + ch_out_merge = Channel.empty() + + workflow_steps = params.steps.split(",") input_ch = Channel.fromPath(params.input, checkIfExists: true) @@ -80,6 +84,7 @@ workflow ASCC 
{ GC_CONTENT ( YAML_INPUT.out.reference_tuple ) + ch_out_merge = ch_out_merge.mix(GC_CONTENT.out.txt) ch_versions = ch_versions.mix(GC_CONTENT.out.versions) // @@ -92,12 +97,40 @@ workflow ASCC { ch_versions = ch_versions.mix(GENERATE_GENOME.out.versions) // - // SUBWORKFLOW: EXTRACT RESULTS HITS FROM TIARA + // SUBWORKFLOW: COUNT KMERS, THEN REDUCE DIMENSIONS USING SELECTED METHODS // - EXTRACT_TIARA_HITS ( + + if ( workflow_steps.contains('kmers') || workflow_steps.contains('ALL')) { + GENERATE_GENOME.out.reference_tuple - ) - ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions) + .map { meta, file -> + tuple ( + meta, + file, + file.countFasta() * 3 + ) + } + .set {autoencoder_epochs_count} + + GET_KMERS_PROFILE ( + GENERATE_GENOME.out.reference_tuple, + YAML_INPUT.out.kmer_len, + YAML_INPUT.out.dimensionality_reduction_methods, + YAML_INPUT.out.n_neighbours, + autoencoder_epochs_count.map{it -> it[2]} + ) + ch_versions = ch_versions.mix(GET_KMERS_PROFILE.out.versions) + } + + // + // SUBWORKFLOW: EXTRACT RESULTS HITS FROM TIARA + // + if ( workflow_steps.contains('tiara') ) { + EXTRACT_TIARA_HITS ( + GENERATE_GENOME.out.reference_tuple + ) + ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions) + } // // LOGIC: INJECT SLIDING WINDOW VALUES INTO REFERENCE @@ -117,104 +150,136 @@ workflow ASCC { // // SUBWORKFLOW: EXTRACT RESULTS HITS FROM NT-BLAST // - EXTRACT_NT_BLAST ( - modified_input, - YAML_INPUT.out.nt_database, - YAML_INPUT.out.ncbi_accessions, - YAML_INPUT.out.ncbi_rankedlineage_path - ) - ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions) - - // - // LOGIC: CHECK WHETHER THERE IS A MITO AND BRANCH - // - YAML_INPUT.out.mito_tuple - .branch { meta, check -> - valid: check != "NO MITO" - invalid: check == "NO MITO" - } - .set { mito_check } + if ( workflow_steps.contains('nt_blast') || workflow_steps.contains('ALL') ) { + EXTRACT_NT_BLAST ( + modified_input, + YAML_INPUT.out.nt_database, + YAML_INPUT.out.ncbi_accessions, + YAML_INPUT.out.ncbi_rankedlineage_path + ) + ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions) + } + if ( workflow_steps.contains('mito') || workflow_steps.contains('ALL') ) { + // + // LOGIC: CHECK WHETHER THERE IS A MITO AND BRANCH + // + YAML_INPUT.out.mito_tuple + .branch { meta, check -> + valid: check != "NO MITO" + invalid: check == "NO MITO" + } + .set { mito_check } + + + // + // SUBWORKFLOW: BLASTING FOR MITO ASSEMBLIES IN GENOME + // + MITO_ORGANELLAR_BLAST ( + YAML_INPUT.out.reference_tuple, + YAML_INPUT.out.mito_var, + mito_check.valid + ) + ch_versions = ch_versions.mix(MITO_ORGANELLAR_BLAST.out.versions) + } - // - // SUBWORKFLOW: BLASTING FOR MITO ASSEMBLIES IN GENOME - // - MITO_ORGANELLAR_BLAST ( - YAML_INPUT.out.reference_tuple, - YAML_INPUT.out.mito_var, - mito_check.valid - ) - ch_versions = ch_versions.mix(MITO_ORGANELLAR_BLAST.out.versions) + if ( workflow_steps.contains('chloro') || workflow_steps.contains('ALL') ) { + + // + // LOGIC: CHECK WHETHER THERE IS A PLASTID AND BRANCH + // + YAML_INPUT.out.plastid_tuple + .branch { meta, check -> + valid: check != "NO PLASTID" + invalid: check == "NO PLASTID" + } + .set { plastid_check } + + // + // SUBWORKFLOW: BLASTING FOR PLASTID ASSEMBLIES IN GENOME + // + PLASTID_ORGANELLAR_BLAST ( + YAML_INPUT.out.reference_tuple, + YAML_INPUT.out.plastid_var, + plastid_check.valid + ) + ch_versions = ch_versions.mix(PLASTID_ORGANELLAR_BLAST.out.versions) + } - // - // LOGIC: CHECK WHETHER THERE IS A PLASTID AND BRANCH - // - YAML_INPUT.out.plastid_tuple - .branch 
{ meta, check -> - valid: check != "NO PLASTID" - invalid: check == "NO PLASTID" - } - .set { plastid_check } - - // - // SUBWORKFLOW: BLASTING FOR PLASTID ASSEMBLIES IN GENOME - // - PLASTID_ORGANELLAR_BLAST ( - YAML_INPUT.out.reference_tuple, - YAML_INPUT.out.plastid_var, - plastid_check.valid - ) - ch_versions = ch_versions.mix(PLASTID_ORGANELLAR_BLAST.out.versions) // // SUBWORKFLOW: // - RUN_FCSADAPTOR ( - YAML_INPUT.out.reference_tuple - ) - ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) - + if ( workflow_steps.contains('fcs_adapt') || workflow_steps.contains('ALL') ) { + RUN_FCSADAPTOR ( + YAML_INPUT.out.reference_tuple + ) + ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) + } // // SUBWORKFLOW: // - RUN_FCSGX ( - YAML_INPUT.out.reference_tuple, - YAML_INPUT.out.fcs_gx_database_path, - YAML_INPUT.out.taxid, - YAML_INPUT.out.ncbi_rankedlineage_path - ) - ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) + if ( workflow_steps.contains('fcsgx') || workflow_steps.contains('ALL') ) { + RUN_FCSGX ( + YAML_INPUT.out.reference_tuple, + YAML_INPUT.out.fcs_gx_database_path, + YAML_INPUT.out.taxid, + YAML_INPUT.out.ncbi_rankedlineage_path + ) + ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) + } // // SUBWORKFLOW: IDENTITY PACBIO BARCODES IN INPUT DATA // - PACBIO_BARCODE_CHECK ( - YAML_INPUT.out.reference_tuple, - YAML_INPUT.out.pacbio_tuple, - YAML_INPUT.out.pacbio_barcodes, - YAML_INPUT.out.pacbio_multiplex_codes - ) - ch_versions = ch_versions.mix(PACBIO_BARCODE_CHECK.out.versions) + if ( workflow_steps.contains('barcodes') || workflow_steps.contains('ALL') ) { + PACBIO_BARCODE_CHECK ( + YAML_INPUT.out.reference_tuple, + YAML_INPUT.out.pacbio_tuple, + YAML_INPUT.out.pacbio_barcodes, + YAML_INPUT.out.pacbio_multiplex_codes + ) + ch_versions = ch_versions.mix(PACBIO_BARCODE_CHECK.out.versions) + } // // SUBWORKFLOW: CALCULATE AVERAGE READ COVERAGE // - RUN_READ_COVERAGE ( - YAML_INPUT.out.reference_tuple, - YAML_INPUT.out.assembly_path, - YAML_INPUT.out.pacbio_tuple, - YAML_INPUT.out.reads_type - ) - ch_versions = ch_versions.mix(RUN_READ_COVERAGE.out.versions) + if ( workflow_steps.contains('coverage') || workflow_steps.contains('ALL') ) { + RUN_READ_COVERAGE ( + YAML_INPUT.out.reference_tuple, + YAML_INPUT.out.assembly_path, + YAML_INPUT.out.pacbio_tuple, + YAML_INPUT.out.reads_type + ) + ch_versions = ch_versions.mix(RUN_READ_COVERAGE.out.versions) + } // // SUBWORKFLOW: COLLECT SOFTWARE VERSIONS // - RUN_VECSCREEN ( - GENERATE_GENOME.out.reference_tuple, - YAML_INPUT.out.vecscreen_database_path - ) - ch_versions = ch_versions.mix(RUN_VECSCREEN.out.versions) + if ( workflow_steps.contains('vecscreen') || workflow_steps.contains('ALL') ) { + RUN_VECSCREEN ( + GENERATE_GENOME.out.reference_tuple, + YAML_INPUT.out.vecscreen_database_path + ) + ch_versions = ch_versions.mix(RUN_VECSCREEN.out.versions) + } + + // + // SUBWORKFLOW: Run the kraken classifier + // + if ( workflow_steps.contains('kraken') || workflow_steps.contains('ALL') ) { + RUN_NT_KRAKEN( + GENERATE_GENOME.out.reference_tuple, + YAML_INPUT.out.nt_kraken_db_path, + YAML_INPUT.out.ncbi_rankedlineage_path + ) + } + + // mix the outputs of the outpuutting process so that we can + // insert them into the one process to create the btk and the merged report + // much like the versions channel // // SUBWORKFLOW: Collates version data from prior subworflows
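For orientation, the kmer-profile steps that this PR wires together (bin/get_kmers_counts.py feeding bin/kmer_count_dim_reduction.py via the GET_KMERS_PROFILE subworkflow) boil down to: count canonical kmers per sequence, normalise by sequence length, scale the counts, project them to two dimensions, and write the embeddings with a "scaff" column for later merging. The following is a minimal, hypothetical Python sketch of that flow showing only the PCA branch; read_fasta() is a simplified stand-in for general_purpose_functions.read_fasta_in_chunks(), and the file names are placeholders rather than paths used by the pipeline.

#!/usr/bin/env python3
# Sketch of the kmer-count -> dimensionality-reduction flow (PCA branch only).
import kcounter
import pandas as pd
from sklearn import decomposition, preprocessing


def read_fasta(fasta_path):
    """Yield (header, sequence) tuples from a FASTA file (simplified stand-in)."""
    header, seq_chunks = None, []
    with open(fasta_path) as fasta:
        for line in fasta:
            line = line.strip()
            if line.startswith(">"):
                if header is not None:
                    yield header, "".join(seq_chunks)
                header, seq_chunks = line[1:].split()[0], []
            elif line:
                seq_chunks.append(line)
    if header is not None:
        yield header, "".join(seq_chunks)


def kmer_profile(fasta_path, out_path, kmer_size=7):
    # Relative canonical kmer counts per sequence, as in get_kmers_counts.py
    rows = []
    for header, seq in read_fasta(fasta_path):
        seq = seq.upper()
        counts = kcounter.count_kmers(seq, kmer_size, canonical_kmers=True)
        row = {"header": header}
        row.update({kmer: count / len(seq) for kmer, count in counts.items()})
        rows.append(row)
    df = pd.DataFrame(rows).fillna(0).set_index("header")

    # Scale, then reduce to two dimensions with PCA (the "pca" branch of run_dim_reduction);
    # the real script skips this step when only one sequence is present.
    scaled = preprocessing.MinMaxScaler().fit_transform(df)
    embedding = decomposition.PCA(n_components=2).fit_transform(scaled)

    out_df = pd.DataFrame(embedding, columns=["embedding_x_PCA", "embedding_y_PCA"])
    out_df["scaff"] = df.index.values
    out_df.to_csv(out_path, index=False)


if __name__ == "__main__":
    kmer_profile("assembly.fa", "kmers_dim_reduction_embeddings.csv")

In the pipeline itself each method listed in dimensionality_reduction_methods runs as a separate KMER_COUNT_DIM_REDUCTION task, and KMER_COUNT_DIM_REDUCTION_COMBINE_CSV outer-merges the per-method CSVs on the "scaff" column.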