From 619f554f250125781c5dd440dcd0be5a49d651f5 Mon Sep 17 00:00:00 2001 From: christinehc Date: Wed, 7 Dec 2022 17:47:23 -0800 Subject: [PATCH] [feat] Implement min file size for use in models Changelog: - `snekmer.utils`: Helper functions added to check file length - File size is now checked to meet minimum threshold for K-fold cross-validation - If min file size is not met, file is excluded from the pipeline and a warning is printed to the user to notify them of file exclusion - Note: warning is not active during a dry run due to the behavior of the `onstart` directive --- snekmer/rules/model.smk | 12 ++++++++ snekmer/scripts/model_model.py | 4 +-- snekmer/utils.py | 51 ++++++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 3 deletions(-) diff --git a/snekmer/rules/model.smk b/snekmer/rules/model.smk index 317d4ce..3edbf40 100644 --- a/snekmer/rules/model.smk +++ b/snekmer/rules/model.smk @@ -52,8 +52,10 @@ unzipped = [ fa.rstrip(".gz") for fa, ext in product(input_files, config["input_file_exts"]) if fa.rstrip(".gz").endswith(f".{ext}") + and skm.utils.check_n_seqs(fa, config["model"]["cv"], show_warning=False) ] + # map extensions to basename (basename.ext.gz -> {basename: ext}) UZ_MAP = { skm.utils.split_file_ext(f)[0]: skm.utils.split_file_ext(f)[1] for f in zipped @@ -62,6 +64,11 @@ FA_MAP = { skm.utils.split_file_ext(f)[0]: skm.utils.split_file_ext(f)[1] for f in unzipped } +# final file map: checks that files are large enough for model building +# FA_MAP = { +# k: v for k, v in f_map.items() if skm.utils.check_n_seqs(k, config["model"]["cv"]) +# } + # get unzipped filenames UZS = [f"{f}.{ext}" for f, ext in UZ_MAP.items()] @@ -82,6 +89,11 @@ out_dir = skm.io.define_output_dir( config["alphabet"], config["k"], nested=config["nested_output"] ) +# show warnings if files excluded +onstart: + [ + skm.utils.check_n_seqs(fa, config["model"]["cv"], show_warning=True) for fa in input_files + ] # define output files to be created by snekmer rule all: diff --git a/snekmer/scripts/model_model.py b/snekmer/scripts/model_model.py index 0e15ac8..0d8886e 100644 --- a/snekmer/scripts/model_model.py +++ b/snekmer/scripts/model_model.py @@ -40,9 +40,7 @@ # ) # print(lookup) with open(snakemake.input.matrix, "rb") as f: - matrix = pickle.load(f) - -data = matrix + data = pickle.load(f) # load all input data and encode rule-wide variables # data = pd.read_csv(input.data) diff --git a/snekmer/utils.py b/snekmer/utils.py index 42e9f40..6f3a333 100644 --- a/snekmer/utils.py +++ b/snekmer/utils.py @@ -7,6 +7,7 @@ import collections.abc import datetime import re + from os.path import basename, splitext from typing import Any, List, Optional, Tuple, Union @@ -200,3 +201,53 @@ def to_feature_matrix( length_array = np.ones(len(array)) array = [np.array(a) / length for a, length in zip(array, length_array)] return np.asarray(array) + + +def count_n_seqs(filename: str) -> int: + """Count number of sequences in a file. + + Parameters + ---------- + filename : str + /path/to/sequence/file.fasta + Other text formatted sequence files (.faa, etc.) also work. + + Returns + ------- + int + Number of sequences contained within the input file. + + """ + return len([1 for line in open(filename) if line.startswith(">")]) + + +def check_n_seqs(filename: str, k: int, show_warning: bool = True) -> bool: + """Check that a file contains at least k sequences. + + Parameters + ---------- + filename : str + /path/to/sequence/file.fasta + Other text formatted sequence files (.faa, etc.) also work. + k : int + Minimum threshold. + show_warning : bool, optional + When True, if len(file) < k, a warning is displayed; + by default True. + + Returns + ------- + bool + True if len(file) < k; False otherwise. + + """ + n_seqs = len([1 for line in open(filename) if line.startswith(">")]) + if (n_seqs < k) and (show_warning): + print( + f"\nWARNING: {filename} contains an insufficient" + " number of sequences for model cross-validation" + " and will thus be excluded from Snekmer modeling." + f" ({k} folds specified in config; {n_seqs}" + " sequence(s) detected.)\n" + ) + return n_seqs >= k