
Commit

Merge pull request #24 from treangenlab/empty-fasta-bug
Distinguish between unmapped and unclassified
kdc10 authored Sep 19, 2024
2 parents 43944a8 + 7c1f51c commit 63b6e69
Showing 2 changed files with 72 additions and 67 deletions.
9 changes: 6 additions & 3 deletions README.md
@@ -97,14 +97,17 @@ Each step of the installation process is expected to take a matter of seconds.
|--output-dir| ./results| directory for output results|
|--output-basename| stem of input_file(s)| basename of all output files saved in output-dir; default utilizes basename from input file(s)|
|--keep-files| FALSE| keep working files in output-dir (alignments [.sam], reads of specified length [.fa])|
|--keep-counts| FALSE| include estimated read counts for each species in output|
|--keep-counts| FALSE| include estimated read counts for each species in output*|
|--keep-read-assignments| FALSE| output .tsv file with read assignment distributions: each row as an input read; each entry as the likelihood it is derived from that taxon (taxid is the column header); each row sums to 1|
|--output-unclassified| FALSE| generate a separate output file of unclassified sequences|
|--output-unclassified| FALSE| generate two additional sequence files of unmapped and unclassified mapped input reads**|
|--threads| 3| number of threads utilized by minimap2|

*Estimated read counts are based on likelihood probabilities and therefore may not be integer values. They are calculated as the product of estimated relative abundance and total classified reads.
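
A minimal sketch of that calculation, using made-up abundances and read totals rather than actual Emu output:

```python
# Made-up values for illustration only.
abundances = {"562": 0.613, "1613": 0.387}  # tax_id -> estimated relative abundance
mapped_reads = 1000
mapped_unclassified = 50                    # mapped, but only to species pruned by the EM step
classified_reads = mapped_reads - mapped_unclassified  # 950

estimated_counts = {tid: ab * classified_reads for tid, ab in abundances.items()}
# estimated_counts ≈ {'562': 582.35, '1613': 367.65} -- counts need not be integers
```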

**Here, "unmapped reads" are reads for which minimap2 produced no alignment to the provided database. "Unclassified mapped reads" are reads that aligned only to database sequences of species that Emu's algorithm presumes are absent from the sample (likely due to low overall abundance).
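
In set terms, the two extra output files correspond to the following (a sketch with hypothetical read IDs, mirroring the logic added to the `emu` script in this commit):

```python
# Hypothetical read IDs for illustration only.
all_reads = {"r1", "r2", "r3", "r4", "r5"}
mapped = {"r1", "r2", "r3", "r4"}           # at least one minimap2 alignment to the database
classified = {"r1", "r2", "r3"}             # assigned to a species retained by the EM algorithm

unmapped = all_reads - mapped               # {'r5'}: no database alignment at all
unclassified_mapped = mapped - classified   # {'r4'}: aligned only to species Emu presumes absent
```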

Note: If you are experiencing heavy RAM consumption, first upgrade minimap2 to at least v2.22. If memory is still an issue, try decreasing the number of secondary alignments evaluated for each read (--N).
Note: Estimated read counts are based on likelihood probabilities and therefore may not be integer values.


### Build Custom Database

130 changes: 66 additions & 64 deletions emu
@@ -77,37 +77,22 @@ def get_align_len(alignment):
return sum(alignment.get_cigar_stats()[0][cigar_op] for cigar_op in CIGAR_OPS_ALL)


def output_unclassified(sam_path, seq_output_path, input_type):
""" Gather list of input sequences that are deemed unclassified by minimap2,
and output as seq_output_path
def output_sequences(in_path, seq_output_path, input_type, keep_ids):
""" Output specified list of sequences from input_file based on sequence id
sam_path (str): path to sam file generated by minimap2
fa_output_path (str): output path for fasta of unclassified sequences
in_path (str): path to input fasta or fastq
seq_output_path (str): output path stem for the fasta/q of selected sequences
input_type (str): fasta or fastq
keep_ids (set): set of sequence id strings
"""
output_records = []
# pylint: disable=maybe-no-member
sam_pysam = pysam.AlignmentFile(sam_path)
if input_type == "fasta":
for alignment in sam_pysam.fetch():
if alignment.reference_name is None:
record = SeqRecord(
Seq(alignment.query_sequence),
id=alignment.query_name,
name=alignment.query_name,
)
output_records.append(record)
SeqIO.write(output_records, "{}.fa".format(seq_output_path), "fasta")
elif input_type == "fastq":
for alignment in sam_pysam.fetch():
if alignment.reference_name is None:
record = SeqRecord(
Seq(alignment.query_sequence),
id=alignment.query_name,
name=alignment.query_name,
letter_annotations={"phred_quality": alignment.query_qualities}
)
output_records.append(record)
SeqIO.write(output_records, "{}.fq".format(seq_output_path), "fastq")
# Filter and write only the sequences with matching read IDs
with open(in_path, "r", encoding="utf-8") as in_file, \
open("{}.{}".format(seq_output_path, input_type), "w", encoding="utf-8") as out_seq_file:
# Parse the FASTA/FASTQ file and filter by read IDs
filtered_sequences = (seq for seq in SeqIO.parse(in_file, input_type) if seq.id in keep_ids)

# Write the filtered sequences to the output file
SeqIO.write(filtered_sequences, out_seq_file, input_type)
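# Hypothetical usage of output_sequences (the path, output stem, and read IDs are
# placeholders for illustration, not values from this commit):
#     unmapped_ids = {"read_0007", "read_0042"}
#     output_sequences("sample.fastq", "results/sample_unmapped", "fastq", unmapped_ids)
# would write results/sample_unmapped.fastq containing only the two listed reads.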


def get_cigar_op_log_probabilities(sam_path):
@@ -177,11 +162,11 @@ def log_prob_rgs_dict(sam_path, log_p_cigar_op, dict_longest_align, p_cigar_op_z
dict_longest_align (dict[str]:(int)): dict of max alignment length for each query read
zero_locs(list(int)): list of indices (int) where probability == 0
return ({[str,int]:float}): dict[(query_name,ref_tax_id)]=log(L(query_name|ref_tax_id))
int: unassigned read count
int: assigned read count
set(str): query names of unmapped reads
set(str): query names of mapped reads
"""
# calculate log(L(read|seq)) for all alignments
log_p_rgs, unassigned_set = {}, set()
log_p_rgs, unmapped_set = {}, set()
# pylint: disable=maybe-no-member
sam_filename = pysam.AlignmentFile(sam_path, 'rb')

@@ -206,7 +191,7 @@ def log_prob_rgs_dict(sam_path, log_p_cigar_op, dict_longest_align, p_cigar_op_z
log_p_rgs[query_name][1][logprgs_idx] = log_score

else:
unassigned_set.add(alignment.query_name)
unmapped_set.add(alignment.query_name)
else:
for alignment in sam_filename.fetch():
align_len = get_align_len(alignment)
@@ -229,18 +214,18 @@ def log_prob_rgs_dict(sam_path, log_p_cigar_op, dict_longest_align, p_cigar_op_z
if log_p_rgs[query_name][1][logprgs_idx] < log_score:
log_p_rgs[query_name][1][logprgs_idx] = log_score
else:
unassigned_set.add(alignment.query_name)
unmapped_set.add(alignment.query_name)

assigned_reads = set(log_p_rgs.keys())
unassigned_reads = unassigned_set - assigned_reads
unassigned_count = len(unassigned_reads)
stdout.write(f"Unassigned read count: {unassigned_count}\n")
mapped_set = set(log_p_rgs.keys())
unmapped_set = unmapped_set - mapped_set
unmapped_count = len(unmapped_set)
stdout.write(f"Unmapped read count: {unmapped_count}\n")

## remove low likelihood alignments?
## remove if p(r|s) < 0.01
#min_p_thresh = math.log(0.01)
#log_p_rgs = {r_map: val for r_map, val in log_p_rgs.items() if val > min_p_thresh}
return log_p_rgs, unassigned_count, len(assigned_reads)
return log_p_rgs, unmapped_set, mapped_set


def expectation_maximization(log_p_rgs, freq):
@@ -305,14 +290,14 @@ def expectation_maximization_iterations(log_p_rgs, db_ids, lli_thresh, input_thr
"""
n_db = len(db_ids)
n_reads = len(log_p_rgs)
stdout.write("Assigned read count: {}\n".format(n_reads))
stdout.write("Mapped read count: {}\n".format(n_reads))
# check if there are enough reads
if n_reads == 0:
raise ValueError("0 reads assigned")
raise ValueError("0 reads mapped")
freq, counter = dict.fromkeys(db_ids, 1 / n_db), 1

# set output abundance threshold
freq_thresh = 1/n_reads
freq_thresh = 1/(n_reads + 1)
if n_reads > 1000:
freq_thresh = 10/n_reads
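# Illustration with hypothetical read totals: 500 mapped reads give a threshold of
# 1/(500 + 1) ≈ 0.002, while 5,000 mapped reads give 10/5000 = 0.002.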

@@ -375,28 +360,32 @@ def lineage_dict_from_tid(taxid, nodes_dict, names_dict):
return tuple(lineage_list)


def freq_to_lineage_df(freq, tsv_output_path, taxonomy_df, assigned_count,
unassigned_count, counts=False):
def freq_to_lineage_df(freq, tsv_output_path, taxonomy_df, mapped_count,
unmapped_count, mapped_unclassified_count, counts=False):
"""Converts freq to a pandas df where each row contains abundance and tax lineage for
classified species in f.keys(). Stores df as .tsv file in tsv_output_path.
freq{int:float}: dict[species_tax_id]:estimated likelihood species is present in sample
tsv_output_path(str): path to output .tsv file
taxonomy_df(df): pandas df of all db sequence taxonomy with index 'tax_id'
assigned_count(int): number of assigned reads
unassigned_count(int): number of unassigned reads
mapped_count(int): number of mapped reads
unmapped_count(int): number of unmapped reads
mapped_unclassified_count(int): number of reads that mapped but were left unclassified
because they aligned only to low-abundance species
counts(boolean): True if include estimated counts in output .tsv file
returns(df): pandas df with lineage and abundances for values in f
"""
#add tax lineage for values in freq
results_df = pd.DataFrame(zip(list(freq.keys()) + ['unassigned'],
list(freq.values()) + [0]),
results_df = pd.DataFrame(zip(list(freq.keys()) + ['unmapped', 'mapped_unclassified'],
list(freq.values()) + [0, 0]),
columns=["tax_id", "abundance"]).set_index('tax_id')
results_df = results_df.join(taxonomy_df, how='left').reset_index()
#add in the estimated count values for the assigned and unassigned counts
#add in the estimated count values for the mapped and unmapped counts
if counts:
counts_series = pd.concat([(results_df["abundance"] * assigned_count)[:-1],
pd.Series(unassigned_count)], ignore_index=True)
classified_count = mapped_count - mapped_unclassified_count
counts_series = pd.concat([(results_df["abundance"] * classified_count)[:-2],
pd.Series(unmapped_count), pd.Series(mapped_unclassified_count)],
ignore_index=True)
results_df["estimated counts"] = counts_series
results_df.to_csv("{}.tsv".format(tsv_output_path), sep='\t', index=False)
return results_df
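# Illustration with hypothetical inputs: freq = {"562": 0.613, "1613": 0.387},
# mapped_count = 1000, unmapped_count = 20, mapped_unclassified_count = 50, counts = True.
# Then classified_count = 1000 - 50 = 950 and the written .tsv contains (lineage columns omitted):
#   tax_id                abundance   estimated counts
#   562                   0.613       582.35
#   1613                  0.387       367.65
#   unmapped              0           20
#   mapped_unclassified   0           50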
@@ -666,11 +655,15 @@ def combine_outputs(dir_path, rank, split_files=False, count_table=False):
df_sample = pd.read_csv(os.path.join(dir_path, file), sep='\t', dtype=str)
df_sample[[metric]] = df_sample[[metric]].apply(pd.to_numeric)
if rank in df_sample.columns and metric in df_sample.columns:
keep_ranks_sample = [value for value in keep_ranks if value in set(df_sample.columns)] #check which keep_ranks are in df_sample
if df_sample.at[len(df_sample.index)-1, 'tax_id'] == 'unassigned':
df_sample.at[len(df_sample.index)-1, rank] = 'unassigned'
df_sample_reduced = df_sample[keep_ranks_sample + [metric]].rename(columns={metric: name})
df_sample_reduced = df_sample_reduced.groupby(keep_ranks_sample, dropna=False).sum().reset_index() #sum metric within df_sample_reduced if same tax lineage
#check which keep_ranks are in df_sample
keep_ranks_sample = [value for value in keep_ranks
if value in set(df_sample.columns)]
if df_sample.at[len(df_sample.index)-1, 'tax_id'] == 'unmapped':
df_sample.at[len(df_sample.index)-1, rank] = 'unmapped'
df_sample_reduced = df_sample[keep_ranks_sample +
[metric]].rename(columns={metric: name})
df_sample_reduced = df_sample_reduced.groupby(keep_ranks_sample, dropna=False)\
.sum().reset_index() #sum metric within df_sample_reduced if same tax lineage
df_sample_reduced = df_sample_reduced.astype(object)
df_sample_reduced[[name]] = df_sample_reduced[[name]].apply(pd.to_numeric)
df_combined_full = pd.merge(df_combined_full, df_sample_reduced, how='outer')
@@ -679,7 +672,8 @@ def combine_outputs(dir_path, rank, split_files=False, count_table=False):
if count_table:
filename_suffix = "-counts"
if split_files:
abundance_out_path = os.path.join(dir_path, "emu-combined-abundance-{}{}.tsv".format(rank, filename_suffix))
abundance_out_path = os.path.join(dir_path, "emu-combined-abundance-{}{}.tsv"
.format(rank, filename_suffix))
tax_out_path = os.path.join(dir_path, "emu-combined-taxonomy-{}.tsv".format(rank))
stdout.write("Combined taxonomy table generated: {}\n".format(tax_out_path))
df_combined_full[keep_ranks].to_csv(tax_out_path, sep='\t', index=False)
@@ -693,7 +687,7 @@ def combine_outputs(dir_path, rank, split_files=False, count_table=False):
return df_combined_full

if __name__ == "__main__":
__version__ = "3.4.6"
__version__ = "3.5.0"
parser = argparse.ArgumentParser()
parser.add_argument('--version', '-v', action='version', version='%(prog)s v' + __version__)
subparsers = parser.add_subparsers(dest="subparser_name", help='sub-commands')
@@ -772,7 +766,7 @@ if __name__ == "__main__":
help='collapsed taxonomic rank')

combine_parser = subparsers.add_parser("combine-outputs",
help="Combine Emu rel abundance outputs to a single table")
help="Combine Emu rel abundance outputs to a single table")
combine_parser.add_argument(
'dir_path', type=str,
help='path to directory containing Emu output files')
@@ -810,13 +804,17 @@ if __name__ == "__main__":
SAM_FILE = generate_alignments(args.input_file, out_file, args.db)
log_prob_cigar_op, locs_p_cigar_zero, longest_align_dict = \
get_cigar_op_log_probabilities(SAM_FILE)
log_prob_rgs, counts_unassigned, counts_assigned = log_prob_rgs_dict(
log_prob_rgs, set_unmapped, set_mapped = log_prob_rgs_dict(
SAM_FILE, log_prob_cigar_op, longest_align_dict, locs_p_cigar_zero)
f_full, f_set_thresh, read_dist = expectation_maximization_iterations(log_prob_rgs,
db_species_tids,
.01, args.min_abundance)
classified_reads = {read_id for inner_dict in read_dist.values() for read_id in inner_dict}
mapped_unclassified = set_mapped - classified_reads
stdout.write(f"Unclassified mapped read count: {len(mapped_unclassified)}\n")
freq_to_lineage_df(f_full, "{}_rel-abundance".format(out_file), df_taxonomy,
counts_assigned, counts_unassigned, args.keep_counts)
len(set_mapped), len(set_unmapped), len(mapped_unclassified),
args.keep_counts)


# output read assignment distributions as a tsv
@@ -828,15 +826,19 @@ if __name__ == "__main__":
freq_to_lineage_df(
f_set_thresh,
"{}_rel-abundance-threshold-{}".format(out_file, args.min_abundance),
df_taxonomy, counts_assigned, counts_unassigned, args.keep_counts)
df_taxonomy, len(set_mapped), len(set_unmapped), len(mapped_unclassified),
args.keep_counts)

# gather input sequences that are unclassified according to minimap2
# gather input sequences that are unmapped or mapped but unclassified
if args.output_unclassified:
ext = os.path.splitext(args.input_file[0])[-1]
INPUT_FILETYPE = "fasta"
if ext in [".fastq", ".fq"]:
INPUT_FILETYPE = "fastq"
output_unclassified(SAM_FILE, "{}_unclassified".format(out_file), INPUT_FILETYPE)
output_sequences(args.input_file[0], "{}_unmapped".format(out_file), INPUT_FILETYPE,
set_unmapped)
output_sequences(args.input_file[0], "{}_unclassified_mapped".format(out_file),
INPUT_FILETYPE, mapped_unclassified)

# clean up extra file
if not args.keep_files:
