-
Notifications
You must be signed in to change notification settings - Fork 1
/
fadu.jl
304 lines (261 loc) · 12.3 KB
/
fadu.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
#!/usr/bin/env julia
"""
FADU.jl - Feature Aggregate Depth Utility.
Description - Generate fractional counts of alignments that map to non-overlapping portions of genes
Requires (version number listed is earliest supported version):
Julia - v1.7
* Packages *
BGZFStreams.jl - v0.3.2
GenomicFeatures.jl - v2.1.0
GFF3 - v0.2.3
Indexes.jl - v0.1.3
StructArrays.jl - v0.6.17
XAM.jl - v0.3.1
BED.jl - v0.3.0
By: Shaun Adkins ([email protected])
Matthew Chung ([email protected])
"""
# The macro on modules and functions makes the code available to all worker processes
using ArgParse
using Logging
using XAM: BAM, SAM
using GenomicFeatures
using GFF3
using Indexes
using Printf
using StructArrays
using BED
include("alignment_overlaps.jl")
include("bam_record.jl")
include("feature_counts.jl")
include("gff_feature.jl")
const VERSION_NUMBER = "1.9" # Version number of the FADU program
const MAX_FRAGMENT_SIZE = 1000 # Maximum size of fragment. If exceeded, fragment will be considered two reads
const EM_ITER_DEFAULT = 1 # Number of iterations to do EM-algorithm
### Functions to clean up, then move to their proper "include" script
function remove_excluded_regions!(uniq_coords::Dict{String, Dict}, excluded_regions_file, stranded::Bool=false)
"""Remove any coordinates that overlap the regions in the BED file. Edits "uniq_coords" in-place."""
@info("Removing alignments that overlap excluded regions...")
# if file does not exist, throw error
isfile(excluded_regions_file) || throw(SystemError("Excluded regions file does not seem to exist. Please check supplied path."))
exclude_regions = open(collect, BED.Reader, excluded_regions_file)
exclude_coords = Dict{String, Dict}()
# Read in all excluded regions, by strand if the "stranded" argument is passed
for region in exclude_regions
# Store chrom since that should match seqid
seqid = BED.chrom(region)
if !haskey(uniq_coords, seqid)
continue
end
# "get!" will create the key if it doesn't exist in addition to returning the value
seqid_exclude_coords = get!(exclude_coords, seqid, Dict{Char, BitSet}())
start = BED.chromstart(region)
stop = BED.chromend(region)
# Store coordinates to exclude
# If unstranded, then everything goes in "+"
if stranded
strand = convert(Char, BED.strand(region))
union!(get!(seqid_exclude_coords, strand, BitSet()), BitSet(start:stop))
else
union!(get!(seqid_exclude_coords, "+", BitSet()), BitSet(start:stop))
end
end
# Remove any alignments that overlap these regions
for (seqid, strand_coords) in exclude_coords
if haskey(uniq_coords, seqid)
for (strand, coords) in strand_coords
if haskey(uniq_coords[seqid], strand)
setdiff!(uniq_coords[seqid][strand], coords)
end
end
end
end
end
########
# Main #
########
function parse_commandline()
s = ArgParseSettings(description = "Generate counts of reads that map to non-overlapping portions of genes",
prog = "fadu.jl",
version = VERSION_NUMBER,
add_version = true,
add_help = true)
# The macro to add a table of arguments and options to the ArgParseSettings object
@add_arg_table! s begin
"--bam_file", "-b"
help = "Path to BAM file (SAM is not supported)."
metavar = "/path/to/file.bam"
required = true
"--gff3_file", "-g"
help = "Path to GFF3-formatted annotation file (GTF is not supported)."
metavar = "/path/to/annotation.gff3"
required = true
"--output_dir", "-o"
help = "Directory to write the output."
metavar = "/path/to/output/dir/"
required = true
"--no_output_header"
help = "If enabled, do not write the header line to the output file."
action = :store_true
dest_name = "no_header"
"--exclude_regions_file", "-x"
help = "Path to BED file containing regions to exclude from analysis. Any alignments that overlap these regions will be ignored."
metavar = "/path/to/regions.bed"
"--stranded", "-s"
help = "Indicate if BAM reads are from a strand-specific assay. Choose between 'yes', 'no', or 'reverse'."
default = "no"
range_tester = (x->x in ["yes", "no", "reverse"])
"--feature_type", "-f"
help = "Which GFF3/GTF feature type (column 3) to obtain. readcount statistics for. Case-sensitive."
default = "gene"
"--attribute_type", "-a"
help = "Which GFF3/GTF feature type (column 9) to obtain. readcount statistics for. Case-sensitive."
default = "ID"
"--keep_only_proper_pairs", "-p"
help = "If enabled, keep only properly paired reads when performing calculations."
action = :store_true
dest_name = "pp_only"
"--max_fragment_size", "-m"
help = "If the fragment size of properly-paired reads exceeds this value, process pair as single reads instead of as a fragment. Setting this value to 0 will make every fragment pair be processed as two individual reads. If --keep_only_proper_pairs is enabled, then any fragment exceeding this value will be discarded."
default = MAX_FRAGMENT_SIZE
arg_type = Int
range_tester = (x->typemin(UInt)<=x<=typemax(UInt))
"--remove_multimapped", "-M"
help = "If enabled, remove any reads or fragments that are mapped to multiple regions of the genome, indiated by the 'NH' attribute being greater than 1. Otherwise, use EM algorithm to quantify reads after all other reads are counted."
action = :store_true
dest_name = "rm_multimap"
"--em_iterations", "-e"
help = "Number of iterations to perform EM algorithm on multimapped read depth. Only applies if --remove_multimapped flag is not passed (is disabled)"
default = EM_ITER_DEFAULT
arg_type = Int
range_tester = (x->x>0)
dest_name = "em_iter"
"--log_level"
help = "Set the log level. Options are: DEBUG, INFO, WARNING, ERROR, CRITICAL."
default = "INFO"
metavar = "LEVEL"
range_tester = (x->x in ["DEBUG", "INFO", "WARNING", "ERROR"])
"--log_file"
help = "Path to log file. If not specified, log messages will be printed to STDERR."
metavar = "/path/to/logfile.log"
# Will not add log_file or debug options for now
end
# Converts the ArgParseSettings object into key/value pairs
return parse_args(s)
end
function setup_logger(args)
"""Set up logging."""
stream = args["log_file"] === nothing ? stderr : open(args["log_file"], "w")
logleveldict = Dict("DEBUG" => -1000, "INFO" => 0, "WARNING" => 1000, "ERROR" => 2000)
loglevel = get(logleveldict, uppercase(args["log_level"]), 0)
logger = ConsoleLogger(stream, loglevel)
global_logger(logger)
# TODO: prevent debug messages from printing the file and line number
end
function validate_args(args)
"""Validate the passed arguments."""
@info("Validating arguments...")
isfile(args["bam_file"]) || error("BAM file does not exist. Please check supplied path")
isfile(args["gff3_file"]) || error("GFF3 file does not exist. Please check supplied path")
if !isdir(args["output_dir"])
@info("Creating output directory at ", args["output_dir"])
mkdir(args["output_dir"])
end
end
function main()
args = parse_commandline()
setup_logger(args)
try
validate_args(args)
catch e
@error(e)
exit(1)
end
@info("Parsed args:")
for (arg,val) in args
@info(" $arg => $val")
end
@info("Processing annotation features...")
features = try
open(collect, GFF3.Reader, args["gff3_file"]) # Array of GFF3.Record objects
catch e
println(stderr, "ERROR: GFF3 file was unable to be read, due to possible improper formatting. Please consult https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md for how to properly format the GFF3 file. Actual error is below. Program will now exit.\n")
showerror(stderr, e)
exit(1)
end
# Only keep features for the chosen feature type
filter!(x -> GFF3.featuretype(x) == args["feature_type"], features)
@info("Getting unique coordinates per contig and feature...")
uniq_coords = create_uniq_coords_dict(features, args["stranded"])
# If exclude_regions is passed, then remove any alignments that overlap these regions
if args["exclude_regions_file"] !== nothing
try
remove_excluded_regions!(uniq_coords, args["exclude_regions_file"], isstranded(args["stranded"]))
catch e
println(stderr, "ERROR: Excluded regions file was unable to be read, due to possible improper formatting. Please consult https://genome.ucsc.edu/FAQ/FAQformat.html#format1 for how to properly format the BED file. Actual error is below. Program will now exit.\n")
showerror(stderr, e)
exit(1)
end
end
@info("Initializing dictionary of feature count information")
feat_overlaps = create_feat_overlaps_dict(features, uniq_coords, args["attribute_type"], args["stranded"])
bai_file = replace(args["bam_file"], ".bam" => ".bai")
# Attempt to find index file for BAM file in same directory
if !isfile(bai_file)
bai_file = args["bam_file"] * ".bai"
if !isfile(bai_file)
bam = args["bam_file"]
error("Attempted to find index file for BAM file $bam but could not find one")
end
end
@debug("Found BAM index file at $bai_file")
# Open a BAM file and iterate over records overlapping GFF features.
@info("Opening BAM alignment file...")
reader = open(BAM.Reader, args["bam_file"], index = bai_file)
@info("Now finding overlaps between alignment and annotation records...")
multimapped_dict = process_all_feature_overlaps(feat_overlaps, features, reader, args)
close(reader)
# If multimapped reads are kept, use EM algorithm to count and re-add back into the feature counts
if !args["rm_multimap"]
num_multimaps = length(collect(keys(multimapped_dict)))
@debug("Multimapped alignment templates: ", num_multimaps)
featurenames = collect(keys(feat_overlaps))
mm_feat_overlaps = Dict{String, FeatureOverlap}(featurename => initialize_overlap_info(coordinate_set(feat_overlaps[featurename])) for featurename in featurenames)
@info("Counting and adjusting multimapped alignment feature counts via Expectation-Maximization algorithm...")
while args["em_iter"] > 0
@debug("\tEM iterations left: ", args["em_iter"])
adjusted_overlaps = merge_mm_counts(feat_overlaps, mm_feat_overlaps, false)
args["em_iter"] -= 1
mm_feat_overlaps = compute_mm_counts_by_em(adjusted_overlaps, multimapped_dict)
end
# Last iteration the alignment counts are added too
merge_mm_counts!(feat_overlaps, mm_feat_overlaps, true)
end
# Calculate sum of all counts for each feature
totalcounts = calc_totalcounts(feat_overlaps)
@info("Writing counts output to file...")
out_file = joinpath(args["output_dir"], splitext(basename(args["bam_file"]))[1]) * ".counts.txt"
out_f = open(out_file, "w")
if !args["no_header"]
write(out_f, "featureID\tuniq_len\tnum_alignments\tcounts\ttpm\n")
end
# Write output, sorted alphabetically
for featurename in sort(collect(keys(feat_overlaps)))
uniq_len::UInt = length(coordinate_set(feat_overlaps[featurename]))
num_alignments = totalalignments(feat_overlaps[featurename])
feat_counts = featurecounts(feat_overlaps[featurename])
if isnan(feat_counts)
feat_counts = zero(Float32)
end
tpm = calc_tpm(uniq_len, totalcounts, feat_counts)
if isnan(tpm)
tpm = zero(Float32)
end
s = @sprintf("%s\t%i\t%.1f\t%.2f\t%.2f\n", featurename, uniq_len, num_alignments, feat_counts, tpm)
write(out_f, s)
end
close(out_f)
@info("FADU is complete! Exiting!")
end
main()
exit()