From 029e00fc420784460243513bccb50ab79f72c454 Mon Sep 17 00:00:00 2001 From: MikeAxtell Date: Fri, 12 May 2023 11:43:12 -0400 Subject: [PATCH] version 4.0.2 --- README.md | 32 +++++----- ShortStack | 179 ++++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 151 insertions(+), 60 deletions(-) diff --git a/README.md b/README.md index 91939ee..b078d40 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ Then, download the `ShortStack` script from this github repo. Make it executable # Usage ``` -ShortStack [-h] [--version] --genomefile GENOMEFILE [--knownRNAs KNOWNRNAS] +ShortStack [-h] [--version] --genomefile GENOMEFILE [--known_miRNAs KNOWN_MIRNAS] (--readfile [READFILE ...] | --bamfile [BAMFILE ...]) [--outdir OUTDIR] [--adapter ADAPTER | --autotrim] [--autotrim_key AUTOTRIM_KEY] [--threads THREADS] [--mmap {u,f,r}] [--align_only] [--show_secondaries] @@ -89,7 +89,7 @@ ShortStack [-h] [--version] --genomefile GENOMEFILE [--knownRNAs KNOWNRNAS] - `--bamfile [BAMFILE ...]` : Path(s) to one or more files of aligned sRNA-seq data in BAM format. Multiple files are separated by spaces. BAM files must match the reference genome given in `--genomefile`. ## Recommended -- `--knownRNAs KNOWNRNAS` : Path to FASTA-formatted file of known small RNAs. FASTA must be formatted such that a single RNA sequence is on one line only. ATCGUatcgu characters are acceptable. These RNAs are typically the sequences of known microRNAs; for instance, a FASTA file of mature miRNAs pulled from . Providing these data increases the accuracy of *MIRNA* locus identification. +- `--known_miRNAs KNOWN_MIRNAS` : Path to FASTA-formatted file of known mature miRNAs. FASTA must be formatted such that a single RNA sequence is on one line only. ATCGUatcgu characters are acceptable. These RNAs are typically the sequences of known microRNAs; for instance, a FASTA file of mature miRNAs pulled from . These known miRNA sequences are aligned to the genome and used to nucleate searches for loci that meet all expression-based and secondary structure-based requirements for *MIRNA* locus identification. See also option `--dn_mirna`. - `--outdir OUTDIR` : Specify the name of the directory that will be created for the results. - default: `ShortStack_[time]`, where `[time]` is the Unix time stamp according to the system when the run began. - `--autotrim` : This is strongly recommended **when supplying untrimmed reads via `--readfile`**. The `autotrim` method automatically infers the 3' adapter sequence of the untrimmed reads, and the uses that to coordinate read trimming. However, do **not** use `--autotrim` if your input reads have already been trimmed! @@ -118,7 +118,7 @@ ShortStack [-h] [--version] --genomefile GENOMEFILE [--knownRNAs KNOWNRNAS] - `--locifile LOCIFILE` : Path to a file of pre-determined loci to analyze. This will prevent *de novo* discovery of small RNA loci. The file may be in gff3, bed, or simple tab-delimited format (Chr:Start-Stop[tab]Name). Mutually exclusive with `--locus`. - `--locus LOCUS` : A single locus to analyze, given as a string in the format Chr:Start-Stop (using one-based, inclusive numbering). This will prevent *de novo* discovery of small RNA loci. Mutually exclusive with `--locifile`. - `--nohp` : Switch that prevents search for microRNAs. This saves computational time, but *MIRNA* loci will not be differentiated from other types of small RNA clusters. -- `--dn_mirna` : Switch that activates a *de novo* search for *MIRNA* loci. By default ShortStack will confine *MIRNA* analysis to loci where one or more queries from the `--knownRNAs` file are aligned to the genome. Activating *de novo* searching with `--dn_mirna` does a more comprehensive genome-wide scan for *MIRNA* loci. Loci discovered with `--dn_mirna` that do not overlap already known microRNAs should be treated with caution. +- `--dn_mirna` : Switch that activates a *de novo* search for *MIRNA* loci. By default ShortStack will confine *MIRNA* analysis to loci where one or more queries from the `--known_miRNAs` file are aligned to the genome. Activating *de novo* searching with `--dn_mirna` does a more comprehensive genome-wide scan for *MIRNA* loci. Loci discovered with `--dn_mirna` that do not overlap already known microRNAs should be treated with caution. - `--strand_cutoff STRAND_CUTOFF` : Floating point number that sets the cutoff for standedness. Must be > 0.5 and < 1. - default: 0.8. Loci with >80% reads on the top genomic strand are '+' stranded, loci with <20% reads on the top genomic strand are '-' stranded, and all others are unstranded '.' - `--mincov MINCOV` : Minimum alignment depth, in units of reads per million, required to nucleate a small RNA cluster during *de novo* cluster search. Must be an floating point number > 0. @@ -136,7 +136,7 @@ During the alignment phase ShortStack will potentially write many large, but tem ## CPU and running time All compute-intensive parts of ShortStack are now multi-threaded. Providing more threads via `--threads` generally will decrease run-times in near-linear fashion. Be sure to scale memory with thread use though .. 4-10GB RAM per thread, depending on genome size, seems usually sufficient. -Read alignment is often the most time-consuming portion of the analysis. *MIRNA* identification (triggered by `--knownRNAs` and/or `--dn_mirna`) is also time-consuming. Larger genomes generally run slower compared to smaller genomes. Highly fragmented genome assemblies (*e.g.* very high numbers of chromosomes/scaffolds) can be particularly slow because of the index lookup costs associated with thousands of entries. Consider obtaining and using better genome assemblies, or removing very short scaffolds from highly fragmented genome assemblies. +Read alignment is often the most time-consuming portion of the analysis. *MIRNA* identification (triggered by `--known_miRNAs` and/or `--dn_mirna`) is also time-consuming. Larger genomes generally run slower compared to smaller genomes. Highly fragmented genome assemblies (*e.g.* very high numbers of chromosomes/scaffolds) can be particularly slow because of the index lookup costs associated with thousands of entries. Consider obtaining and using better genome assemblies, or removing very short scaffolds from highly fragmented genome assemblies. # Testing and Examples ## Gather Test Data @@ -161,18 +161,18 @@ fasterq-dump SRR3222443 SRR3222444 SRR3222445 ``` You will now have 3 `.fastq` files of raw (untrimmed) sRNA-seq reads. These data are derived from Col-0 *Arabidopsis thaliana* immature inflorescence tissues (see Wang et al. 2017 ) -### Known RNAs -To get a list of known RNAs, we will use all [miRBase](https://www.mirbase.org) annotated mature miRNAs from miRBase. First, download the `mature.fa` file from miRBase at . Then filter it to get only the `ath` ones (*e.g.* the ones from *A. thaliana*). +### Known miRNAs +To get a list of known miRNAs, we will use all [miRBase](https://www.mirbase.org) annotated mature miRNAs from miRBase. First, download the `mature.fa` file from miRBase at . Then filter it to get only the `ath` ones (*e.g.* the ones from *A. thaliana*). ``` -grep -A 1 '>ath' mature.fa | grep -v '\-\-' > ath_known_RNAs.fasta +grep -A 1 '>ath' mature.fa | grep -v '\-\-' > ath_known_miRNAs.fasta ``` ## Example Run This example is a full run. It takes 3 raw (untrimmed) readfiles, identifies the adapters, trims the reads, indexes the genome, aligns the reads, discovers small RNA loci, and annotates high-confidence *MIRNA* loci. The example uses 6 threads; this can be adjusted up or down depending on your system's configuration; more threads decrease execution time but the response is non-linear (diminishing returns with very high thread numbers). The examples uses the test data described above. ``` -ShortStack --genomefile Arabidopsis_thalianaTAIR10.fa --readfile SRR3222443.fastq SRR3222444.fastq SRR3222445.fastq --autotrim --threads 6 --outdir ExampleShortStackRun --knownRNAs ath_known_RNAs.fasta +ShortStack --genomefile Arabidopsis_thalianaTAIR10.fa --readfile SRR3222443.fastq SRR3222444.fastq SRR3222445.fastq --autotrim --threads 6 --outdir ExampleShortStackRun --known_miRNAs ath_known_miRNAs.fasta ``` On my laptop this completes in about 18 minutes. All results are in the directory specified by `--outdir`, "ExampleShortStackRun". The outputs are described in the section below called "Outputs". @@ -200,7 +200,7 @@ A tab-delimited text file giving key information for all small RNA clusters. Col 18. *24*: Number of 24 nucleotide reads aligned to the locus. 19. *DicerCall*: If >= 80% of all aligned reads are within the boundaries of `--dicermin` and `--dicermax`, than the DicerCall gives the size of most abundant small RNA size. If < 80% of the aligned reads are in the `--dicermin` and `--dicermax` boundaries, DicerCall is set to 'N'. Loci with a DicerCall of 'N' are unlikely to be small RNAs related to the Dicer-Like/Argonaute system of gene regulation. 20. *MIRNA*: Did the locus pass all criteria to be called a *MIRNA* locus? If so, 'Y'. If not, 'N'. -21. *KnownRNAs*: Semicolon delimited list of user-provided known RNAs that aligned to the locus. If none, 'NA'. +21. *Known_miRNAs*: Semicolon delimited list of user-provided known RNAs that aligned to the locus. If none, 'NA'. ## Counts.txt A tab-delimited text file giving the raw alignment counts for each locus in each separate sample. Only produced if there was more than one sRNA-seq file used to create alignments. This file is useful for downstream analyses, especially differential expression analysis. @@ -217,10 +217,10 @@ A tab-delimited text file that gives details about small RNA-seq alignments as a ## Results.gff3 Small RNA loci in the gff3 format. Suitable for use on genome browsers. For loci that are annotated as *MIRNAs* there will be an additional entry for the mature microRNA position. The 'score' column in the gff3 format stores the number of sRNA-seq aligned reads at that locus. -## knownRNAs.gff3 -When the user provides known RNA sequences via `--knownRNAs`, they are aligned to the reference genome. Every (perfect) alignment to the reference is stored and reported in the knownRNAs.gff3 file. The score column shows the number of alignments that start and end at the exact coordinates and strand. +## known_miRNAs.gff3 +When the user provides known RNA sequences via `--known_miRNAs`, they are aligned to the reference genome. Every (perfect) alignment to the reference is stored and reported in the known_miRNAs.gff3 file. The score column shows the number of alignments that start and end at the exact coordinates and strand. -**Important** : knownRNAs are aligned and shown in the `knownRNAs.gff3` file regardless of whether any empirical small RNA-seq data are found. Thus, expect to find entries with a score of 0; these are cases where no instances of the given knownRNA were aligned to that location in the genome. +**Important** : known_miRNAs are aligned and shown in the `known_miRNAs.gff3` file regardless of whether any empirical small RNA-seq data are found. Thus, expect to find entries with a score of 0; these are cases where no instances of the given knownRNA were aligned to that location in the genome. ## strucVis/ The directory `strucVis/` contains visualizations of each locus that was annotated as a *MIRNA* locus. These are made by the script [strucVis](https://github.com/MikeAxtell/strucVis). For each locus there is a postscript file and a plain-text file. Both show the coverage of aligned small RNA-seq data as a function of position, aligned with the predicted RNA secondary structure of the inferred *MIRNA* hairpin precursor. These files are meant for manual inspection of *MIRNA* loci. @@ -250,7 +250,7 @@ The README for [ShortTracks](https://github.com/MikeAxtell/ShortTracks) has deta Loci annotated as *MIRNA* can be visualized from the `srucVis/` files. These show the predicted RNA secondary structures with the small RNA-seq read depth coverage. ## Genome Browsers -The output of ShortStack is designed to work with genome browsers. Specifically, the files `Results.gff3`, `knownRNAs.gff3`, the `.bam` files, and the `.bw` files can be directly visualized on either major genome browser (IGV, JBrowse). +The output of ShortStack is designed to work with genome browsers. Specifically, the files `Results.gff3`, `known_miRNAs.gff3`, the `.bam` files, and the `.bw` files can be directly visualized on either major genome browser (IGV, JBrowse). [JBrowse2](https://jbrowse.org/jb2/) has the ability to create "multi-wiggle" tracks. These tracks show multiple quantitative data tracks at once, bound to a common quantitative axis. The `.bw` bigwig files created by ShortStack & ShortTracks are normalized to reads-per-million, allowing direct comparisons in a multi-wiggle track. This allows visualization of size, coverage, and strandedness of the data. See the README for [ShortTracks](https://github.com/MikeAxtell/ShortTracks) for details. I recommend using the Desktop version of [JBrowse2](https://jbrowse.org/jb2/). @@ -268,7 +268,7 @@ Alignment of trimmed fastq data uses `bowtie`. There are usually two phases to a Genomic intervals where the depth of small RNA coverage, in reads-per-million, is greater than or equal to `--mincov` (1 by default) are identified. Each of these intervals are then extended in both directions by the length given by setting `--pad`. Regions that overlap after extension are merged. Note that if one or more *MIRNA* loci are later found to overlap the intial cluster, the initial cluster is removed from the output (and only the refined, trimmed *MIRNA* region(s) are reported). ## MIRNA annotation -*MIRNA* annotation has two entry points for initial searches: Locations of aligned user-provided sequences from `--knownRNAs` and, if option `--dn_mirna` is True, any 21 or 22 nt read whose abundance exceeds the depth of `--mincov`. From these initial starting points, ShortStack first examines the local region to find miR/miR-star-like patterns of read accumulation (essentially, "two-peaks" of read coverage on the same genomic strand that might correspond to the miR/miR-star pair). If such a pattern is found, the RNA secondary structure in the local area is predicted. The sRNA-seq alignments in conjunction with the predicted RNA secondary structure are analyzed with respect to the criteria in [Axtell and Meyers, 2018](https://doi.org/10.1105/tpc.17.00851). If the criteria are met, the locus is annotated as a *MIRNA*. +*MIRNA* annotation has two entry points for initial searches: Locations of aligned user-provided sequences from `--known_miRNAs` and, if option `--dn_mirna` is True, any 21 or 22 nt read whose abundance exceeds the depth of `--mincov`. From these initial starting points, ShortStack first examines the local region to find miR/miR-star-like patterns of read accumulation (essentially, "two-peaks" of read coverage on the same genomic strand that might correspond to the miR/miR-star pair). If such a pattern is found, the RNA secondary structure in the local area is predicted. The sRNA-seq alignments in conjunction with the predicted RNA secondary structure are analyzed with respect to the criteria in [Axtell and Meyers, 2018](https://doi.org/10.1105/tpc.17.00851). If the criteria are met, the locus is annotated as a *MIRNA*. # ShortStack version 4 Major Changes ShortStack version 4 is a major update. The major changes are: @@ -298,7 +298,7 @@ ShortStack version 4 is a major update. The major changes are: - Eliminate option `--total_primaries` .. instead use a fast hack to rapidly calculate this. - Option `--locifile` now understands .bed and .gff3 formats, as well as the original simple tab-delimited format. - Added options `--autotrim` and `--autotrim_key`. This allows automatic detection of 3' adapters by tallying the most common sequence that occurs after a known, highly abundant small RNA (given by `autotrim_key`). -- Add option `--knownRNAs`. Provide a FASTA file of known mature small RNA sequences to search for and to nucleate searches for qualifying *MIRNA* loci. +- Add option `--known_miRNAs`. Provide a FASTA file of known mature small RNA sequences to search for and to nucleate searches for qualifying *MIRNA* loci. - Add option `--dn_mirna`. The `--dn_mirna` activates a *de novo* search for *MIRNA* loci independent of those that align to the 'known RNAs' provided by the user. By default, `--dn_mirna` is not active. @@ -308,7 +308,7 @@ Please post issues, comments, bug reports, questions, etc. to the project github # FAQ - **I ran an analysis and found no loci annotated as *MIRNA* loci!** - - By default, ShortStack will not do a *de novo* search for loci that qualify as *MIRNA* loci. To search for *MIRNA* loci the user has to explicitly request it, using either or both of the options `--knownRNAs` and `--dn_mirna`. `knownRNAs` provides a list of known mature miRNA sequences. Places where these sequences align to the reference genome are examined to see if the small RNA alignment pattern and predicted RNA secondary structure qualifies as a *MIRNA* locus. The switch `--dn_mirna` turns on a *de novo MIRNA* search. The *de novo MIRNA* search is turned off by default to reduce false annotations. The idea is that most mature miRNAs are known in most species by now. + - By default, ShortStack will not do a *de novo* search for loci that qualify as *MIRNA* loci. To search for *MIRNA* loci the user has to explicitly request it, using either or both of the options `--known_miRNAs` and `--dn_mirna`. `known_miRNAs` provides a list of known mature miRNA sequences. Places where these sequences align to the reference genome are examined to see if the small RNA alignment pattern and predicted RNA secondary structure qualifies as a *MIRNA* locus. The switch `--dn_mirna` turns on a *de novo MIRNA* search. The *de novo MIRNA* search is turned off by default to reduce false annotations. The idea is that most mature miRNAs are known in most species by now. - **What happened to the phasing scores?** - I decided to omit phasing scores as of ShortStack version 4.0. This is because I gradually have lost confidence the accuracy of genome-wide scans to provide acceptable sensitivity *and* specificity for scoring phasing. For a detailed analysis of the challenges of calling phasing of siRNA clusters in genome-wide analyses, see [Polydore et al. (2018)](https://doi.org/10.1002/pld3.101). I am considering bringing phasing scores back, but just for 21-22 nt siRNA loci, in a future release. - **Installation fails with conda** diff --git a/ShortStack b/ShortStack index ebe9a38..99e17b9 100755 --- a/ShortStack +++ b/ShortStack @@ -1,6 +1,6 @@ #!/usr/bin/env python -version = '4.0.1' +version = '4.0.2' # Main script / control is at the BOTTOM of this file @@ -260,8 +260,8 @@ def check_executables(args): rqd.append('strucVis') # bowtie and bowtie-build are needed if --readfile is is not an empty list. - # OR if --knownRNAs is present - if (args.readfile is not None) or (args.knownRNAs is not None): + # OR if --known_miRNAs is present + if (args.readfile is not None) or (args.known_miRNAs is not None): rqd.append('bowtie') rqd.append('bowtie-build') @@ -303,19 +303,19 @@ def valid_genomefile(x): sys.exit(msg) return x -def valid_knownRNAs(x): - """Tests existence and extension of --knownRNAs +def valid_known_miRNAs(x): + """Tests existence and extension of --known_miRNAs - x : string input from option --knownRNAs + x : string input from option --known_miRNAs Extension must be .fa or .fasta. Uses modules os and sys. """ if not os.path.isfile(x): - msg = 'Invalid knownRNAs {0} : is not a file'.format(x) + msg = 'Invalid known_miRNAs {0} : is not a file'.format(x) sys.exit(msg) base,ext = os.path.splitext(x) if not (ext == '.fa' or ext == '.fasta'): - msg = ('Invalid knownRNAs {0}'.format(x) + + msg = ('Invalid known_miRNAs {0}'.format(x) + ' : does not end in .fa or .fasta'.format(x)) sys.exit(msg) return x @@ -487,8 +487,8 @@ def ShortStack_argparse(ss_version): required=True, type=valid_genomefile, help='FASTA file of the reference genome (required)') - parser.add_argument('--knownRNAs', - type=valid_knownRNAs, + parser.add_argument('--known_miRNAs', + type=valid_known_miRNAs, help='FASTA file of known/suspected mature microRNAs') parser_group = parser.add_mutually_exclusive_group(required=True) parser_group.add_argument('--readfile', @@ -586,7 +586,7 @@ def ShortStack_argparse(ss_version): ' search for new MIRNA loci will be performed.' + ' By default, de novo MIRNA finding is not performed ' + ' and MIRNA searches are limited to loci matching' + - ' RNAs from --knownRNAs that align to the genome') + ' RNAs from --known_miRNAs that align to the genome') parser.add_argument('--strand_cutoff', type=valid_strand_cutoff, default = 0.8, @@ -668,8 +668,8 @@ def align(args, fai, trimmed_files): if args.bamfile is not None: # No alignment. But we need to check the user's provided file(s) bam = check_user_bamfile(args, fai) - # Could still need bowtie indices if knownRNAs is provided - if args.knownRNAs is not None: + # Could still need bowtie indices if known_miRNAs is provided + if args.known_miRNAs is not None: bowtie_indices(args) # Still need to compute number of primary reads in the file n_reads = count_reads(bam, args.threads) @@ -1514,9 +1514,10 @@ def get_trimmed(args): ' more fastq files!. Try a different autotrim_key' + ' or trim your reads outside of ShortStack!') sys.exit(msg) - print('') - print('Adapters:') - pprint.pprint(found_adapters) + if len(found_adapters) > 0: + print('') + print('Adapters:') + pprint.pprint(found_adapters) # Adapter trimming trimmed_files = [] @@ -2087,12 +2088,12 @@ def mirna(args, merged_bam, fai, pmir_bedfile, read_count): if args.nohp is True: return None - # if no knownRNAs AND no de novo places to look, return nothing - if (args.knownRNAs is None) and (pmir_bedfile is None): + # if no known_miRNAs AND no de novo places to look, return nothing + if (args.known_miRNAs is None) and (pmir_bedfile is None): return None - # similarly, if no knownRNAs and de novo is turned off, return nothing - if (args.knownRNAs is None) and (args.dn_mirna is False): + # similarly, if no known_miRNAs and de novo is turned off, return nothing + if (args.known_miRNAs is None) and (args.dn_mirna is False): return None # report to user @@ -2100,19 +2101,19 @@ def mirna(args, merged_bam, fai, pmir_bedfile, read_count): print(time.strftime("%a %d %b %Y %H:%M:%S %z %Z", time.localtime())) print('Searching for valid microRNA loci') - if args.knownRNAs is not None: + if args.known_miRNAs is not None: # Align user's sequences to the genome - print('Aligning knownRNAs sequences to genome') + print('Aligning known_miRNAs sequences to genome') # we must sanitize user input .. capitalize, U to T - sane_knownRNAs = sanitize_knownRNAs(args) + sane_known_miRNAs = sanitize_known_miRNAs(args) # align using bowtie # "score" set to zero, reflecting unknown alignments at this point mir_samfile = args.outdir + '/user_mir.sam' - un_file = args.outdir + '/knownRNAs_unaligned.fasta' + un_file = args.outdir + '/known_miRNAs_unaligned.fasta' bt_args = ['bowtie', '-f', '-a', '-S', '-v', '0', '-p', str(args.threads), '--mapq', '0', '--un', un_file, '-x', - args.genomefile, sane_knownRNAs, mir_samfile] + args.genomefile, sane_known_miRNAs, mir_samfile] subprocess.run(args = bt_args) # convert to bam, sort, and convert to bed @@ -2131,8 +2132,31 @@ def mirna(args, merged_bam, fai, pmir_bedfile, read_count): subprocess.run(args=bam2bed_args, stdout=user_mir_bed_fh) user_mir_bed_fh.close() + # testing breakpoint + # sys.exit() + + # If user provided locus/loci, we only keep hits to those intervals. + # There will be a file called 'user.bed' present to screen against. + if (args.locifile is not None) or (args.locus is not None): + ub = args.outdir + '/user.bed' + user_mir_bed2_f = args.outdir + '/user_mir2.bed' + user_mir_bed2_fh = open(user_mir_bed2_f, "w") + bint_args = ['bedtools', 'intersect', '-u', '-a', user_mir_bed_f, '-b', ub] + subprocess.run(args=bint_args, stdout=user_mir_bed2_fh) + user_mir_bed2_fh.close() + + # clean up a little + subprocess.run(f'rm -f {user_mir_bed_f}', shell=True) + + # It's possible there were no overlaps. This will result + # in a file size of 0. If so, set to None + if os.stat(user_mir_bed2_f).st_size == 0: + user_mir_bed_f = None + else: + user_mir_bed_f = user_mir_bed2_f + # cleanup - rm_args = ['rm', '-f', sane_knownRNAs, mir_samfile, + rm_args = ['rm', '-f', sane_known_miRNAs, mir_samfile, mir_unsortedbam, mir_bam] subprocess.run(args = rm_args) else: @@ -2151,7 +2175,7 @@ def mirna(args, merged_bam, fai, pmir_bedfile, read_count): if user_mir_bed_f is not None: print('') - print('Screening of possible microRNAs from user provided knownRNAs') + print('Screening of possible microRNAs from user provided known_miRNAs') if __name__ == '__main__': with open(user_mir_bed_f) as f: bedlines = f.read().splitlines() @@ -2167,7 +2191,7 @@ def mirna(args, merged_bam, fai, pmir_bedfile, read_count): user_q_bedlines, user_locus_bedlines = zip(*user_mloci1) # Write the revised (with scores added) user_q_bedlines - u_q_file = args.outdir + '/knownRNAs.bed' + u_q_file = args.outdir + '/known_miRNAs.bed' u_q_fh = open(u_q_file, "w") for line in user_q_bedlines: u_q_fh.write(line) @@ -2659,12 +2683,12 @@ def count_exact_location(bedfields, merged_bam): return read_count -def sanitize_knownRNAs(args): +def sanitize_known_miRNAs(args): # sane means upper case, with Us converted to Ts # This is required for bowtie mapping - sane_filepath = args.outdir + '/user_knownRNAs.fasta' + sane_filepath = args.outdir + '/user_known_miRNAs.fasta' sane_fh = open(sane_filepath, "w") - for seq_record in SeqIO.parse(args.knownRNAs, "fasta"): + for seq_record in SeqIO.parse(args.known_miRNAs, "fasta"): out = ('>' + str(seq_record.id) + '\n' + str(seq_record.seq.upper().back_transcribe()) + '\n') sane_fh.write(out) @@ -3124,8 +3148,13 @@ def write_files(args, qdata, mir_qdata): bedlines = mh.readlines() m_bedlines = m_bedlines + bedlines + # Special case is when we have found one or more MIRNA loci + # from user-proided locifile or locus. + if (os.path.exists(u)) and (os.path.exists(m)): + nm_bedlines, m_bedlines = merge_locifile_mirs(args) + # First make a temp bed file with all Results loci - # this will be used for genome-sorting, prior to naming + # this will be used for genome-sorting, prior to final naming resUnsortedBedf = args.outdir + '/ResultsUnsorted.bed' resUnsortedBedfh = open(resUnsortedBedf, "w") @@ -3133,14 +3162,22 @@ def write_files(args, qdata, mir_qdata): if nm_bedlines: for line in nm_bedlines: f = line.rstrip().split('\t') - outline = f[0] + '\t' + f[1] + '\t' + f[2] + '\tnm\n' + outline = f[0] + '\t' + f[1] + '\t' + f[2] # + '\t' + f[3] + '\tnm\n' + if len(f) > 3: + outline = outline + '\t' + f[3] + '\tnm\n' + else : + outline = outline + '\t' + '.' + '\tnm\n' resUnsortedBedfh.write(outline) mat_lookups = {} star_lookups = {} if m_bedlines: for line in m_bedlines: f = line.rstrip().split('\t') - outline = f[0] + '\t' + f[1] + '\t' + f[2] + '\tm\n' + outline = f[0] + '\t' + f[1] + '\t' + f[2] # + '\t' + f[3] + '\tm\n' + if len(f) > 3: + outline = outline + '\t' + f[3] + '\tm\n' + else : + outline = outline + '\t' + '.' + '\tm\n' resUnsortedBedfh.write(outline) # We must also store information about the locations of the inferred # mature miRNA and miRNA* from this file @@ -3165,7 +3202,8 @@ def write_files(args, qdata, mir_qdata): f = line.rstrip().split('\t') onestart = int(f[1]) + 1 coords = f[0] + ':' + str(onestart) + '-' + f[2] - dtype = f[3] ## 'nm' or 'm' + name_from_bed = f[3] ## needed in certain miRNA cases from locifile + dtype = f[4] ## 'nm' or 'm' if dtype == 'nm': # name is allowed to be inherited, if it is not 'NA' @@ -3225,10 +3263,12 @@ def write_files(args, qdata, mir_qdata): cline = cline + '\n' countsfh.write(cline) elif dtype == 'm': - - # MIRNA loci are always 'de novo' - # So names will always be generic. - name = 'Cluster_' + str(clus_num) + # If locifile or locus was used, and we found MIRNAs, + # we need to honor the original names ... if not, generic + if(args.locifile is not None) or (args.locus is not None): + name = name_from_bed + else: + name = 'Cluster_' + str(clus_num) # Results.txt rline = f'{coords}\t{name}\t' @@ -3320,9 +3360,9 @@ def write_files(args, qdata, mir_qdata): print(dcall, tally) # Report overlaps between alignment positions of - # knownRNAs and loci in Results.gff/.txt + # known_miRNAs and loci in Results.gff/.txt # Do this by adding a column to Results.txt - k = args.outdir + '/knownRNAs.bed' + k = args.outdir + '/known_miRNAs.bed' if os.path.exists(k): ifile = args.outdir + '/intersect_temp.tsv' bicmd = f'bedtools intersect -wb -a {gff3f} -b {k} > {ifile}' @@ -3346,7 +3386,7 @@ def write_files(args, qdata, mir_qdata): line_num += 1 stub = oldres_line.rstrip() if line_num == 1: - new_line = stub + '\tKnownRNAs\n' + new_line = stub + '\tknown_miRNAs\n' else: res_fields = stub.split('\t') res_name = res_fields[1] @@ -3362,8 +3402,8 @@ def write_files(args, qdata, mir_qdata): subprocess.run(mv_cmd, shell=True) subprocess.run(rm_cmd, shell=True) - # Reformat the knownRNAs.bed into gff3 for better browser display - k_gff_f = args.outdir + '/knownRNAs.gff3' + # Reformat the known_miRNAs.bed into gff3 for better browser display + k_gff_f = args.outdir + '/known_miRNAs.gff3' k_gff_fh = open(k_gff_f, "w") with open(k) as kbed: for kbed_line in kbed: @@ -3466,6 +3506,54 @@ def get_reads_by_chrom(bamfile): rbc[row[0]] = int(row[2]) return(rbc) +def merge_locifile_mirs(args): + # bedtools intersect -wao -a user.bed -b passed_mir_final.bed + # Find overlaps between user's original intervals and passed MIRNA loci + wao_file = args.outdir + '/wao.bed' + wao_fh = open(wao_file, "w") + user_bed_path = args.outdir + '/user.bed' + passed_mir_path = args.outdir + '/passed_mir_final.bed' + wao_args = ['bedtools', 'intersect', '-wao', '-a', user_bed_path, '-b', passed_mir_path] + subprocess.run(wao_args, stdout=wao_fh) + wao_fh.close() + + # Scan the wao file. Cases with no hit need no action. + # Cases with overlap: Remove the user's originial from the 'nm_bedlines', + # and replace the name in the m_bedlines. + # By definition every line in m_bedline will have one (or more) overlaps with user.bed / nm_bedlines. + + # example nm_bedlines entry: chr11 15721625 15721756 ccm-MIR5226 + # example m_bedline entry: chr1 1692111 1692238 ccm-IIM-70-mature5p 1327 - 1692197 1692218 1692131 1692152 45 + + new_m_bedlines = [] + new_nm_bedlines = [] + + with open(wao_file) as wao_fh: + for wao_line in wao_fh: + wao_fields = wao_line.rstrip().split('\t') + if wao_fields[4] != '.': + new_m_fields = wao_fields[4:] + # replace the name with the user's name + new_m_fields[3] = wao_fields[3] + new_m_line = '\t'.join(new_m_fields) + new_m_bedlines.append(new_m_line) + else: + new_nm_fields = wao_fields[:4] + new_nm_line = '\t'.join(new_nm_fields) + new_nm_bedlines.append(new_nm_line) + # testing + #for x in new_m_bedlines: + # print(x) + #for y in new_nm_bedlines: + # print(y) + #sys.exit() + + # cleanup + rm_args = ['rm', '-f', wao_file] + subprocess.run(rm_args) + # return + return(new_nm_bedlines, new_m_bedlines) + ## Main control / script if __name__ == '__main__': # Process command line, setting and checking options. @@ -3509,6 +3597,9 @@ if __name__ == '__main__': if os.path.exists(todel): subprocess.run(f'rm {todel}', shell=True) + # breakpoint for testing + # sys.exit() + # Write most final files write_files(args, qdata, mir_qdata)