From c7c421f86b9544e9d6b35e5e6c1aef5b56598678 Mon Sep 17 00:00:00 2001 From: ygidtu Date: Wed, 15 Jul 2020 11:42:53 +0800 Subject: [PATCH] 0.0.2-beta more faster with more memory usage, fix gzip output issues --- README.md | 13 +++++++++++++ const.go | 55 ++++++++++++++++++++++++++++++++++++++----------------- load.go | 30 ++++++++++++++++++++---------- main.go | 33 ++++++++++++++++++--------------- 4 files changed, 89 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 2eece0f..40b2d57 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,18 @@ # REDItools2 +**Created at 2020.07.14** + + +A go version of REDItools2 + +1. Too much faster. + - The original Python version takes 3mins + - The `multiprocessing version` in previous commits, takes 1m22s with 10 processes + - The go version takes 1min with 1 process, 7~9s with 10 processes +2. More information, even more editing sites. + +--- + **REDItools2** is the optimized, parallel multi-node version of [ REDItools](https://github.com/BioinfoUNIBA/REDItools). REDItools takes in input a RNA-Seq (or DNA-Seq BAM) file and outputs a table of RNA-Seq editing events. Here is an example of REDItools's output: diff --git a/const.go b/const.go index 1ad178a..86fe852 100644 --- a/const.go +++ b/const.go @@ -20,14 +20,14 @@ var conf config var region *Region const ( - VERSION = "0.0.2" + VERSION = "0.0.2-beta" DefaultBaseQuality = 30 ) type config struct { Config string `goptions:"-c, --config, description='Config file'"` Version bool `goptions:"-v, --version, description='Show version'"` - Debug bool `goptions:"--debug, description='Show debug info'"` + Debug bool `goptions:"--debug, description='Show debug info'" long:"debug" description:"Config file"` File string `goptions:"-f, --file, description='The bam file to be analyzed'"` Output string `goptions:"-o, --output-file, description='The output statistics file'"` Process int `goptions:"-p, --process, description='How many process to use'"` @@ -38,21 +38,21 @@ type config struct { Region string `goptions:"-g, --region, description='The region of the bam file to be analyzed'"` OmopolymericFile string `goptions:"-m, --omopolymeric-file, description='The file containing the omopolymeric positions'"` CreateOmopolymericFile bool `goptions:"-c, --create-omopolymeric-file, description='Whether to create the omopolymeric span'"` - OmopolymericSpan int `goptions:"-os, --omopolymeric-span, description='The omopolymeric span'"` - SplicingFile string `goptions:"-sf, --splicing-file, description='The file containing the splicing sites positions'"` - SplicingSpan int `goptions:"-ss, --splicing-span, description='The splicing span'"` - MinReadLength int `goptions:"-mrl, --min-read-length, description='The minimum read length. Reads whose length is below this value will be discarded.'"` + OmopolymericSpan int `goptions:"--omopolymeric-span, description='The omopolymeric span'"` + SplicingFile string `goptions:"--splicing-file, description='The file containing the splicing sites positions'"` + SplicingSpan int `goptions:"--splicing-span, description='The splicing span'"` + MinReadLength int `goptions:"--min-read-length, description='The minimum read length. Reads whose length is below this value will be discarded.'"` MinReadQuality int `goptions:"-q, --min-read-quality, description='The minimum read quality. Reads whose mapping quality is below this value will be discarded.'"` - MinBaseQuality int `goptions:"-bq, --min-base-quality, description='The minimum base quality. Bases whose quality is below this value will not be included in the analysis.'"` - MinBasePosition int `goptions:"-mbp, --min-base-position, description='The minimum base position. Bases which reside in a previous position (in the read) will not be included in the analysis.'"` - MaxBasePosition int `goptions:"-Mbp, --max-base-position, description='The maximum base position. Bases which reside in a further position (in the read) will not be included in the analysis.'"` + MinBaseQuality int `goptions:"--min-base-quality, description='The minimum base quality. Bases whose quality is below this value will not be included in the analysis.'"` + MinBasePosition int `goptions:"--min-base-position, description='The minimum base position. Bases which reside in a previous position (in the read) will not be included in the analysis.'"` + MaxBasePosition int `goptions:"--max-base-position, description='The maximum base position. Bases which reside in a further position (in the read) will not be included in the analysis.'"` MinColumnLength int `goptions:"-l, --min-column-length, description='The minimum length of editing column (per position). Positions whose columns have length below this value will not be included in the analysis.'"` - MinEditsPerNucleotide int `goptions:"-men, --min-edits-per-nucleotide, description='The minimum number of editing for events each nucleotide (per position). Positions whose columns have bases with less than min-edits-per-base edits will not be included in the analysis.'"` - MinEdits int `goptions:"-me, --min-edits, description='The minimum number of editing events (per position). Positions whose columns have bases with less than \'min-edits-per-base edits\' will not be included in the analysis.'"` - MaxEditsPerNucleotide int `goptions:"-Men, --max-editing-nucleotides, description='The maximum number of editing nucleotides, from 0 to 4 (per position). Positions whose columns have more than \'max-editing-nucleotides\' will not be included in the analysis.'"` + MinEditsPerNucleotide int `goptions:"--min-edits-per-nucleotide, description='The minimum number of editing for events each nucleotide (per position). Positions whose columns have bases with less than min-edits-per-base edits will not be included in the analysis.'"` + MinEdits int `goptions:"--min-edits, description='The minimum number of editing events (per position). Positions whose columns have bases with less than \'min-edits-per-base edits\' will not be included in the analysis.'"` + MaxEditsPerNucleotide int `goptions:"--max-editing-nucleotides, description='The maximum number of editing nucleotides, from 0 to 4 (per position). Positions whose columns have more than \'max-editing-nucleotides\' will not be included in the analysis.'"` StrandConfidence int `goptions:"-T, --strand-confidence, description='Strand inference type 1:maxValue 2:useConfidence [1]; maxValue: the most prominent strand count will be used; useConfidence: strand is assigned if over a prefixed frequency confidence (-TV option)'"` StrandCorrection bool `goptions:"-C, --strand-corection, description='Strand correction. Once the strand has been inferred, only bases according to this strand will be selected.'"` - StrandConfidenceValue float64 `goptions:"-Tv, --strand-confidence-value, description='Strand confidence [0.70]'"` + StrandConfidenceValue float64 `goptions:"--strand-confidence-value, description='Strand confidence [0.70]'"` RemoveHeader bool `goptions:"-H, --remove-header, description='Do not include header in output file'"` Dna bool `goptions:"-N, --dna, description='Run REDItools 2.0 on DNA-Seq data'"` BedFile string `goptions:"-B, --bed_file, description='Path of BED file containing target regions'"` @@ -124,15 +124,36 @@ func decodeRegion(region string) *Region { return res } - type ChanChunk struct { - Ref string - Start int - End int + Ref string + Start int + End int Chunks bgzf.Chunk } +func (c *ChanChunk) ToRegion() *Region { + return &Region{ + Chrom: c.Ref, + Start: c.Start, + End: c.End, + } +} + +func (c *ChanChunk) SwitchRegion() *Region { + ref := c.Ref + if strings.HasPrefix(ref, "chr") { + ref = strings.ReplaceAll(ref, "chr", "") + } else { + ref = "chr" + ref + } + + return &Region{ + Chrom: ref, + Start: c.Start, + End: c.End, + } +} type Omopolymeric struct { Chromosome string diff --git a/load.go b/load.go index d310c72..3a85705 100644 --- a/load.go +++ b/load.go @@ -387,9 +387,9 @@ func splitRegion(end, chunks int) []int { } // adjustRegion is used to move the end site a little backwards -func adjustRegion(regions []int, idx *bam.Index, ref *sam.Reference, bamReader *bam.Reader) ([]bgzf.Chunk, error) { +func adjustRegion(regions []int, idx *bam.Index, ref *sam.Reference, bamReader *bam.Reader) ([]*ChanChunk, error) { - res := make([]bgzf.Chunk, 0, 0) + res := make([]*ChanChunk, 0, 0) for i := 0; i < len(regions); i++ { if i > 0 && i < len(regions)-1 { chunks, err := idx.Chunks(ref, regions[i], regions[i+1]) @@ -403,7 +403,6 @@ func adjustRegion(regions []int, idx *bam.Index, ref *sam.Reference, bamReader * continue } - iter, err := bam.NewIterator(bamReader, chunks) if err != nil { return res, err @@ -427,12 +426,28 @@ func adjustRegion(regions []int, idx *bam.Index, ref *sam.Reference, bamReader * } for i := 1; i < len(regions); i++ { + // avoid duplicated regions + if regions[i-1] > regions[i] { + regions[i] = regions[i-1] + continue + } + chunks, err := idx.Chunks(ref, regions[i-1], regions[i]) if err != nil { continue // return res, errors.Wrapf(err, "failed after adjust %s: %d - %d", ref.Name(), region[i-1], region[i]) } - res = append(res, chunks...) + //res = append(res, chunks...) + + for _, c := range chunks { + res = append(res, &ChanChunk{ + Ref: ref.Name(), + Start: regions[i-1], + End: regions[i], + Chunks: c, + }) + } + } return res, nil @@ -498,12 +513,7 @@ func fetchBamRefs() (map[string][]*ChanChunk, error) { continue } - tempRes := make([]*ChanChunk, 0, 0) - for _, t := range temp { - tempRes = append(tempRes, &ChanChunk{Ref: ref.Name(), Chunks: t}) - } - - res[ref.Name()] = tempRes + res[ref.Name()] = temp } sugar.Debug(res) diff --git a/main.go b/main.go index ca9a483..8d6c3b9 100644 --- a/main.go +++ b/main.go @@ -101,8 +101,17 @@ func worker( break } + //chrRef, err := fetchFasta(ref.ToRegion()) + //if err != nil { + // sugar.Warnf("try to modify %s", ref) + // chrRef, err = fetchFasta(ref.SwitchRegion()) + //} + //if err != nil { + // sugar.Fatal(err) + //} + chrRef, ok := chrRefs[ref.Ref] - if ! ok { + if !ok { sugar.Errorf("failed to get %s from reference", ref.Ref) continue } @@ -139,8 +148,6 @@ func worker( sugar.Fatal(record.SeqString()) } - // record.QueryPosition = append(record.QueryPosition, at) - genomic := start + record.Start if _, ok := edits[genomic]; !ok { @@ -170,13 +177,14 @@ func worker( w <- "done" } - func main() { conf = defaultConfig() goptions.ParseAndFail(&conf) setLogger(conf.Debug, conf.Log) + sugar.Debugf("options: %v", conf) + if conf.Version { sugar.Infof("current version: %v", VERSION) os.Exit(0) @@ -253,7 +261,7 @@ func main() { lock.Lock() chrRefs[ref] = temp lock.Unlock() - } (ref, &wg, &lock) + }(ref, &wg, &lock) } wg.Wait() @@ -266,19 +274,14 @@ func main() { wg.Add(1) } - if references, err := fetchBamRefs(); err == nil { - for ref, chunks := range references { - sugar.Infof("read reads from %s", ref) - for _, c := range chunks { - sugar.Debug(c) - refs <- c - } + for ref, chunks := range references { + sugar.Infof("read reads from %s", ref) + for _, c := range chunks { + sugar.Debug(c) + refs <- c } - } else { - sugar.Error(err) } - close(refs) wg.Wait()