0.0.2-beta more faster with more memory usage, fix gzip output issues

ygidtu · Jul 15, 2020 · c7c421f · c7c421f
1 parent f694eb6
commit c7c421f
Show file tree

Hide file tree

Showing 4 changed files with 89 additions and 42 deletions.
diff --git a/README.md b/README.md
@@ -1,5 +1,18 @@
 # REDItools2
 
+**Created at 2020.07.14**
+
+
+A go version of REDItools2
+
+1. Too much faster. 
+    - The original Python version takes 3mins
+    - The `multiprocessing version` in previous commits, takes 1m22s with 10 processes
+    - The go version takes 1min with 1 process, 7~9s with 10 processes
+2. More information, even more editing sites.
+
+---
+
 **REDItools2** is the optimized, parallel multi-node version of [<i class="icon-link"></i> REDItools](https://github.com/BioinfoUNIBA/REDItools).
 
 REDItools takes in input a RNA-Seq (or DNA-Seq BAM) file and outputs a table of RNA-Seq editing events.  Here is an example of REDItools's output:

diff --git a/const.go b/const.go
@@ -20,14 +20,14 @@ var conf config
 var region *Region
 
 const (
-	VERSION            = "0.0.2"
+	VERSION            = "0.0.2-beta"
 	DefaultBaseQuality = 30
 )
 
 type config struct {
 	Config                 string  `goptions:"-c, --config, description='Config file'"`
 	Version                bool    `goptions:"-v, --version, description='Show version'"`
-	Debug                  bool    `goptions:"--debug, description='Show debug info'"`
+	Debug                  bool    `goptions:"--debug, description='Show debug info'" long:"debug" description:"Config file"`
 	File                   string  `goptions:"-f, --file, description='The bam file to be analyzed'"`
 	Output                 string  `goptions:"-o, --output-file, description='The output statistics file'"`
 	Process                int     `goptions:"-p, --process, description='How many process to use'"`
@@ -38,21 +38,21 @@ type config struct {
 	Region                 string  `goptions:"-g, --region, description='The region of the bam file to be analyzed'"`
 	OmopolymericFile       string  `goptions:"-m, --omopolymeric-file, description='The file containing the omopolymeric positions'"`
 	CreateOmopolymericFile bool    `goptions:"-c, --create-omopolymeric-file, description='Whether to create the omopolymeric span'"`
-	OmopolymericSpan       int     `goptions:"-os, --omopolymeric-span, description='The omopolymeric span'"`
-	SplicingFile           string  `goptions:"-sf, --splicing-file, description='The file containing the splicing sites positions'"`
-	SplicingSpan           int     `goptions:"-ss, --splicing-span, description='The splicing span'"`
-	MinReadLength          int     `goptions:"-mrl, --min-read-length, description='The minimum read length. Reads whose length is below this value will be discarded.'"`
+	OmopolymericSpan       int     `goptions:"--omopolymeric-span, description='The omopolymeric span'"`
+	SplicingFile           string  `goptions:"--splicing-file, description='The file containing the splicing sites positions'"`
+	SplicingSpan           int     `goptions:"--splicing-span, description='The splicing span'"`
+	MinReadLength          int     `goptions:"--min-read-length, description='The minimum read length. Reads whose length is below this value will be discarded.'"`
 	MinReadQuality         int     `goptions:"-q, --min-read-quality, description='The minimum read quality. Reads whose mapping quality is below this value will be discarded.'"`
-	MinBaseQuality         int     `goptions:"-bq, --min-base-quality, description='The minimum base quality. Bases whose quality is below this value will not be included in the analysis.'"`
-	MinBasePosition        int     `goptions:"-mbp, --min-base-position, description='The minimum base position. Bases which reside in a previous position (in the read) will not be included in the analysis.'"`
-	MaxBasePosition        int     `goptions:"-Mbp, --max-base-position, description='The maximum base position. Bases which reside in a further position (in the read) will not be included in the analysis.'"`
+	MinBaseQuality         int     `goptions:"--min-base-quality, description='The minimum base quality. Bases whose quality is below this value will not be included in the analysis.'"`
+	MinBasePosition        int     `goptions:"--min-base-position, description='The minimum base position. Bases which reside in a previous position (in the read) will not be included in the analysis.'"`
+	MaxBasePosition        int     `goptions:"--max-base-position, description='The maximum base position. Bases which reside in a further position (in the read) will not be included in the analysis.'"`
 	MinColumnLength        int     `goptions:"-l, --min-column-length, description='The minimum length of editing column (per position). Positions whose columns have length below this value will not be included in the analysis.'"`
-	MinEditsPerNucleotide  int     `goptions:"-men, --min-edits-per-nucleotide, description='The minimum number of editing for events each nucleotide (per position). Positions whose columns have bases with less than min-edits-per-base edits will not be included in the analysis.'"`
-	MinEdits               int     `goptions:"-me, --min-edits, description='The minimum number of editing events (per position). Positions whose columns have bases with less than \'min-edits-per-base edits\' will not be included in the analysis.'"`
-	MaxEditsPerNucleotide  int     `goptions:"-Men, --max-editing-nucleotides, description='The maximum number of editing nucleotides, from 0 to 4 (per position). Positions whose columns have more than \'max-editing-nucleotides\' will not be included in the analysis.'"`
+	MinEditsPerNucleotide  int     `goptions:"--min-edits-per-nucleotide, description='The minimum number of editing for events each nucleotide (per position). Positions whose columns have bases with less than min-edits-per-base edits will not be included in the analysis.'"`
+	MinEdits               int     `goptions:"--min-edits, description='The minimum number of editing events (per position). Positions whose columns have bases with less than \'min-edits-per-base edits\' will not be included in the analysis.'"`
+	MaxEditsPerNucleotide  int     `goptions:"--max-editing-nucleotides, description='The maximum number of editing nucleotides, from 0 to 4 (per position). Positions whose columns have more than \'max-editing-nucleotides\' will not be included in the analysis.'"`
 	StrandConfidence       int     `goptions:"-T, --strand-confidence, description='Strand inference type 1:maxValue 2:useConfidence [1]; maxValue: the most prominent strand count will be used; useConfidence: strand is assigned if over a prefixed frequency confidence (-TV option)'"`
 	StrandCorrection       bool    `goptions:"-C, --strand-corection, description='Strand correction. Once the strand has been inferred, only bases according to this strand will be selected.'"`
-	StrandConfidenceValue  float64 `goptions:"-Tv, --strand-confidence-value, description='Strand confidence [0.70]'"`
+	StrandConfidenceValue  float64 `goptions:"--strand-confidence-value, description='Strand confidence [0.70]'"`
 	RemoveHeader           bool    `goptions:"-H, --remove-header, description='Do not include header in output file'"`
 	Dna                    bool    `goptions:"-N, --dna, description='Run REDItools 2.0 on DNA-Seq data'"`
 	BedFile                string  `goptions:"-B, --bed_file, description='Path of BED file containing target regions'"`
@@ -124,15 +124,36 @@ func decodeRegion(region string) *Region {
 	return res
 }
 
-
 type ChanChunk struct {
-	Ref string
-	Start int
-	End int
+	Ref    string
+	Start  int
+	End    int
 	Chunks bgzf.Chunk
 }
 
+func (c *ChanChunk) ToRegion() *Region {
+	return &Region{
+		Chrom: c.Ref,
+		Start: c.Start,
+		End:   c.End,
+	}
+}
+
+func (c *ChanChunk) SwitchRegion() *Region {
+	ref := c.Ref
 
+	if strings.HasPrefix(ref, "chr") {
+		ref = strings.ReplaceAll(ref, "chr", "")
+	} else {
+		ref = "chr" + ref
+	}
+
+	return &Region{
+		Chrom: ref,
+		Start: c.Start,
+		End:   c.End,
+	}
+}
 
 type Omopolymeric struct {
 	Chromosome string

diff --git a/load.go b/load.go
@@ -387,9 +387,9 @@ func splitRegion(end, chunks int) []int {
 }
 
 // adjustRegion is used to move the end site a little backwards
-func adjustRegion(regions []int, idx *bam.Index, ref *sam.Reference, bamReader *bam.Reader) ([]bgzf.Chunk, error) {
+func adjustRegion(regions []int, idx *bam.Index, ref *sam.Reference, bamReader *bam.Reader) ([]*ChanChunk, error) {
 
-	res := make([]bgzf.Chunk, 0, 0)
+	res := make([]*ChanChunk, 0, 0)
 	for i := 0; i < len(regions); i++ {
 		if i > 0 && i < len(regions)-1 {
 			chunks, err := idx.Chunks(ref, regions[i], regions[i+1])
@@ -403,7 +403,6 @@ func adjustRegion(regions []int, idx *bam.Index, ref *sam.Reference, bamReader *
 				continue
 			}
 
-
 			iter, err := bam.NewIterator(bamReader, chunks)
 			if err != nil {
 				return res, err
@@ -427,12 +426,28 @@ func adjustRegion(regions []int, idx *bam.Index, ref *sam.Reference, bamReader *
 	}
 
 	for i := 1; i < len(regions); i++ {
+		// avoid duplicated regions
+		if regions[i-1] > regions[i] {
+			regions[i] = regions[i-1]
+			continue
+		}
+
 		chunks, err := idx.Chunks(ref, regions[i-1], regions[i])
 		if err != nil {
 			continue
 			// return res, errors.Wrapf(err, "failed after adjust %s: %d - %d", ref.Name(), region[i-1], region[i])
 		}
-		res = append(res, chunks...)
+		//res = append(res, chunks...)
+
+		for _, c := range chunks {
+			res = append(res, &ChanChunk{
+				Ref:    ref.Name(),
+				Start:  regions[i-1],
+				End:    regions[i],
+				Chunks: c,
+			})
+		}
+
 	}
 
 	return res, nil
@@ -498,12 +513,7 @@ func fetchBamRefs() (map[string][]*ChanChunk, error) {
 			continue
 		}
 
-		tempRes := make([]*ChanChunk, 0, 0)
-		for _, t := range temp {
-			tempRes = append(tempRes, &ChanChunk{Ref: ref.Name(), Chunks: t})
-		}
-
-		res[ref.Name()] = tempRes
+		res[ref.Name()] = temp
 	}
 
 	sugar.Debug(res)

diff --git a/main.go b/main.go
@@ -101,8 +101,17 @@ func worker(
 			break
 		}
 
+		//chrRef, err := fetchFasta(ref.ToRegion())
+		//if err != nil {
+		//	sugar.Warnf("try to modify %s", ref)
+		//	chrRef, err = fetchFasta(ref.SwitchRegion())
+		//}
+		//if err != nil {
+		//	sugar.Fatal(err)
+		//}
+
 		chrRef, ok := chrRefs[ref.Ref]
-		if ! ok {
+		if !ok {
 			sugar.Errorf("failed to get %s from reference", ref.Ref)
 			continue
 		}
@@ -139,8 +148,6 @@ func worker(
 							sugar.Fatal(record.SeqString())
 						}
 
-						// record.QueryPosition = append(record.QueryPosition, at)
-
 						genomic := start + record.Start
 
 						if _, ok := edits[genomic]; !ok {
@@ -170,13 +177,14 @@ func worker(
 	w <- "done"
 }
 
-
 func main() {
 	conf = defaultConfig()
 	goptions.ParseAndFail(&conf)
 
 	setLogger(conf.Debug, conf.Log)
 
+	sugar.Debugf("options: %v", conf)
+
 	if conf.Version {
 		sugar.Infof("current version: %v", VERSION)
 		os.Exit(0)
@@ -253,7 +261,7 @@ func main() {
 			lock.Lock()
 			chrRefs[ref] = temp
 			lock.Unlock()
-		} (ref, &wg, &lock)
+		}(ref, &wg, &lock)
 	}
 
 	wg.Wait()
@@ -266,19 +274,14 @@ func main() {
 		wg.Add(1)
 	}
 
-	if references, err := fetchBamRefs(); err == nil {
-		for ref, chunks := range references {
-			sugar.Infof("read reads from %s", ref)
-			for _, c := range chunks {
-				sugar.Debug(c)
-				refs <- c
-			}
+	for ref, chunks := range references {
+		sugar.Infof("read reads from %s", ref)
+		for _, c := range chunks {
+			sugar.Debug(c)
+			refs <- c
 		}
-	} else {
-		sugar.Error(err)
 	}
 
-
 	close(refs)
 	wg.Wait()