Skip to content

Commit

Permalink
0.0.2-beta more faster with more memory usage, fix gzip output issues
Browse files Browse the repository at this point in the history
  • Loading branch information
ygidtu committed Jul 15, 2020
1 parent f694eb6 commit c7c421f
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 42 deletions.
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,18 @@
# REDItools2

**Created at 2020.07.14**


A go version of REDItools2

1. Too much faster.
- The original Python version takes 3mins
- The `multiprocessing version` in previous commits, takes 1m22s with 10 processes
- The go version takes 1min with 1 process, 7~9s with 10 processes
2. More information, even more editing sites.

---

**REDItools2** is the optimized, parallel multi-node version of [<i class="icon-link"></i> REDItools](https://github.com/BioinfoUNIBA/REDItools).

REDItools takes in input a RNA-Seq (or DNA-Seq BAM) file and outputs a table of RNA-Seq editing events. Here is an example of REDItools's output:
Expand Down
55 changes: 38 additions & 17 deletions const.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,14 @@ var conf config
var region *Region

const (
VERSION = "0.0.2"
VERSION = "0.0.2-beta"
DefaultBaseQuality = 30
)

type config struct {
Config string `goptions:"-c, --config, description='Config file'"`
Version bool `goptions:"-v, --version, description='Show version'"`
Debug bool `goptions:"--debug, description='Show debug info'"`
Debug bool `goptions:"--debug, description='Show debug info'" long:"debug" description:"Config file"`
File string `goptions:"-f, --file, description='The bam file to be analyzed'"`
Output string `goptions:"-o, --output-file, description='The output statistics file'"`
Process int `goptions:"-p, --process, description='How many process to use'"`
Expand All @@ -38,21 +38,21 @@ type config struct {
Region string `goptions:"-g, --region, description='The region of the bam file to be analyzed'"`
OmopolymericFile string `goptions:"-m, --omopolymeric-file, description='The file containing the omopolymeric positions'"`
CreateOmopolymericFile bool `goptions:"-c, --create-omopolymeric-file, description='Whether to create the omopolymeric span'"`
OmopolymericSpan int `goptions:"-os, --omopolymeric-span, description='The omopolymeric span'"`
SplicingFile string `goptions:"-sf, --splicing-file, description='The file containing the splicing sites positions'"`
SplicingSpan int `goptions:"-ss, --splicing-span, description='The splicing span'"`
MinReadLength int `goptions:"-mrl, --min-read-length, description='The minimum read length. Reads whose length is below this value will be discarded.'"`
OmopolymericSpan int `goptions:"--omopolymeric-span, description='The omopolymeric span'"`
SplicingFile string `goptions:"--splicing-file, description='The file containing the splicing sites positions'"`
SplicingSpan int `goptions:"--splicing-span, description='The splicing span'"`
MinReadLength int `goptions:"--min-read-length, description='The minimum read length. Reads whose length is below this value will be discarded.'"`
MinReadQuality int `goptions:"-q, --min-read-quality, description='The minimum read quality. Reads whose mapping quality is below this value will be discarded.'"`
MinBaseQuality int `goptions:"-bq, --min-base-quality, description='The minimum base quality. Bases whose quality is below this value will not be included in the analysis.'"`
MinBasePosition int `goptions:"-mbp, --min-base-position, description='The minimum base position. Bases which reside in a previous position (in the read) will not be included in the analysis.'"`
MaxBasePosition int `goptions:"-Mbp, --max-base-position, description='The maximum base position. Bases which reside in a further position (in the read) will not be included in the analysis.'"`
MinBaseQuality int `goptions:"--min-base-quality, description='The minimum base quality. Bases whose quality is below this value will not be included in the analysis.'"`
MinBasePosition int `goptions:"--min-base-position, description='The minimum base position. Bases which reside in a previous position (in the read) will not be included in the analysis.'"`
MaxBasePosition int `goptions:"--max-base-position, description='The maximum base position. Bases which reside in a further position (in the read) will not be included in the analysis.'"`
MinColumnLength int `goptions:"-l, --min-column-length, description='The minimum length of editing column (per position). Positions whose columns have length below this value will not be included in the analysis.'"`
MinEditsPerNucleotide int `goptions:"-men, --min-edits-per-nucleotide, description='The minimum number of editing for events each nucleotide (per position). Positions whose columns have bases with less than min-edits-per-base edits will not be included in the analysis.'"`
MinEdits int `goptions:"-me, --min-edits, description='The minimum number of editing events (per position). Positions whose columns have bases with less than \'min-edits-per-base edits\' will not be included in the analysis.'"`
MaxEditsPerNucleotide int `goptions:"-Men, --max-editing-nucleotides, description='The maximum number of editing nucleotides, from 0 to 4 (per position). Positions whose columns have more than \'max-editing-nucleotides\' will not be included in the analysis.'"`
MinEditsPerNucleotide int `goptions:"--min-edits-per-nucleotide, description='The minimum number of editing for events each nucleotide (per position). Positions whose columns have bases with less than min-edits-per-base edits will not be included in the analysis.'"`
MinEdits int `goptions:"--min-edits, description='The minimum number of editing events (per position). Positions whose columns have bases with less than \'min-edits-per-base edits\' will not be included in the analysis.'"`
MaxEditsPerNucleotide int `goptions:"--max-editing-nucleotides, description='The maximum number of editing nucleotides, from 0 to 4 (per position). Positions whose columns have more than \'max-editing-nucleotides\' will not be included in the analysis.'"`
StrandConfidence int `goptions:"-T, --strand-confidence, description='Strand inference type 1:maxValue 2:useConfidence [1]; maxValue: the most prominent strand count will be used; useConfidence: strand is assigned if over a prefixed frequency confidence (-TV option)'"`
StrandCorrection bool `goptions:"-C, --strand-corection, description='Strand correction. Once the strand has been inferred, only bases according to this strand will be selected.'"`
StrandConfidenceValue float64 `goptions:"-Tv, --strand-confidence-value, description='Strand confidence [0.70]'"`
StrandConfidenceValue float64 `goptions:"--strand-confidence-value, description='Strand confidence [0.70]'"`
RemoveHeader bool `goptions:"-H, --remove-header, description='Do not include header in output file'"`
Dna bool `goptions:"-N, --dna, description='Run REDItools 2.0 on DNA-Seq data'"`
BedFile string `goptions:"-B, --bed_file, description='Path of BED file containing target regions'"`
Expand Down Expand Up @@ -124,15 +124,36 @@ func decodeRegion(region string) *Region {
return res
}


type ChanChunk struct {
Ref string
Start int
End int
Ref string
Start int
End int
Chunks bgzf.Chunk
}

func (c *ChanChunk) ToRegion() *Region {
return &Region{
Chrom: c.Ref,
Start: c.Start,
End: c.End,
}
}

func (c *ChanChunk) SwitchRegion() *Region {
ref := c.Ref

if strings.HasPrefix(ref, "chr") {
ref = strings.ReplaceAll(ref, "chr", "")
} else {
ref = "chr" + ref
}

return &Region{
Chrom: ref,
Start: c.Start,
End: c.End,
}
}

type Omopolymeric struct {
Chromosome string
Expand Down
30 changes: 20 additions & 10 deletions load.go
Original file line number Diff line number Diff line change
Expand Up @@ -387,9 +387,9 @@ func splitRegion(end, chunks int) []int {
}

// adjustRegion is used to move the end site a little backwards
func adjustRegion(regions []int, idx *bam.Index, ref *sam.Reference, bamReader *bam.Reader) ([]bgzf.Chunk, error) {
func adjustRegion(regions []int, idx *bam.Index, ref *sam.Reference, bamReader *bam.Reader) ([]*ChanChunk, error) {

res := make([]bgzf.Chunk, 0, 0)
res := make([]*ChanChunk, 0, 0)
for i := 0; i < len(regions); i++ {
if i > 0 && i < len(regions)-1 {
chunks, err := idx.Chunks(ref, regions[i], regions[i+1])
Expand All @@ -403,7 +403,6 @@ func adjustRegion(regions []int, idx *bam.Index, ref *sam.Reference, bamReader *
continue
}


iter, err := bam.NewIterator(bamReader, chunks)
if err != nil {
return res, err
Expand All @@ -427,12 +426,28 @@ func adjustRegion(regions []int, idx *bam.Index, ref *sam.Reference, bamReader *
}

for i := 1; i < len(regions); i++ {
// avoid duplicated regions
if regions[i-1] > regions[i] {
regions[i] = regions[i-1]
continue
}

chunks, err := idx.Chunks(ref, regions[i-1], regions[i])
if err != nil {
continue
// return res, errors.Wrapf(err, "failed after adjust %s: %d - %d", ref.Name(), region[i-1], region[i])
}
res = append(res, chunks...)
//res = append(res, chunks...)

for _, c := range chunks {
res = append(res, &ChanChunk{
Ref: ref.Name(),
Start: regions[i-1],
End: regions[i],
Chunks: c,
})
}

}

return res, nil
Expand Down Expand Up @@ -498,12 +513,7 @@ func fetchBamRefs() (map[string][]*ChanChunk, error) {
continue
}

tempRes := make([]*ChanChunk, 0, 0)
for _, t := range temp {
tempRes = append(tempRes, &ChanChunk{Ref: ref.Name(), Chunks: t})
}

res[ref.Name()] = tempRes
res[ref.Name()] = temp
}

sugar.Debug(res)
Expand Down
33 changes: 18 additions & 15 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,17 @@ func worker(
break
}

//chrRef, err := fetchFasta(ref.ToRegion())
//if err != nil {
// sugar.Warnf("try to modify %s", ref)
// chrRef, err = fetchFasta(ref.SwitchRegion())
//}
//if err != nil {
// sugar.Fatal(err)
//}

chrRef, ok := chrRefs[ref.Ref]
if ! ok {
if !ok {
sugar.Errorf("failed to get %s from reference", ref.Ref)
continue
}
Expand Down Expand Up @@ -139,8 +148,6 @@ func worker(
sugar.Fatal(record.SeqString())
}

// record.QueryPosition = append(record.QueryPosition, at)

genomic := start + record.Start

if _, ok := edits[genomic]; !ok {
Expand Down Expand Up @@ -170,13 +177,14 @@ func worker(
w <- "done"
}


func main() {
conf = defaultConfig()
goptions.ParseAndFail(&conf)

setLogger(conf.Debug, conf.Log)

sugar.Debugf("options: %v", conf)

if conf.Version {
sugar.Infof("current version: %v", VERSION)
os.Exit(0)
Expand Down Expand Up @@ -253,7 +261,7 @@ func main() {
lock.Lock()
chrRefs[ref] = temp
lock.Unlock()
} (ref, &wg, &lock)
}(ref, &wg, &lock)
}

wg.Wait()
Expand All @@ -266,19 +274,14 @@ func main() {
wg.Add(1)
}

if references, err := fetchBamRefs(); err == nil {
for ref, chunks := range references {
sugar.Infof("read reads from %s", ref)
for _, c := range chunks {
sugar.Debug(c)
refs <- c
}
for ref, chunks := range references {
sugar.Infof("read reads from %s", ref)
for _, c := range chunks {
sugar.Debug(c)
refs <- c
}
} else {
sugar.Error(err)
}


close(refs)
wg.Wait()

Expand Down

0 comments on commit c7c421f

Please sign in to comment.