Skip to content

Commit

Permalink
Merge pull request #92 from nschcolnicov/database_fix
Browse files Browse the repository at this point in the history
Added handling for custom databases
  • Loading branch information
lpantano authored Nov 8, 2024
2 parents 603f45c + 4405d3a commit 3ac54c4
Show file tree
Hide file tree
Showing 12 changed files with 44 additions and 19 deletions.
4 changes: 4 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
0.4.29

* Update handling of the `--database` argument to support custom databases [#90](https://github.com/miRTop/mirtop/issues/90)

0.4.28

* fix random order in Variant field [#84](https://github.com/miRTop/mirtop/issues/83)
Expand Down
4 changes: 4 additions & 0 deletions mirtop/command_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from mirtop.libs import spikeins
from mirtop.gff import update
from mirtop.sql import sql
from mirtop.mirna import mapper
import mirtop.libs.logger as mylog

import time
Expand All @@ -25,6 +26,9 @@ def main(**kwargs):
kwargs['args'].print_debug)
logger = mylog.getLogger(__name__)
start = time.time()
if not hasattr(kwargs["args"], "database"):
if ("sql" not in kwargs and "stats" not in kwargs and "update" not in kwargs and "validate" not in kwargs):
kwargs["args"].database = mapper.guess_database(kwargs["args"])

if "gff" in kwargs:
logger.info("Run annotation")
Expand Down
2 changes: 1 addition & 1 deletion mirtop/exporter/isomirs.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def convert(args):
def _convert_file(gff, args):
sep = "\t"
precursors = fasta.read_precursor(args.hairpin, args.sps)
matures = mapper.read_gtf_to_precursor(args.gtf)
matures = mapper.read_gtf_to_precursor(args.gtf, args.database)
variant_header = sep.join(['mism', 'add', 't5', 't3'])

gff_file = open(gff, 'r')
Expand Down
6 changes: 3 additions & 3 deletions mirtop/exporter/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def convert(args):
for fn in args.files:
out_file = op.join(args.out, "%s.vcf" % op.splitext(op.basename(fn))[0])
logger.info("Reading %s" % fn)
create_vcf(fn, args.hairpin, args.gtf, out_file)
create_vcf(fn, args.hairpin, args.gtf, out_file, args.database)
logger.info("VCF generated %s" % out_file)


Expand Down Expand Up @@ -121,7 +121,7 @@ def cigar_2_key(cigar, readseq, refseq, pos, var5p, var3p, parent_ini_pos, paren
return(key_pos, key_var, ref, alt)


def create_vcf(mirgff3, precursor, gtf, vcffile):
def create_vcf(mirgff3, precursor, gtf, vcffile, database):
"""
Args:
'mirgff3(str)': File with mirGFF3 format that will be converted
Expand Down Expand Up @@ -178,7 +178,7 @@ def create_vcf(mirgff3, precursor, gtf, vcffile):
n_noSNP = 0
no_var = 0
hairpins = read_precursor(precursor)
gff3 = read_gtf_to_precursor(gtf)
gff3 = read_gtf_to_precursor(gtf, database)
gtf_dic = read_gtf_to_mirna(gtf)
for line in range(0, len(gff3_data)):
if not gff3_data[line]:
Expand Down
7 changes: 5 additions & 2 deletions mirtop/gff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,14 @@ def reader(args):
read.reader(args)
return None
samples = []
database = mapper.guess_database(args)
if args.database is None:
database = mapper.guess_database(args)
else:
database = args.database
args.database = database
precursors = fasta.read_precursor(args.hairpin, args.sps)
args.precursors = precursors
matures = mapper.read_gtf_to_precursor(args.gtf)
matures = mapper.read_gtf_to_precursor(args.gtf,database)
args.matures = matures
# TODO check numbers of miRNA and precursors read
# TODO print message if numbers mismatch
Expand Down
2 changes: 1 addition & 1 deletion mirtop/gff/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def convert_gff_counts(args):
'iso_add3p', 'iso_snp']
if args.add_extra:
precursors = fasta.read_precursor(args.hairpin, args.sps)
matures = mapper.read_gtf_to_precursor(args.gtf)
matures = mapper.read_gtf_to_precursor(args.gtf, args.database)
variant_header = variant_header + ['iso_5p_nt', 'iso_3p_nt', 'iso_add3p_nt', 'iso_snp_nt']

logger.info("INFO Reading GFF file %s", args.gff)
Expand Down
2 changes: 1 addition & 1 deletion mirtop/gff/read.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def reader(args):
args.database = database
precursors = fasta.read_precursor(args.hairpin, args.sps)
args.precursors = precursors
matures = mapper.read_gtf_to_precursor(args.gtf)
matures = mapper.read_gtf_to_precursor(args.gtf, args.database)
args.matures = matures
# TODO check numbers of miRNA and precursors read
# TODO print message if numbers mismatch
Expand Down
2 changes: 1 addition & 1 deletion mirtop/importer/prost.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def read_file(fn, hairpins, database, mirna_gtf):
reads = defaultdict(hits)
sample = os.path.splitext(os.path.basename(fn))[0]
genomics = mapper.read_gtf_to_mirna(mirna_gtf)
matures = mapper.read_gtf_to_precursor(mirna_gtf)
matures = mapper.read_gtf_to_precursor(mirna_gtf, database)
non_mirna = 0
non_chromosome_mirna = 0
outside_mirna = 0
Expand Down
21 changes: 17 additions & 4 deletions mirtop/mirna/mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ def guess_database(args):
TODO: this needs to be generic to other databases.
"""
if not hasattr(args, "database"):
args.database = None
return _guess_database_file(args.gtf, args.database)


Expand Down Expand Up @@ -143,7 +145,7 @@ def read_gtf_chr2mirna2(gtf): # to remove
return db_mir


def read_gtf_to_precursor(gtf):
def read_gtf_to_precursor(gtf,database):
"""
Load GTF file with precursor positions on genome
Return dict with key being precursor name and
Expand All @@ -161,15 +163,26 @@ def read_gtf_to_precursor(gtf):
"""
if not gtf:
return gtf
if _guess_database_file(gtf).find("miRBase") > -1:
if _guess_database_file(gtf,database).find("miRBase") > -1:
mapped = read_gtf_to_precursor_mirbase(gtf)
elif _guess_database_file(gtf).find("MirGeneDB") > -1:
elif _guess_database_file(gtf,database).find("MirGeneDB") > -1:
mapped = read_gtf_to_precursor_mirgenedb(gtf)
else:
logger.info("Database different than miRBase or MirGeneDB")
logger.info("If you get an error when loading,")
logger.info("report it to https://github.com/miRTop/mirtop/issues")
mapped = read_gtf_to_precursor_mirbase(gtf)
try:
mapped = read_gtf_to_precursor_mirbase(gtf)
return mapped
except Exception as e:
print(f"Failed to parse with Mirbase: {e}")
try:
mapped = read_gtf_to_precursor_mirgenedb(gtf)
return mapped
except Exception as e:
print(f"Failed to parse with Mirgenedb: {e}")
raise ValueError(f"There is no parser available for the database that you used: {database}")

return mapped


Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ pybedtools
pandas
biopython
pyyaml
pybedtools
six
pytest
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
from setuptools import setup, find_packages

version = '0.4.28'
version = '0.4.29'
url = 'http://github.com/mirtop/mirtop'


Expand Down
9 changes: 5 additions & 4 deletions test/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def annotate(fn, read_file, load=False, create=True, keep_name=False,
args.keep_name = keep_name
from mirtop.mirna import fasta, mapper
precursors = fasta.read_precursor(args.hairpin, args.sps)
matures = mapper.read_gtf_to_precursor(args.gtf)
matures = mapper.read_gtf_to_precursor(args.gtf, args.database)
args.precursors = precursors
args.matures = matures
args.database = mapper.guess_database(args)
Expand Down Expand Up @@ -81,7 +81,7 @@ def test_read_hairpin(self):
from mirtop.libs import logger
logger.initialize_logger("test_read_files", True, True)
map_mir = mapper.read_gtf_to_precursor(
"data/examples/annotate/hsa.gff3")
"data/examples/annotate/hsa.gff3", None)
print(map_mir)
if map_mir["hsa-let-7a-1"]["hsa-let-7a-5p"][0] != 5:
raise ValueError("GFF is not loaded correctly.")
Expand All @@ -102,7 +102,7 @@ def test_read_hairpin_mirgenedb(self):
from mirtop.libs import logger
logger.initialize_logger("test_read_files", True, True)
map_mir = mapper.read_gtf_to_precursor(
"data/db/mirgenedb/hsa.gff")
"data/db/mirgenedb/hsa.gff", None)
print(map_mir)

##@attr(read_mir2chr=True)
Expand Down Expand Up @@ -259,7 +259,7 @@ def test_variant(self):
precursors = fasta.read_precursor("data/examples/annotate/hairpin.fa",
"hsa")
matures = mapper.read_gtf_to_precursor(
"data/examples/annotate/hsa.gff3")
"data/examples/annotate/hsa.gff3", None)
res = get_mature_sequence("GAAAATTTTTTTTTTTAAAAG", [5, 15])
if res != "AAAATTTTTTTTTTTAAAA":
raise ValueError("Results for GAAAATTTTTTTTTTTAAAAG was %s" % res)
Expand Down Expand Up @@ -447,6 +447,7 @@ def test_counts(self):
args.gff = 'data/examples/synthetic/let7a-5p.gff'
args.out = 'data/examples/synthetic'
args.add_extra = True
args.database = None
convert_gff_counts(args)
os.remove(os.path.join(args.out, "let7a-5p.tsv"))

Expand Down

0 comments on commit 3ac54c4

Please sign in to comment.