Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Refactor subject database and signature loading for search, gather, and multigather. #934

Closed
wants to merge 25 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
807602f
initial refactoring of load_dbs_and_sigs
ctb Apr 4, 2020
42b569a
a bit of cleanup and refactoring on new load_dbs_and_sigs code
ctb Apr 4, 2020
d8f416b
getting started on refactoring gather
ctb Apr 4, 2020
a096a90
added utility functions for loading signatures and databases, + params
ctb Apr 5, 2020
18c5884
added tests for failed selector check
ctb Apr 5, 2020
709b3e0
implement add_database functionality
ctb Apr 5, 2020
39e8365
add query signature selection test
ctb Apr 6, 2020
51f49f4
made basic gather code work again
ctb Apr 6, 2020
aaf460c
fix some minor typos/bugs
ctb Apr 6, 2020
d385f55
get tests working, mostly
ctb Apr 6, 2020
268e1fe
fix missing import
ctb Apr 6, 2020
c7a1331
add selector framework to Index and moltype property to SourmashSigna…
ctb Apr 6, 2020
ff7bc9a
add selector framework to Index and moltype property to SourmashSigna…
ctb Apr 6, 2020
4ca476a
move moltype property to MinHash object
ctb Apr 6, 2020
97926f7
add tests for MinHash.moltype property
ctb Apr 6, 2020
5a98b90
remove whitespace
ctb Apr 6, 2020
0f0867b
test LinearIndex selector function
ctb Apr 6, 2020
f5fbcbb
add another selector test, this time for moltype
ctb Apr 6, 2020
b8ed293
Merge branch 'refactor/add_index_selectors' into refactor/load_dbs
ctb Apr 6, 2020
2cb891c
comments added to the gather command
ctb Apr 6, 2020
38c4d33
remove moltype from SourmashSignature
ctb Apr 6, 2020
1c77365
better error handling, add new structure into 'search'
ctb Apr 6, 2020
75dfd6c
comment out debug print
ctb Apr 7, 2020
8e598cc
found and fixed a test for improved functionality
ctb Apr 7, 2020
81a1368
most tests working, now need to work on scaled vs num
ctb Apr 7, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 13 additions & 10 deletions sourmash/_minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,16 +566,19 @@ def is_molecule_type(self, molecule):
"""Check if this MinHash is a particular human-readable molecule type.

Supports 'protein', 'dayhoff', 'hp', 'DNA'.
@CTB deprecate for 4.0?
"""
if molecule.lower() not in ('protein', 'dayhoff', 'hp', 'dna'):
raise ValueError("unknown moltype in query, '{}'".format(molecule))
if self.is_protein and molecule == 'protein':
return True
elif self.dayhoff and molecule == 'dayhoff':
return True
elif self.hp and molecule == 'hp':
return True
elif molecule.lower() == "dna" and self.is_dna:
return True

return False
return molecule == self.moltype

@property
def moltype(self):  # TODO: test in minhash tests
    """Return the human-readable molecule type of this MinHash.

    The flags are consulted in a fixed order -- ``is_protein``, then
    ``dayhoff``, then ``hp`` -- and the first truthy one decides the
    result. When none of them is set the sketch is reported as 'DNA'.

    Returns one of 'protein', 'dayhoff', 'hp' or 'DNA'.
    """
    if self.is_protein:
        return 'protein'
    if self.dayhoff:
        return 'dayhoff'
    if self.hp:
        return 'hp'
    # no protein-style encoding flag is set
    return 'DNA'
98 changes: 79 additions & 19 deletions sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,12 +414,37 @@ def search(args):
from .search import search_databases

set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)

# set up the query.
query = sourmash_args.load_query_signature(args.query,
ksize=args.ksize,
select_moltype=moltype)
# load the command line argument selectors & query signature(s).
db_loader = sourmash_args.SearchDBLoader2(require_scaled=True)
db_loader.parse_args_selectors(args)
if not db_loader.load_query(args.query):
error("Cannot load query signature; exiting.")
sys.exit(-1)
if not db_loader.check_query_against_arg_selectors():
error("Cannot match query signature with args; exiting.")
sys.exit(-1)

# load the databases.
loaded_db_list = []
for filename in args.databases:
db, params = sourmash_args.load_target_with_params(filename)
if not db:
error("couldn't load {}", filename)
sys.exit(-1)
if not db_loader.add_database(filename, params):
error("couldn't match {}", filename)
sys.exit(-1)

loaded_db_list.append((db, filename, 'XYZ'))

# now that we've loaded the databases, figure out which query (queries?)
# are compatible. If there's exactly one, perfect!
if not db_loader.decide_query():
error("couldn't find acceptable query.")
sys.exit(-1)

query = db_loader.chosen_query
notify('loaded query: {}... (k={}, {})', query.name()[:30],
query.minhash.ksize,
sourmash_args.get_moltype(query))
Expand All @@ -435,15 +460,19 @@ def search(args):
query.minhash.scaled, int(args.scaled))
query.minhash = query.minhash.downsample_scaled(args.scaled)

# set up the search databases
databases = sourmash_args.load_dbs_and_sigs(args.databases, query,
not args.containment,
args.traverse_directory)

# forcibly ignore abundances if query has no abundances
if not query.minhash.track_abundance:
args.ignore_abundance = True

# set up the search databases
# now that we have the query, apply the same selector to the databases.
databases = []
for (db, filename, _) in loaded_db_list:
new_db = db.select(ksize=query.minhash.ksize,
moltype=query.minhash.moltype)
# @CTB: here is also where we select the scaled.
databases.append((new_db, filename, 'XXX'))

if not len(databases):
error('Nothing found to search!')
sys.exit(-1)
Expand Down Expand Up @@ -570,17 +599,42 @@ def gather(args):
from .search import gather_databases, format_bp

set_quiet(args.quiet, args.debug)
moltype = sourmash_args.calculate_moltype(args)

# load the query signature & figure out all the things
query = sourmash_args.load_query_signature(args.query,
ksize=args.ksize,
select_moltype=moltype)
# load the command line argument selectors & query signature(s).
db_loader = sourmash_args.SearchDBLoader2(require_scaled=True)
db_loader.parse_args_selectors(args)
if not db_loader.load_query(args.query):
error("Cannot load query signature; exiting.")
sys.exit(-1)
if not db_loader.check_query_against_arg_selectors():
error("Cannot match query signature with args; exiting.")
sys.exit(-1)

# load the databases.
loaded_db_list = []
for filename in args.databases:
db, params = sourmash_args.load_target_with_params(filename)
if not db:
error("couldn't load {}", filename)
sys.exit(-1)
if not db_loader.add_database(filename, params):
error("couldn't match {}", filename)
sys.exit(-1)

loaded_db_list.append((db, filename, 'XYZ'))

# now that we've loaded the databases, figure out which query (queries?)
# are compatible. If there's exactly one, perfect!
if not db_loader.decide_query():
error("couldn't find acceptable query.")
sys.exit(-1)

query = db_loader.chosen_query
notify('loaded query: {}... (k={}, {})', query.name()[:30],
query.minhash.ksize,
sourmash_args.get_moltype(query))

# verify signature was computed right.
# verify signature was computed with --scaled.
if query.minhash.scaled == 0:
error('query signature needs to be created with --scaled')
sys.exit(-1)
Expand All @@ -596,14 +650,20 @@ def gather(args):
error('no query hashes!? exiting.')
sys.exit(-1)

# set up the search databases
databases = sourmash_args.load_dbs_and_sigs(args.databases, query, False,
args.traverse_directory)
# now that we have the query, apply the same selector to the databases.
databases = []
for (db, filename, _) in loaded_db_list:
new_db = db.select(ksize=query.minhash.ksize,
moltype=query.minhash.moltype)
# @CTB: here is also where we select the scaled.
databases.append((new_db, filename, 'XXX'))

if not len(databases):
error('Nothing found to search!')
sys.exit(-1)

### execute the gather algorithm.

found = []
weighted_missed = 1
new_max_hash = query.minhash.max_hash
Expand Down
15 changes: 15 additions & 0 deletions sourmash/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ def gather(self, query, *args, **kwargs):

return results

@abstractmethod
def select(self, ksize=None, moltype=None):
    """Restrict this index to signatures matching the given selectors.

    ksize   -- k-mer size to require, or None for no constraint.
    moltype -- molecule type string to require, or None for no
               constraint.

    Concrete index classes must override this method; the abstract
    version has no behavior of its own.
    """

class LinearIndex(Index):
def __init__(self, _signatures=None, filename=None):
Expand Down Expand Up @@ -125,3 +128,15 @@ def load(cls, location):

lidx = LinearIndex(si, filename=location)
return lidx

def select(self, ksize=None, moltype=None):
    """Build a new LinearIndex holding only the matching signatures.

    ksize   -- keep signatures whose minhash has this k-mer size;
               None disables the ksize filter.
    moltype -- keep signatures whose minhash reports this molecule
               type; None disables the moltype filter.

    The matches are handed to the LinearIndex constructor as a lazy
    generator, together with this index's filename.
    """
    def is_match(sig):
        # a selector left as None places no constraint on the signature
        if ksize is not None and sig.minhash.ksize != ksize:
            return False
        return moltype is None or sig.minhash.moltype == moltype

    matching = (sig for sig in self._signatures if is_match(sig))
    return LinearIndex(matching, self.filename)
12 changes: 12 additions & 0 deletions sourmash/lca/lca_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,18 @@ def signatures(self):
for v in self._signatures.values():
yield SourmashSignature(v)

def select(self, ksize=None, moltype=None):
    """Check that this LCA database satisfies the given selectors.

    ksize   -- required k-mer size, compared against self.ksize;
               None means no constraint.
    moltype -- required molecule type; only 'DNA' (or None, meaning
               no constraint) is accepted here.

    Returns this same object when both selectors are satisfied.
    Raises ValueError otherwise.
    """
    ksize_matches = ksize is None or self.ksize == ksize
    moltype_matches = moltype is None or moltype == 'DNA'

    if ksize_matches and moltype_matches:
        return self

    raise ValueError("cannot select LCA on ksize {} / moltype {}".format(ksize, moltype))

def load(self, db_name):
"Load from a JSON file."
xopen = open
Expand Down
14 changes: 14 additions & 0 deletions sourmash/sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,20 @@ def signatures(self):
for k in self.leaves():
yield k.data

def select(self, ksize=None, moltype=None):
    """Check that this SBT satisfies the given selectors.

    ksize   -- required k-mer size, or None for no constraint.
    moltype -- required molecule type string, or None for no
               constraint.

    Only the first signature yielded by self.signatures() is inspected.
    NOTE(review): this presumes every signature in the tree shares the
    same ksize and moltype -- confirm against how trees are built.

    Returns this same object when the selectors are satisfied, or when
    the tree holds no signatures at all. Raises ValueError on a mismatch.
    """
    # Supplying a default keeps an empty tree from leaking a bare
    # StopIteration out of this method.
    first_sig = next(iter(self.signatures()), None)
    if first_sig is None:
        # nothing stored, so nothing can conflict with the selectors
        return self

    ok = True
    if ksize is not None and first_sig.minhash.ksize != ksize:
        ok = False
    if moltype is not None and first_sig.minhash.moltype != moltype:
        ok = False

    if ok:
        return self

    raise ValueError("cannot select SBT on ksize {} / moltype {}".format(ksize, moltype))

def new_node_pos(self, node):
if not self._nodes:
self.next_node = 1
Expand Down
1 change: 1 addition & 0 deletions sourmash/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def _find_best(dblist, query, threshold_bp):
best_filename = None

# search across all databases
# @CTB filetype no longer needed here
for (obj, filename, filetype) in dblist:
for cont, match, fname in obj.gather(query, threshold_bp=threshold_bp):
assert cont # all matches should be nonzero.
Expand Down
Loading