Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Counter-based gather #1311

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 64 additions & 1 deletion src/sourmash/index.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"An Abstract Base Class for collections of signatures."

from abc import abstractmethod, ABC
from collections import namedtuple
from collections import namedtuple, Counter


class Index(ABC):
Expand Down Expand Up @@ -117,6 +117,69 @@ def gather(self, query, *args, **kwargs):

return results

def counter_gather(self, query, *args, **kwargs):
    """Decompose `query` into matching signatures using a greedy gather.

    Every signature in this index is scored by the number of hashes it
    shares with the query. The best-scoring signature is then repeatedly
    taken as a match, and the query hashes it explains are removed from the
    scores of all remaining signatures, until nothing scores at or above
    the threshold.

    Parameters:
        query: signature to decompose; its ``minhash`` must be scaled.
        threshold_bp (keyword, default 0.0): minimum estimated overlap in
            base pairs; converted to a hash count by dividing by ``scaled``.

    Returns:
        A list of ``(containment, match, filename)`` tuples sorted by
        descending containment, with ties broken on the match's md5sum.
        ``containment`` is measured against the full, original query.

    Raises:
        ValueError: if the query minhash has no ``scaled`` value.
    """
    query_mh = query.minhash
    if not query_mh:  # empty query? quit.
        return []

    scaled = query_mh.scaled
    if not scaled:
        raise ValueError('gather requires scaled signatures')

    threshold_bp = kwargs.get('threshold_bp', 0.0)
    threshold = 0.0
    n_threshold_hashes = 0

    # are we setting a threshold?
    if threshold_bp:
        # a threshold_bp of N amounts to N/scaled hashes...
        n_threshold_hashes = float(threshold_bp) / scaled

        # ...which in turn requires the following containment:
        threshold = n_threshold_hashes / len(query_mh)

        # is it too high to ever match? if so, exit.
        if threshold > 1.0:
            return []

    # Pre-load signatures so datasets can be referred to by position.
    signatures = list(self.signatures())

    # Number of hashes each dataset shares with the query. Datasets with
    # no overlap can never become a match, so they are left out.
    counter = Counter()
    for idx, ss in enumerate(signatures):
        n_common = query_mh.count_common(ss.minhash, True)
        if n_common:
            counter[idx] = n_common

    # Query hashes not yet explained by any match.
    # NOTE(review): assumes MinHash exposes its hash values through the
    # `.hashes` mapping -- confirm against the sourmash version in use.
    remaining = set(query_mh.hashes)
    filename = getattr(self, "filename", None)

    results = []
    while counter:
        best_id, size = counter.most_common(1)[0]
        if size < n_threshold_hashes:
            break

        match = signatures[best_id]
        del counter[best_id]

        cont = query_mh.contained_by(match.minhash, True)
        if cont and cont >= threshold:
            results.append((cont, match, filename))

        # Only hashes in the intersection of the match and the (remaining)
        # query may be subtracted from the other datasets. Subtracting each
        # dataset's full overlap with the match -- which can be far larger
        # than its overlap with the query -- drives counts negative and
        # discards valid matches.
        found = remaining.intersection(match.minhash.hashes)
        remaining -= found

        # Iterate over a snapshot: entries are deleted while looping. The
        # match itself was removed above, so it is never re-inserted.
        for other_id in list(counter):
            overlap = len(found.intersection(signatures[other_id].minhash.hashes))
            counter[other_id] -= overlap
            if counter[other_id] <= 0:
                del counter[other_id]

    results.sort(reverse=True, key=lambda x: (x[0], x[1].md5sum()))

    return results

@abstractmethod
def select(self, ksize=None, moltype=None):
""
Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def _find_best(dblist, query, threshold_bp):

# search across all databases
for (obj, filename, filetype) in dblist:
for cont, match, fname in obj.gather(query, threshold_bp=threshold_bp):
for cont, match, fname in obj.counter_gather(query, threshold_bp=threshold_bp):
assert cont # all matches should be nonzero.

# note, break ties based on name, to ensure consistent order.
Expand Down