Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MRG: fix exponential time explosion in sig check #2762

Merged
merged 4 commits into from
Sep 15, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions src/sourmash/manifest.py
Original file line number Diff line number Diff line change
@@ -232,14 +232,16 @@ def add_row(self, row):
self._add_rows([row])

def _add_rows(self, rows):
self.rows.extend(rows)

# maintain a fast check for md5sums for __contains__ check.
md5set = self._md5_set
for row in self.rows:

# only iterate once, in case it's a generator
for row in rows:
self.rows.append(row)
md5set.add(row['md5'])

def __iadd__(self, other):
if self is other:
raise Exception("cannot directly add manifest to itself")
self._add_rows(other.rows)
return self

6 changes: 4 additions & 2 deletions src/sourmash/sig/__main__.py
Original file line number Diff line number Diff line change
@@ -14,7 +14,7 @@
from sourmash.sourmash_args import FileOutput

from sourmash.logging import (set_quiet, error, notify, print_results, debug,
debug_literal)
debug_literal, _debug)
from sourmash import sourmash_args
from sourmash.minhash import _get_max_hash_for_scaled
from sourmash.manifest import CollectionManifest
@@ -1359,7 +1359,9 @@ def check(args):
row['internal_location'] = filename
total_manifest_rows.add_row(row)

debug_literal(f"examined {len(new_manifest)} new rows, found {len(sub_manifest)} matching rows")
# the len(sub_manifest) here should only be run when needed :)
if _debug:
debug_literal(f"examined {len(new_manifest)} new rows, found {len(sub_manifest)} matching rows")

notify(f"loaded {total_rows_examined} signatures.")

86 changes: 84 additions & 2 deletions tests/test_manifest.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
"""
Tests for manifest code in databases, etc.
"""
import pytest
from io import StringIO

import sourmash
from sourmash import index
from sourmash import index, sourmash_args

import sourmash_tst_utils as utils

@@ -49,7 +50,8 @@ def test_manifest_operations():
siglist.append(sig)

manifest = index.CollectionManifest(rows)
manifest += manifest
manifest2 = index.CollectionManifest(rows)
manifest += manifest2

assert len(manifest) == 2*len(rows)
assert len(manifest) == 4
@@ -59,6 +61,25 @@ def test_manifest_operations():
assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list


def test_manifest_operations_fail():
# should not be able to add a manifest to itself - not only makes
# no sense, but it means you're modifying a generator in place, sometimes.
protzip = utils.get_test_data('prot/protein.zip')

loader = sourmash.load_file_as_index(protzip)

rows = []
siglist = []
for (sig, loc) in loader._signatures_with_internal():
row = index.CollectionManifest.make_manifest_row(sig, loc)
rows.append(row)
siglist.append(sig)

manifest = index.CollectionManifest(rows)
with pytest.raises(Exception):
manifest += manifest


def test_manifest_to_picklist():
# test manifest/picklist interaction basics
protzip = utils.get_test_data('prot/protein.zip')
@@ -162,3 +183,64 @@ def test_save_load_manifest():

short_mf = index.CollectionManifest(rows)
assert short_mf != manifest


def test_manifest_to_picklist_bug(runtmp):
# this tests a fun combination of things that led to a bug.
# tl;dr we only want to iterate once across a generator...
# ref #2762
c = runtmp
all_zip = utils.get_test_data('prot/all.zip')

idx = sourmash_args.load_file_as_index(all_zip)
assert len(idx) == 8

manifest = sourmash_args.get_manifest(idx)
assert len(manifest) == 8

def filter_fn(row):
# match?
keep = False
if "09a0869" in row['md5']:
keep = True

return keep

sub_manifest = manifest.filter_rows(filter_fn)
sub_picklist = sub_manifest.to_picklist()
idx = idx.select(picklist=sub_picklist)

assert len(idx) == 1
print(idx)

x = list(idx.signatures())
assert len(x)


def test_generate_manifest_iterate_once():
# we should only iterate across manifest rows once
protzip = utils.get_test_data('prot/protein.zip')

loader = sourmash.load_file_as_index(protzip)

siglist = []
for (sig, loc) in loader._signatures_with_internal():
siglist.append(sig)

# build generator function => will not allow iteration twice
def genfn():
for (sig, loc) in loader._signatures_with_internal():
row = index.CollectionManifest.make_manifest_row(sig, loc)
yield row

manifest = index.CollectionManifest(genfn())

assert len(manifest) == 2
assert len(manifest._md5_set) == 2

md5_list = [ row['md5'] for row in manifest.rows ]
assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list
assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list

for sig in siglist:
assert sig in manifest