From 3632dd74c8f014106655f62f59c145bea5e83ae5 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 15 Sep 2023 08:07:46 -0700 Subject: [PATCH 1/4] fix exponential behavior --- src/sourmash/manifest.py | 8 ++++---- src/sourmash/sig/__main__.py | 6 ++++-- tests/test_manifest.py | 33 ++++++++++++++++++++++++++++++++- 3 files changed, 40 insertions(+), 7 deletions(-) diff --git a/src/sourmash/manifest.py b/src/sourmash/manifest.py index 4bcf1b7dbc..a8fb1647a5 100644 --- a/src/sourmash/manifest.py +++ b/src/sourmash/manifest.py @@ -232,11 +232,11 @@ def add_row(self, row): self._add_rows([row]) def _add_rows(self, rows): - self.rows.extend(rows) - - # maintain a fast check for md5sums for __contains__ check. md5set = self._md5_set - for row in self.rows: + + # only iterate once, in case it's a generator + for row in rows: + self.rows.append(row) md5set.add(row['md5']) def __iadd__(self, other): diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 7336f7ac79..48addfe1e4 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -14,7 +14,7 @@ from sourmash.sourmash_args import FileOutput from sourmash.logging import (set_quiet, error, notify, print_results, debug, - debug_literal) + debug_literal, _debug) from sourmash import sourmash_args from sourmash.minhash import _get_max_hash_for_scaled from sourmash.manifest import CollectionManifest @@ -1359,7 +1359,9 @@ def check(args): row['internal_location'] = filename total_manifest_rows.add_row(row) - debug_literal(f"examined {len(new_manifest)} new rows, found {len(sub_manifest)} matching rows") + # the len(sub_manifest) here should only be run when needed :) + if _debug: + debug_literal(f"examined {len(new_manifest)} new rows, found {len(sub_manifest)} matching rows") notify(f"loaded {total_rows_examined} signatures.") diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 3f63d4e88f..11023ecd21 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -4,7 +4,7 @@ from io import StringIO import sourmash -from sourmash import index +from sourmash import index, sourmash_args import sourmash_tst_utils as utils @@ -162,3 +162,34 @@ def test_save_load_manifest(): short_mf = index.CollectionManifest(rows) assert short_mf != manifest + + +def test_manifest_to_picklist_bug(runtmp): + # this tests a fun combination of things that led to a bug. + # tl;dr we only want to iterate once across a generator... + c = runtmp + all_zip = utils.get_test_data('prot/all.zip') + + idx = sourmash_args.load_file_as_index(all_zip) + assert len(idx) == 8 + + manifest = sourmash_args.get_manifest(idx) + assert len(manifest) == 8 + + def filter_fn(row): + # match? + keep = False + if "09a0869" in row['md5']: + keep = True + + return keep + + sub_manifest = manifest.filter_rows(filter_fn) + sub_picklist = sub_manifest.to_picklist() + idx = idx.select(picklist=sub_picklist) + + assert len(idx) == 1 + print(idx) + + x = list(idx.signatures()) + assert len(x) From e0b940b20c6e33a32d4bb574958a617ec17fa048 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 15 Sep 2023 08:42:55 -0700 Subject: [PATCH 2/4] test for a new problem --- src/sourmash/manifest.py | 2 ++ tests/test_manifest.py | 23 ++++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/sourmash/manifest.py b/src/sourmash/manifest.py index a8fb1647a5..466bfa8e7a 100644 --- a/src/sourmash/manifest.py +++ b/src/sourmash/manifest.py @@ -240,6 +240,8 @@ def _add_rows(self, rows): md5set.add(row['md5']) def __iadd__(self, other): + if self is other: + raise Exception("cannot directly add manifest to itself") self._add_rows(other.rows) return self diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 11023ecd21..97a9f42f8e 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -1,6 +1,7 @@ """ Tests for manifest code in databases, etc. """ +import pytest from io import StringIO import sourmash @@ -49,7 +50,8 @@ def test_manifest_operations(): siglist.append(sig) manifest = index.CollectionManifest(rows) - manifest += manifest + manifest2 = index.CollectionManifest(rows) + manifest += manifest2 assert len(manifest) == 2*len(rows) assert len(manifest) == 4 @@ -59,6 +61,24 @@ def test_manifest_operations(): assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list +def test_manifest_operations_fail(): + # test basic manifest operations - += + protzip = utils.get_test_data('prot/protein.zip') + + loader = sourmash.load_file_as_index(protzip) + + rows = [] + siglist = [] + for (sig, loc) in loader._signatures_with_internal(): + row = index.CollectionManifest.make_manifest_row(sig, loc) + rows.append(row) + siglist.append(sig) + + manifest = index.CollectionManifest(rows) + with pytest.raises(Exception): + manifest += manifest + + def test_manifest_to_picklist(): # test manifest/picklist interaction basics protzip = utils.get_test_data('prot/protein.zip') @@ -167,6 +187,7 @@ def test_save_load_manifest(): def test_manifest_to_picklist_bug(runtmp): # this tests a fun combination of things that led to a bug. # tl;dr we only want to iterate once across a generator... + # ref #2762 c = runtmp all_zip = utils.get_test_data('prot/all.zip') From d13dd24aa51ded81a4791c54e35f05809531dc10 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 15 Sep 2023 08:47:14 -0700 Subject: [PATCH 3/4] ensure iterate only once across rows being added --- tests/test_manifest.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 97a9f42f8e..a8813dd92e 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -214,3 +214,32 @@ def filter_fn(row): x = list(idx.signatures()) assert len(x) + + +def test_generate_manifest_iterate_once(): + # we should only iterate across manifest rows once + protzip = utils.get_test_data('prot/protein.zip') + + loader = sourmash.load_file_as_index(protzip) + + siglist = [] + for (sig, loc) in loader._signatures_with_internal(): + siglist.append(sig) + + # build generator function => will not allow iteration twice + def genfn(): + for (sig, loc) in loader._signatures_with_internal(): + row = index.CollectionManifest.make_manifest_row(sig, loc) + yield row + + manifest = index.CollectionManifest(genfn()) + + assert len(manifest) == 2 + assert len(manifest._md5_set) == 2 + + md5_list = [ row['md5'] for row in manifest.rows ] + assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list + assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + + for sig in siglist: + assert sig in manifest From e3c8d5802039d3c14deeb92ef99605513ea31154 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 15 Sep 2023 08:52:45 -0700 Subject: [PATCH 4/4] update comment --- tests/test_manifest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_manifest.py b/tests/test_manifest.py index a8813dd92e..074d72d705 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -62,7 +62,8 @@ def test_manifest_operations(): def test_manifest_operations_fail(): - # test basic manifest operations - += + # should not be able to add a manifest to itself - not only makes + # no sense, but it means you're modifying a generator in place, sometimes. protzip = utils.get_test_data('prot/protein.zip') loader = sourmash.load_file_as_index(protzip)