Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[EXP] add support for rapid signature selection from Zipfile collections by md5 picklists #1589

Closed
wants to merge 2 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 54 additions & 17 deletions src/sourmash/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,24 +484,61 @@ def load(cls, location, traverse_yield_all=False):
def signatures(self):
    """Yield the signatures stored in the zip file.

    Candidate archive members are chosen according to the picklist (if
    any) stored under the ``'picklist'`` key of ``self.selection_dict``:

    * ``coltype == 'md5'``: each value in ``picklist.pickset`` is looked
      up by name as ``signatures/<md5>.sig`` or
      ``signatures/<md5>.sig.gz`` (first match wins) instead of
      iterating over every member of the archive.
    * ``coltype == 'md5prefix8'``: every member under ``signatures/``
      whose file name starts with an 8-character prefix contained in
      ``picklist.pickset``.
    * otherwise: every member whose name ends in ``.sig`` or
      ``.sig.gz``, or every member at all when
      ``self.traverse_yield_all`` is true.

    Every signature loaded from a candidate member is yielded, after
    being filtered through ``select_signature`` with
    ``self.selection_dict`` when a selection is set.
    """
    from .signature import load_signatures

    selection_dict = self.selection_dict
    picklist = selection_dict.get('picklist') if selection_dict else None
    coltype = picklist.coltype if picklist else None

    sig_dir = 'signatures/'

    def candidate_members():
        "Yield the ZipInfo entries that should be opened and parsed."
        # treat md5 picklists specially!
        # CTB: here, should we be worried if we don't find a signature
        # we want? e.g. how do we conclude that a picklist optimization
        # doesn't apply?
        if coltype == 'md5':
            patterns = (sig_dir + '{md5}.sig', sig_dir + '{md5}.sig.gz')
            for md5 in picklist.pickset:
                for pattern in patterns:
                    try:
                        zipinfo = self.zf.getinfo(pattern.format(md5=md5))
                    except KeyError:
                        # not stored under this name; try the next one.
                        continue
                    yield zipinfo
                    break
        elif coltype == 'md5prefix8':
            # prefixes cannot be looked up by name, so filter the full
            # member list on the first 8 characters of the file name.
            for zipinfo in self.zf.infolist():
                name = zipinfo.filename
                if not name.startswith(sig_dir):
                    continue
                if name[len(sig_dir):][:8] in picklist.pickset:
                    yield zipinfo
        else:
            # should we load this file? if it ends in .sig OR we are
            # forcing:
            for zipinfo in self.zf.infolist():
                if self.traverse_yield_all or \
                   zipinfo.filename.endswith(('.sig', '.sig.gz')):
                    yield zipinfo

    for zipinfo in candidate_members():
        # close each member's file handle once its signatures are read.
        with self.zf.open(zipinfo) as fp:
            # now load all the signatures and select on ksize/moltype.
            # note: if 'fp' doesn't contain a valid JSON signature,
            # load_signatures will silently fail & yield nothing.
            for ss in load_signatures(fp):
                if not selection_dict or \
                   select_signature(ss, **selection_dict):
                    yield ss

def select(self, **kwargs):
"Select signatures in zip file based on ksize/moltype/etc."
Expand Down