Skip to content

Commit

Permalink
do not overwrite signature even if duplicate md5sum (#1497)
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb authored May 5, 2021
1 parent 833645b commit b4cdbe8
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 0 deletions.
27 changes: 27 additions & 0 deletions src/sourmash/sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -606,7 +606,17 @@ def open(self):
def add(self, ss):
    """Write signature `ss` to its own gzipped file under `self.location`.

    The file is named after the signature's md5sum. If a file with that
    name is already present, a numeric suffix (`_0`, `_1`, ...) is added
    so that no existing file is overwritten.
    """
    super().add(ss)
    digest = ss.md5sum()

    # start from the plain md5 name; while the candidate is taken, move on
    # to the next numbered variant until a free name is found.
    target = os.path.join(self.location, f"{digest}.sig.gz")
    suffix = 0
    while os.path.exists(target):
        target = os.path.join(self.location, f"{digest}_{suffix}.sig.gz")
        suffix += 1

    with gzip.open(target, "wb") as out_fp:
        sig.save_signatures([ss], out_fp, compression=1)

Expand Down Expand Up @@ -663,12 +673,29 @@ def close(self):
def open(self):
    """Open `self.location` as a new zip archive for writing.

    Members are stored without zip-level compression (ZIP_STORED).
    """
    self.zf = zipfile.ZipFile(
        file=self.location,
        mode='w',
        compression=zipfile.ZIP_STORED,
    )

def _exists(self, name):
    """Return True if the open zip archive already has a member `name`."""
    try:
        # ZipFile.getinfo raises KeyError for a name not in the archive.
        self.zf.getinfo(name)
    except KeyError:
        return False
    else:
        return True

def add(self, ss):
    """Store signature `ss` as a member of the open zip archive.

    The member is named `signatures/<md5>.sig.gz`. If that name is already
    present in the archive (duplicate md5sum), a numeric suffix is added
    (`signatures/<md5>_0.sig.gz`, `_1`, ...) so the earlier entry is not
    overwritten.

    The archive must already have been opened (`self.zf` set).
    """
    assert self.zf
    super().add(ss)

    md5 = ss.md5sum()
    outname = f"signatures/{md5}.sig.gz"

    # don't overwrite even if duplicate md5sum. The fallback names must stay
    # under the same "signatures/" prefix as the primary name: they are zip
    # member names, not filesystem paths, so self.location (the path of the
    # zip file itself) must not be part of them.
    i = 0
    while self._exists(outname):
        outname = f"signatures/{md5}_{i}.sig.gz"
        i += 1

    json_str = sourmash.save_signatures([ss], compression=1)
    self.zf.writestr(outname, json_str)

Expand Down
48 changes: 48 additions & 0 deletions tests/test_sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,31 @@ def test_save_signatures_to_location_1_zip(runtmp):
assert len(saved) == 2


def test_save_signatures_to_location_1_zip_dup(runtmp):
    # save duplicate signatures to a .zip; every copy must be kept
    path2 = utils.get_test_data('2.fa.sig')
    path47 = utils.get_test_data('47.fa.sig')
    ss2 = sourmash.load_one_signature(path2, ksize=31)
    ss47 = sourmash.load_one_signature(path47, ksize=31)

    outloc = runtmp.output('foo.zip')
    with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig:
        print(save_sig)
        # add each signature twice, interleaved
        for one_sig in (ss2, ss47, ss2, ss47):
            save_sig.add(one_sig)

    # can we open as a .zip file?
    with zipfile.ZipFile(outloc, "r") as zf:
        assert list(zf.infolist())

    saved = list(sourmash.load_file_as_signatures(outloc))
    assert ss2 in saved
    assert ss47 in saved
    assert len(saved) == 4


def test_save_signatures_to_location_1_dirout(runtmp):
# save to sigout/ (directory)
sig2 = utils.get_test_data('2.fa.sig')
Expand All @@ -154,3 +179,26 @@ def test_save_signatures_to_location_1_dirout(runtmp):
assert ss2 in saved
assert ss47 in saved
assert len(saved) == 2


def test_save_signatures_to_location_1_dirout_duplicate(runtmp):
    # save duplicate signatures to sigout/ (directory); every copy must be kept
    path2 = utils.get_test_data('2.fa.sig')
    path47 = utils.get_test_data('47.fa.sig')
    ss2 = sourmash.load_one_signature(path2, ksize=31)
    ss47 = sourmash.load_one_signature(path47, ksize=31)

    outloc = runtmp.output('sigout/')
    with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig:
        print(save_sig)
        # add each signature twice, interleaved
        for one_sig in (ss2, ss47, ss2, ss47):
            save_sig.add(one_sig)

    assert os.path.isdir(outloc)

    saved = list(sourmash.load_file_as_signatures(outloc))
    assert ss2 in saved
    assert ss47 in saved
    assert len(saved) == 4

0 comments on commit b4cdbe8

Please sign in to comment.