add tests for max ani, max contain
bluegenes committed Feb 24, 2024
1 parent 53dc312 commit 6929f50
Showing 2 changed files with 114 additions and 10 deletions.
20 changes: 10 additions & 10 deletions src/python/tests/test-data/cluster.pairwise.csv
@@ -1,14 +1,14 @@
 query_name,query_md5,match_name,match_md5,containment,max_containment,jaccard,intersect_hashes,query_ani,match_ani,average_containment_ani,max_containment_ani
 n1,md5q1,n2,md5m1,0.8,0.9,0.55,100,0.9,0.9,0.9,0.9
 n1,md5q1,n3,md5m1,0.8,0.9,0.55,100,0.9,0.9,0.9,0.9
-n1,md5q1,n4,md5m1,0.8,0.9,0.55,100,0.9,0.9,0.45,0.9
-n1,md5q1,n5,md5m1,0.8,0.9,0.55,100,0.9,0.9,0.30,0.9
-n1,md5q1,n6,md5m1,0.8,0.9,0.55,100,0.9,0.9,0.5,0.9
-n2,md5q1,n1,md5m2,0.6,0.7,0.45,150,0.5,0.5,0.5,0.5
-n2,md5q1,n2,md5m2,0.6,0.7,0.45,150,0.5,0.5,0.5,0.5
-n2,md5q1,n3,md5m2,0.6,0.7,0.45,150,0.9,0.9,0.95,0.9
-n2,md5q1,n4,md5m2,0.6,0.7,0.45,150,0.9,0.9,0.95,0.9
-n3,md5q2,n4,md5m3,0.7,0.8,0.6,200,0.9,0.9,0.95,0.9
+n1,md5q1,n4,md5m1,0.8,0.9,0.55,100,0.9,0.9,0.45,0.45
+n1,md5q1,n5,md5m1,0.8,0.9,0.55,100,0.9,0.5,0.3,0.3
+n1,md5q1,n6,md5m1,0.8,0.8,0.55,100,0.9,0.5,0.5,0.5
+n2,md5q1,n1,md5m2,0.6,0.6,0.45,150,0.5,0.5,0.5,0.5
+n2,md5q1,n2,md5m2,0.6,0.6,0.45,150,0.5,0.5,0.5,0.5
+n2,md5q1,n3,md5m2,0.6,0.6,0.45,150,0.9,0.9,0.95,0.95
+n2,md5q1,n4,md5m2,0.6,0.6,0.45,150,0.9,0.9,0.95,0.95
+n3,md5q2,n4,md5m3,0.7,0.8,0.6,200,0.9,0.9,0.95,0.95
 n4,md5q3,n5,md5m4,0.4,0.5,0.65,250,0.9,0.9,0.9,0.9
-n5,md5q3,n6,md5m4,0.4,0.5,0.65,250,0.9,0.9,0.7,0.9
-n6,md5q4,n7,md5m5,0.85,0.95,0.5,300,0.9,0.9,0.92,0.9
+n5,md5q3,n6,md5m4,0.4,0.5,0.65,250,0.9,0.9,0.7,0.7
+n6,md5q4,n7,md5m5,0.85,0.95,0.5,300,0.9,0.9,0.92,0.92
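For orientation, here is a minimal sketch of how the cluster expectations in the new tests can be reproduced by hand from this CSV. It assumes, based on the test expectations rather than the plugin's source, that clustering keeps every pair whose chosen similarity column is at or above the threshold and then reports connected components; `cluster_pairs` and its parameters are illustrative names, not the plugin's API.

```python
# Sketch only: reproduce the tests' expected components from cluster.pairwise.csv
# by thresholding one similarity column and taking connected components.
import csv
from collections import defaultdict

def cluster_pairs(csv_path, similarity_column="max_containment", threshold=0.7):
    # Build an adjacency list from pairs at or above the threshold.
    adj = defaultdict(set)
    with open(csv_path, newline='') as fp:
        for row in csv.DictReader(fp):
            q, m = row['query_name'], row['match_name']
            if q != m and float(row[similarity_column]) >= threshold:
                adj[q].add(m)
                adj[m].add(q)

    # Connected components via iterative depth-first search.
    seen, components = set(), []
    for node in adj:
        if node in seen:
            continue
        stack, comp = [node], set()
        while stack:
            n = stack.pop()
            if n in seen:
                continue
            seen.add(n)
            comp.add(n)
            stack.extend(adj[n] - seen)
        components.append(comp)
    return components

# With the updated CSV above:
#   cluster_pairs(path, "max_containment", 0.7) -> one component {n1..n7}
#   cluster_pairs(path, "max_containment", 0.9) -> {n1..n5} and {n6, n7}
```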
104 changes: 104 additions & 0 deletions src/python/tests/test_cluster.py
@@ -38,6 +38,73 @@ def test_cluster_containment(runtmp):
assert rows[0]['count'] == '1'


def test_cluster_max_containment_1(runtmp):
pairwise_csv = get_test_data('cluster.pairwise.csv')
output = runtmp.output('clusters.csv')
sizes = runtmp.output('sizes.csv')
threshold = '0.7'

runtmp.sourmash('scripts', 'cluster', pairwise_csv, '-o', output,
'--similarity-column', "max_containment", "--cluster-sizes",
sizes, '--threshold', threshold)

assert os.path.exists(output)

# check cluster output
with open(output, mode='r', newline='') as csvfile:
reader = csv.DictReader(csvfile)
rows = [row for row in reader]
assert reader.fieldnames == ['cluster','nodes']
assert len(rows) == 1, f"Expected 1 data row but found {len(rows)}"
assert rows[0]['cluster'] == 'Component_1'
expected = set("n2;n3;n7;n1;n6;n5;n4".split(';'))
assert set(rows[0]['nodes'].split(';')) == expected

# check cluster size histogram
with open(sizes, mode='r', newline='') as csvfile:
reader = csv.DictReader(csvfile)
rows = [row for row in reader]
assert reader.fieldnames == ['cluster_size','count']
assert len(rows) == 1, f"Expected 1 data row but found {len(rows)}"
assert rows[0]['cluster_size'] == '7'
assert rows[0]['count'] == '1'


def test_cluster_max_containment_2(runtmp):
pairwise_csv = get_test_data('cluster.pairwise.csv')
output = runtmp.output('clusters.csv')
sizes = runtmp.output('sizes.csv')
threshold = '0.9'

runtmp.sourmash('scripts', 'cluster', pairwise_csv, '-o', output,
'--similarity-column', "max_containment", "--cluster-sizes",
sizes, '--threshold', threshold)

assert os.path.exists(output)

# check cluster output
with open(output, mode='r', newline='') as csvfile:
reader = csv.DictReader(csvfile)
rows = [row for row in reader]
assert reader.fieldnames == ['cluster','nodes']
assert len(rows) == 2, f"Expected 2 data rows but found {len(rows)}"
assert rows[0]['cluster'] == 'Component_1'
expected = set("n1;n2;n3;n4;n5".split(';'))
assert set(rows[0]['nodes'].split(';')) == expected
expected = set("n6;n7".split(';'))
assert set(rows[1]['nodes'].split(';')) == expected

# check cluster size histogram
with open(sizes, mode='r', newline='') as csvfile:
reader = csv.DictReader(csvfile)
rows = [row for row in reader]
assert reader.fieldnames == ['cluster_size','count']
assert len(rows) == 2, f"Expected 2 data rows but found {len(rows)}"
rows_as_tuples = {tuple(row.values()) for row in rows}
expected = {('5', '1'), ('2', '1')}
assert rows_as_tuples == expected


def test_cluster_jaccard(runtmp):
pairwise_csv = get_test_data('cluster.pairwise.csv')
output = runtmp.output('clusters.csv')
@@ -91,6 +158,43 @@ def test_cluster_ani(runtmp):
assert rows[0]['cluster'] == 'Component_1'
expected = set("n1;n2;n3;n4;n5".split(';'))
assert set(rows[0]['nodes'].split(';')) == expected
expected = set("n6;n7".split(';'))
assert set(rows[1]['nodes'].split(';')) == expected

# check cluster size histogram
with open(sizes, mode='r', newline='') as csvfile:
reader = csv.DictReader(csvfile)
rows = [row for row in reader]
assert reader.fieldnames == ['cluster_size','count']
assert len(rows) == 2, f"Expected 2 data rows but found {len(rows)}"
rows_as_tuples = {tuple(row.values()) for row in rows}
expected = {('5', '1'), ('2', '1')}
assert rows_as_tuples == expected


def test_cluster_max_ani(runtmp):
pairwise_csv = get_test_data('cluster.pairwise.csv')
output = runtmp.output('clusters.csv')
sizes = runtmp.output('sizes.csv')
threshold = '0.9'

runtmp.sourmash('scripts', 'cluster', pairwise_csv, '-o', output,
'--similarity-column', "max_ani", "--cluster-sizes",
sizes, '--threshold', threshold)

assert os.path.exists(output)

# check cluster output
with open(output, mode='r', newline='') as csvfile:
reader = csv.DictReader(csvfile)
rows = [row for row in reader]
assert reader.fieldnames == ['cluster','nodes']
assert len(rows) == 2, f"Expected 2 data rows but found {len(rows)}"
assert rows[0]['cluster'] == 'Component_1'
expected = set("n1;n2;n3;n4;n5".split(';'))
assert set(rows[0]['nodes'].split(';')) == expected
expected = set("n6;n7".split(';'))
assert set(rows[1]['nodes'].split(';')) == expected

# check cluster size histogram
with open(sizes, mode='r', newline='') as csvfile:
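For reference, the invocation exercised by these tests corresponds roughly to the command line below; `runtmp.sourmash` appears to forward its arguments to the sourmash CLI, and the flags shown are exactly those passed in `test_cluster_max_ani`. Swap `--similarity-column` to `max_containment` and adjust `--threshold` for the max-containment tests.

```
sourmash scripts cluster cluster.pairwise.csv -o clusters.csv \
    --cluster-sizes sizes.csv \
    --similarity-column max_ani \
    --threshold 0.9
```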
