diff --git a/src/python/tests/test-data/cluster.pairwise.csv b/src/python/tests/test-data/cluster.pairwise.csv
index aed0441c..5412208f 100644
--- a/src/python/tests/test-data/cluster.pairwise.csv
+++ b/src/python/tests/test-data/cluster.pairwise.csv
@@ -1,14 +1,14 @@
 query_name,query_md5,match_name,match_md5,containment,max_containment,jaccard,intersect_hashes,query_ani,match_ani,average_containment_ani,max_containment_ani
 n1,md5q1,n2,md5m1,0.8,0.9,0.55,100,0.9,0.9,0.9,0.9
 n1,md5q1,n3,md5m1,0.8,0.9,0.55,100,0.9,0.9,0.9,0.9
-n1,md5q1,n4,md5m1,0.8,0.9,0.55,100,0.9,0.9,0.45,0.9
-n1,md5q1,n5,md5m1,0.8,0.9,0.55,100,0.9,0.9,0.30,0.9
-n1,md5q1,n6,md5m1,0.8,0.9,0.55,100,0.9,0.9,0.5,0.9
-n2,md5q1,n1,md5m2,0.6,0.7,0.45,150,0.5,0.5,0.5,0.5
-n2,md5q1,n2,md5m2,0.6,0.7,0.45,150,0.5,0.5,0.5,0.5
-n2,md5q1,n3,md5m2,0.6,0.7,0.45,150,0.9,0.9,0.95,0.9
-n2,md5q1,n4,md5m2,0.6,0.7,0.45,150,0.9,0.9,0.95,0.9
-n3,md5q2,n4,md5m3,0.7,0.8,0.6,200,0.9,0.9,0.95,0.9
+n1,md5q1,n4,md5m1,0.8,0.9,0.55,100,0.9,0.9,0.45,0.45
+n1,md5q1,n5,md5m1,0.8,0.9,0.55,100,0.9,0.5,0.3,0.3
+n1,md5q1,n6,md5m1,0.8,0.8,0.55,100,0.9,0.5,0.5,0.5
+n2,md5q1,n1,md5m2,0.6,0.6,0.45,150,0.5,0.5,0.5,0.5
+n2,md5q1,n2,md5m2,0.6,0.6,0.45,150,0.5,0.5,0.5,0.5
+n2,md5q1,n3,md5m2,0.6,0.6,0.45,150,0.9,0.9,0.95,0.95
+n2,md5q1,n4,md5m2,0.6,0.6,0.45,150,0.9,0.9,0.95,0.95
+n3,md5q2,n4,md5m3,0.7,0.8,0.6,200,0.9,0.9,0.95,0.95
 n4,md5q3,n5,md5m4,0.4,0.5,0.65,250,0.9,0.9,0.9,0.9
-n5,md5q3,n6,md5m4,0.4,0.5,0.65,250,0.9,0.9,0.7,0.9
-n6,md5q4,n7,md5m5,0.85,0.95,0.5,300,0.9,0.9,0.92,0.9
+n5,md5q3,n6,md5m4,0.4,0.5,0.65,250,0.9,0.9,0.7,0.7
+n6,md5q4,n7,md5m5,0.85,0.95,0.5,300,0.9,0.9,0.92,0.92
diff --git a/src/python/tests/test_cluster.py b/src/python/tests/test_cluster.py
index ac2e81a5..e237b2ea 100644
--- a/src/python/tests/test_cluster.py
+++ b/src/python/tests/test_cluster.py
@@ -38,6 +38,73 @@ def test_cluster_containment(runtmp):
         assert rows[0]['count'] == '1'
 
 
+def test_cluster_max_containment_1(runtmp):
+    pairwise_csv = get_test_data('cluster.pairwise.csv')
+    output = runtmp.output('clusters.csv')
+    sizes = runtmp.output('sizes.csv')
+    threshold = '0.7'
+
+    runtmp.sourmash('scripts', 'cluster', pairwise_csv, '-o', output,
+                    '--similarity-column', "max_containment", "--cluster-sizes",
+                    sizes, '--threshold', threshold)
+
+    assert os.path.exists(output)
+
+    # check cluster output
+    with open(output, mode='r', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        rows = [row for row in reader]
+        assert reader.fieldnames == ['cluster','nodes']
+        assert len(rows) == 1, f"Expected 1 data row but found {len(rows)}"
+        assert rows[0]['cluster'] == 'Component_1'
+        expected = set("n2;n3;n7;n1;n6;n5;n4".split(';'))
+        assert set(rows[0]['nodes'].split(';')) == expected
+
+    # check cluster size histogram
+    with open(sizes, mode='r', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        rows = [row for row in reader]
+        assert reader.fieldnames == ['cluster_size','count']
+        assert len(rows) == 1, f"Expected 1 data row but found {len(rows)}"
+        assert rows[0]['cluster_size'] == '7'
+        assert rows[0]['count'] == '1'
+
+
+def test_cluster_max_containment_2(runtmp):
+    pairwise_csv = get_test_data('cluster.pairwise.csv')
+    output = runtmp.output('clusters.csv')
+    sizes = runtmp.output('sizes.csv')
+    threshold = '0.9'
+
+    runtmp.sourmash('scripts', 'cluster', pairwise_csv, '-o', output,
+                    '--similarity-column', "max_containment", "--cluster-sizes",
+                    sizes, '--threshold', threshold)
+
+    assert os.path.exists(output)
+
+    # check cluster output
+    with open(output, mode='r', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        rows = [row for row in reader]
+        assert reader.fieldnames == ['cluster','nodes']
+        assert len(rows) == 2, f"Expected 2 data rows but found {len(rows)}"
+        assert rows[0]['cluster'] == 'Component_1'
+        expected = set("n1;n2;n3;n4;n5".split(';'))
+        assert set(rows[0]['nodes'].split(';')) == expected
+        expected = set("n6;n7".split(';'))
+        assert set(rows[1]['nodes'].split(';')) == expected
+
+    # check cluster size histogram
+    with open(sizes, mode='r', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        rows = [row for row in reader]
+        assert reader.fieldnames == ['cluster_size','count']
+        assert len(rows) == 2, f"Expected 2 data rows but found {len(rows)}"
+        rows_as_tuples = {tuple(row.values()) for row in rows}
+        expected = {('5', '1'), ('2', '1')}
+        assert rows_as_tuples == expected
+
+
 def test_cluster_jaccard(runtmp):
     pairwise_csv = get_test_data('cluster.pairwise.csv')
     output = runtmp.output('clusters.csv')
@@ -91,6 +158,43 @@ def test_cluster_ani(runtmp):
         assert rows[0]['cluster'] == 'Component_1'
         expected = set("n1;n2;n3;n4;n5".split(';'))
         assert set(rows[0]['nodes'].split(';')) == expected
+        expected = set("n6;n7".split(';'))
+        assert set(rows[1]['nodes'].split(';')) == expected
+
+    # check cluster size histogram
+    with open(sizes, mode='r', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        rows = [row for row in reader]
+        assert reader.fieldnames == ['cluster_size','count']
+        assert len(rows) == 2, f"Expected 2 data rows but found {len(rows)}"
+        rows_as_tuples = {tuple(row.values()) for row in rows}
+        expected = {('5', '1'), ('2', '1')}
+        assert rows_as_tuples == expected
+
+
+def test_cluster_max_ani(runtmp):
+    pairwise_csv = get_test_data('cluster.pairwise.csv')
+    output = runtmp.output('clusters.csv')
+    sizes = runtmp.output('sizes.csv')
+    threshold = '0.9'
+
+    runtmp.sourmash('scripts', 'cluster', pairwise_csv, '-o', output,
+                    '--similarity-column', "max_ani", "--cluster-sizes",
+                    sizes, '--threshold', threshold)
+
+    assert os.path.exists(output)
+
+    # check cluster output
+    with open(output, mode='r', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        rows = [row for row in reader]
+        assert reader.fieldnames == ['cluster','nodes']
+        assert len(rows) == 2, f"Expected 2 data rows but found {len(rows)}"
+        assert rows[0]['cluster'] == 'Component_1'
+        expected = set("n1;n2;n3;n4;n5".split(';'))
+        assert set(rows[0]['nodes'].split(';')) == expected
+        expected = set("n6;n7".split(';'))
+        assert set(rows[1]['nodes'].split(';')) == expected
 
     # check cluster size histogram
     with open(sizes, mode='r', newline='') as csvfile: