From 9ff523e6cc5b32420d79901b244f6da3df436620 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 15 Oct 2023 08:44:12 -0700 Subject: [PATCH] MRG: make `sourmash plot` labels/indices arguments make sense (#2790) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR rationalizes `sig plot` arguments for `--labels` (show names) and `--indices` (show numbers), and adds `--no-labels` and `--no-indices`, as follows. See https://github.com/sourmash-bio/sourmash/issues/2667 for motivating bug. 1. `sourmash plot compare-demo` - labels on dendrogram, labels on matrix ✅ (FIXED) 2. `sourmash plot compare-demo --labels` - labels on both dendrogram and matrix ✅ 3. `sourmash plot compare-demo --indices` - indices on both dendrogram and matrix ✅ 4. `sourmash plot compare-demo --labels --indices` - labels on both ✅ (FIXED - labels override indices) New arguments from this PR: 5. `sourmash plot compare-demo --no-labels` - indices on both ✅ 6. `sourmash plot compare-demo --no-labels --no-indices` - no labels/indices on either ✅ 7. `sourmash plot compare-demo --no-indices` - labels on both ✅ The PR also simplifies some of the `plot` command code as well as code in `fig.py`. TODO: - [x] write some tests for new args - [x] update documentation - [x] check to see if notebook code should be updated Fixes https://github.com/sourmash-bio/sourmash/issues/2667 Closes https://github.com/sourmash-bio/sourmash/pull/2672 --- doc/command-line.md | 21 ++-- doc/plotting-compare.ipynb | 116 +++++++-------------- src/sourmash/cli/plot.py | 16 ++- src/sourmash/commands.py | 64 +++++++----- src/sourmash/fig.py | 18 ++-- tests/test_sourmash.py | 207 ++++++++++++++++++++++++++++++++----- 6 files changed, 296 insertions(+), 146 deletions(-) diff --git a/doc/command-line.md b/doc/command-line.md index 640ff9c9a5..8b3b43780d 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -259,15 +259,18 @@ sourmash plot ``` Options: -``` ---pdf -- output PDF files. ---labels -- display the signature names (by default, the filenames) on the plot ---indices -- turn off index display on the plot. ---vmax -- maximum value (default 1.0) for heatmap. ---vmin -- minimum value (default 0.0) for heatmap. ---subsample= -- plot a maximum of samples, randomly chosen. ---subsample-seed= -- seed for pseudorandom number generator. -``` +* `--pdf` -- output PDF files. (defaults to PNG) +* `--labels` -- display the signature names on the plot (default) +* `--indices` -- turn on index display on the plot. +* `--vmax` -- maximum value (default 1.0) for heatmap. +* `--vmin` -- minimum value (default 0.0) for heatmap. +* `--subsample=` -- plot a maximum of samples, randomly chosen. +* `--subsample-seed=` -- seed for pseudorandom number generator. + +Example command lines for label and index display - + +* `--indices` will show only numbers; +* `--no-labels --no-indices` will remove all labels! Example output: diff --git a/doc/plotting-compare.ipynb b/doc/plotting-compare.ipynb index 40e6e520ad..6659e170cd 100644 --- a/doc/plotting-compare.ipynb +++ b/doc/plotting-compare.ipynb @@ -51,70 +51,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "\r", - "\u001b[K\r\n", - "== This is sourmash version 4.8.2. ==\r\n", - "\r", - "\u001b[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==\r\n", - "\r\n", - "\r", - "\u001b[Kloading '../tests/test-data/demo/SRR2060939_1.sig'\r", - "\r", - "\u001b[KLoaded 1 sigs from '../tests/test-data/demo/SRR2060939_1.sig'\r", - "\r", - "\u001b[Kloading '../tests/test-data/demo/SRR2060939_2.sig'\r", - "\r", - "\u001b[K<< 1.0 or D.min() < 0.0:\n", " error('This matrix doesn\\'t look like a distance matrix - min value {}, max value {}', D.min(), D.max())\n", " if not force:\n", @@ -288,12 +245,8 @@ " # plot dendrogram\n", " Y = sch.linkage(D, method='single') # centroid\n", "\n", - " dendrolabels = labeltext\n", - " if not show_labels:\n", - " dendrolabels = [str(i) for i in range(len(labeltext))]\n", - "\n", - " Z1 = sch.dendrogram(Y, orientation='left', labels=dendrolabels,\n", - " no_labels=not show_indices)\n", + " Z1 = sch.dendrogram(Y, orientation='left', labels=labeltext,\n", + " no_labels=not show_labels, get_leaves=True)\n", " ax1.set_xticks([])\n", "\n", " xstart = 0.45\n", @@ -302,15 +255,17 @@ " xstart = 0.315\n", " scale_xstart = xstart + width + 0.01\n", "\n", - " # plot matrix\n", - " axmatrix = fig.add_axes([xstart, 0.1, width, 0.6])\n", - "\n", - " # (this reorders D by the clustering in Z1)\n", + " # re-order labels along rows, top to bottom\n", " idx1 = Z1['leaves']\n", + " reordered_labels = [ labeltext[i] for i in idx1 ]\n", + "\n", + " # reorder D by the clustering in the dendrogram\n", " D = D[idx1, :]\n", " D = D[:, idx1]\n", "\n", " # show matrix\n", + " axmatrix = fig.add_axes([xstart, 0.1, width, 0.6])\n", + "\n", " im = axmatrix.matshow(D, aspect='auto', origin='lower',\n", " cmap=pylab.cm.YlGnBu, vmin=vmin, vmax=vmax)\n", " axmatrix.set_xticks([])\n", @@ -320,7 +275,7 @@ " axcolor = fig.add_axes([scale_xstart, 0.1, 0.02, 0.6])\n", " pylab.colorbar(im, cax=axcolor)\n", "\n", - " return fig" + " return fig, reordered_labels, D" ] }, { @@ -342,6 +297,13 @@ "source": [ "_ = plot_composite_matrix(matrix, labels)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/src/sourmash/cli/plot.py b/src/sourmash/cli/plot.py index 11c0b45f04..a548683c39 100644 --- a/src/sourmash/cli/plot.py +++ b/src/sourmash/cli/plot.py @@ -10,16 +10,24 @@ def subparser(subparsers): help='output PDF; default is PNG' ) subparser.add_argument( - '--labels', action='store_true', + '--labels', action='store_true', default=None, help='show sample labels on dendrogram/matrix' ) + subparser.add_argument( + '--no-labels', action='store_false', dest='labels', + help='do not show sample labels' + ) subparser.add_argument( '--labeltext', - help='filename containing list of labels (overrides signature names)' + help='filename containing list of labels (overrides signature names); implies --labels' + ) + subparser.add_argument( + '--indices', action='store_true', default=None, + help='show sample indices but not labels; overridden by --labels' ) subparser.add_argument( - '--indices', action='store_false', - help='show sample indices but not labels' + '--no-indices', action='store_false', dest='indices', + help='do not show sample indices' ) subparser.add_argument( '--vmin', default=0.0, type=float, diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index cc2b4e5f26..9f30e64ec3 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -246,39 +246,55 @@ def plot(args): # load files D_filename = args.distances - labelfilename = D_filename + '.labels.txt' notify(f'loading comparison matrix from {D_filename}...') D = numpy.load(open(D_filename, 'rb')) # not sure how to change this to use f-strings notify('...got {} x {} matrix.', *D.shape) - if args.labeltext: - labelfilename = args.labeltext - notify(f'loading labels from {labelfilename}') - labeltext = [ x.strip() for x in open(labelfilename) ] - if len(labeltext) != D.shape[0]: - error('{} labels != matrix size, exiting', len(labeltext)) - sys.exit(-1) - - # build filenames, decide on PDF/PNG output - dendrogram_out = os.path.basename(D_filename) + '.dendro' - if args.pdf: - dendrogram_out += '.pdf' + # see sourmash#2790 for details :) + if args.labeltext or args.labels: + display_labels = True + args.labels = True # override => labels always true + elif args.labels is None and not args.indices: + # default to labels + args.labels = True + display_labels = True + elif args.indices or (not args.labels and args.indices is None): + # turn on indices only, not label names + args.indices = True + display_labels = True else: - dendrogram_out += '.png' + display_labels = False - matrix_out = os.path.basename(D_filename) + '.matrix' - if args.pdf: - matrix_out += '.pdf' + if args.labels: + if args.labeltext: + labelfilename = args.labeltext + else: + labelfilename = D_filename + '.labels.txt' + + notify(f'loading labels from {labelfilename}') + labeltext = [ x.strip() for x in open(labelfilename) ] + + if len(labeltext) != D.shape[0]: + error('{} labels != matrix size, exiting', len(labeltext)) + sys.exit(-1) + elif args.indices: + # construct integer labels + labeltext = [str(i + 1) for i in range(D.shape[0])] else: - matrix_out += '.png' + assert not display_labels + labeltext = [""] * D.shape[0] - hist_out = os.path.basename(D_filename) + '.hist' if args.pdf: - hist_out += '.pdf' + ext = '.pdf' else: - hist_out += '.png' + ext = '.png' + + # build filenames, decide on PDF/PNG output + dendrogram_out = os.path.basename(D_filename) + '.dendro' + ext + matrix_out = os.path.basename(D_filename) + '.matrix' + ext + hist_out = os.path.basename(D_filename) + '.hist' + ext # output to a different directory? if args.output_dir: @@ -314,14 +330,14 @@ def plot(args): ### do clustering Y = sch.linkage(D, method='single') - sch.dendrogram(Y, orientation='right', labels=labeltext) + sch.dendrogram(Y, orientation='right', labels=labeltext, + no_labels=not display_labels) fig.savefig(dendrogram_out) notify(f'wrote dendrogram to: {dendrogram_out}') ### make the dendrogram+matrix: (fig, rlabels, rmat) = sourmash_fig.plot_composite_matrix(D, labeltext, - show_labels=args.labels, - show_indices=args.indices, + show_labels=display_labels, vmin=args.vmin, vmax=args.vmax, force=args.force) diff --git a/src/sourmash/fig.py b/src/sourmash/fig.py index 039c4ea07e..4454ef64d9 100644 --- a/src/sourmash/fig.py +++ b/src/sourmash/fig.py @@ -1,6 +1,6 @@ #! /usr/bin/env python """ -Make plots using the distance matrix+labels output by ``sourmash compare``. +Make plots using the distance matrix+labels output by `sourmash compare`. """ from .logging import error, notify try: @@ -20,11 +20,15 @@ def load_matrix_and_labels(basefile): return (D, labeltext) -def plot_composite_matrix(D, labeltext, show_labels=True, show_indices=True, +def plot_composite_matrix(D, labeltext, show_labels=True, vmax=1.0, vmin=0.0, force=False): """Build a composite plot showing dendrogram + distance matrix/heatmap. - Returns a matplotlib figure.""" + Returns a matplotlib figure. + + If show_labels is True, display labels. Otherwise, no labels are + shown on the plot. + """ if D.max() > 1.0 or D.min() < 0.0: error('This matrix doesn\'t look like a distance matrix - min value {}, max value {}', D.min(), D.max()) if not force: @@ -43,12 +47,8 @@ def plot_composite_matrix(D, labeltext, show_labels=True, show_indices=True, # plot dendrogram Y = sch.linkage(D, method='single') # centroid - dendrolabels = labeltext - if not show_labels: - dendrolabels = [str(i) for i in range(len(labeltext))] - - Z1 = sch.dendrogram(Y, orientation='left', labels=dendrolabels, - no_labels=not show_indices, get_leaves=True) + Z1 = sch.dendrogram(Y, orientation='left', labels=labeltext, + no_labels=not show_labels, get_leaves=True) ax1.set_xticks([]) xstart = 0.45 diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 9d113a3c6e..d8c585b3f0 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -838,8 +838,10 @@ def test_do_plot_comparison(runtmp): assert os.path.exists(c.output("cmp.matrix.png")) -@utils.in_tempdir -def test_do_plot_comparison_2(c): +def test_do_plot_comparison_2_pdf(runtmp): + # test plot --pdf + c = runtmp + testdata1 = utils.get_test_data('short.fa') testdata2 = utils.get_test_data('short2.fa') c.run_sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1, testdata2) @@ -851,8 +853,10 @@ def test_do_plot_comparison_2(c): assert os.path.exists(c.output("cmp.matrix.pdf")) -@utils.in_tempdir -def test_do_plot_comparison_3(c): +def test_do_plot_comparison_3(runtmp): + # test plot --labels + c = runtmp + testdata1 = utils.get_test_data('short.fa') testdata2 = utils.get_test_data('short2.fa') c.run_sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1, testdata2) @@ -865,8 +869,10 @@ def test_do_plot_comparison_3(c): assert os.path.exists(c.output("cmp.matrix.png")) -@utils.in_tempdir -def test_do_plot_comparison_4_output_dir(c): +def test_do_plot_comparison_4_output_dir(runtmp): + # test plot --output-dir + c = runtmp + output_dir = c.output('xyz_test') testdata1 = utils.get_test_data('short.fa') @@ -881,8 +887,10 @@ def test_do_plot_comparison_4_output_dir(c): assert os.path.exists(os.path.join(output_dir, "cmp.matrix.png")) -@utils.in_tempdir -def test_do_plot_comparison_5_force(c): +def test_do_plot_comparison_5_force(runtmp): + # test -f to force display of something that's not a distance matrix + c = runtmp + D = numpy.zeros([2, 2]) D[0, 0] = 5 with open(c.output('cmp'), 'wb') as fp: @@ -896,8 +904,10 @@ def test_do_plot_comparison_5_force(c): assert c.last_result.status == 0 -@utils.in_tempdir -def test_do_plot_comparison_4_fail_not_distance(c): +def test_do_plot_comparison_4_fail_not_distance(runtmp): + # plot should fail when not a distance matrix + c = runtmp + D = numpy.zeros([2, 2]) D[0, 0] = 5 with open(c.output('cmp'), 'wb') as fp: @@ -913,32 +923,181 @@ def test_do_plot_comparison_4_fail_not_distance(c): assert c.last_result.status != 0 +def test_plot_6_labels_default(runtmp): + # plot --labels is default + testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') + testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') + testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') + testdata4 = utils.get_test_data('genome-s10+s11.sig') + + runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna') + + runtmp.sourmash('plot', 'cmp', '--labels') + + print(runtmp.last_result.out) + + expected = """\ +0\tgenome-s10 +1\tgenome-s11 +2\tgenome-s12 +3\tgenome-s10+s11""" + assert expected in runtmp.last_result.out + + +def test_plot_6_labels(runtmp): + # specifing --labels gives the right result + testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') + testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') + testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') + testdata4 = utils.get_test_data('genome-s10+s11.sig') + + runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna') + + runtmp.sourmash('plot', 'cmp', '--labels') + + print(runtmp.last_result.out) + + expected = """\ +0\tgenome-s10 +1\tgenome-s11 +2\tgenome-s12 +3\tgenome-s10+s11""" + assert expected in runtmp.last_result.out + + +def test_plot_6_indices(runtmp): + # test plot --indices + testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') + testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') + testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') + testdata4 = utils.get_test_data('genome-s10+s11.sig') + + runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna') + + runtmp.sourmash('plot', 'cmp', '--indices') + + print(runtmp.last_result.out) + + expected = """\ +0\t1 +1\t2 +2\t3 +3\t4""" + assert expected in runtmp.last_result.out + + +def test_plot_6_no_labels(runtmp): + # test plot --no-labels + testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') + testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') + testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') + testdata4 = utils.get_test_data('genome-s10+s11.sig') + + runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna') + + runtmp.sourmash('plot', 'cmp', '--no-labels') + + print(runtmp.last_result.out) + + expected = """\ +0\t1 +1\t2 +2\t3 +3\t4""" + assert expected in runtmp.last_result.out + + +def test_plot_6_no_indices(runtmp): + # test plot --no-labels + testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') + testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') + testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') + testdata4 = utils.get_test_data('genome-s10+s11.sig') + + runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna') + + runtmp.sourmash('plot', 'cmp', '--no-labels') + + print(runtmp.last_result.out) + + expected = """\ +0\t1 +1\t2 +2\t3 +3\t4""" + assert expected in runtmp.last_result.out + + +def test_plot_6_no_labels_no_indices(runtmp): + # test plot --no-labels --no-indices + testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') + testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') + testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') + testdata4 = utils.get_test_data('genome-s10+s11.sig') + + runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna') + + runtmp.sourmash('plot', 'cmp', '--no-labels', '--no-indices') + + print((runtmp.last_result.out,)) + + expected = """\ +0\t +1\t +2\t +3\t""" + assert expected in runtmp.last_result.out + + +def test_plot_6_indices_labels(runtmp): + # check that --labels --indices => --labels + testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') + testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') + testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') + testdata4 = utils.get_test_data('genome-s10+s11.sig') + + runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna') + + runtmp.sourmash('plot', 'cmp', '--labels', '--indices') + + print(runtmp.last_result.out) + + expected = """\ +0\tgenome-s10 +1\tgenome-s11 +2\tgenome-s12 +3\tgenome-s10+s11""" + assert expected in runtmp.last_result.out + + def test_plot_override_labeltext(runtmp): - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') - testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') - testdata4 = utils.get_test_data('genome-s10+s11.sig') + # test overriding labeltext + testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') + testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') + testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') + testdata4 = utils.get_test_data('genome-s10+s11.sig') - runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna') + runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna') - with open(runtmp.output('new.labels.txt'), 'wt') as fp: - fp.write('a\nb\nc\nd\n') + with open(runtmp.output('new.labels.txt'), 'wt') as fp: + fp.write('a\nb\nc\nd\n') - runtmp.sourmash('plot', 'cmp', '--labeltext', 'new.labels.txt') + runtmp.sourmash('plot', 'cmp', '--labeltext', 'new.labels.txt') - print(runtmp.last_result.out) + print(runtmp.last_result.out) - assert 'loading labels from new.labels.txt' in runtmp.last_result.err + assert 'loading labels from new.labels.txt' in runtmp.last_result.err - expected = """\ + expected = """\ 0\ta 1\tb 2\tc 3\td""" - assert expected in runtmp.last_result.out + assert expected in runtmp.last_result.out def test_plot_override_labeltext_fail(runtmp): + # test failed override of labeltext testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') @@ -960,7 +1119,7 @@ def test_plot_override_labeltext_fail(runtmp): def test_plot_reordered_labels_csv(runtmp): - # test 'plot --csv' + # test 'plot --csv' & correct ordering of labels c = runtmp ss2 = utils.get_test_data('2.fa.sig') @@ -1006,6 +1165,7 @@ def test_plot_reordered_labels_csv_gz(runtmp): def test_plot_subsample_1(runtmp): + # test plotting with --subsample testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') @@ -1025,6 +1185,7 @@ def test_plot_subsample_1(runtmp): def test_plot_subsample_2(runtmp): + # test plotting --subsample with --subsample-seed testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') testdata3 = utils.get_test_data('genome-s12.fa.gz.sig')