diff --git a/.gitignore b/.gitignore index 0eb50a92a..f76b7b4d6 100644 --- a/.gitignore +++ b/.gitignore @@ -94,6 +94,7 @@ docs/_build docs/gallery docs/tutorials docs/sample_data +docs/benchmarks # Spyder project settings .spyderproject diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b913ff051..6035a9b44 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -120,6 +120,8 @@ We recommended that your contribution complies with the following rules before y pip install pytest pytest-cov pytest ``` +- If you are proposing adding a new module, make sure you are inheriting from the appropriate base class. See + [the documentation](https://hyppo.neurodata.io/api/index.html#base-classes) for the appropriate class. # Guidelines diff --git a/.codecov.yml b/codecov.yml similarity index 100% rename from .codecov.yml rename to codecov.yml diff --git a/docs/api/index.rst b/docs/api/index.rst index baec6cbda..b1318a834 100644 --- a/docs/api/index.rst +++ b/docs/api/index.rst @@ -148,3 +148,16 @@ Miscellaneous tools.perm_test tools.chi2_approx tools.power + + + +Base Classes +------------- + +.. autosummary:: + :toctree: generated/ + + independence.base.IndependenceTest + ksample.base.KSampleTest + time_series.base.TimeSeriesTest + discrim.base.DiscriminabilityTest diff --git a/docs/benchmarks/images/sphx_glr_indep_power_dimension_001.png b/docs/benchmarks/images/sphx_glr_indep_power_dimension_001.png deleted file mode 100644 index 8b4a8dcc0..000000000 Binary files a/docs/benchmarks/images/sphx_glr_indep_power_dimension_001.png and /dev/null differ diff --git a/docs/benchmarks/images/sphx_glr_indep_power_sampsize_001.png b/docs/benchmarks/images/sphx_glr_indep_power_sampsize_001.png deleted file mode 100644 index c7876009d..000000000 Binary files a/docs/benchmarks/images/sphx_glr_indep_power_sampsize_001.png and /dev/null differ diff --git a/docs/benchmarks/images/sphx_glr_perf_1d_001.png b/docs/benchmarks/images/sphx_glr_perf_1d_001.png deleted file mode 100644 index c1496440d..000000000 Binary files a/docs/benchmarks/images/sphx_glr_perf_1d_001.png and /dev/null differ diff --git a/docs/benchmarks/images/sphx_glr_same_stat_001.png b/docs/benchmarks/images/sphx_glr_same_stat_001.png deleted file mode 100644 index 11db1f48a..000000000 Binary files a/docs/benchmarks/images/sphx_glr_same_stat_001.png and /dev/null differ diff --git a/docs/benchmarks/images/thumb/sphx_glr_indep_power_dimension_thumb.png b/docs/benchmarks/images/thumb/sphx_glr_indep_power_dimension_thumb.png deleted file mode 100644 index 38169303e..000000000 Binary files a/docs/benchmarks/images/thumb/sphx_glr_indep_power_dimension_thumb.png and /dev/null differ diff --git a/docs/benchmarks/images/thumb/sphx_glr_indep_power_sampsize_thumb.png b/docs/benchmarks/images/thumb/sphx_glr_indep_power_sampsize_thumb.png deleted file mode 100644 index 41c9b3ad0..000000000 Binary files a/docs/benchmarks/images/thumb/sphx_glr_indep_power_sampsize_thumb.png and /dev/null differ diff --git a/docs/benchmarks/images/thumb/sphx_glr_perf_1d_thumb.png b/docs/benchmarks/images/thumb/sphx_glr_perf_1d_thumb.png deleted file mode 100644 index f82901bc6..000000000 Binary files a/docs/benchmarks/images/thumb/sphx_glr_perf_1d_thumb.png and /dev/null differ diff --git a/docs/benchmarks/images/thumb/sphx_glr_same_stat_thumb.png b/docs/benchmarks/images/thumb/sphx_glr_same_stat_thumb.png deleted file mode 100644 index 1218de3a2..000000000 Binary files a/docs/benchmarks/images/thumb/sphx_glr_same_stat_thumb.png and /dev/null differ diff --git a/docs/benchmarks/indep_power_dimension.ipynb b/docs/benchmarks/indep_power_dimension.ipynb deleted file mode 100644 index 15f71ec7c..000000000 --- a/docs/benchmarks/indep_power_dimension.ipynb +++ /dev/null @@ -1,54 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n# Independence Testing Power vs. Dimension\n\nHere, we show finite testing power comparisons between the various tests within hyppo.\nBecause of the struggles of high dimensional data, we expect that in most cases, that\nfinite testing power decreases as dimension increases.\nTests that converge to 0 slower have higher finite testing power and\nare likely better to use for your use case. The simulation in the bottom right is\nused so that we know that we are properly controlling for type I error, which is\nimportant becase otherwise the test would be invalid (power = alpha-level = 0.05).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import os\nimport sys\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport seaborn as sns\nfrom hyppo.independence import INDEP_TESTS\nfrom hyppo.tools import SIMULATIONS, power\nfrom joblib import Parallel, delayed\n\nsys.path.append(os.path.realpath(\"..\"))\n\n# make plots look pretty\nsns.set(color_codes=True, style=\"white\", context=\"talk\", font_scale=2)\nPALETTE = sns.color_palette(\"Set1\")\nsns.set_palette(PALETTE[1:])\n\n# constants\nPOWER_REPS = 5\n\n# simulation titles\nSIM_TITLES = [\n \"Linear\",\n \"Exponential\",\n \"Cubic\",\n \"Joint Normal\",\n \"Step\",\n \"Quadratic\",\n \"W-Shaped\",\n \"Spiral\",\n \"Bernoulli\",\n \"Logarithmic\",\n \"Fourth Root\",\n \"Sine 4\\u03C0\",\n \"Sine 16\\u03C0\",\n \"Square\",\n \"Two Parabolas\",\n \"Circle\",\n \"Ellipse\",\n \"Diamond\",\n \"Noise\",\n \"Independence\",\n]\n\n# haven't run for MaxMargin yet\nremove = [\"maxmargin\"]\nINDEP_TESTS = dict([(k, v) for k, v in INDEP_TESTS.items() if k not in remove])\n\n\ndef find_dim(sim):\n \"\"\"Find dimension maximum for the simulation.\"\"\"\n if sim not in SIMULATIONS.keys():\n raise ValueError(\"Invalid simulation\")\n\n if sim in [\"joint_normal\", \"sin_four_pi\", \"sin_sixteen_pi\", \"multiplicative_noise\"]:\n dim = 10\n elif sim in [\"multimodal_independence\", \"uncorrelated_bernoulli\", \"logarithmic\"]:\n dim = 100\n elif sim in [\"linear\", \"exponential\", \"cubic\"]:\n dim = 1000\n elif sim in [\"square\", \"diamond\"]:\n dim = 40\n else:\n dim = 20\n\n return dim\n\n\ndef find_dim_range(dim):\n \"\"\"Create list of dimension range to calculate power for.\"\"\"\n if dim < 20:\n lim = 10\n else:\n lim = 20\n\n dim_range = list(range(int(dim / lim), dim + 1, int(dim / lim)))\n if int(dim / lim) != 1:\n dim_range.insert(0, 1)\n return dim_range\n\n\ndef estimate_power(sim, test, auto=False):\n \"\"\"Compute the mean of the estimated power of 5 replications over sample sizes.\"\"\"\n if test == \"MaxMargin\":\n test = [\"MaxMargin\", \"Dcorr\"]\n dim_range = find_dim_range(find_dim(sim))\n est_power = np.array(\n [\n np.mean(\n [\n power(\n test,\n pow_type=\"indep\",\n sim=sim,\n n=100,\n p=dim,\n auto=auto,\n noise=True,\n )\n for _ in range(POWER_REPS)\n ]\n )\n for dim in dim_range\n ]\n )\n np.savetxt(\n \"../benchmarks/vs_samplesize/{}_{}.csv\".format(sim, test),\n est_power,\n delimiter=\",\",\n )\n\n return est_power\n\n\n# At this point, we would run this bit of code to generate the data for the figure and\n# store it under the \"vs_sampsize\" directory. Since this code takes a very long time,\n# we have commented out these lines of codes. If you would like to generate the data,\n# uncomment these lines and run the file.\n#\n# outputs = Parallel(n_jobs=-1, verbose=100)(\n# [\n# delayed(estimate_featimport)(sim_name, test)\n# for sim_name in SIMULATIONS.keys()\n# for test in INDEP_TESTS.keys()\n# ]\n# )\n\n\ndef plot_power():\n fig, ax = plt.subplots(nrows=4, ncols=5, figsize=(25, 20))\n plt.suptitle(\n \"Multivariate Independence Testing (Increasing Dimension)\",\n y=0.93,\n va=\"baseline\",\n )\n\n for i, row in enumerate(ax):\n for j, col in enumerate(row):\n count = 5 * i + j\n sim = list(SIMULATIONS.keys())[count]\n\n for test in INDEP_TESTS.keys():\n est_power = np.genfromtxt(\n \"../benchmarks/vs_dimension/{}_{}.csv\".format(sim, test),\n delimiter=\",\",\n )\n dim_range = find_dim_range(find_dim(sim))\n\n col.plot(dim_range, est_power, label=INDEP_TESTS[test].__name__, lw=2)\n col.set_xticks([])\n if i == 3:\n col.set_xticks([dim_range[0], dim_range[-1]])\n col.set_ylim(-0.05, 1.05)\n col.set_yticks([])\n if j == 0:\n col.set_yticks([0, 1])\n col.set_title(SIM_TITLES[count])\n\n fig.text(0.5, 0.05, \"Dimension\", ha=\"center\")\n fig.text(\n 0.07,\n 0.5,\n \"Statistical Power\",\n va=\"center\",\n rotation=\"vertical\",\n )\n leg = plt.legend(\n bbox_to_anchor=(0.5, 0.05),\n bbox_transform=plt.gcf().transFigure,\n ncol=len(INDEP_TESTS.keys()),\n loc=\"upper center\",\n )\n leg.get_frame().set_linewidth(0.0)\n for legobj in leg.legendHandles:\n legobj.set_linewidth(5.0)\n plt.subplots_adjust(hspace=0.50)\n\n\n# plot the power\nplot_power()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/docs/benchmarks/indep_power_dimension.py b/docs/benchmarks/indep_power_dimension.py deleted file mode 100644 index 8a4bf3986..000000000 --- a/docs/benchmarks/indep_power_dimension.py +++ /dev/null @@ -1,193 +0,0 @@ -""" -Independence Testing Power vs. Dimension -=============================================== - -Here, we show finite testing power comparisons between the various tests within hyppo. -Because of the struggles of high dimensional data, we expect that in most cases, that -finite testing power decreases as dimension increases. -Tests that converge to 0 slower have higher finite testing power and -are likely better to use for your use case. The simulation in the bottom right is -used so that we know that we are properly controlling for type I error, which is -important becase otherwise the test would be invalid (power = alpha-level = 0.05). -""" - -import os -import sys - -import matplotlib.pyplot as plt -import numpy as np -import seaborn as sns -from hyppo.independence import INDEP_TESTS -from hyppo.tools import SIMULATIONS, power -from joblib import Parallel, delayed - -sys.path.append(os.path.realpath("..")) - -# make plots look pretty -sns.set(color_codes=True, style="white", context="talk", font_scale=2) -PALETTE = sns.color_palette("Set1") -sns.set_palette(PALETTE[1:]) - -# constants -POWER_REPS = 5 - -# simulation titles -SIM_TITLES = [ - "Linear", - "Exponential", - "Cubic", - "Joint Normal", - "Step", - "Quadratic", - "W-Shaped", - "Spiral", - "Bernoulli", - "Logarithmic", - "Fourth Root", - "Sine 4\u03C0", - "Sine 16\u03C0", - "Square", - "Two Parabolas", - "Circle", - "Ellipse", - "Diamond", - "Noise", - "Independence", -] - -# haven't run for MaxMargin yet -remove = ["maxmargin"] -INDEP_TESTS = dict([(k, v) for k, v in INDEP_TESTS.items() if k not in remove]) - - -def find_dim(sim): - """Find dimension maximum for the simulation.""" - if sim not in SIMULATIONS.keys(): - raise ValueError("Invalid simulation") - - if sim in ["joint_normal", "sin_four_pi", "sin_sixteen_pi", "multiplicative_noise"]: - dim = 10 - elif sim in ["multimodal_independence", "uncorrelated_bernoulli", "logarithmic"]: - dim = 100 - elif sim in ["linear", "exponential", "cubic"]: - dim = 1000 - elif sim in ["square", "diamond"]: - dim = 40 - else: - dim = 20 - - return dim - - -def find_dim_range(dim): - """Create list of dimension range to calculate power for.""" - if dim < 20: - lim = 10 - else: - lim = 20 - - dim_range = list(range(int(dim / lim), dim + 1, int(dim / lim))) - if int(dim / lim) != 1: - dim_range.insert(0, 1) - return dim_range - - -def estimate_power(sim, test, auto=False): - """Compute the mean of the estimated power of 5 replications over sample sizes.""" - if test == "MaxMargin": - test = ["MaxMargin", "Dcorr"] - dim_range = find_dim_range(find_dim(sim)) - est_power = np.array( - [ - np.mean( - [ - power( - test, - pow_type="indep", - sim=sim, - n=100, - p=dim, - auto=auto, - noise=True, - ) - for _ in range(POWER_REPS) - ] - ) - for dim in dim_range - ] - ) - np.savetxt( - "../benchmarks/vs_samplesize/{}_{}.csv".format(sim, test), - est_power, - delimiter=",", - ) - - return est_power - - -# At this point, we would run this bit of code to generate the data for the figure and -# store it under the "vs_sampsize" directory. Since this code takes a very long time, -# we have commented out these lines of codes. If you would like to generate the data, -# uncomment these lines and run the file. -# -# outputs = Parallel(n_jobs=-1, verbose=100)( -# [ -# delayed(estimate_featimport)(sim_name, test) -# for sim_name in SIMULATIONS.keys() -# for test in INDEP_TESTS.keys() -# ] -# ) - - -def plot_power(): - fig, ax = plt.subplots(nrows=4, ncols=5, figsize=(25, 20)) - plt.suptitle( - "Multivariate Independence Testing (Increasing Dimension)", - y=0.93, - va="baseline", - ) - - for i, row in enumerate(ax): - for j, col in enumerate(row): - count = 5 * i + j - sim = list(SIMULATIONS.keys())[count] - - for test in INDEP_TESTS.keys(): - est_power = np.genfromtxt( - "../benchmarks/vs_dimension/{}_{}.csv".format(sim, test), - delimiter=",", - ) - dim_range = find_dim_range(find_dim(sim)) - - col.plot(dim_range, est_power, label=INDEP_TESTS[test].__name__, lw=2) - col.set_xticks([]) - if i == 3: - col.set_xticks([dim_range[0], dim_range[-1]]) - col.set_ylim(-0.05, 1.05) - col.set_yticks([]) - if j == 0: - col.set_yticks([0, 1]) - col.set_title(SIM_TITLES[count]) - - fig.text(0.5, 0.05, "Dimension", ha="center") - fig.text( - 0.07, - 0.5, - "Statistical Power", - va="center", - rotation="vertical", - ) - leg = plt.legend( - bbox_to_anchor=(0.5, 0.05), - bbox_transform=plt.gcf().transFigure, - ncol=len(INDEP_TESTS.keys()), - loc="upper center", - ) - leg.get_frame().set_linewidth(0.0) - for legobj in leg.legendHandles: - legobj.set_linewidth(5.0) - plt.subplots_adjust(hspace=0.50) - - -# plot the power -plot_power() diff --git a/docs/benchmarks/indep_power_dimension.py.md5 b/docs/benchmarks/indep_power_dimension.py.md5 deleted file mode 100644 index 15406ebe2..000000000 --- a/docs/benchmarks/indep_power_dimension.py.md5 +++ /dev/null @@ -1 +0,0 @@ -7857de3cdffcc6ce65c58e064f2796aa \ No newline at end of file diff --git a/docs/benchmarks/indep_power_dimension.rst b/docs/benchmarks/indep_power_dimension.rst deleted file mode 100644 index 274e19d8f..000000000 --- a/docs/benchmarks/indep_power_dimension.rst +++ /dev/null @@ -1,259 +0,0 @@ - -.. DO NOT EDIT. -.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. -.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: -.. "benchmarks/indep_power_dimension.py" -.. LINE NUMBERS ARE GIVEN BELOW. - -.. only:: html - - .. note:: - :class: sphx-glr-download-link-note - - Click :ref:`here ` - to download the full example code - -.. rst-class:: sphx-glr-example-title - -.. _sphx_glr_benchmarks_indep_power_dimension.py: - - -Independence Testing Power vs. Dimension -=============================================== - -Here, we show finite testing power comparisons between the various tests within hyppo. -Because of the struggles of high dimensional data, we expect that in most cases, that -finite testing power decreases as dimension increases. -Tests that converge to 0 slower have higher finite testing power and -are likely better to use for your use case. The simulation in the bottom right is -used so that we know that we are properly controlling for type I error, which is -important becase otherwise the test would be invalid (power = alpha-level = 0.05). - -.. GENERATED FROM PYTHON SOURCE LINES 13-194 - - - -.. image:: /benchmarks/images/sphx_glr_indep_power_dimension_001.png - :alt: Multivariate Independence Testing (Increasing Dimension), Linear, Exponential, Cubic, Joint Normal, Step, Quadratic, W-Shaped, Spiral, Bernoulli, Logarithmic, Fourth Root, Sine 4π, Sine 16π, Square, Two Parabolas, Circle, Ellipse, Diamond, Noise, Independence - :class: sphx-glr-single-img - - - - - -.. code-block:: default - - - import os - import sys - - import matplotlib.pyplot as plt - import numpy as np - import seaborn as sns - from hyppo.independence import INDEP_TESTS - from hyppo.tools import SIMULATIONS, power - from joblib import Parallel, delayed - - sys.path.append(os.path.realpath("..")) - - # make plots look pretty - sns.set(color_codes=True, style="white", context="talk", font_scale=2) - PALETTE = sns.color_palette("Set1") - sns.set_palette(PALETTE[1:]) - - # constants - POWER_REPS = 5 - - # simulation titles - SIM_TITLES = [ - "Linear", - "Exponential", - "Cubic", - "Joint Normal", - "Step", - "Quadratic", - "W-Shaped", - "Spiral", - "Bernoulli", - "Logarithmic", - "Fourth Root", - "Sine 4\u03C0", - "Sine 16\u03C0", - "Square", - "Two Parabolas", - "Circle", - "Ellipse", - "Diamond", - "Noise", - "Independence", - ] - - # haven't run for MaxMargin yet - remove = ["maxmargin"] - INDEP_TESTS = dict([(k, v) for k, v in INDEP_TESTS.items() if k not in remove]) - - - def find_dim(sim): - """Find dimension maximum for the simulation.""" - if sim not in SIMULATIONS.keys(): - raise ValueError("Invalid simulation") - - if sim in ["joint_normal", "sin_four_pi", "sin_sixteen_pi", "multiplicative_noise"]: - dim = 10 - elif sim in ["multimodal_independence", "uncorrelated_bernoulli", "logarithmic"]: - dim = 100 - elif sim in ["linear", "exponential", "cubic"]: - dim = 1000 - elif sim in ["square", "diamond"]: - dim = 40 - else: - dim = 20 - - return dim - - - def find_dim_range(dim): - """Create list of dimension range to calculate power for.""" - if dim < 20: - lim = 10 - else: - lim = 20 - - dim_range = list(range(int(dim / lim), dim + 1, int(dim / lim))) - if int(dim / lim) != 1: - dim_range.insert(0, 1) - return dim_range - - - def estimate_power(sim, test, auto=False): - """Compute the mean of the estimated power of 5 replications over sample sizes.""" - if test == "MaxMargin": - test = ["MaxMargin", "Dcorr"] - dim_range = find_dim_range(find_dim(sim)) - est_power = np.array( - [ - np.mean( - [ - power( - test, - pow_type="indep", - sim=sim, - n=100, - p=dim, - auto=auto, - noise=True, - ) - for _ in range(POWER_REPS) - ] - ) - for dim in dim_range - ] - ) - np.savetxt( - "../benchmarks/vs_samplesize/{}_{}.csv".format(sim, test), - est_power, - delimiter=",", - ) - - return est_power - - - # At this point, we would run this bit of code to generate the data for the figure and - # store it under the "vs_sampsize" directory. Since this code takes a very long time, - # we have commented out these lines of codes. If you would like to generate the data, - # uncomment these lines and run the file. - # - # outputs = Parallel(n_jobs=-1, verbose=100)( - # [ - # delayed(estimate_featimport)(sim_name, test) - # for sim_name in SIMULATIONS.keys() - # for test in INDEP_TESTS.keys() - # ] - # ) - - - def plot_power(): - fig, ax = plt.subplots(nrows=4, ncols=5, figsize=(25, 20)) - plt.suptitle( - "Multivariate Independence Testing (Increasing Dimension)", - y=0.93, - va="baseline", - ) - - for i, row in enumerate(ax): - for j, col in enumerate(row): - count = 5 * i + j - sim = list(SIMULATIONS.keys())[count] - - for test in INDEP_TESTS.keys(): - est_power = np.genfromtxt( - "../benchmarks/vs_dimension/{}_{}.csv".format(sim, test), - delimiter=",", - ) - dim_range = find_dim_range(find_dim(sim)) - - col.plot(dim_range, est_power, label=INDEP_TESTS[test].__name__, lw=2) - col.set_xticks([]) - if i == 3: - col.set_xticks([dim_range[0], dim_range[-1]]) - col.set_ylim(-0.05, 1.05) - col.set_yticks([]) - if j == 0: - col.set_yticks([0, 1]) - col.set_title(SIM_TITLES[count]) - - fig.text(0.5, 0.05, "Dimension", ha="center") - fig.text( - 0.07, - 0.5, - "Statistical Power", - va="center", - rotation="vertical", - ) - leg = plt.legend( - bbox_to_anchor=(0.5, 0.05), - bbox_transform=plt.gcf().transFigure, - ncol=len(INDEP_TESTS.keys()), - loc="upper center", - ) - leg.get_frame().set_linewidth(0.0) - for legobj in leg.legendHandles: - legobj.set_linewidth(5.0) - plt.subplots_adjust(hspace=0.50) - - - # plot the power - plot_power() - - -.. rst-class:: sphx-glr-timing - - **Total running time of the script:** ( 0 minutes 3.835 seconds) - - -.. _sphx_glr_download_benchmarks_indep_power_dimension.py: - - -.. only :: html - - .. container:: sphx-glr-footer - :class: sphx-glr-footer-example - - - - .. container:: sphx-glr-download sphx-glr-download-python - - :download:`Download Python source code: indep_power_dimension.py ` - - - - .. container:: sphx-glr-download sphx-glr-download-jupyter - - :download:`Download Jupyter notebook: indep_power_dimension.ipynb ` - - -.. only:: html - - .. rst-class:: sphx-glr-signature - - `Gallery generated by Sphinx-Gallery `_ diff --git a/docs/benchmarks/indep_power_dimension_codeobj.pickle b/docs/benchmarks/indep_power_dimension_codeobj.pickle deleted file mode 100644 index fe4ad1cb0..000000000 Binary files a/docs/benchmarks/indep_power_dimension_codeobj.pickle and /dev/null differ diff --git a/docs/benchmarks/indep_power_sampsize.ipynb b/docs/benchmarks/indep_power_sampsize.ipynb deleted file mode 100644 index fce25b947..000000000 --- a/docs/benchmarks/indep_power_sampsize.ipynb +++ /dev/null @@ -1,54 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n# 1D Independence Testing Power vs. Sample Size\n\nHere, we show finite testing power comparisons between the various tests within hyppo.\nFor a test to be consistent, we would expect power to converge to 1 as sample size\nincreases. Tests that converge to 1 quicker have higher finite testing power and\nare likely better to use for your use case. The simulation in the bottom right is\nused so that we know that we are properly controlling for type I error, which is\nimportant becase otherwise the test would be invalid (power = alpha-level = 0.05).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import os\nimport sys\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport seaborn as sns\nfrom hyppo.independence import INDEP_TESTS\nfrom hyppo.tools import SIMULATIONS, power\nfrom joblib import Parallel, delayed\n\nsys.path.append(os.path.realpath(\"..\"))\n\n# make plots look pretty\nsns.set(color_codes=True, style=\"white\", context=\"talk\", font_scale=2)\nPALETTE = sns.color_palette(\"Set1\")\nsns.set_palette(PALETTE[1:])\n\n# constants\nMAX_SAMPLE_SIZE = 100\nSTEP_SIZE = 5\nSAMP_SIZES = range(5, MAX_SAMPLE_SIZE + STEP_SIZE, STEP_SIZE)\nPOWER_REPS = 5\n\n# simulation titles\nSIM_TITLES = [\n \"Linear\",\n \"Exponential\",\n \"Cubic\",\n \"Joint Normal\",\n \"Step\",\n \"Quadratic\",\n \"W-Shaped\",\n \"Spiral\",\n \"Bernoulli\",\n \"Logarithmic\",\n \"Fourth Root\",\n \"Sine 4\\u03C0\",\n \"Sine 16\\u03C0\",\n \"Square\",\n \"Two Parabolas\",\n \"Circle\",\n \"Ellipse\",\n \"Diamond\",\n \"Noise\",\n \"Independence\",\n]\n\n# these tests only make sense for > 1 dimension data\nremove = [\"maxmargin\", \"kmerf\"]\nINDEP_TESTS = dict([(k, v) for k, v in INDEP_TESTS.items() if k not in remove])\n\n\ndef estimate_power(sim, test, auto=False):\n \"\"\"Compute the mean of the estimated power of 5 replications over sample sizes.\"\"\"\n if test == \"MaxMargin\":\n test = [\"MaxMargin\", \"Dcorr\"]\n est_power = np.array(\n [\n np.mean(\n [\n power(\n test, pow_type=\"indep\", sim=sim, n=i, p=1, auto=auto, noise=True\n )\n for _ in range(POWER_REPS)\n ]\n )\n for i in SAMP_SIZES\n ]\n )\n np.savetxt(\n \"../benchmarks/vs_samplesize/{}_{}.csv\".format(sim, test),\n est_power,\n delimiter=\",\",\n )\n\n return est_power\n\n\n# At this point, we would run this bit of code to generate the data for the figure and\n# store it under the \"vs_sampsize\" directory. Since this code takes a very long time,\n# we have commented out these lines of codes. If you would like to generate the data,\n# uncomment these lines and run the file.\n#\n# outputs = Parallel(n_jobs=-1, verbose=100)(\n# [\n# delayed(estimate_featimport)(sim_name, test)\n# for sim_name in SIMULATIONS.keys()\n# for test in INDEP_TESTS.keys()\n# ]\n# )\n\n\ndef plot_power():\n fig, ax = plt.subplots(nrows=4, ncols=5, figsize=(25, 20))\n plt.suptitle(\n \"Multivariate Independence Testing (Increasing Sample Size)\",\n y=0.93,\n va=\"baseline\",\n )\n\n for i, row in enumerate(ax):\n for j, col in enumerate(row):\n count = 5 * i + j\n sim = list(SIMULATIONS.keys())[count]\n\n for test in INDEP_TESTS.keys():\n est_power = np.genfromtxt(\n \"../benchmarks/vs_samplesize/{}_{}.csv\".format(sim, test),\n delimiter=\",\",\n )\n\n col.plot(SAMP_SIZES, est_power, label=INDEP_TESTS[test].__name__, lw=2)\n col.set_xticks([])\n if i == 3:\n col.set_xticks([SAMP_SIZES[0], SAMP_SIZES[-1]])\n col.set_ylim(-0.05, 1.05)\n col.set_yticks([])\n if j == 0:\n col.set_yticks([0, 1])\n col.set_title(SIM_TITLES[count])\n\n fig.text(0.5, 0.05, \"Sample Size\", ha=\"center\")\n fig.text(\n 0.07,\n 0.5,\n \"Statistical Power\",\n va=\"center\",\n rotation=\"vertical\",\n )\n leg = plt.legend(\n bbox_to_anchor=(0.5, 0.05),\n bbox_transform=plt.gcf().transFigure,\n ncol=len(INDEP_TESTS.keys()),\n loc=\"upper center\",\n )\n leg.get_frame().set_linewidth(0.0)\n for legobj in leg.legendHandles:\n legobj.set_linewidth(5.0)\n plt.subplots_adjust(hspace=0.50)\n\n\n# plot the power\nplot_power()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/docs/benchmarks/indep_power_sampsize.py b/docs/benchmarks/indep_power_sampsize.py deleted file mode 100644 index 0984806ac..000000000 --- a/docs/benchmarks/indep_power_sampsize.py +++ /dev/null @@ -1,155 +0,0 @@ -""" -1D Independence Testing Power vs. Sample Size -=============================================== - -Here, we show finite testing power comparisons between the various tests within hyppo. -For a test to be consistent, we would expect power to converge to 1 as sample size -increases. Tests that converge to 1 quicker have higher finite testing power and -are likely better to use for your use case. The simulation in the bottom right is -used so that we know that we are properly controlling for type I error, which is -important becase otherwise the test would be invalid (power = alpha-level = 0.05). -""" - -import os -import sys - -import matplotlib.pyplot as plt -import numpy as np -import seaborn as sns -from hyppo.independence import INDEP_TESTS -from hyppo.tools import SIMULATIONS, power -from joblib import Parallel, delayed - -sys.path.append(os.path.realpath("..")) - -# make plots look pretty -sns.set(color_codes=True, style="white", context="talk", font_scale=2) -PALETTE = sns.color_palette("Set1") -sns.set_palette(PALETTE[1:]) - -# constants -MAX_SAMPLE_SIZE = 100 -STEP_SIZE = 5 -SAMP_SIZES = range(5, MAX_SAMPLE_SIZE + STEP_SIZE, STEP_SIZE) -POWER_REPS = 5 - -# simulation titles -SIM_TITLES = [ - "Linear", - "Exponential", - "Cubic", - "Joint Normal", - "Step", - "Quadratic", - "W-Shaped", - "Spiral", - "Bernoulli", - "Logarithmic", - "Fourth Root", - "Sine 4\u03C0", - "Sine 16\u03C0", - "Square", - "Two Parabolas", - "Circle", - "Ellipse", - "Diamond", - "Noise", - "Independence", -] - -# these tests only make sense for > 1 dimension data -remove = ["maxmargin", "kmerf"] -INDEP_TESTS = dict([(k, v) for k, v in INDEP_TESTS.items() if k not in remove]) - - -def estimate_power(sim, test, auto=False): - """Compute the mean of the estimated power of 5 replications over sample sizes.""" - if test == "MaxMargin": - test = ["MaxMargin", "Dcorr"] - est_power = np.array( - [ - np.mean( - [ - power( - test, pow_type="indep", sim=sim, n=i, p=1, auto=auto, noise=True - ) - for _ in range(POWER_REPS) - ] - ) - for i in SAMP_SIZES - ] - ) - np.savetxt( - "../benchmarks/vs_samplesize/{}_{}.csv".format(sim, test), - est_power, - delimiter=",", - ) - - return est_power - - -# At this point, we would run this bit of code to generate the data for the figure and -# store it under the "vs_sampsize" directory. Since this code takes a very long time, -# we have commented out these lines of codes. If you would like to generate the data, -# uncomment these lines and run the file. -# -# outputs = Parallel(n_jobs=-1, verbose=100)( -# [ -# delayed(estimate_featimport)(sim_name, test) -# for sim_name in SIMULATIONS.keys() -# for test in INDEP_TESTS.keys() -# ] -# ) - - -def plot_power(): - fig, ax = plt.subplots(nrows=4, ncols=5, figsize=(25, 20)) - plt.suptitle( - "Multivariate Independence Testing (Increasing Sample Size)", - y=0.93, - va="baseline", - ) - - for i, row in enumerate(ax): - for j, col in enumerate(row): - count = 5 * i + j - sim = list(SIMULATIONS.keys())[count] - - for test in INDEP_TESTS.keys(): - est_power = np.genfromtxt( - "../benchmarks/vs_samplesize/{}_{}.csv".format(sim, test), - delimiter=",", - ) - - col.plot(SAMP_SIZES, est_power, label=INDEP_TESTS[test].__name__, lw=2) - col.set_xticks([]) - if i == 3: - col.set_xticks([SAMP_SIZES[0], SAMP_SIZES[-1]]) - col.set_ylim(-0.05, 1.05) - col.set_yticks([]) - if j == 0: - col.set_yticks([0, 1]) - col.set_title(SIM_TITLES[count]) - - fig.text(0.5, 0.05, "Sample Size", ha="center") - fig.text( - 0.07, - 0.5, - "Statistical Power", - va="center", - rotation="vertical", - ) - leg = plt.legend( - bbox_to_anchor=(0.5, 0.05), - bbox_transform=plt.gcf().transFigure, - ncol=len(INDEP_TESTS.keys()), - loc="upper center", - ) - leg.get_frame().set_linewidth(0.0) - for legobj in leg.legendHandles: - legobj.set_linewidth(5.0) - plt.subplots_adjust(hspace=0.50) - - -# plot the power -plot_power() diff --git a/docs/benchmarks/indep_power_sampsize.py.md5 b/docs/benchmarks/indep_power_sampsize.py.md5 deleted file mode 100644 index ee40d815b..000000000 --- a/docs/benchmarks/indep_power_sampsize.py.md5 +++ /dev/null @@ -1 +0,0 @@ -f64709c8fdc9e94435894fc33074bb7b \ No newline at end of file diff --git a/docs/benchmarks/indep_power_sampsize.rst b/docs/benchmarks/indep_power_sampsize.rst deleted file mode 100644 index 5fd4916ac..000000000 --- a/docs/benchmarks/indep_power_sampsize.rst +++ /dev/null @@ -1,221 +0,0 @@ - -.. DO NOT EDIT. -.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. -.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: -.. "benchmarks/indep_power_sampsize.py" -.. LINE NUMBERS ARE GIVEN BELOW. - -.. only:: html - - .. note:: - :class: sphx-glr-download-link-note - - Click :ref:`here ` - to download the full example code - -.. rst-class:: sphx-glr-example-title - -.. _sphx_glr_benchmarks_indep_power_sampsize.py: - - -1D Independence Testing Power vs. Sample Size -=============================================== - -Here, we show finite testing power comparisons between the various tests within hyppo. -For a test to be consistent, we would expect power to converge to 1 as sample size -increases. Tests that converge to 1 quicker have higher finite testing power and -are likely better to use for your use case. The simulation in the bottom right is -used so that we know that we are properly controlling for type I error, which is -important becase otherwise the test would be invalid (power = alpha-level = 0.05). - -.. GENERATED FROM PYTHON SOURCE LINES 12-156 - - - -.. image:: /benchmarks/images/sphx_glr_indep_power_sampsize_001.png - :alt: Multivariate Independence Testing (Increasing Sample Size), Linear, Exponential, Cubic, Joint Normal, Step, Quadratic, W-Shaped, Spiral, Bernoulli, Logarithmic, Fourth Root, Sine 4π, Sine 16π, Square, Two Parabolas, Circle, Ellipse, Diamond, Noise, Independence - :class: sphx-glr-single-img - - - - - -.. code-block:: default - - - import os - import sys - - import matplotlib.pyplot as plt - import numpy as np - import seaborn as sns - from hyppo.independence import INDEP_TESTS - from hyppo.tools import SIMULATIONS, power - from joblib import Parallel, delayed - - sys.path.append(os.path.realpath("..")) - - # make plots look pretty - sns.set(color_codes=True, style="white", context="talk", font_scale=2) - PALETTE = sns.color_palette("Set1") - sns.set_palette(PALETTE[1:]) - - # constants - MAX_SAMPLE_SIZE = 100 - STEP_SIZE = 5 - SAMP_SIZES = range(5, MAX_SAMPLE_SIZE + STEP_SIZE, STEP_SIZE) - POWER_REPS = 5 - - # simulation titles - SIM_TITLES = [ - "Linear", - "Exponential", - "Cubic", - "Joint Normal", - "Step", - "Quadratic", - "W-Shaped", - "Spiral", - "Bernoulli", - "Logarithmic", - "Fourth Root", - "Sine 4\u03C0", - "Sine 16\u03C0", - "Square", - "Two Parabolas", - "Circle", - "Ellipse", - "Diamond", - "Noise", - "Independence", - ] - - # these tests only make sense for > 1 dimension data - remove = ["maxmargin", "kmerf"] - INDEP_TESTS = dict([(k, v) for k, v in INDEP_TESTS.items() if k not in remove]) - - - def estimate_power(sim, test, auto=False): - """Compute the mean of the estimated power of 5 replications over sample sizes.""" - if test == "MaxMargin": - test = ["MaxMargin", "Dcorr"] - est_power = np.array( - [ - np.mean( - [ - power( - test, pow_type="indep", sim=sim, n=i, p=1, auto=auto, noise=True - ) - for _ in range(POWER_REPS) - ] - ) - for i in SAMP_SIZES - ] - ) - np.savetxt( - "../benchmarks/vs_samplesize/{}_{}.csv".format(sim, test), - est_power, - delimiter=",", - ) - - return est_power - - - # At this point, we would run this bit of code to generate the data for the figure and - # store it under the "vs_sampsize" directory. Since this code takes a very long time, - # we have commented out these lines of codes. If you would like to generate the data, - # uncomment these lines and run the file. - # - # outputs = Parallel(n_jobs=-1, verbose=100)( - # [ - # delayed(estimate_featimport)(sim_name, test) - # for sim_name in SIMULATIONS.keys() - # for test in INDEP_TESTS.keys() - # ] - # ) - - - def plot_power(): - fig, ax = plt.subplots(nrows=4, ncols=5, figsize=(25, 20)) - plt.suptitle( - "Multivariate Independence Testing (Increasing Sample Size)", - y=0.93, - va="baseline", - ) - - for i, row in enumerate(ax): - for j, col in enumerate(row): - count = 5 * i + j - sim = list(SIMULATIONS.keys())[count] - - for test in INDEP_TESTS.keys(): - est_power = np.genfromtxt( - "../benchmarks/vs_samplesize/{}_{}.csv".format(sim, test), - delimiter=",", - ) - - col.plot(SAMP_SIZES, est_power, label=INDEP_TESTS[test].__name__, lw=2) - col.set_xticks([]) - if i == 3: - col.set_xticks([SAMP_SIZES[0], SAMP_SIZES[-1]]) - col.set_ylim(-0.05, 1.05) - col.set_yticks([]) - if j == 0: - col.set_yticks([0, 1]) - col.set_title(SIM_TITLES[count]) - - fig.text(0.5, 0.05, "Sample Size", ha="center") - fig.text( - 0.07, - 0.5, - "Statistical Power", - va="center", - rotation="vertical", - ) - leg = plt.legend( - bbox_to_anchor=(0.5, 0.05), - bbox_transform=plt.gcf().transFigure, - ncol=len(INDEP_TESTS.keys()), - loc="upper center", - ) - leg.get_frame().set_linewidth(0.0) - for legobj in leg.legendHandles: - legobj.set_linewidth(5.0) - plt.subplots_adjust(hspace=0.50) - - - # plot the power - plot_power() - - -.. rst-class:: sphx-glr-timing - - **Total running time of the script:** ( 0 minutes 0.928 seconds) - - -.. _sphx_glr_download_benchmarks_indep_power_sampsize.py: - - -.. only :: html - - .. container:: sphx-glr-footer - :class: sphx-glr-footer-example - - - - .. container:: sphx-glr-download sphx-glr-download-python - - :download:`Download Python source code: indep_power_sampsize.py ` - - - - .. container:: sphx-glr-download sphx-glr-download-jupyter - - :download:`Download Jupyter notebook: indep_power_sampsize.ipynb ` - - -.. only:: html - - .. rst-class:: sphx-glr-signature - - `Gallery generated by Sphinx-Gallery `_ diff --git a/docs/benchmarks/indep_power_sampsize_codeobj.pickle b/docs/benchmarks/indep_power_sampsize_codeobj.pickle deleted file mode 100644 index c3d405bbf..000000000 Binary files a/docs/benchmarks/indep_power_sampsize_codeobj.pickle and /dev/null differ diff --git a/docs/benchmarks/index.rst b/docs/benchmarks/index.rst deleted file mode 100644 index 3be8e6f3f..000000000 --- a/docs/benchmarks/index.rst +++ /dev/null @@ -1,109 +0,0 @@ -:orphan: - - - -.. _sphx_glr_benchmarks: - -.. _benchmarks: - -Benchmarks -============= - -We have benchmarked tests within hyppo and between similar packages in R and python. -Here are some of these comparisons: - - -.. raw:: html - -
- -.. only:: html - - .. figure:: /benchmarks/images/thumb/sphx_glr_indep_power_dimension_thumb.png - :alt: Independence Testing Power vs. Dimension - - :ref:`sphx_glr_benchmarks_indep_power_dimension.py` - -.. raw:: html - -
- - -.. toctree:: - :hidden: - - /benchmarks/indep_power_dimension - -.. raw:: html - -
- -.. only:: html - - .. figure:: /benchmarks/images/thumb/sphx_glr_indep_power_sampsize_thumb.png - :alt: 1D Independence Testing Power vs. Sample Size - - :ref:`sphx_glr_benchmarks_indep_power_sampsize.py` - -.. raw:: html - -
- - -.. toctree:: - :hidden: - - /benchmarks/indep_power_sampsize - -.. raw:: html - -
- -.. only:: html - - .. figure:: /benchmarks/images/thumb/sphx_glr_perf_1d_thumb.png - :alt: 1D Performance Comparisons - - :ref:`sphx_glr_benchmarks_perf_1d.py` - -.. raw:: html - -
- - -.. toctree:: - :hidden: - - /benchmarks/perf_1d - -.. raw:: html - -
- -.. only:: html - - .. figure:: /benchmarks/images/thumb/sphx_glr_same_stat_thumb.png - :alt: Comparisons of Test Statistics - - :ref:`sphx_glr_benchmarks_same_stat.py` - -.. raw:: html - -
- - -.. toctree:: - :hidden: - - /benchmarks/same_stat -.. raw:: html - -
- - - -.. only:: html - - .. rst-class:: sphx-glr-signature - - `Gallery generated by Sphinx-Gallery `_ diff --git a/docs/benchmarks/perf_1d.ipynb b/docs/benchmarks/perf_1d.ipynb deleted file mode 100644 index d38467793..000000000 --- a/docs/benchmarks/perf_1d.ipynb +++ /dev/null @@ -1,61 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n# 1D Performance Comparisons\n\nThere are few implementations in :mod:`hyppo.independence` the have implementations\nin R. Here, we compare the test statistics between the R-generated values and our\npackage values. As you can see, there is a minimum difference between test statistics.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import os\nimport sys\nimport timeit\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport seaborn as sns\nfrom hyppo.independence import HHG, MGC, Dcorr\nfrom hyppo.ksample import MMD\nfrom hyppo.tools import linear\n\nsys.path.append(os.path.realpath(\"..\"))\n\n# make plots look pretty\nsns.set(color_codes=True, style=\"white\", context=\"talk\", font_scale=1)\nPALETTE = sns.color_palette(\"Set1\")\nsns.set_palette(PALETTE[1:])\n\n# constants\nN = [50, 100, 200, 500, 1000, 2000, 5000, 10000] # sample sizes\nFONTSIZE = 20\n\n# tests\nTESTS = {\"indep\": [Dcorr, MGC, HHG], \"ksample\": [MMD], \"fast\": [Dcorr]}\n\n\n# function runs wall time estimates using timeit (for python)\ndef estimate_wall_times(test_type, tests, **kwargs):\n for test in tests:\n times = []\n for n in N:\n x, y = linear(n, 1, noise=True)\n # numba is a JIT compiler, run code to cache first, then time\n _ = test().test(x, y, workers=-1, **kwargs)\n start_time = timeit.default_timer()\n _ = test().test(x, y, workers=-1, **kwargs)\n times.append(timeit.default_timer() - start_time)\n np.savetxt(\n \"../benchmarks/perf/{}_{}.csv\".format(test_type, test.__name__),\n times,\n delimiter=\",\",\n )\n return times\n\n\n# compute wall times, uncomment to recompute\n# kwargs = {}\n# for test_type in TESTS.keys():\n# if test_type == \"fast\":\n# kwargs[\"auto\"] = True\n# estimate_wall_times(test_type, TESTS[test_type], **kwargs)\n\n# Dictionary of test colors and labels\nTEST_METADATA = {\n \"MGC\": {\"test_name\": \"MGC (hyppo)\", \"color\": \"#e41a1c\"},\n \"HHG\": {\"test_name\": \"HHG (hyppo)\", \"color\": \"#4daf4a\"},\n \"Dcorr\": {\"test_name\": \"Dcorr (hyppo)\", \"color\": \"#377eb8\"},\n \"ksample_Hsic\": {\"test_name\": \"MMD (hyppo)\", \"color\": \"#ff7f00\"},\n \"fast_Dcorr\": {\"test_name\": \"Fast 1D Dcorr (hyppo)\", \"color\": \"#984ea3\"},\n \"HHG_hhg\": {\"test_name\": \"HHG (HHG)\", \"color\": \"#4daf4a\"},\n \"Dcorr_energy\": {\"test_name\": \"Dcorr (energy)\", \"color\": \"#377eb8\"},\n \"Dcorr_kernlab\": {\"test_name\": \"MMD (kernlab)\", \"color\": \"#ff7f00\"},\n}\n\n\ndef plot_wall_times():\n _ = plt.figure(figsize=(10, 10))\n ax = plt.subplot(111)\n\n i = 0\n kwargs = {}\n for file_name, metadata in TEST_METADATA.items():\n test_times = np.genfromtxt(\n \"../benchmarks/perf/{}.csv\".format(file_name), delimiter=\",\"\n )\n\n if file_name in [\"HHG_hhg\", \"Dcorr_energy\", \"Dcorr_kernlab\"]:\n kwargs = {\"linestyle\": \"dashed\"}\n ax.plot(\n N,\n test_times,\n color=metadata[\"color\"],\n label=metadata[\"test_name\"],\n lw=5,\n **kwargs\n )\n i += 1\n\n ax.spines[\"top\"].set_visible(False)\n ax.spines[\"right\"].set_visible(False)\n ax.set_xlabel(\"Number of Samples\")\n ax.set_ylabel(\"Execution Time\\n(Seconds)\")\n ax.set_xscale(\"log\")\n ax.set_yscale(\"log\")\n ax.set_xticks([1e2, 1e3, 1e4])\n ax.set_yticks([1e-4, 1e-2, 1e0, 1e2, 1e4])\n\n leg = plt.legend(\n bbox_to_anchor=(0.5, 0.95),\n bbox_transform=plt.gcf().transFigure,\n ncol=2,\n loc=\"upper center\",\n )\n leg.get_frame().set_linewidth(0.0)\n for legobj in leg.legendHandles:\n legobj.set_linewidth(5.0)\n\n\n# plot the wall times\nplot_wall_times()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following shows the code that was used to compute the R test statistics.\nCertain lines were commented out depending on whether or not they were useful.\n\n.. code-block::\n\n rm(list=ls())\n\n require(\"energy\")\n require(\"kernlab\")\n require(\"mgc\")\n require(\"HHG\")\n require(\"microbenchmark\")\n\n num_samples_range = c(50, 100, 200, 500, 1000, 2000, 5000, 10000)\n linear_data <- list()\n i <- 1\n for (num_samples in num_samples_range){\n data <- mgc.sims.linear(num_samples, 1)\n x <- data$X\n y <- data$Y\n times = seq(1, 3, by=1)\n executions <- list()\n for (t in times){\n # x <- as.matrix(dist((x), diag = TRUE, upper = TRUE))\n # y <- as.matrix(dist((y), diag = TRUE, upper = TRUE))\n\n # best of 5 executions\n # time <- microbenchmark(kmmd(x, y, ntimes=1000), times=1, unit=\"secs\")\n # time <- microbenchmark(dcor.test(x, y, R=1000), times=1, unit=\"secs\")\n # time <- microbenchmark(dcor.test(x, y, R=1000), times=1, unit=\"secs\")\n time <- microbenchmark(dcor2d(x, y), times=1, unit=\"secs\")\n # time <- microbenchmark(hhg.test(x, y, nr.perm=1000), times=1, unit=\"secs\")\n executions <- c(executions, list(time[1, 2]/(10^9)))\n }\n linear_data <- c(linear_data, list(sapply(executions, mean)))\n\n print(\"Finished\")\n i <- i + 1\n }\n\n df <- data.frame(\n matrix(unlist(linear_data), nrow=length(linear_data), byrow=T),\n stringsAsFactors=FALSE\n )\n write.csv(df, row.names=FALSE)\n\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/docs/benchmarks/perf_1d.py b/docs/benchmarks/perf_1d.py deleted file mode 100644 index f6710c0fe..000000000 --- a/docs/benchmarks/perf_1d.py +++ /dev/null @@ -1,167 +0,0 @@ -""" -1D Performance Comparisons -================================= - -There are few implementations in :mod:`hyppo.independence` the have implementations -in R. Here, we compare the test statistics between the R-generated values and our -package values. As you can see, there is a minimum difference between test statistics. -""" - - -import os -import sys -import timeit - -import matplotlib.pyplot as plt -import numpy as np -import seaborn as sns -from hyppo.independence import HHG, MGC, Dcorr -from hyppo.ksample import MMD -from hyppo.tools import linear - -sys.path.append(os.path.realpath("..")) - -# make plots look pretty -sns.set(color_codes=True, style="white", context="talk", font_scale=1) -PALETTE = sns.color_palette("Set1") -sns.set_palette(PALETTE[1:]) - -# constants -N = [50, 100, 200, 500, 1000, 2000, 5000, 10000] # sample sizes -FONTSIZE = 20 - -# tests -TESTS = {"indep": [Dcorr, MGC, HHG], "ksample": [MMD], "fast": [Dcorr]} - - -# function runs wall time estimates using timeit (for python) -def estimate_wall_times(test_type, tests, **kwargs): - for test in tests: - times = [] - for n in N: - x, y = linear(n, 1, noise=True) - # numba is a JIT compiler, run code to cache first, then time - _ = test().test(x, y, workers=-1, **kwargs) - start_time = timeit.default_timer() - _ = test().test(x, y, workers=-1, **kwargs) - times.append(timeit.default_timer() - start_time) - np.savetxt( - "../benchmarks/perf/{}_{}.csv".format(test_type, test.__name__), - times, - delimiter=",", - ) - return times - - -# compute wall times, uncomment to recompute -# kwargs = {} -# for test_type in TESTS.keys(): -# if test_type == "fast": -# kwargs["auto"] = True -# estimate_wall_times(test_type, TESTS[test_type], **kwargs) - -# Dictionary of test colors and labels -TEST_METADATA = { - "MGC": {"test_name": "MGC (hyppo)", "color": "#e41a1c"}, - "HHG": {"test_name": "HHG (hyppo)", "color": "#4daf4a"}, - "Dcorr": {"test_name": "Dcorr (hyppo)", "color": "#377eb8"}, - "ksample_Hsic": {"test_name": "MMD (hyppo)", "color": "#ff7f00"}, - "fast_Dcorr": {"test_name": "Fast 1D Dcorr (hyppo)", "color": "#984ea3"}, - "HHG_hhg": {"test_name": "HHG (HHG)", "color": "#4daf4a"}, - "Dcorr_energy": {"test_name": "Dcorr (energy)", "color": "#377eb8"}, - "Dcorr_kernlab": {"test_name": "MMD (kernlab)", "color": "#ff7f00"}, -} - - -def plot_wall_times(): - _ = plt.figure(figsize=(10, 10)) - ax = plt.subplot(111) - - i = 0 - kwargs = {} - for file_name, metadata in TEST_METADATA.items(): - test_times = np.genfromtxt( - "../benchmarks/perf/{}.csv".format(file_name), delimiter="," - ) - - if file_name in ["HHG_hhg", "Dcorr_energy", "Dcorr_kernlab"]: - kwargs = {"linestyle": "dashed"} - ax.plot( - N, - test_times, - color=metadata["color"], - label=metadata["test_name"], - lw=5, - **kwargs - ) - i += 1 - - ax.spines["top"].set_visible(False) - ax.spines["right"].set_visible(False) - ax.set_xlabel("Number of Samples") - ax.set_ylabel("Execution Time\n(Seconds)") - ax.set_xscale("log") - ax.set_yscale("log") - ax.set_xticks([1e2, 1e3, 1e4]) - ax.set_yticks([1e-4, 1e-2, 1e0, 1e2, 1e4]) - - leg = plt.legend( - bbox_to_anchor=(0.5, 0.95), - bbox_transform=plt.gcf().transFigure, - ncol=2, - loc="upper center", - ) - leg.get_frame().set_linewidth(0.0) - for legobj in leg.legendHandles: - legobj.set_linewidth(5.0) - - -# plot the wall times -plot_wall_times() - -######################################################################################## -# The following shows the code that was used to compute the R test statistics. -# Certain lines were commented out depending on whether or not they were useful. -# -# .. code-block:: -# -# rm(list=ls()) -# -# require("energy") -# require("kernlab") -# require("mgc") -# require("HHG") -# require("microbenchmark") -# -# num_samples_range = c(50, 100, 200, 500, 1000, 2000, 5000, 10000) -# linear_data <- list() -# i <- 1 -# for (num_samples in num_samples_range){ -# data <- mgc.sims.linear(num_samples, 1) -# x <- data$X -# y <- data$Y -# times = seq(1, 3, by=1) -# executions <- list() -# for (t in times){ -# # x <- as.matrix(dist((x), diag = TRUE, upper = TRUE)) -# # y <- as.matrix(dist((y), diag = TRUE, upper = TRUE)) -# -# # best of 5 executions -# # time <- microbenchmark(kmmd(x, y, ntimes=1000), times=1, unit="secs") -# # time <- microbenchmark(dcor.test(x, y, R=1000), times=1, unit="secs") -# # time <- microbenchmark(dcor.test(x, y, R=1000), times=1, unit="secs") -# time <- microbenchmark(dcor2d(x, y), times=1, unit="secs") -# # time <- microbenchmark(hhg.test(x, y, nr.perm=1000), times=1, unit="secs") -# executions <- c(executions, list(time[1, 2]/(10^9))) -# } -# linear_data <- c(linear_data, list(sapply(executions, mean))) -# -# print("Finished") -# i <- i + 1 -# } -# -# df <- data.frame( -# matrix(unlist(linear_data), nrow=length(linear_data), byrow=T), -# stringsAsFactors=FALSE -# ) -# write.csv(df, row.names=FALSE) diff --git a/docs/benchmarks/perf_1d.py.md5 b/docs/benchmarks/perf_1d.py.md5 deleted file mode 100644 index f91675987..000000000 --- a/docs/benchmarks/perf_1d.py.md5 +++ /dev/null @@ -1 +0,0 @@ -1e571047575f98da0e663bb6412c789b \ No newline at end of file diff --git a/docs/benchmarks/perf_1d.rst b/docs/benchmarks/perf_1d.rst deleted file mode 100644 index c7d48de00..000000000 --- a/docs/benchmarks/perf_1d.rst +++ /dev/null @@ -1,235 +0,0 @@ - -.. DO NOT EDIT. -.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. -.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: -.. "benchmarks/perf_1d.py" -.. LINE NUMBERS ARE GIVEN BELOW. - -.. only:: html - - .. note:: - :class: sphx-glr-download-link-note - - Click :ref:`here ` - to download the full example code - -.. rst-class:: sphx-glr-example-title - -.. _sphx_glr_benchmarks_perf_1d.py: - - -1D Performance Comparisons -================================= - -There are few implementations in :mod:`hyppo.independence` the have implementations -in R. Here, we compare the test statistics between the R-generated values and our -package values. As you can see, there is a minimum difference between test statistics. - -.. GENERATED FROM PYTHON SOURCE LINES 9-122 - -.. code-block:: default - - - - import os - import sys - import timeit - - import matplotlib.pyplot as plt - import numpy as np - import seaborn as sns - from hyppo.independence import HHG, MGC, Dcorr - from hyppo.ksample import MMD - from hyppo.tools import linear - - sys.path.append(os.path.realpath("..")) - - # make plots look pretty - sns.set(color_codes=True, style="white", context="talk", font_scale=1) - PALETTE = sns.color_palette("Set1") - sns.set_palette(PALETTE[1:]) - - # constants - N = [50, 100, 200, 500, 1000, 2000, 5000, 10000] # sample sizes - FONTSIZE = 20 - - # tests - TESTS = {"indep": [Dcorr, MGC, HHG], "ksample": [MMD], "fast": [Dcorr]} - - - # function runs wall time estimates using timeit (for python) - def estimate_wall_times(test_type, tests, **kwargs): - for test in tests: - times = [] - for n in N: - x, y = linear(n, 1, noise=True) - # numba is a JIT compiler, run code to cache first, then time - _ = test().test(x, y, workers=-1, **kwargs) - start_time = timeit.default_timer() - _ = test().test(x, y, workers=-1, **kwargs) - times.append(timeit.default_timer() - start_time) - np.savetxt( - "../benchmarks/perf/{}_{}.csv".format(test_type, test.__name__), - times, - delimiter=",", - ) - return times - - - # compute wall times, uncomment to recompute - # kwargs = {} - # for test_type in TESTS.keys(): - # if test_type == "fast": - # kwargs["auto"] = True - # estimate_wall_times(test_type, TESTS[test_type], **kwargs) - - # Dictionary of test colors and labels - TEST_METADATA = { - "MGC": {"test_name": "MGC (hyppo)", "color": "#e41a1c"}, - "HHG": {"test_name": "HHG (hyppo)", "color": "#4daf4a"}, - "Dcorr": {"test_name": "Dcorr (hyppo)", "color": "#377eb8"}, - "ksample_Hsic": {"test_name": "MMD (hyppo)", "color": "#ff7f00"}, - "fast_Dcorr": {"test_name": "Fast 1D Dcorr (hyppo)", "color": "#984ea3"}, - "HHG_hhg": {"test_name": "HHG (HHG)", "color": "#4daf4a"}, - "Dcorr_energy": {"test_name": "Dcorr (energy)", "color": "#377eb8"}, - "Dcorr_kernlab": {"test_name": "MMD (kernlab)", "color": "#ff7f00"}, - } - - - def plot_wall_times(): - _ = plt.figure(figsize=(10, 10)) - ax = plt.subplot(111) - - i = 0 - kwargs = {} - for file_name, metadata in TEST_METADATA.items(): - test_times = np.genfromtxt( - "../benchmarks/perf/{}.csv".format(file_name), delimiter="," - ) - - if file_name in ["HHG_hhg", "Dcorr_energy", "Dcorr_kernlab"]: - kwargs = {"linestyle": "dashed"} - ax.plot( - N, - test_times, - color=metadata["color"], - label=metadata["test_name"], - lw=5, - **kwargs - ) - i += 1 - - ax.spines["top"].set_visible(False) - ax.spines["right"].set_visible(False) - ax.set_xlabel("Number of Samples") - ax.set_ylabel("Execution Time\n(Seconds)") - ax.set_xscale("log") - ax.set_yscale("log") - ax.set_xticks([1e2, 1e3, 1e4]) - ax.set_yticks([1e-4, 1e-2, 1e0, 1e2, 1e4]) - - leg = plt.legend( - bbox_to_anchor=(0.5, 0.95), - bbox_transform=plt.gcf().transFigure, - ncol=2, - loc="upper center", - ) - leg.get_frame().set_linewidth(0.0) - for legobj in leg.legendHandles: - legobj.set_linewidth(5.0) - - - # plot the wall times - plot_wall_times() - - - - -.. image:: /benchmarks/images/sphx_glr_perf_1d_001.png - :alt: perf 1d - :class: sphx-glr-single-img - - - - - -.. GENERATED FROM PYTHON SOURCE LINES 123-168 - -The following shows the code that was used to compute the R test statistics. -Certain lines were commented out depending on whether or not they were useful. - -.. code-block:: - - rm(list=ls()) - - require("energy") - require("kernlab") - require("mgc") - require("HHG") - require("microbenchmark") - - num_samples_range = c(50, 100, 200, 500, 1000, 2000, 5000, 10000) - linear_data <- list() - i <- 1 - for (num_samples in num_samples_range){ - data <- mgc.sims.linear(num_samples, 1) - x <- data$X - y <- data$Y - times = seq(1, 3, by=1) - executions <- list() - for (t in times){ - # x <- as.matrix(dist((x), diag = TRUE, upper = TRUE)) - # y <- as.matrix(dist((y), diag = TRUE, upper = TRUE)) - - # best of 5 executions - # time <- microbenchmark(kmmd(x, y, ntimes=1000), times=1, unit="secs") - # time <- microbenchmark(dcor.test(x, y, R=1000), times=1, unit="secs") - # time <- microbenchmark(dcor.test(x, y, R=1000), times=1, unit="secs") - time <- microbenchmark(dcor2d(x, y), times=1, unit="secs") - # time <- microbenchmark(hhg.test(x, y, nr.perm=1000), times=1, unit="secs") - executions <- c(executions, list(time[1, 2]/(10^9))) - } - linear_data <- c(linear_data, list(sapply(executions, mean))) - - print("Finished") - i <- i + 1 - } - - df <- data.frame( - matrix(unlist(linear_data), nrow=length(linear_data), byrow=T), - stringsAsFactors=FALSE - ) - write.csv(df, row.names=FALSE) - - -.. rst-class:: sphx-glr-timing - - **Total running time of the script:** ( 0 minutes 1.901 seconds) - - -.. _sphx_glr_download_benchmarks_perf_1d.py: - - -.. only :: html - - .. container:: sphx-glr-footer - :class: sphx-glr-footer-example - - - - .. container:: sphx-glr-download sphx-glr-download-python - - :download:`Download Python source code: perf_1d.py ` - - - - .. container:: sphx-glr-download sphx-glr-download-jupyter - - :download:`Download Jupyter notebook: perf_1d.ipynb ` - - -.. only:: html - - .. rst-class:: sphx-glr-signature - - `Gallery generated by Sphinx-Gallery `_ diff --git a/docs/benchmarks/perf_1d_codeobj.pickle b/docs/benchmarks/perf_1d_codeobj.pickle deleted file mode 100644 index 0f78f40c9..000000000 Binary files a/docs/benchmarks/perf_1d_codeobj.pickle and /dev/null differ diff --git a/docs/benchmarks/same_stat.ipynb b/docs/benchmarks/same_stat.ipynb deleted file mode 100644 index 76005b4ec..000000000 --- a/docs/benchmarks/same_stat.ipynb +++ /dev/null @@ -1,61 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n# Comparisons of Test Statistics\n\nThere are few implementations in :mod:`hyppo.independence` the have implementations\nin R. Here, we compare the test statistics between the R-generated values and our\npackage values. As you can see, there is a minimum difference between test statistics.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import os\nimport sys\n\nimport matplotlib.pyplot as plt\nimport matplotlib.ticker as mticker\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nfrom hyppo.independence import HHG, Dcorr\nfrom hyppo.ksample import MMD\nfrom hyppo.tools import rot_ksamp, spiral\n\nsys.path.append(os.path.realpath(\"..\"))\n\n# make plots look pretty\nsns.set(color_codes=True, style=\"white\", context=\"talk\", font_scale=1)\nPALETTE = sns.color_palette(\"Set1\")\nsns.set_palette(PALETTE[1:])\n\n# constants\nN = 100 # number of replications to show\nFONTSIZE = 20\n\n# tests\nTESTS = {\n \"Dcorr\": Dcorr,\n \"MMD\": MMD,\n \"HHG\": HHG,\n}\n\n\n# generate the simulations, uncomment this code to regenerate\n# for i in range(N):\n# x, y = spiral(1000, 1, noise=True)\n# sim = np.hstack([x, y])\n# np.savetxt(\"../benchmarks/same_stat/indep/{}.csv\".format(i + 1), sim, delimiter=\",\")\n# sim1, sim2 = rot_ksamp(\"spiral\", 200, 1, noise=True)\n# np.savetxt(\n# \"../benchmarks/same_stat/ksample/sim1_{}.csv\".format(i + 1), sim, delimiter=\",\"\n# )\n# np.savetxt(\n# \"../benchmarks/same_stat/ksample/sim2_{}.csv\".format(i + 1), sim, delimiter=\",\"\n# )\n\n\n# compute test statistics, uncomment to recompute\n# for key, test in TESTS.items():\n# stats = []\n# for i in range(N):\n# if key == \"MMD\":\n# sim1 = np.genfromtxt(\n# \"../benchmarks/same_stat/ksample/sim1_{}.csv\".format(i + 1),\n# delimiter=\",\",\n# )\n# sim2 = np.genfromtxt(\n# \"../benchmarks/same_stat/ksample/sim2_{}.csv\".format(i + 1),\n# delimiter=\",\",\n# )\n# stat = test(bias=True).statistic(sim1, sim2)\n# else:\n# sim = np.genfromtxt(\n# \"../benchmarks/same_stat/indep/{}.csv\".format(i + 1), delimiter=\",\"\n# )\n# x, y = np.hsplit(sim, 2)\n# stat = test().statistic(x, y)\n# stats.append(stat)\n# np.savetxt(\"../benchmarks/same_stat/{}.csv\".format(key), stats, delimiter=\",\")\n\n\ndef plot_stats():\n _ = plt.figure(figsize=(5, 10))\n ax = plt.subplot(111)\n\n test_names = list(TESTS.keys())\n data = np.zeros((N, len(test_names)))\n for i in range(len(test_names)):\n if test_names[i] == \"MMD\":\n hyppo_stat = np.genfromtxt(\"../benchmarks/same_stat/MMD.csv\", delimiter=\",\")\n r_stat = np.genfromtxt(\"../benchmarks/same_stat/RMMD.csv\", delimiter=\",\")\n else:\n hyppo_stat = np.genfromtxt(\n \"../benchmarks/same_stat/{}.csv\".format(test_names[i]), delimiter=\",\"\n )\n r_stat = np.genfromtxt(\n \"../benchmarks/same_stat/R{}.csv\".format(test_names[i]), delimiter=\",\"\n )\n if (\n test_names[i] == \"HHG\"\n ): # Fix for large HHG stats so difference is comparable\n hyppo_stat *= 1e-8\n r_stat *= 1e-8\n data[:, i] = np.abs(hyppo_stat) - np.abs(r_stat)\n\n data = pd.DataFrame(data=data, columns=test_names)\n sns.stripplot(\n data=data,\n edgecolor=\"gray\",\n size=5,\n palette=[\"#377eb8\", \"#ff7f00\", \"#4daf4a\"],\n jitter=1,\n )\n ax.axhline(y=0, color=\"red\", linewidth=1)\n\n ax.spines[\"top\"].set_visible(False)\n ax.spines[\"right\"].set_visible(False)\n ax.spines[\"bottom\"].set_visible(False)\n ax.set_ylabel(\"Test Statistic\\nDifference\")\n ax.set_ylim(-5e-4, 5e-4)\n ax.set_yticks([-5e-4, 0, 5e-4])\n f = mticker.ScalarFormatter(useOffset=False)\n ax.yaxis.set_major_formatter(\n mticker.FuncFormatter(\n lambda x, pos: \"{}\".format(f._formatSciNotation(\"%1.1e\" % x))\n )\n )\n\n\n# plot the statistic differences\nplot_stats()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following shows the code that was used to compute the R test statistics.\nCertain lines were commented out depending on whether or not they were useful.\n\n.. code-block::\n\n rm(list=ls())\n\n require(\"energy\")\n require(\"kernlab\")\n require(\"HHG\")\n # change to your file path using setwd to same_stat/indep and same_stat/ksample\n # filepath =\n\n times = seq(1, 20, by=1)\n statistics <- list()\n for (t in times){\n # df <- read.csv(paste(filepath, \"/\", t, \".csv\", sep=\"\"), sep=\",\")\n df1 <- read.csv(paste(filepath, \"/sim1_\", t, \".csv\", sep=\"\"), sep=\",\")\n df2 <- read.csv(paste(filepath, \"/sim2_\", t, \".csv\", sep=\"\"), sep=\",\")\n # x <- df[, 1]\n # y <- df[, 2]\n x <- df1[,]\n y <- df2[,]\n # stat <- bcdcor(x, y)\n # Dx = as.matrix(dist((x), diag = TRUE, upper = TRUE))\n # Dy = as.matrix(dist((y), diag = TRUE, upper = TRUE))\n # stat <- hhg.test(Dx, Dy, nr.perm=0)$sum.chisq\n stat <- kmmd(x, y, ntimes=0)@mmdstats[2]\n statistics <- c(statistics, list(stat))\n }\n\n df <- data.frame(matrix(unlist(statistics), nrow=length(statistics), byrow=T), stringsAsFactors=FALSE)\n write.csv(df, row.names=FALSE)\n\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/docs/benchmarks/same_stat.py b/docs/benchmarks/same_stat.py deleted file mode 100644 index ab6e162cc..000000000 --- a/docs/benchmarks/same_stat.py +++ /dev/null @@ -1,164 +0,0 @@ -""" -Comparisons of Test Statistics -================================= - -There are few implementations in :mod:`hyppo.independence` the have implementations -in R. Here, we compare the test statistics between the R-generated values and our -package values. As you can see, there is a minimum difference between test statistics. -""" - - -import os -import sys - -import matplotlib.pyplot as plt -import matplotlib.ticker as mticker -import numpy as np -import pandas as pd -import seaborn as sns -from hyppo.independence import HHG, Dcorr -from hyppo.ksample import MMD -from hyppo.tools import rot_ksamp, spiral - -sys.path.append(os.path.realpath("..")) - -# make plots look pretty -sns.set(color_codes=True, style="white", context="talk", font_scale=1) -PALETTE = sns.color_palette("Set1") -sns.set_palette(PALETTE[1:]) - -# constants -N = 100 # number of replications to show -FONTSIZE = 20 - -# tests -TESTS = { - "Dcorr": Dcorr, - "MMD": MMD, - "HHG": HHG, -} - - -# generate the simulations, uncomment this code to regenerate -# for i in range(N): -# x, y = spiral(1000, 1, noise=True) -# sim = np.hstack([x, y]) -# np.savetxt("../benchmarks/same_stat/indep/{}.csv".format(i + 1), sim, delimiter=",") -# sim1, sim2 = rot_ksamp("spiral", 200, 1, noise=True) -# np.savetxt( -# "../benchmarks/same_stat/ksample/sim1_{}.csv".format(i + 1), sim, delimiter="," -# ) -# np.savetxt( -# "../benchmarks/same_stat/ksample/sim2_{}.csv".format(i + 1), sim, delimiter="," -# ) - - -# compute test statistics, uncomment to recompute -# for key, test in TESTS.items(): -# stats = [] -# for i in range(N): -# if key == "MMD": -# sim1 = np.genfromtxt( -# "../benchmarks/same_stat/ksample/sim1_{}.csv".format(i + 1), -# delimiter=",", -# ) -# sim2 = np.genfromtxt( -# "../benchmarks/same_stat/ksample/sim2_{}.csv".format(i + 1), -# delimiter=",", -# ) -# stat = test(bias=True).statistic(sim1, sim2) -# else: -# sim = np.genfromtxt( -# "../benchmarks/same_stat/indep/{}.csv".format(i + 1), delimiter="," -# ) -# x, y = np.hsplit(sim, 2) -# stat = test().statistic(x, y) -# stats.append(stat) -# np.savetxt("../benchmarks/same_stat/{}.csv".format(key), stats, delimiter=",") - - -def plot_stats(): - _ = plt.figure(figsize=(5, 10)) - ax = plt.subplot(111) - - test_names = list(TESTS.keys()) - data = np.zeros((N, len(test_names))) - for i in range(len(test_names)): - if test_names[i] == "MMD": - hyppo_stat = np.genfromtxt("../benchmarks/same_stat/MMD.csv", delimiter=",") - r_stat = np.genfromtxt("../benchmarks/same_stat/RMMD.csv", delimiter=",") - else: - hyppo_stat = np.genfromtxt( - "../benchmarks/same_stat/{}.csv".format(test_names[i]), delimiter="," - ) - r_stat = np.genfromtxt( - "../benchmarks/same_stat/R{}.csv".format(test_names[i]), delimiter="," - ) - if ( - test_names[i] == "HHG" - ): # Fix for large HHG stats so difference is comparable - hyppo_stat *= 1e-8 - r_stat *= 1e-8 - data[:, i] = np.abs(hyppo_stat) - np.abs(r_stat) - - data = pd.DataFrame(data=data, columns=test_names) - sns.stripplot( - data=data, - edgecolor="gray", - size=5, - palette=["#377eb8", "#ff7f00", "#4daf4a"], - jitter=1, - ) - ax.axhline(y=0, color="red", linewidth=1) - - ax.spines["top"].set_visible(False) - ax.spines["right"].set_visible(False) - ax.spines["bottom"].set_visible(False) - ax.set_ylabel("Test Statistic\nDifference") - ax.set_ylim(-5e-4, 5e-4) - ax.set_yticks([-5e-4, 0, 5e-4]) - f = mticker.ScalarFormatter(useOffset=False) - ax.yaxis.set_major_formatter( - mticker.FuncFormatter( - lambda x, pos: "{}".format(f._formatSciNotation("%1.1e" % x)) - ) - ) - - -# plot the statistic differences -plot_stats() - -######################################################################################## -# The following shows the code that was used to compute the R test statistics. -# Certain lines were commented out depending on whether or not they were useful. -# -# .. code-block:: -# -# rm(list=ls()) -# -# require("energy") -# require("kernlab") -# require("HHG") -# # change to your file path using setwd to same_stat/indep and same_stat/ksample -# # filepath = -# -# times = seq(1, 20, by=1) -# statistics <- list() -# for (t in times){ -# # df <- read.csv(paste(filepath, "/", t, ".csv", sep=""), sep=",") -# df1 <- read.csv(paste(filepath, "/sim1_", t, ".csv", sep=""), sep=",") -# df2 <- read.csv(paste(filepath, "/sim2_", t, ".csv", sep=""), sep=",") -# # x <- df[, 1] -# # y <- df[, 2] -# x <- df1[,] -# y <- df2[,] -# # stat <- bcdcor(x, y) -# # Dx = as.matrix(dist((x), diag = TRUE, upper = TRUE)) -# # Dy = as.matrix(dist((y), diag = TRUE, upper = TRUE)) -# # stat <- hhg.test(Dx, Dy, nr.perm=0)$sum.chisq -# stat <- kmmd(x, y, ntimes=0)@mmdstats[2] -# statistics <- c(statistics, list(stat)) -# } -# -# df <- data.frame(matrix(unlist(statistics), nrow=length(statistics), byrow=T), stringsAsFactors=FALSE) -# write.csv(df, row.names=FALSE) diff --git a/docs/benchmarks/same_stat.py.md5 b/docs/benchmarks/same_stat.py.md5 deleted file mode 100644 index b3dc2c104..000000000 --- a/docs/benchmarks/same_stat.py.md5 +++ /dev/null @@ -1 +0,0 @@ -a460d675ea8e93b7eb2870d6250b17ab \ No newline at end of file diff --git a/docs/benchmarks/same_stat.rst b/docs/benchmarks/same_stat.rst deleted file mode 100644 index c175aa6ea..000000000 --- a/docs/benchmarks/same_stat.rst +++ /dev/null @@ -1,232 +0,0 @@ - -.. DO NOT EDIT. -.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. -.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: -.. "benchmarks/same_stat.py" -.. LINE NUMBERS ARE GIVEN BELOW. - -.. only:: html - - .. note:: - :class: sphx-glr-download-link-note - - Click :ref:`here ` - to download the full example code - -.. rst-class:: sphx-glr-example-title - -.. _sphx_glr_benchmarks_same_stat.py: - - -Comparisons of Test Statistics -================================= - -There are few implementations in :mod:`hyppo.independence` the have implementations -in R. Here, we compare the test statistics between the R-generated values and our -package values. As you can see, there is a minimum difference between test statistics. - -.. GENERATED FROM PYTHON SOURCE LINES 9-131 - -.. code-block:: default - - - - import os - import sys - - import matplotlib.pyplot as plt - import matplotlib.ticker as mticker - import numpy as np - import pandas as pd - import seaborn as sns - from hyppo.independence import HHG, Dcorr - from hyppo.ksample import MMD - from hyppo.tools import rot_ksamp, spiral - - sys.path.append(os.path.realpath("..")) - - # make plots look pretty - sns.set(color_codes=True, style="white", context="talk", font_scale=1) - PALETTE = sns.color_palette("Set1") - sns.set_palette(PALETTE[1:]) - - # constants - N = 100 # number of replications to show - FONTSIZE = 20 - - # tests - TESTS = { - "Dcorr": Dcorr, - "MMD": MMD, - "HHG": HHG, - } - - - # generate the simulations, uncomment this code to regenerate - # for i in range(N): - # x, y = spiral(1000, 1, noise=True) - # sim = np.hstack([x, y]) - # np.savetxt("../benchmarks/same_stat/indep/{}.csv".format(i + 1), sim, delimiter=",") - # sim1, sim2 = rot_ksamp("spiral", 200, 1, noise=True) - # np.savetxt( - # "../benchmarks/same_stat/ksample/sim1_{}.csv".format(i + 1), sim, delimiter="," - # ) - # np.savetxt( - # "../benchmarks/same_stat/ksample/sim2_{}.csv".format(i + 1), sim, delimiter="," - # ) - - - # compute test statistics, uncomment to recompute - # for key, test in TESTS.items(): - # stats = [] - # for i in range(N): - # if key == "MMD": - # sim1 = np.genfromtxt( - # "../benchmarks/same_stat/ksample/sim1_{}.csv".format(i + 1), - # delimiter=",", - # ) - # sim2 = np.genfromtxt( - # "../benchmarks/same_stat/ksample/sim2_{}.csv".format(i + 1), - # delimiter=",", - # ) - # stat = test(bias=True).statistic(sim1, sim2) - # else: - # sim = np.genfromtxt( - # "../benchmarks/same_stat/indep/{}.csv".format(i + 1), delimiter="," - # ) - # x, y = np.hsplit(sim, 2) - # stat = test().statistic(x, y) - # stats.append(stat) - # np.savetxt("../benchmarks/same_stat/{}.csv".format(key), stats, delimiter=",") - - - def plot_stats(): - _ = plt.figure(figsize=(5, 10)) - ax = plt.subplot(111) - - test_names = list(TESTS.keys()) - data = np.zeros((N, len(test_names))) - for i in range(len(test_names)): - if test_names[i] == "MMD": - hyppo_stat = np.genfromtxt("../benchmarks/same_stat/MMD.csv", delimiter=",") - r_stat = np.genfromtxt("../benchmarks/same_stat/RMMD.csv", delimiter=",") - else: - hyppo_stat = np.genfromtxt( - "../benchmarks/same_stat/{}.csv".format(test_names[i]), delimiter="," - ) - r_stat = np.genfromtxt( - "../benchmarks/same_stat/R{}.csv".format(test_names[i]), delimiter="," - ) - if ( - test_names[i] == "HHG" - ): # Fix for large HHG stats so difference is comparable - hyppo_stat *= 1e-8 - r_stat *= 1e-8 - data[:, i] = np.abs(hyppo_stat) - np.abs(r_stat) - - data = pd.DataFrame(data=data, columns=test_names) - sns.stripplot( - data=data, - edgecolor="gray", - size=5, - palette=["#377eb8", "#ff7f00", "#4daf4a"], - jitter=1, - ) - ax.axhline(y=0, color="red", linewidth=1) - - ax.spines["top"].set_visible(False) - ax.spines["right"].set_visible(False) - ax.spines["bottom"].set_visible(False) - ax.set_ylabel("Test Statistic\nDifference") - ax.set_ylim(-5e-4, 5e-4) - ax.set_yticks([-5e-4, 0, 5e-4]) - f = mticker.ScalarFormatter(useOffset=False) - ax.yaxis.set_major_formatter( - mticker.FuncFormatter( - lambda x, pos: "{}".format(f._formatSciNotation("%1.1e" % x)) - ) - ) - - - # plot the statistic differences - plot_stats() - - - - -.. image:: /benchmarks/images/sphx_glr_same_stat_001.png - :alt: same stat - :class: sphx-glr-single-img - - - - - -.. GENERATED FROM PYTHON SOURCE LINES 132-165 - -The following shows the code that was used to compute the R test statistics. -Certain lines were commented out depending on whether or not they were useful. - -.. code-block:: - - rm(list=ls()) - - require("energy") - require("kernlab") - require("HHG") - # change to your file path using setwd to same_stat/indep and same_stat/ksample - # filepath = - - times = seq(1, 20, by=1) - statistics <- list() - for (t in times){ - # df <- read.csv(paste(filepath, "/", t, ".csv", sep=""), sep=",") - df1 <- read.csv(paste(filepath, "/sim1_", t, ".csv", sep=""), sep=",") - df2 <- read.csv(paste(filepath, "/sim2_", t, ".csv", sep=""), sep=",") - # x <- df[, 1] - # y <- df[, 2] - x <- df1[,] - y <- df2[,] - # stat <- bcdcor(x, y) - # Dx = as.matrix(dist((x), diag = TRUE, upper = TRUE)) - # Dy = as.matrix(dist((y), diag = TRUE, upper = TRUE)) - # stat <- hhg.test(Dx, Dy, nr.perm=0)$sum.chisq - stat <- kmmd(x, y, ntimes=0)@mmdstats[2] - statistics <- c(statistics, list(stat)) - } - - df <- data.frame(matrix(unlist(statistics), nrow=length(statistics), byrow=T), stringsAsFactors=FALSE) - write.csv(df, row.names=FALSE) - - -.. rst-class:: sphx-glr-timing - - **Total running time of the script:** ( 0 minutes 0.106 seconds) - - -.. _sphx_glr_download_benchmarks_same_stat.py: - - -.. only :: html - - .. container:: sphx-glr-footer - :class: sphx-glr-footer-example - - - - .. container:: sphx-glr-download sphx-glr-download-python - - :download:`Download Python source code: same_stat.py ` - - - - .. container:: sphx-glr-download sphx-glr-download-jupyter - - :download:`Download Jupyter notebook: same_stat.ipynb ` - - -.. only:: html - - .. rst-class:: sphx-glr-signature - - `Gallery generated by Sphinx-Gallery `_ diff --git a/docs/benchmarks/same_stat_codeobj.pickle b/docs/benchmarks/same_stat_codeobj.pickle deleted file mode 100644 index 730fc107a..000000000 Binary files a/docs/benchmarks/same_stat_codeobj.pickle and /dev/null differ diff --git a/docs/benchmarks/searchindex.db b/docs/benchmarks/searchindex.db deleted file mode 100644 index 550a6e6a4..000000000 Binary files a/docs/benchmarks/searchindex.db and /dev/null differ diff --git a/docs/benchmarks/sg_execution_times.rst b/docs/benchmarks/sg_execution_times.rst deleted file mode 100644 index f0f826c32..000000000 --- a/docs/benchmarks/sg_execution_times.rst +++ /dev/null @@ -1,18 +0,0 @@ - -:orphan: - -.. _sphx_glr_benchmarks_sg_execution_times: - -Computation times -================= -**00:03.835** total execution time for **benchmarks** files: - -+------------------------------------------------------------------------------------+-----------+--------+ -| :ref:`sphx_glr_benchmarks_indep_power_dimension.py` (``indep_power_dimension.py``) | 00:03.835 | 0.0 MB | -+------------------------------------------------------------------------------------+-----------+--------+ -| :ref:`sphx_glr_benchmarks_indep_power_sampsize.py` (``indep_power_sampsize.py``) | 00:00.000 | 0.0 MB | -+------------------------------------------------------------------------------------+-----------+--------+ -| :ref:`sphx_glr_benchmarks_perf_1d.py` (``perf_1d.py``) | 00:00.000 | 0.0 MB | -+------------------------------------------------------------------------------------+-----------+--------+ -| :ref:`sphx_glr_benchmarks_same_stat.py` (``same_stat.py``) | 00:00.000 | 0.0 MB | -+------------------------------------------------------------------------------------+-----------+--------+ diff --git a/hyppo/discrim/base.py b/hyppo/discrim/base.py index 4ade493f9..d638aaaef 100644 --- a/hyppo/discrim/base.py +++ b/hyppo/discrim/base.py @@ -5,7 +5,7 @@ class DiscriminabilityTest(ABC): r""" - A base class for a Discriminability test. + A base class for a discriminability test. """ def __init__(self): @@ -23,37 +23,11 @@ def statistic(self, x, y): Input data matrices. """ - rdfs = self._discr_rdf(x, y) + rdfs = _discr_rdf(x, y) stat = np.nanmean(rdfs) return stat - def _discr_rdf(self, dissimilarities, labels): - # calculates test statistics distribution - rdfs = [] - - for i, label in enumerate(labels): - di = dissimilarities[i] - - # All other samples except its own label - idx = labels == label - Dij = di[~idx] - - # All samples except itself - idx[i] = False - Dii = di[idx] - - rdf = [ - 1 - ((Dij < d).sum() + 0.5 * (Dij == d).sum()) / Dij.size for d in Dii - ] - rdfs.append(rdf) - - out = np.full((len(rdfs), max(map(len, rdfs))), np.nan) - for i, rdf in enumerate(rdfs): - out[i, : len(rdf)] = rdf - - return out - def _perm_stat(self, index): r""" Helper function that is used to calculate parallel permuted test @@ -73,6 +47,45 @@ def _perm_stat(self, index): @abstractmethod def test(self): r""" - Calculates the test statistic and p-value for Discriminability one sample - and two sample test. + Calculates the Discriminability test statistic and p-value. """ + + +def _discr_rdf(dissimilarities, labels): + """ + Calculates the distributions of the test statistics. + + Parameters + ---------- + dissimilarities : ndarray + Input matrix containing the disimilarities. + labels : ndarray + The label matrix corresponding to each dissimilarity. + + Returns + ------- + [type] + [description] + """ + # calculates test statistics distribution + rdfs = [] + + for i, label in enumerate(labels): + di = dissimilarities[i] + + # All other samples except its own label + idx = labels == label + Dij = di[~idx] + + # All samples except itself + idx[i] = False + Dii = di[idx] + + rdf = [1 - ((Dij < d).sum() + 0.5 * (Dij == d).sum()) / Dij.size for d in Dii] + rdfs.append(rdf) + + out = np.full((len(rdfs), max(map(len, rdfs))), np.nan) + for i, rdf in enumerate(rdfs): + out[i, : len(rdf)] = rdf + + return out diff --git a/hyppo/discrim/discrim_two_samp.py b/hyppo/discrim/discrim_two_samp.py index 584c1ba75..22a7c1fbd 100644 --- a/hyppo/discrim/discrim_two_samp.py +++ b/hyppo/discrim/discrim_two_samp.py @@ -10,18 +10,8 @@ class DiscrimTwoSample(DiscriminabilityTest): r""" A class that compares the discriminability of two datasets. Two sample test measures whether the discriminability is different for - one dataset compared to another. More details can be described in [#1Dscr]_. - Parameters - ---------- - is_dist : bool, optional (default: False) - Whether `x1` and `x2` are distance matrices or not. - remove_isolates : bool, optional (default: True) - Whether to remove the measurements with a single instance or not. - See Also - -------- - DiscrimOneSample : One sample test for discriminability of a single measurement - Notes - ----- + one dataset compared to another. More details can be described in `[1]`_. + Let :math:`\hat D_{x_1}` denote the sample discriminability of one approach, and :math:`\hat D_{x_2}` denote the sample discriminability of another approach. Then, @@ -30,6 +20,15 @@ class DiscrimTwoSample(DiscriminabilityTest): H_A: D_{x_1} &> D_{x_2} Alternatively, tests can be done for :math:`D_{x_1} < D_{x_2}` and :math:`D_{x_1} \neq D_{x_2}`. + + .. _[1]: https://www.biorxiv.org/content/10.1101/802629v1 + + Parameters + ---------- + is_dist : bool, default: False + Whether `x1` and `x2` are distance matrices or not. + remove_isolates : bool, default: True + Whether to remove the measurements with a single instance or not. """ def __init__(self, is_dist=False, remove_isolates=True): @@ -40,6 +39,7 @@ def __init__(self, is_dist=False, remove_isolates=True): def statistic(self, x, y): """ Helper function that calculates the discriminability test statistic. + Parameters ---------- x, y : ndarray @@ -48,6 +48,7 @@ def statistic(self, x, y): `n` is the number of samples and `p` and `q` are the number of dimensions. Alternatively, `x` and `y` can be distance matrices, where the shapes must both be `(n, n)`. + Returns ------- stat : float @@ -61,6 +62,7 @@ def test(self, x1, x2, y, reps=1000, alt="neq", workers=-1): r""" Calculates the test statistic and p-value for a two sample test for discriminability. + Parameters ---------- x1, x2 : ndarray @@ -84,6 +86,7 @@ def test(self, x1, x2, y, reps=1000, alt="neq", workers=-1): workers : int, optional (default: -1) The number of cores to parallelize the p-value computation over. Supply -1 to use all cores available to the Process. + Returns ------- d1 : float @@ -92,6 +95,7 @@ def test(self, x1, x2, y, reps=1000, alt="neq", workers=-1): The computed discriminability score for ``x2``. pvalue : float The computed two sample test p-value. + Examples -------- >>> import numpy as np @@ -157,10 +161,12 @@ def _perm_stat(self, index): # pragma: no cover r""" Helper function that is used to calculate parallel permuted test statistics. + Parameters ---------- index : int Iterator used for parallel statistic calculation + Returns ------- perm_stat1, perm_stat2 : float diff --git a/hyppo/independence/_utils.py b/hyppo/independence/_utils.py index 08d4d6a8a..e9292905b 100644 --- a/hyppo/independence/_utils.py +++ b/hyppo/independence/_utils.py @@ -94,7 +94,7 @@ def sim_matrix(model, x): terminals = model.apply(x) ntrees = terminals.shape[1] - prox_mat = np.sum( + prox_mat = sum( np.equal.outer(terminals[:, i], terminals[:, i]) for i in range(ntrees) ) prox_mat = prox_mat / ntrees diff --git a/hyppo/independence/base.py b/hyppo/independence/base.py index 65c9aab18..86c9b2d83 100644 --- a/hyppo/independence/base.py +++ b/hyppo/independence/base.py @@ -9,26 +9,36 @@ class IndependenceTest(ABC): Parameters ---------- - compute_distance : callable(), optional (default: None) + compute_distance : str, callable, or None, default: "euclidean" or "gaussian" A function that computes the distance among the samples within each data matrix. - Valid strings for ``metric`` are, as defined in - ``sklearn.metrics.pairwise_distances``, + Valid strings for ``compute_distance`` are, as defined in + :func:`sklearn.metrics.pairwise_distances`, - - From scikit-learn: [‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, - ‘manhattan’] See the documentation for scipy.spatial.distance for details + - From scikit-learn: [``"euclidean"``, ``"cityblock"``, ``"cosine"``, + ``"l1"``, ``"l2"``, ``"manhattan"``] See the documentation for + :mod:`scipy.spatial.distance` for details on these metrics. - - From scipy.spatial.distance: [‘braycurtis’, ‘canberra’, ‘chebyshev’, - ‘correlation’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, ‘mahalanobis’, - ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, - ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’] See the - documentation for scipy.spatial.distance for details on these metrics. - - Set to `None` or `precomputed` if `x` and `y` are already distance - matrices. To call a custom function, either create the distance matrix - before-hand or create a function of the form ``metric(x, **kwargs)`` - where `x` is the data matrix for which pairwise distances are - calculated and kwargs are extra arguements to send to your custom function. + - From scipy.spatial.distance: [``"braycurtis"``, ``"canberra"``, + ``"chebyshev"``, ``"correlation"``, ``"dice"``, ``"hamming"``, + ``"jaccard"``, ``"kulsinski"``, ``"mahalanobis"``, ``"minkowski"``, + ``"rogerstanimoto"``, ``"russellrao"``, ``"seuclidean"``, + ``"sokalmichener"``, ``"sokalsneath"``, ``"sqeuclidean"``, + ``"yule"``] See the documentation for :mod:`scipy.spatial.distance` for + details on these metrics. + + Alternatively, this function computes the kernel similarity among the + samples within each data matrix. + Valid strings for ``compute_kernel`` are, as defined in + :func:`sklearn.metrics.pairwise.pairwise_kernels`, + + [``"additive_chi2"``, ``"chi2"``, ``"linear"``, ``"poly"``, + ``"polynomial"``, ``"rbf"``, + ``"laplacian"``, ``"sigmoid"``, ``"cosine"``] + + Note ``"rbf"`` and ``"gaussian"`` are the same metric. + **kwargs + Arbitrary keyword arguments for ``compute_distkern``. """ def __init__(self, compute_distance=None, **kwargs): @@ -47,30 +57,43 @@ def statistic(self, x, y): Parameters ---------- - x, y : ndarray - Input data matrices. + x,y : ndarray + Input data matrices. ``x`` and ``y`` must have the same number of + samples. That is, the shapes must be ``(n, p)`` and ``(n, q)`` where + `n` is the number of samples and `p` and `q` are the number of + dimensions. Alternatively, ``x`` and ``y`` can be distance matrices, + where the shapes must both be ``(n, n)``. """ @abstractmethod def test(self, x, y, reps=1000, workers=1, is_distsim=True, perm_blocks=None): r""" - Calulates the independence test p-value. + Calulates the independence test statistic and p-value. Parameters ---------- - x, y : ndarray - Input data matrices. - reps : int, optional - The number of replications used in permutation, by default 1000. - workers : int, optional (default: 1) - Evaluates method using `multiprocessing.Pool `). - Supply `-1` to use all cores available to the Process. - perm_blocks : 2d ndarray, optional - Restricts permutations to account for dependencies in data. Columns - recursively partition samples based on unique labels. Groups at - each partition are exchangeable under a permutation but remain - fixed if label is negative. - perm_blocks : ndarray, optional (default None) + x,y : ndarray + Input data matrices. ``x`` and ``y`` must have the same number of + samples. That is, the shapes must be ``(n, p)`` and ``(n, q)`` where + `n` is the number of samples and `p` and `q` are the number of + dimensions. Alternatively, ``x`` and ``y`` can be distance matrices, + where the shapes must both be ``(n, n)``. + reps : int, default: 1000 + The number of replications used to estimate the null distribution + when using the permutation test used to calculate the p-value. + workers : int, default: 1 + The number of cores to parallelize the p-value computation over. + Supply ``-1`` to use all cores available to the Process. + auto : bool, default: True + Automatically uses fast approximation when `n` and size of array + is greater than 20. If ``True``, and sample size is greater than 20, then + :class:`hyppo.tools.chi2_approx` will be run. Parameters ``reps`` and + ``workers`` are + irrelevant in this case. Otherwise, :class:`hyppo.tools.perm_test` will be + run. + is_distsim : bool, default: True + Whether or not ``x`` and ``y`` are input matrices. + perm_blocks : None or ndarray, default: None Defines blocks of exchangeable samples during the permutation test. If None, all samples can be permuted with one another. Requires `n` rows. At each column, samples with matching column value are @@ -84,7 +107,7 @@ def test(self, x, y, reps=1000, workers=1, is_distsim=True, perm_blocks=None): stat : float The computed independence test statistic. pvalue : float - The pvalue obtained via permutation. + The computed independence p-value. """ self.x = x self.y = y diff --git a/hyppo/independence/dcorr.py b/hyppo/independence/dcorr.py index 3e503ba10..5534ddd19 100644 --- a/hyppo/independence/dcorr.py +++ b/hyppo/independence/dcorr.py @@ -136,7 +136,7 @@ def statistic(self, x, y): distx = x disty = y - if not self.is_distance and not self.is_fast: + if not (self.is_distance or self.is_fast): distx, disty = compute_dist( x, y, metric=self.compute_distance, **self.kwargs ) @@ -235,13 +235,16 @@ def test(self, x, y, reps=1000, workers=1, auto=True, perm_blocks=None): self.pvalue = pvalue self.null_dist = None else: - is_distsim = False if not self.is_fast: x, y = compute_dist(x, y, metric=self.compute_distance, **self.kwargs) self.is_distance = True - is_distsim = True stat, pvalue = super(Dcorr, self).test( - x, y, reps, workers, perm_blocks=perm_blocks, is_distsim=is_distsim + x, + y, + reps, + workers, + perm_blocks=perm_blocks, + is_distsim=self.is_distance, ) return stat, pvalue @@ -273,10 +276,9 @@ def _center_distmat(distx, bias): # pragma: no cover @jit(nopython=True, cache=True) def _cpu_cumsum(data): # pragma: no cover """Create cumulative sum since numba doesn't sum over axes.""" - cumsum = data - if data.shape[0] != 1 and data.shape[1] != 1: - for i in range(1, data.shape[0]): - cumsum[i, :] = data[i, :] + cumsum[i - 1, :] + cumsum = data.copy() + for i in range(1, data.shape[0]): + cumsum[i, :] = data[i, :] + cumsum[i - 1, :] return cumsum @@ -289,72 +291,81 @@ def _fast_1d_dcov(x, y, bias=False): # pragma: no cover n = x.shape[0] # sort inputs - x = np.sort(x.ravel()) - y = y[np.argsort(x)] + x_orig = x.ravel() + x = np.sort(x_orig) + y = y[np.argsort(x_orig)] x = x.reshape(-1, 1) # for numba # cumulative sum si = _cpu_cumsum(x) - ax = np.arange(-(n - 2), n + 1, 2).reshape(-1, 1) * x + ( - si[-1] - 2 * si.copy().reshape(-1, 1) - ) + ax = (np.arange(-(n - 2), n + 1, 2) * x.ravel()).reshape(-1, 1) + (si[-1] - 2 * si) v = np.hstack((x, y, x * y)) nw = v.shape[1] idx = np.vstack((np.arange(n), np.zeros(n))).astype(np.int64).T - ivs = [np.zeros(n)] * 4 + iv1 = np.zeros((n, 1)) + iv2 = np.zeros((n, 1)) + iv3 = np.zeros((n, 1)) + iv4 = np.zeros((n, 1)) - i = 0 + i = 1 r = 0 s = 1 while i < n: - gap = 2 * (i + 1) + gap = 2 * i k = 0 idx_r = idx[:, r] - csumv = np.vstack((np.zeros(nw).reshape(1, -1), _cpu_cumsum(v[idx_r, :]))) - for j in range(0, n, gap): - sts = [j, j + i] - es = [min(sts[0] + i, n), min(sts[1] + i, n)] + csumv = np.vstack((np.zeros((1, nw)), _cpu_cumsum(v[idx_r, :]))) + + for j in range(1, n + 1, gap): + st1 = j - 1 + e1 = min(st1 + i - 1, n - 1) + st2 = j + i - 1 + e2 = min(st2 + i - 1, n - 1) - while (sts[0] < es[0]) and (sts[1] < es[1]): - indexes = [idx_r[sts[0]], idx_r[sts[1]]] + while (st1 <= e1) and (st2 <= e2): + idx1 = idx_r[st1] + idx2 = idx_r[st2] - if y[indexes[0]] >= y[indexes[1]]: - idx[k, s] = indexes[0] - sts[0] += 1 + if y[idx1] >= y[idx2]: + idx[k, s] = idx1 + st1 += 1 else: - idx[k, s] = indexes[1] - sts[1] += 1 - ivs[0][indexes[1]] += es[0] - sts[0] + 1 - ivs[1][indexes[1]] += csumv[es[0], 0] - csumv[sts[0], 0] - ivs[2][indexes[1]] += csumv[es[0], 1] - csumv[sts[0], 1] - ivs[3][indexes[1]] += csumv[es[0], 2] - csumv[sts[0], 2] + idx[k, s] = idx2 + st2 += 1 + iv1[idx2] += e1 - st1 + 1 + iv2[idx2] += csumv[e1 + 1, 0] - csumv[st1, 0] + iv3[idx2] += csumv[e1 + 1, 1] - csumv[st1, 1] + iv4[idx2] += csumv[e1 + 1, 2] - csumv[st1, 2] k += 1 - if sts[0] < es[0]: - kf = k + es[0] - sts[0] - idx[k:kf, s] = idx_r[sts[0] : es[0]] + if st1 <= e1: + kf = k + e1 - st1 + 1 + idx[k:kf, s] = idx_r[st1 : e1 + 1] k = kf - elif sts[1] < es[1]: - kf = k + es[1] - sts[1] - idx[k:kf, s] = idx_r[sts[1] : es[1]] + elif st2 <= e2: + kf = k + e2 - st2 + 1 + idx[k:kf, s] = idx_r[st2 : e2 + 1] k = kf i = gap r = 1 - r s = 1 - s - covterm = n * (x - np.mean(x)).T @ (y - np.mean(y)) - cs = [ivs[0].T @ v[:, 2].copy(), np.sum(ivs[3]), ivs[1].T @ y, ivs[2].T @ x] - d = 4 * ((cs[0] + cs[1]) - (cs[2] + cs[3])) - 2 * covterm + covterm = np.sum(n * (x - np.mean(x)).T @ (y - np.mean(y))) + c1 = np.sum(iv1.T @ v[:, 2].copy()) + c2 = np.sum(iv4) + c3 = np.sum(iv2.T @ y) + c4 = np.sum(iv3.T @ x) + d = 4 * ((c1 + c2) - (c3 + c4)) - 2 * covterm - y = y[np.flip(idx[:, r])] - si = _cpu_cumsum(y) + y_sorted = y[idx[n::-1, r], :] + si = _cpu_cumsum(y_sorted) by = np.zeros((n, 1)) - by[np.flip(idx[:, r])] = np.arange(-(n - 2), n + 1, 2).reshape(-1, 1) * y + ( - si[-1] - 2 * si.copy().reshape(-1, 1) - ) + by[idx[::-1, r]] = (np.arange(-(n - 2), n + 1, 2) * y_sorted.ravel()).reshape( + -1, 1 + ) + (si[-1] - 2 * si) if bias: denom = [n ** 2, n ** 3, n ** 4] @@ -393,8 +404,7 @@ def _dcov(distx, disty, bias=False, only_dcov=True): # pragma: no cover @jit(nopython=True, cache=True) def _dcorr(distx, disty, bias=False, is_fast=False): # pragma: no cover """ - Calculate the Dcorr test statistic. Note that though Dcov is calculated - and stored in covar, but not called due to a slower implementation. + Calculate the Dcorr test statistic. """ if is_fast: # calculate covariances and variances diff --git a/hyppo/independence/hsic.py b/hyppo/independence/hsic.py index f121146a1..0b4710c7d 100644 --- a/hyppo/independence/hsic.py +++ b/hyppo/independence/hsic.py @@ -3,7 +3,7 @@ from ..tools import chi2_approx, compute_kern from ._utils import _CheckInputs from .base import IndependenceTest -from .dcorr import _dcov +from .dcorr import Dcorr class Hsic(IndependenceTest): @@ -31,11 +31,14 @@ class Hsic(IndependenceTest): \mathrm{Hsic}^b_n (x, y) = \frac{1}{n^2} \mathrm{tr} (D^x H D^y H) - They are exactly equivalent in the sense that every valid kernel has a corresponding + Hsic and Dcov are exactly equivalent in the sense that every valid kernel has a + corresponding valid semimetric to ensure their equivalence, and vice versa `[3]`_ `[4]`_. In other words, every Dcorr test is also an Hsic and vice versa. Nonetheless, implementations of Dcorr and Hsic use different metrics by default: Dcorr uses a euclidean distance while Hsic uses a Gaussian median kernel. + We consider the normalized version (see :class:`hyppo.independence`) for the + transformation. The p-value returned is calculated using a permutation test using :meth:`hyppo.tools.perm_test`. The fast version of the test uses @@ -108,7 +111,8 @@ def statistic(self, x, y): distx = 1 - kernx / np.max(kernx) disty = 1 - kerny / np.max(kerny) - stat = _dcov(distx, disty, self.bias) + # Hsic and Dcorr are equivalent, cannot use dcov otherwise fast is invalid + stat = Dcorr(bias=self.bias, compute_distance=None).statistic(distx, disty) self.stat = stat return stat @@ -155,7 +159,7 @@ def test(self, x, y, reps=1000, workers=1, auto=True): >>> y = x >>> stat, pvalue = Hsic().test(x, y) >>> '%.1f, %.2f' % (stat, pvalue) - '0.1, 0.00' + '1.0, 0.00' """ check_input = _CheckInputs( x, diff --git a/hyppo/independence/tests/test_cca.py b/hyppo/independence/tests/test_cca.py index 5bb3d4c6c..3c1858fe9 100644 --- a/hyppo/independence/tests/test_cca.py +++ b/hyppo/independence/tests/test_cca.py @@ -2,7 +2,7 @@ import pytest from numpy.testing import assert_almost_equal -from ...tools import joint_normal, linear +from ...tools import joint_normal, linear, power from .. import CCA @@ -28,3 +28,18 @@ def test_linear_threed(self, n, obs_stat, obs_pvalue): assert_almost_equal(stat, obs_stat, decimal=1) assert_almost_equal(pvalue, obs_pvalue, decimal=1) + + +class TestCCATypeIError: + def test_oned(self): + np.random.seed(123456789) + est_power = power( + "CCA", + sim_type="indep", + sim="multimodal_independence", + n=1000, + p=1, + alpha=0.05, + ) + + assert_almost_equal(est_power, 0.05, decimal=2) diff --git a/hyppo/independence/tests/test_dcorr.py b/hyppo/independence/tests/test_dcorr.py index c6343f096..b655db2a5 100644 --- a/hyppo/independence/tests/test_dcorr.py +++ b/hyppo/independence/tests/test_dcorr.py @@ -2,7 +2,7 @@ import pytest from numpy.testing import assert_almost_equal, assert_raises, assert_warns -from ...tools import linear +from ...tools import linear, power from .. import Dcorr @@ -19,3 +19,46 @@ def test_linear_oned(self, n, obs_stat, obs_pvalue): assert_almost_equal(stat1, obs_stat, decimal=2) assert_almost_equal(stat2, obs_stat, decimal=2) assert_almost_equal(pvalue1, obs_pvalue, decimal=2) + + +class TestDcorrTypeIError: + def test_oned(self): + np.random.seed(123456789) + est_power = power( + "Dcorr", + sim_type="indep", + sim="multimodal_independence", + n=100, + p=1, + alpha=0.05, + ) + + assert_almost_equal(est_power, 0.05, decimal=2) + + def test_oned_fast(self): + np.random.seed(123456789) + est_power = power( + "Dcorr", + sim_type="indep", + sim="multimodal_independence", + n=100, + p=1, + alpha=0.05, + auto=True, + ) + + assert_almost_equal(est_power, 0.05, decimal=2) + + def test_threed_fast(self): + np.random.seed(123456789) + est_power = power( + "Dcorr", + sim_type="indep", + sim="multimodal_independence", + n=100, + p=3, + alpha=0.05, + auto=True, + ) + + assert_almost_equal(est_power, 0.05, decimal=2) diff --git a/hyppo/independence/tests/test_hhg.py b/hyppo/independence/tests/test_hhg.py index da6f42130..066d8a0d3 100644 --- a/hyppo/independence/tests/test_hhg.py +++ b/hyppo/independence/tests/test_hhg.py @@ -3,7 +3,7 @@ from numpy.testing import assert_almost_equal from sklearn.metrics import pairwise_distances -from ...tools import linear +from ...tools import linear, power from .. import HHG @@ -28,3 +28,18 @@ def test_diststat(self): stat = HHG().statistic(distx, disty) assert_almost_equal(stat, 950600.0, decimal=2) + + +class TestHHGTypeIError: + def test_oned(self): + np.random.seed(123456789) + est_power = power( + "HHG", + sim_type="indep", + sim="multimodal_independence", + n=50, + p=1, + alpha=0.05, + ) + + assert_almost_equal(est_power, 0.05, decimal=2) diff --git a/hyppo/independence/tests/test_hsic.py b/hyppo/independence/tests/test_hsic.py index 2a26ae942..d07c7e527 100644 --- a/hyppo/independence/tests/test_hsic.py +++ b/hyppo/independence/tests/test_hsic.py @@ -3,12 +3,12 @@ from numpy.testing import assert_almost_equal from sklearn.metrics.pairwise import pairwise_kernels -from ...tools import linear +from ...tools import linear, power from .. import Hsic class TestHsicStat: - @pytest.mark.parametrize("n, obs_stat", [(100, 0.107), (200, 0.102)]) + @pytest.mark.parametrize("n, obs_stat", [(100, 1.0), (200, 1.0)]) @pytest.mark.parametrize("obs_pvalue", [1 / 1000]) def test_linear_oned(self, n, obs_stat, obs_pvalue): np.random.seed(123456789) @@ -25,5 +25,34 @@ def test_kernstat(self): kerny = pairwise_kernels(y, y) stat, pvalue = Hsic(compute_kernel=None).test(kernx, kerny, auto=False) - assert_almost_equal(stat, 0.124, decimal=2) + assert_almost_equal(stat, 1.0, decimal=2) assert_almost_equal(pvalue, 1 / 1000, decimal=2) + + +class TestHsicTypeIError: + def test_oned(self): + np.random.seed(123456789) + est_power = power( + "Hsic", + sim_type="indep", + sim="multimodal_independence", + n=200, + p=1, + alpha=0.05, + ) + + assert_almost_equal(est_power, 0.05, decimal=2) + + def test_oned_fast(self): + np.random.seed(123456789) + est_power = power( + "Hsic", + sim_type="indep", + sim="multimodal_independence", + n=500, + p=1, + alpha=0.05, + auto=True, + ) + + assert_almost_equal(est_power, 0.05, decimal=2) diff --git a/hyppo/independence/tests/test_kmerf.py b/hyppo/independence/tests/test_kmerf.py index f37ad80e7..5d4b4719b 100644 --- a/hyppo/independence/tests/test_kmerf.py +++ b/hyppo/independence/tests/test_kmerf.py @@ -1,8 +1,8 @@ import numpy as np import pytest -from numpy.testing import assert_approx_equal, assert_raises +from numpy.testing import assert_almost_equal, assert_approx_equal, assert_raises -from ...tools import linear, multimodal_independence, spiral +from ...tools import linear, multimodal_independence, power, spiral from .. import KMERF @@ -38,3 +38,20 @@ class TestKmerfErrorWarn: def test_no_indeptest(self): # raises error if not indep test assert_raises(ValueError, KMERF, "abcd") + + +# code too slow to run, check benchmarks in docs for verification +@pytest.mark.skip +class TestKmerfTypeIError: + def test_oned(self): + np.random.seed(123456789) + est_power = power( + "KMERF", + sim_type="indep", + sim="multimodal_independence", + n=5, + p=1, + alpha=0.05, + ) + + assert_almost_equal(est_power, 0.05, decimal=2) diff --git a/hyppo/independence/tests/test_maxmargin.py b/hyppo/independence/tests/test_maxmargin.py index 80337a84e..c633aaf03 100644 --- a/hyppo/independence/tests/test_maxmargin.py +++ b/hyppo/independence/tests/test_maxmargin.py @@ -1,10 +1,8 @@ import numpy as np import pytest from numpy.testing import assert_almost_equal, assert_raises -from sklearn.metrics import pairwise_distances -from sklearn.metrics.pairwise import pairwise_kernels -from ...tools import linear +from ...tools import linear, power from .. import MaxMargin @@ -26,7 +24,7 @@ def test_kernstat(self): x, y = linear(100, 1) stat, pvalue = MaxMargin("Hsic").test(x, y, auto=False) - assert_almost_equal(stat, 0.1072, decimal=2) + assert_almost_equal(stat, 1.0, decimal=2) assert_almost_equal(pvalue, 1 / 1000, decimal=2) @@ -36,3 +34,19 @@ class TestMaxMarginErrorWarn: def test_no_indeptest(self): # raises error if not indep test assert_raises(ValueError, MaxMargin, "abcd") + + +class TestMaxMarginTypeIError: + def test_oned(self): + np.random.seed(123456789) + est_power = power( + ["MaxMargin", "Dcorr"], + sim_type="indep", + sim="multimodal_independence", + n=100, + p=1, + alpha=0.05, + auto=True, + ) + + assert_almost_equal(est_power, 0.05, decimal=2) diff --git a/hyppo/independence/tests/test_mgc.py b/hyppo/independence/tests/test_mgc.py index 2fa8f8a5b..ece652da7 100644 --- a/hyppo/independence/tests/test_mgc.py +++ b/hyppo/independence/tests/test_mgc.py @@ -1,8 +1,8 @@ import numpy as np import pytest -from numpy.testing import assert_approx_equal +from numpy.testing import assert_almost_equal, assert_approx_equal -from ...tools import linear, multimodal_independence, spiral +from ...tools import linear, multimodal_independence, power, spiral from .. import MGC @@ -14,7 +14,6 @@ class TestMGCStat(object): [ (linear, 0.97, 1 / 1000), # test linear simulation (spiral, 0.163, 1 / 1000), # test spiral simulation - (multimodal_independence, -0.0094, 0.78), # test independence simulation ], ) def test_oned(self, sim, obs_stat, obs_pvalue): @@ -49,3 +48,18 @@ def test_fived(self, sim, obs_stat, obs_pvalue): assert_approx_equal(stat1, obs_stat, significant=1) assert_approx_equal(stat2, obs_stat, significant=1) assert_approx_equal(pvalue, obs_pvalue, significant=1) + + +class TestMGCTypeIError: + def test_oned(self): + np.random.seed(123456789) + est_power = power( + "MGC", + sim_type="indep", + sim="multimodal_independence", + n=50, + p=1, + alpha=0.05, + ) + + assert_almost_equal(est_power, 0.05, decimal=2) diff --git a/hyppo/independence/tests/test_rvcorr.py b/hyppo/independence/tests/test_rvcorr.py index b1c77ee06..fcd783ce4 100644 --- a/hyppo/independence/tests/test_rvcorr.py +++ b/hyppo/independence/tests/test_rvcorr.py @@ -2,7 +2,7 @@ import pytest from numpy.testing import assert_almost_equal, assert_raises, assert_warns -from ...tools import linear +from ...tools import linear, power from .. import RV @@ -17,3 +17,18 @@ def test_linear_oned(self, n, obs_stat, obs_pvalue): assert_almost_equal(stat, obs_stat, decimal=2) assert_almost_equal(pvalue, obs_pvalue, decimal=2) + + +class TestCCATypeIError: + def test_oned(self): + np.random.seed(123456789) + est_power = power( + "RV", + sim_type="indep", + sim="multimodal_independence", + n=1000, + p=1, + alpha=0.05, + ) + + assert_almost_equal(est_power, 0.05, decimal=2) diff --git a/hyppo/ksample/base.py b/hyppo/ksample/base.py index a98f1a9a9..0a7d3852a 100644 --- a/hyppo/ksample/base.py +++ b/hyppo/ksample/base.py @@ -3,33 +3,43 @@ class KSampleTest(ABC): """ - A base class for a k-sample test. + A base class for a *k*-sample test. Parameters ---------- - compute_distance : callable(), optional (default: None) + compute_distance : str, callable, or None, default: "euclidean" or "gaussian" A function that computes the distance among the samples within each data matrix. - Valid strings for ``metric`` are, as defined in - ``sklearn.metrics.pairwise_distances``, + Valid strings for ``compute_distance`` are, as defined in + :func:`sklearn.metrics.pairwise_distances`, - - From scikit-learn: [‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, - ‘manhattan’] See the documentation for scipy.spatial.distance for details + - From scikit-learn: [``"euclidean"``, ``"cityblock"``, ``"cosine"``, + ``"l1"``, ``"l2"``, ``"manhattan"``] See the documentation for + :mod:`scipy.spatial.distance` for details on these metrics. - - From scipy.spatial.distance: [‘braycurtis’, ‘canberra’, ‘chebyshev’, - ‘correlation’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, ‘mahalanobis’, - ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, - ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’] See the - documentation for scipy.spatial.distance for details on these metrics. + - From scipy.spatial.distance: [``"braycurtis"``, ``"canberra"``, + ``"chebyshev"``, ``"correlation"``, ``"dice"``, ``"hamming"``, + ``"jaccard"``, ``"kulsinski"``, ``"mahalanobis"``, ``"minkowski"``, + ``"rogerstanimoto"``, ``"russellrao"``, ``"seuclidean"``, + ``"sokalmichener"``, ``"sokalsneath"``, ``"sqeuclidean"``, + ``"yule"``] See the documentation for :mod:`scipy.spatial.distance` for + details on these metrics. - Set to `None` or `precomputed` if `x` and `y` are already distance - matrices. To call a custom function, either create the distance matrix - before-hand or create a function of the form ``metric(x, **kwargs)`` - where `x` is the data matrix for which pairwise distances are - calculated and kwargs are extra arguements to send to your custom function. + Alternatively, this function computes the kernel similarity among the + samples within each data matrix. + Valid strings for ``compute_kernel`` are, as defined in + :func:`sklearn.metrics.pairwise.pairwise_kernels`, + + [``"additive_chi2"``, ``"chi2"``, ``"linear"``, ``"poly"``, + ``"polynomial"``, ``"rbf"``, + ``"laplacian"``, ``"sigmoid"``, ``"cosine"``] + + Note ``"rbf"`` and ``"gaussian"`` are the same metric. bias : bool (default: False) Whether or not to use the biased or unbiased test statistics. Only applies to ``Dcorr`` and ``Hsic``. + **kwargs + Arbitrary keyword arguments for ``compute_distkern``. """ def __init__(self, compute_distance=None, bias=False, **kwargs): @@ -43,28 +53,47 @@ def __init__(self, compute_distance=None, bias=False, **kwargs): super().__init__() @abstractmethod - def statistic(self, inputs): + def statistic(self, *args): r""" Calulates the *k*-sample test statistic. Parameters ---------- - inputs : ndarray - Input data matrices. + *args : ndarray + Variable length input data matrices. All inputs must have the same + number of dimensions. That is, the shapes must be `(n, p)` and + `(m, p)`, ... where `n`, `m`, ... are the number of samples and `p` is + the number of dimensions. + + Returns + ------- + stat : float + The computed *k*-Sample statistic. """ @abstractmethod - def test(self, inputs, reps=1000, workers=1): + def test(self, *args, reps=1000, workers=1): r""" - Calulates the k-sample test p-value. + Calculates the *k*-sample test statistic and p-value. Parameters ---------- - inputs : list of ndarray - Input data matrices. - reps : int, optional - The number of replications used in permutation, by default 1000. - workers : int, optional (default: 1) - Evaluates method using `multiprocessing.Pool `). - Supply `-1` to use all cores available to the Process. + *args : ndarray + Variable length input data matrices. All inputs must have the same + number of dimensions. That is, the shapes must be `(n, p)` and + `(m, p)`, ... where `n`, `m`, ... are the number of samples and `p` is + the number of dimensions. + reps : int, default: 1000 + The number of replications used to estimate the null distribution + when using the permutation test used to calculate the p-value. + workers : int, default: 1 + The number of cores to parallelize the p-value computation over. + Supply ``-1`` to use all cores available to the Process. + + Returns + ------- + stat : float + The computed *k*-sample statistic. + pvalue : float + The computed *k*-sample p-value. """ diff --git a/hyppo/ksample/disco.py b/hyppo/ksample/disco.py index 85d7346f8..2cb79f32f 100644 --- a/hyppo/ksample/disco.py +++ b/hyppo/ksample/disco.py @@ -119,7 +119,7 @@ def statistic(self, *args): distu, distv = compute_dist(u, v, metric=self.compute_distance, **self.kwargs) - # exact equivalence transformation Dcorr and DISCO + # exact equivalence transformation Dcov and DISCO stat = _dcov(distu, distv, self.bias) * np.sum(N) * len(N) / 2 self.stat = stat diff --git a/hyppo/ksample/ksamp.py b/hyppo/ksample/ksamp.py index f4568e250..220cf0cc6 100644 --- a/hyppo/ksample/ksamp.py +++ b/hyppo/ksample/ksamp.py @@ -143,15 +143,6 @@ class KSample(KSampleTest): ``"yule"``] See the documentation for :mod:`scipy.spatial.distance` for details on these metrics. - Set to ``None`` or ``"precomputed"`` if ``x`` and ``y`` are already distance - or similarity - matrices. To call a custom function, either create the distance or similarity - matrix - before-hand or create a function of the form ``metric(x, **kwargs)`` - where ``x`` is the data matrix for which pairwise distances or similarities are - calculated and ``**kwargs`` are extra arguements to send to your custom - function. - Alternatively, this function computes the kernel similarity among the samples within each data matrix. Valid strings for ``compute_kernel`` are, as defined in @@ -230,7 +221,7 @@ def statistic(self, *args): Returns ------- stat : float - The computed *k*-Sample statistic. + The computed *k*-sample statistic. """ inputs = list(args) if self.indep_test_name == "kmerf": @@ -269,9 +260,9 @@ def test(self, *args, reps=1000, workers=1, auto=True): Returns ------- stat : float - The computed *k*-Sample statistic. + The computed *k*-sample statistic. pvalue : float - The computed *k*-Sample p-value. + The computed *k*-sample p-value. dict A dictionary containing optional parameters for tests that return them. See the relevant test in :mod:`hyppo.independence`. @@ -285,7 +276,7 @@ def test(self, *args, reps=1000, workers=1, auto=True): >>> z = np.arange(10) >>> stat, pvalue = KSample("Dcorr").test(x, y) >>> '%.3f, %.1f' % (stat, pvalue) - '0.000, 1.0' + '-0.136, 1.0' """ inputs = list(args) check_input = _CheckInputs( diff --git a/hyppo/ksample/mmd.py b/hyppo/ksample/mmd.py index 804a1aa8c..9953bca2e 100644 --- a/hyppo/ksample/mmd.py +++ b/hyppo/ksample/mmd.py @@ -155,7 +155,7 @@ def test(self, x, y, reps=1000, workers=1, auto=True): >>> y = x >>> stat, pvalue = MMD().test(x, y) >>> '%.3f, %.1f' % (stat, pvalue) - '-0.000, 1.0' + '-0.015, 1.0' """ check_input = _CheckInputs( inputs=[x, y], diff --git a/hyppo/ksample/tests/test_disco.py b/hyppo/ksample/tests/test_disco.py index 7e0fa3fcd..879678873 100644 --- a/hyppo/ksample/tests/test_disco.py +++ b/hyppo/ksample/tests/test_disco.py @@ -3,7 +3,7 @@ from numpy.testing import assert_almost_equal, assert_raises from sklearn.metrics import pairwise_distances -from ...tools import rot_ksamp +from ...tools import power, rot_ksamp from .. import DISCO @@ -30,3 +30,20 @@ def test_diffshape(self): y = np.arange(10) assert_raises(ValueError, DISCO().statistic, x, y) assert_raises(ValueError, DISCO().test, x, y) + + +class TestDISCOTypeIError: + def test_oned(self): + np.random.seed(123456789) + est_power = power( + "DISCO", + sim_type="ksamp", + sim="multimodal_independence", + k=2, + n=100, + p=1, + alpha=0.05, + auto=True, + ) + + assert_almost_equal(est_power, 0.05, decimal=2) diff --git a/hyppo/ksample/tests/test_energy.py b/hyppo/ksample/tests/test_energy.py index 9bee16516..f00e91a23 100644 --- a/hyppo/ksample/tests/test_energy.py +++ b/hyppo/ksample/tests/test_energy.py @@ -1,8 +1,8 @@ import numpy as np import pytest -from numpy.testing import assert_almost_equal, assert_raises +from numpy.testing import assert_almost_equal -from ...tools import rot_ksamp +from ...tools import power, rot_ksamp from .. import Energy @@ -18,3 +18,20 @@ def test_linear_oned(self, n, obs_stat, obs_pvalue): assert_almost_equal(stat, obs_stat, decimal=1) assert_almost_equal(pvalue, obs_pvalue, decimal=1) + + +class TestEnergyTypeIError: + def test_oned(self): + np.random.seed(123456789) + est_power = power( + "Energy", + sim_type="ksamp", + sim="multimodal_independence", + k=2, + n=100, + p=1, + alpha=0.05, + auto=True, + ) + + assert_almost_equal(est_power, 0.05, decimal=2) diff --git a/hyppo/ksample/tests/test_hotelling.py b/hyppo/ksample/tests/test_hotelling.py index 451835f95..efb4d817f 100644 --- a/hyppo/ksample/tests/test_hotelling.py +++ b/hyppo/ksample/tests/test_hotelling.py @@ -1,8 +1,8 @@ import numpy as np import pytest -from numpy.testing import assert_almost_equal, assert_raises +from numpy.testing import assert_almost_equal -from ...tools import rot_ksamp +from ...tools import power, rot_ksamp from .. import Hotelling @@ -21,3 +21,19 @@ def test_linear_oned(self, n, obs_stat, obs_pvalue): assert_almost_equal(stat, obs_stat, decimal=1) assert_almost_equal(pvalue, obs_pvalue, decimal=1) + + +class TestHotellingTypeIError: + def test_oned(self): + np.random.seed(123456789) + est_power = power( + "Hotelling", + sim_type="ksamp", + sim="multimodal_independence", + k=2, + n=100, + p=1, + alpha=0.05, + ) + + assert_almost_equal(est_power, 0.05, decimal=2) diff --git a/hyppo/ksample/tests/test_ksamp.py b/hyppo/ksample/tests/test_ksamp.py index be866a778..386b7b39c 100644 --- a/hyppo/ksample/tests/test_ksamp.py +++ b/hyppo/ksample/tests/test_ksamp.py @@ -2,7 +2,7 @@ import pytest from numpy.testing import assert_almost_equal, assert_raises -from ...tools import rot_ksamp +from ...tools import power, rot_ksamp from .. import KSample @@ -45,3 +45,20 @@ def test_no_second_maxmargin(self): np.random.seed(123456789) x, y = rot_ksamp("linear", 50, 1, k=2) assert_raises(ValueError, KSample, ["MaxMargin", "abcd"]) + + +class TestKSampleTypeIError: + def test_oned(self): + np.random.seed(123456789) + est_power = power( + "CCA", + sim_type="ksamp", + sim="multimodal_independence", + k=2, + n=100, + p=1, + alpha=0.05, + auto=True, + ) + + assert_almost_equal(est_power, 0.05, decimal=2) diff --git a/hyppo/ksample/tests/test_manova.py b/hyppo/ksample/tests/test_manova.py index 3f53f27d6..8b5b220f6 100644 --- a/hyppo/ksample/tests/test_manova.py +++ b/hyppo/ksample/tests/test_manova.py @@ -2,7 +2,7 @@ import pytest from numpy.testing import assert_almost_equal, assert_raises -from ...tools import rot_ksamp +from ...tools import power, rot_ksamp from .. import MANOVA @@ -28,3 +28,18 @@ def test_no_indeptest(self): x = np.arange(100).reshape(5, 20) y = np.arange(50, 150).reshape(5, 20) assert_raises(ValueError, MANOVA().test, x, y) + + +class TestManovaTypeIError: + def test_oned(self): + np.random.seed(123456789) + est_power = power( + "MANOVA", + sim_type="gauss", + sim="multimodal_independence", + case=1, + n=100, + alpha=0.05, + ) + + assert est_power <= 0.05 diff --git a/hyppo/ksample/tests/test_mmd.py b/hyppo/ksample/tests/test_mmd.py index d139816d1..3dd4e8d3e 100644 --- a/hyppo/ksample/tests/test_mmd.py +++ b/hyppo/ksample/tests/test_mmd.py @@ -2,7 +2,7 @@ import pytest from numpy.testing import assert_almost_equal -from ...tools import rot_ksamp +from ...tools import power, rot_ksamp from .. import MMD @@ -18,3 +18,20 @@ def test_linear_oned(self, n, obs_stat, obs_pvalue): assert_almost_equal(stat, obs_stat, decimal=1) assert_almost_equal(pvalue, obs_pvalue, decimal=1) + + +class TestMMDTypeIError: + def test_oned(self): + np.random.seed(123456789) + est_power = power( + "MMD", + sim_type="ksamp", + sim="multimodal_independence", + k=2, + n=100, + p=1, + alpha=0.05, + auto=True, + ) + + assert_almost_equal(est_power, 0.05, decimal=2) diff --git a/hyppo/time_series/base.py b/hyppo/time_series/base.py index f943e5d10..178be148e 100644 --- a/hyppo/time_series/base.py +++ b/hyppo/time_series/base.py @@ -8,41 +8,38 @@ class TimeSeriesTest(ABC): """ - Base class for time series in hyppo. + A base class for a time-series test. Parameters ---------- - compute_distance : callable, optional (default: None) + compute_distance : str, callable, or None, default: "euclidean" A function that computes the distance among the samples within each data matrix. - Valid strings for ``metric`` are, as defined in - ``sklearn.metrics.pairwise_distances``, + Valid strings for ``compute_distance`` are, as defined in + :func:`sklearn.metrics.pairwise_distances`, - - From scikit-learn: [‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, - ‘manhattan’] See the documentation for scipy.spatial.distance for details + - From scikit-learn: [``"euclidean"``, ``"cityblock"``, ``"cosine"``, + ``"l1"``, ``"l2"``, ``"manhattan"``] See the documentation for + :mod:`scipy.spatial.distance` for details on these metrics. - - From scipy.spatial.distance: [‘braycurtis’, ‘canberra’, ‘chebyshev’, - ‘correlation’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, ‘mahalanobis’, - ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, - ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’] See the - documentation for scipy.spatial.distance for details on these metrics. - - Set to `None` or `precomputed` if `x` and `y` are already distance + - From scipy.spatial.distance: [``"braycurtis"``, ``"canberra"``, + ``"chebyshev"``, ``"correlation"``, ``"dice"``, ``"hamming"``, + ``"jaccard"``, ``"kulsinski"``, ``"mahalanobis"``, ``"minkowski"``, + ``"rogerstanimoto"``, ``"russellrao"``, ``"seuclidean"``, + ``"sokalmichener"``, ``"sokalsneath"``, ``"sqeuclidean"``, + ``"yule"``] See the documentation for :mod:`scipy.spatial.distance` for + details on these metrics. + + Set to ``None`` or ``"precomputed"`` if ``x`` and ``y`` are already distance matrices. To call a custom function, either create the distance matrix before-hand or create a function of the form ``metric(x, **kwargs)`` - where `x` is the data matrix for which pairwise distances are - calculated and kwargs are extra arguements to send to your custom function. - - Attributes - ---------- - stat : float - The computed independence test statistic. - pvalue : float - The computed independence test p-value. - compute_distance : callable, optional - Function indicating distance metric (or alternatively the kernel) to - use. Calculates the pairwise distance for each input, by default - euclidean. + where ``x`` is the data matrix for which pairwise distances are + calculated and ``**kwargs`` are extra arguements to send to your custom + function. + max_lag : float, default: 0 + The maximium lag to consider when computing the test statistics and p-values. + **kwargs + Arbitrary keyword arguments for ``compute_distance``. """ def __init__(self, compute_distance=None, max_lag=0, **kwargs): @@ -60,39 +57,44 @@ def __init__(self, compute_distance=None, max_lag=0, **kwargs): @abstractmethod def statistic(self, x, y): """ - Calulates the independence test statistic. + Calulates the time-series test statistic. Parameters ---------- - x, y : ndarray - Input data matrices that have shapes depending on the particular - independence tests (check desired test class for specifics). + x,y : ndarray + Input data matrices. ``x`` and ``y`` must have the same number of + samples. That is, the shapes must be ``(n, p)`` and ``(n, q)`` where + `n` is the number of samples and `p` and `q` are the number of + dimensions. Alternatively, ``x`` and ``y`` can be distance matrices, + where the shapes must both be ``(n, n)``. """ @abstractmethod def test(self, x, y, reps=1000, workers=1): """ - Calulates the independece test p-value. + Calulates the time-series test test statistic and p-value. Parameters ---------- - x, y : ndarray - Input data matrices that have shapes depending on the particular - independence tests (check desired test class for specifics). - reps : int, optional - The number of replications used in permutation, by default 1000. - workers : int, optional - Evaluates method using `multiprocessing.Pool `). - Supply `-1` to use all cores available to the Process. - random_state : int or np.random.RandomState instance, optional - If already a RandomState instance, use it. - If seed is an int, return a new RandomState instance seeded with seed. - If None, use np.random.RandomState. Default is None. + x,y : ndarray + Input data matrices. ``x`` and ``y`` must have the same number of + samples. That is, the shapes must be ``(n, p)`` and ``(n, q)`` where + `n` is the number of samples and `p` and `q` are the number of + dimensions. Alternatively, ``x`` and ``y`` can be distance matrices, + where the shapes must both be ``(n, n)``. + reps : int, default: 1000 + The number of replications used to estimate the null distribution + when using the permutation test used to calculate the p-value. + workers : int, default: 1 + The number of cores to parallelize the p-value computation over. + Supply ``-1`` to use all cores available to the Process. Returns ------- + stat : float + The discriminability test statistic. pvalue : float - The pvalue obtained via permutation. + The discriminability p-value. null_dist : list The null distribution of the permuted test statistics. """ @@ -119,6 +121,7 @@ def test(self, x, y, reps=1000, workers=1): def _perm_stat(calc_stat, distx, disty): + """Permutes the test statistics.""" n = distx.shape[0] block_size = int(np.ceil(np.sqrt(n))) perm_index = np.r_[ diff --git a/hyppo/tools/ksample_sim.py b/hyppo/tools/ksample_sim.py index f6ae51cf8..d5fa2df86 100644 --- a/hyppo/tools/ksample_sim.py +++ b/hyppo/tools/ksample_sim.py @@ -118,7 +118,7 @@ def rot_ksamp(sim, n, p, k=2, noise=True, degree=90, pow_type="samp", **kwargs): ) if sim == "multimodal_independence": - sims = [np.hstack(SIMULATIONS[sim](n, p)) for _ in range(k)] + sims = [np.hstack(SIMULATIONS[sim](n, p, **kwargs)) for _ in range(k)] else: if sim != "multiplicative_noise": kwargs["noise"] = noise diff --git a/hyppo/tools/power.py b/hyppo/tools/power.py index 33c4f14d1..660c23795 100644 --- a/hyppo/tools/power.py +++ b/hyppo/tools/power.py @@ -3,7 +3,7 @@ import numpy as np from ..independence import INDEP_TESTS -from ..ksample import KSAMP_TESTS, k_sample_transform +from ..ksample import KSAMP_TESTS, KSample, k_sample_transform from .indep_sim import indep_sim from .ksample_sim import gaussian_3samp, rot_ksamp @@ -14,34 +14,37 @@ } _NONPERM_TESTS = { - "Dcorr": "fast", - "Hsic": "fast", - "Energy": "fast", - "MMD": "fast", - "DISCO": "fast", - "MANOVA": "analytical", - "Hotelling": "analytical", + "dcorr": "fast", + "hsic": "fast", + "energy": "fast", + "mmd": "fast", + "disco": "fast", + "manova": "analytical", + "hotelling": "analytical", } -def _sim_gen(pow_type, **kwargs): +def _sim_gen(sim_type, **kwargs): """ Generate ``sims`` for the desired simulations. """ - if pow_type in ["indep", "ksamp"]: - if kwargs["sim"] in ["multiplicative_noise", "multimodal_independence"]: + if sim_type in ["indep", "ksamp"]: + if ( + kwargs["sim"] in ["multiplicative_noise", "multimodal_independence"] + and "noise" in kwargs.keys() + ): kwargs = kwargs.copy().pop("noise") - sims = _ALL_SIMS[pow_type](**kwargs) + sims = _ALL_SIMS[sim_type](**kwargs) return sims -def _indep_perm_stat(test, pow_type, **kwargs): +def _indep_perm_stat(test, sim_type, **kwargs): """ Generates null and alternate distributions for the independence test. """ - x, y = _sim_gen(pow_type=pow_type, **kwargs) + x, y = _sim_gen(sim_type=sim_type, **kwargs) obs_stat = test.statistic(x, y) permy = np.random.permutation(y) perm_stat = test.statistic(x, permy) @@ -49,11 +52,11 @@ def _indep_perm_stat(test, pow_type, **kwargs): return obs_stat, perm_stat -def _ksamp_perm_stat(test, pow_type, **kwargs): +def _ksamp_perm_stat(test, sim_type, **kwargs): """ Generates null and alternate distributions for the k-sample test. """ - sims = _sim_gen(pow_type=pow_type, **kwargs) + sims = _sim_gen(sim_type=sim_type, **kwargs) u, v = k_sample_transform(sims) obs_stat = test.statistic(u, v) permv = np.random.permutation(v) @@ -68,17 +71,17 @@ def _ksamp_perm_stat(test, pow_type, **kwargs): } -def _nonperm_pval(test, **kwargs): +def _nonperm_pval(test, sim_type, **kwargs): """ Generates fast permutation pvalues """ - x, y = _sim_gen(pow_type="indep", **kwargs) - pvalue = test.test(x, y, auto=True)[1] + sims = _sim_gen(sim_type=sim_type, **kwargs) + pvalue = test.test(*sims)[1] return pvalue -def power(test, pow_type, sim=None, n=100, alpha=0.05, reps=1000, auto=False, **kwargs): +def power(test, sim_type, sim=None, n=100, alpha=0.05, reps=1000, auto=False, **kwargs): """ Computes empircal power for k-sample tests @@ -88,12 +91,14 @@ def power(test, pow_type, sim=None, n=100, alpha=0.05, reps=1000, auto=False, ** The name of the independence test (from the :mod:`hyppo.independence` module) that is to be tested. If MaxMargin, accepts list with first entry "MaxMargin" and second entry the name of another independence test. - pow_type : "indep", "ksamp", "gauss" + For :class:`hyppo.ksample.KSample` put the name of the independence test. + For other tests in :mod:`hyppo.ksample` just use the name of the class. + sim_type : "indep", "ksamp", "gauss" Type of power method to calculate. Depends on the type of ``sim``. sim : str, default: None The name of the independence simulation (from the :mod:`hyppo.tools` module). that is to be used. Set to ``None`` if using gaussian simulation curve. - n : int, default: None + n : int, default: 100 The number of samples desired by the simulation (>= 5). alpha : float, default: 0.05 The alpha level of the test. @@ -115,58 +120,66 @@ def power(test, pow_type, sim=None, n=100, alpha=0.05, reps=1000, auto=False, ** empirical_power : ndarray Estimated empirical power for the test. """ - if pow_type not in _ALL_SIMS.keys(): + if sim_type not in _ALL_SIMS.keys(): raise ValueError( - "pow_type {}, must be in {}".format(pow_type, list(_ALL_SIMS.keys())) + "sim_type {}, must be in {}".format(sim_type, list(_ALL_SIMS.keys())) ) + if type(test) is list: - test = [t.lower() for t in test] - if test[0] != "maxmargin" or test[1] not in INDEP_TESTS.keys(): + test_name = [t.lower() for t in test] + if test_name[0] != "maxmargin" or test_name[1] not in INDEP_TESTS.keys(): raise ValueError( "Test 1 must be Maximal Margin, currently {}; Test 2 must be an " "independence test, currently {}".format(*test) ) - test = INDEP_TESTS[test[0]](indep_test=test[1]) + test = INDEP_TESTS[test_name[0]](indep_test=test_name[1]) + test_name = test_name[1] else: - test = test.lower() - if test in INDEP_TESTS.keys(): - test = INDEP_TESTS[test]() - elif test in KSAMP_TESTS.keys(): - test = KSAMP_TESTS[test]() + test_name = test.lower() + if test_name in INDEP_TESTS.keys(): + test = INDEP_TESTS[test_name]() + elif test_name in KSAMP_TESTS.keys(): + test = KSAMP_TESTS[test_name]() else: raise ValueError( "Test {} not in {}".format( - test, list(INDEP_TESTS.keys()) + list(KSAMP_TESTS.keys()) + test_name, list(INDEP_TESTS.keys()) + list(KSAMP_TESTS.keys()) ) ) kwargs["n"] = n perm_type = "indep" - if pow_type in ["ksamp", "gauss"]: + if sim_type in ["ksamp", "gauss"]: perm_type = "ksamp" - if pow_type != "gauss": + if sim_type != "gauss": kwargs["sim"] = sim - if test in _NONPERM_TESTS.keys() and auto: - if n < 20 and _NONPERM_TESTS[test] == "fast": + if test_name in _NONPERM_TESTS.keys() and ( + auto or _NONPERM_TESTS[test_name] == "analytical" + ): + if test_name in INDEP_TESTS.keys() and perm_type == "ksamp": + test = KSample(test_name) + if n < 20 and _NONPERM_TESTS[test_name] == "fast": empirical_power = np.nan else: - pvals = np.array([_nonperm_pval(test=test, **kwargs) for _ in range(reps)]) - empirical_power = (pvals <= alpha).sum() / reps + pvals = np.array( + [ + _nonperm_pval(test=test, sim_type=sim_type, **kwargs) + for _ in range(reps) + ] + ) + empirical_power = (1 + (pvals <= alpha).sum()) / (1 + reps) else: alt_dist, null_dist = map( np.float64, zip( *[ - _PERM_STATS[perm_type](test=test, pow_type=pow_type, **kwargs) + _PERM_STATS[perm_type](test=test, sim_type=sim_type, **kwargs) for _ in range(reps) ] ), ) cutoff = np.sort(null_dist)[ceil(reps * (1 - alpha))] - empirical_power = (alt_dist >= cutoff).sum() / reps - - if empirical_power == 0: - empirical_power = 1 / reps + empirical_power = (1 + (alt_dist >= cutoff).sum()) / (1 + reps) return empirical_power diff --git a/hyppo/tools/tests/test_power.py b/hyppo/tools/tests/test_power.py index 715abe78e..1c60b9b70 100644 --- a/hyppo/tools/tests/test_power.py +++ b/hyppo/tools/tests/test_power.py @@ -18,7 +18,7 @@ def test_power_fast(self): def test_power_fast_nless20(self): np.random.seed(123456789) est_power = power("Dcorr", "indep", sim="linear", n=19, p=1, auto=True) - assert_almost_equal(est_power, 1.0, decimal=1) + assert_almost_equal(est_power, np.nan, decimal=1) def test_ksamp(self): np.random.seed(123456789) @@ -27,7 +27,7 @@ def test_ksamp(self): def test_gaussian(self): np.random.seed(123456789) - est_power = power(test="Dcorr", pow_type="gauss", auto=True, case=2) + est_power = power("Dcorr", "gauss", auto=True, case=2) assert_almost_equal(est_power, 1.0, decimal=1)