From d88adec5a25d070ad9db51e18d2ab24738be4ab3 Mon Sep 17 00:00:00 2001
From: GitHub Action Steps without Clone the git repository to the desired location: [Optional] both Mamba and Miniconda can be automatically activated via Exit the shell and re-enter to make sure Conda is available, [Optional] both Mamba and Miniconda can be automatically activated via Exit the shell and re-enter to make sure Conda is available. [Optional] update Conda if available before continuing: [Optional] to use the correct environment by default, execute Make sure that non-Python dependencies are loaded if applicable, such as CUDA, OpenCL or HIP. On most clusters it is possible to load (or unload) modules (e.g. CUDA, OpenCL / ROCM). For more information, see Installation. Do not forget to make sure the paths are set correctly. If you’re using CUDA, the desired CUDA version should be in [Optional] the loading of modules and setting of paths is likely convenient to put in your To run the tests you can use Local setup
Cluster setup¶
sudo
access (e.g. on a cluster):
+
git clone https://github.com/KernelTuner/kernel_tuner.git
.
-
-
~/.bashrc
. Do not forget to add these (usually mentioned at the end of the installation).cd
to the kernel tuner directory.
+
+
+
+
+.condarc
file:envs_dirs:
+ - /path/to/directory
+
~/.bashrc
. Do not forget to add these (usually provided at the end of the installation).cd
to the kernel tuner directory.conda update -n base -c conda-forge conda
.Cluster setup
conda activate kerneltuner
.
+conda config --set auto_activate_base false
, and add conda activate kerneltuner to your .bash_profile
or .bashrc
.
+
@@ -216,14 +229,16 @@ $PATH
, $LD_LIBRARY_PATH
and $CPATH
..bash_profile
or .bashrc
.Cluster setup
Running tests¶
nox
(to run against all supported Python versions in isolated environments) and pytest
(to run against the local Python version) in the top-level directory.
+For full coverage, make Nox install and use the additional tests (such as cupy and cuda-python) with nox -- additional-tests
.
It’s also possible to invoke PyTest from the ‘Testing’ tab in Visual Studio Code.
-The isolated environments can take up to 1 gigabyte in size, so users tight on diskspace can run nox
with the small-disk
option. This removes the other environment caches before each session is ran.nox
with the small-disk
option. This removes the other environment caches before each session is run (note that this will take longer to run).
Note that tests that require PyCuda and/or a CUDA capable GPU will be skipped if these are not installed/present. The same holds for tests that require PyOpenCL, Cupy, Nvidia CUDA.
Contributions you make to the Kernel Tuner should not break any of the tests even if you cannot run them locally.
diff --git a/latest/searchindex.js b/latest/searchindex.js index f7146d47e..8690cd4bc 100644 --- a/latest/searchindex.js +++ b/latest/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["cache_files", "contents", "contributing", "convolution", "correctness", "design", "diffusion", "diffusion_opencl", "diffusion_use_optparam", "examples", "grid3d", "hostcode", "index", "install", "matrix_multiplication", "metrics", "observers", "optimization", "quickstart", "structs", "templates", "user-api", "vocabulary"], "filenames": ["cache_files.rst", "contents.rst", "contributing.rst", "convolution.ipynb", "correctness.rst", "design.rst", "diffusion.ipynb", "diffusion_opencl.ipynb", "diffusion_use_optparam.ipynb", "examples.rst", "grid3d.ipynb", "hostcode.rst", "index.rst", "install.rst", "matrix_multiplication.ipynb", "metrics.rst", "observers.rst", "optimization.rst", "quickstart.rst", "structs.rst", "templates.rst", "user-api.rst", "vocabulary.rst"], "titles": ["Cache files", "The Kernel Tuner documentation", "Contribution guide", "Convolution", "Correctness Verification", "Design documentation", "Diffusion", "Tutorial: From physics to tuned GPU kernels", "Tutorial: From physics to tuned GPU kernels", "Kernel Tuner Examples", "3D Grid on GPU with Kernel Tuner", "Tuning Host Code", "The Kernel Tuner documentation", "Installation", "Matrix multiplication", "Metrics and Objectives", "Observers", "Optimization strategies", "Getting Started", "Using structs", "Templated kernels", "API Documentation", "Parameter Vocabulary"], "terms": {"A": [0, 3, 5, 12, 13, 14, 16, 17], "veri": [0, 4, 6, 7, 8, 11, 13, 14, 16, 19, 20], "us": [0, 1, 2, 3, 4, 5, 9, 11, 12, 13, 15, 16, 17, 18, 20, 22], "featur": [0, 3, 4, 9, 13, 15, 16, 18, 20], "kernel": [0, 2, 3, 4, 5, 11, 13, 15, 16, 17, 18, 19, 21, 22], "tuner": [0, 2, 3, 4, 5, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "i": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22], "abil": 0, "store": [0, 2, 3, 5, 8, 
14, 16, 18], "benchmark": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 17, 18, 22], "result": [0, 2, 3, 4, 5, 8, 10, 14, 15, 16, 17, 18, 22], "dure": [0, 5, 6, 7, 8, 10, 16], "tune": [0, 1, 4, 5, 9, 12, 13, 17, 18, 20, 22], "you": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22], "can": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22], "enabl": [0, 16, 17, 19, 20], "pass": [0, 2, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18, 20], "ani": [0, 2, 3, 5, 6, 7, 8, 11, 14, 15, 16, 17, 19, 20, 22], "filenam": [0, 3, 5, 9, 14, 18], "option": [0, 2, 3, 4, 5, 6, 7, 8, 9, 11, 13, 14, 15, 16, 17, 20, 21, 22], "argument": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 21], "tune_kernel": [0, 3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 15, 17, 18, 19, 20], "The": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20], "individu": [0, 16, 17], "configur": [0, 3, 5, 6, 7, 8, 9, 10, 14, 15, 16, 17], "ar": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22], "append": [0, 5, 13], "run": [0, 3, 4, 5, 6, 7, 10, 11, 13, 14, 16, 17], "thi": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "also": [0, 2, 3, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 22], "allow": [0, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 20], "restart": [0, 6, 7, 8, 17], "session": [0, 2, 5, 17], "from": [0, 2, 3, 4, 5, 6, 9, 10, 11, 13, 14, 16, 17, 19, 20], "an": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], "exist": [0, 5], "should": [0, 2, 3, 4, 5, 6, 7, 8, 11, 14, 15, 16, 18], "someth": [0, 3, 6, 7, 8, 14], "have": [0, 2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 20, 22], "termin": [0, 13], "previou": [0, 2, 6, 7, 8, 17], "befor": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 16, 17], "had": [0, 3], "complet": [0, 3], "happen": [0, 2, 3, 14, 18], "quit": [0, 6, 7, 8, 10, 14, 20], "often": [0, 6, 7, 8, 16], "hpc": 0, "environ": [0, 3, 5, 13, 17], "when": [0, 2, 3, 5, 6, 7, 8, 11, 
13, 14, 15, 16, 17, 19, 20, 22], "job": 0, "reserv": [0, 7, 22], "out": [0, 2, 3, 4, 10, 13, 14], "number": [0, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 15, 16, 17, 18, 19, 22], "other": [0, 2, 3, 5, 6, 7, 8, 11, 14, 15, 16, 17, 22], "simul": [0, 5, 8, 12, 17, 19], "visual": [0, 2, 14], "optim": [0, 1, 3, 4, 5, 6, 7, 8, 11, 12, 14, 15, 16], "strategi": [0, 1, 3, 15], "start": [0, 1, 3, 4, 5, 6, 7, 8, 11, 13, 14, 16, 17], "call": [0, 3, 4, 5, 6, 7, 8, 10, 11, 14, 16, 17, 18, 19, 20, 21], "contain": [0, 3, 5, 6, 7, 8, 10, 11, 14, 16, 17, 20], "full": [0, 5, 16, 18], "search": [0, 3, 5, 9, 12, 14, 15, 17], "space": [0, 3, 4, 5, 10, 11, 14, 15, 17], "true": [0, 3, 4, 5, 6, 7, 8, 11, 14, 16, 17], "creat": [0, 2, 3, 5, 6, 7, 8, 10, 14, 16, 18, 19], "even": [0, 2, 6, 7, 8, 11, 14, 17], "work": [0, 2, 3, 5, 6, 7, 8, 13, 15, 17, 20], "while": [0, 3, 5, 6, 7, 8, 9, 14, 16, 17], "still": [0, 2, 4, 14], "As": [0, 3, 6, 7, 8, 10, 13, 14, 16], "new": [0, 2, 5, 6, 7, 8, 17], "come": [0, 5, 6, 7, 8, 14, 16, 20], "thei": [0, 2, 5, 6, 7, 8, 9, 14, 15], "stream": [0, 5, 6, 7, 8], "pleas": [0, 2, 3, 12, 13, 16, 18, 19], "see": [0, 2, 3, 5, 6, 7, 8, 10, 11, 13, 14, 16, 18, 20], "dashboard": [0, 12], "introduct": 1, "instal": [1, 2, 3, 6, 7, 8, 10, 11, 14, 16, 18], "get": [1, 3, 5, 6, 7, 8, 10, 13, 14], "convolut": [1, 4, 11, 14], "diffus": 1, "matrix": 1, "multipl": [1, 5, 11, 16, 20], "exampl": [1, 2, 4, 5, 6, 7, 8, 11, 13, 14, 15, 16, 17, 18, 19], "cach": [1, 2, 5, 6, 7, 8, 13, 14, 17], "file": [1, 2, 3, 5, 6, 7, 9, 11, 14, 17, 18, 20, 21], "correct": [1, 2, 11, 19], "verif": [1, 9], "host": [1, 2, 5, 7, 8, 9, 16, 19, 20], "code": [1, 3, 5, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22], "struct": 1, "templat": [1, 10], "metric": [1, 3, 5, 9, 14], "object": [1, 3, 4, 5, 6, 7, 8, 17], "observ": [1, 5, 15, 22], "api": [1, 3, 5], "paramet": [1, 4, 5, 6, 7, 9, 11, 14, 15, 17, 18, 19, 20], "vocabulari": [1, 16, 18], "design": [1, 2, 6, 7, 8, 16], "contribut": 1, "thank": 2, "consid": [2, 10, 12, 
14], "Not": [2, 5], "all": [2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 21], "help": [2, 20], "u": [2, 3, 6, 7, 8], "improv": [2, 5, 6, 7, 8, 14, 17], "about": [2, 3, 5, 6, 7, 8, 12, 14, 16, 17, 18, 21], "problem": [2, 3, 5, 6, 7, 8, 9, 10, 11, 14], "ensur": [2, 4, 6, 7, 8, 11, 13, 16, 19], "follow": [2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 20], "describ": [2, 3, 5, 11, 16, 19], "what": [2, 3, 4, 5, 6, 7, 8, 11, 14, 16, 18, 19, 20, 22], "expect": [2, 3, 4, 5, 6, 7, 8, 14, 16], "If": [2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 19], "possibl": [2, 3, 4, 6, 7, 8, 10, 11, 14, 16, 17, 18, 19], "includ": [2, 3, 4, 6, 7, 8, 10, 11, 13, 14, 16, 20, 21], "minim": [2, 15, 20], "reproduc": 2, "actual": [2, 3, 4, 5, 6, 7, 8, 10, 14, 20], "output": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 18, 22], "error": [2, 3, 4, 5, 11, 14, 20], "print": [2, 3, 5, 6, 7, 8, 10, 14], "list": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18, 19], "version": [2, 3, 14, 16], "python": [2, 3, 5, 9, 10, 11, 14, 16, 18, 19, 20], "cuda": [2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 16, 18, 19, 20], "opencl": [2, 3, 6, 7, 8, 9, 11, 12, 14], "c": [2, 3, 9, 11, 12, 13, 14, 18, 20], "compil": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 20, 22], "applic": [2, 3, 6, 7, 8, 9, 10, 11, 12, 15, 16, 19, 20], "For": [2, 3, 4, 5, 6, 7, 8, 10, 13, 16, 18, 19], "select": [2, 3, 5, 6, 7, 8, 10, 14, 16, 17], "propos": 2, "chang": [2, 10, 16], "addit": [2, 3, 6, 7, 8, 13, 15, 18], "signific": 2, "requir": [2, 3, 5, 6, 7, 8, 10, 11, 13, 14, 16, 20], "first": [2, 3, 4, 6, 7, 8, 10, 11, 12, 13, 14, 15, 17, 19, 20], "discuss": [2, 5], "Then": [2, 6, 7, 8, 10, 12, 13, 20], "fork": 2, "repositori": [2, 3, 6, 7, 8, 10, 12, 13, 14], "branch": 2, "one": [2, 3, 5, 6, 7, 8, 10, 13, 14, 16, 17], "per": [2, 3, 6, 7, 8, 10, 15, 16], "pull": 2, "request": [2, 16], "googl": 2, "style": 2, "sphinxdoc": 2, "docstr": [2, 5], "modul": [2, 5, 11, 16], "public": [2, 12], "function": [2, 3, 4, 6, 7, 8, 9, 10, 11, 
14, 16, 17, 18, 19, 20, 21], "up": [2, 3, 5, 6, 7, 8, 13, 14, 18], "date": 2, "written": [2, 20], "unit": [2, 5], "your": [2, 3, 6, 7, 8, 10, 11, 12, 13, 16, 19], "nox": 2, "do": [2, 3, 5, 6, 7, 8, 10, 11, 14], "hardwar": [2, 6, 7, 8, 10, 16, 17, 18], "skip": [2, 3, 6, 7, 8], "gpu": [2, 3, 4, 5, 9, 11, 12, 14, 16, 18, 19, 22], "hip": [2, 12], "produc": [2, 4], "same": [2, 3, 4, 6, 7, 8, 10, 11, 16, 18], "better": [2, 6, 7, 8], "entri": [2, 5, 6, 7], "changelog": 2, "md": 2, "match": [2, 3, 4, 5], "roadmap": 2, "updat": [2, 5], "remov": [2, 17], "doubt": 2, "where": [2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 19, 20], "put": [2, 5, 6, 7, 8], "look": [2, 3, 5, 6, 7, 8, 10, 13, 14, 20], "regard": [2, 5, 17], "step": [2, 6, 7, 8, 13, 14, 15, 17, 20], "set": [2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 14, 16, 17, 18, 20, 22], "sudo": [2, 13], "access": [2, 3, 6, 7, 8, 10, 16, 19], "e": [2, 13, 15, 16, 17], "g": [2, 13, 15, 16], "devic": [2, 3, 4, 6, 7, 8, 9, 11, 16, 20], "clone": [2, 3, 6, 7, 8, 10, 13, 14], "git": [2, 16], "desir": 2, "locat": [2, 4, 10, 16], "http": [2, 12, 13, 16], "github": [2, 3, 6, 7, 8, 10, 13, 14], "com": [2, 12, 13], "kerneltun": [2, 12], "kernel_tun": [2, 3, 4, 6, 7, 8, 10, 11, 12, 13, 14, 16, 18, 19, 20, 22], "cd": [2, 13], "pyenv": 2, "curl": [2, 13], "bash": [2, 13], "rememb": [2, 3, 6, 7, 8, 14], "add": [2, 3, 5, 6, 7, 8, 11, 14, 16, 17], "bash_profil": 2, "bashrc": 2, "specifi": [2, 3, 4, 5, 6, 7, 8, 10, 11, 14, 15, 16, 17, 18, 19, 20, 22], "virtual": [2, 13], "folder": 2, "virtualenv": 2, "whatev": [2, 11, 17], "name": [2, 3, 4, 5, 6, 7, 8, 10, 14, 15, 16, 17, 18, 22], "prefer": [2, 3, 5, 6, 8, 16], "3": [2, 4, 6, 7, 8, 10, 11, 13, 14, 17], "8": [2, 3, 5, 6, 7, 8, 10, 13, 14, 16], "9": [2, 3, 4, 6, 7, 8, 11], "10": [2, 6, 7, 8, 12, 17], "11": [2, 6, 7, 8], "so": [2, 3, 5, 6, 7, 8, 10, 11, 13, 14, 16, 17, 18, 20], "found": [2, 3, 5, 12, 16, 17], "global": [2, 5, 6, 7, 8, 17], "replac": [2, 3, 4, 5, 6, 7, 8, 10, 14], "poetri": [2, 13], "ssl": [2, 13], "org": 
[2, 12, 13], "python3": [2, 13], "make": [2, 3, 6, 7, 8, 10, 12, 13, 14, 16, 19, 20], "sure": [2, 3, 6, 7, 8, 12, 13, 14], "non": [2, 4], "depend": [2, 3, 4, 8, 9, 10, 12, 15], "project": 2, "extra": [2, 13, 20], "doc": [2, 3, 6, 7, 8, 10, 13, 14], "leav": 2, "doe": [2, 4, 5, 6, 7, 8, 10, 11, 14, 16, 20], "appli": [2, 6, 7, 8], "system": [2, 12, 13, 16], "To": [2, 4, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20], "go": [2, 3, 6, 7, 8, 10, 12, 13, 14, 18], "mai": [2, 3, 4, 5, 6, 7, 8, 11, 13, 14, 15, 16, 17, 18, 19], "necessari": [2, 4, 5, 6, 7, 8], "conveni": [2, 6, 7, 8, 11], "packag": 2, "cupi": [2, 16, 20], "cuda11x": 2, "cuda12x": 2, "These": [2, 6, 7, 8, 10, 13, 14, 16, 20], "current": [2, 3, 4, 5, 6, 7, 8, 13, 14, 16, 17], "defin": [2, 3, 4, 5, 6, 7, 8, 9, 10, 14, 15, 16, 20], "part": [2, 6, 7, 8, 12, 13, 14, 15, 19], "forget": [2, 10], "path": [2, 3, 16], "correctli": [2, 14], "re": [2, 3, 6, 7, 8, 10, 14], "ld_libary_path": 2, "cpath": 2, "check": [2, 4, 5, 6, 7, 8, 11, 14], "pytest": 2, "except": [2, 5, 9], "more": [2, 4, 5, 6, 7, 8, 12, 13, 14, 15, 16, 18, 20], "ha": [2, 3, 5, 6, 7, 8, 11, 14, 16, 17], "been": [2, 3, 5, 6, 7, 8, 11, 14, 17], "left": [2, 6, 7, 8, 10, 15], "gracefulli": 2, "without": [2, 6, 7, 8, 10, 11, 16, 17], "conda": 2, "mamba": 2, "perform": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19], "miniconda": [2, 13], "tradit": 2, "both": [2, 6, 7, 8, 9, 14], "automat": [2, 3, 6, 7, 8, 10, 14, 20], "activ": 2, "via": [2, 17], "usual": [2, 16], "mention": [2, 10], "end": [2, 3, 5, 6, 7, 8, 10, 14, 16, 17, 19], "exit": 2, "shell": 2, "enter": [2, 3, 6, 7, 8, 10, 14], "avail": [2, 3, 6, 7, 8, 9, 10, 13, 16], "directori": [2, 3, 6, 7, 8, 10, 13, 14], "continu": [2, 3, 5, 6, 7, 8, 13, 16, 17], "n": [2, 4, 6, 7, 8, 10, 11, 12, 14, 17, 18, 20], "base": [2, 5, 15, 16, 20], "forg": 2, "default": [2, 3, 4, 5, 6, 7, 8, 10, 14, 15, 16, 17, 20], "execut": [2, 3, 5, 6, 7, 8, 9, 10, 11, 14, 15, 17], "config": [2, 5], "auto_activate_bas": 2, 
"fals": [2, 5, 16, 17], "load": 2, "On": [2, 6, 7, 8], "most": [2, 5, 6, 7, 8, 9, 11, 12, 14, 16, 17, 18, 19], "unload": 2, "rocm": [2, 13, 16], "inform": [2, 3, 5, 6, 7, 8, 12, 16, 17, 18, 22], "like": [2, 3, 5, 6, 7, 8, 9, 10, 14, 17, 18, 19, 20], "keyr": 2, "seemingli": 2, "weird": 2, "known": [2, 14], "some": [2, 3, 5, 6, 7, 8, 13, 14, 15, 16, 17, 18, 19, 20], "pip": [2, 3, 6, 7, 12, 13, 14], "m": [2, 6, 7, 8, 10], "disabl": 2, "node": [2, 17], "backend": [2, 11, 16], "2": [2, 3, 4, 6, 7, 8, 9, 10, 11, 14, 16, 17], "echo": 2, "noxenv": 2, "txt": 2, "anaconda": 2, "altern": [2, 13], "venv": 2, "alreadi": [2, 3, 5, 6, 7, 8, 13, 14], "Be": [2, 6, 7, 8], "adjust": [2, 3], "against": [2, 4, 5], "support": [2, 3, 5, 6, 7, 8, 11, 13, 16, 17, 20, 22], "isol": [2, 20], "top": [2, 5, 10, 16], "level": [2, 5, 16], "It": [2, 3, 5, 6, 7, 8, 11, 13, 14, 16, 20], "": [2, 3, 5, 6, 7, 8, 9, 11, 13, 14, 15, 16, 18, 19, 20, 21], "invok": 2, "tab": 2, "studio": 2, "take": [2, 3, 5, 6, 7, 8, 10, 14, 16, 17, 18, 20], "1": [2, 3, 4, 6, 7, 8, 10, 11, 14, 16, 17], "gigabyt": 2, "size": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 17, 18, 20], "user": [2, 3, 4, 5, 7, 9, 13, 14, 15, 16, 17, 20], "tight": 2, "diskspac": 2, "small": [2, 3, 6, 7, 8, 14], "disk": 2, "each": [2, 3, 4, 5, 6, 7, 10, 14, 16, 17], "ran": 2, "note": [2, 3, 5, 6, 7, 8, 10, 13, 14, 16, 19], "pycuda": [2, 6, 8, 10, 11, 16, 20], "capabl": [2, 5, 6, 7, 14], "present": [2, 14], "hold": [2, 6, 7, 14, 18, 19], "pyopencl": [2, 5, 7, 16], "nvidia": [2, 5, 13, 14, 16, 20], "break": [2, 20], "cannot": [2, 6, 7, 8, 16], "them": [2, 3, 8, 10, 11, 14], "seen": [2, 3, 5, 14], "integr": [2, 20], "type": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 16, 17, 18, 19, 20], "html": 2, "gener": [2, 3, 5, 6, 7, 8, 12, 14, 16, 17, 19, 22], "page": [2, 3, 6, 7, 8, 9, 10, 12, 14, 15], "sourc": [2, 3, 5, 6, 7, 8, 10, 11, 13, 14, 16, 20], "inspect": [2, 5, 16], "commit": 2, "brows": 2, "through": [2, 5, 6, 7, 8, 10, 12, 15, 16, 17], "least": [2, 5], 
"those": [2, 3, 9, 16], "pandoc": 2, "ubuntu": 2, "apt": 2, "mac": 2, "brew": 2, "differ": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17], "onlin": 2, "built": [2, 16, 17, 19], "action": 2, "correspond": [2, 3, 6, 7, 8, 10, 16, 17, 18], "master": 2, "latest": [2, 13], "last": [2, 5, 19], "releas": [2, 5], "stabl": 2, "publish": [2, 12], "point": [2, 3, 5, 6, 7, 8, 10, 11, 14, 15, 16, 18], "process": [2, 3, 5, 6, 7, 8, 14, 15, 16, 17, 20], "again": [2, 3, 6, 7, 8, 10, 14], "fulli": [2, 13], "autom": 2, "guid": [3, 6, 14, 15, 18], "meant": 3, "write": [3, 9, 10, 14, 20], "script": [3, 5, 14, 19, 20], "we": [3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 19, 20], "ll": [3, 6, 7, 8, 13, 14], "simpl": [3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 17, 18, 19], "find": [3, 11, 14, 17], "shortli": 3, "much": [3, 6, 7, 8, 10, 16, 20], "reus": [3, 6, 7, 8, 14], "read": [3, 4, 5, 6, 7, 8, 10, 11, 14, 16], "document": [3, 4, 6, 7, 8, 10, 13, 14, 19, 22], "jupyt": [3, 6, 7, 8, 10, 13, 14], "notebook": [3, 6, 7, 8, 10, 13, 14], "just": [3, 4, 5, 6, 7, 8, 10, 11, 13, 14], "tutori": [3, 6, 10, 12, 13, 14], "readi": [3, 5, 6, 7, 8, 10, 14], "oper": [3, 6, 7, 8, 10, 11, 14, 15], "essenti": 3, "signal": [3, 22], "imag": [3, 6, 7, 8], "main": [3, 5, 10, 16, 18], "neural": 3, "network": 3, "deep": 3, "learn": 3, "comput": [3, 4, 5, 9, 10, 11, 12, 14, 17], "linear": [3, 14], "combin": [3, 5, 6, 7, 8, 9, 10, 14, 16, 17, 18], "weight": [3, 17], "filter": [3, 4, 9, 11], "rang": [3, 4, 6, 7, 8, 10, 11, 20], "pixel": 3, "input": [3, 4, 6, 7, 8, 9, 11, 14, 15, 18, 19], "w": [3, 6, 7, 15, 17], "time": [3, 5, 6, 7, 8, 10, 11, 14, 15, 16, 17, 20, 22], "h": [3, 10], "f": [3, 4, 10, 11, 19], "f_w": 3, "f_h": 3, "o": [3, 5], "begin": [3, 6, 7, 8, 10], "equat": [3, 6, 7, 8, 10, 17], "nonumb": [3, 10], "x": [3, 4, 5, 6, 7, 8, 10, 12, 14, 18, 20], "y": [3, 5, 6, 7, 8, 10, 11, 14], "sum": [3, 4, 5, 14], "limits_": 3, "j": [3, 6, 7, 8, 12, 14], "0": [3, 4, 5, 6, 7, 8, 10, 11, 14, 16, 17, 19], "naiv": [3, 4, 6, 
7, 8], "parallel": [3, 6, 7, 8], "thread": [3, 5, 6, 7, 8, 9, 10, 15, 16, 18, 22], "avoid": [3, 14, 22], "confus": 3, "around": [3, 9], "term": 3, "refer": [3, 4, 5, 6, 7, 8, 9, 11, 16], "shown": [3, 5, 16], "block": [3, 5, 6, 7, 8, 9, 10, 13, 14, 15, 18, 22], "press": [3, 6, 7, 8, 10, 14], "shift": [3, 6, 7, 8, 10, 14], "writefil": [3, 14], "convolution_na": [3, 4], "cu": [3, 4, 11, 14, 18, 20], "__global__": [3, 6, 8, 10, 12, 14, 18, 20], "void": [3, 6, 7, 8, 10, 12, 14, 18, 19, 20], "convolution_kernel": [3, 4], "float": [3, 6, 7, 8, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20], "int": [3, 5, 6, 7, 8, 10, 12, 14, 18, 20], "blockidx": [3, 6, 7, 8, 10, 12, 14, 18, 20], "blockdim": [3, 18], "threadidx": [3, 6, 7, 8, 10, 12, 14, 18, 20], "image_height": 3, "image_width": 3, "filter_height": 3, "filter_width": 3, "input_width": 3, "run_kernel": [3, 4, 5, 9], "our": [3, 6, 7, 8, 10, 14, 18, 19], "But": [3, 6, 7, 8, 10, 18], "data": [3, 5, 6, 7, 8, 10, 11, 14, 15, 16, 18, 19], "which": [3, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18, 19, 20, 22], "import": [3, 4, 6, 7, 8, 10, 13, 14, 15, 18, 19, 20], "numpi": [3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 18, 19, 20], "np": [3, 5, 10, 14, 18, 19], "filter_s": 3, "17": [3, 4, 6, 7, 8, 11], "output_s": 3, "4096": [3, 4, 6, 7, 8, 11, 14], "prod": [3, 4, 11], "border_s": 3, "input_s": [3, 4, 11], "output_imag": 3, "zero": [3, 4, 10, 11, 14], "astyp": [3, 4, 6, 7, 8, 10, 11, 12, 14, 18, 20], "float32": [3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 18, 20], "input_imag": 3, "random": [3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 17, 18, 20], "randn": [3, 4, 11, 12, 14, 18, 20], "conv_filt": 3, "now": [3, 6, 7, 8, 10, 11, 14, 18], "structur": [3, 5, 6, 7, 14, 18], "how": [3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 18, 19, 20, 21], "signatur": [3, 5], "kernel_nam": [3, 5, 11, 19, 20], "kernel_sourc": [3, 5, 19], "problem_s": [3, 4, 5, 6, 7, 8, 10, 11, 14, 18, 19, 22], "param": [3, 4, 5, 16, 17], "ellipsi": 3, "here": [3, 10, 11, 13, 14, 16], "indic": [3, 17, 22], "mani": 
[3, 5, 6, 7, 8, 14, 15, 16, 17], "won": 3, "t": [3, 5, 6, 7, 8, 10, 11, 13, 17, 20], "need": [3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 18, 19, 20, 21], "right": [3, 6, 7, 8, 10, 13], "interest": [3, 19], "five": [3, 5, 18], "string": [3, 5, 6, 7, 8, 9, 14, 15, 16, 18, 19], "domain": [3, 6, 7, 8, 9, 10], "three": [3, 4, 14], "dimens": [3, 5, 6, 7, 8, 9, 10, 11, 14, 15, 17, 18, 22], "dictionari": [3, 5, 6, 7, 8, 10, 14, 16, 17, 18], "simpli": [3, 4, 5, 6, 7, 8, 10, 17, 18], "cell": [3, 6, 7, 8, 10, 14], "wrote": 3, "determin": [3, 6, 7, 8, 10, 16, 17], "grid": [3, 5, 6, 7, 8, 9, 11, 14, 22], "abov": [3, 5, 6, 7, 8, 10, 13, 14, 18, 19], "divid": [3, 6, 7, 8, 10, 11, 14], "divisor": [3, 5, 6, 7, 8, 14], "arrai": [3, 4, 5, 6, 7, 8, 10, 18, 19], "scalar": [3, 6, 7, 8, 10], "therefor": [3, 4, 6, 7, 8, 10, 11, 14], "exactli": [3, 5, 6, 7, 8, 14, 16], "order": [3, 4, 5, 6, 7, 8, 10, 11, 14, 15, 17, 18], "32": [3, 5, 6, 7, 8, 10, 12, 14, 18], "bit": [3, 5, 6, 7, 8, 10, 11, 14], "final": [3, 4, 6, 7, 8, 10], "anyth": 3, "insert": [3, 4, 5, 8, 10, 11, 14, 18, 20, 22], "preprocessor": [3, 5], "statement": [3, 8, 10, 14, 20], "valu": [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18], "were": [3, 6, 7, 8, 10, 14], "i_like_convolut": 3, "42": 3, "line": [3, 6, 7, 8], "definit": [3, 10], "effect": [3, 6, 7, 8], "unless": 3, "cours": [3, 6, 7, 8, 13, 14], "somewher": 3, "token": 3, "In": [3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 18, 19, 22], "freeli": 3, "few": [3, 6, 7, 8, 10, 11, 20], "special": [3, 6, 7, 8, 16, 18, 22], "notic": [3, 6, 7, 8], "haven": [3, 13], "yet": [3, 5, 10, 11, 18], "basic": [3, 5, 6, 7, 8, 18], "block_size_x": [3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 18, 20], "block_size_i": [3, 4, 6, 7, 8, 10, 11, 14], "block_size_z": [3, 6, 7, 8, 10], "interpret": 3, "z": [3, 5, 10], "block_size_nam": [3, 5], "let": [3, 5, 6, 7, 8, 18, 20], "creation": [3, 12, 17], "trusti": 3, "old": 3, "16": [3, 4, 6, 7, 8, 10, 11, 14], "dict": [3, 4, 5, 8, 11, 12, 16, 17, 18, 20], "undefin": 
[3, 5, 6, 7, 8, 14], "constant": [3, 5, 6, 7, 8, 9, 11, 14, 17], "filter_heigth": 3, "could": [3, 4, 5, 6, 7, 8, 11, 13, 14, 16, 17, 20], "runtim": [3, 5, 6, 7, 8, 12, 13, 16, 20], "setup": [3, 6, 7, 8, 11, 13, 16, 19], "everyth": [3, 5, 6, 7, 8], "answer": [3, 4, 5, 6, 7, 8, 9], "done": [3, 13, 15, 16], "alloc": [3, 5, 6, 7, 8, 9, 11], "memori": [3, 5, 9, 11, 16, 19, 22], "move": [3, 5, 6, 11, 14, 17], "content": [3, 5], "deriv": [3, 5, 6, 7, 8, 15], "after": [3, 4, 5, 6, 7, 8, 11, 13, 14, 16], "retriev": [3, 5], "free": [3, 6, 7, 8, 11, 13, 14], "return": [3, 4, 5, 6, 7, 8, 10, 11, 14, 16, 17, 18, 19], "contrast": 3, "wa": [3, 5, 6, 7, 8, 16], "finish": [3, 5, 7, 10, 11, 16], "particularli": [3, 15], "compar": [3, 4, 6, 7, 8, 10, 14, 15, 16], "case": [3, 4, 5, 6, 7, 8, 10, 14, 15, 16, 18, 19], "than": [3, 6, 7, 8, 10, 15, 16, 17, 22], "highli": [3, 12, 14], "parametr": 3, "long": [3, 6, 7, 8, 10, 11, 14, 19], "instead": [3, 5, 9, 14], "littl": [3, 6, 7, 8, 14], "ve": [3, 6, 7, 8, 13, 14], "interfac": [3, 4, 11, 13, 16, 17, 19], "familiar": [3, 14], "becaus": [3, 4, 6, 7, 8, 11, 13, 14, 15, 20, 22], "kernel_str": [3, 4, 5, 6, 7, 8, 11, 12, 17], "tune_param": [3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 17, 18, 19, 20], "onli": [3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16, 17, 19], "similarli": 3, "singl": [3, 4, 5, 6, 7, 8, 11, 14, 16, 20], "wai": [3, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16], "64": [3, 6, 7, 8, 12, 14, 18, 20], "128": [3, 6, 7, 8, 12, 18, 20], "try": [3, 5, 6, 7, 8, 13, 14, 17], "env": [3, 5, 17, 18], "cartesian": [3, 10], "product": [3, 6, 7], "realli": [3, 6, 7, 8, 13], "howev": [3, 4, 6, 7, 8, 11, 13, 14, 16, 19, 20], "lot": [3, 6, 7, 8, 14, 16, 18, 19], "problemat": 3, "explain": [3, 5, 6, 7, 8, 11, 13, 14, 15, 18, 20], "illeg": 3, "2048": 3, "limit": [3, 5, 6, 7, 8, 9, 14, 16, 17, 20, 22], "1024": [3, 6, 7, 8, 18], "fail": [3, 5, 13], "reason": [3, 5, 19], "too": [3, 6, 7, 8, 10, 11, 14], "share": [3, 5], "regist": [3, 6, 7, 8, 14, 16], "silent": 3, "verbos": 
[3, 4, 5, 6, 7, 8, 11], "bound": [3, 5, 14, 17], "ignor": [3, 5, 6, 7, 8], "two": [3, 5, 6, 7, 8, 9, 14, 15, 17], "thing": [3, 11, 14], "record": [3, 5, 6, 16], "show": [3, 6, 7, 8, 9, 12, 15, 19], "specif": [3, 5, 6, 7, 8, 10, 15, 16, 17], "secondli": [3, 14], "experi": 3, "took": [3, 6, 8, 17, 18], "place": [3, 6, 7, 8, 16, 17, 18], "That": [3, 6, 7, 8, 11, 14, 15, 18], "mean": [3, 11, 14, 15, 17, 19, 20, 22], "softwar": [3, 6, 7, 8, 12, 13, 16, 17, 18], "along": [3, 5, 13, 18, 22], "second": [3, 4, 5, 6, 7, 8, 10, 14, 15, 16, 17], "alwai": [3, 5, 6, 7, 8], "under": [3, 12], "circumst": 3, "obtain": [3, 6, 7, 8, 10, 16], "promis": 3, "would": [3, 6, 7, 8, 20], "tile": [3, 9, 14], "factor": [3, 6, 7, 8, 9, 10, 14, 22], "amount": [3, 6, 7, 8, 14, 15], "particular": [3, 5, 6, 7, 9, 11, 14, 16, 19], "increas": [3, 6, 7, 8, 16], "certain": [3, 5, 6, 7, 8, 16, 22], "tile_size_x": [3, 4, 6, 7, 8, 11, 14], "4": [3, 6, 7, 8, 10, 14, 16], "tile_size_i": [3, 4, 6, 7, 8, 11, 14], "understand": 3, "everi": [3, 4, 6, 7, 8, 9, 16, 18], "fewer": [3, 6, 7, 8], "total": [3, 5, 6, 7, 8, 14, 15, 18], "stai": 3, "tell": [3, 6, 7, 8, 9, 11, 14, 18, 19], "influenc": 3, "did": [3, 6, 7, 8, 14], "mimick": 3, "behavior": [3, 14, 16], "assum": [3, 5, 6, 7, 8, 14], "far": [3, 6, 7, 8, 14, 18], "grid_div_x": [3, 4, 6, 7, 8, 11, 14], "grid_div_i": [3, 4, 6, 7, 8, 11, 14], "decreas": [3, 14], "correspondingli": 3, "displai": 3, "commonli": [3, 6, 7, 8, 13, 14], "gflop": [3, 5, 9, 14, 15], "giga": [3, 14], "compos": [3, 5, 14, 15], "lambda": [3, 5, 6, 7, 14, 15], "collect": [3, 5, 6, 7, 8, 10, 14, 16, 19], "ordereddict": [3, 6, 7, 8, 10, 14, 15], "p": [3, 5, 14, 15, 19], "1e9": [3, 14], "1e3": [3, 6, 7, 8, 14, 15], "expand": [3, 14, 16], "longer": [3, 5, 15], "sinc": [3, 8, 10, 14, 20], "And": [3, 6, 7, 8, 17, 20], "know": [3, 6, 7, 8, 14, 15], "enough": [3, 4, 14], "abl": [3, 5, 6, 7, 8], "own": [3, 8, 11, 13, 15, 16], "whenev": 4, "program": [4, 6, 7, 8, 11, 14, 19, 20], "good": [4, 6, 7, 8, 
22], "fast": [4, 6, 7, 8], "verifi": [4, 5, 9], "instanc": [4, 5, 6, 7, 8, 11, 16], "none": [4, 5, 16, 17], "onc": [4, 5, 6, 7, 8, 10, 16], "comparison": 4, "implement": [4, 5, 9, 10, 15, 16, 17], "allclos": 4, "maximum": [4, 5, 10, 17], "absolut": 4, "1e": 4, "6": [4, 6, 7, 8, 10, 11], "want": [4, 8, 10, 11, 13, 14, 16, 18, 22], "toler": 4, "atol": [4, 5], "convolution_correct": 4, "py": [4, 11, 13], "demonstr": [4, 8, 9, 14], "open": [4, 6, 7, 11, 14], "r": [4, 11], "cmem_arg": [4, 5], "d_filter": 4, "arg": [4, 5, 6, 7, 8, 10, 11, 12, 14, 17, 18, 19, 20], "field": [4, 6, 7, 8], "its": [4, 5, 6, 7, 8, 10, 12, 13, 14, 15, 16], "almost": [4, 6, 7, 8, 16], "whose": 4, "trust": [4, 17], "construct": [4, 14], "There": [4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 22], "precomput": 4, "flexibl": [4, 6, 7, 14], "callabl": [4, 5], "accept": [4, 5, 17], "cpu_result": 4, "gpu_result": [4, 6, 8], "although": 4, "semant": 4, "posit": [4, 5, 10, 17, 20], "reflect": [4, 16], "reduct": [4, 15], "snippet": 4, "sum_x": 4, "custom": [4, 9, 15, 16, 19], "def": [4, 5, 6, 7, 8, 10, 16, 19], "verify_partial_reduc": 4, "isclos": 4, "first_kernel": 4, "_": [4, 6, 7, 8], "sum_float": 4, "map": [4, 9, 10], "provid": [4, 5, 6, 7, 8, 11, 20, 21], "third": [4, 14], "partial": [4, 6, 7, 8, 9], "cpu": [4, 7, 8, 11], "achiev": [4, 8], "element": [4, 6, 7, 8, 14, 15, 18, 19], "necessarili": [4, 11], "section": [5, 6, 7, 8], "detail": [5, 13, 21], "intern": [5, 12, 17, 20], "mostli": [5, 12], "relev": [5, 12, 16], "develop": [5, 12, 13], "extens": 5, "architectur": [5, 16], "At": [5, 10], "expos": 5, "respons": 5, "iter": [5, 6, 7, 8, 10, 14, 16, 17, 18], "over": [5, 6, 7, 8, 13, 14, 16, 17], "brute_forc": 5, "valid": [5, 9, 14], "random_sampl": 5, "sampl": [5, 17], "advanc": [5, 20], "being": [5, 6, 7, 8, 14, 16, 17], "strategy_opt": [5, 17], "sai": [5, 6, 7, 8, 18, 20], "foreseen": 5, "futur": [5, 12, 22], "high": [5, 6, 7, 8, 12, 14, 16], "wrap": [5, 18, 20], "low": [5, 6, 7, 8, 14], "abstract": 
[5, 16], "ready_argument_list": 5, "build": [5, 6, 7, 8], "bottom": 5, "pyhip": 5, "either": [5, 10, 17, 20], "typic": [5, 13, 14], "nvcc": 5, "gcc": 5, "fortran": [5, 9, 20], "turn": 5, "launch": [5, 6, 7, 8, 11, 16], "rest": [5, 6, 7, 8], "helper": [5, 16], "get_opt": 5, "suppli": [5, 11, 14, 17, 20], "get_strategy_docstr": 5, "method": [5, 6, 7, 8, 11, 14, 16, 17], "make_strategy_options_doc": 5, "scale_from_param": 5, "ep": [5, 17], "func": [5, 16], "invers": 5, "unscal": 5, "setup_method_argu": 5, "prepar": [5, 6, 7, 8], "setup_method_opt": 5, "tuning_opt": [5, 17], "snap_to_nearest_config": 5, "closest": 5, "unscale_and_snap_to_nearest": 5, "snap": 5, "scale": 5, "variabl": [5, 10, 13, 17], "nearest": 5, "class": [5, 16, 17], "kernel_opt": 5, "device_opt": 5, "__init__": 5, "instanti": [5, 20], "kernelsourc": 5, "parameter_spac": [5, 17], "entir": [5, 6, 7, 8, 14, 17], "iterfac": 5, "platform": [5, 12, 13, 16], "quiet": 5, "compiler_opt": 5, "7": [5, 6, 7, 8, 10], "offer": 5, "languag": [5, 8, 11, 14, 19], "lang": [5, 9, 11, 20], "bool": [5, 19], "gpu_arg": 5, "benchmark_continu": 5, "durat": [5, 16], "benchmark_default": 5, "check_kernel_output": 5, "compile_kernel": 5, "copy_constant_memory_arg": 5, "recent": [5, 13, 16], "copy_shared_memory_arg": 5, "smem_arg": 5, "copy_texture_memory_arg": 5, "texmem_arg": 5, "textur": 5, "create_kernel_inst": 5, "get_environ": 5, "memcpy_dtoh": [5, 6], "dest": 5, "src": 5, "copi": [5, 6, 7, 8, 18], "static": 5, "preprocess_gpu_argu": 5, "old_argu": 5, "flat": 5, "given": [5, 6, 7, 8, 10, 16, 17], "mem": 5, "group": [5, 6, 7, 8], "maintain": 5, "state": [5, 6, 7, 8, 16], "interact": [5, 16], "properti": [5, 14], "context": [5, 6, 8, 10], "kernel_inst": 5, "lookup": 5, "directli": [5, 6, 7, 8, 11, 14, 16, 20], "driver": [5, 6, 8, 10], "ndarrai": [5, 10], "format": [5, 6, 7, 19], "kei": [5, 6, 7, 8, 14, 17, 18], "symbol": 5, "similar": [5, 11, 14], "regular": [5, 8, 16], "int32": [5, 12, 18, 20], "kernel_finish": 5, 
"otherwis": [5, 14], "devicealloc": 5, "memcpy_htod": [5, 6], "memset": 5, "unsign": [5, 7], "byte": [5, 19], "tupl": [5, 8, 10, 17], "start_ev": 5, "event": [5, 6, 11, 16], "mark": 5, "measur": [5, 6, 7, 8, 10, 11, 14, 15, 16, 22], "stop_ev": 5, "synchron": [5, 6, 8, 10, 14, 15], "halt": [5, 11], "until": [5, 11], "task": 5, "rawkernel": 5, "cudeviceptr": 5, "cufunct": 5, "id": [5, 16], "must": [5, 15], "dynam": 5, "buffer": [5, 7, 19], "fill": [5, 14], "item": [5, 6, 7, 8, 10], "ndrang": 5, "kernelinst": 5, "repres": [5, 6, 7, 8], "tunabl": [5, 6, 7, 8, 9, 10, 14, 15, 16, 17, 18, 20, 22], "ctype": 5, "_funcptr": 5, "ptr": 5, "pionter": 5, "compilationfailedconfig": 5, "errorconfig": 5, "invalidconfig": 5, "npencod": 5, "skipkei": 5, "ensure_ascii": 5, "check_circular": 5, "allow_nan": 5, "sort_kei": 5, "indent": 5, "separ": [5, 9, 11, 20], "dump": [5, 6, 7], "json": [5, 6, 7, 9], "obj": 5, "subclass": 5, "serializ": 5, "rais": 5, "typeerror": 5, "arbitrari": 5, "self": [5, 16, 17], "els": 5, "jsonencod": 5, "runtimefailedconfig": 5, "skippablefailur": 5, "stopcriterionreach": 5, "thrown": 5, "stop": [5, 17], "criterion": [5, 17], "reach": 5, "check_argument_list": 5, "check_argument_typ": 5, "dtype": [5, 19], "kernel_argu": 5, "check_restrict": 5, "restrict": [5, 9, 14, 20], "whether": [5, 15, 17], "meet": 5, "check_stop_criterion": 5, "max_fev": [5, 17], "exceed": 5, "check_thread_block_dimens": 5, "max_thread": 5, "check_tune_params_list": 5, "simulation_mod": 5, "forbidden": 5, "compile_restrict": 5, "monolith": 5, "try_to_constraint": 5, "union": 5, "str": [5, 6, 7, 8, 10], "constraint": 5, "pars": [5, 6, 7], "config_valid": 5, "max": 5, "convert_constraint_restrict": 5, "convert": [5, 6, 7], "backward": 5, "compat": [5, 13], "cuda_error_check": 5, "statu": 5, "delete_temp_fil": 5, "delet": 5, "temporari": 5, "don": [5, 6, 8, 10, 11], "complain": 5, "detect_languag": 5, "attempt": [5, 20], "detect": [5, 17, 20], "dump_cach": 5, "omit": 5, "sever": [5, 6, 7, 
8, 9, 10, 13, 14, 20], "store_cach": 5, "speed": 5, "great": [5, 6, 7, 8, 18], "power": [5, 14, 16, 22], "get_best_config": 5, "objective_higher_is_bett": [5, 15], "best": [5, 6, 7, 10, 14, 17, 20, 22], "accord": 5, "get_config_str": 5, "compact": 5, "represent": [5, 19], "get_grid_dimens": 5, "current_problem_s": 5, "grid_div": 5, "dim": 5, "get_instance_str": 5, "debug": 5, "advis": 5, "get_kernel_str": [5, 6, 7, 8], "One": [5, 6, 7, 8, 16, 19], "get_problem_s": 5, "get_smem_arg": 5, "get_temp_filenam": 5, "suffix": 5, "form": [5, 14, 16, 17], "temp_x": 5, "larg": [5, 6, 7, 8, 10], "integ": [5, 16, 19], "get_thread_block_dimens": 5, "convent": [5, 11], "get_total_tim": 5, "overhead_tim": 5, "looks_like_a_filenam": 5, "normalize_verify_funct": 5, "v": [5, 6, 7, 8, 10], "normal": [5, 17], "result_host": 5, "keyword": 5, "behaviour": 5, "parse_restrict": 5, "prepare_kernel_str": 5, "prepend": [5, 8], "seri": [5, 10], "By": [5, 11, 14, 17], "macro": 5, "made": 5, "print_config": 5, "print_config_output": 5, "process_cach": 5, "device_nam": 5, "tune_params_kei": 5, "x1": 5, "x2": 5, "xn": 5, "234342": 5, "y1": 5, "y2": 5, "yn": 5, "134233": 5, "close": [5, 6, 7, 8], "bracket": 5, "miss": 5, "earlier": [5, 6, 7, 8, 10], "abruptli": 5, "process_metr": 5, "calcul": [5, 10], "express": [5, 6, 7, 8, 9, 11, 14], "10000": 5, "read_cach": 5, "open_cach": 5, "cachefil": 5, "read_fil": 5, "replace_param_occurr": 5, "occurr": 5, "setup_block_and_grid": 5, "write_fil": 5, "whole": [6, 7, 8, 14, 17], "model": [6, 7, 8, 12], "physic": 6, "numer": [6, 7, 8], "introduc": [6, 7, 8, 14, 16], "redistribut": [6, 7, 8], "region": [6, 7, 8], "concentr": [6, 7, 8], "bulk": [6, 7, 8], "motion": [6, 7, 8], "concept": [6, 7, 8], "wide": [6, 7, 8, 13, 14], "chemistri": [6, 7, 8], "biologi": [6, 7, 8], "suppos": [6, 7, 8], "metal": [6, 7, 8], "sheet": [6, 7, 8], "temperatur": [6, 7, 8, 16, 17, 22], "equal": [6, 7, 8, 14], "degre": [6, 7, 8], "everywher": [6, 7, 8], "heat": [6, 7, 8], "thousand": 
[6, 7, 8], "instant": [6, 7, 8, 10], "hotspot": [6, 7, 8], "cooler": [6, 7, 8], "area": [6, 7, 8, 14], "melt": [6, 7, 8], "loss": [6, 7, 8], "radiat": [6, 7, 8], "caus": [6, 7, 8], "frac": [6, 7, 8], "d": [6, 7, 8, 10, 17, 18], "spatial": [6, 7, 8], "descret": [6, 7, 8], "2d": [6, 7, 8, 9], "quantiti": [6, 7, 8, 15, 16], "nx": [6, 7, 8, 10], "equi": [6, 7, 8], "distant": [6, 7, 8], "direct": [6, 7, 8, 11, 14, 15], "ny": [6, 7, 8, 10], "distanc": [6, 7, 8, 17], "delta": [6, 7, 8], "between": [6, 7, 8, 11, 13, 14, 15, 17], "central": [6, 7, 8], "approxim": [6, 7, 8], "x_i": [6, 7, 8, 10], "x_": [6, 7, 8], "approx": [6, 7, 8], "u_": [6, 7, 8], "2u_": [6, 7, 8], "y_": [6, 7, 8], "estim": [6, 7, 8], "next": [6, 7, 8, 14, 19], "simplifi": [6, 7, 8], "formula": [6, 7, 8], "further": [6, 7, 8, 13, 14], "4u_": [6, 7, 8], "simplic": [6, 7, 8, 10], "assumpt": [6, 7, 8], "boundari": [6, 7, 8], "condit": [6, 7, 8, 14], "dt": [6, 7, 8], "225": [6, 7, 8], "give": [6, 7, 8, 17], "test": [6, 7, 8, 9, 13, 14, 16], "initi": [6, 7, 8, 19], "hot": [6, 7, 8], "plot": [6, 7, 8], "anoth": [6, 7, 8, 11, 14, 15, 17], "color": [6, 7, 8], "matplotlib": [6, 7, 8, 13], "pyplot": [6, 7, 8], "inlin": [6, 7, 8], "get_initial_condit": [6, 7, 8], "ones": [6, 7, 8, 22], "randint": [6, 7, 8], "1000": [6, 7, 8, 10], "2000": [6, 7, 8], "fig": [6, 7, 8], "ax1": [6, 7, 8], "ax2": [6, 7, 8], "subplot": [6, 7, 8], "imshow": [6, 7, 8], "lt": [6, 7, 8], "axesimag": [6, 7, 8], "0x2aaab952f240": 6, "gt": [6, 7, 8], "quick": [6, 7, 8], "save": [6, 7], "later": [6, 7, 8, 10], "field_copi": [6, 7], "4164": 6, "018869400024": 6, "0x2aab1c98b3c8": 6, "worri": [6, 8], "terminologi": [6, 8], "text": [6, 8, 14], "5": [6, 7, 8, 10, 17], "225f": [6, 7, 8], "diffuse_kernel": [6, 7, 8], "u_new": [6, 7, 8], "0f": [6, 7, 8], "togeth": [6, 7, 8, 13], "choos": [6, 7, 8, 14, 17], "impact": [6, 7, 8, 11], "fix": [6, 7, 8, 17], "unrol": [6, 7, 8, 9, 14, 22], "loop": [6, 7, 8, 9, 14, 22], "drv": 6, "sourcemodul": [6, 8, 10], 
"init": 6, "make_context": 6, "devprop": 6, "k": [6, 7, 8, 10, 12, 14, 18], "get_devic": 6, "get_attribut": 6, "cc": 6, "compute_capability_major": 6, "compute_capability_minor": 6, "u_old": [6, 8], "mem_alloc": 6, "nbyte": 6, "block_size_str": [6, 8], "arch": 6, "sm_": 6, "get_funct": [6, 8, 10], "boilerpl": [6, 7, 8], "moment": [6, 7, 8], "serv": [6, 7, 8, 15, 17], "guess": [6, 7, 8], "pair": [6, 7, 8], "500": [6, 7, 8], "time_sinc": 6, "zeros_lik": [6, 10, 12, 14, 18, 20], "set_titl": [6, 7, 8], "53": [6, 7, 8], "423038482666016": 6, "0x2aaabbdcb2e8": 6, "faster": [6, 7, 8, 14], "cleanup": 6, "pop": 6, "think": [6, 7, 8], "messi": [6, 7, 8], "got": [6, 7, 8], "cleaner": [6, 7, 8], "previous": [6, 7, 8, 14], "plai": [6, 7, 8], "difficult": [6, 7, 8, 19, 20], "rather": [6, 7, 8], "underutil": [6, 7, 8], "purpos": [6, 7, 8, 11, 14, 22], "feel": [6, 7, 8], "48": [6, 7, 8], "care": [6, 7, 8], "appropi": [6, 7, 8], "fly": [6, 7, 8], "12": [6, 7, 8], "13": [6, 7, 8], "geforc": [6, 7, 8, 10], "gtx": [6, 7, 8, 10], "titan": [6, 7, 8], "22305920124": 6, "779033613205": 6, "824838399887": 6, "900499212742": 6, "999763202667": 6, "727967989445": 6, "752479994297": 6, "797900807858": 6, "876627194881": 6, "93347837925": 6, "766662418842": 6, "803033602238": 6, "853574407101": 6, "971545600891": 6, "763775992393": 6, "791257584095": 6, "848044800758": 6, "922745585442": 6, "792595207691": 6, "822137594223": 6, "893279993534": 6, "well": [6, 7, 8, 10, 14, 16], "millisecond": [6, 7, 8], "averag": [6, 7, 8, 11, 16], "matter": [6, 7, 8, 11], "analyz": [6, 7, 8], "seem": [6, 7, 8], "vari": [6, 7, 8, 10, 14, 15], "addtion": [6, 7, 8], "among": [6, 7, 8, 12, 17], "128x32": [6, 7, 8], "likewis": [6, 7, 8], "becom": [6, 7, 8, 16, 17], "affect": [6, 7, 8, 14], "within": [6, 7, 8, 10, 14, 17], "exchang": [6, 7, 8], "fact": [6, 7, 8, 11], "commun": [6, 7, 8], "idea": [6, 7, 8, 11, 14, 22], "control": [6, 7, 8, 16, 17], "l2": [6, 7, 8], "closer": [6, 7, 8], "multiprocessor": [6, 7, 8], 
"l1": [6, 7, 8], "fine": [6, 7, 8], "grain": [6, 7, 8], "manag": [6, 7, 8, 14, 16], "cost": [6, 7, 8, 17], "instruct": [6, 7, 8, 9, 13, 14], "overhead": [6, 7, 8, 14], "degrad": [6, 7, 8], "intermedi": [6, 7, 8], "mind": [6, 7, 8], "14": [6, 7, 8], "tx": [6, 7, 8, 14], "ty": [6, 7, 8, 14], "bx": [6, 7, 8, 10], "__shared__": [6, 8, 14], "sh_u": [6, 7, 8], "pragma": [6, 7, 8, 14], "__syncthread": [6, 7, 8, 14], "75041918755": 6, "18713598251": 6, "09015038013": 6, "06844799519": 6, "09730558395": 6, "14420480728": 6, "05957758427": 6, "07508480549": 6, "0731967926": 6, "14729599953": 6, "08389122486": 6, "10700161457": 6, "10125439167": 6, "31661438942": 6, "0629119873": 6, "04807043076": 6, "054880023": 6, "12033278942": 6, "06672639847": 6, "05816960335": 6, "12000002861": 6, "sometim": [6, 7, 8, 19], "merg": [6, 7, 8, 14], "half": [6, 7, 8], "doubl": [6, 7, 8, 19, 20], "cover": [6, 7, 8, 17], "beyond": [6, 7, 8], "reduc": [6, 7, 8, 14], "condens": [6, 7, 8], "keep": [6, 7, 8, 14, 19], "importantli": [6, 7, 8], "worst": [6, 7, 8], "15": [6, 7, 8, 20], "tj": [6, 7, 8], "ti": [6, 7, 8, 10], "ad": [6, 7, 8, 11], "somehow": [6, 7, 8], "larger": [6, 7, 8, 11, 17, 20], "insid": [6, 7, 8, 11, 14, 20], "round": [6, 7, 8], "arithmet": [6, 7, 8], "evalu": [6, 7, 8, 14, 17], "759308815": 6, "29789438248": 6, "06983039379": 6, "2634239912": 6, "997139203548": 6, "843692803383": 6, "05549435616": 6, "862348806858": 6, "750636804104": 6, "19084160328": 6, "876377594471": 6, "714169609547": 6, "875001597404": 6, "691116797924": 6, "575859189034": 6, "759679996967": 6, "622867202759": 6, "650336003304": 6, "09794559479": 6, "826515209675": 6, "692665600777": 6, "78363519907": 6, "646092808247": 6, "554745602608": 6, "716115188599": 6, "581280004978": 6, "662566399574": 6, "07386879921": 6, "833420813084": 6, "705055999756": 6, "840755212307": 6, "652575993538": 6, "569388794899": 6, "689356791973": 6, "597267186642": 6, "675232005119": 6, "10033922195": 6, "860332798958": 6, 
"731891202927": 6, "867276787758": 6, "68781440258": 6, "595276796818": 6, "735436797142": 6, "60216319561": 6, "852166390419": 6, "15089921951": 6, "852575981617": 6, "705932807922": 6, "888671982288": 6, "673248004913": 6, "563417613506": 6, "761139214039": 6, "621254396439": 6, "676595199108": 6, "06709122658": 6, "804953610897": 6, "685670387745": 6, "801798415184": 6, "632006394863": 6, "542387211323": 6, "722668802738": 6, "578745603561": 6, "618598401546": 6, "08220798969": 6, "821881604195": 6, "687955200672": 6, "77759360075": 6, "618003201485": 6, "539891195297": 6, "705900788307": 6, "568556785583": 6, "624492788315": 6, "0799423933": 6, "832300806046": 6, "70140799284": 6, "835481595993": 6, "638348805904": 6, "550105595589": 6, "667251205444": 6, "576044797897": 6, "732409596443": 6, "15916161537": 6, "869497597218": 6, "733248019218": 6, "890803205967": 6, "677363204956": 6, "577215993404": 6, "730982398987": 6, "58035838604": 6, "10066559315": 6, "837804794312": 6, "691385602951": 6, "851040017605": 6, "666656005383": 6, "560505592823": 6, "771103990078": 6, "626163220406": 6, "694451200962": 6, "11514236927": 6, "837299215794": 6, "703302407265": 6, "806828796864": 6, "648620784283": 6, "562521612644": 6, "760915207863": 6, "605760002136": 6, "690009605885": 6, "10740480423": 6, "841631996632": 6, "700883197784": 6, "838195204735": 6, "649779188633": 6, "56585599184": 6, "7168192029": 6, "59088640213": 6, "69627519846": 6, "3269824028": 6, "02665598392": 6, "840908801556": 6, "03752319813": 6, "788345599174": 6, "662041604519": 6, "85437438488": 6, "680422389507": 6, "0759360075": 6, "801996803284": 6, "666003203392": 6, "808000004292": 6, "643359994888": 6, "544691193104": 6, "741964805126": 6, "60942081213": 6, "681350398064": 6, "05262081623": 6, "792108798027": 6, "66344319582": 6, "768064010143": 6, "625260794163": 6, "540352010727": 6, "721862399578": 6, "579411196709": 6, "626976013184": 6, "06332798004": 6, "808211183548": 6, "679372787476": 
6, "803718411922": 6, "627136015892": 6, "538227200508": 6, "682188808918": 6, "573836791515": 6, "725548803806": 6, "13023357391": 6, "843411195278": 6, "713843202591": 6, "85886080265": 6, "657920002937": 6, "565254402161": 6, "697094392776": 6, "579904007912": 6, "07484800816": 6, "801119995117": 6, "667347204685": 6, "799059200287": 6, "643820810318": 6, "542937588692": 6, "740518403053": 6, "615148806572": 6, "731334400177": 6, "07002239227": 6, "805299210548": 6, "675923216343": 6, "782060790062": 6, "631142401695": 6, "540383994579": 6, "723999989033": 6, "578681600094": 6, "726335990429": 6, "13297917843": 6, "844428789616": 6, "710278391838": 6, "835494399071": 6, "637958395481": 6, "567417597771": 6, "699366402626": 6, "588492810726": 6, "tri": [6, 7, 8, 17], "grow": [6, 7, 8], "quickli": [6, 7, 8], "went": [6, 7, 8, 10], "72": [6, 7, 8], "26": [6, 7, 8], "32x2": [6, 7, 8], "64x4": [6, 7, 8], "four": [6, 7, 8], "best_tim": [6, 7], "min": [6, 7], "05": [6, 7], "join": [6, 7], "nice": [6, 7], "stdout": [6, 7], "why": [6, 7, 11, 15], "easili": [6, 7, 16], "easi": [6, 7, 15, 16], "csv": [6, 7, 9], "analysi": [6, 7], "panda": [6, 7, 9, 13], "18": [6, 7, 8], "fp": [6, 7], "datafram": [6, 7], "df": [6, 7], "to_csv": [6, 7], "0x2aab1de088d0": 7, "01": 7, "sy": 7, "140": 7, "wall": 7, "98": 7, "__kernel": 7, "get_group_id": 7, "get_local_id": 7, "cl": 7, "ctx": 7, "create_some_context": 7, "mf": 7, "mem_flag": 7, "a_h": 7, "a_d": 7, "read_writ": 7, "copy_host_ptr": 7, "hostbuf": 7, "b_d": 7, "kernel_src": 7, "prg": 7, "queue": 7, "commandqueu": 7, "run_gpu": 7, "444": 7, "154": 7, "598": 7, "985": 7, "enqueue_copi": 7, "1748096": 7, "7284544": 7, "7707904": 7, "8573184": 7, "8380288": 7, "686528": 7, "69648": 7, "7461632": 7, "818304": 7, "771072": 7, "7190464": 7, "7522432": 7, "7982208": 7, "9624512": 7, "7214464": 7, "7453312": 7, "8028416": 7, "8922624": 7, "747328": 7, "7860736": 7, "8637184": 7, "__local": 7, "barrier": 7, "clk_local_mem_f": 7, "8449472": 7, 
"1912576": 7, "1035136": 7, "0927808": 7, "1140736": 7, "1790336": 7, "0808192": 7, "0809792": 7, "0836928": 7, "1545856": 7, "1249984": 7, "1264": 7, "1230336": 7, "4015104": 7, "0873216": 7, "0626496": 7, "0692224": 7, "140192": 7, "0801344": 7, "0688128": 7, "1428928": 7, "8844544": 7, "3245952": 7, "0911808": 7, "3039616": 7, "0079296": 7, "84848": 7, "0708288": 7, "857728": 7, "7561792": 7, "231072": 7, "8774336": 7, "7087296": 7, "8772672": 7, "6911872": 7, "5715968": 7, "7584896": 7, "6292032": 7, "6498688": 7, "1145664": 7, "8252928": 7, "6757568": 7, "7881152": 7, "6237696": 7, "544224": 7, "6951168": 7, "5648128": 7, "6452736": 7, "1065792": 7, "8313792": 7, "6905984": 7, "8302656": 7, "6367488": 7, "5478592": 7, "6660672": 7, "5719744": 7, "6551744": 7, "1384064": 7, "8531072": 7, "7078976": 7, "8516672": 7, "6677696": 7, "5685632": 7, "7074048": 7, "5753152": 7, "8228864": 7, "2124736": 7, "8633344": 7, "6921216": 7, "8896384": 7, "6659904": 7, "5582144": 7, "7522624": 7, "6081536": 7, "6664448": 7, "1095936": 7, "8063424": 7, "6717888": 7, "7982848": 7, "6263552": 7, "5289728": 7, "7008832": 7, "567456": 7, "5968704": 7, "1018432": 7, "8117248": 7, "6724736": 7, "7728576": 7, "6038336": 7, "5172352": 7, "6796352": 7, "5470016": 7, "5968448": 7, "1107712": 7, "8237248": 7, "6810944": 7, "821952": 7, "620352": 7, "5230208": 7, "6415552": 7, "5476864": 7, "7168192": 7, "1942016": 7, "8626304": 7, "7099712": 7, "9123328": 7, "6608448": 7, "5631168": 7, "7113024": 7, "556576": 7, "1583104": 7, "8384832": 7, "67856": 7, "845856": 7, "6581248": 7, "54944": 7, "7520064": 7, "6076224": 7, "6842112": 7, "1547072": 7, "8422016": 7, "6895552": 7, "8037312": 7, "6387072": 7, "5383296": 7, "7326656": 7, "5863488": 7, "6813376": 7, "1493952": 7, "8444928": 7, "6929216": 7, "832768": 7, "6389312": 7, "5412672": 7, "698336": 7, "5717568": 7, "676096": 7, "4303104": 7, "0341696": 7, "8365184": 7, "0398656": 7, "7786496": 7, "648928": 7, "8479232": 7, "6508544": 7, 
"1219392": 7, "7994048": 7, "6492288": 7, "8068416": 7, "6343168": 7, "5235328": 7, "7268928": 7, "5898432": 7, "6633536": 7, "0849664": 7, "7869632": 7, "6458624": 7, "7611968": 7, "613088": 7, "50912": 7, "6972928": 7, "5620608": 7, "601856": 7, "095232": 7, "7967488": 7, "6601472": 7, "7952896": 7, "6047296": 7, "5108224": 7, "6607744": 7, "5492416": 7, "7091136": 7, "171552": 7, "8473408": 7, "6962112": 7, "8663936": 7, "6466816": 7, "5475584": 7, "6754048": 7, "5591744": 7, "108896": 7, "7907264": 7, "6459328": 7, "7965888": 7, "6250816": 7, "5188416": 7, "721408": 7, "5920832": 7, "7068608": 7, "0909248": 7, "7930752": 7, "6524544": 7, "7745216": 7, "6146176": 7, "5116928": 7, "6975872": 7, "5548416": 7, "7075136": 7, "174624": 7, "8384512": 7, "69104": 7, "8335488": 7, "6264192": 7, "5445248": 7, "6719104": 7, "5592064": 7, "19": [7, 8], "solv": 8, "0x7f888f8cd7b8": 8, "4152": 8, "086019515991": 8, "0x7f8865b51f28": 8, "gpuarrai": [8, 10], "tool": [8, 10, 12], "autoinit": [8, 10], "to_gpu": [8, 10], "mod": [8, 10], "t0": [8, 10], "ona": 8, "33": 8, "46109390258789": 8, "0x7f8858b873c8": 8, "1080": [8, 10], "916985595226": 8, "489004802704": 8, "500524806976": 8, "513356792927": 8, "545715200901": 8, "486515200138": 8, "449055999517": 8, "44974719882": 8, "457427197695": 8, "492915201187": 8, "464863997698": 8, "466118401289": 8, "475264000893": 8, "513632011414": 8, "458412796259": 8, "457715201378": 8, "461017608643": 8, "475987195969": 8, "460032004118": 8, "457779198885": 8, "462649595737": 8, "kernel_string_shar": 8, "22673916817": 8, "826361596584": 8, "793516802788": 8, "782112002373": 8, "776639997959": 8, "795135998726": 8, "722777605057": 8, "762777590752": 8, "75422719717": 8, "804876792431": 8, "778656005859": 8, "769734406471": 8, "782495999336": 8, "932281601429": 8, "734028804302": 8, "721625590324": 8, "736511993408": 8, "800019192696": 8, "724966406822": 8, "722969603539": 8, "759430396557": 8, "kernel_string_til": 8, "22200961113": 8, 
"91601279974": 8, "752838408947": 8, "873651194572": 8, "69833599329": 8, "586931192875": 8, "516473591328": 8, "411392003298": 8, "384262400866": 8, "82159358263": 8, "632607996464": 8, "506457602978": 8, "618758392334": 8, "500288009644": 8, "429862397909": 8, "44995200038": 8, "366150397062": 8, "342201602459": 8, "793542397022": 8, "58026239872": 8, "494163197279": 8, "546316814423": 8, "467059195042": 8, "404249596596": 8, "440895992517": 8, "341376006603": 8, "339692795277": 8, "783923208714": 8, "597920000553": 8, "50277120471": 8, "615475213528": 8, "470937597752": 8, "418393599987": 8, "443519997597": 8, "343961596489": 8, "342540800571": 8, "780352008343": 8, "611705589294": 8, "515667212009": 8, "622534394264": 8, "502195191383": 8, "437388807535": 8, "45568639636": 8, "359289598465": 8, "426995199919": 8, "788947200775": 8, "616556799412": 8, "496121603251": 8, "629164803028": 8, "474841600657": 8, "407667201757": 8, "47406719923": 8, "371507203579": 8, "352531200647": 8, "72023679018": 8, "574816000462": 8, "481817597151": 8, "580928003788": 8, "455724793673": 8, "394975996017": 8, "464659202099": 8, "357107198238": 8, "324083191156": 8, "759910392761": 8, "569177603722": 8, "481279999018": 8, "528115200996": 8, "441734397411": 8, "393126398325": 8, "455404800177": 8, "350457596779": 8, "322547197342": 8, "754201591015": 8, "579827189445": 8, "491852802038": 8, "582751989365": 8, "451283198595": 8, "391807991266": 8, "456275194883": 8, "356716805696": 8, "362937599421": 8, "809894394875": 8, "60433280468": 8, "507142400742": 8, "655827200413": 8, "474092799425": 8, "408166396618": 8, "480531209707": 8, "346707201004": 8, "780134403706": 8, "601049602032": 8, "493900799751": 8, "620384001732": 8, "494553589821": 8, "425414395332": 8, "467033600807": 8, "375468802452": 8, "346079999208": 8, "771052801609": 8, "593977594376": 8, "49723520875": 8, "583270406723": 8, "478079998493": 8, "416320002079": 8, "443942397833": 8, "359744000435": 8, "343545603752": 
8, "780960011482": 8, "598758399487": 8, "498617601395": 8, "57678719759": 8, "46561280489": 8, "41324160099": 8, "431225597858": 8, "351263999939": 8, "34440960288": 8, "933260798454": 8, "715257608891": 8, "586604809761": 8, "711615991592": 8, "558771193027": 8, "466284793615": 8, "44043520093": 8, "361823999882": 8, "731839990616": 8, "57044479847": 8, "470220798254": 8, "608800005913": 8, "472665601969": 8, "416352003813": 8, "481376004219": 8, "380812799931": 8, "351923197508": 8, "719257593155": 8, "55171200037": 8, "466758400202": 8, "568435204029": 8, "459654402733": 8, "394380801916": 8, "463052803278": 8, "36409599781": 8, "328998398781": 8, "73579518795": 8, "564575994015": 8, "472236800194": 8, "549024009705": 8, "438406395912": 8, "389945602417": 8, "455193603039": 8, "364051198959": 8, "375519996881": 8, "798195195198": 8, "588998401165": 8, "49552000761": 8, "595462405682": 8, "460972803831": 8, "400672000647": 8, "465132802725": 8, "364627194405": 8, "729363203049": 8, "558815991879": 8, "466655993462": 8, "600819194317": 8, "460281592607": 8, "404908800125": 8, "478739196062": 8, "386668801308": 8, "385510402918": 8, "720915210247": 8, "550668799877": 8, "466937589645": 8, "564921605587": 8, "447974395752": 8, "394271999598": 8, "46233600378": 8, "365190398693": 8, "387827193737": 8, "762003195286": 8, "579007995129": 8, "486649608612": 8, "557331204414": 8, "443033593893": 8, "396070402861": 8, "457075202465": 8, "369555193186": 8, "wish": 8, "modifi": [8, 16], "tile_size_j": 8, "fixed_param": [8, 10], "ceil": [8, 10], "zip": [8, 10], "transfer": [8, 9, 11], "20": [8, 17], "21": 8, "618": 8, "2231903076172": 8, "0x7f887c3d2358": 8, "incorpor": 8, "ifndef": 8, "kerenel": 8, "psedo": 8, "below": [8, 9, 10, 11, 13, 14, 15, 16, 17, 19], "endif": 8, "bypass": 8, "usecas": 9, "test_vector_add": 9, "test_vector_add_parameter": 9, "illustr": 9, "dimension": [9, 10], "clean": [9, 14], "center": [9, 10], "lock": [9, 16], "overlap": [9, 11], "shuffl": 9, 
"pipelin": 9, "consist": [9, 14], "scipi": 9, "algorithm": [9, 12, 17], "cub": 9, "librari": [9, 16, 19], "gaussian": 10, "delv": 10, "hand": [10, 14], "sum_": 10, "exp": 10, "beta": [10, 17], "sqrt": 10, "y_i": 10, "z_i": 10, "vector": [10, 11, 18], "coordin": 10, "linalg": 10, "la": 10, "compute_grid": 10, "xgrid": 10, "ygrid": 10, "zgrid": 10, "x0": 10, "y0": 10, "z0": 10, "themselv": 10, "meshgrid": 10, "send": 10, "interv": 10, "256": [10, 12, 18], "suffici": [10, 15], "100": [10, 17], "randomli": [10, 17], "distribut": [10, 14], "linspac": 10, "cpu_grid": 10, "npt": 10, "rand": 10, "xyz": 10, "52320": 10, "160627": 10, "might": [10, 15], "nz": 10, "bz": 10, "kernel_cod": 10, "math": 10, "__host__": 10, "__device__": [10, 20], "b": [10, 12, 14, 17, 18, 20], "addgrid": 10, "xvect": 10, "yvect": 10, "zvect": 10, "dx": 10, "dy": 10, "dz": 10, "assign": 10, "explor": 10, "middl": 10, "henc": [10, 19], "56833920479": 10, "80796158314": 10, "940044796467": 10, "855628800392": 10, "855359995365": 10, "16174077988": 10, "11877760887": 10, "01592960358": 10, "849273598194": 10, "849235200882": 10, "19029750824": 10, "16199679375": 10, "40401918888": 10, "39618558884": 10, "39508478642": 10, "31647996902": 10, "31470079422": 10, "50787198544": 10, "53760001659": 10, "56709756851": 10, "34500494003": 10, "25130877495": 10, "50662400723": 10, "55267841816": 10, "17987194061": 10, "12309756279": 10, "01125121117": 10, "849631989002": 10, "853708791733": 10, "17051515579": 10, "15584001541": 10, "40074241161": 10, "39547519684": 10, "39331197739": 10, "30295038223": 10, "28725762367": 10, "39589118958": 10, "38867840767": 10, "37724158764": 10, "34344320297": 10, "26213116646": 10, "38793599606": 10, "3775359869": 10, "74003200531": 10, "13276162148": 10, "37233917713": 10, "18835201263": 10, "15777277946": 10, "40247042179": 10, "39366400242": 10, "39439997673": 10, "23719043732": 10, "28542718887": 10, "39207677841": 10, "38956804276": 10, "3778496027": 10, "29814395905": 
10, "26398081779": 10, "38625922203": 10, "3754431963": 10, "72981758118": 10, "12483196259": 10, "37322881222": 10, "61618566513": 10, "2194111824": 10, "17600002289": 10, "27082881927": 10, "38787200451": 10, "3835711956": 10, "37543039322": 10, "30227203369": 10, "23127679825": 10, "38627202511": 10, "37677440643": 10, "64358406067": 10, "12255358696": 10, "37474560738": 10, "61655673981": 10, "19179515839": 10, "99912958145": 10, "213971138": 10, "16430072784": 10, "38772480488": 10, "3735104084": 10, "54432649612": 10, "05524477959": 10, "36935677528": 10, "42449922562": 10, "10455036163": 10, "67516155243": 10, "programmat": 10, "With": [10, 11], "30": 10, "minimum": 10, "84": 10, "suit": 10, "grid_dim": 10, "associ": 10, "substitut": 10, "ourselv": 10, "extract": 10, "manual": [10, 13], "exlicitli": 10, "accur": [10, 16], "xgpu": 10, "ygpu": 10, "zgpu": 10, "grid_gpu": 10, "80": 10, "133200": 10, "lower": [10, 16, 17], "roughli": [10, 14], "40000": 10, "across": [11, 14], "handl": 11, "qualiti": 11, "itself": [11, 12], "precis": 11, "plain": 11, "omp_get_wtim": 11, "openmp": 11, "convolution_stream": 11, "complex": [11, 14], "behind": 11, "spread": 11, "back": 11, "split": 11, "chunk": 11, "slightli": [11, 14, 20], "account": [11, 14], "border": 11, "latter": 11, "cudastreamwaitev": 11, "num_stream": 11, "clarifi": 11, "fit": [11, 17], "choic": [11, 13], "grid_size_x": 11, "grid_size_i": 11, "cudamemcpytosymbol": 11, "upload": 11, "memcpi": 11, "yourself": 11, "spent": 11, "relat": [12, 15, 22], "famili": 12, "launcher": 12, "kt": [12, 19], "easiest": 12, "toolkit": [12, 13], "intend": 12, "Or": [12, 13], "vector_add": [12, 17, 18, 20], "10000000": 12, "512": [12, 18], "research": 12, "cite": 12, "articl": [12, 18], "author": 12, "ben": 12, "van": 12, "werkhoven": 12, "titl": 12, "auto": [12, 14, 16, 17, 20, 22], "journal": 12, "year": 12, "2019": 12, "volum": 12, "90": 12, "347": 12, "358": 12, "url": 12, "www": 12, "sciencedirect": 12, "scienc": 12, "pii": 
12, "s0167739x18313359": 12, "doi": 12, "1016": 12, "2018": 12, "08": 12, "004": 12, "willemsen2021bayesian": 12, "willemsen": [12, 17], "flori": 12, "jan": 12, "nieuwpoort": 12, "rob": 12, "bayesian": [12, 17], "workshop": 12, "pmb": 12, "supercomput": 12, "sc21": 12, "2021": 12, "arxiv": 12, "ab": 12, "2111": 12, "14991": 12, "schoonhoven2022benchmark": 12, "schoonhoven": 12, "richard": 12, "batenburg": 12, "joost": 12, "ieee": 12, "transact": 12, "evolutionari": 12, "2022": 12, "schoonhoven2022go": 12, "veenboer": 12, "bram": 12, "green": 12, "energi": [12, 16, 17, 22], "effici": [12, 14, 16], "steer": 12, "sc22": 12, "2211": 12, "07260": 12, "comprehens": 13, "recommend": [13, 19], "linux": 13, "download": 13, "wget": 13, "repo": 13, "continuum": 13, "io": 13, "miniconda3": 13, "x86_64": 13, "sh": 13, "newer": [13, 16], "nativ": 13, "command": 13, "prefix": 13, "home": 13, "pythonpath": 13, "bind": [13, 16], "older": 13, "troubl": 13, "retri": 13, "dir": 13, "wiki": 13, "tiker": 13, "net": 13, "amd": [13, 16], "app": 13, "sdk": 13, "intel": 13, "appl": 13, "beignet": 13, "stack": 13, "jatinx": 13, "navig": 13, "benvanwerkhoven": 13, "differenti": [13, 17], "chanc": [13, 17, 20], "algebra": 14, "frequent": 14, "programm": [14, 16], "row": 14, "column": 14, "squar": 14, "matric": 14, "matmul_na": 14, "width": 14, "matmul_kernel": 14, "height": 14, "Of": 14, "solut": [14, 16], "realiti": 14, "contant": 14, "denot": [14, 18], "sensibl": 14, "pick": 14, "word": 14, "warpsiz": 14, "namelijk": 14, "stand": 14, "briefli": 14, "figur": 14, "fifth": 14, "fourth": 14, "dramat": 14, "profil": 14, "util": 14, "pretti": 14, "opportun": 14, "realiz": 14, "collabor": 14, "bandwidth": 14, "techniqu": 14, "submatric": 14, "proce": 14, "matmul_shar": 14, "sa": 14, "sb": 14, "kb": 14, "outer": 14, "inner": 14, "race": 14, "drastic": 14, "consumpt": [14, 16], "due": [14, 20], "significantli": [14, 16], "fortun": 14, "benefit": 14, "redund": 14, "distinct": 14, "1xn": 14, "usag": 
[14, 16], "occup": 14, "goe": 14, "down": 14, "matmul": 14, "newli": 14, "coupl": 14, "respect": [14, 16], "independ": 14, "yield": 14, "discontinu": 14, "room": 14, "impos": 14, "report": [15, 16, 22], "possibli": 15, "_flop": 15, "total_flop": 15, "ps_energi": [15, 16, 22], "occur": 15, "exhaust": 15, "brute": [15, 17, 18], "forc": [15, 17, 18, 20], "maxim": 15, "boolean": [15, 16], "facilit": 16, "layer": 16, "act": 16, "hook": 16, "pattern": 16, "subscrib": 16, "benchmarkobserv": 16, "overwritten": 16, "extend": 16, "mandatori": 16, "get_result": 16, "aggreg": 16, "after_finish": 16, "after_start": 16, "before_start": 16, "register_configur": 16, "register_devic": 16, "dev": 16, "variou": [16, 18], "registerobserv": 16, "track": 16, "counter": 16, "num_reg": 16, "current_modul": 16, "powersensor2": 16, "pcie": 16, "intercept": 16, "sensor": 16, "transmit": 16, "usb": 16, "connect": 16, "advantag": 16, "instantan": 16, "frequenc": 16, "khz": 16, "pybind11": 16, "powersensor": [16, 22], "extern": [16, 20], "ps_power": [16, 22], "joul": [16, 22], "watt": [16, 22], "ttyacm0": 16, "core": 16, "voltag": 16, "thin": 16, "wrapper": [16, 20], "intricaci": 16, "friendli": 16, "mode": 16, "repeatedli": 16, "downsid": 16, "approach": 16, "save_al": 16, "nvidia_smi_fallback": 16, "use_locked_clock": 16, "continous_dur": 16, "monitor": 16, "clock": [16, 22], "power_read": [16, 22], "nvml_power": [16, 22], "nvml_energi": [16, 22], "core_freq": [16, 22], "mem_freq": [16, 22], "gr_voltag": 16, "ordin": 16, "identifi": 16, "smi": 16, "root": 16, "privileg": 16, "opt": 16, "amper": 16, "continuous_dur": 16, "common": [16, 20], "cap": 16, "popular": 16, "nvml_gr_clock": [16, 22], "nvml_mem_clock": [16, 22], "nvml_pwr_limit": [16, 22], "graphic": [16, 22], "jetson": 16, "rapl": 16, "xilinx": 16, "pmt": 16, "astron": 16, "nl": 16, "rd": 16, "meter": 16, "arduino": 16, "_energi": 16, "_power": 16, "acceler": 17, "prohibit": 17, "slow": 17, "wast": 17, "basin": 17, "hop": 17, "dual": 
17, "anneal": 17, "evolut": 17, "firefli": 17, "genet": 17, "greedi": 17, "local": 17, "multi": 17, "particl": 17, "swarm": 17, "mechan": 17, "overrid": 17, "time_limit": 17, "uniqu": 17, "count": 17, "searchspac": 17, "runner": 17, "nelder": 17, "mead": 17, "powel": 17, "cg": 17, "bfg": 17, "l": 17, "tnc": 17, "cobyla": 17, "slsqp": 17, "reject": 17, "thesi": 17, "generate_normalized_param_dict": 17, "denorm": 17, "normalize_parameter_spac": 17, "param_spac": 17, "prune_parameter_spac": 17, "normalize_dict": 17, "prune": 17, "hyperparamet": 17, "popul": 17, "best1bin": 17, "best1exp": 17, "rand1exp": 17, "randtobest1exp": 17, "best2exp": 17, "rand2exp": 17, "randtobest1bin": 17, "best2bin": 17, "rand2bin": 17, "rand1bin": 17, "popsiz": 17, "maxit": 17, "constr": 17, "compute_intens": 17, "fun": 17, "intens": 17, "distance_to": 17, "euclidian": 17, "move_toward": 17, "alpha": 17, "toward": 17, "b0": 17, "attract": 17, "gamma": 17, "light": 17, "absorpt": 17, "coeffici": 17, "disruptive_uniform_crossov": 17, "dna1": 17, "dna2": 17, "disrupt": 17, "uniform": 17, "crossov": 17, "uniformli": 17, "gene": 17, "children": 17, "guarante": 17, "parent": 17, "mutat": 17, "dna": 17, "mutation_ch": 17, "single_point_crossov": 17, "index": 17, "single_point": 17, "two_point": 17, "disruptive_uniform": 17, "two_point_crossov": 17, "uniform_crossov": 17, "weighted_choic": 17, "probabl": 17, "il": 17, "neighbor": 17, "ham": 17, "adjac": 17, "greedy": 17, "soon": 17, "no_improv": 17, "exce": 17, "50": 17, "random_walk": 17, "hillclimb": 17, "travers": 17, "inertia": 17, "c1": 17, "cognit": 17, "c2": 17, "social": 17, "fraction": 17, "acceptance_prob": 17, "old_cost": 17, "new_cost": 17, "modif": [17, 19], "po": 17, "t_min": 17, "001": 17, "995": 17, "vector_add_kernel": 18, "wise": 18, "1000000": [18, 20], "recogn": 18, "alright": 18, "issu": 19, "portabl": 19, "stick": 19, "pointer": 19, "primit": 19, "lead": 19, "ineffici": 19, "situat": 19, "scientif": 19, "sens": 19, 
"experiment": 19, "pack": 19, "consult": 19, "create_receive_spec_struct": 19, "0l": 19, "pad": 19, "8byte": 19, "packstr": 19, "iiiiiiiiiiippi": 19, "fffi": 19, "nsampl": 19, "nsamplesiq": 19, "nslowtimesampl": 19, "nchannel": 19, "ntx": 19, "nrepeat": 19, "nfasttimesampl": 19, "rfsize": 19, "mnrow": 19, "mnrowsiq": 19, "nactivechannel": 19, "isiq": 19, "fsiq": 19, "fc": 19, "nbuffer": 19, "frombuff": 19, "len": 19, "receive_spec": 19, "bf": 19, "rf": 19, "recon": 19, "sync": 19, "length": 19, "slight": 19, "matlab": 20, "typenam": 20, "my_typ": 20, "linkag": 20, "regardless": 20, "demot": 20, "rewrit": 20, "real": 20, "risk": 20, "nvrtc": 20, "seper": 20, "block_size_": 22, "grid_size_": 22, "compiler_opt_": 22, "loop_unroll_factor_": 22, "nvml_": 22, "nvml": 22, "nvmlobserv": 22}, "objects": {"kernel_tuner.backends.cupy": [[5, 0, 1, "", "CupyFunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.backends.hip": [[5, 0, 1, "", "HipFunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.backends.nvcuda": [[5, 0, 1, "", 
"CudaFunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.backends.opencl": [[5, 0, 1, "", "OpenCLFunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.backends.pycuda": [[5, 0, 1, "", "PyCudaFunctions"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.core": [[5, 0, 1, "", "DeviceInterface"]], "kernel_tuner.core.DeviceInterface": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "benchmark"], [5, 1, 1, "", "benchmark_continuous"], [5, 1, 1, "", "benchmark_default"], [5, 1, 1, "", "check_kernel_output"], [5, 1, 1, "", "compile_kernel"], [5, 1, 1, "", 
"copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "create_kernel_instance"], [5, 1, 1, "", "get_environment"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "preprocess_gpu_arguments"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"]], "kernel_tuner.observers": [[16, 0, 1, "", "BenchmarkObserver"]], "kernel_tuner.observers.BenchmarkObserver": [[16, 1, 1, "", "after_finish"], [16, 1, 1, "", "after_start"], [16, 1, 1, "", "before_start"], [16, 1, 1, "", "during"], [16, 1, 1, "", "get_results"], [16, 1, 1, "", "register_configuration"], [16, 1, 1, "", "register_device"]], "kernel_tuner.observers.nvml": [[16, 0, 1, "", "NVMLObserver"]], "kernel_tuner.observers.pmt": [[16, 0, 1, "", "PMTObserver"]], "kernel_tuner.observers.powersensor": [[16, 0, 1, "", "PowerSensorObserver"]], "kernel_tuner.runners.sequential": [[5, 0, 1, "", "SequentialRunner"]], "kernel_tuner.runners.sequential.SequentialRunner": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "run"]], "kernel_tuner.runners.simulation": [[5, 0, 1, "", "SimulationRunner"]], "kernel_tuner.runners.simulation.SimulationRunner": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "run"]], "kernel_tuner.strategies": [[17, 2, 0, "-", "basinhopping"], [17, 2, 0, "-", "bayes_opt"], [17, 2, 0, "-", "brute_force"], [5, 2, 0, "-", "common"], [17, 2, 0, "-", "diff_evo"], [17, 2, 0, "-", "dual_annealing"], [17, 2, 0, "-", "firefly_algorithm"], [17, 2, 0, "-", "genetic_algorithm"], [17, 2, 0, "-", "greedy_ils"], [17, 2, 0, "-", "greedy_mls"], [17, 2, 0, "-", "minimize"], [17, 2, 0, "-", "mls"], [17, 2, 0, "-", "ordered_greedy_mls"], [17, 2, 0, "-", "pso"], [17, 2, 0, "-", "random_sample"], [17, 2, 0, "-", "simulated_annealing"]], "kernel_tuner.strategies.basinhopping": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.bayes_opt": [[17, 3, 1, "", "generate_normalized_param_dicts"], [17, 3, 1, "", "normalize_parameter_space"], [17, 3, 1, "", 
"prune_parameter_space"], [17, 3, 1, "", "tune"]], "kernel_tuner.strategies.brute_force": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.common": [[5, 3, 1, "", "get_options"], [5, 3, 1, "", "get_strategy_docstring"], [5, 3, 1, "", "make_strategy_options_doc"], [5, 3, 1, "", "scale_from_params"], [5, 3, 1, "", "setup_method_arguments"], [5, 3, 1, "", "setup_method_options"], [5, 3, 1, "", "snap_to_nearest_config"], [5, 3, 1, "", "unscale_and_snap_to_nearest"]], "kernel_tuner.strategies.diff_evo": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.dual_annealing": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm": [[17, 0, 1, "", "Firefly"], [17, 3, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm.Firefly": [[17, 1, 1, "", "compute_intensity"], [17, 1, 1, "", "distance_to"], [17, 1, 1, "", "move_towards"]], "kernel_tuner.strategies.genetic_algorithm": [[17, 3, 1, "", "disruptive_uniform_crossover"], [17, 3, 1, "", "mutate"], [17, 3, 1, "", "single_point_crossover"], [17, 3, 1, "", "tune"], [17, 3, 1, "", "two_point_crossover"], [17, 3, 1, "", "uniform_crossover"], [17, 3, 1, "", "weighted_choice"]], "kernel_tuner.strategies.greedy_ils": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.greedy_mls": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.minimize": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.mls": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.ordered_greedy_mls": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.pso": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.random_sample": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.simulated_annealing": [[17, 3, 1, "", "acceptance_prob"], [17, 3, 1, "", "neighbor"], [17, 3, 1, "", "tune"]], "kernel_tuner": [[5, 2, 0, "-", "util"]], "kernel_tuner.util": [[5, 0, 1, "", "CompilationFailedConfig"], [5, 0, 1, "", "ErrorConfig"], [5, 0, 1, "", "InvalidConfig"], [5, 0, 1, "", "NpEncoder"], [5, 0, 1, "", "RuntimeFailedConfig"], [5, 4, 1, "", "SkippableFailure"], 
[5, 4, 1, "", "StopCriterionReached"], [5, 3, 1, "", "check_argument_list"], [5, 3, 1, "", "check_argument_type"], [5, 3, 1, "", "check_restrictions"], [5, 3, 1, "", "check_stop_criterion"], [5, 3, 1, "", "check_thread_block_dimensions"], [5, 3, 1, "", "check_tune_params_list"], [5, 3, 1, "", "compile_restrictions"], [5, 3, 1, "", "config_valid"], [5, 3, 1, "", "convert_constraint_restriction"], [5, 3, 1, "", "cuda_error_check"], [5, 3, 1, "", "delete_temp_file"], [5, 3, 1, "", "detect_language"], [5, 3, 1, "", "dump_cache"], [5, 3, 1, "", "get_best_config"], [5, 3, 1, "", "get_config_string"], [5, 3, 1, "", "get_grid_dimensions"], [5, 3, 1, "", "get_instance_string"], [5, 3, 1, "", "get_kernel_string"], [5, 3, 1, "", "get_problem_size"], [5, 3, 1, "", "get_smem_args"], [5, 3, 1, "", "get_temp_filename"], [5, 3, 1, "", "get_thread_block_dimensions"], [5, 3, 1, "", "get_total_timings"], [5, 3, 1, "", "looks_like_a_filename"], [5, 3, 1, "", "normalize_verify_function"], [5, 3, 1, "", "parse_restrictions"], [5, 3, 1, "", "prepare_kernel_string"], [5, 3, 1, "", "print_config"], [5, 3, 1, "", "print_config_output"], [5, 3, 1, "", "process_cache"], [5, 3, 1, "", "process_metrics"], [5, 3, 1, "", "read_cache"], [5, 3, 1, "", "read_file"], [5, 3, 1, "", "replace_param_occurrences"], [5, 3, 1, "", "setup_block_and_grid"], [5, 3, 1, "", "store_cache"], [5, 3, 1, "", "write_file"]], "kernel_tuner.util.NpEncoder": [[5, 1, 1, "", "default"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:module", "3": "py:function", "4": "py:exception"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "module", "Python module"], "3": ["py", "function", "Python function"], "4": ["py", "exception", "Python exception"]}, "titleterms": {"cach": 0, "file": 0, "The": [1, 12], "kernel": [1, 6, 7, 8, 9, 10, 12, 14, 20], "tuner": [1, 6, 7, 8, 9, 10, 12], "document": [1, 2, 5, 12, 21], "guid": [1, 2, 13], "featur": 1, "refer": 1, 
"contribut": 2, "report": 2, "issu": 2, "code": [2, 6, 7, 8, 9, 11], "develop": 2, "environ": 2, "local": [2, 7], "setup": 2, "cluster": 2, "run": [2, 8], "test": [2, 3], "build": 2, "convolut": [3, 9], "2d": 3, "exampl": [3, 9, 12, 20], "implement": [3, 6, 7, 8], "tune": [3, 6, 7, 8, 10, 11, 14, 15, 16], "more": 3, "tunabl": 3, "paramet": [3, 8, 10, 16, 22], "correct": 4, "verif": 4, "design": 5, "strategi": [5, 17], "kernel_tun": [5, 17], "common": 5, "runner": 5, "sequenti": 5, "sequentialrunn": 5, "simulationrunn": 5, "devic": 5, "interfac": 5, "core": 5, "deviceinterfac": 5, "backend": [5, 20], "pycuda": [5, 13], "pycudafunct": 5, "cupi": 5, "cupyfunct": 5, "nvcuda": 5, "cudafunct": 5, "opencl": [5, 13], "openclfunct": 5, "c": [5, 8], "cfunction": 5, "hip": [5, 13], "hipfunct": 5, "util": 5, "function": 5, "diffus": [6, 7, 8], "python": [6, 7, 8, 13], "comput": [6, 7, 8], "gpu": [6, 7, 8, 10], "auto": [6, 7, 8], "us": [6, 7, 8, 10, 14, 19], "share": [6, 7, 8, 14], "memori": [6, 7, 8, 14], "tile": [6, 7, 8], "store": [6, 7], "result": [6, 7], "tutori": [7, 8], "from": [7, 8], "physic": [7, 8], "best": 8, "product": 8, "vector": 9, "add": 9, "stencil": 9, "matrix": [9, 14], "multipl": [9, 14], "py": 9, "sepconv": 9, "convolution_correct": 9, "convolution_stream": 9, "reduct": 9, "spars": 9, "point": 9, "polygon": 9, "expdist": 9, "gener": 9, "3d": 10, "grid": 10, "let": 10, "": 10, "start": [10, 18], "cpu": 10, "move": 10, "optim": [10, 17], "host": 11, "number": 11, "stream": 11, "quick": 12, "instal": [12, 13], "usag": 12, "citat": 12, "packag": 13, "cuda": [13, 14], "pyopencl": 13, "pyhip": 13, "git": 13, "version": 13, "depend": 13, "naiv": 14, "increas": 14, "work": 14, "per": 14, "thread": 14, "metric": 15, "object": 15, "observ": 16, "powersensorobserv": 16, "nvmlobserv": 16, "execut": 16, "nvml": 16, "pmtobserv": 16, "basinhop": 17, "bayes_opt": 17, "brute_forc": 17, "diff_evo": 17, "dual_ann": 17, "firefly_algorithm": 17, "genetic_algorithm": 17, 
"greedy_il": 17, "greedy_ml": 17, "minim": 17, "ml": 17, "ordered_greedy_ml": 17, "pso": 17, "random_sampl": 17, "simulated_ann": 17, "get": 18, "struct": 19, "templat": 20, "select": 20, "api": 21, "vocabulari": 22}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "nbsphinx": 4, "sphinx": 58}, "alltitles": {"Cache files": [[0, "cache-files"]], "The Kernel Tuner documentation": [[1, "the-kernel-tuner-documentation"], [12, "the-kernel-tuner-documentation"]], "Kernel Tuner": [[1, null]], "Guides": [[1, null]], "Features": [[1, null]], "Reference": [[1, null]], "Contribution guide": [[2, "contribution-guide"]], "Reporting Issues": [[2, "reporting-issues"]], "Contributing Code": [[2, "contributing-code"]], "Development environment": [[2, "development-environment"]], "Local setup": [[2, "local-setup"]], "Cluster setup": [[2, "cluster-setup"]], "Running tests": [[2, "running-tests"]], "Building documentation": [[2, "building-documentation"]], "Convolution": [[3, "Convolution"], [9, "convolution"]], "2D Convolution example": [[3, "2D-Convolution-example"]], "Implement a test": [[3, "Implement-a-test"]], "Tuning 2D Convolution": [[3, "Tuning-2D-Convolution"]], "More tunable parameters": [[3, "More-tunable-parameters"]], "Correctness Verification": [[4, "correctness-verification"]], "Design documentation": [[5, "design-documentation"]], "Strategies": [[5, "strategies"]], "kernel_tuner.strategies.common": [[5, "module-kernel_tuner.strategies.common"]], "Runners": [[5, "runners"]], "kernel_tuner.runners.sequential.SequentialRunner": [[5, "kernel-tuner-runners-sequential-sequentialrunner"]], "kernel_tuner.runners.sequential.SimulationRunner": [[5, "kernel-tuner-runners-sequential-simulationrunner"]], "Device Interfaces": [[5, 
"device-interfaces"]], "kernel_tuner.core.DeviceInterface": [[5, "kernel-tuner-core-deviceinterface"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[5, "kernel-tuner-backends-pycuda-pycudafunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[5, "kernel-tuner-backends-cupy-cupyfunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[5, "kernel-tuner-backends-nvcuda-cudafunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[5, "kernel-tuner-backends-opencl-openclfunctions"]], "kernel_tuner.backends.c.CFunctions": [[5, "kernel-tuner-backends-c-cfunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[5, "kernel-tuner-backends-hip-hipfunctions"]], "Util Functions": [[5, "util-functions"]], "kernel_tuner.util": [[5, "module-kernel_tuner.util"]], "Diffusion": [[6, "Diffusion"], [6, "id1"], [7, "Diffusion"], [8, "Diffusion"]], "Python implementation": [[6, "Python-implementation"], [7, "Python-implementation"], [8, "Python-implementation"]], "Computing on the GPU": [[6, "Computing-on-the-GPU"], [7, "Computing-on-the-GPU"], [8, "Computing-on-the-GPU"]], "Auto-Tuning with the Kernel Tuner": [[6, "Auto-Tuning-with-the-Kernel-Tuner"], [7, "Auto-Tuning-with-the-Kernel-Tuner"], [8, "Auto-Tuning-with-the-Kernel-Tuner"]], "Using Shared Memory": [[6, "Using-Shared-Memory"]], "Tiling GPU Code": [[6, "Tiling-GPU-Code"], [7, "Tiling-GPU-Code"], [8, "Tiling-GPU-Code"]], "Storing the results": [[6, "Storing-the-results"], [7, "Storing-the-results"]], "Tutorial: From physics to tuned GPU kernels": [[7, "Tutorial:-From-physics-to-tuned-GPU-kernels"], [8, "Tutorial:-From-physics-to-tuned-GPU-kernels"]], "Using Shared (local) Memory": [[7, "Using-Shared-(local)-Memory"]], "Using shared memory": [[8, "Using-shared-memory"], [14, "Using-shared-memory"]], "Using the best parameters in a production run": [[8, "Using-the-best-parameters-in-a-production-run"]], "Python run": [[8, "Python-run"]], "C run": [[8, "C-run"]], "Kernel Tuner Examples": [[9, 
"kernel-tuner-examples"]], "Vector Add": [[9, "vector-add"]], "Stencil": [[9, "stencil"]], "Matrix Multiplication": [[9, "matrix-multiplication"]], "convolution.py": [[9, "convolution-py"]], "sepconv.py": [[9, "sepconv-py"]], "convolution_correct.py": [[9, "convolution-correct-py"]], "convolution_streams.py": [[9, "convolution-streams-py"]], "Reduction": [[9, "reduction"]], "Sparse Matrix Vector Multiplication": [[9, "sparse-matrix-vector-multiplication"]], "Point-in-Polygon": [[9, "point-in-polygon"]], "ExpDist": [[9, "expdist"]], "Code Generator": [[9, "code-generator"]], "3D Grid on GPU with Kernel Tuner": [[10, "3D-Grid-on-GPU-with-Kernel-Tuner"]], "Let\u2019s start on the CPU": [[10, "Let's-start-on-the-CPU"]], "Let\u2019s move to the GPU": [[10, "Let's-move-to-the-GPU"]], "Tune the kernel": [[10, "Tune-the-kernel"]], "Using the optimized parameters": [[10, "Using-the-optimized-parameters"]], "Tuning Host Code": [[11, "tuning-host-code"]], "Tuning the number of streams": [[11, "tuning-the-number-of-streams"]], "Quick install": [[12, "quick-install"]], "Example usage": [[12, "example-usage"]], "Citation": [[12, "citation"]], "Installation": [[13, "installation"]], "Python": [[13, "python"]], "Installing Python Packages": [[13, "installing-python-packages"]], "CUDA and PyCUDA": [[13, "cuda-and-pycuda"]], "OpenCL and PyOpenCL": [[13, "opencl-and-pyopencl"]], "HIP and PyHIP": [[13, "hip-and-pyhip"]], "Installing the git version": [[13, "installing-the-git-version"]], "Dependencies for the guides": [[13, "dependencies-for-the-guides"]], "Matrix multiplication": [[14, "Matrix-multiplication"]], "Naive CUDA kernel": [[14, "Naive-CUDA-kernel"]], "Tuning a naive kernel": [[14, "Tuning-a-naive-kernel"]], "Increase work per thread": [[14, "Increase-work-per-thread"]], "Metrics and Objectives": [[15, "metrics-and-objectives"]], "Metrics": [[15, "metrics"]], "Tuning Objectives": [[15, "tuning-objectives"]], "Observers": [[16, "observers"]], "PowerSensorObserver": [[16, 
"powersensorobserver"]], "NVMLObserver": [[16, "nvmlobserver"]], "Tuning execution parameters with NVML": [[16, "tuning-execution-parameters-with-nvml"]], "PMTObserver": [[16, "pmtobserver"]], "Optimization strategies": [[17, "optimization-strategies"]], "kernel_tuner.strategies.basinhopping": [[17, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[17, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[17, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.diff_evo": [[17, "module-kernel_tuner.strategies.diff_evo"]], "kernel_tuner.strategies.dual_annealing": [[17, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[17, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[17, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[17, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[17, "module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[17, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[17, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[17, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[17, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[17, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[17, "module-kernel_tuner.strategies.simulated_annealing"]], "Getting Started": [[18, "getting-started"]], "Using structs": [[19, "using-structs"]], "Templated kernels": [[20, "templated-kernels"]], "Example": [[20, "example"]], "Selecting a backend": [[20, "selecting-a-backend"]], "API Documentation": [[21, "api-documentation"]], "Parameter Vocabulary": [[22, "parameter-vocabulary"]]}, 
"indexentries": {"compilationfailedconfig (class in kernel_tuner.util)": [[5, "kernel_tuner.util.CompilationFailedConfig"]], "cudafunctions (class in kernel_tuner.backends.nvcuda)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions"]], "cupyfunctions (class in kernel_tuner.backends.cupy)": [[5, "kernel_tuner.backends.cupy.CupyFunctions"]], "deviceinterface (class in kernel_tuner.core)": [[5, "kernel_tuner.core.DeviceInterface"]], "errorconfig (class in kernel_tuner.util)": [[5, "kernel_tuner.util.ErrorConfig"]], "hipfunctions (class in kernel_tuner.backends.hip)": [[5, "kernel_tuner.backends.hip.HipFunctions"]], "invalidconfig (class in kernel_tuner.util)": [[5, "kernel_tuner.util.InvalidConfig"]], "npencoder (class in kernel_tuner.util)": [[5, "kernel_tuner.util.NpEncoder"]], "openclfunctions (class in kernel_tuner.backends.opencl)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions"]], "pycudafunctions (class in kernel_tuner.backends.pycuda)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions"]], "runtimefailedconfig (class in kernel_tuner.util)": [[5, "kernel_tuner.util.RuntimeFailedConfig"]], "sequentialrunner (class in kernel_tuner.runners.sequential)": [[5, "kernel_tuner.runners.sequential.SequentialRunner"]], "simulationrunner (class in kernel_tuner.runners.simulation)": [[5, "kernel_tuner.runners.simulation.SimulationRunner"]], "skippablefailure": [[5, "kernel_tuner.util.SkippableFailure"]], "stopcriterionreached": [[5, "kernel_tuner.util.StopCriterionReached"]], "__init__() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.__init__"]], "__init__() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.__init__"]], "__init__() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.__init__"]], "__init__() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, 
"kernel_tuner.backends.opencl.OpenCLFunctions.__init__"]], "__init__() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.__init__"]], "__init__() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.__init__"]], "__init__() (kernel_tuner.runners.sequential.sequentialrunner method)": [[5, "kernel_tuner.runners.sequential.SequentialRunner.__init__"]], "__init__() (kernel_tuner.runners.simulation.simulationrunner method)": [[5, "kernel_tuner.runners.simulation.SimulationRunner.__init__"]], "benchmark() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.benchmark"]], "benchmark_continuous() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.benchmark_continuous"]], "benchmark_default() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.benchmark_default"]], "check_argument_list() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_argument_list"]], "check_argument_type() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_argument_type"]], "check_kernel_output() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.check_kernel_output"]], "check_restrictions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_restrictions"]], "check_stop_criterion() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_stop_criterion"]], "check_thread_block_dimensions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_thread_block_dimensions"]], "check_tune_params_list() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_tune_params_list"]], "compile() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.compile"]], "compile() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.compile"]], "compile() 
(kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.compile"]], "compile() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.compile"]], "compile() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.compile"]], "compile_kernel() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.compile_kernel"]], "compile_restrictions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.compile_restrictions"]], "config_valid() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.config_valid"]], "convert_constraint_restriction() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.convert_constraint_restriction"]], "copy_constant_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.copy_constant_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": 
[[5, "kernel_tuner.backends.hip.HipFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.copy_shared_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.copy_texture_memory_args"]], "create_kernel_instance() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.create_kernel_instance"]], "cuda_error_check() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.cuda_error_check"]], "default() (kernel_tuner.util.npencoder method)": [[5, "kernel_tuner.util.NpEncoder.default"]], "delete_temp_file() (in module 
kernel_tuner.util)": [[5, "kernel_tuner.util.delete_temp_file"]], "detect_language() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.detect_language"]], "dump_cache() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.dump_cache"]], "get_best_config() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_best_config"]], "get_config_string() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_config_string"]], "get_environment() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.get_environment"]], "get_grid_dimensions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_grid_dimensions"]], "get_instance_string() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_instance_string"]], "get_kernel_string() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_kernel_string"]], "get_options() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.get_options"]], "get_problem_size() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_problem_size"]], "get_smem_args() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_smem_args"]], "get_strategy_docstring() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.get_strategy_docstring"]], "get_temp_filename() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_temp_filename"]], "get_thread_block_dimensions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_thread_block_dimensions"]], "get_total_timings() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_total_timings"]], "kernel_finished() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, 
"kernel_tuner.backends.nvcuda.CudaFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.kernel_finished"]], "kernel_tuner.strategies.common": [[5, "module-kernel_tuner.strategies.common"]], "kernel_tuner.util": [[5, "module-kernel_tuner.util"]], "looks_like_a_filename() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.looks_like_a_filename"]], "make_strategy_options_doc() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.make_strategy_options_doc"]], "memcpy_dtoh() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.memcpy_dtoh"]], "memcpy_htod() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.opencl.openclfunctions method)": 
[[5, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_htod"]], "memset() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.memset"]], "memset() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.memset"]], "memset() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.memset"]], "memset() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.memset"]], "memset() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.memset"]], "module": [[5, "module-kernel_tuner.strategies.common"], [5, "module-kernel_tuner.util"], [17, "module-kernel_tuner.strategies.basinhopping"], [17, "module-kernel_tuner.strategies.bayes_opt"], [17, "module-kernel_tuner.strategies.brute_force"], [17, "module-kernel_tuner.strategies.diff_evo"], [17, "module-kernel_tuner.strategies.dual_annealing"], [17, "module-kernel_tuner.strategies.firefly_algorithm"], [17, "module-kernel_tuner.strategies.genetic_algorithm"], [17, "module-kernel_tuner.strategies.greedy_ils"], [17, "module-kernel_tuner.strategies.greedy_mls"], [17, "module-kernel_tuner.strategies.minimize"], [17, "module-kernel_tuner.strategies.mls"], [17, "module-kernel_tuner.strategies.ordered_greedy_mls"], [17, "module-kernel_tuner.strategies.pso"], [17, "module-kernel_tuner.strategies.random_sample"], [17, "module-kernel_tuner.strategies.simulated_annealing"]], "normalize_verify_function() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.normalize_verify_function"]], "parse_restrictions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.parse_restrictions"]], "prepare_kernel_string() (in module kernel_tuner.util)": [[5, 
"kernel_tuner.util.prepare_kernel_string"]], "preprocess_gpu_arguments() (kernel_tuner.core.deviceinterface static method)": [[5, "kernel_tuner.core.DeviceInterface.preprocess_gpu_arguments"]], "print_config() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.print_config"]], "print_config_output() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.print_config_output"]], "process_cache() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.process_cache"]], "process_metrics() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.process_metrics"]], "read_cache() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.read_cache"]], "read_file() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.read_file"]], "ready_argument_list() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.ready_argument_list"]], "replace_param_occurrences() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.replace_param_occurrences"]], "run() (kernel_tuner.runners.sequential.sequentialrunner method)": [[5, "kernel_tuner.runners.sequential.SequentialRunner.run"]], "run() (kernel_tuner.runners.simulation.simulationrunner method)": [[5, "kernel_tuner.runners.simulation.SimulationRunner.run"]], 
"run_kernel() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.run_kernel"]], "run_kernel() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.run_kernel"]], "scale_from_params() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.scale_from_params"]], "setup_block_and_grid() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.setup_block_and_grid"]], "setup_method_arguments() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.setup_method_arguments"]], "setup_method_options() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.setup_method_options"]], "snap_to_nearest_config() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.snap_to_nearest_config"]], "start_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.start_event"]], "start_event() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.start_event"]], "start_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.start_event"]], "start_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.start_event"]], "start_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": 
[[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.start_event"]], "stop_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.stop_event"]], "store_cache() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.store_cache"]], "synchronize() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.synchronize"]], "unscale_and_snap_to_nearest() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.unscale_and_snap_to_nearest"]], "write_file() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.write_file"]], "benchmarkobserver (class in kernel_tuner.observers)": [[16, "kernel_tuner.observers.BenchmarkObserver"]], "nvmlobserver (class in kernel_tuner.observers.nvml)": [[16, "kernel_tuner.observers.nvml.NVMLObserver"]], "pmtobserver (class in kernel_tuner.observers.pmt)": [[16, 
"kernel_tuner.observers.pmt.PMTObserver"]], "powersensorobserver (class in kernel_tuner.observers.powersensor)": [[16, "kernel_tuner.observers.powersensor.PowerSensorObserver"]], "after_finish() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.after_finish"]], "after_start() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.after_start"]], "before_start() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.before_start"]], "during() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.during"]], "get_results() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.get_results"]], "register_configuration() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.register_configuration"]], "register_device() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.register_device"]], "firefly (class in kernel_tuner.strategies.firefly_algorithm)": [[17, "kernel_tuner.strategies.firefly_algorithm.Firefly"]], "acceptance_prob() (in module kernel_tuner.strategies.simulated_annealing)": [[17, "kernel_tuner.strategies.simulated_annealing.acceptance_prob"]], "compute_intensity() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[17, "kernel_tuner.strategies.firefly_algorithm.Firefly.compute_intensity"]], "disruptive_uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.disruptive_uniform_crossover"]], "distance_to() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[17, "kernel_tuner.strategies.firefly_algorithm.Firefly.distance_to"]], "generate_normalized_param_dicts() (in module kernel_tuner.strategies.bayes_opt)": [[17, 
"kernel_tuner.strategies.bayes_opt.generate_normalized_param_dicts"]], "kernel_tuner.strategies.basinhopping": [[17, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[17, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[17, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.diff_evo": [[17, "module-kernel_tuner.strategies.diff_evo"]], "kernel_tuner.strategies.dual_annealing": [[17, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[17, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[17, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[17, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[17, "module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[17, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[17, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[17, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[17, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[17, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[17, "module-kernel_tuner.strategies.simulated_annealing"]], "move_towards() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[17, "kernel_tuner.strategies.firefly_algorithm.Firefly.move_towards"]], "mutate() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.mutate"]], "neighbor() (in module kernel_tuner.strategies.simulated_annealing)": [[17, "kernel_tuner.strategies.simulated_annealing.neighbor"]], "normalize_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[17, 
"kernel_tuner.strategies.bayes_opt.normalize_parameter_space"]], "prune_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[17, "kernel_tuner.strategies.bayes_opt.prune_parameter_space"]], "single_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.single_point_crossover"]], "tune() (in module kernel_tuner.strategies.basinhopping)": [[17, "kernel_tuner.strategies.basinhopping.tune"]], "tune() (in module kernel_tuner.strategies.bayes_opt)": [[17, "kernel_tuner.strategies.bayes_opt.tune"]], "tune() (in module kernel_tuner.strategies.brute_force)": [[17, "kernel_tuner.strategies.brute_force.tune"]], "tune() (in module kernel_tuner.strategies.diff_evo)": [[17, "kernel_tuner.strategies.diff_evo.tune"]], "tune() (in module kernel_tuner.strategies.dual_annealing)": [[17, "kernel_tuner.strategies.dual_annealing.tune"]], "tune() (in module kernel_tuner.strategies.firefly_algorithm)": [[17, "kernel_tuner.strategies.firefly_algorithm.tune"]], "tune() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.tune"]], "tune() (in module kernel_tuner.strategies.greedy_ils)": [[17, "kernel_tuner.strategies.greedy_ils.tune"]], "tune() (in module kernel_tuner.strategies.greedy_mls)": [[17, "kernel_tuner.strategies.greedy_mls.tune"]], "tune() (in module kernel_tuner.strategies.minimize)": [[17, "kernel_tuner.strategies.minimize.tune"]], "tune() (in module kernel_tuner.strategies.mls)": [[17, "kernel_tuner.strategies.mls.tune"]], "tune() (in module kernel_tuner.strategies.ordered_greedy_mls)": [[17, "kernel_tuner.strategies.ordered_greedy_mls.tune"]], "tune() (in module kernel_tuner.strategies.pso)": [[17, "kernel_tuner.strategies.pso.tune"]], "tune() (in module kernel_tuner.strategies.random_sample)": [[17, "kernel_tuner.strategies.random_sample.tune"]], "tune() (in module kernel_tuner.strategies.simulated_annealing)": [[17, 
"kernel_tuner.strategies.simulated_annealing.tune"]], "two_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.two_point_crossover"]], "uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.uniform_crossover"]], "weighted_choice() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.weighted_choice"]]}}) \ No newline at end of file +Search.setIndex({"docnames": ["cache_files", "contents", "contributing", "convolution", "correctness", "design", "diffusion", "diffusion_opencl", "diffusion_use_optparam", "examples", "grid3d", "hostcode", "index", "install", "matrix_multiplication", "metrics", "observers", "optimization", "quickstart", "structs", "templates", "user-api", "vocabulary"], "filenames": ["cache_files.rst", "contents.rst", "contributing.rst", "convolution.ipynb", "correctness.rst", "design.rst", "diffusion.ipynb", "diffusion_opencl.ipynb", "diffusion_use_optparam.ipynb", "examples.rst", "grid3d.ipynb", "hostcode.rst", "index.rst", "install.rst", "matrix_multiplication.ipynb", "metrics.rst", "observers.rst", "optimization.rst", "quickstart.rst", "structs.rst", "templates.rst", "user-api.rst", "vocabulary.rst"], "titles": ["Cache files", "The Kernel Tuner documentation", "Contribution guide", "Convolution", "Correctness Verification", "Design documentation", "Diffusion", "Tutorial: From physics to tuned GPU kernels", "Tutorial: From physics to tuned GPU kernels", "Kernel Tuner Examples", "3D Grid on GPU with Kernel Tuner", "Tuning Host Code", "The Kernel Tuner documentation", "Installation", "Matrix multiplication", "Metrics and Objectives", "Observers", "Optimization strategies", "Getting Started", "Using structs", "Templated kernels", "API Documentation", "Parameter Vocabulary"], "terms": {"A": [0, 3, 5, 12, 13, 14, 16, 17], "veri": [0, 4, 6, 7, 8, 11, 13, 14, 16, 19, 
20], "us": [0, 1, 2, 3, 4, 5, 9, 11, 12, 13, 15, 16, 17, 18, 20, 22], "featur": [0, 3, 4, 9, 13, 15, 16, 18, 20], "kernel": [0, 2, 3, 4, 5, 11, 13, 15, 16, 17, 18, 19, 21, 22], "tuner": [0, 2, 3, 4, 5, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "i": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22], "abil": 0, "store": [0, 2, 3, 5, 8, 14, 16, 18], "benchmark": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 17, 18, 22], "result": [0, 2, 3, 4, 5, 8, 10, 14, 15, 16, 17, 18, 22], "dure": [0, 5, 6, 7, 8, 10, 16], "tune": [0, 1, 4, 5, 9, 12, 13, 17, 18, 20, 22], "you": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22], "can": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22], "enabl": [0, 16, 17, 19, 20], "pass": [0, 2, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18, 20], "ani": [0, 2, 3, 5, 6, 7, 8, 11, 14, 15, 16, 17, 19, 20, 22], "filenam": [0, 3, 5, 9, 14, 18], "option": [0, 2, 3, 4, 5, 6, 7, 8, 9, 11, 13, 14, 15, 16, 17, 20, 21, 22], "argument": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 21], "tune_kernel": [0, 3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 15, 17, 18, 19, 20], "The": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20], "individu": [0, 16, 17], "configur": [0, 3, 5, 6, 7, 8, 9, 10, 14, 15, 16, 17], "ar": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22], "append": [0, 5, 13], "run": [0, 3, 4, 5, 6, 7, 10, 11, 13, 14, 16, 17], "thi": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "also": [0, 2, 3, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 22], "allow": [0, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 20], "restart": [0, 6, 7, 8, 17], "session": [0, 2, 5, 17], "from": [0, 2, 3, 4, 5, 6, 9, 10, 11, 13, 14, 16, 17, 19, 20], "an": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], "exist": [0, 5], "should": [0, 2, 3, 4, 5, 6, 7, 8, 11, 14, 15, 16, 18], "someth": [0, 3, 6, 7, 8, 14], "have": [0, 2, 
3, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 20, 22], "termin": [0, 13], "previou": [0, 2, 6, 7, 8, 17], "befor": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 16, 17], "had": [0, 3], "complet": [0, 3], "happen": [0, 2, 3, 14, 18], "quit": [0, 6, 7, 8, 10, 14, 20], "often": [0, 6, 7, 8, 16], "hpc": 0, "environ": [0, 3, 5, 13, 17], "when": [0, 2, 3, 5, 6, 7, 8, 11, 13, 14, 15, 16, 17, 19, 20, 22], "job": 0, "reserv": [0, 7, 22], "out": [0, 2, 3, 4, 10, 13, 14], "number": [0, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 15, 16, 17, 18, 19, 22], "other": [0, 2, 3, 5, 6, 7, 8, 11, 14, 15, 16, 17, 22], "simul": [0, 5, 8, 12, 17, 19], "visual": [0, 2, 14], "optim": [0, 1, 3, 4, 5, 6, 7, 8, 11, 12, 14, 15, 16], "strategi": [0, 1, 3, 15], "start": [0, 1, 3, 4, 5, 6, 7, 8, 11, 13, 14, 16, 17], "call": [0, 3, 4, 5, 6, 7, 8, 10, 11, 14, 16, 17, 18, 19, 20, 21], "contain": [0, 3, 5, 6, 7, 8, 10, 11, 14, 16, 17, 20], "full": [0, 2, 5, 16, 18], "search": [0, 3, 5, 9, 12, 14, 15, 17], "space": [0, 2, 3, 4, 5, 10, 11, 14, 15, 17], "true": [0, 3, 4, 5, 6, 7, 8, 11, 14, 16, 17], "creat": [0, 2, 3, 5, 6, 7, 8, 10, 14, 16, 18, 19], "even": [0, 2, 6, 7, 8, 11, 14, 17], "work": [0, 2, 3, 5, 6, 7, 8, 13, 15, 17, 20], "while": [0, 3, 5, 6, 7, 8, 9, 14, 16, 17], "still": [0, 2, 4, 14], "As": [0, 3, 6, 7, 8, 10, 13, 14, 16], "new": [0, 2, 5, 6, 7, 8, 17], "come": [0, 5, 6, 7, 8, 14, 16, 20], "thei": [0, 2, 5, 6, 7, 8, 9, 14, 15], "stream": [0, 5, 6, 7, 8], "pleas": [0, 2, 3, 12, 13, 16, 18, 19], "see": [0, 2, 3, 5, 6, 7, 8, 10, 11, 13, 14, 16, 18, 20], "dashboard": [0, 12], "introduct": 1, "instal": [1, 2, 3, 6, 7, 8, 10, 11, 14, 16, 18], "get": [1, 3, 5, 6, 7, 8, 10, 13, 14], "convolut": [1, 4, 11, 14], "diffus": 1, "matrix": 1, "multipl": [1, 5, 11, 16, 20], "exampl": [1, 2, 4, 5, 6, 7, 8, 11, 13, 14, 15, 16, 17, 18, 19], "cach": [1, 2, 5, 6, 7, 8, 13, 14, 17], "file": [1, 2, 3, 5, 6, 7, 9, 11, 14, 17, 18, 20, 21], "correct": [1, 2, 11, 19], "verif": [1, 9], "host": [1, 2, 5, 7, 8, 9, 16, 19, 20], 
"code": [1, 3, 5, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22], "struct": 1, "templat": [1, 10], "metric": [1, 3, 5, 9, 14], "object": [1, 3, 4, 5, 6, 7, 8, 17], "observ": [1, 5, 15, 22], "api": [1, 3, 5], "paramet": [1, 4, 5, 6, 7, 9, 11, 14, 15, 17, 18, 19, 20], "vocabulari": [1, 16, 18], "design": [1, 2, 6, 7, 8, 16], "contribut": 1, "thank": 2, "consid": [2, 10, 12, 14], "Not": [2, 5], "all": [2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 21], "help": [2, 20], "u": [2, 3, 6, 7, 8], "improv": [2, 5, 6, 7, 8, 14, 17], "about": [2, 3, 5, 6, 7, 8, 12, 14, 16, 17, 18, 21], "problem": [2, 3, 5, 6, 7, 8, 9, 10, 11, 14], "ensur": [2, 4, 6, 7, 8, 11, 13, 16, 19], "follow": [2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 20], "describ": [2, 3, 5, 11, 16, 19], "what": [2, 3, 4, 5, 6, 7, 8, 11, 14, 16, 18, 19, 20, 22], "expect": [2, 3, 4, 5, 6, 7, 8, 14, 16], "If": [2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 19], "possibl": [2, 3, 4, 6, 7, 8, 10, 11, 14, 16, 17, 18, 19], "includ": [2, 3, 4, 6, 7, 8, 10, 11, 13, 14, 16, 20, 21], "minim": [2, 15, 20], "reproduc": 2, "actual": [2, 3, 4, 5, 6, 7, 8, 10, 14, 20], "output": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 18, 22], "error": [2, 3, 4, 5, 11, 14, 20], "print": [2, 3, 5, 6, 7, 8, 10, 14], "list": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18, 19], "version": [2, 3, 14, 16], "python": [2, 3, 5, 9, 10, 11, 14, 16, 18, 19, 20], "cuda": [2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 16, 18, 19, 20], "opencl": [2, 3, 6, 7, 8, 9, 11, 12, 14], "c": [2, 3, 9, 11, 12, 13, 14, 18, 20], "compil": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 20, 22], "applic": [2, 3, 6, 7, 8, 9, 10, 11, 12, 15, 16, 19, 20], "For": [2, 3, 4, 5, 6, 7, 8, 10, 13, 16, 18, 19], "select": [2, 3, 5, 6, 7, 8, 10, 14, 16, 17], "propos": 2, "chang": [2, 10, 16], "addit": [2, 3, 6, 7, 8, 13, 15, 18], "signific": 2, "requir": [2, 3, 5, 6, 7, 8, 10, 11, 13, 14, 16, 20], "first": [2, 3, 4, 6, 7, 8, 10, 11, 12, 13, 14, 15, 17, 19, 20], "discuss": [2, 
5], "Then": [2, 6, 7, 8, 10, 12, 13, 20], "fork": 2, "repositori": [2, 3, 6, 7, 8, 10, 12, 13, 14], "branch": 2, "one": [2, 3, 5, 6, 7, 8, 10, 13, 14, 16, 17], "per": [2, 3, 6, 7, 8, 10, 15, 16], "pull": 2, "request": [2, 16], "googl": 2, "style": 2, "sphinxdoc": 2, "docstr": [2, 5], "modul": [2, 5, 11, 16], "public": [2, 12], "function": [2, 3, 4, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 21], "up": [2, 3, 5, 6, 7, 8, 13, 14, 18], "date": 2, "written": [2, 20], "unit": [2, 5], "your": [2, 3, 6, 7, 8, 10, 11, 12, 13, 16, 19], "nox": 2, "do": [2, 3, 5, 6, 7, 8, 10, 11, 14], "hardwar": [2, 6, 7, 8, 10, 16, 17, 18], "skip": [2, 3, 6, 7, 8], "gpu": [2, 3, 4, 5, 9, 11, 12, 14, 16, 18, 19, 22], "hip": [2, 12], "produc": [2, 4], "same": [2, 3, 4, 6, 7, 8, 10, 11, 16, 18], "better": [2, 6, 7, 8], "entri": [2, 5, 6, 7], "changelog": 2, "md": 2, "match": [2, 3, 4, 5], "roadmap": 2, "updat": [2, 5], "remov": [2, 17], "doubt": 2, "where": [2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 19, 20], "put": [2, 5, 6, 7, 8], "look": [2, 3, 5, 6, 7, 8, 10, 13, 14, 20], "regard": [2, 5, 17], "step": [2, 6, 7, 8, 13, 14, 15, 17, 20], "set": [2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 14, 16, 17, 18, 20, 22], "sudo": [2, 13], "access": [2, 3, 6, 7, 8, 10, 16, 19], "e": [2, 13, 15, 16, 17], "g": [2, 13, 15, 16], "devic": [2, 3, 4, 6, 7, 8, 9, 11, 16, 20], "clone": [2, 3, 6, 7, 8, 10, 13, 14], "git": [2, 16], "desir": 2, "locat": [2, 4, 10, 16], "http": [2, 12, 13, 16], "github": [2, 3, 6, 7, 8, 10, 13, 14], "com": [2, 12, 13], "kerneltun": [2, 12], "kernel_tun": [2, 3, 4, 6, 7, 8, 10, 11, 12, 13, 14, 16, 18, 19, 20, 22], "cd": [2, 13], "pyenv": 2, "curl": [2, 13], "bash": [2, 13], "rememb": [2, 3, 6, 7, 8, 14], "add": [2, 3, 5, 6, 7, 8, 11, 14, 16, 17], "bash_profil": 2, "bashrc": 2, "specifi": [2, 3, 4, 5, 6, 7, 8, 10, 11, 14, 15, 16, 17, 18, 19, 20, 22], "virtual": [2, 13], "folder": 2, "virtualenv": 2, "whatev": [2, 11, 17], "name": [2, 3, 4, 5, 6, 7, 8, 10, 14, 15, 16, 17, 18, 22], "prefer": [2, 3, 5, 6, 8, 
16], "3": [2, 4, 6, 7, 8, 10, 11, 13, 14, 17], "8": [2, 3, 5, 6, 7, 8, 10, 13, 14, 16], "9": [2, 3, 4, 6, 7, 8, 11], "10": [2, 6, 7, 8, 12, 17], "11": [2, 6, 7, 8], "so": [2, 3, 5, 6, 7, 8, 10, 11, 13, 14, 16, 17, 18, 20], "found": [2, 3, 5, 12, 16, 17], "global": [2, 5, 6, 7, 8, 17], "replac": [2, 3, 4, 5, 6, 7, 8, 10, 14], "poetri": [2, 13], "ssl": [2, 13], "org": [2, 12, 13], "python3": [2, 13], "make": [2, 3, 6, 7, 8, 10, 12, 13, 14, 16, 19, 20], "sure": [2, 3, 6, 7, 8, 12, 13, 14], "non": [2, 4], "depend": [2, 3, 4, 8, 9, 10, 12, 15], "project": 2, "extra": [2, 13, 20], "doc": [2, 3, 6, 7, 8, 10, 13, 14], "leav": 2, "doe": [2, 4, 5, 6, 7, 8, 10, 11, 14, 16, 20], "appli": [2, 6, 7, 8], "system": [2, 12, 13, 16], "To": [2, 4, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20], "go": [2, 3, 6, 7, 8, 10, 12, 13, 14, 18], "mai": [2, 3, 4, 5, 6, 7, 8, 11, 13, 14, 15, 16, 17, 18, 19], "necessari": [2, 4, 5, 6, 7, 8], "conveni": [2, 6, 7, 8, 11], "packag": 2, "cupi": [2, 16, 20], "cuda11x": 2, "cuda12x": 2, "These": [2, 6, 7, 8, 10, 13, 14, 16, 20], "current": [2, 3, 4, 5, 6, 7, 8, 13, 14, 16, 17], "defin": [2, 3, 4, 5, 6, 7, 8, 9, 10, 14, 15, 16, 20], "part": [2, 6, 7, 8, 12, 13, 14, 15, 19], "forget": [2, 10], "path": [2, 3, 16], "correctli": [2, 14], "re": [2, 3, 6, 7, 8, 10, 14], "ld_libary_path": 2, "cpath": 2, "check": [2, 4, 5, 6, 7, 8, 11, 14], "pytest": 2, "except": [2, 5, 9], "more": [2, 4, 5, 6, 7, 8, 12, 13, 14, 15, 16, 18, 20], "ha": [2, 3, 5, 6, 7, 8, 11, 14, 16, 17], "been": [2, 3, 5, 6, 7, 8, 11, 14, 17], "left": [2, 6, 7, 8, 10, 15], "gracefulli": 2, "without": [2, 6, 7, 8, 10, 11, 16, 17], "conda": 2, "mamba": 2, "perform": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19], "miniconda": [2, 13], "tradit": 2, "under": [2, 3, 12], "quota": 2, "otherwis": [2, 5, 14], "restrict": [2, 5, 9, 14, 20], "disk": 2, "instruct": [2, 6, 7, 8, 9, 13, 14], "differ": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17], "directori": [2, 3, 6, 7, 8, 10, 13, 14], 
"save": [2, 6, 7], "ad": [2, 6, 7, 8, 11], "condarc": 2, "envs_dir": 2, "both": [2, 6, 7, 8, 9, 14], "automat": [2, 3, 6, 7, 8, 10, 14, 20], "activ": 2, "via": [2, 17], "usual": [2, 16], "provid": [2, 4, 5, 6, 7, 8, 11, 20, 21], "end": [2, 3, 5, 6, 7, 8, 10, 14, 16, 17, 19], "exit": 2, "shell": 2, "enter": [2, 3, 6, 7, 8, 10, 14], "avail": [2, 3, 6, 7, 8, 9, 10, 13, 16], "continu": [2, 3, 5, 6, 7, 8, 13, 16, 17], "n": [2, 4, 6, 7, 8, 10, 11, 12, 14, 17, 18, 20], "base": [2, 5, 15, 16, 20], "forg": 2, "default": [2, 3, 4, 5, 6, 7, 8, 10, 14, 15, 16, 17, 20], "execut": [2, 3, 5, 6, 7, 8, 9, 10, 11, 14, 15, 17], "config": [2, 5], "auto_activate_bas": 2, "fals": [2, 5, 16, 17], "load": 2, "On": [2, 6, 7, 8], "most": [2, 5, 6, 7, 8, 9, 11, 12, 14, 16, 17, 18, 19], "unload": 2, "rocm": [2, 13, 16], "inform": [2, 3, 5, 6, 7, 8, 12, 16, 17, 18, 22], "like": [2, 3, 5, 6, 7, 8, 9, 10, 14, 17, 18, 19, 20], "keyr": 2, "seemingli": 2, "weird": 2, "known": [2, 14], "some": [2, 3, 5, 6, 7, 8, 13, 14, 15, 16, 17, 18, 19, 20], "pip": [2, 3, 6, 7, 12, 13, 14], "m": [2, 6, 7, 8, 10], "disabl": 2, "node": [2, 17], "backend": [2, 11, 16], "2": [2, 3, 4, 6, 7, 8, 9, 10, 11, 14, 16, 17], "echo": 2, "noxenv": 2, "txt": 2, "anaconda": 2, "altern": [2, 13], "venv": 2, "alreadi": [2, 3, 5, 6, 7, 8, 13, 14], "Be": [2, 6, 7, 8], "adjust": [2, 3], "below": [2, 8, 9, 10, 11, 13, 14, 15, 16, 17, 19], "against": [2, 4, 5], "support": [2, 3, 5, 6, 7, 8, 11, 13, 16, 17, 20, 22], "isol": [2, 20], "top": [2, 5, 10, 16], "level": [2, 5, 16], "coverag": 2, "It": [2, 3, 5, 6, 7, 8, 11, 13, 14, 16, 20], "": [2, 3, 5, 6, 7, 8, 9, 11, 13, 14, 15, 16, 18, 19, 20, 21], "invok": 2, "tab": 2, "studio": 2, "take": [2, 3, 5, 6, 7, 8, 10, 14, 16, 17, 18, 20], "1": [2, 3, 4, 6, 7, 8, 10, 11, 14, 16, 17], "gigabyt": 2, "size": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 17, 18, 20], "user": [2, 3, 4, 5, 7, 9, 13, 14, 15, 16, 17, 20], "tight": 2, "diskspac": 2, "small": [2, 3, 6, 7, 8, 14], "each": [2, 3, 4, 5, 6, 7, 
10, 14, 16, 17], "ran": 2, "note": [2, 3, 5, 6, 7, 8, 10, 13, 14, 16, 19], "longer": [2, 3, 5, 15], "pycuda": [2, 6, 8, 10, 11, 16, 20], "capabl": [2, 5, 6, 7, 14], "present": [2, 14], "hold": [2, 6, 7, 14, 18, 19], "pyopencl": [2, 5, 7, 16], "nvidia": [2, 5, 13, 14, 16, 20], "break": [2, 20], "cannot": [2, 6, 7, 8, 16], "them": [2, 3, 8, 10, 11, 14], "seen": [2, 3, 5, 14], "integr": [2, 20], "type": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 16, 17, 18, 19, 20], "html": 2, "gener": [2, 3, 5, 6, 7, 8, 12, 14, 16, 17, 19, 22], "page": [2, 3, 6, 7, 8, 9, 10, 12, 14, 15], "sourc": [2, 3, 5, 6, 7, 8, 10, 11, 13, 14, 16, 20], "inspect": [2, 5, 16], "commit": 2, "brows": 2, "through": [2, 5, 6, 7, 8, 10, 12, 15, 16, 17], "least": [2, 5], "those": [2, 3, 9, 16], "pandoc": 2, "ubuntu": 2, "apt": 2, "mac": 2, "brew": 2, "onlin": 2, "built": [2, 16, 17, 19], "action": 2, "correspond": [2, 3, 6, 7, 8, 10, 16, 17, 18], "master": 2, "latest": [2, 13], "last": [2, 5, 19], "releas": [2, 5], "stabl": 2, "publish": [2, 12], "point": [2, 3, 5, 6, 7, 8, 10, 11, 14, 15, 16, 18], "process": [2, 3, 5, 6, 7, 8, 14, 15, 16, 17, 20], "again": [2, 3, 6, 7, 8, 10, 14], "fulli": [2, 13], "autom": 2, "guid": [3, 6, 14, 15, 18], "meant": 3, "write": [3, 9, 10, 14, 20], "script": [3, 5, 14, 19, 20], "we": [3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 19, 20], "ll": [3, 6, 7, 8, 13, 14], "simpl": [3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 17, 18, 19], "find": [3, 11, 14, 17], "shortli": 3, "much": [3, 6, 7, 8, 10, 16, 20], "reus": [3, 6, 7, 8, 14], "read": [3, 4, 5, 6, 7, 8, 10, 11, 14, 16], "document": [3, 4, 6, 7, 8, 10, 13, 14, 19, 22], "jupyt": [3, 6, 7, 8, 10, 13, 14], "notebook": [3, 6, 7, 8, 10, 13, 14], "just": [3, 4, 5, 6, 7, 8, 10, 11, 13, 14], "tutori": [3, 6, 10, 12, 13, 14], "readi": [3, 5, 6, 7, 8, 10, 14], "oper": [3, 6, 7, 8, 10, 11, 14, 15], "essenti": 3, "signal": [3, 22], "imag": [3, 6, 7, 8], "main": [3, 5, 10, 16, 18], "neural": 3, "network": 3, "deep": 3, "learn": 3, "comput": [3, 
4, 5, 9, 10, 11, 12, 14, 17], "linear": [3, 14], "combin": [3, 5, 6, 7, 8, 9, 10, 14, 16, 17, 18], "weight": [3, 17], "filter": [3, 4, 9, 11], "rang": [3, 4, 6, 7, 8, 10, 11, 20], "pixel": 3, "input": [3, 4, 6, 7, 8, 9, 11, 14, 15, 18, 19], "w": [3, 6, 7, 15, 17], "time": [3, 5, 6, 7, 8, 10, 11, 14, 15, 16, 17, 20, 22], "h": [3, 10], "f": [3, 4, 10, 11, 19], "f_w": 3, "f_h": 3, "o": [3, 5], "begin": [3, 6, 7, 8, 10], "equat": [3, 6, 7, 8, 10, 17], "nonumb": [3, 10], "x": [3, 4, 5, 6, 7, 8, 10, 12, 14, 18, 20], "y": [3, 5, 6, 7, 8, 10, 11, 14], "sum": [3, 4, 5, 14], "limits_": 3, "j": [3, 6, 7, 8, 12, 14], "0": [3, 4, 5, 6, 7, 8, 10, 11, 14, 16, 17, 19], "naiv": [3, 4, 6, 7, 8], "parallel": [3, 6, 7, 8], "thread": [3, 5, 6, 7, 8, 9, 10, 15, 16, 18, 22], "avoid": [3, 14, 22], "confus": 3, "around": [3, 9], "term": 3, "refer": [3, 4, 5, 6, 7, 8, 9, 11, 16], "shown": [3, 5, 16], "block": [3, 5, 6, 7, 8, 9, 10, 13, 14, 15, 18, 22], "press": [3, 6, 7, 8, 10, 14], "shift": [3, 6, 7, 8, 10, 14], "writefil": [3, 14], "convolution_na": [3, 4], "cu": [3, 4, 11, 14, 18, 20], "__global__": [3, 6, 8, 10, 12, 14, 18, 20], "void": [3, 6, 7, 8, 10, 12, 14, 18, 19, 20], "convolution_kernel": [3, 4], "float": [3, 6, 7, 8, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20], "int": [3, 5, 6, 7, 8, 10, 12, 14, 18, 20], "blockidx": [3, 6, 7, 8, 10, 12, 14, 18, 20], "blockdim": [3, 18], "threadidx": [3, 6, 7, 8, 10, 12, 14, 18, 20], "image_height": 3, "image_width": 3, "filter_height": 3, "filter_width": 3, "input_width": 3, "run_kernel": [3, 4, 5, 9], "our": [3, 6, 7, 8, 10, 14, 18, 19], "But": [3, 6, 7, 8, 10, 18], "data": [3, 5, 6, 7, 8, 10, 11, 14, 15, 16, 18, 19], "which": [3, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18, 19, 20, 22], "import": [3, 4, 6, 7, 8, 10, 13, 14, 15, 18, 19, 20], "numpi": [3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 18, 19, 20], "np": [3, 5, 10, 14, 18, 19], "filter_s": 3, "17": [3, 4, 6, 7, 8, 11], "output_s": 3, "4096": [3, 4, 6, 7, 8, 11, 14], "prod": [3, 4, 11], "border_s": 
3, "input_s": [3, 4, 11], "output_imag": 3, "zero": [3, 4, 10, 11, 14], "astyp": [3, 4, 6, 7, 8, 10, 11, 12, 14, 18, 20], "float32": [3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 18, 20], "input_imag": 3, "random": [3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 17, 18, 20], "randn": [3, 4, 11, 12, 14, 18, 20], "conv_filt": 3, "now": [3, 6, 7, 8, 10, 11, 14, 18], "structur": [3, 5, 6, 7, 14, 18], "how": [3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 18, 19, 20, 21], "signatur": [3, 5], "kernel_nam": [3, 5, 11, 19, 20], "kernel_sourc": [3, 5, 19], "problem_s": [3, 4, 5, 6, 7, 8, 10, 11, 14, 18, 19, 22], "param": [3, 4, 5, 16, 17], "ellipsi": 3, "here": [3, 10, 11, 13, 14, 16], "indic": [3, 17, 22], "mani": [3, 5, 6, 7, 8, 14, 15, 16, 17], "won": 3, "t": [3, 5, 6, 7, 8, 10, 11, 13, 17, 20], "need": [3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 18, 19, 20, 21], "right": [3, 6, 7, 8, 10, 13], "interest": [3, 19], "five": [3, 5, 18], "string": [3, 5, 6, 7, 8, 9, 14, 15, 16, 18, 19], "domain": [3, 6, 7, 8, 9, 10], "three": [3, 4, 14], "dimens": [3, 5, 6, 7, 8, 9, 10, 11, 14, 15, 17, 18, 22], "dictionari": [3, 5, 6, 7, 8, 10, 14, 16, 17, 18], "simpli": [3, 4, 5, 6, 7, 8, 10, 17, 18], "cell": [3, 6, 7, 8, 10, 14], "wrote": 3, "determin": [3, 6, 7, 8, 10, 16, 17], "grid": [3, 5, 6, 7, 8, 9, 11, 14, 22], "abov": [3, 5, 6, 7, 8, 10, 13, 14, 18, 19], "divid": [3, 6, 7, 8, 10, 11, 14], "divisor": [3, 5, 6, 7, 8, 14], "arrai": [3, 4, 5, 6, 7, 8, 10, 18, 19], "scalar": [3, 6, 7, 8, 10], "therefor": [3, 4, 6, 7, 8, 10, 11, 14], "exactli": [3, 5, 6, 7, 8, 14, 16], "order": [3, 4, 5, 6, 7, 8, 10, 11, 14, 15, 17, 18], "32": [3, 5, 6, 7, 8, 10, 12, 14, 18], "bit": [3, 5, 6, 7, 8, 10, 11, 14], "final": [3, 4, 6, 7, 8, 10], "anyth": 3, "insert": [3, 4, 5, 8, 10, 11, 14, 18, 20, 22], "preprocessor": [3, 5], "statement": [3, 8, 10, 14, 20], "valu": [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18], "were": [3, 6, 7, 8, 10, 14], "i_like_convolut": 3, "42": 3, "line": [3, 6, 7, 8], "definit": [3, 10], "effect": [3, 6, 7, 8], 
"unless": 3, "cours": [3, 6, 7, 8, 13, 14], "somewher": 3, "token": 3, "In": [3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 18, 19, 22], "freeli": 3, "few": [3, 6, 7, 8, 10, 11, 20], "special": [3, 6, 7, 8, 16, 18, 22], "notic": [3, 6, 7, 8], "haven": [3, 13], "yet": [3, 5, 10, 11, 18], "basic": [3, 5, 6, 7, 8, 18], "block_size_x": [3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 18, 20], "block_size_i": [3, 4, 6, 7, 8, 10, 11, 14], "block_size_z": [3, 6, 7, 8, 10], "interpret": 3, "z": [3, 5, 10], "block_size_nam": [3, 5], "let": [3, 5, 6, 7, 8, 18, 20], "creation": [3, 12, 17], "trusti": 3, "old": 3, "16": [3, 4, 6, 7, 8, 10, 11, 14], "dict": [3, 4, 5, 8, 11, 12, 16, 17, 18, 20], "undefin": [3, 5, 6, 7, 8, 14], "constant": [3, 5, 6, 7, 8, 9, 11, 14, 17], "filter_heigth": 3, "could": [3, 4, 5, 6, 7, 8, 11, 13, 14, 16, 17, 20], "runtim": [3, 5, 6, 7, 8, 12, 13, 16, 20], "setup": [3, 6, 7, 8, 11, 13, 16, 19], "everyth": [3, 5, 6, 7, 8], "answer": [3, 4, 5, 6, 7, 8, 9], "done": [3, 13, 15, 16], "alloc": [3, 5, 6, 7, 8, 9, 11], "memori": [3, 5, 9, 11, 16, 19, 22], "move": [3, 5, 6, 11, 14, 17], "content": [3, 5], "deriv": [3, 5, 6, 7, 8, 15], "after": [3, 4, 5, 6, 7, 8, 11, 13, 14, 16], "retriev": [3, 5], "free": [3, 6, 7, 8, 11, 13, 14], "return": [3, 4, 5, 6, 7, 8, 10, 11, 14, 16, 17, 18, 19], "contrast": 3, "wa": [3, 5, 6, 7, 8, 16], "finish": [3, 5, 7, 10, 11, 16], "particularli": [3, 15], "compar": [3, 4, 6, 7, 8, 10, 14, 15, 16], "case": [3, 4, 5, 6, 7, 8, 10, 14, 15, 16, 18, 19], "than": [3, 6, 7, 8, 10, 15, 16, 17, 22], "highli": [3, 12, 14], "parametr": 3, "long": [3, 6, 7, 8, 10, 11, 14, 19], "instead": [3, 5, 9, 14], "littl": [3, 6, 7, 8, 14], "ve": [3, 6, 7, 8, 13, 14], "interfac": [3, 4, 11, 13, 16, 17, 19], "familiar": [3, 14], "becaus": [3, 4, 6, 7, 8, 11, 13, 14, 15, 20, 22], "kernel_str": [3, 4, 5, 6, 7, 8, 11, 12, 17], "tune_param": [3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 17, 18, 19, 20], "onli": [3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16, 17, 19], "similarli": 3, "singl": 
[3, 4, 5, 6, 7, 8, 11, 14, 16, 20], "wai": [3, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16], "64": [3, 6, 7, 8, 12, 14, 18, 20], "128": [3, 6, 7, 8, 12, 18, 20], "try": [3, 5, 6, 7, 8, 13, 14, 17], "env": [3, 5, 17, 18], "cartesian": [3, 10], "product": [3, 6, 7], "realli": [3, 6, 7, 8, 13], "howev": [3, 4, 6, 7, 8, 11, 13, 14, 16, 19, 20], "lot": [3, 6, 7, 8, 14, 16, 18, 19], "problemat": 3, "explain": [3, 5, 6, 7, 8, 11, 13, 14, 15, 18, 20], "illeg": 3, "2048": 3, "limit": [3, 5, 6, 7, 8, 9, 14, 16, 17, 20, 22], "1024": [3, 6, 7, 8, 18], "fail": [3, 5, 13], "reason": [3, 5, 19], "too": [3, 6, 7, 8, 10, 11, 14], "share": [3, 5], "regist": [3, 6, 7, 8, 14, 16], "silent": 3, "verbos": [3, 4, 5, 6, 7, 8, 11], "bound": [3, 5, 14, 17], "ignor": [3, 5, 6, 7, 8], "two": [3, 5, 6, 7, 8, 9, 14, 15, 17], "thing": [3, 11, 14], "record": [3, 5, 6, 16], "show": [3, 6, 7, 8, 9, 12, 15, 19], "specif": [3, 5, 6, 7, 8, 10, 15, 16, 17], "secondli": [3, 14], "experi": 3, "took": [3, 6, 8, 17, 18], "place": [3, 6, 7, 8, 16, 17, 18], "That": [3, 6, 7, 8, 11, 14, 15, 18], "mean": [3, 11, 14, 15, 17, 19, 20, 22], "softwar": [3, 6, 7, 8, 12, 13, 16, 17, 18], "along": [3, 5, 13, 18, 22], "second": [3, 4, 5, 6, 7, 8, 10, 14, 15, 16, 17], "alwai": [3, 5, 6, 7, 8], "circumst": 3, "obtain": [3, 6, 7, 8, 10, 16], "promis": 3, "would": [3, 6, 7, 8, 20], "tile": [3, 9, 14], "factor": [3, 6, 7, 8, 9, 10, 14, 22], "amount": [3, 6, 7, 8, 14, 15], "particular": [3, 5, 6, 7, 9, 11, 14, 16, 19], "increas": [3, 6, 7, 8, 16], "certain": [3, 5, 6, 7, 8, 16, 22], "tile_size_x": [3, 4, 6, 7, 8, 11, 14], "4": [3, 6, 7, 8, 10, 14, 16], "tile_size_i": [3, 4, 6, 7, 8, 11, 14], "understand": 3, "everi": [3, 4, 6, 7, 8, 9, 16, 18], "fewer": [3, 6, 7, 8], "total": [3, 5, 6, 7, 8, 14, 15, 18], "stai": 3, "tell": [3, 6, 7, 8, 9, 11, 14, 18, 19], "influenc": 3, "did": [3, 6, 7, 8, 14], "mimick": 3, "behavior": [3, 14, 16], "assum": [3, 5, 6, 7, 8, 14], "far": [3, 6, 7, 8, 14, 18], "grid_div_x": [3, 4, 6, 7, 8, 11, 14], 
"grid_div_i": [3, 4, 6, 7, 8, 11, 14], "decreas": [3, 14], "correspondingli": 3, "displai": 3, "commonli": [3, 6, 7, 8, 13, 14], "gflop": [3, 5, 9, 14, 15], "giga": [3, 14], "compos": [3, 5, 14, 15], "lambda": [3, 5, 6, 7, 14, 15], "collect": [3, 5, 6, 7, 8, 10, 14, 16, 19], "ordereddict": [3, 6, 7, 8, 10, 14, 15], "p": [3, 5, 14, 15, 19], "1e9": [3, 14], "1e3": [3, 6, 7, 8, 14, 15], "expand": [3, 14, 16], "sinc": [3, 8, 10, 14, 20], "And": [3, 6, 7, 8, 17, 20], "know": [3, 6, 7, 8, 14, 15], "enough": [3, 4, 14], "abl": [3, 5, 6, 7, 8], "own": [3, 8, 11, 13, 15, 16], "whenev": 4, "program": [4, 6, 7, 8, 11, 14, 19, 20], "good": [4, 6, 7, 8, 22], "fast": [4, 6, 7, 8], "verifi": [4, 5, 9], "instanc": [4, 5, 6, 7, 8, 11, 16], "none": [4, 5, 16, 17], "onc": [4, 5, 6, 7, 8, 10, 16], "comparison": 4, "implement": [4, 5, 9, 10, 15, 16, 17], "allclos": 4, "maximum": [4, 5, 10, 17], "absolut": 4, "1e": 4, "6": [4, 6, 7, 8, 10, 11], "want": [4, 8, 10, 11, 13, 14, 16, 18, 22], "toler": 4, "atol": [4, 5], "convolution_correct": 4, "py": [4, 11, 13], "demonstr": [4, 8, 9, 14], "open": [4, 6, 7, 11, 14], "r": [4, 11], "cmem_arg": [4, 5], "d_filter": 4, "arg": [4, 5, 6, 7, 8, 10, 11, 12, 14, 17, 18, 19, 20], "field": [4, 6, 7, 8], "its": [4, 5, 6, 7, 8, 10, 12, 13, 14, 15, 16], "almost": [4, 6, 7, 8, 16], "whose": 4, "trust": [4, 17], "construct": [4, 14], "There": [4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 22], "precomput": 4, "flexibl": [4, 6, 7, 14], "callabl": [4, 5], "accept": [4, 5, 17], "cpu_result": 4, "gpu_result": [4, 6, 8], "although": 4, "semant": 4, "posit": [4, 5, 10, 17, 20], "reflect": [4, 16], "reduct": [4, 15], "snippet": 4, "sum_x": 4, "custom": [4, 9, 15, 16, 19], "def": [4, 5, 6, 7, 8, 10, 16, 19], "verify_partial_reduc": 4, "isclos": 4, "first_kernel": 4, "_": [4, 6, 7, 8], "sum_float": 4, "map": [4, 9, 10], "third": [4, 14], "partial": [4, 6, 7, 8, 9], "cpu": [4, 7, 8, 11], "achiev": [4, 8], "element": [4, 6, 7, 8, 14, 15, 18, 19], "necessarili": [4, 11], 
"section": [5, 6, 7, 8], "detail": [5, 13, 21], "intern": [5, 12, 17, 20], "mostli": [5, 12], "relev": [5, 12, 16], "develop": [5, 12, 13], "extens": 5, "architectur": [5, 16], "At": [5, 10], "expos": 5, "respons": 5, "iter": [5, 6, 7, 8, 10, 14, 16, 17, 18], "over": [5, 6, 7, 8, 13, 14, 16, 17], "brute_forc": 5, "valid": [5, 9, 14], "random_sampl": 5, "sampl": [5, 17], "advanc": [5, 20], "being": [5, 6, 7, 8, 14, 16, 17], "strategy_opt": [5, 17], "sai": [5, 6, 7, 8, 18, 20], "foreseen": 5, "futur": [5, 12, 22], "high": [5, 6, 7, 8, 12, 14, 16], "wrap": [5, 18, 20], "low": [5, 6, 7, 8, 14], "abstract": [5, 16], "ready_argument_list": 5, "build": [5, 6, 7, 8], "bottom": 5, "pyhip": 5, "either": [5, 10, 17, 20], "typic": [5, 13, 14], "nvcc": 5, "gcc": 5, "fortran": [5, 9, 20], "turn": 5, "launch": [5, 6, 7, 8, 11, 16], "rest": [5, 6, 7, 8], "helper": [5, 16], "get_opt": 5, "suppli": [5, 11, 14, 17, 20], "get_strategy_docstr": 5, "method": [5, 6, 7, 8, 11, 14, 16, 17], "make_strategy_options_doc": 5, "scale_from_param": 5, "ep": [5, 17], "func": [5, 16], "invers": 5, "unscal": 5, "setup_method_argu": 5, "prepar": [5, 6, 7, 8], "setup_method_opt": 5, "tuning_opt": [5, 17], "snap_to_nearest_config": 5, "closest": 5, "unscale_and_snap_to_nearest": 5, "snap": 5, "scale": 5, "variabl": [5, 10, 13, 17], "nearest": 5, "class": [5, 16, 17], "kernel_opt": 5, "device_opt": 5, "__init__": 5, "instanti": [5, 20], "kernelsourc": 5, "parameter_spac": [5, 17], "entir": [5, 6, 7, 8, 14, 17], "iterfac": 5, "platform": [5, 12, 13, 16], "quiet": 5, "compiler_opt": 5, "7": [5, 6, 7, 8, 10], "offer": 5, "languag": [5, 8, 11, 14, 19], "lang": [5, 9, 11, 20], "bool": [5, 19], "gpu_arg": 5, "benchmark_continu": 5, "durat": [5, 16], "benchmark_default": 5, "check_kernel_output": 5, "compile_kernel": 5, "copy_constant_memory_arg": 5, "recent": [5, 13, 16], "copy_shared_memory_arg": 5, "smem_arg": 5, "copy_texture_memory_arg": 5, "texmem_arg": 5, "textur": 5, "create_kernel_inst": 5, 
"get_environ": 5, "memcpy_dtoh": [5, 6], "dest": 5, "src": 5, "copi": [5, 6, 7, 8, 18], "static": 5, "preprocess_gpu_argu": 5, "old_argu": 5, "flat": 5, "given": [5, 6, 7, 8, 10, 16, 17], "mem": 5, "group": [5, 6, 7, 8], "maintain": 5, "state": [5, 6, 7, 8, 16], "interact": [5, 16], "properti": [5, 14], "context": [5, 6, 8, 10], "kernel_inst": 5, "lookup": 5, "directli": [5, 6, 7, 8, 11, 14, 16, 20], "driver": [5, 6, 8, 10], "ndarrai": [5, 10], "format": [5, 6, 7, 19], "kei": [5, 6, 7, 8, 14, 17, 18], "symbol": 5, "similar": [5, 11, 14], "regular": [5, 8, 16], "int32": [5, 12, 18, 20], "kernel_finish": 5, "devicealloc": 5, "memcpy_htod": [5, 6], "memset": 5, "unsign": [5, 7], "byte": [5, 19], "tupl": [5, 8, 10, 17], "start_ev": 5, "event": [5, 6, 11, 16], "mark": 5, "measur": [5, 6, 7, 8, 10, 11, 14, 15, 16, 22], "stop_ev": 5, "synchron": [5, 6, 8, 10, 14, 15], "halt": [5, 11], "until": [5, 11], "task": 5, "rawkernel": 5, "cudeviceptr": 5, "cufunct": 5, "id": [5, 16], "must": [5, 15], "dynam": 5, "buffer": [5, 7, 19], "fill": [5, 14], "item": [5, 6, 7, 8, 10], "ndrang": 5, "kernelinst": 5, "repres": [5, 6, 7, 8], "tunabl": [5, 6, 7, 8, 9, 10, 14, 15, 16, 17, 18, 20, 22], "ctype": 5, "_funcptr": 5, "ptr": 5, "pionter": 5, "compilationfailedconfig": 5, "errorconfig": 5, "invalidconfig": 5, "npencod": 5, "skipkei": 5, "ensure_ascii": 5, "check_circular": 5, "allow_nan": 5, "sort_kei": 5, "indent": 5, "separ": [5, 9, 11, 20], "dump": [5, 6, 7], "json": [5, 6, 7, 9], "obj": 5, "subclass": 5, "serializ": 5, "rais": 5, "typeerror": 5, "arbitrari": 5, "self": [5, 16, 17], "els": 5, "jsonencod": 5, "runtimefailedconfig": 5, "skippablefailur": 5, "stopcriterionreach": 5, "thrown": 5, "stop": [5, 17], "criterion": [5, 17], "reach": 5, "check_argument_list": 5, "check_argument_typ": 5, "dtype": [5, 19], "kernel_argu": 5, "check_restrict": 5, "whether": [5, 15, 17], "meet": 5, "check_stop_criterion": 5, "max_fev": [5, 17], "exceed": 5, "check_thread_block_dimens": 5, 
"max_thread": 5, "check_tune_params_list": 5, "simulation_mod": 5, "forbidden": 5, "compile_restrict": 5, "monolith": 5, "try_to_constraint": 5, "union": 5, "str": [5, 6, 7, 8, 10], "constraint": 5, "pars": [5, 6, 7], "config_valid": 5, "max": 5, "convert_constraint_restrict": 5, "convert": [5, 6, 7], "backward": 5, "compat": [5, 13], "cuda_error_check": 5, "statu": 5, "delete_temp_fil": 5, "delet": 5, "temporari": 5, "don": [5, 6, 8, 10, 11], "complain": 5, "detect_languag": 5, "attempt": [5, 20], "detect": [5, 17, 20], "dump_cach": 5, "omit": 5, "sever": [5, 6, 7, 8, 9, 10, 13, 14, 20], "store_cach": 5, "speed": 5, "great": [5, 6, 7, 8, 18], "power": [5, 14, 16, 22], "get_best_config": 5, "objective_higher_is_bett": [5, 15], "best": [5, 6, 7, 10, 14, 17, 20, 22], "accord": 5, "get_config_str": 5, "compact": 5, "represent": [5, 19], "get_grid_dimens": 5, "current_problem_s": 5, "grid_div": 5, "dim": 5, "get_instance_str": 5, "debug": 5, "advis": 5, "get_kernel_str": [5, 6, 7, 8], "One": [5, 6, 7, 8, 16, 19], "get_problem_s": 5, "get_smem_arg": 5, "get_temp_filenam": 5, "suffix": 5, "form": [5, 14, 16, 17], "temp_x": 5, "larg": [5, 6, 7, 8, 10], "integ": [5, 16, 19], "get_thread_block_dimens": 5, "convent": [5, 11], "get_total_tim": 5, "overhead_tim": 5, "looks_like_a_filenam": 5, "normalize_verify_funct": 5, "v": [5, 6, 7, 8, 10], "normal": [5, 17], "result_host": 5, "keyword": 5, "behaviour": 5, "parse_restrict": 5, "prepare_kernel_str": 5, "prepend": [5, 8], "seri": [5, 10], "By": [5, 11, 14, 17], "macro": 5, "made": 5, "print_config": 5, "print_config_output": 5, "process_cach": 5, "device_nam": 5, "tune_params_kei": 5, "x1": 5, "x2": 5, "xn": 5, "234342": 5, "y1": 5, "y2": 5, "yn": 5, "134233": 5, "close": [5, 6, 7, 8], "bracket": 5, "miss": 5, "earlier": [5, 6, 7, 8, 10], "abruptli": 5, "process_metr": 5, "calcul": [5, 10], "express": [5, 6, 7, 8, 9, 11, 14], "10000": 5, "read_cach": 5, "open_cach": 5, "cachefil": 5, "read_fil": 5, "replace_param_occurr": 5, 
"occurr": 5, "setup_block_and_grid": 5, "write_fil": 5, "whole": [6, 7, 8, 14, 17], "model": [6, 7, 8, 12], "physic": 6, "numer": [6, 7, 8], "introduc": [6, 7, 8, 14, 16], "redistribut": [6, 7, 8], "region": [6, 7, 8], "concentr": [6, 7, 8], "bulk": [6, 7, 8], "motion": [6, 7, 8], "concept": [6, 7, 8], "wide": [6, 7, 8, 13, 14], "chemistri": [6, 7, 8], "biologi": [6, 7, 8], "suppos": [6, 7, 8], "metal": [6, 7, 8], "sheet": [6, 7, 8], "temperatur": [6, 7, 8, 16, 17, 22], "equal": [6, 7, 8, 14], "degre": [6, 7, 8], "everywher": [6, 7, 8], "heat": [6, 7, 8], "thousand": [6, 7, 8], "instant": [6, 7, 8, 10], "hotspot": [6, 7, 8], "cooler": [6, 7, 8], "area": [6, 7, 8, 14], "melt": [6, 7, 8], "loss": [6, 7, 8], "radiat": [6, 7, 8], "caus": [6, 7, 8], "frac": [6, 7, 8], "d": [6, 7, 8, 10, 17, 18], "spatial": [6, 7, 8], "descret": [6, 7, 8], "2d": [6, 7, 8, 9], "quantiti": [6, 7, 8, 15, 16], "nx": [6, 7, 8, 10], "equi": [6, 7, 8], "distant": [6, 7, 8], "direct": [6, 7, 8, 11, 14, 15], "ny": [6, 7, 8, 10], "distanc": [6, 7, 8, 17], "delta": [6, 7, 8], "between": [6, 7, 8, 11, 13, 14, 15, 17], "central": [6, 7, 8], "approxim": [6, 7, 8], "x_i": [6, 7, 8, 10], "x_": [6, 7, 8], "approx": [6, 7, 8], "u_": [6, 7, 8], "2u_": [6, 7, 8], "y_": [6, 7, 8], "estim": [6, 7, 8], "next": [6, 7, 8, 14, 19], "simplifi": [6, 7, 8], "formula": [6, 7, 8], "further": [6, 7, 8, 13, 14], "4u_": [6, 7, 8], "simplic": [6, 7, 8, 10], "assumpt": [6, 7, 8], "boundari": [6, 7, 8], "condit": [6, 7, 8, 14], "dt": [6, 7, 8], "225": [6, 7, 8], "give": [6, 7, 8, 17], "test": [6, 7, 8, 9, 13, 14, 16], "initi": [6, 7, 8, 19], "hot": [6, 7, 8], "plot": [6, 7, 8], "anoth": [6, 7, 8, 11, 14, 15, 17], "color": [6, 7, 8], "matplotlib": [6, 7, 8, 13], "pyplot": [6, 7, 8], "inlin": [6, 7, 8], "get_initial_condit": [6, 7, 8], "ones": [6, 7, 8, 22], "randint": [6, 7, 8], "1000": [6, 7, 8, 10], "2000": [6, 7, 8], "fig": [6, 7, 8], "ax1": [6, 7, 8], "ax2": [6, 7, 8], "subplot": [6, 7, 8], "imshow": [6, 7, 8], "lt": [6, 
7, 8], "axesimag": [6, 7, 8], "0x2aaab952f240": 6, "gt": [6, 7, 8], "quick": [6, 7, 8], "later": [6, 7, 8, 10], "field_copi": [6, 7], "4164": 6, "018869400024": 6, "0x2aab1c98b3c8": 6, "worri": [6, 8], "terminologi": [6, 8], "text": [6, 8, 14], "5": [6, 7, 8, 10, 17], "225f": [6, 7, 8], "diffuse_kernel": [6, 7, 8], "u_new": [6, 7, 8], "0f": [6, 7, 8], "togeth": [6, 7, 8, 13], "choos": [6, 7, 8, 14, 17], "impact": [6, 7, 8, 11], "fix": [6, 7, 8, 17], "unrol": [6, 7, 8, 9, 14, 22], "loop": [6, 7, 8, 9, 14, 22], "drv": 6, "sourcemodul": [6, 8, 10], "init": 6, "make_context": 6, "devprop": 6, "k": [6, 7, 8, 10, 12, 14, 18], "get_devic": 6, "get_attribut": 6, "cc": 6, "compute_capability_major": 6, "compute_capability_minor": 6, "u_old": [6, 8], "mem_alloc": 6, "nbyte": 6, "block_size_str": [6, 8], "arch": 6, "sm_": 6, "get_funct": [6, 8, 10], "boilerpl": [6, 7, 8], "moment": [6, 7, 8], "serv": [6, 7, 8, 15, 17], "guess": [6, 7, 8], "pair": [6, 7, 8], "500": [6, 7, 8], "time_sinc": 6, "zeros_lik": [6, 10, 12, 14, 18, 20], "set_titl": [6, 7, 8], "53": [6, 7, 8], "423038482666016": 6, "0x2aaabbdcb2e8": 6, "faster": [6, 7, 8, 14], "cleanup": 6, "pop": 6, "think": [6, 7, 8], "messi": [6, 7, 8], "got": [6, 7, 8], "cleaner": [6, 7, 8], "previous": [6, 7, 8, 14], "plai": [6, 7, 8], "difficult": [6, 7, 8, 19, 20], "rather": [6, 7, 8], "underutil": [6, 7, 8], "purpos": [6, 7, 8, 11, 14, 22], "feel": [6, 7, 8], "48": [6, 7, 8], "care": [6, 7, 8], "appropi": [6, 7, 8], "fly": [6, 7, 8], "12": [6, 7, 8], "13": [6, 7, 8], "geforc": [6, 7, 8, 10], "gtx": [6, 7, 8, 10], "titan": [6, 7, 8], "22305920124": 6, "779033613205": 6, "824838399887": 6, "900499212742": 6, "999763202667": 6, "727967989445": 6, "752479994297": 6, "797900807858": 6, "876627194881": 6, "93347837925": 6, "766662418842": 6, "803033602238": 6, "853574407101": 6, "971545600891": 6, "763775992393": 6, "791257584095": 6, "848044800758": 6, "922745585442": 6, "792595207691": 6, "822137594223": 6, "893279993534": 6, 
"well": [6, 7, 8, 10, 14, 16], "millisecond": [6, 7, 8], "averag": [6, 7, 8, 11, 16], "matter": [6, 7, 8, 11], "analyz": [6, 7, 8], "seem": [6, 7, 8], "vari": [6, 7, 8, 10, 14, 15], "addtion": [6, 7, 8], "among": [6, 7, 8, 12, 17], "128x32": [6, 7, 8], "likewis": [6, 7, 8], "becom": [6, 7, 8, 16, 17], "affect": [6, 7, 8, 14], "within": [6, 7, 8, 10, 14, 17], "exchang": [6, 7, 8], "fact": [6, 7, 8, 11], "commun": [6, 7, 8], "idea": [6, 7, 8, 11, 14, 22], "control": [6, 7, 8, 16, 17], "l2": [6, 7, 8], "closer": [6, 7, 8], "multiprocessor": [6, 7, 8], "l1": [6, 7, 8], "fine": [6, 7, 8], "grain": [6, 7, 8], "manag": [6, 7, 8, 14, 16], "cost": [6, 7, 8, 17], "overhead": [6, 7, 8, 14], "degrad": [6, 7, 8], "intermedi": [6, 7, 8], "mind": [6, 7, 8], "14": [6, 7, 8], "tx": [6, 7, 8, 14], "ty": [6, 7, 8, 14], "bx": [6, 7, 8, 10], "__shared__": [6, 8, 14], "sh_u": [6, 7, 8], "pragma": [6, 7, 8, 14], "__syncthread": [6, 7, 8, 14], "75041918755": 6, "18713598251": 6, "09015038013": 6, "06844799519": 6, "09730558395": 6, "14420480728": 6, "05957758427": 6, "07508480549": 6, "0731967926": 6, "14729599953": 6, "08389122486": 6, "10700161457": 6, "10125439167": 6, "31661438942": 6, "0629119873": 6, "04807043076": 6, "054880023": 6, "12033278942": 6, "06672639847": 6, "05816960335": 6, "12000002861": 6, "sometim": [6, 7, 8, 19], "merg": [6, 7, 8, 14], "half": [6, 7, 8], "doubl": [6, 7, 8, 19, 20], "cover": [6, 7, 8, 17], "beyond": [6, 7, 8], "reduc": [6, 7, 8, 14], "condens": [6, 7, 8], "keep": [6, 7, 8, 14, 19], "importantli": [6, 7, 8], "worst": [6, 7, 8], "15": [6, 7, 8, 20], "tj": [6, 7, 8], "ti": [6, 7, 8, 10], "somehow": [6, 7, 8], "larger": [6, 7, 8, 11, 17, 20], "insid": [6, 7, 8, 11, 14, 20], "round": [6, 7, 8], "arithmet": [6, 7, 8], "evalu": [6, 7, 8, 14, 17], "759308815": 6, "29789438248": 6, "06983039379": 6, "2634239912": 6, "997139203548": 6, "843692803383": 6, "05549435616": 6, "862348806858": 6, "750636804104": 6, "19084160328": 6, "876377594471": 6, 
"714169609547": 6, "875001597404": 6, "691116797924": 6, "575859189034": 6, "759679996967": 6, "622867202759": 6, "650336003304": 6, "09794559479": 6, "826515209675": 6, "692665600777": 6, "78363519907": 6, "646092808247": 6, "554745602608": 6, "716115188599": 6, "581280004978": 6, "662566399574": 6, "07386879921": 6, "833420813084": 6, "705055999756": 6, "840755212307": 6, "652575993538": 6, "569388794899": 6, "689356791973": 6, "597267186642": 6, "675232005119": 6, "10033922195": 6, "860332798958": 6, "731891202927": 6, "867276787758": 6, "68781440258": 6, "595276796818": 6, "735436797142": 6, "60216319561": 6, "852166390419": 6, "15089921951": 6, "852575981617": 6, "705932807922": 6, "888671982288": 6, "673248004913": 6, "563417613506": 6, "761139214039": 6, "621254396439": 6, "676595199108": 6, "06709122658": 6, "804953610897": 6, "685670387745": 6, "801798415184": 6, "632006394863": 6, "542387211323": 6, "722668802738": 6, "578745603561": 6, "618598401546": 6, "08220798969": 6, "821881604195": 6, "687955200672": 6, "77759360075": 6, "618003201485": 6, "539891195297": 6, "705900788307": 6, "568556785583": 6, "624492788315": 6, "0799423933": 6, "832300806046": 6, "70140799284": 6, "835481595993": 6, "638348805904": 6, "550105595589": 6, "667251205444": 6, "576044797897": 6, "732409596443": 6, "15916161537": 6, "869497597218": 6, "733248019218": 6, "890803205967": 6, "677363204956": 6, "577215993404": 6, "730982398987": 6, "58035838604": 6, "10066559315": 6, "837804794312": 6, "691385602951": 6, "851040017605": 6, "666656005383": 6, "560505592823": 6, "771103990078": 6, "626163220406": 6, "694451200962": 6, "11514236927": 6, "837299215794": 6, "703302407265": 6, "806828796864": 6, "648620784283": 6, "562521612644": 6, "760915207863": 6, "605760002136": 6, "690009605885": 6, "10740480423": 6, "841631996632": 6, "700883197784": 6, "838195204735": 6, "649779188633": 6, "56585599184": 6, "7168192029": 6, "59088640213": 6, "69627519846": 6, "3269824028": 6, 
"02665598392": 6, "840908801556": 6, "03752319813": 6, "788345599174": 6, "662041604519": 6, "85437438488": 6, "680422389507": 6, "0759360075": 6, "801996803284": 6, "666003203392": 6, "808000004292": 6, "643359994888": 6, "544691193104": 6, "741964805126": 6, "60942081213": 6, "681350398064": 6, "05262081623": 6, "792108798027": 6, "66344319582": 6, "768064010143": 6, "625260794163": 6, "540352010727": 6, "721862399578": 6, "579411196709": 6, "626976013184": 6, "06332798004": 6, "808211183548": 6, "679372787476": 6, "803718411922": 6, "627136015892": 6, "538227200508": 6, "682188808918": 6, "573836791515": 6, "725548803806": 6, "13023357391": 6, "843411195278": 6, "713843202591": 6, "85886080265": 6, "657920002937": 6, "565254402161": 6, "697094392776": 6, "579904007912": 6, "07484800816": 6, "801119995117": 6, "667347204685": 6, "799059200287": 6, "643820810318": 6, "542937588692": 6, "740518403053": 6, "615148806572": 6, "731334400177": 6, "07002239227": 6, "805299210548": 6, "675923216343": 6, "782060790062": 6, "631142401695": 6, "540383994579": 6, "723999989033": 6, "578681600094": 6, "726335990429": 6, "13297917843": 6, "844428789616": 6, "710278391838": 6, "835494399071": 6, "637958395481": 6, "567417597771": 6, "699366402626": 6, "588492810726": 6, "tri": [6, 7, 8, 17], "grow": [6, 7, 8], "quickli": [6, 7, 8], "went": [6, 7, 8, 10], "72": [6, 7, 8], "26": [6, 7, 8], "32x2": [6, 7, 8], "64x4": [6, 7, 8], "four": [6, 7, 8], "best_tim": [6, 7], "min": [6, 7], "05": [6, 7], "join": [6, 7], "nice": [6, 7], "stdout": [6, 7], "why": [6, 7, 11, 15], "easili": [6, 7, 16], "easi": [6, 7, 15, 16], "csv": [6, 7, 9], "analysi": [6, 7], "panda": [6, 7, 9, 13], "18": [6, 7, 8], "fp": [6, 7], "datafram": [6, 7], "df": [6, 7], "to_csv": [6, 7], "0x2aab1de088d0": 7, "01": 7, "sy": 7, "140": 7, "wall": 7, "98": 7, "__kernel": 7, "get_group_id": 7, "get_local_id": 7, "cl": 7, "ctx": 7, "create_some_context": 7, "mf": 7, "mem_flag": 7, "a_h": 7, "a_d": 7, "read_writ": 7, 
"copy_host_ptr": 7, "hostbuf": 7, "b_d": 7, "kernel_src": 7, "prg": 7, "queue": 7, "commandqueu": 7, "run_gpu": 7, "444": 7, "154": 7, "598": 7, "985": 7, "enqueue_copi": 7, "1748096": 7, "7284544": 7, "7707904": 7, "8573184": 7, "8380288": 7, "686528": 7, "69648": 7, "7461632": 7, "818304": 7, "771072": 7, "7190464": 7, "7522432": 7, "7982208": 7, "9624512": 7, "7214464": 7, "7453312": 7, "8028416": 7, "8922624": 7, "747328": 7, "7860736": 7, "8637184": 7, "__local": 7, "barrier": 7, "clk_local_mem_f": 7, "8449472": 7, "1912576": 7, "1035136": 7, "0927808": 7, "1140736": 7, "1790336": 7, "0808192": 7, "0809792": 7, "0836928": 7, "1545856": 7, "1249984": 7, "1264": 7, "1230336": 7, "4015104": 7, "0873216": 7, "0626496": 7, "0692224": 7, "140192": 7, "0801344": 7, "0688128": 7, "1428928": 7, "8844544": 7, "3245952": 7, "0911808": 7, "3039616": 7, "0079296": 7, "84848": 7, "0708288": 7, "857728": 7, "7561792": 7, "231072": 7, "8774336": 7, "7087296": 7, "8772672": 7, "6911872": 7, "5715968": 7, "7584896": 7, "6292032": 7, "6498688": 7, "1145664": 7, "8252928": 7, "6757568": 7, "7881152": 7, "6237696": 7, "544224": 7, "6951168": 7, "5648128": 7, "6452736": 7, "1065792": 7, "8313792": 7, "6905984": 7, "8302656": 7, "6367488": 7, "5478592": 7, "6660672": 7, "5719744": 7, "6551744": 7, "1384064": 7, "8531072": 7, "7078976": 7, "8516672": 7, "6677696": 7, "5685632": 7, "7074048": 7, "5753152": 7, "8228864": 7, "2124736": 7, "8633344": 7, "6921216": 7, "8896384": 7, "6659904": 7, "5582144": 7, "7522624": 7, "6081536": 7, "6664448": 7, "1095936": 7, "8063424": 7, "6717888": 7, "7982848": 7, "6263552": 7, "5289728": 7, "7008832": 7, "567456": 7, "5968704": 7, "1018432": 7, "8117248": 7, "6724736": 7, "7728576": 7, "6038336": 7, "5172352": 7, "6796352": 7, "5470016": 7, "5968448": 7, "1107712": 7, "8237248": 7, "6810944": 7, "821952": 7, "620352": 7, "5230208": 7, "6415552": 7, "5476864": 7, "7168192": 7, "1942016": 7, "8626304": 7, "7099712": 7, "9123328": 7, "6608448": 7, 
"5631168": 7, "7113024": 7, "556576": 7, "1583104": 7, "8384832": 7, "67856": 7, "845856": 7, "6581248": 7, "54944": 7, "7520064": 7, "6076224": 7, "6842112": 7, "1547072": 7, "8422016": 7, "6895552": 7, "8037312": 7, "6387072": 7, "5383296": 7, "7326656": 7, "5863488": 7, "6813376": 7, "1493952": 7, "8444928": 7, "6929216": 7, "832768": 7, "6389312": 7, "5412672": 7, "698336": 7, "5717568": 7, "676096": 7, "4303104": 7, "0341696": 7, "8365184": 7, "0398656": 7, "7786496": 7, "648928": 7, "8479232": 7, "6508544": 7, "1219392": 7, "7994048": 7, "6492288": 7, "8068416": 7, "6343168": 7, "5235328": 7, "7268928": 7, "5898432": 7, "6633536": 7, "0849664": 7, "7869632": 7, "6458624": 7, "7611968": 7, "613088": 7, "50912": 7, "6972928": 7, "5620608": 7, "601856": 7, "095232": 7, "7967488": 7, "6601472": 7, "7952896": 7, "6047296": 7, "5108224": 7, "6607744": 7, "5492416": 7, "7091136": 7, "171552": 7, "8473408": 7, "6962112": 7, "8663936": 7, "6466816": 7, "5475584": 7, "6754048": 7, "5591744": 7, "108896": 7, "7907264": 7, "6459328": 7, "7965888": 7, "6250816": 7, "5188416": 7, "721408": 7, "5920832": 7, "7068608": 7, "0909248": 7, "7930752": 7, "6524544": 7, "7745216": 7, "6146176": 7, "5116928": 7, "6975872": 7, "5548416": 7, "7075136": 7, "174624": 7, "8384512": 7, "69104": 7, "8335488": 7, "6264192": 7, "5445248": 7, "6719104": 7, "5592064": 7, "19": [7, 8], "solv": 8, "0x7f888f8cd7b8": 8, "4152": 8, "086019515991": 8, "0x7f8865b51f28": 8, "gpuarrai": [8, 10], "tool": [8, 10, 12], "autoinit": [8, 10], "to_gpu": [8, 10], "mod": [8, 10], "t0": [8, 10], "ona": 8, "33": 8, "46109390258789": 8, "0x7f8858b873c8": 8, "1080": [8, 10], "916985595226": 8, "489004802704": 8, "500524806976": 8, "513356792927": 8, "545715200901": 8, "486515200138": 8, "449055999517": 8, "44974719882": 8, "457427197695": 8, "492915201187": 8, "464863997698": 8, "466118401289": 8, "475264000893": 8, "513632011414": 8, "458412796259": 8, "457715201378": 8, "461017608643": 8, "475987195969": 8, 
"460032004118": 8, "457779198885": 8, "462649595737": 8, "kernel_string_shar": 8, "22673916817": 8, "826361596584": 8, "793516802788": 8, "782112002373": 8, "776639997959": 8, "795135998726": 8, "722777605057": 8, "762777590752": 8, "75422719717": 8, "804876792431": 8, "778656005859": 8, "769734406471": 8, "782495999336": 8, "932281601429": 8, "734028804302": 8, "721625590324": 8, "736511993408": 8, "800019192696": 8, "724966406822": 8, "722969603539": 8, "759430396557": 8, "kernel_string_til": 8, "22200961113": 8, "91601279974": 8, "752838408947": 8, "873651194572": 8, "69833599329": 8, "586931192875": 8, "516473591328": 8, "411392003298": 8, "384262400866": 8, "82159358263": 8, "632607996464": 8, "506457602978": 8, "618758392334": 8, "500288009644": 8, "429862397909": 8, "44995200038": 8, "366150397062": 8, "342201602459": 8, "793542397022": 8, "58026239872": 8, "494163197279": 8, "546316814423": 8, "467059195042": 8, "404249596596": 8, "440895992517": 8, "341376006603": 8, "339692795277": 8, "783923208714": 8, "597920000553": 8, "50277120471": 8, "615475213528": 8, "470937597752": 8, "418393599987": 8, "443519997597": 8, "343961596489": 8, "342540800571": 8, "780352008343": 8, "611705589294": 8, "515667212009": 8, "622534394264": 8, "502195191383": 8, "437388807535": 8, "45568639636": 8, "359289598465": 8, "426995199919": 8, "788947200775": 8, "616556799412": 8, "496121603251": 8, "629164803028": 8, "474841600657": 8, "407667201757": 8, "47406719923": 8, "371507203579": 8, "352531200647": 8, "72023679018": 8, "574816000462": 8, "481817597151": 8, "580928003788": 8, "455724793673": 8, "394975996017": 8, "464659202099": 8, "357107198238": 8, "324083191156": 8, "759910392761": 8, "569177603722": 8, "481279999018": 8, "528115200996": 8, "441734397411": 8, "393126398325": 8, "455404800177": 8, "350457596779": 8, "322547197342": 8, "754201591015": 8, "579827189445": 8, "491852802038": 8, "582751989365": 8, "451283198595": 8, "391807991266": 8, "456275194883": 8, 
"356716805696": 8, "362937599421": 8, "809894394875": 8, "60433280468": 8, "507142400742": 8, "655827200413": 8, "474092799425": 8, "408166396618": 8, "480531209707": 8, "346707201004": 8, "780134403706": 8, "601049602032": 8, "493900799751": 8, "620384001732": 8, "494553589821": 8, "425414395332": 8, "467033600807": 8, "375468802452": 8, "346079999208": 8, "771052801609": 8, "593977594376": 8, "49723520875": 8, "583270406723": 8, "478079998493": 8, "416320002079": 8, "443942397833": 8, "359744000435": 8, "343545603752": 8, "780960011482": 8, "598758399487": 8, "498617601395": 8, "57678719759": 8, "46561280489": 8, "41324160099": 8, "431225597858": 8, "351263999939": 8, "34440960288": 8, "933260798454": 8, "715257608891": 8, "586604809761": 8, "711615991592": 8, "558771193027": 8, "466284793615": 8, "44043520093": 8, "361823999882": 8, "731839990616": 8, "57044479847": 8, "470220798254": 8, "608800005913": 8, "472665601969": 8, "416352003813": 8, "481376004219": 8, "380812799931": 8, "351923197508": 8, "719257593155": 8, "55171200037": 8, "466758400202": 8, "568435204029": 8, "459654402733": 8, "394380801916": 8, "463052803278": 8, "36409599781": 8, "328998398781": 8, "73579518795": 8, "564575994015": 8, "472236800194": 8, "549024009705": 8, "438406395912": 8, "389945602417": 8, "455193603039": 8, "364051198959": 8, "375519996881": 8, "798195195198": 8, "588998401165": 8, "49552000761": 8, "595462405682": 8, "460972803831": 8, "400672000647": 8, "465132802725": 8, "364627194405": 8, "729363203049": 8, "558815991879": 8, "466655993462": 8, "600819194317": 8, "460281592607": 8, "404908800125": 8, "478739196062": 8, "386668801308": 8, "385510402918": 8, "720915210247": 8, "550668799877": 8, "466937589645": 8, "564921605587": 8, "447974395752": 8, "394271999598": 8, "46233600378": 8, "365190398693": 8, "387827193737": 8, "762003195286": 8, "579007995129": 8, "486649608612": 8, "557331204414": 8, "443033593893": 8, "396070402861": 8, "457075202465": 8, "369555193186": 
8, "wish": 8, "modifi": [8, 16], "tile_size_j": 8, "fixed_param": [8, 10], "ceil": [8, 10], "zip": [8, 10], "transfer": [8, 9, 11], "20": [8, 17], "21": 8, "618": 8, "2231903076172": 8, "0x7f887c3d2358": 8, "incorpor": 8, "ifndef": 8, "kerenel": 8, "psedo": 8, "endif": 8, "bypass": 8, "usecas": 9, "test_vector_add": 9, "test_vector_add_parameter": 9, "illustr": 9, "dimension": [9, 10], "clean": [9, 14], "center": [9, 10], "lock": [9, 16], "overlap": [9, 11], "shuffl": 9, "pipelin": 9, "consist": [9, 14], "scipi": 9, "algorithm": [9, 12, 17], "cub": 9, "librari": [9, 16, 19], "gaussian": 10, "delv": 10, "hand": [10, 14], "sum_": 10, "exp": 10, "beta": [10, 17], "sqrt": 10, "y_i": 10, "z_i": 10, "vector": [10, 11, 18], "coordin": 10, "linalg": 10, "la": 10, "compute_grid": 10, "xgrid": 10, "ygrid": 10, "zgrid": 10, "x0": 10, "y0": 10, "z0": 10, "themselv": 10, "meshgrid": 10, "send": 10, "interv": 10, "256": [10, 12, 18], "suffici": [10, 15], "100": [10, 17], "randomli": [10, 17], "distribut": [10, 14], "linspac": 10, "cpu_grid": 10, "npt": 10, "rand": 10, "xyz": 10, "52320": 10, "160627": 10, "might": [10, 15], "nz": 10, "bz": 10, "kernel_cod": 10, "math": 10, "__host__": 10, "__device__": [10, 20], "b": [10, 12, 14, 17, 18, 20], "addgrid": 10, "xvect": 10, "yvect": 10, "zvect": 10, "dx": 10, "dy": 10, "dz": 10, "assign": 10, "explor": 10, "middl": 10, "henc": [10, 19], "mention": 10, "56833920479": 10, "80796158314": 10, "940044796467": 10, "855628800392": 10, "855359995365": 10, "16174077988": 10, "11877760887": 10, "01592960358": 10, "849273598194": 10, "849235200882": 10, "19029750824": 10, "16199679375": 10, "40401918888": 10, "39618558884": 10, "39508478642": 10, "31647996902": 10, "31470079422": 10, "50787198544": 10, "53760001659": 10, "56709756851": 10, "34500494003": 10, "25130877495": 10, "50662400723": 10, "55267841816": 10, "17987194061": 10, "12309756279": 10, "01125121117": 10, "849631989002": 10, "853708791733": 10, "17051515579": 10, "15584001541": 
10, "40074241161": 10, "39547519684": 10, "39331197739": 10, "30295038223": 10, "28725762367": 10, "39589118958": 10, "38867840767": 10, "37724158764": 10, "34344320297": 10, "26213116646": 10, "38793599606": 10, "3775359869": 10, "74003200531": 10, "13276162148": 10, "37233917713": 10, "18835201263": 10, "15777277946": 10, "40247042179": 10, "39366400242": 10, "39439997673": 10, "23719043732": 10, "28542718887": 10, "39207677841": 10, "38956804276": 10, "3778496027": 10, "29814395905": 10, "26398081779": 10, "38625922203": 10, "3754431963": 10, "72981758118": 10, "12483196259": 10, "37322881222": 10, "61618566513": 10, "2194111824": 10, "17600002289": 10, "27082881927": 10, "38787200451": 10, "3835711956": 10, "37543039322": 10, "30227203369": 10, "23127679825": 10, "38627202511": 10, "37677440643": 10, "64358406067": 10, "12255358696": 10, "37474560738": 10, "61655673981": 10, "19179515839": 10, "99912958145": 10, "213971138": 10, "16430072784": 10, "38772480488": 10, "3735104084": 10, "54432649612": 10, "05524477959": 10, "36935677528": 10, "42449922562": 10, "10455036163": 10, "67516155243": 10, "programmat": 10, "With": [10, 11], "30": 10, "minimum": 10, "84": 10, "suit": 10, "grid_dim": 10, "associ": 10, "substitut": 10, "ourselv": 10, "extract": 10, "manual": [10, 13], "exlicitli": 10, "accur": [10, 16], "xgpu": 10, "ygpu": 10, "zgpu": 10, "grid_gpu": 10, "80": 10, "133200": 10, "lower": [10, 16, 17], "roughli": [10, 14], "40000": 10, "across": [11, 14], "handl": 11, "qualiti": 11, "itself": [11, 12], "precis": 11, "plain": 11, "omp_get_wtim": 11, "openmp": 11, "convolution_stream": 11, "complex": [11, 14], "behind": 11, "spread": 11, "back": 11, "split": 11, "chunk": 11, "slightli": [11, 14, 20], "account": [11, 14], "border": 11, "latter": 11, "cudastreamwaitev": 11, "num_stream": 11, "clarifi": 11, "fit": [11, 17], "choic": [11, 13], "grid_size_x": 11, "grid_size_i": 11, "cudamemcpytosymbol": 11, "upload": 11, "memcpi": 11, "yourself": 11, "spent": 11, 
"relat": [12, 15, 22], "famili": 12, "launcher": 12, "kt": [12, 19], "easiest": 12, "toolkit": [12, 13], "intend": 12, "Or": [12, 13], "vector_add": [12, 17, 18, 20], "10000000": 12, "512": [12, 18], "research": 12, "cite": 12, "articl": [12, 18], "author": 12, "ben": 12, "van": 12, "werkhoven": 12, "titl": 12, "auto": [12, 14, 16, 17, 20, 22], "journal": 12, "year": 12, "2019": 12, "volum": 12, "90": 12, "347": 12, "358": 12, "url": 12, "www": 12, "sciencedirect": 12, "scienc": 12, "pii": 12, "s0167739x18313359": 12, "doi": 12, "1016": 12, "2018": 12, "08": 12, "004": 12, "willemsen2021bayesian": 12, "willemsen": [12, 17], "flori": 12, "jan": 12, "nieuwpoort": 12, "rob": 12, "bayesian": [12, 17], "workshop": 12, "pmb": 12, "supercomput": 12, "sc21": 12, "2021": 12, "arxiv": 12, "ab": 12, "2111": 12, "14991": 12, "schoonhoven2022benchmark": 12, "schoonhoven": 12, "richard": 12, "batenburg": 12, "joost": 12, "ieee": 12, "transact": 12, "evolutionari": 12, "2022": 12, "schoonhoven2022go": 12, "veenboer": 12, "bram": 12, "green": 12, "energi": [12, 16, 17, 22], "effici": [12, 14, 16], "steer": 12, "sc22": 12, "2211": 12, "07260": 12, "comprehens": 13, "recommend": [13, 19], "linux": 13, "download": 13, "wget": 13, "repo": 13, "continuum": 13, "io": 13, "miniconda3": 13, "x86_64": 13, "sh": 13, "newer": [13, 16], "nativ": 13, "command": 13, "prefix": 13, "home": 13, "pythonpath": 13, "bind": [13, 16], "older": 13, "troubl": 13, "retri": 13, "dir": 13, "wiki": 13, "tiker": 13, "net": 13, "amd": [13, 16], "app": 13, "sdk": 13, "intel": 13, "appl": 13, "beignet": 13, "stack": 13, "jatinx": 13, "navig": 13, "benvanwerkhoven": 13, "differenti": [13, 17], "chanc": [13, 17, 20], "algebra": 14, "frequent": 14, "programm": [14, 16], "row": 14, "column": 14, "squar": 14, "matric": 14, "matmul_na": 14, "width": 14, "matmul_kernel": 14, "height": 14, "Of": 14, "solut": [14, 16], "realiti": 14, "contant": 14, "denot": [14, 18], "sensibl": 14, "pick": 14, "word": 14, "warpsiz": 14, 
"namelijk": 14, "stand": 14, "briefli": 14, "figur": 14, "fifth": 14, "fourth": 14, "dramat": 14, "profil": 14, "util": 14, "pretti": 14, "opportun": 14, "realiz": 14, "collabor": 14, "bandwidth": 14, "techniqu": 14, "submatric": 14, "proce": 14, "matmul_shar": 14, "sa": 14, "sb": 14, "kb": 14, "outer": 14, "inner": 14, "race": 14, "drastic": 14, "consumpt": [14, 16], "due": [14, 20], "significantli": [14, 16], "fortun": 14, "benefit": 14, "redund": 14, "distinct": 14, "1xn": 14, "usag": [14, 16], "occup": 14, "goe": 14, "down": 14, "matmul": 14, "newli": 14, "coupl": 14, "respect": [14, 16], "independ": 14, "yield": 14, "discontinu": 14, "room": 14, "impos": 14, "report": [15, 16, 22], "possibli": 15, "_flop": 15, "total_flop": 15, "ps_energi": [15, 16, 22], "occur": 15, "exhaust": 15, "brute": [15, 17, 18], "forc": [15, 17, 18, 20], "maxim": 15, "boolean": [15, 16], "facilit": 16, "layer": 16, "act": 16, "hook": 16, "pattern": 16, "subscrib": 16, "benchmarkobserv": 16, "overwritten": 16, "extend": 16, "mandatori": 16, "get_result": 16, "aggreg": 16, "after_finish": 16, "after_start": 16, "before_start": 16, "register_configur": 16, "register_devic": 16, "dev": 16, "variou": [16, 18], "registerobserv": 16, "track": 16, "counter": 16, "num_reg": 16, "current_modul": 16, "powersensor2": 16, "pcie": 16, "intercept": 16, "sensor": 16, "transmit": 16, "usb": 16, "connect": 16, "advantag": 16, "instantan": 16, "frequenc": 16, "khz": 16, "pybind11": 16, "powersensor": [16, 22], "extern": [16, 20], "ps_power": [16, 22], "joul": [16, 22], "watt": [16, 22], "ttyacm0": 16, "core": 16, "voltag": 16, "thin": 16, "wrapper": [16, 20], "intricaci": 16, "friendli": 16, "mode": 16, "repeatedli": 16, "downsid": 16, "approach": 16, "save_al": 16, "nvidia_smi_fallback": 16, "use_locked_clock": 16, "continous_dur": 16, "monitor": 16, "clock": [16, 22], "power_read": [16, 22], "nvml_power": [16, 22], "nvml_energi": [16, 22], "core_freq": [16, 22], "mem_freq": [16, 22], "gr_voltag": 16, 
"ordin": 16, "identifi": 16, "smi": 16, "root": 16, "privileg": 16, "opt": 16, "amper": 16, "continuous_dur": 16, "common": [16, 20], "cap": 16, "popular": 16, "nvml_gr_clock": [16, 22], "nvml_mem_clock": [16, 22], "nvml_pwr_limit": [16, 22], "graphic": [16, 22], "jetson": 16, "rapl": 16, "xilinx": 16, "pmt": 16, "astron": 16, "nl": 16, "rd": 16, "meter": 16, "arduino": 16, "_energi": 16, "_power": 16, "acceler": 17, "prohibit": 17, "slow": 17, "wast": 17, "basin": 17, "hop": 17, "dual": 17, "anneal": 17, "evolut": 17, "firefli": 17, "genet": 17, "greedi": 17, "local": 17, "multi": 17, "particl": 17, "swarm": 17, "mechan": 17, "overrid": 17, "time_limit": 17, "uniqu": 17, "count": 17, "searchspac": 17, "runner": 17, "nelder": 17, "mead": 17, "powel": 17, "cg": 17, "bfg": 17, "l": 17, "tnc": 17, "cobyla": 17, "slsqp": 17, "reject": 17, "thesi": 17, "generate_normalized_param_dict": 17, "denorm": 17, "normalize_parameter_spac": 17, "param_spac": 17, "prune_parameter_spac": 17, "normalize_dict": 17, "prune": 17, "hyperparamet": 17, "popul": 17, "best1bin": 17, "best1exp": 17, "rand1exp": 17, "randtobest1exp": 17, "best2exp": 17, "rand2exp": 17, "randtobest1bin": 17, "best2bin": 17, "rand2bin": 17, "rand1bin": 17, "popsiz": 17, "maxit": 17, "constr": 17, "compute_intens": 17, "fun": 17, "intens": 17, "distance_to": 17, "euclidian": 17, "move_toward": 17, "alpha": 17, "toward": 17, "b0": 17, "attract": 17, "gamma": 17, "light": 17, "absorpt": 17, "coeffici": 17, "disruptive_uniform_crossov": 17, "dna1": 17, "dna2": 17, "disrupt": 17, "uniform": 17, "crossov": 17, "uniformli": 17, "gene": 17, "children": 17, "guarante": 17, "parent": 17, "mutat": 17, "dna": 17, "mutation_ch": 17, "single_point_crossov": 17, "index": 17, "single_point": 17, "two_point": 17, "disruptive_uniform": 17, "two_point_crossov": 17, "uniform_crossov": 17, "weighted_choic": 17, "probabl": 17, "il": 17, "neighbor": 17, "ham": 17, "adjac": 17, "greedy": 17, "soon": 17, "no_improv": 17, "exce": 17, 
"50": 17, "random_walk": 17, "hillclimb": 17, "travers": 17, "inertia": 17, "c1": 17, "cognit": 17, "c2": 17, "social": 17, "fraction": 17, "acceptance_prob": 17, "old_cost": 17, "new_cost": 17, "modif": [17, 19], "po": 17, "t_min": 17, "001": 17, "995": 17, "vector_add_kernel": 18, "wise": 18, "1000000": [18, 20], "recogn": 18, "alright": 18, "issu": 19, "portabl": 19, "stick": 19, "pointer": 19, "primit": 19, "lead": 19, "ineffici": 19, "situat": 19, "scientif": 19, "sens": 19, "experiment": 19, "pack": 19, "consult": 19, "create_receive_spec_struct": 19, "0l": 19, "pad": 19, "8byte": 19, "packstr": 19, "iiiiiiiiiiippi": 19, "fffi": 19, "nsampl": 19, "nsamplesiq": 19, "nslowtimesampl": 19, "nchannel": 19, "ntx": 19, "nrepeat": 19, "nfasttimesampl": 19, "rfsize": 19, "mnrow": 19, "mnrowsiq": 19, "nactivechannel": 19, "isiq": 19, "fsiq": 19, "fc": 19, "nbuffer": 19, "frombuff": 19, "len": 19, "receive_spec": 19, "bf": 19, "rf": 19, "recon": 19, "sync": 19, "length": 19, "slight": 19, "matlab": 20, "typenam": 20, "my_typ": 20, "linkag": 20, "regardless": 20, "demot": 20, "rewrit": 20, "real": 20, "risk": 20, "nvrtc": 20, "seper": 20, "block_size_": 22, "grid_size_": 22, "compiler_opt_": 22, "loop_unroll_factor_": 22, "nvml_": 22, "nvml": 22, "nvmlobserv": 22}, "objects": {"kernel_tuner.backends.cupy": [[5, 0, 1, "", "CupyFunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.backends.hip": [[5, 0, 1, "", "HipFunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 
1, "", "compile"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.backends.nvcuda": [[5, 0, 1, "", "CudaFunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.backends.opencl": [[5, 0, 1, "", "OpenCLFunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.backends.pycuda": [[5, 0, 1, "", "PyCudaFunctions"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, 
"", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.core": [[5, 0, 1, "", "DeviceInterface"]], "kernel_tuner.core.DeviceInterface": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "benchmark"], [5, 1, 1, "", "benchmark_continuous"], [5, 1, 1, "", "benchmark_default"], [5, 1, 1, "", "check_kernel_output"], [5, 1, 1, "", "compile_kernel"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "create_kernel_instance"], [5, 1, 1, "", "get_environment"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "preprocess_gpu_arguments"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"]], "kernel_tuner.observers": [[16, 0, 1, "", "BenchmarkObserver"]], "kernel_tuner.observers.BenchmarkObserver": [[16, 1, 1, "", "after_finish"], [16, 1, 1, "", "after_start"], [16, 1, 1, "", "before_start"], [16, 1, 1, "", "during"], [16, 1, 1, "", "get_results"], [16, 1, 1, "", "register_configuration"], [16, 1, 1, "", "register_device"]], "kernel_tuner.observers.nvml": [[16, 0, 1, "", "NVMLObserver"]], "kernel_tuner.observers.pmt": [[16, 0, 1, "", "PMTObserver"]], "kernel_tuner.observers.powersensor": [[16, 0, 1, "", "PowerSensorObserver"]], "kernel_tuner.runners.sequential": [[5, 0, 1, "", "SequentialRunner"]], "kernel_tuner.runners.sequential.SequentialRunner": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "run"]], "kernel_tuner.runners.simulation": [[5, 0, 1, "", "SimulationRunner"]], "kernel_tuner.runners.simulation.SimulationRunner": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "run"]], "kernel_tuner.strategies": [[17, 2, 0, "-", "basinhopping"], [17, 2, 0, "-", "bayes_opt"], [17, 2, 0, "-", "brute_force"], [5, 2, 0, "-", "common"], [17, 2, 0, "-", "diff_evo"], [17, 2, 0, "-", "dual_annealing"], [17, 2, 0, "-", "firefly_algorithm"], [17, 2, 0, "-", "genetic_algorithm"], [17, 2, 0, "-", 
"greedy_ils"], [17, 2, 0, "-", "greedy_mls"], [17, 2, 0, "-", "minimize"], [17, 2, 0, "-", "mls"], [17, 2, 0, "-", "ordered_greedy_mls"], [17, 2, 0, "-", "pso"], [17, 2, 0, "-", "random_sample"], [17, 2, 0, "-", "simulated_annealing"]], "kernel_tuner.strategies.basinhopping": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.bayes_opt": [[17, 3, 1, "", "generate_normalized_param_dicts"], [17, 3, 1, "", "normalize_parameter_space"], [17, 3, 1, "", "prune_parameter_space"], [17, 3, 1, "", "tune"]], "kernel_tuner.strategies.brute_force": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.common": [[5, 3, 1, "", "get_options"], [5, 3, 1, "", "get_strategy_docstring"], [5, 3, 1, "", "make_strategy_options_doc"], [5, 3, 1, "", "scale_from_params"], [5, 3, 1, "", "setup_method_arguments"], [5, 3, 1, "", "setup_method_options"], [5, 3, 1, "", "snap_to_nearest_config"], [5, 3, 1, "", "unscale_and_snap_to_nearest"]], "kernel_tuner.strategies.diff_evo": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.dual_annealing": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm": [[17, 0, 1, "", "Firefly"], [17, 3, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm.Firefly": [[17, 1, 1, "", "compute_intensity"], [17, 1, 1, "", "distance_to"], [17, 1, 1, "", "move_towards"]], "kernel_tuner.strategies.genetic_algorithm": [[17, 3, 1, "", "disruptive_uniform_crossover"], [17, 3, 1, "", "mutate"], [17, 3, 1, "", "single_point_crossover"], [17, 3, 1, "", "tune"], [17, 3, 1, "", "two_point_crossover"], [17, 3, 1, "", "uniform_crossover"], [17, 3, 1, "", "weighted_choice"]], "kernel_tuner.strategies.greedy_ils": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.greedy_mls": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.minimize": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.mls": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.ordered_greedy_mls": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.pso": [[17, 3, 1, "", "tune"]], 
"kernel_tuner.strategies.random_sample": [[17, 3, 1, "", "tune"]], "kernel_tuner.strategies.simulated_annealing": [[17, 3, 1, "", "acceptance_prob"], [17, 3, 1, "", "neighbor"], [17, 3, 1, "", "tune"]], "kernel_tuner": [[5, 2, 0, "-", "util"]], "kernel_tuner.util": [[5, 0, 1, "", "CompilationFailedConfig"], [5, 0, 1, "", "ErrorConfig"], [5, 0, 1, "", "InvalidConfig"], [5, 0, 1, "", "NpEncoder"], [5, 0, 1, "", "RuntimeFailedConfig"], [5, 4, 1, "", "SkippableFailure"], [5, 4, 1, "", "StopCriterionReached"], [5, 3, 1, "", "check_argument_list"], [5, 3, 1, "", "check_argument_type"], [5, 3, 1, "", "check_restrictions"], [5, 3, 1, "", "check_stop_criterion"], [5, 3, 1, "", "check_thread_block_dimensions"], [5, 3, 1, "", "check_tune_params_list"], [5, 3, 1, "", "compile_restrictions"], [5, 3, 1, "", "config_valid"], [5, 3, 1, "", "convert_constraint_restriction"], [5, 3, 1, "", "cuda_error_check"], [5, 3, 1, "", "delete_temp_file"], [5, 3, 1, "", "detect_language"], [5, 3, 1, "", "dump_cache"], [5, 3, 1, "", "get_best_config"], [5, 3, 1, "", "get_config_string"], [5, 3, 1, "", "get_grid_dimensions"], [5, 3, 1, "", "get_instance_string"], [5, 3, 1, "", "get_kernel_string"], [5, 3, 1, "", "get_problem_size"], [5, 3, 1, "", "get_smem_args"], [5, 3, 1, "", "get_temp_filename"], [5, 3, 1, "", "get_thread_block_dimensions"], [5, 3, 1, "", "get_total_timings"], [5, 3, 1, "", "looks_like_a_filename"], [5, 3, 1, "", "normalize_verify_function"], [5, 3, 1, "", "parse_restrictions"], [5, 3, 1, "", "prepare_kernel_string"], [5, 3, 1, "", "print_config"], [5, 3, 1, "", "print_config_output"], [5, 3, 1, "", "process_cache"], [5, 3, 1, "", "process_metrics"], [5, 3, 1, "", "read_cache"], [5, 3, 1, "", "read_file"], [5, 3, 1, "", "replace_param_occurrences"], [5, 3, 1, "", "setup_block_and_grid"], [5, 3, 1, "", "store_cache"], [5, 3, 1, "", "write_file"]], "kernel_tuner.util.NpEncoder": [[5, 1, 1, "", "default"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:module", "3": 
"py:function", "4": "py:exception"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "module", "Python module"], "3": ["py", "function", "Python function"], "4": ["py", "exception", "Python exception"]}, "titleterms": {"cach": 0, "file": 0, "The": [1, 12], "kernel": [1, 6, 7, 8, 9, 10, 12, 14, 20], "tuner": [1, 6, 7, 8, 9, 10, 12], "document": [1, 2, 5, 12, 21], "guid": [1, 2, 13], "featur": 1, "refer": 1, "contribut": 2, "report": 2, "issu": 2, "code": [2, 6, 7, 8, 9, 11], "develop": 2, "environ": 2, "local": [2, 7], "setup": 2, "cluster": 2, "run": [2, 8], "test": [2, 3], "build": 2, "convolut": [3, 9], "2d": 3, "exampl": [3, 9, 12, 20], "implement": [3, 6, 7, 8], "tune": [3, 6, 7, 8, 10, 11, 14, 15, 16], "more": 3, "tunabl": 3, "paramet": [3, 8, 10, 16, 22], "correct": 4, "verif": 4, "design": 5, "strategi": [5, 17], "kernel_tun": [5, 17], "common": 5, "runner": 5, "sequenti": 5, "sequentialrunn": 5, "simulationrunn": 5, "devic": 5, "interfac": 5, "core": 5, "deviceinterfac": 5, "backend": [5, 20], "pycuda": [5, 13], "pycudafunct": 5, "cupi": 5, "cupyfunct": 5, "nvcuda": 5, "cudafunct": 5, "opencl": [5, 13], "openclfunct": 5, "c": [5, 8], "cfunction": 5, "hip": [5, 13], "hipfunct": 5, "util": 5, "function": 5, "diffus": [6, 7, 8], "python": [6, 7, 8, 13], "comput": [6, 7, 8], "gpu": [6, 7, 8, 10], "auto": [6, 7, 8], "us": [6, 7, 8, 10, 14, 19], "share": [6, 7, 8, 14], "memori": [6, 7, 8, 14], "tile": [6, 7, 8], "store": [6, 7], "result": [6, 7], "tutori": [7, 8], "from": [7, 8], "physic": [7, 8], "best": 8, "product": 8, "vector": 9, "add": 9, "stencil": 9, "matrix": [9, 14], "multipl": [9, 14], "py": 9, "sepconv": 9, "convolution_correct": 9, "convolution_stream": 9, "reduct": 9, "spars": 9, "point": 9, "polygon": 9, "expdist": 9, "gener": 9, "3d": 10, "grid": 10, "let": 10, "": 10, "start": [10, 18], "cpu": 10, "move": 10, "optim": [10, 17], "host": 11, "number": 11, "stream": 11, "quick": 12, "instal": 
[12, 13], "usag": 12, "citat": 12, "packag": 13, "cuda": [13, 14], "pyopencl": 13, "pyhip": 13, "git": 13, "version": 13, "depend": 13, "naiv": 14, "increas": 14, "work": 14, "per": 14, "thread": 14, "metric": 15, "object": 15, "observ": 16, "powersensorobserv": 16, "nvmlobserv": 16, "execut": 16, "nvml": 16, "pmtobserv": 16, "basinhop": 17, "bayes_opt": 17, "brute_forc": 17, "diff_evo": 17, "dual_ann": 17, "firefly_algorithm": 17, "genetic_algorithm": 17, "greedy_il": 17, "greedy_ml": 17, "minim": 17, "ml": 17, "ordered_greedy_ml": 17, "pso": 17, "random_sampl": 17, "simulated_ann": 17, "get": 18, "struct": 19, "templat": 20, "select": 20, "api": 21, "vocabulari": 22}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "nbsphinx": 4, "sphinx": 58}, "alltitles": {"Cache files": [[0, "cache-files"]], "The Kernel Tuner documentation": [[1, "the-kernel-tuner-documentation"], [12, "the-kernel-tuner-documentation"]], "Kernel Tuner": [[1, null]], "Guides": [[1, null]], "Features": [[1, null]], "Reference": [[1, null]], "Contribution guide": [[2, "contribution-guide"]], "Reporting Issues": [[2, "reporting-issues"]], "Contributing Code": [[2, "contributing-code"]], "Development environment": [[2, "development-environment"]], "Local setup": [[2, "local-setup"]], "Cluster setup": [[2, "cluster-setup"]], "Running tests": [[2, "running-tests"]], "Building documentation": [[2, "building-documentation"]], "Convolution": [[3, "Convolution"], [9, "convolution"]], "2D Convolution example": [[3, "2D-Convolution-example"]], "Implement a test": [[3, "Implement-a-test"]], "Tuning 2D Convolution": [[3, "Tuning-2D-Convolution"]], "More tunable parameters": [[3, "More-tunable-parameters"]], "Correctness Verification": [[4, "correctness-verification"]], 
"Design documentation": [[5, "design-documentation"]], "Strategies": [[5, "strategies"]], "kernel_tuner.strategies.common": [[5, "module-kernel_tuner.strategies.common"]], "Runners": [[5, "runners"]], "kernel_tuner.runners.sequential.SequentialRunner": [[5, "kernel-tuner-runners-sequential-sequentialrunner"]], "kernel_tuner.runners.sequential.SimulationRunner": [[5, "kernel-tuner-runners-sequential-simulationrunner"]], "Device Interfaces": [[5, "device-interfaces"]], "kernel_tuner.core.DeviceInterface": [[5, "kernel-tuner-core-deviceinterface"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[5, "kernel-tuner-backends-pycuda-pycudafunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[5, "kernel-tuner-backends-cupy-cupyfunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[5, "kernel-tuner-backends-nvcuda-cudafunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[5, "kernel-tuner-backends-opencl-openclfunctions"]], "kernel_tuner.backends.c.CFunctions": [[5, "kernel-tuner-backends-c-cfunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[5, "kernel-tuner-backends-hip-hipfunctions"]], "Util Functions": [[5, "util-functions"]], "kernel_tuner.util": [[5, "module-kernel_tuner.util"]], "Diffusion": [[6, "Diffusion"], [6, "id1"], [7, "Diffusion"], [8, "Diffusion"]], "Python implementation": [[6, "Python-implementation"], [7, "Python-implementation"], [8, "Python-implementation"]], "Computing on the GPU": [[6, "Computing-on-the-GPU"], [7, "Computing-on-the-GPU"], [8, "Computing-on-the-GPU"]], "Auto-Tuning with the Kernel Tuner": [[6, "Auto-Tuning-with-the-Kernel-Tuner"], [7, "Auto-Tuning-with-the-Kernel-Tuner"], [8, "Auto-Tuning-with-the-Kernel-Tuner"]], "Using Shared Memory": [[6, "Using-Shared-Memory"]], "Tiling GPU Code": [[6, "Tiling-GPU-Code"], [7, "Tiling-GPU-Code"], [8, "Tiling-GPU-Code"]], "Storing the results": [[6, "Storing-the-results"], [7, "Storing-the-results"]], "Tutorial: From physics to tuned GPU kernels": [[7, 
"Tutorial:-From-physics-to-tuned-GPU-kernels"], [8, "Tutorial:-From-physics-to-tuned-GPU-kernels"]], "Using Shared (local) Memory": [[7, "Using-Shared-(local)-Memory"]], "Using shared memory": [[8, "Using-shared-memory"], [14, "Using-shared-memory"]], "Using the best parameters in a production run": [[8, "Using-the-best-parameters-in-a-production-run"]], "Python run": [[8, "Python-run"]], "C run": [[8, "C-run"]], "Kernel Tuner Examples": [[9, "kernel-tuner-examples"]], "Vector Add": [[9, "vector-add"]], "Stencil": [[9, "stencil"]], "Matrix Multiplication": [[9, "matrix-multiplication"]], "convolution.py": [[9, "convolution-py"]], "sepconv.py": [[9, "sepconv-py"]], "convolution_correct.py": [[9, "convolution-correct-py"]], "convolution_streams.py": [[9, "convolution-streams-py"]], "Reduction": [[9, "reduction"]], "Sparse Matrix Vector Multiplication": [[9, "sparse-matrix-vector-multiplication"]], "Point-in-Polygon": [[9, "point-in-polygon"]], "ExpDist": [[9, "expdist"]], "Code Generator": [[9, "code-generator"]], "3D Grid on GPU with Kernel Tuner": [[10, "3D-Grid-on-GPU-with-Kernel-Tuner"]], "Let\u2019s start on the CPU": [[10, "Let's-start-on-the-CPU"]], "Let\u2019s move to the GPU": [[10, "Let's-move-to-the-GPU"]], "Tune the kernel": [[10, "Tune-the-kernel"]], "Using the optimized parameters": [[10, "Using-the-optimized-parameters"]], "Tuning Host Code": [[11, "tuning-host-code"]], "Tuning the number of streams": [[11, "tuning-the-number-of-streams"]], "Quick install": [[12, "quick-install"]], "Example usage": [[12, "example-usage"]], "Citation": [[12, "citation"]], "Installation": [[13, "installation"]], "Python": [[13, "python"]], "Installing Python Packages": [[13, "installing-python-packages"]], "CUDA and PyCUDA": [[13, "cuda-and-pycuda"]], "OpenCL and PyOpenCL": [[13, "opencl-and-pyopencl"]], "HIP and PyHIP": [[13, "hip-and-pyhip"]], "Installing the git version": [[13, "installing-the-git-version"]], "Dependencies for the guides": [[13, 
"dependencies-for-the-guides"]], "Matrix multiplication": [[14, "Matrix-multiplication"]], "Naive CUDA kernel": [[14, "Naive-CUDA-kernel"]], "Tuning a naive kernel": [[14, "Tuning-a-naive-kernel"]], "Increase work per thread": [[14, "Increase-work-per-thread"]], "Metrics and Objectives": [[15, "metrics-and-objectives"]], "Metrics": [[15, "metrics"]], "Tuning Objectives": [[15, "tuning-objectives"]], "Observers": [[16, "observers"]], "PowerSensorObserver": [[16, "powersensorobserver"]], "NVMLObserver": [[16, "nvmlobserver"]], "Tuning execution parameters with NVML": [[16, "tuning-execution-parameters-with-nvml"]], "PMTObserver": [[16, "pmtobserver"]], "Optimization strategies": [[17, "optimization-strategies"]], "kernel_tuner.strategies.basinhopping": [[17, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[17, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[17, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.diff_evo": [[17, "module-kernel_tuner.strategies.diff_evo"]], "kernel_tuner.strategies.dual_annealing": [[17, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[17, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[17, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[17, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[17, "module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[17, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[17, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[17, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[17, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[17, 
"module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[17, "module-kernel_tuner.strategies.simulated_annealing"]], "Getting Started": [[18, "getting-started"]], "Using structs": [[19, "using-structs"]], "Templated kernels": [[20, "templated-kernels"]], "Example": [[20, "example"]], "Selecting a backend": [[20, "selecting-a-backend"]], "API Documentation": [[21, "api-documentation"]], "Parameter Vocabulary": [[22, "parameter-vocabulary"]]}, "indexentries": {"compilationfailedconfig (class in kernel_tuner.util)": [[5, "kernel_tuner.util.CompilationFailedConfig"]], "cudafunctions (class in kernel_tuner.backends.nvcuda)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions"]], "cupyfunctions (class in kernel_tuner.backends.cupy)": [[5, "kernel_tuner.backends.cupy.CupyFunctions"]], "deviceinterface (class in kernel_tuner.core)": [[5, "kernel_tuner.core.DeviceInterface"]], "errorconfig (class in kernel_tuner.util)": [[5, "kernel_tuner.util.ErrorConfig"]], "hipfunctions (class in kernel_tuner.backends.hip)": [[5, "kernel_tuner.backends.hip.HipFunctions"]], "invalidconfig (class in kernel_tuner.util)": [[5, "kernel_tuner.util.InvalidConfig"]], "npencoder (class in kernel_tuner.util)": [[5, "kernel_tuner.util.NpEncoder"]], "openclfunctions (class in kernel_tuner.backends.opencl)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions"]], "pycudafunctions (class in kernel_tuner.backends.pycuda)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions"]], "runtimefailedconfig (class in kernel_tuner.util)": [[5, "kernel_tuner.util.RuntimeFailedConfig"]], "sequentialrunner (class in kernel_tuner.runners.sequential)": [[5, "kernel_tuner.runners.sequential.SequentialRunner"]], "simulationrunner (class in kernel_tuner.runners.simulation)": [[5, "kernel_tuner.runners.simulation.SimulationRunner"]], "skippablefailure": [[5, "kernel_tuner.util.SkippableFailure"]], "stopcriterionreached": [[5, "kernel_tuner.util.StopCriterionReached"]], "__init__() 
(kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.__init__"]], "__init__() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.__init__"]], "__init__() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.__init__"]], "__init__() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.__init__"]], "__init__() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.__init__"]], "__init__() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.__init__"]], "__init__() (kernel_tuner.runners.sequential.sequentialrunner method)": [[5, "kernel_tuner.runners.sequential.SequentialRunner.__init__"]], "__init__() (kernel_tuner.runners.simulation.simulationrunner method)": [[5, "kernel_tuner.runners.simulation.SimulationRunner.__init__"]], "benchmark() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.benchmark"]], "benchmark_continuous() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.benchmark_continuous"]], "benchmark_default() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.benchmark_default"]], "check_argument_list() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_argument_list"]], "check_argument_type() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_argument_type"]], "check_kernel_output() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.check_kernel_output"]], "check_restrictions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_restrictions"]], "check_stop_criterion() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_stop_criterion"]], "check_thread_block_dimensions() (in module kernel_tuner.util)": 
[[5, "kernel_tuner.util.check_thread_block_dimensions"]], "check_tune_params_list() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_tune_params_list"]], "compile() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.compile"]], "compile() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.compile"]], "compile() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.compile"]], "compile() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.compile"]], "compile() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.compile"]], "compile_kernel() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.compile_kernel"]], "compile_restrictions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.compile_restrictions"]], "config_valid() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.config_valid"]], "convert_constraint_restriction() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.convert_constraint_restriction"]], "copy_constant_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, 
"kernel_tuner.backends.pycuda.PyCudaFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.copy_constant_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.copy_shared_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() 
(kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.copy_texture_memory_args"]], "create_kernel_instance() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.create_kernel_instance"]], "cuda_error_check() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.cuda_error_check"]], "default() (kernel_tuner.util.npencoder method)": [[5, "kernel_tuner.util.NpEncoder.default"]], "delete_temp_file() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.delete_temp_file"]], "detect_language() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.detect_language"]], "dump_cache() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.dump_cache"]], "get_best_config() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_best_config"]], "get_config_string() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_config_string"]], "get_environment() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.get_environment"]], "get_grid_dimensions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_grid_dimensions"]], "get_instance_string() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_instance_string"]], "get_kernel_string() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_kernel_string"]], "get_options() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.get_options"]], "get_problem_size() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_problem_size"]], "get_smem_args() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_smem_args"]], "get_strategy_docstring() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.get_strategy_docstring"]], "get_temp_filename() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_temp_filename"]], "get_thread_block_dimensions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_thread_block_dimensions"]], 
"get_total_timings() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_total_timings"]], "kernel_finished() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.kernel_finished"]], "kernel_tuner.strategies.common": [[5, "module-kernel_tuner.strategies.common"]], "kernel_tuner.util": [[5, "module-kernel_tuner.util"]], "looks_like_a_filename() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.looks_like_a_filename"]], "make_strategy_options_doc() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.make_strategy_options_doc"]], "memcpy_dtoh() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.memcpy_dtoh"]], "memcpy_htod() 
(kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_htod"]], "memset() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.memset"]], "memset() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.memset"]], "memset() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.memset"]], "memset() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.memset"]], "memset() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.memset"]], "module": [[5, "module-kernel_tuner.strategies.common"], [5, "module-kernel_tuner.util"], [17, "module-kernel_tuner.strategies.basinhopping"], [17, "module-kernel_tuner.strategies.bayes_opt"], [17, "module-kernel_tuner.strategies.brute_force"], [17, "module-kernel_tuner.strategies.diff_evo"], [17, "module-kernel_tuner.strategies.dual_annealing"], [17, "module-kernel_tuner.strategies.firefly_algorithm"], [17, "module-kernel_tuner.strategies.genetic_algorithm"], [17, "module-kernel_tuner.strategies.greedy_ils"], [17, "module-kernel_tuner.strategies.greedy_mls"], [17, "module-kernel_tuner.strategies.minimize"], [17, "module-kernel_tuner.strategies.mls"], [17, "module-kernel_tuner.strategies.ordered_greedy_mls"], 
[17, "module-kernel_tuner.strategies.pso"], [17, "module-kernel_tuner.strategies.random_sample"], [17, "module-kernel_tuner.strategies.simulated_annealing"]], "normalize_verify_function() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.normalize_verify_function"]], "parse_restrictions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.parse_restrictions"]], "prepare_kernel_string() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.prepare_kernel_string"]], "preprocess_gpu_arguments() (kernel_tuner.core.deviceinterface static method)": [[5, "kernel_tuner.core.DeviceInterface.preprocess_gpu_arguments"]], "print_config() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.print_config"]], "print_config_output() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.print_config_output"]], "process_cache() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.process_cache"]], "process_metrics() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.process_metrics"]], "read_cache() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.read_cache"]], "read_file() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.read_file"]], "ready_argument_list() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.core.deviceinterface method)": [[5, 
"kernel_tuner.core.DeviceInterface.ready_argument_list"]], "replace_param_occurrences() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.replace_param_occurrences"]], "run() (kernel_tuner.runners.sequential.sequentialrunner method)": [[5, "kernel_tuner.runners.sequential.SequentialRunner.run"]], "run() (kernel_tuner.runners.simulation.simulationrunner method)": [[5, "kernel_tuner.runners.simulation.SimulationRunner.run"]], "run_kernel() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.run_kernel"]], "run_kernel() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.run_kernel"]], "scale_from_params() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.scale_from_params"]], "setup_block_and_grid() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.setup_block_and_grid"]], "setup_method_arguments() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.setup_method_arguments"]], "setup_method_options() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.setup_method_options"]], "snap_to_nearest_config() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.snap_to_nearest_config"]], "start_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.start_event"]], "start_event() 
(kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.start_event"]], "start_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.start_event"]], "start_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.start_event"]], "start_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.start_event"]], "stop_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.stop_event"]], "store_cache() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.store_cache"]], "synchronize() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.synchronize"]], "unscale_and_snap_to_nearest() (in module 
kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.unscale_and_snap_to_nearest"]], "write_file() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.write_file"]], "benchmarkobserver (class in kernel_tuner.observers)": [[16, "kernel_tuner.observers.BenchmarkObserver"]], "nvmlobserver (class in kernel_tuner.observers.nvml)": [[16, "kernel_tuner.observers.nvml.NVMLObserver"]], "pmtobserver (class in kernel_tuner.observers.pmt)": [[16, "kernel_tuner.observers.pmt.PMTObserver"]], "powersensorobserver (class in kernel_tuner.observers.powersensor)": [[16, "kernel_tuner.observers.powersensor.PowerSensorObserver"]], "after_finish() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.after_finish"]], "after_start() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.after_start"]], "before_start() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.before_start"]], "during() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.during"]], "get_results() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.get_results"]], "register_configuration() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.register_configuration"]], "register_device() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.register_device"]], "firefly (class in kernel_tuner.strategies.firefly_algorithm)": [[17, "kernel_tuner.strategies.firefly_algorithm.Firefly"]], "acceptance_prob() (in module kernel_tuner.strategies.simulated_annealing)": [[17, "kernel_tuner.strategies.simulated_annealing.acceptance_prob"]], "compute_intensity() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[17, 
"kernel_tuner.strategies.firefly_algorithm.Firefly.compute_intensity"]], "disruptive_uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.disruptive_uniform_crossover"]], "distance_to() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[17, "kernel_tuner.strategies.firefly_algorithm.Firefly.distance_to"]], "generate_normalized_param_dicts() (in module kernel_tuner.strategies.bayes_opt)": [[17, "kernel_tuner.strategies.bayes_opt.generate_normalized_param_dicts"]], "kernel_tuner.strategies.basinhopping": [[17, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[17, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[17, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.diff_evo": [[17, "module-kernel_tuner.strategies.diff_evo"]], "kernel_tuner.strategies.dual_annealing": [[17, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[17, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[17, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[17, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[17, "module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[17, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[17, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[17, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[17, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[17, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[17, "module-kernel_tuner.strategies.simulated_annealing"]], "move_towards() 
(kernel_tuner.strategies.firefly_algorithm.firefly method)": [[17, "kernel_tuner.strategies.firefly_algorithm.Firefly.move_towards"]], "mutate() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.mutate"]], "neighbor() (in module kernel_tuner.strategies.simulated_annealing)": [[17, "kernel_tuner.strategies.simulated_annealing.neighbor"]], "normalize_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[17, "kernel_tuner.strategies.bayes_opt.normalize_parameter_space"]], "prune_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[17, "kernel_tuner.strategies.bayes_opt.prune_parameter_space"]], "single_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.single_point_crossover"]], "tune() (in module kernel_tuner.strategies.basinhopping)": [[17, "kernel_tuner.strategies.basinhopping.tune"]], "tune() (in module kernel_tuner.strategies.bayes_opt)": [[17, "kernel_tuner.strategies.bayes_opt.tune"]], "tune() (in module kernel_tuner.strategies.brute_force)": [[17, "kernel_tuner.strategies.brute_force.tune"]], "tune() (in module kernel_tuner.strategies.diff_evo)": [[17, "kernel_tuner.strategies.diff_evo.tune"]], "tune() (in module kernel_tuner.strategies.dual_annealing)": [[17, "kernel_tuner.strategies.dual_annealing.tune"]], "tune() (in module kernel_tuner.strategies.firefly_algorithm)": [[17, "kernel_tuner.strategies.firefly_algorithm.tune"]], "tune() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.tune"]], "tune() (in module kernel_tuner.strategies.greedy_ils)": [[17, "kernel_tuner.strategies.greedy_ils.tune"]], "tune() (in module kernel_tuner.strategies.greedy_mls)": [[17, "kernel_tuner.strategies.greedy_mls.tune"]], "tune() (in module kernel_tuner.strategies.minimize)": [[17, "kernel_tuner.strategies.minimize.tune"]], "tune() (in module 
kernel_tuner.strategies.mls)": [[17, "kernel_tuner.strategies.mls.tune"]], "tune() (in module kernel_tuner.strategies.ordered_greedy_mls)": [[17, "kernel_tuner.strategies.ordered_greedy_mls.tune"]], "tune() (in module kernel_tuner.strategies.pso)": [[17, "kernel_tuner.strategies.pso.tune"]], "tune() (in module kernel_tuner.strategies.random_sample)": [[17, "kernel_tuner.strategies.random_sample.tune"]], "tune() (in module kernel_tuner.strategies.simulated_annealing)": [[17, "kernel_tuner.strategies.simulated_annealing.tune"]], "two_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.two_point_crossover"]], "uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.uniform_crossover"]], "weighted_choice() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.weighted_choice"]]}}) \ No newline at end of file