From 821dcf62a5e4988fe263db065614663510daa3c7 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Tue, 3 Oct 2023 09:13:02 +0000 Subject: [PATCH] Add changes for 2d5bff28fc57f154c3ba147f2068df9b19c41785 --- latest/design.html | 10 ++-------- latest/genindex.html | 12 ++++++------ latest/objects.inv | Bin 7526 -> 7509 bytes latest/observers.html | 8 ++++++++ latest/searchindex.js | 2 +- 5 files changed, 17 insertions(+), 15 deletions(-) diff --git a/latest/design.html b/latest/design.html index a88c2aaa7..3b30c86e8 100644 --- a/latest/design.html +++ b/latest/design.html @@ -469,12 +469,6 @@

kernel_tuner.core.DeviceInterface -
-compile_and_benchmark(kernel_source, gpu_args, params, kernel_options, to)
-

Compile and benchmark a kernel instance based on kernel strings and parameters

-
-
compile_kernel(instance, verbose)
@@ -1269,11 +1263,11 @@

kernel_tuner.backends.opencl.OpenCLFunctions

-class kernel_tuner.backends.c.CFunctions(iterations=7, compiler_options=None, compiler=None)
+class kernel_tuner.backends.c.CFunctions(iterations=7, compiler_options=None, compiler=None, observers=None)

Class that groups the code for running and compiling C functions

-__init__(iterations=7, compiler_options=None, compiler=None)
+__init__(iterations=7, compiler_options=None, compiler=None, observers=None)

instantiate CFunctions object used for interacting with C code

Parameters:
diff --git a/latest/genindex.html b/latest/genindex.html index c066ae84d..d374515d7 100644 --- a/latest/genindex.html +++ b/latest/genindex.html @@ -219,8 +219,6 @@

C

  • (kernel_tuner.backends.pycuda.PyCudaFunctions method)
  • -
  • compile_and_benchmark() (kernel_tuner.core.DeviceInterface method) -
  • compile_kernel() (kernel_tuner.core.DeviceInterface method)
  • compile_restrictions() (in module kernel_tuner.util) @@ -231,8 +229,6 @@

    C

  • convert_constraint_restriction() (in module kernel_tuner.util)
  • - - +
      +
    • replace_param_occurrences() (in module kernel_tuner.util) +
    • run() (kernel_tuner.runners.sequential.SequentialRunner method)
        diff --git a/latest/objects.inv b/latest/objects.inv index eb2c5550197411151d496ec5cb3943374f5e9d13..fc6e245cfe87a3b73bcb4acf5336056cef75600f 100644 GIT binary patch delta 7313 zcmV;C9B$+0I@LOmp?|uI+r`K5(NfQ^AG}BcB%YbjLFy_X^TU@Qm`tz$n_OIt^>Mn_(E>o9+ z)kr2R%0Pj=B!`#IO4Ai}RL_^$$CNMhSOZsUl=m4Xmwj20>8xz8hmtr8*cLJZtw`m?odh`Tt z>IjN-R#rf$%YQNg0v*>8E0Oy$0Hd7P5fACgOn?}NX23(awM*Tf{!#Y`r?yAHxU>Tn z+@Tqu(z!D;coJu3hEC(k4%m1{c7VjWF&iAviJ9PFF6@Am#(^DSG4AVthI3v9IE3pu zqM;mD`%=zTL49FWAe@&81>?YscyK3XflKGe>XK?0qd z4HD+yjA&_`oEaM9=!}RsXJ>&#I6N~V%IS4IY0K;BPx|NfcZAzB<6s=01rhH0tbpm9 zpR0)^?$6ag8V6`;0q+7Wz&I!91_io7J1EQ%TJX}iLK`^78Co!L?$8Q~aELZcluPVi zxtG0rMSn(uW*m%5v>?J=q7^WmOLR4n#3i~KNaGSME#O_E1sLZN-Jn31Xa|M4LUQqtF&Sh<9*g-pA@Rx!DWB!IJ$Lie&wpq{@wkY-7c5}tM}-|1^L((Qq8QcvKm;-sDxS>>?DU5;}nS7zBJd*;>|_=#5r2S))|fLJL6mC67B>oaD)@>}C{zkY<8Y{$Zw>_Ck&nFim~5h{Kr ze1MD-bszMOCnF8V$GW|AeZM%suA1o#{gX-rA%C4Stzu8)wi}w_N~amsxi5yIIahUg z=tv;p(+V!I34ajwa#oT6zo$dxdtu|R+c4AqrO#>kAi-U^4GhnW&L|rB)LclZ_gDZO zfxsW}tZDhaO(EpDXH7t|lm@+BNk7Xw5suju&twCYG#NM!b}S1z8G9VMMhrZp48L%gb1S;CXsh!M9|&C4kB?{a(&2Bn1qpwz zRdaErCQ`$$NuDh_9-iN0J&?yMogiMHx}lyf4W8lw^7LMmXi*=IM43I!3<>e$%}$?61X8jekiSK#va=C`@&ka(o;hS@#Yzg`%A>w>5pr zHRp2Z#<7!Ag>>vs%{Zi*@gV*xX z)yUZIX&|I}J=fPU!5Ck$5?nWd)g3AYP&%cqRjl1Zh+~7EOOmFsL**JUjWug7$vGXwCq7D$0`dBHHMO?jbel zQ_EcDLpLM`hIKLfAM}22p6YA#9F{4aZuu{2oYecGzLM&8oz4Wk8si|p4nsFU6hE{t zw7)=Yek{J-4INWum!cNGGKH)E(SQHimR%L`5>Qt=MgK&~^*_xm9R&p5)%VYJ-xm*8 z4lxgWkdbffE(w?)3WTZaZ)wCp2KJhf6s24M5{Dvw+dcf07CKWkVN9<*0b;O;@=%x2 zvwH=CuUsgXegT3^7xok5`Qh5pFBQGebQ%VyG_7y+MqmfapUZLG!$;|v-+!nOWg6&C z;%9b#o2q2u;pVD16XtEsyo;g%Dsm-shCGp`KR?QaXORFCDd~7G+NZgA0_e}Q<&xcW z(W>ml-~;T9)2c5oX8b$F72#>9E6mVl7|SvP3=dg~KwuHYmC&e2#_iDnV?(ulkc{2G)KmZ^N;=si1{4LVsxi_FTK}nfa(^axUODZ9kDvpHV-)o>%QyB~LqX4^MRb^%UR3t`Ml5qsOoJE^PIJ6_Zxc^n{CFVlFZODl`M!7%{;Mpk6TY%|B%~L9;P_mU&h}`iFn>$_2ok-+A zW@;r<$!<9tUT{#o;oo!q-3_Bj>X03K1wEOlEH#|RXR7i~G2i*jtj?;g(4*MuF-_%uUU4D|!OB6MbFv}mmry}OoB@Ta-1$Pwe_VXm(g{o=DigSKh? zT^9S8->3ySo5=VYJbxe1csr{`P&ZcOg4A`Z>pnUUFl`Dlg`6k{lk`biPAv@yOnz{yl%r}b$o^shU0oJNoEznP{-Q{_8?dX{Tb9qw z19|0nO8o)*ynlD4o92!9=yKfZxMCFTLHTM!>2;n%>66>56PA_AC+jn+B)56sSp z5x%cGfTh^|DF1+YT2Nv(Tn2Cz+mhrTFjoso%uXmfSAVuQAs2pNbpwF}jjG;_4|XbY zRcpR2{(<9M&DanZc@NM7E%g}?^hM-LnBz@6uDf*iPu&9*igL2Cnv zkc}m(9s-Kts^Y|nM>Ccis8MVsM!0A!%l&?14BI_cyh<#4bpslPxr!4f4rI)COebK> zapJ@QjDOj7Y$RC*5iA)?(X@LI!`FldQ;W7_+eU;kV1tR0j-{*aRLAgDapJ_|8L!*m zafDZxDCuar$_{%JUj-*lJcKcur$tQ(15PbQObPw`;Bk-RXyj-O7 zC>ucGwEHM>h!WTaNPulU_&ib`Mmj~v9d_J0Mj2=uJP3Cl^bXL08Gs4P-ABtql%O_% z19j(N_5dBA0f^AueSAJdDZmEylSa%hC*Cvq!M~jNn1RE0dlG<1c?fA>;BHMe5GM{} zoPQ`}kH~U>1pwQL57vXzE#TC_7A6SO!_z(>^&v(g19WfN378th5=3b3%ybd2f?y8` z%AJWd;#Gj`0pYkUg-4V$gl>XBJSb@hToq&$I0Sd4Dv8#BP)7vc&crD3DlqoYu-lb} zC0YYS9UgEylfJ~O1hAK@bD|cm1m%`NEq?|IVF?bt-AQZ!I#2_Efw(hiPP_`7eWIW| zGHXu?5R}dM06i*=5FP?$ZGvDuF0D}l45-1#z&$9v5?mE(6(WfDW_E(9AuWLf^xlM0 zFg1iFfZ*Ph!YWz=N|Kj4ODe;7j`k(t?`-oGy4$%|VnfHYA!fMCZFp z1B0|2KXr&q!j}RBX>)=JL~LTfBq(sRk<*C4B!w7dxNV5ZL_m&2s3O`P$dn==JtI^R zWe;Ft;Z6z((19C}V0saek`fDmuzwRg$?zw|1nR+!K}KHJZM9)s!BY=kdR34X+zjBJgCk8mfq~2RileC)@_=A&h}f zJpxjFLly1z@Plk3Pswqm6Nj9VGWT8aW4CyI2t9URq zuq8{@fh1-uU;v#+15BCV5q~C_;6PfjX6WhXF&5}Zu?o?=ag$?vLVEv5a!?_4cZE7pwIS(%xKWh&+owsSQcJ%1Q`T4O|t`2?pm zhVzaMr=#-PgEes`v2^h1tYK&nr2reYl=iWQXxfqXXJTft3Mk&-;biCI8%_b5IFkUw z$-&7!oZL!w8{Ape!c{3bH3x8||_9ixh0d%4aXp8VH8}0krg#wUMNBR6JRa)6A7P~GzIDc*U15Q^3(h*s|*Va~xFLBJw>zqQyD10AcnH~ZjYkl(gG9*sh<~!ZHeD6`!wFaVqyJSZIqE_i zKE*9M(i=G$7a5slJcJBM-6I$ov_ph^oQzrGV`v!x0_D8ql;rbKoD@BGbp(UBbzfHX zASo{`k=8zXvY{(_>H|qMgblV4LSs?&N_eR7hj4UfC|a({khWvz!JCq!&f9Q*ElTfDv(Jbw(-6rhz%q{)<<-1X&6GFZ_GXkb+UM3Z&ik+Rbjhd7 z(yJnSuM7C*7pDDrr~OZ%{h4bk5*d@os6-|tGA?nX#C{5(##6|y<$zE9>|}AfSNQGj zOSTqfbblzaMrq-(t@XRJfwli!Jy};jIzd>yL%n~UE|h-V^jE@u!7L^pt2?8HD9!ZJ zV{^MXX0{J z&PO~sms{yRB>a({WaQ}0pFA=k;M0=PB7vY)i^=!RJ~>24XZWhzHfElnEOv_ z1a=$hjfdZjTGR12rQU4Vy{R=Bev9s!T5gM+t@+C3ZIO#{PH#rWLEaWYG*-CxMmWZA zfqzSJM{17dziI4UT>r=Ia5TD_-&xd6{zujs{LnlSwnU51mE7J)lf?L>W0P>tbRd=5 zKN-lD`Efn3gU~G(zs%H zd@irsyvW%vFEgJ=#f_&gQLutGyO)`Cwn=Aon-$q6Ek-5zRR&;bXDxLU~q8TI56%Ew}S$u4beCw;)IW|_2U%&P5`07IqKz&Z< zON|T8tPnB)n_?R3Z+Sz*&$_?XW#Jn%W`&QuH2bKP?wabmx}De!*mu|0CwD6c*}J9S zP8960sl%B|AN5V>)ADHtGm@TNVSmrqP^HAj;}{tE4|2Ketrt4hyCI^NL)7oN%+mFL zaT)g&7$+L(?qAmJ8kQpW;_nVX5-F<6g`r4ER=-r$n;+Qw@1HAENGxl(*Zlci_8WVj zP7P#FG9FIqp`?m)^D}+o((f!nV^3ZlpW4G*rJyyZ;w6;5J+#I8P?*7F5Nh>=jidtz1GNS z7@x}ggBHq=P}RJV(<)YG2!A~_sH1EcHZ!rEn+NL7yT|1L6wr=YPqxWAMpS0qVuAG(mz>08F)~1Qa2DJ&?)<|b+TKIeQ5;`M3;)2058eaqxG6>L9y&b1z}qI=Z*{mO}$u8pYRSAXY&_;S>_Ve7l| zqNVo;KYPlKage{E>Qk?(F8e+6O6G6E3ZHN?k_l z_)ryfdefJ>mo(`oM`OC&KR(<^W9TOxuvR+Hfv1VtJ6wKdccPpI_eJ?k2j=L ze;id2c~ciud53%h#RK zvD~}UCh9CpYRZ9I`k-tl3T}#P{HH411JF!|%lMz6{l2LSD$;f`+J;`4`$$8z8R>>_1Rb3vQ zrV(a@Hh@L|nZ515TSw4&><=fvLkzY+89= zK6L|qEHz6Bl=>0yy#$QbNMXiFtN|ovcA{}R)>C@J3{51!NMbx&WWtZ&n7Z_4tKcZ@ zB&M<-%_-K`VyMRSwrc1&b(FOjr(!{i6OUlef}yD{6##n8hq-63UC%T+rq6-qTn^nh zc7MlMKiI0sR?@(rdZC#oTA9vXi5IJYvVr{yaitMb$9J%&zlIikEC9-8%)MtpixbDO z|Jr@O%Vh#bX`@X2j=lxsP||vlucUsuOmu!(Ub?zWI579XV9glw212UWbA26gci9dm zP%fTmlZI#$NVJJ3+SDyy0f(Zgg9J&%P-8@PqPO<|-PR2iVQG*OU1u9yq%DZDL)NCF#YKg`L|6H6a8Vp^G%KWiI@o z8&X=gVnURV_Fx8^6)#r7V1wsm4wFS1L4UzElKc@bSQ0>f4Y#qZHz2zv9Ch)Ov=AFfPR9z8iYIIc z(2%rEh!L^|6V!s-c9PkVr%lbeUWD&_;;S%^%TAO!zUbsPGVvoX?pYB#oljT3d4Cf1 zVw2(ikiZ?-LHGbpSq z+Rk0jl$RZAE|675?q}OWjO5DJtMe@h4@LP*+R6!MlD}WdvLcgZ`(~DJmH7;dT_ZQ6tM^hxrs6QU(MRiBg9Fmfg<2Z3 zod~)=u*|3gKH<=OnvS^*+<&=c+Ic1tJLKS;kaJMpT6F0gmhZ~moh%?rmu=Ma;~R0> zfXXfEaEfJ9z6#(uTVNw5<-<4LtDD3YZq#pNvOTyo2-na7|PvFL{TQ> r??wF`J3m65Ge~&Hq+<<-ReA5^V0kd8O1@Bd*X>d(C}#N&h4L9dnp!Iq delta 7317 zcmV;G9BSj$I_5f%p?|8I+r`K5(NfQ^AG}BcB%YbjLFy_X^UE(mFqvTC@l1x6T+h?o zlHv4Fl+UEC##4E^eLuJ5RCn!|^{;;`uf;fKKV}sf>&N!=G&R?ow^{z?*{gjzpX;`s z&SwVGkm;$bU^wNK6z$xfuk`~Os%!fCO0Y0en_OIt^>YfroJxyH- zRwJ3PC<7^~*K;vE&JAfNdJRWB zrjDShB zf(JAMRJw3x22T>q%+P5<*#R3L$PSRWFlK`T1~C&HEQB49(gd(0EGB#%&~U-a0EY-& zM>JI6YG2B^DrhXM3PkWSp z__<6Z34bmFX#$|d0zL#*;>|_=#5rXG8&5fCMQ7mC67B>t?e!aK+~ z4fLV$Sd)?F;A7oBb$!1?sIHpnu>6xv1R;N;C#_QeZ0KcbQ<$Gb{uiG%w_N5O}`5?hvxeW}@jHdie*J`BHdn|yCK;X|{ zW?H^vQwVwPSrd>frCDiL($Dfvgkv_vGuc2TO$LsG9m|4_mM%lrh>nN!d?eG{pGAMH zThr*2m`SeB*cSbH>dq~pL$GmTL%d6kygqX)nvxA>clbcys=i##y zp0p>jQq^J*Py6V1!Ed@3hW)h|sxg150d)UhiNaKuDf`C(k~QxzQz+U9b6eA=Tyrjm zZX7!~R7l7E)XWpDObklXy+<0!`i}O(@5Vg+r94evyYEc;dtc74q<(ywX!2S{8U?XP|KPKl*=P+p?=7UIOZBr|92Bx&EiQrM-Z_oBIB_?)&25$|2@~ zcQW#g-6a9zLxC`L{VmNH$iQARlA@HeKH^Zs54wlnx52UYcz(Ec^h-rAG#!S)DNXAey%E^J^7m|9^YC7J<~M&TM42YKllUE+ z-=->=c(}PL&V)sqGjF13fQnoR9U)Jo>CYEA+bj}bA|>tbMf*4xj{yCdS}xg57p=-( z3_if#IIa5fV#dEyToE3Jy25mAhOx{8!0?cz2n3ccP&c!$5I*`bJ%+yMz zlHGDPyx^dE!@uYJyBkK6)FC_e3VJe8S!y_s&s61~V!ZR2d7V{V$&*1oAklGdCTdGG zQO=tv*MuEre3~IFhWdeC5jwIndbHMx)?G^|_~Tqx}qz)^0|2+uRKp_JYb*qf1z~Kyb&K=fm?k%8>ItmuvwRR zy-WojT?Y1|Vj6L_&LbW*XzfY3thmU_yu7Td#H_Xgbp1O{g|M0XSNzDw3 zPj%1!f!Q7O!}f*QQ_ZoH+3i#%!DxH6#q!su(dO z^z(z~J&yg5qcvdEp?QUU@8wtpC~w=U`xKtou<;NGZw-3^B=XgOj0HgWY1*A2k)tMP ze>4KjRcxy4-<2Dx2=fP%Ji9johbW_g}wsp7jNO>6P6e)Mu9_tuo zpl|Rb+_}3uKnLakCM|dGCl67A`T$PUojceAbbt;ZQg`>B`4FW5AJ|VCF~6L6H|7Wb za^hnKPT%c2|3}J0NCOjh>j?vK;xNXEe^T~{qYbbCU_bH6dhi(wICZdx3DWfNvmTK8 z5GRody7#OJm>R?rL~8Cl-XdNF!5)&7J5SSyR{^pIq~o@8JEEi^bQ2`vK_`X4RY6{X zQ*hV$B+(iW>WJjqc^XQ*3XDB8?RK5L60HHE4o|q9C%we01hAL0bD}1$1RW@Yf0_&w z!V;W(yHBnG=s+C+CgRSMa^h9s>=PyBkw^8U073bTPtc>z4#GpAyiJg-$DL)A00ZhU zGI0+&QwgpL^$L;1dyjL1sUbaqB=p`>q+n_YPXNii>)fhn4Jh>liGJ{jSvYmDhv1~& zcU~7r3+e`90N_0(45kM61UeY-f1OPR(gM2y9~gK~M1!dXcp@joN6y0!Jbn#14?m16 zc%a}r>kXs@cLO+F@SHCPQNs9;XwVRS$Xyy3r04jdL*&VODL{}uCm2A)o(7l%1@1O- z7!i0fAx0VQ8)7gKa4;fN5$z9TND*)*BUBOP4`5*7J_!<_19u?7@FL!cXuZ z!+#PcP!H}5VyF>t1}9Vz;}3Me;XnNosE2R{KI{lMcND4^;E!D68o8`C@R(ENvf41N z;Gu`_OjVE;+zsH+!*h-+Lhx^23fDYV&1VfLA^PaH)2tUC? z5C4hNKs~rKh@nToS=UfSj6cw!hyRpqpdP{*_|PNZd~c|t-5=pFdhGnaKQuz z(u#G%c;-CD5hEZ_59mx(uHa;2N)iGTqIu&+`W8k(!}~a0h-2V^yq%hF^d6%LaS1+7 zw=?q&&ZAUgy`uU0wC)VQ9A<#>$&3h9jP!@?|4-VNn&wYUcxGzkvxkb-h@>9>*1$-@ z`l`Su9Z45H1~n4#e-1oDwPM|f9b%e!SjLqeVLMmDK8mqhPloR#nPTh7aNeW-)Gua?yJ1{7)ehkZ9tLoVWjtGpvJS^BZLb2_-!XAU-)spQ4!R6_;Rph5JdvN zPfYq@VqRv-p|Vko6g_vfpMmRS+Bi-wq*w(X@BIeL zg!w@6e+V*WfIvA9xu*C=@6u{Z-(rY;EWX`Q<=ag;K*mGJesFpOF*`_vtdA&LIMh|a zKb&x-Kl)##lA|uv@F{Kvm2Tu@oH1pV@enecP#?j_pdBLQ<77PYK8BVNAW+Urj*XMz zqmriQu8v?3H;K)v9wgmF|<*E#^ z9Xk%*lpJ+l!~L}=OB-A}`%M4N;TH@Cf zk-D*W_iXqS!^@~2Cgx-IT=zXY{F^n|JVf=Z`$hM?)b)$_*i2o&zICRZvkYH`Cd^U& zf7?a>lhh9r^E0DQl<50jRdd#Q%ztHUqi=mh%T=k_Zn|o|lHC16l$}JFy1UxThw+c2 zz%>pfnjtv@!;ZTMh}jr=B-(ti>nk$%>qiZRamoYMs_*2!Fz3|u=Jn+=v%%r1?qA#e zQL}5MIE{Y!dQiJ5=BZ<64%6f*|5{H^fBE#Zw5m9Y*adv^3)B9*)BdN>{>-%%iHu2P zR3Z}+8J9RxVm}2?=P6{@a=^!acCxtLEBto%C0h$K+7+2mT6nazes>yJ+t1aKb@ihY zgw;FL``77-(yyETO4u)$#pGjkXVeg-nO=G{$D$HzG1go(y{ChX(P`}7ATP^!f9uM! zqaoUDnI64{IUCzJjeHie^h)KoOjdl)#O18W5uARe@?wg4E+%6;4SeD9_d@=okmqHY zD9c2dV(2XQsc_w7iwwWlKF_RGitFF9_e0lY{nP8nGSaEb*zHttXT<$A+RT7&N|i|B zds8PEaf@o4Aa|;6*>}Tg=G^zKe-0dT|7k{Gx1nx4{BG1t$KRB?*|2+4GZ}u1?o2JW zMb74YmYQ?#cwk)1J?;0e-Zu+b|68} zz``2}kET5V(O@JhEFKL4gQJ~D)D^wkbfiiDwGE7kM`IC@(PS1M9}Segf0Q)15}uS9 zD1R|1iwG&ovL+xhN($`WV!!j}yoyY8JeSvPUgYeTmzfWw;`-BuE7<)ovO%Iwe`&4Z!CVc1*{`xnSHEFBH-98lqM%pRfwq_AJ@>ce>jAZW z>8|rJ0_-6-vD@vl9gN;+!$3;fps#_rs)ka0mc^(3)nP5%ijM9^r2~IbG-Cw1qM`aA zi;vdGx4x>KePb2*^;`dr&py-u)aP`*)VScx3Lzb^DW;+RmNzv0f2{j!T^7DhV^;Xc zOS6w!>CRN&)$PP?z`nb_KDw^Js4ojS;#WIUbJQ%M!)=4blErQcbG#-6-9Keea1 zilLjI-{rqaP4;-@np##H$>d6={~GhAdm;R}ZAvy(^S>X&nGs%^u~Pm(U(!p1A-+S= zl9-l$OPc)cYKj*xsqF2+7HdOg=wLVUc_+xebIfHOuLiq4fA-m8mZttU%^{S>uGPvH zx8C+3U#pd;`I{e}x^ZI9(c_bQt&zhpK9u zf!q41EyGAYe~*U=y$XabC7){<|E7ZdYp%=ZoX(>nf#`L?MKhK)&f>^ zkD9+-IT6#f5f%LEd=OubIyY>7cV4vg9^o5l*)a~1w_o12m-0a3`IipLD=lX*>s> zCT4GN`I+5`av0nfIEC%4(OlYCX}MVHksaGASu``IV@rOjhZ3 z#j7*f(&CH5ph;S6D=n{%_@bK-P2xb60-}B1{2)@Mvz{xNk`A*VzoTSI)$DR92Q+O= zwmS<=f7!lv!*kvf*1g$$j!>I7A8Y_iMaC*JT9JtgH=lsK-a+Q0E6I;(mbR#Ag2SVW zDT8oO_=9V*;InA*5)qk}q{3xdJ`sytaELVj*{Cfaq(#O{gJ^hPvN%WtzhjY;rH&7Z zr9yOTIq8bsY=1udKTXZ46Qibm{Bcx8;I$u@e~DSDn*7Y~?QVs3AXyYbD9Q^PWacO< zFQE$FJ&a^v@$J}tTNQ7?KbNmNrDL<{ioD*ctj&bjs@A>UmLjJG3&`IWki~iyDhOqH zDlRnJmYYN;p{yfW0LMlm}C1 ze@irsKOIB)tAG$CuqOj~tVh-{#g<`dyxD_76$8C$Uz#ib4-;fL$RB@w{M{F14}yU# z`(|SQFdP}g-N(;9hC8uPe%)94hqPlisGm`M705gG`JR9F@^`E4{RzJEnxN|nhF&IB z&e7npt!a`EXa|`mk#D4wzo=|KnejKHf6RZU;V=Z5eXUaJk8S(~D{ERxPQ1Posqir= zQ)@+Drj%r(GM#e|WZezsp`dy7Q`h&D-ig(EOhy0&OGeRsB_sRM^CW)~BnXis1``P|k~q(1UhsV>rY^nP zDmY3ziL30FWQz5*7^*S7uNpc|e;ws5Ca74@;>07^vt($hO9g;l^I`7UyT~)mj_Grt zIhR8>j@>cW5B4h3N*WkcD>U;&E7SQa@nRKFKCoBwSDGPpd|7w;YiPm80-$`x+O6td_iH-s&cFa96STn}Ge}Rzd^;};^ z++DVV36zT`+N2@c1QKoHi8gi1r-h+t>L5W7aBbs+>Ap^J2~ zWg+~b8&Z0=VnURV{$M(r2^B9^!Qg{uy9$#t8bN=_Hj?}iFIW;leoeQrtT!OP2$*M+ zif7~J*+6(M2{)4E3gZ(1Fhn<@tL;FIc1*323=M$cqVfD`-i;*85FB;!l(ZBZNlwQK zl8Ps6NYIe9O^6Y)1{c(V+_t^fk*7_~I$wk@2jWu`kJC<+IzH*-H!|^kAnsWa+pDTrQARMlSN&LyY9gCUNt5`VU3ti#nHM|vnx%-g4(-(h*sj4^huWKoq24)cU!Rtr0isOgH^vW~s zm$c80E_(smei0WV+tIWR8n$z2)gx!^2E`VU#@V|Lo4S_0%4M0KR}|^1s$;8o^TSJ5 z77sK853k40sMPW9c(=^`$^Kev_6Wzd5IYnuU_>%(=*f%tzL@g%dYj#HDPHtOspfxr z>_A3?mafyW*#wiWAj^W*yE5USeh|9=?Ld}tX?884HydBicvRk8-K#TSjR4x_9-HxX z|BJ=;x}%5OZlfjR&g->!(b?qwVw=4$i@d~sm#VI^17`M4)atENk*PQgZuH*z))0X7 z&5{;lwrw)^yLcIuz}x2_$s%k9UGRF>MDR|!Tm-|%7?E;S66(m%C_awmgV8f^3u1uf^oH;YAbqKsc{!| zZ4uksbN8^PuH*Zw<@o)_LJ7OY1s1UD{Ilne? diff --git a/latest/observers.html b/latest/observers.html index e459883f4..dd0b50671 100644 --- a/latest/observers.html +++ b/latest/observers.html @@ -73,6 +73,7 @@
      • BenchmarkObserver.before_start()
      • BenchmarkObserver.during()
      • BenchmarkObserver.get_results()
      • +
      • BenchmarkObserver.register_configuration()
      • BenchmarkObserver.register_device()
    • @@ -175,6 +176,13 @@ generally returns averaged values over multiple iterations.

    +
    +
    +register_configuration(params)
    +

    Called once before benchmarking of a single kernel configuration. The params argument is a dict +that stores the configuration parameters.

    +
    +
    register_device(dev)
    diff --git a/latest/searchindex.js b/latest/searchindex.js index b990bce60..12d5c2924 100644 --- a/latest/searchindex.js +++ b/latest/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["cache_files", "contents", "contributing", "convolution", "correctness", "design", "diffusion", "diffusion_opencl", "diffusion_use_optparam", "examples", "grid3d", "hostcode", "index", "install", "matrix_multiplication", "metrics", "observers", "optimization", "quickstart", "structs", "templates", "user-api", "vocabulary"], "filenames": ["cache_files.rst", "contents.rst", "contributing.rst", "convolution.ipynb", "correctness.rst", "design.rst", "diffusion.ipynb", "diffusion_opencl.ipynb", "diffusion_use_optparam.ipynb", "examples.rst", "grid3d.ipynb", "hostcode.rst", "index.rst", "install.rst", "matrix_multiplication.ipynb", "metrics.rst", "observers.rst", "optimization.rst", "quickstart.rst", "structs.rst", "templates.rst", "user-api.rst", "vocabulary.rst"], "titles": ["Cache files", "The Kernel Tuner documentation", "Contribution guide", "Convolution", "Correctness Verification", "Design documentation", "Diffusion", "Tutorial: From physics to tuned GPU kernels", "Tutorial: From physics to tuned GPU kernels", "Kernel Tuner Examples", "3D Grid on GPU with Kernel Tuner", "Tuning Host Code", "The Kernel Tuner documentation", "Installation", "Matrix multiplication", "Metrics and Objectives", "Observers", "Optimization strategies", "Getting Started", "Using structs", "Templated kernels", "API Documentation", "Parameter Vocabulary"], "terms": {"A": [0, 3, 5, 12, 13, 14, 16, 17, 21], "veri": [0, 4, 6, 7, 8, 11, 13, 14, 16, 19, 20], "us": [0, 1, 2, 3, 4, 5, 9, 11, 12, 13, 15, 16, 17, 18, 20, 21, 22], "featur": [0, 3, 4, 9, 13, 15, 16, 18, 20, 21], "kernel": [0, 2, 3, 4, 5, 11, 13, 15, 16, 17, 18, 19, 21, 22], "tuner": [0, 2, 3, 4, 5, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "i": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "abil": 0, "store": [0, 2, 3, 5, 8, 14, 18, 21], "benchmark": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 17, 18, 21, 22], "result": [0, 2, 3, 4, 5, 8, 10, 14, 15, 16, 17, 18, 21, 22], "dure": [0, 5, 6, 7, 8, 10, 16, 21], "tune": [0, 1, 4, 5, 9, 12, 13, 17, 18, 20, 21, 22], "you": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22], "can": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "enabl": [0, 16, 17, 19, 20], "pass": [0, 2, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18, 20, 21], "ani": [0, 2, 3, 5, 6, 7, 8, 11, 14, 15, 16, 17, 19, 20, 21, 22], "filenam": [0, 3, 5, 9, 14, 18, 21], "option": [0, 3, 4, 5, 6, 7, 8, 9, 11, 13, 14, 15, 16, 17, 20, 21, 22], "argument": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 21], "tune_kernel": [0, 3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 15, 17, 18, 19, 20, 21], "The": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21], "individu": [0, 16, 17], "configur": [0, 3, 5, 6, 7, 8, 9, 10, 14, 15, 16, 17, 21], "ar": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "append": [0, 5, 21], "run": [0, 3, 4, 5, 6, 7, 10, 11, 13, 14, 16, 17, 21], "thi": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "also": [0, 2, 3, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "allow": [0, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 20, 21], "restart": [0, 6, 7, 8, 17], "session": [0, 5, 17], "from": [0, 3, 4, 5, 6, 9, 10, 11, 13, 14, 16, 17, 19, 20, 21], "an": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], "exist": [0, 5, 21], "should": [0, 2, 3, 4, 5, 6, 7, 8, 11, 14, 15, 16, 18, 21], "someth": [0, 3, 6, 7, 8, 14], "have": [0, 2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21, 22], "termin": [0, 13], "previou": [0, 6, 7, 8, 17, 21], "befor": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 16, 17, 21], "had": [0, 3], "complet": [0, 3], "happen": [0, 2, 3, 14, 18], "quit": [0, 6, 7, 8, 10, 14, 20], "often": [0, 6, 7, 8, 16], "hpc": 0, "environ": [0, 3, 5, 13, 17, 21], "when": [0, 2, 3, 5, 6, 7, 8, 11, 13, 14, 15, 16, 17, 19, 20, 21, 22], "job": 0, "reserv": [0, 7, 22], "out": [0, 3, 4, 10, 14], "number": [0, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 15, 16, 17, 18, 19, 21, 22], "other": [0, 3, 5, 6, 7, 8, 11, 14, 15, 16, 17, 21, 22], "simul": [0, 5, 8, 12, 17, 19, 21], "visual": [0, 14], "optim": [0, 1, 3, 4, 5, 6, 7, 8, 11, 12, 14, 15, 16, 21], "strategi": [0, 1, 3, 15, 21], "start": [0, 1, 3, 4, 5, 6, 7, 8, 11, 13, 14, 16, 17, 21], "call": [0, 3, 4, 5, 6, 7, 8, 10, 11, 14, 16, 17, 18, 19, 20, 21], "contain": [0, 3, 5, 6, 7, 8, 10, 11, 14, 16, 17, 20, 21], "full": [0, 5, 13, 16, 18], "search": [0, 3, 5, 9, 12, 14, 15, 17, 21], "space": [0, 3, 4, 5, 10, 11, 14, 15, 17, 21], "true": [0, 3, 4, 5, 6, 7, 8, 11, 14, 16, 17, 21], "creat": [0, 2, 3, 5, 6, 7, 8, 10, 14, 16, 18, 19, 21], "even": [0, 2, 6, 7, 8, 11, 14, 17], "work": [0, 2, 3, 5, 6, 7, 8, 13, 15, 17, 20, 21], "while": [0, 3, 5, 6, 7, 8, 9, 14, 16, 17], "still": [0, 2, 4, 14], "As": [0, 3, 6, 7, 8, 10, 13, 14, 16], "new": [0, 2, 5, 6, 7, 8, 17, 21], "come": [0, 5, 6, 7, 8, 14, 16, 20], "thei": [0, 5, 6, 7, 8, 9, 14, 15], "stream": [0, 5, 6, 7, 8], "pleas": [0, 2, 3, 12, 13, 16, 18, 19, 21], "see": [0, 2, 3, 5, 6, 7, 8, 10, 11, 13, 14, 16, 18, 20, 21], "dashboard": [0, 12], "introduct": 1, "instal": [1, 2, 3, 6, 7, 8, 10, 11, 14, 16, 18], "get": [1, 3, 5, 6, 7, 8, 10, 13, 14], "convolut": [1, 4, 11, 14], "diffus": 1, "matrix": 1, "multipl": [1, 5, 11, 16, 20, 21], "exampl": [1, 2, 4, 5, 6, 7, 8, 11, 13, 14, 15, 16, 17, 18, 19, 21], "cach": [1, 5, 6, 7, 8, 13, 14, 17, 21], "file": [1, 2, 3, 5, 6, 7, 9, 11, 14, 17, 18, 20, 21], "correct": [1, 11, 19, 21], "verif": [1, 9, 21], "host": [1, 2, 5, 7, 8, 9, 16, 19, 20, 21], "code": [1, 3, 5, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "struct": 1, "templat": [1, 10], "metric": [1, 3, 5, 9, 14, 21], "object": [1, 3, 4, 5, 6, 7, 8, 17, 21], "observ": [1, 5, 15, 21, 22], "api": [1, 3, 5], "paramet": [1, 4, 5, 6, 7, 9, 11, 14, 15, 17, 18, 19, 20, 21], "vocabulari": [1, 16, 18], "design": [1, 2, 6, 7, 8, 16], "contribut": 1, "thank": 2, "consid": [2, 10, 12, 14, 21], "Not": 2, "all": [2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 21], "help": [2, 20], "u": [2, 3, 6, 7, 8], "improv": [2, 5, 6, 7, 8, 14, 17, 21], "about": [2, 3, 5, 6, 7, 8, 12, 14, 16, 17, 18, 21], "problem": [2, 3, 5, 6, 7, 8, 9, 10, 11, 14, 21], "ensur": [2, 4, 6, 7, 8, 11, 13, 16, 19], "follow": [2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 20, 21], "describ": [2, 3, 5, 11, 16, 19], "what": [2, 3, 4, 5, 6, 7, 8, 11, 14, 16, 18, 19, 20, 21, 22], "expect": [2, 3, 4, 5, 6, 7, 8, 14, 16, 21], "If": [2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 19, 21], "possibl": [2, 3, 4, 6, 7, 8, 10, 11, 14, 16, 17, 18, 19, 21], "includ": [2, 3, 4, 6, 7, 8, 10, 11, 13, 14, 16, 20, 21], "minim": [2, 15, 20, 21], "reproduc": 2, "actual": [2, 3, 4, 5, 6, 7, 8, 10, 14, 20], "output": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 18, 21, 22], "error": [2, 3, 4, 5, 11, 14, 20], "print": [2, 3, 5, 6, 7, 8, 10, 14, 21], "list": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 21], "version": [2, 3, 14, 16, 21], "python": [2, 3, 5, 9, 10, 11, 14, 16, 18, 19, 20, 21], "cuda": [2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 16, 18, 19, 20, 21], "opencl": [2, 3, 6, 7, 8, 9, 11, 12, 14, 21], "c": [2, 3, 9, 11, 12, 13, 14, 18, 20, 21], "compil": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21, 22], "applic": [2, 3, 6, 7, 8, 9, 10, 11, 12, 15, 16, 19, 20, 21], "For": [2, 3, 4, 5, 6, 7, 8, 10, 13, 16, 18, 19, 21], "select": [2, 3, 5, 6, 7, 8, 10, 14, 16, 17, 21], "propos": 2, "chang": [2, 10, 16, 21], "addit": [2, 3, 6, 7, 8, 13, 15, 18], "signific": 2, "requir": [2, 3, 5, 6, 7, 8, 10, 11, 13, 14, 16, 20], "first": [2, 3, 4, 6, 7, 8, 10, 11, 12, 13, 14, 15, 17, 19, 20, 21], "discuss": [2, 5], "Then": [2, 6, 7, 8, 10, 12, 13, 20], "fork": 2, "repositori": [2, 3, 6, 7, 8, 10, 12, 13, 14], "branch": 2, "one": [2, 3, 5, 6, 7, 8, 10, 13, 14, 16, 17, 21], "per": [2, 3, 6, 7, 8, 10, 15, 16, 21], "pull": 2, "request": [2, 16, 21], "googl": 2, "style": 2, "sphinxdoc": 2, "docstr": [2, 5], "modul": [2, 5, 11, 16], "public": [2, 12], "function": [2, 3, 4, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 21], "pylint": 2, "check": [2, 4, 5, 6, 7, 8, 11, 14], "your": [2, 3, 6, 7, 8, 10, 11, 12, 13, 16, 19, 21], "written": [2, 20], "unit": [2, 5], "produc": [2, 4], "same": [2, 3, 4, 5, 6, 7, 8, 10, 11, 16, 18, 21], "better": [2, 6, 7, 8], "compat": [2, 5, 13], "3": [2, 4, 6, 7, 8, 10, 11, 13, 14, 17, 21], "5": [2, 6, 7, 8, 10, 17], "newer": [2, 13, 16], "entri": [2, 5, 6, 7], "changelog": 2, "md": 2, "match": [2, 3, 4, 5], "roadmap": 2, "updat": [2, 5], "remov": [2, 17], "doubt": 2, "where": [2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 19, 20, 21], "put": [2, 5, 6, 7, 8], "look": [2, 3, 5, 6, 7, 8, 10, 13, 14, 20], "regard": [2, 5, 17], "packag": 2, "pip": [2, 3, 6, 7, 12, 13, 14], "e": [2, 13, 15, 16, 17, 21], "dev": [2, 13, 16], "after": [2, 3, 4, 5, 6, 7, 8, 11, 13, 14, 16, 21], "command": [2, 13], "abl": [2, 3, 5, 6, 7, 8], "below": [2, 8, 9, 10, 11, 13, 14, 15, 16, 17, 19], "how": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 18, 19, 20, 21], "do": [2, 3, 5, 6, 7, 8, 10, 11, 14, 21], "flag": 2, "mode": [2, 16], "mean": [2, 3, 11, 14, 15, 17, 19, 20, 22], "copi": [2, 5, 6, 7, 8, 18, 21], "link": 2, "track": [2, 16], "sourc": [2, 3, 5, 6, 7, 8, 10, 11, 13, 14, 16, 20, 21], "To": [2, 4, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21], "pytest": 2, "v": [2, 5, 6, 7, 8, 10], "top": [2, 5, 10, 16, 21], "level": [2, 5, 16], "directori": [2, 3, 6, 7, 8, 10, 13, 14], "note": [2, 3, 5, 6, 7, 8, 10, 13, 14, 16, 19, 21], "pycuda": [2, 6, 8, 10, 11, 16, 20], "capabl": [2, 5, 6, 7, 14, 21], "gpu": [2, 3, 4, 5, 9, 11, 12, 14, 16, 18, 19, 21, 22], "skip": [2, 3, 6, 7, 8, 21], "present": [2, 14], "hold": [2, 6, 7, 14, 18, 19, 21], "pyopencl": [2, 5, 7, 16], "cupi": [2, 16, 20, 21], "nvidia": [2, 5, 13, 14, 16, 20], "make": [2, 3, 6, 7, 8, 10, 12, 13, 14, 16, 19, 20], "break": [2, 20], "cannot": [2, 6, 7, 8, 16], "them": [2, 3, 8, 10, 11, 14], "local": [2, 17, 21], "seen": [2, 3, 5, 14], "integr": [2, 20], "locat": [2, 4, 10, 16], "doc": [2, 3, 6, 7, 8, 10, 13, 14], "type": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 16, 17, 18, 19, 20, 21], "html": 2, "gener": [2, 3, 5, 6, 7, 8, 12, 14, 16, 17, 19, 21, 22], "page": [2, 3, 6, 7, 8, 9, 10, 12, 14, 15], "inspect": [2, 5, 16], "commit": 2, "brows": 2, "through": [2, 5, 6, 7, 8, 10, 12, 15, 16, 17, 21], "sure": [2, 3, 6, 7, 8, 12, 13, 14], "depend": [2, 3, 4, 8, 9, 10, 12, 15, 21], "extra": [2, 20], "pandoc": 2, "ubuntu": 2, "sudo": [2, 13], "apt": 2, "differ": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 21], "": [2, 3, 5, 6, 7, 8, 9, 11, 13, 14, 15, 16, 18, 19, 20, 21], "onlin": 2, "built": [2, 16, 17, 19, 21], "automat": [2, 3, 6, 7, 8, 10, 14, 20, 21], "github": [2, 3, 6, 7, 8, 10, 13, 14], "action": 2, "correspond": [2, 3, 6, 7, 8, 10, 16, 17, 18], "master": 2, "latest": [2, 13], "last": [2, 5, 19], "releas": [2, 5], "stabl": 2, "publish": [2, 12], "point": [2, 3, 5, 6, 7, 8, 10, 11, 14, 15, 16, 18, 21], "process": [2, 3, 5, 6, 7, 8, 14, 15, 16, 17, 20], "again": [2, 3, 6, 7, 8, 10, 14], "fulli": [2, 13], "autom": 2, "guid": [3, 6, 14, 15, 18], "meant": 3, "write": [3, 9, 10, 14, 20, 21], "script": [3, 5, 14, 19, 20], "we": [3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 19, 20], "ll": [3, 6, 7, 8, 13, 14], "simpl": [3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 17, 18, 19], "find": [3, 11, 14, 17, 21], "shortli": 3, "much": [3, 6, 7, 8, 10, 16, 20, 21], "reus": [3, 6, 7, 8, 14], "read": [3, 4, 5, 6, 7, 8, 10, 11, 14, 16, 21], "document": [3, 4, 6, 7, 8, 10, 13, 14, 19, 22], "jupyt": [3, 6, 7, 8, 10, 13, 14], "notebook": [3, 6, 7, 8, 10, 13, 14], "just": [3, 4, 5, 6, 7, 8, 10, 11, 13, 14], "clone": [3, 6, 7, 8, 10, 13, 14], "tutori": [3, 6, 10, 12, 13, 14], "re": [3, 6, 7, 8, 10, 14], "readi": [3, 5, 6, 7, 8, 10, 14], "go": [3, 6, 7, 8, 10, 12, 14, 18], "kernel_tun": [3, 4, 6, 7, 8, 10, 11, 12, 13, 14, 16, 18, 19, 20, 21, 22], "oper": [3, 6, 7, 8, 10, 11, 14, 15], "essenti": 3, "signal": [3, 22], "imag": [3, 6, 7, 8], "main": [3, 5, 10, 16, 18], "neural": 3, "network": 3, "deep": 3, "learn": 3, "comput": [3, 4, 5, 9, 10, 11, 12, 14, 17, 21], "linear": [3, 14, 21], "combin": [3, 5, 6, 7, 8, 9, 10, 14, 16, 17, 18, 21], "weight": [3, 17], "filter": [3, 4, 9, 11], "rang": [3, 4, 6, 7, 8, 10, 11, 20], "pixel": 3, "input": [3, 4, 6, 7, 8, 9, 11, 14, 15, 18, 19, 21], "each": [3, 4, 5, 6, 7, 10, 14, 16, 17, 21], "size": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 17, 18, 20, 21], "w": [3, 6, 7, 15, 17], "time": [3, 5, 6, 7, 8, 10, 11, 14, 15, 16, 17, 20, 21, 22], "h": [3, 10, 21], "f": [3, 4, 10, 11, 19], "f_w": 3, "f_h": 3, "o": [3, 5], "begin": [3, 6, 7, 8, 10], "equat": [3, 6, 7, 8, 10, 17], "nonumb": [3, 10], "x": [3, 4, 5, 6, 7, 8, 10, 12, 14, 18, 20, 21], "y": [3, 5, 6, 7, 8, 10, 11, 14, 21], "sum": [3, 4, 5, 14], "limits_": 3, "j": [3, 6, 7, 8, 12, 14], "0": [3, 4, 5, 6, 7, 8, 10, 11, 14, 16, 17, 19, 21], "end": [3, 5, 6, 7, 8, 10, 14, 16, 17, 19], "naiv": [3, 4, 6, 7, 8], "parallel": [3, 6, 7, 8], "thread": [3, 5, 6, 7, 8, 9, 10, 15, 16, 18, 21, 22], "avoid": [3, 14, 22], "confus": 3, "around": [3, 9], "term": 3, "refer": [3, 4, 5, 6, 7, 8, 9, 11, 16, 21], "shown": [3, 5, 16], "block": [3, 5, 6, 7, 8, 9, 10, 14, 15, 18, 21, 22], "execut": [3, 5, 6, 7, 8, 9, 10, 11, 14, 15, 17, 21], "press": [3, 6, 7, 8, 10, 14], "shift": [3, 6, 7, 8, 10, 14], "enter": [3, 6, 7, 8, 10, 14], "writefil": [3, 14], "convolution_na": [3, 4], "cu": [3, 4, 11, 14, 18, 20], "__global__": [3, 6, 8, 10, 12, 14, 18, 20], "void": [3, 6, 7, 8, 10, 12, 14, 18, 19, 20], "convolution_kernel": [3, 4], "float": [3, 5, 6, 7, 8, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21], "int": [3, 5, 6, 7, 8, 10, 12, 14, 18, 20, 21], "blockidx": [3, 6, 7, 8, 10, 12, 14, 18, 20], "blockdim": [3, 18, 21], "threadidx": [3, 6, 7, 8, 10, 12, 14, 18, 20], "image_height": 3, "image_width": 3, "filter_height": 3, "filter_width": 3, "input_width": 3, "run_kernel": [3, 4, 5, 9, 21], "our": [3, 6, 7, 8, 10, 14, 18, 19], "But": [3, 6, 7, 8, 10, 18], "some": [3, 5, 6, 7, 8, 13, 14, 15, 16, 17, 18, 19, 20, 21], "data": [3, 5, 6, 7, 8, 10, 11, 14, 15, 16, 18, 19, 21], "which": [3, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18, 19, 20, 21, 22], "import": [3, 4, 6, 7, 8, 10, 13, 14, 15, 18, 19, 20], "numpi": [3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 18, 19, 20, 21], "np": [3, 5, 10, 14, 18, 19], "filter_s": 3, "17": [3, 4, 6, 7, 8, 11], "output_s": 3, "4096": [3, 4, 6, 7, 8, 11, 14], "prod": [3, 4, 11], "border_s": 3, "2": [3, 4, 6, 7, 8, 9, 10, 11, 14, 16, 17, 21], "1": [3, 4, 6, 7, 8, 10, 11, 14, 16, 17, 21], "input_s": [3, 4, 11], "output_imag": 3, "zero": [3, 4, 10, 11, 14], "astyp": [3, 4, 6, 7, 8, 10, 11, 12, 14, 18, 20], "float32": [3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 18, 20, 21], "input_imag": 3, "random": [3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 17, 18, 20, 21], "randn": [3, 4, 11, 12, 14, 18, 20], "conv_filt": 3, "now": [3, 5, 6, 7, 8, 10, 11, 14, 18], "structur": [3, 5, 6, 7, 14, 18], "ha": [3, 5, 6, 7, 8, 11, 14, 16, 17, 21], "signatur": [3, 5], "kernel_nam": [3, 5, 11, 19, 20, 21], "kernel_sourc": [3, 5, 19, 21], "problem_s": [3, 4, 5, 6, 7, 8, 10, 11, 14, 18, 19, 21, 22], "param": [3, 4, 5, 17, 21], "ellipsi": 3, "here": [3, 10, 11, 13, 14, 16, 21], "indic": [3, 17, 22], "mani": [3, 5, 6, 7, 8, 14, 15, 16, 17, 21], "won": 3, "t": [3, 5, 6, 7, 8, 10, 11, 13, 17, 20, 21], "need": [3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 18, 19, 20, 21], "right": [3, 6, 7, 8, 10, 13], "interest": [3, 19], "found": [3, 5, 12, 16, 17], "five": [3, 5, 18], "name": [3, 4, 5, 6, 7, 8, 10, 14, 15, 16, 17, 18, 21, 22], "string": [3, 5, 6, 7, 8, 9, 14, 15, 16, 18, 19, 21], "domain": [3, 6, 7, 8, 9, 10, 21], "up": [3, 5, 6, 7, 8, 14, 18, 21], "three": [3, 4, 14], "dimens": [3, 5, 6, 7, 8, 9, 10, 11, 14, 15, 17, 18, 21, 22], "dictionari": [3, 5, 6, 7, 8, 10, 14, 16, 17, 18, 21], "simpli": [3, 4, 5, 6, 7, 8, 10, 17, 18, 21], "cell": [3, 6, 7, 8, 10, 14], "wrote": 3, "determin": [3, 6, 7, 8, 10, 16, 17], "grid": [3, 5, 6, 7, 8, 9, 11, 14, 21, 22], "defin": [3, 4, 5, 6, 7, 8, 9, 10, 14, 15, 16, 20, 21], "abov": [3, 5, 6, 7, 8, 10, 13, 14, 18, 19], "divid": [3, 6, 7, 8, 10, 11, 14, 21], "divisor": [3, 5, 6, 7, 8, 14, 21], "default": [3, 4, 5, 6, 7, 8, 10, 14, 15, 16, 17, 20, 21], "so": [3, 5, 6, 7, 8, 10, 11, 13, 14, 16, 17, 18, 20, 21], "specifi": [3, 4, 5, 6, 7, 8, 10, 11, 14, 15, 16, 17, 18, 19, 20, 21, 22], "arrai": [3, 4, 5, 6, 7, 8, 10, 18, 19, 21], "scalar": [3, 6, 7, 8, 10, 21], "therefor": [3, 4, 6, 7, 8, 10, 11, 14], "exactli": [3, 5, 6, 7, 8, 14, 16], "order": [3, 4, 5, 6, 7, 8, 10, 11, 14, 15, 17, 18, 21], "32": [3, 5, 6, 7, 8, 10, 12, 14, 18, 21], "bit": [3, 5, 6, 7, 8, 10, 11, 14], "final": [3, 4, 6, 7, 8, 10], "user": [3, 4, 5, 7, 9, 13, 14, 15, 16, 17, 20, 21], "rememb": [3, 6, 7, 8, 14], "anyth": 3, "insert": [3, 4, 5, 8, 10, 11, 14, 18, 20, 21, 22], "preprocessor": [3, 5, 21], "statement": [3, 8, 10, 14, 20], "valu": [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 21], "were": [3, 6, 7, 8, 10, 14, 21], "like": [3, 5, 6, 7, 8, 9, 10, 14, 17, 18, 19, 20, 21], "i_like_convolut": 3, "42": 3, "line": [3, 6, 7, 8], "definit": [3, 10, 21], "effect": [3, 6, 7, 8, 21], "perform": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 21], "unless": 3, "cours": [3, 6, 7, 8, 13, 14], "somewher": 3, "token": 3, "In": [3, 4, 5, 6, 7, 8, 10, 11, 14, 15, 16, 18, 19, 21, 22], "freeli": 3, "few": [3, 6, 7, 8, 10, 11, 20], "special": [3, 6, 7, 8, 16, 18, 22], "mai": [3, 4, 5, 6, 7, 8, 11, 13, 14, 15, 16, 17, 18, 19, 21], "notic": [3, 6, 7, 8], "haven": [3, 13], "yet": [3, 10, 11, 18], "basic": [3, 5, 6, 7, 8, 18], "block_size_x": [3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 18, 20, 21], "block_size_i": [3, 4, 6, 7, 8, 10, 11, 14, 21], "block_size_z": [3, 6, 7, 8, 10, 21], "interpret": 3, "z": [3, 5, 10, 21], "prefer": [3, 5, 6, 8, 16, 21], "block_size_nam": [3, 5, 21], "let": [3, 5, 6, 7, 8, 18, 20], "continu": [3, 5, 6, 7, 8, 13, 16, 17, 21], "creation": [3, 12, 17], "trusti": 3, "old": 3, "16": [3, 4, 6, 7, 8, 10, 11, 14], "dict": [3, 4, 5, 8, 11, 12, 16, 17, 18, 20, 21], "current": [3, 4, 5, 6, 7, 8, 14, 16, 17, 21], "undefin": [3, 5, 6, 7, 8, 14], "constant": [3, 5, 6, 7, 8, 9, 11, 14, 17, 21], "filter_heigth": 3, "those": [3, 9, 16], "could": [3, 4, 5, 6, 7, 8, 11, 13, 14, 16, 17, 20, 21], "runtim": [3, 5, 6, 7, 8, 12, 13, 16, 20], "setup": [3, 6, 7, 8, 11, 13, 16, 19], "everyth": [3, 5, 6, 7, 8, 13], "answer": [3, 4, 5, 6, 7, 8, 9, 21], "done": [3, 13, 15, 16], "alloc": [3, 5, 6, 7, 8, 9, 11, 21], "memori": [3, 5, 9, 11, 16, 19, 21, 22], "move": [3, 5, 6, 11, 14, 17, 21], "content": [3, 5, 21], "deriv": [3, 5, 6, 7, 8, 15], "retriev": [3, 5, 21], "free": [3, 6, 7, 8, 11, 13, 14], "return": [3, 4, 5, 6, 7, 8, 10, 11, 14, 16, 17, 18, 19, 21], "contrast": 3, "wa": [3, 5, 6, 7, 8, 16, 21], "finish": [3, 5, 7, 10, 11, 16], "particularli": [3, 15], "compar": [3, 4, 6, 7, 8, 10, 14, 15, 16], "case": [3, 4, 5, 6, 7, 8, 10, 14, 15, 16, 18, 19, 21], "than": [3, 6, 7, 8, 10, 15, 16, 17, 21, 22], "highli": [3, 12, 14], "parametr": 3, "It": [3, 5, 6, 7, 8, 11, 13, 14, 16, 20, 21], "long": [3, 6, 7, 8, 10, 11, 14, 19], "instead": [3, 5, 9, 14, 21], "adjust": 3, "path": [3, 16], "littl": [3, 6, 7, 8, 14], "ve": [3, 6, 7, 8, 13, 14], "interfac": [3, 4, 11, 13, 16, 17, 19, 21], "familiar": [3, 14], "becaus": [3, 4, 6, 7, 8, 11, 13, 14, 15, 20, 22], "kernel_str": [3, 4, 5, 6, 7, 8, 11, 12, 17, 21], "tune_param": [3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 17, 18, 19, 20, 21], "onli": [3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16, 17, 19, 21], "replac": [3, 4, 5, 6, 7, 8, 10, 14, 21], "similarli": 3, "singl": [3, 4, 5, 6, 7, 8, 11, 14, 16, 20, 21], "wai": [3, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 21], "64": [3, 6, 7, 8, 12, 14, 18, 20], "128": [3, 6, 7, 8, 12, 18, 20], "8": [3, 5, 6, 7, 8, 10, 14, 16], "try": [3, 5, 6, 7, 8, 13, 14, 17, 21], "env": [3, 5, 17, 18, 21], "take": [3, 5, 6, 7, 8, 10, 14, 16, 17, 18, 20, 21], "cartesian": [3, 10], "product": [3, 5, 6, 7, 21], "small": [3, 6, 7, 8, 14], "set": [3, 4, 5, 6, 7, 8, 9, 10, 14, 16, 17, 18, 20, 21, 22], "realli": [3, 6, 7, 8, 13], "howev": [3, 4, 6, 7, 8, 11, 13, 14, 16, 19, 20, 21], "lot": [3, 6, 7, 8, 14, 16, 18, 19, 21], "problemat": 3, "support": [3, 5, 6, 7, 8, 11, 13, 16, 17, 20, 21, 22], "explain": [3, 5, 6, 7, 8, 11, 13, 14, 15, 18, 20, 21], "illeg": 3, "2048": 3, "limit": [3, 5, 6, 7, 8, 9, 14, 16, 17, 20, 21, 22], "1024": [3, 6, 7, 8, 18], "devic": [3, 4, 6, 7, 8, 9, 11, 16, 20, 21], "fail": [3, 5, 13, 21], "reason": [3, 5, 19, 21], "too": [3, 6, 7, 8, 10, 11, 14, 21], "share": [3, 5, 21], "regist": [3, 6, 7, 8, 14, 16], "avail": [3, 6, 7, 8, 9, 10, 13, 16], "silent": 3, "verbos": [3, 4, 5, 6, 7, 8, 11, 21], "bound": [3, 5, 14, 17], "access": [3, 6, 7, 8, 10, 16, 19], "ignor": [3, 5, 6, 7, 8, 21], "two": [3, 5, 6, 7, 8, 9, 14, 15, 17, 21], "thing": [3, 11, 14], "record": [3, 5, 6, 16, 21], "show": [3, 6, 7, 8, 9, 12, 15, 19], "specif": [3, 5, 6, 7, 8, 10, 15, 16, 17, 21], "secondli": [3, 14], "experi": 3, "took": [3, 6, 8, 17, 18, 21], "place": [3, 6, 7, 8, 16, 17, 18, 21], "That": [3, 6, 7, 8, 11, 14, 15, 18], "softwar": [3, 6, 7, 8, 12, 13, 16, 17, 18], "along": [3, 5, 13, 18, 22], "inform": [3, 5, 6, 7, 8, 12, 16, 17, 18, 21, 22], "second": [3, 4, 5, 6, 7, 8, 10, 14, 15, 16, 17, 21], "alwai": [3, 5, 6, 7, 8], "under": [3, 12, 21], "circumst": 3, "obtain": [3, 6, 7, 8, 10, 16], "promis": 3, "would": [3, 6, 7, 8, 20], "tile": [3, 9, 14], "factor": [3, 6, 7, 8, 9, 10, 14, 22], "amount": [3, 5, 6, 7, 8, 14, 15, 21], "particular": [3, 5, 6, 7, 9, 11, 14, 16, 19], "increas": [3, 6, 7, 8, 16], "certain": [3, 5, 6, 7, 8, 16, 22], "tile_size_x": [3, 4, 6, 7, 8, 11, 14], "4": [3, 6, 7, 8, 10, 14, 16], "tile_size_i": [3, 4, 6, 7, 8, 11, 14, 21], "understand": 3, "everi": [3, 4, 6, 7, 8, 9, 16, 18], "fewer": [3, 6, 7, 8], "total": [3, 5, 6, 7, 8, 14, 15, 18], "stai": 3, "tell": [3, 6, 7, 8, 9, 11, 14, 18, 19], "influenc": 3, "alreadi": [3, 5, 6, 7, 8, 13, 14, 21], "did": [3, 6, 7, 8, 14], "mimick": 3, "behavior": [3, 14, 16, 21], "been": [3, 5, 6, 7, 8, 11, 14, 17], "assum": [3, 5, 6, 7, 8, 14, 21], "far": [3, 6, 7, 8, 14, 18], "grid_div_x": [3, 4, 6, 7, 8, 11, 14, 21], "grid_div_i": [3, 4, 6, 7, 8, 11, 14, 21], "add": [3, 5, 6, 7, 8, 11, 14, 16, 17], "decreas": [3, 14], "correspondingli": 3, "displai": 3, "commonli": [3, 6, 7, 8, 13, 14], "gflop": [3, 5, 9, 14, 15], "giga": [3, 14], "compos": [3, 5, 14, 15], "lambda": [3, 5, 6, 7, 14, 15, 21], "collect": [3, 5, 6, 7, 8, 10, 14, 16, 19], "ordereddict": [3, 5, 6, 7, 8, 10, 14, 15, 21], "p": [3, 5, 14, 15, 19, 21], "1e9": [3, 14], "1e3": [3, 6, 7, 8, 14, 15], "expand": [3, 14, 16], "longer": [3, 5, 15], "sinc": [3, 8, 10, 14, 20], "9": [3, 4, 6, 7, 8, 11], "And": [3, 6, 7, 8, 17, 20, 21], "know": [3, 6, 7, 8, 14, 15], "enough": [3, 4, 14], "own": [3, 8, 11, 13, 15, 16], "whenev": 4, "program": [4, 6, 7, 8, 11, 14, 19, 20], "good": [4, 6, 7, 8, 22], "fast": [4, 6, 7, 8], "verifi": [4, 5, 9, 21], "instanc": [4, 5, 6, 7, 8, 11, 16, 21], "none": [4, 5, 16, 17, 21], "onc": [4, 5, 6, 7, 8, 10, 16, 21], "against": [4, 5], "comparison": 4, "implement": [4, 5, 9, 10, 15, 16, 17, 21], "allclos": [4, 21], "maximum": [4, 5, 10, 17, 21], "absolut": [4, 21], "1e": [4, 21], "6": [4, 6, 7, 8, 10, 11, 13, 21], "want": [4, 8, 10, 11, 13, 14, 16, 18, 21, 22], "toler": 4, "atol": [4, 5, 21], "convolution_correct": 4, "py": [4, 11, 13], "demonstr": [4, 8, 9, 14], "open": [4, 6, 7, 11, 14], "r": [4, 11], "cmem_arg": [4, 5, 21], "d_filter": 4, "arg": [4, 5, 6, 7, 8, 10, 11, 12, 14, 17, 18, 19, 20], "non": 4, "field": [4, 6, 7, 8], "its": [4, 5, 6, 7, 8, 10, 12, 13, 14, 15, 16, 21], "almost": [4, 6, 7, 8, 16], "whose": [4, 21], "trust": [4, 17], "construct": [4, 14], "There": [4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 21, 22], "precomput": 4, "more": [4, 5, 6, 7, 8, 12, 13, 14, 15, 16, 18, 20, 21], "flexibl": [4, 6, 7, 14], "necessari": [4, 5, 6, 7, 8, 21], "callabl": [4, 5, 21], "accept": [4, 5, 17, 21], "cpu_result": 4, "gpu_result": [4, 6, 8], "although": 4, "semant": 4, "posit": [4, 5, 10, 17, 20, 21], "reflect": [4, 16], "reduct": [4, 15, 21], "snippet": 4, "sum_x": 4, "n": [4, 6, 7, 8, 10, 11, 12, 14, 17, 18, 20], "custom": [4, 9, 15, 16, 19], "def": [4, 5, 6, 7, 8, 10, 16, 19], "verify_partial_reduc": 4, "isclos": 4, "first_kernel": 4, "_": [4, 6, 7, 8], "sum_float": 4, "map": [4, 9, 10], "provid": [4, 5, 6, 7, 8, 11, 20, 21], "third": [4, 14], "partial": [4, 6, 7, 8, 9], "cpu": [4, 7, 8, 11], "achiev": [4, 8], "element": [4, 6, 7, 8, 14, 15, 18, 19, 21], "doe": [4, 5, 6, 7, 8, 10, 11, 14, 16, 20, 21], "necessarili": [4, 11], "section": [5, 6, 7, 8], "detail": [5, 13, 21], "intern": [5, 12, 17, 20], "mostli": [5, 12, 21], "relev": [5, 12, 16], "develop": [5, 12, 13], "extens": 5, "architectur": [5, 16], "At": [5, 10, 21], "expos": 5, "respons": 5, "iter": [5, 6, 7, 8, 10, 14, 16, 17, 18, 21], "over": [5, 6, 7, 8, 13, 14, 16, 17], "brute_forc": [5, 21], "valid": [5, 9, 14, 21], "random_sampl": [5, 21], "sampl": [5, 17, 21], "advanc": [5, 20, 21], "being": [5, 6, 7, 8, 14, 16, 17, 21], "strategy_opt": [5, 17, 21], "sai": [5, 6, 7, 8, 18, 20], "foreseen": 5, "futur": [5, 12, 21, 22], "high": [5, 6, 7, 8, 12, 14, 16], "wrap": [5, 18, 20, 21], "base": [5, 15, 16, 20, 21], "low": [5, 6, 7, 8, 14], "abstract": [5, 16], "ready_argument_list": 5, "build": [5, 6, 7, 8, 13], "bottom": 5, "pyhip": 5, "either": [5, 10, 17, 20, 21], "typic": [5, 13, 14, 21], "nvcc": 5, "gcc": 5, "fortran": [5, 9, 20], "turn": 5, "launch": [5, 6, 7, 8, 11, 16, 21], "rest": [5, 6, 7, 8], "helper": [5, 16], "get_opt": 5, "suppli": [5, 11, 14, 17, 20, 21], "get_strategy_docstr": 5, "method": [5, 6, 7, 8, 11, 14, 16, 17], "make_strategy_options_doc": 5, "scale_from_param": 5, "ep": [5, 17], "func": [5, 16, 21], "invers": 5, "unscal": 5, "setup_method_argu": 5, "prepar": [5, 6, 7, 8], "setup_method_opt": 5, "tuning_opt": [5, 17], "snap_to_nearest_config": 5, "closest": 5, "unscale_and_snap_to_nearest": 5, "snap": 5, "scale": 5, "variabl": [5, 10, 13, 17, 21], "nearest": [5, 21], "config": 5, "class": [5, 16, 17], "kernel_opt": 5, "device_opt": 5, "__init__": 5, "instanti": [5, 20], "kernelsourc": 5, "parameter_spac": [5, 17], "entir": [5, 6, 7, 8, 14, 17, 21], "iterfac": 5, "platform": [5, 12, 13, 16, 21], "quiet": [5, 21], "fals": [5, 16, 17, 21], "compiler_opt": [5, 21], "7": [5, 6, 7, 8, 10, 21], "offer": 5, "languag": [5, 8, 11, 14, 19, 21], "lang": [5, 9, 11, 20, 21], "bool": [5, 19, 21], "gpu_arg": 5, "benchmark_continu": 5, "durat": [5, 16], "least": 5, "benchmark_default": 5, "check_kernel_output": 5, "compile_and_benchmark": 5, "compile_kernel": 5, "copy_constant_memory_arg": 5, "most": [5, 6, 7, 8, 9, 11, 12, 14, 16, 17, 18, 19, 21], "recent": [5, 13, 16], "copy_shared_memory_arg": 5, "smem_arg": [5, 21], "copy_texture_memory_arg": 5, "texmem_arg": [5, 21], "textur": [5, 21], "create_kernel_inst": 5, "get_environ": 5, "memcpy_dtoh": [5, 6], "dest": 5, "src": 5, "mem": 5, "group": [5, 6, 7, 8, 21], "maintain": 5, "state": [5, 6, 7, 8, 16, 21], "interact": [5, 16], "properti": [5, 14, 21], "context": [5, 6, 8, 10], "kernel_inst": 5, "lookup": 5, "directli": [5, 6, 7, 8, 11, 14, 16, 20, 21], "driver": [5, 6, 8, 10], "ndarrai": [5, 10], "format": [5, 6, 7, 19], "kei": [5, 6, 7, 8, 14, 17, 18, 21], "symbol": [5, 21], "similar": [5, 11, 14, 21], "regular": [5, 8, 16], "int32": [5, 12, 18, 20, 21], "kernel_finish": 5, "otherwis": [5, 14, 21], "devicealloc": 5, "memcpy_htod": [5, 6], "memset": 5, "unsign": [5, 7], "byte": [5, 19, 21], "global": [5, 6, 7, 8, 17], "tupl": [5, 8, 10, 17, 21], "start_ev": 5, "event": [5, 6, 11, 16], "mark": 5, "measur": [5, 6, 7, 8, 10, 11, 14, 15, 16, 21, 22], "stop_ev": 5, "synchron": [5, 6, 8, 10, 14, 15], "halt": [5, 11], "until": [5, 11], "task": 5, "rawkernel": 5, "static": 5, "cudeviceptr": 5, "cufunct": 5, "id": [5, 16], "must": [5, 15, 21], "dynam": [5, 21], "buffer": [5, 7, 19], "fill": [5, 14], "item": [5, 6, 7, 8, 10], "ndrang": 5, "cleanup_lib": 5, "unload": 5, "previous": [5, 6, 7, 8, 14], "load": 5, "librari": [5, 9, 16, 19], "kernelinst": 5, "repres": [5, 6, 7, 8], "tunabl": [5, 6, 7, 8, 9, 10, 14, 15, 16, 17, 18, 20, 21, 22], "ctype": 5, "_funcptr": 5, "asynchron": 5, "memcpi": [5, 11], "c_arg": 5, "whatev": [5, 11, 17], "left": [5, 6, 7, 8, 10, 15], "robust": 5, "averag": [5, 6, 7, 8, 11, 16], "ptr": 5, "pionter": 5, "compilationfailedconfig": 5, "errorconfig": 5, "invalidconfig": 5, "maxprodconstraint": 5, "maxprod": 5, "constraint": 5, "enforc": 5, "given": [5, 6, 7, 8, 10, 16, 17, 21], "npencod": 5, "skipkei": 5, "ensure_ascii": 5, "check_circular": 5, "allow_nan": 5, "sort_kei": 5, "indent": 5, "separ": [5, 9, 11, 20], "dump": [5, 6, 7], "json": [5, 6, 7, 9, 21], "obj": 5, "subclass": 5, "serializ": 5, "rais": 5, "typeerror": 5, "arbitrari": 5, "self": [5, 16, 17], "except": [5, 9], "els": 5, "jsonencod": 5, "runtimefailedconfig": 5, "skippablefailur": 5, "stopcriterionreach": 5, "thrown": 5, "stop": [5, 17], "criterion": [5, 17], "reach": 5, "check_argument_list": 5, "check_argument_typ": 5, "dtype": [5, 19], "kernel_argu": 5, "check_restrict": 5, "restrict": [5, 9, 14, 20, 21], "whether": [5, 15, 17, 21], "meet": 5, "check_stop_criterion": 5, "max_fev": [5, 17, 21], "exceed": 5, "check_thread_block_dimens": 5, "max_thread": 5, "check_tune_params_list": 5, "forbidden": 5, "compile_restrict": 5, "pars": [5, 6, 7], "config_valid": 5, "max": 5, "convert_constraint_restrict": 5, "convert": [5, 6, 7], "backward": 5, "cuda_error_check": 5, "statu": 5, "delete_temp_fil": 5, "delet": 5, "temporari": 5, "don": [5, 6, 8, 10, 11, 21], "complain": 5, "detect_languag": 5, "attempt": [5, 20], "detect": [5, 17, 20, 21], "dump_cach": 5, "str": [5, 6, 7, 8, 10], "omit": 5, "sever": [5, 6, 7, 8, 9, 10, 13, 14, 20, 21], "store_cach": 5, "speed": 5, "great": [5, 6, 7, 8, 18], "power": [5, 14, 16, 22], "get_best_config": 5, "objective_higher_is_bett": [5, 15, 21], "best": [5, 6, 7, 10, 14, 17, 20, 21, 22], "accord": [5, 21], "get_config_str": 5, "compact": 5, "represent": [5, 19], "get_grid_dimens": 5, "current_problem_s": 5, "grid_div": 5, "dim": 5, "get_instance_str": 5, "debug": 5, "advis": 5, "get_kernel_str": [5, 6, 7, 8], "One": [5, 6, 7, 8, 16, 19], "get_problem_s": 5, "get_smem_arg": 5, "get_temp_filenam": 5, "suffix": [5, 21], "form": [5, 14, 16, 17], "temp_x": 5, "larg": [5, 6, 7, 8, 10, 21], "integ": [5, 16, 19, 21], "get_thread_block_dimens": 5, "convent": [5, 11, 21], "get_total_tim": 5, "overhead_tim": 5, "looks_like_a_filenam": 5, "normalize_verify_funct": 5, "normal": [5, 17, 21], "result_host": 5, "keyword": 5, "behaviour": 5, "parse_restrict": 5, "prepare_kernel_str": 5, "prepend": [5, 8], "seri": [5, 10], "By": [5, 11, 14, 17, 21], "macro": 5, "made": 5, "print_config": 5, "print_config_output": 5, "process_cach": 5, "device_nam": [5, 21], "tune_params_kei": 5, "x1": 5, "x2": 5, "xn": 5, "234342": 5, "y1": 5, "y2": 5, "yn": 5, "134233": 5, "close": [5, 6, 7, 8], "bracket": 5, "miss": [5, 21], "earlier": [5, 6, 7, 8, 10], "abruptli": 5, "process_metr": 5, "calcul": [5, 10], "express": [5, 6, 7, 8, 9, 11, 14, 21], "10000": 5, "read_cach": 5, "open_cach": 5, "cachefil": [5, 21], "read_fil": 5, "replace_param_occurr": 5, "occurr": 5, "setup_block_and_grid": 5, "write_fil": 5, "whole": [6, 7, 8, 14, 17], "model": [6, 7, 8, 12], "physic": 6, "numer": [6, 7, 8], "introduc": [6, 7, 8, 14, 16], "redistribut": [6, 7, 8], "region": [6, 7, 8], "concentr": [6, 7, 8], "without": [6, 7, 8, 10, 11, 16, 17], "bulk": [6, 7, 8], "motion": [6, 7, 8], "concept": [6, 7, 8], "wide": [6, 7, 8, 13, 14], "chemistri": [6, 7, 8], "biologi": [6, 7, 8], "suppos": [6, 7, 8], "metal": [6, 7, 8], "sheet": [6, 7, 8], "temperatur": [6, 7, 8, 16, 17, 22], "equal": [6, 7, 8, 14, 21], "degre": [6, 7, 8], "everywher": [6, 7, 8], "heat": [6, 7, 8], "thousand": [6, 7, 8], "instant": [6, 7, 8, 10], "hotspot": [6, 7, 8], "cooler": [6, 7, 8], "area": [6, 7, 8, 14], "melt": [6, 7, 8], "loss": [6, 7, 8], "radiat": [6, 7, 8], "caus": [6, 7, 8], "frac": [6, 7, 8], "d": [6, 7, 8, 10, 17, 18], "spatial": [6, 7, 8], "descret": [6, 7, 8], "2d": [6, 7, 8, 9], "quantiti": [6, 7, 8, 15, 16, 21], "nx": [6, 7, 8, 10], "equi": [6, 7, 8], "distant": [6, 7, 8], "direct": [6, 7, 8, 11, 14, 15, 21], "ny": [6, 7, 8, 10], "Be": [6, 7, 8], "distanc": [6, 7, 8, 17], "delta": [6, 7, 8], "between": [6, 7, 8, 11, 14, 15, 17, 21], "central": [6, 7, 8], "approxim": [6, 7, 8], "x_i": [6, 7, 8, 10], "x_": [6, 7, 8], "approx": [6, 7, 8], "u_": [6, 7, 8], "2u_": [6, 7, 8], "y_": [6, 7, 8], "estim": [6, 7, 8], "next": [6, 7, 8, 14, 19], "step": [6, 7, 8, 14, 15, 17, 20], "simplifi": [6, 7, 8], "formula": [6, 7, 8], "further": [6, 7, 8, 13, 14], "4u_": [6, 7, 8], "simplic": [6, 7, 8, 10], "assumpt": [6, 7, 8], "boundari": [6, 7, 8], "condit": [6, 7, 8, 14], "dt": [6, 7, 8], "225": [6, 7, 8], "give": [6, 7, 8, 17], "test": [6, 7, 8, 9, 14, 16, 21], "initi": [6, 7, 8, 19], "hot": [6, 7, 8], "plot": [6, 7, 8], "anoth": [6, 7, 8, 11, 14, 15, 17, 21], "color": [6, 7, 8], "matplotlib": [6, 7, 8, 13], "pyplot": [6, 7, 8], "inlin": [6, 7, 8], "get_initial_condit": [6, 7, 8], "ones": [6, 7, 8, 22], "randint": [6, 7, 8], "10": [6, 7, 8, 12, 17], "1000": [6, 7, 8, 10], "2000": [6, 7, 8], "fig": [6, 7, 8], "ax1": [6, 7, 8], "ax2": [6, 7, 8], "subplot": [6, 7, 8], "imshow": [6, 7, 8], "lt": [6, 7, 8], "axesimag": [6, 7, 8], "0x2aaab952f240": 6, "gt": [6, 7, 8], "quick": [6, 7, 8], "save": [6, 7], "later": [6, 7, 8, 10, 21], "field_copi": [6, 7], "m": [6, 7, 8, 10], "4164": 6, "018869400024": 6, "0x2aab1c98b3c8": 6, "worri": [6, 8], "appli": [6, 7, 8], "terminologi": [6, 8], "text": [6, 8, 14], "225f": [6, 7, 8], "diffuse_kernel": [6, 7, 8], "u_new": [6, 7, 8], "0f": [6, 7, 8], "togeth": [6, 7, 8, 13, 21], "choos": [6, 7, 8, 14, 17, 21], "impact": [6, 7, 8, 11], "fix": [6, 7, 8, 17, 21], "unrol": [6, 7, 8, 9, 14, 22], "loop": [6, 7, 8, 9, 14, 22], "drv": 6, "sourcemodul": [6, 8, 10], "init": 6, "make_context": 6, "devprop": 6, "k": [6, 7, 8, 10, 12, 14, 18], "get_devic": 6, "get_attribut": 6, "cc": 6, "compute_capability_major": 6, "compute_capability_minor": 6, "u_old": [6, 8], "mem_alloc": 6, "nbyte": 6, "block_size_str": [6, 8], "arch": 6, "sm_": 6, "get_funct": [6, 8, 10], "boilerpl": [6, 7, 8], "moment": [6, 7, 8, 21], "These": [6, 7, 8, 10, 13, 14, 16, 20, 21], "serv": [6, 7, 8, 15, 17], "guess": [6, 7, 8], "pair": [6, 7, 8], "500": [6, 7, 8], "time_sinc": 6, "zeros_lik": [6, 10, 12, 14, 18, 20], "set_titl": [6, 7, 8], "53": [6, 7, 8], "423038482666016": 6, "0x2aaabbdcb2e8": 6, "faster": [6, 7, 8, 14], "cleanup": 6, "pop": 6, "think": [6, 7, 8], "messi": [6, 7, 8], "got": [6, 7, 8], "cleaner": [6, 7, 8], "plai": [6, 7, 8], "difficult": [6, 7, 8, 19, 20], "rather": [6, 7, 8, 21], "underutil": [6, 7, 8], "purpos": [6, 7, 8, 11, 14, 21, 22], "feel": [6, 7, 8], "48": [6, 7, 8], "care": [6, 7, 8], "11": [6, 7, 8], "appropi": [6, 7, 8], "fly": [6, 7, 8], "12": [6, 7, 8], "13": [6, 7, 8], "geforc": [6, 7, 8, 10], "gtx": [6, 7, 8, 10], "titan": [6, 7, 8], "22305920124": 6, "779033613205": 6, "824838399887": 6, "900499212742": 6, "999763202667": 6, "727967989445": 6, "752479994297": 6, "797900807858": 6, "876627194881": 6, "93347837925": 6, "766662418842": 6, "803033602238": 6, "853574407101": 6, "971545600891": 6, "763775992393": 6, "791257584095": 6, "848044800758": 6, "922745585442": 6, "792595207691": 6, "822137594223": 6, "893279993534": 6, "well": [6, 7, 8, 10, 14, 16, 21], "millisecond": [6, 7, 8], "matter": [6, 7, 8, 11], "conveni": [6, 7, 8, 11, 21], "analyz": [6, 7, 8], "seem": [6, 7, 8], "hardwar": [6, 7, 8, 10, 16, 17, 18], "vari": [6, 7, 8, 10, 14, 15], "addtion": [6, 7, 8], "among": [6, 7, 8, 12, 17], "128x32": [6, 7, 8], "likewis": [6, 7, 8], "becom": [6, 7, 8, 16, 17], "affect": [6, 7, 8, 14], "within": [6, 7, 8, 10, 14, 17, 21], "exchang": [6, 7, 8], "fact": [6, 7, 8, 11], "commun": [6, 7, 8], "idea": [6, 7, 8, 11, 14, 22], "control": [6, 7, 8, 16, 17, 21], "l2": [6, 7, 8], "closer": [6, 7, 8], "multiprocessor": [6, 7, 8], "l1": [6, 7, 8], "fine": [6, 7, 8], "grain": [6, 7, 8], "manag": [6, 7, 8, 14, 16], "cost": [6, 7, 8, 17], "instruct": [6, 7, 8, 9, 14], "overhead": [6, 7, 8, 14], "degrad": [6, 7, 8], "intermedi": [6, 7, 8], "mind": [6, 7, 8], "14": [6, 7, 8], "tx": [6, 7, 8, 14], "ty": [6, 7, 8, 14], "bx": [6, 7, 8, 10], "__shared__": [6, 8, 14], "sh_u": [6, 7, 8], "pragma": [6, 7, 8, 14], "__syncthread": [6, 7, 8, 14], "75041918755": 6, "18713598251": 6, "09015038013": 6, "06844799519": 6, "09730558395": 6, "14420480728": 6, "05957758427": 6, "07508480549": 6, "0731967926": 6, "14729599953": 6, "08389122486": 6, "10700161457": 6, "10125439167": 6, "31661438942": 6, "0629119873": 6, "04807043076": 6, "054880023": 6, "12033278942": 6, "06672639847": 6, "05816960335": 6, "12000002861": 6, "sometim": [6, 7, 8, 19], "merg": [6, 7, 8, 14], "half": [6, 7, 8], "doubl": [6, 7, 8, 19, 20], "cover": [6, 7, 8, 17], "part": [6, 7, 8, 12, 13, 14, 15, 19, 21], "beyond": [6, 7, 8, 21], "reduc": [6, 7, 8, 14], "condens": [6, 7, 8], "keep": [6, 7, 8, 14, 19], "importantli": [6, 7, 8], "worst": [6, 7, 8], "both": [6, 7, 8, 9, 14], "15": [6, 7, 8, 20], "tj": [6, 7, 8], "ti": [6, 7, 8, 10], "ad": [6, 7, 8, 11, 21], "somehow": [6, 7, 8], "larger": [6, 7, 8, 11, 17, 20], "insid": [6, 7, 8, 11, 14, 20, 21], "round": [6, 7, 8, 21], "arithmet": [6, 7, 8, 21], "evalu": [6, 7, 8, 14, 17, 21], "759308815": 6, "29789438248": 6, "06983039379": 6, "2634239912": 6, "997139203548": 6, "843692803383": 6, "05549435616": 6, "862348806858": 6, "750636804104": 6, "19084160328": 6, "876377594471": 6, "714169609547": 6, "875001597404": 6, "691116797924": 6, "575859189034": 6, "759679996967": 6, "622867202759": 6, "650336003304": 6, "09794559479": 6, "826515209675": 6, "692665600777": 6, "78363519907": 6, "646092808247": 6, "554745602608": 6, "716115188599": 6, "581280004978": 6, "662566399574": 6, "07386879921": 6, "833420813084": 6, "705055999756": 6, "840755212307": 6, "652575993538": 6, "569388794899": 6, "689356791973": 6, "597267186642": 6, "675232005119": 6, "10033922195": 6, "860332798958": 6, "731891202927": 6, "867276787758": 6, "68781440258": 6, "595276796818": 6, "735436797142": 6, "60216319561": 6, "852166390419": 6, "15089921951": 6, "852575981617": 6, "705932807922": 6, "888671982288": 6, "673248004913": 6, "563417613506": 6, "761139214039": 6, "621254396439": 6, "676595199108": 6, "06709122658": 6, "804953610897": 6, "685670387745": 6, "801798415184": 6, "632006394863": 6, "542387211323": 6, "722668802738": 6, "578745603561": 6, "618598401546": 6, "08220798969": 6, "821881604195": 6, "687955200672": 6, "77759360075": 6, "618003201485": 6, "539891195297": 6, "705900788307": 6, "568556785583": 6, "624492788315": 6, "0799423933": 6, "832300806046": 6, "70140799284": 6, "835481595993": 6, "638348805904": 6, "550105595589": 6, "667251205444": 6, "576044797897": 6, "732409596443": 6, "15916161537": 6, "869497597218": 6, "733248019218": 6, "890803205967": 6, "677363204956": 6, "577215993404": 6, "730982398987": 6, "58035838604": 6, "10066559315": 6, "837804794312": 6, "691385602951": 6, "851040017605": 6, "666656005383": 6, "560505592823": 6, "771103990078": 6, "626163220406": 6, "694451200962": 6, "11514236927": 6, "837299215794": 6, "703302407265": 6, "806828796864": 6, "648620784283": 6, "562521612644": 6, "760915207863": 6, "605760002136": 6, "690009605885": 6, "10740480423": 6, "841631996632": 6, "700883197784": 6, "838195204735": 6, "649779188633": 6, "56585599184": 6, "7168192029": 6, "59088640213": 6, "69627519846": 6, "3269824028": 6, "02665598392": 6, "840908801556": 6, "03752319813": 6, "788345599174": 6, "662041604519": 6, "85437438488": 6, "680422389507": 6, "0759360075": 6, "801996803284": 6, "666003203392": 6, "808000004292": 6, "643359994888": 6, "544691193104": 6, "741964805126": 6, "60942081213": 6, "681350398064": 6, "05262081623": 6, "792108798027": 6, "66344319582": 6, "768064010143": 6, "625260794163": 6, "540352010727": 6, "721862399578": 6, "579411196709": 6, "626976013184": 6, "06332798004": 6, "808211183548": 6, "679372787476": 6, "803718411922": 6, "627136015892": 6, "538227200508": 6, "682188808918": 6, "573836791515": 6, "725548803806": 6, "13023357391": 6, "843411195278": 6, "713843202591": 6, "85886080265": 6, "657920002937": 6, "565254402161": 6, "697094392776": 6, "579904007912": 6, "07484800816": 6, "801119995117": 6, "667347204685": 6, "799059200287": 6, "643820810318": 6, "542937588692": 6, "740518403053": 6, "615148806572": 6, "731334400177": 6, "07002239227": 6, "805299210548": 6, "675923216343": 6, "782060790062": 6, "631142401695": 6, "540383994579": 6, "723999989033": 6, "578681600094": 6, "726335990429": 6, "13297917843": 6, "844428789616": 6, "710278391838": 6, "835494399071": 6, "637958395481": 6, "567417597771": 6, "699366402626": 6, "588492810726": 6, "tri": [6, 7, 8, 17], "grow": [6, 7, 8], "quickli": [6, 7, 8], "On": [6, 7, 8, 21], "went": [6, 7, 8, 10], "72": [6, 7, 8], "26": [6, 7, 8], "32x2": [6, 7, 8], "64x4": [6, 7, 8], "four": [6, 7, 8], "best_tim": [6, 7], "min": [6, 7], "05": [6, 7], "join": [6, 7], "nice": [6, 7], "stdout": [6, 7], "why": [6, 7, 11, 15], "easili": [6, 7, 16], "easi": [6, 7, 15, 16, 21], "csv": [6, 7, 9], "analysi": [6, 7], "panda": [6, 7, 9, 13], "18": [6, 7, 8], "fp": [6, 7], "datafram": [6, 7], "df": [6, 7], "to_csv": [6, 7], "0x2aab1de088d0": 7, "01": 7, "sy": 7, "140": 7, "wall": 7, "98": 7, "__kernel": 7, "get_group_id": 7, "get_local_id": 7, "cl": 7, "ctx": 7, "create_some_context": 7, "mf": 7, "mem_flag": 7, "a_h": 7, "a_d": 7, "read_writ": 7, "copy_host_ptr": 7, "hostbuf": 7, "b_d": 7, "kernel_src": 7, "prg": 7, "queue": 7, "commandqueu": 7, "run_gpu": 7, "444": 7, "154": 7, "598": 7, "985": 7, "enqueue_copi": 7, "1748096": 7, "7284544": 7, "7707904": 7, "8573184": 7, "8380288": 7, "686528": 7, "69648": 7, "7461632": 7, "818304": 7, "771072": 7, "7190464": 7, "7522432": 7, "7982208": 7, "9624512": 7, "7214464": 7, "7453312": 7, "8028416": 7, "8922624": 7, "747328": 7, "7860736": 7, "8637184": 7, "__local": 7, "barrier": 7, "clk_local_mem_f": 7, "8449472": 7, "1912576": 7, "1035136": 7, "0927808": 7, "1140736": 7, "1790336": 7, "0808192": 7, "0809792": 7, "0836928": 7, "1545856": 7, "1249984": 7, "1264": 7, "1230336": 7, "4015104": 7, "0873216": 7, "0626496": 7, "0692224": 7, "140192": 7, "0801344": 7, "0688128": 7, "1428928": 7, "8844544": 7, "3245952": 7, "0911808": 7, "3039616": 7, "0079296": 7, "84848": 7, "0708288": 7, "857728": 7, "7561792": 7, "231072": 7, "8774336": 7, "7087296": 7, "8772672": 7, "6911872": 7, "5715968": 7, "7584896": 7, "6292032": 7, "6498688": 7, "1145664": 7, "8252928": 7, "6757568": 7, "7881152": 7, "6237696": 7, "544224": 7, "6951168": 7, "5648128": 7, "6452736": 7, "1065792": 7, "8313792": 7, "6905984": 7, "8302656": 7, "6367488": 7, "5478592": 7, "6660672": 7, "5719744": 7, "6551744": 7, "1384064": 7, "8531072": 7, "7078976": 7, "8516672": 7, "6677696": 7, "5685632": 7, "7074048": 7, "5753152": 7, "8228864": 7, "2124736": 7, "8633344": 7, "6921216": 7, "8896384": 7, "6659904": 7, "5582144": 7, "7522624": 7, "6081536": 7, "6664448": 7, "1095936": 7, "8063424": 7, "6717888": 7, "7982848": 7, "6263552": 7, "5289728": 7, "7008832": 7, "567456": 7, "5968704": 7, "1018432": 7, "8117248": 7, "6724736": 7, "7728576": 7, "6038336": 7, "5172352": 7, "6796352": 7, "5470016": 7, "5968448": 7, "1107712": 7, "8237248": 7, "6810944": 7, "821952": 7, "620352": 7, "5230208": 7, "6415552": 7, "5476864": 7, "7168192": 7, "1942016": 7, "8626304": 7, "7099712": 7, "9123328": 7, "6608448": 7, "5631168": 7, "7113024": 7, "556576": 7, "1583104": 7, "8384832": 7, "67856": 7, "845856": 7, "6581248": 7, "54944": 7, "7520064": 7, "6076224": 7, "6842112": 7, "1547072": 7, "8422016": 7, "6895552": 7, "8037312": 7, "6387072": 7, "5383296": 7, "7326656": 7, "5863488": 7, "6813376": 7, "1493952": 7, "8444928": 7, "6929216": 7, "832768": 7, "6389312": 7, "5412672": 7, "698336": 7, "5717568": 7, "676096": 7, "4303104": 7, "0341696": 7, "8365184": 7, "0398656": 7, "7786496": 7, "648928": 7, "8479232": 7, "6508544": 7, "1219392": 7, "7994048": 7, "6492288": 7, "8068416": 7, "6343168": 7, "5235328": 7, "7268928": 7, "5898432": 7, "6633536": 7, "0849664": 7, "7869632": 7, "6458624": 7, "7611968": 7, "613088": 7, "50912": 7, "6972928": 7, "5620608": 7, "601856": 7, "095232": 7, "7967488": 7, "6601472": 7, "7952896": 7, "6047296": 7, "5108224": 7, "6607744": 7, "5492416": 7, "7091136": 7, "171552": 7, "8473408": 7, "6962112": 7, "8663936": 7, "6466816": 7, "5475584": 7, "6754048": 7, "5591744": 7, "108896": 7, "7907264": 7, "6459328": 7, "7965888": 7, "6250816": 7, "5188416": 7, "721408": 7, "5920832": 7, "7068608": 7, "0909248": 7, "7930752": 7, "6524544": 7, "7745216": 7, "6146176": 7, "5116928": 7, "6975872": 7, "5548416": 7, "7075136": 7, "174624": 7, "8384512": 7, "69104": 7, "8335488": 7, "6264192": 7, "5445248": 7, "6719104": 7, "5592064": 7, "19": [7, 8], "solv": 8, "0x7f888f8cd7b8": 8, "4152": 8, "086019515991": 8, "0x7f8865b51f28": 8, "gpuarrai": [8, 10], "tool": [8, 10, 12], "autoinit": [8, 10], "to_gpu": [8, 10], "mod": [8, 10], "t0": [8, 10], "ona": 8, "33": 8, "46109390258789": 8, "0x7f8858b873c8": 8, "1080": [8, 10], "916985595226": 8, "489004802704": 8, "500524806976": 8, "513356792927": 8, "545715200901": 8, "486515200138": 8, "449055999517": 8, "44974719882": 8, "457427197695": 8, "492915201187": 8, "464863997698": 8, "466118401289": 8, "475264000893": 8, "513632011414": 8, "458412796259": 8, "457715201378": 8, "461017608643": 8, "475987195969": 8, "460032004118": 8, "457779198885": 8, "462649595737": 8, "kernel_string_shar": 8, "22673916817": 8, "826361596584": 8, "793516802788": 8, "782112002373": 8, "776639997959": 8, "795135998726": 8, "722777605057": 8, "762777590752": 8, "75422719717": 8, "804876792431": 8, "778656005859": 8, "769734406471": 8, "782495999336": 8, "932281601429": 8, "734028804302": 8, "721625590324": 8, "736511993408": 8, "800019192696": 8, "724966406822": 8, "722969603539": 8, "759430396557": 8, "kernel_string_til": 8, "22200961113": 8, "91601279974": 8, "752838408947": 8, "873651194572": 8, "69833599329": 8, "586931192875": 8, "516473591328": 8, "411392003298": 8, "384262400866": 8, "82159358263": 8, "632607996464": 8, "506457602978": 8, "618758392334": 8, "500288009644": 8, "429862397909": 8, "44995200038": 8, "366150397062": 8, "342201602459": 8, "793542397022": 8, "58026239872": 8, "494163197279": 8, "546316814423": 8, "467059195042": 8, "404249596596": 8, "440895992517": 8, "341376006603": 8, "339692795277": 8, "783923208714": 8, "597920000553": 8, "50277120471": 8, "615475213528": 8, "470937597752": 8, "418393599987": 8, "443519997597": 8, "343961596489": 8, "342540800571": 8, "780352008343": 8, "611705589294": 8, "515667212009": 8, "622534394264": 8, "502195191383": 8, "437388807535": 8, "45568639636": 8, "359289598465": 8, "426995199919": 8, "788947200775": 8, "616556799412": 8, "496121603251": 8, "629164803028": 8, "474841600657": 8, "407667201757": 8, "47406719923": 8, "371507203579": 8, "352531200647": 8, "72023679018": 8, "574816000462": 8, "481817597151": 8, "580928003788": 8, "455724793673": 8, "394975996017": 8, "464659202099": 8, "357107198238": 8, "324083191156": 8, "759910392761": 8, "569177603722": 8, "481279999018": 8, "528115200996": 8, "441734397411": 8, "393126398325": 8, "455404800177": 8, "350457596779": 8, "322547197342": 8, "754201591015": 8, "579827189445": 8, "491852802038": 8, "582751989365": 8, "451283198595": 8, "391807991266": 8, "456275194883": 8, "356716805696": 8, "362937599421": 8, "809894394875": 8, "60433280468": 8, "507142400742": 8, "655827200413": 8, "474092799425": 8, "408166396618": 8, "480531209707": 8, "346707201004": 8, "780134403706": 8, "601049602032": 8, "493900799751": 8, "620384001732": 8, "494553589821": 8, "425414395332": 8, "467033600807": 8, "375468802452": 8, "346079999208": 8, "771052801609": 8, "593977594376": 8, "49723520875": 8, "583270406723": 8, "478079998493": 8, "416320002079": 8, "443942397833": 8, "359744000435": 8, "343545603752": 8, "780960011482": 8, "598758399487": 8, "498617601395": 8, "57678719759": 8, "46561280489": 8, "41324160099": 8, "431225597858": 8, "351263999939": 8, "34440960288": 8, "933260798454": 8, "715257608891": 8, "586604809761": 8, "711615991592": 8, "558771193027": 8, "466284793615": 8, "44043520093": 8, "361823999882": 8, "731839990616": 8, "57044479847": 8, "470220798254": 8, "608800005913": 8, "472665601969": 8, "416352003813": 8, "481376004219": 8, "380812799931": 8, "351923197508": 8, "719257593155": 8, "55171200037": 8, "466758400202": 8, "568435204029": 8, "459654402733": 8, "394380801916": 8, "463052803278": 8, "36409599781": 8, "328998398781": 8, "73579518795": 8, "564575994015": 8, "472236800194": 8, "549024009705": 8, "438406395912": 8, "389945602417": 8, "455193603039": 8, "364051198959": 8, "375519996881": 8, "798195195198": 8, "588998401165": 8, "49552000761": 8, "595462405682": 8, "460972803831": 8, "400672000647": 8, "465132802725": 8, "364627194405": 8, "729363203049": 8, "558815991879": 8, "466655993462": 8, "600819194317": 8, "460281592607": 8, "404908800125": 8, "478739196062": 8, "386668801308": 8, "385510402918": 8, "720915210247": 8, "550668799877": 8, "466937589645": 8, "564921605587": 8, "447974395752": 8, "394271999598": 8, "46233600378": 8, "365190398693": 8, "387827193737": 8, "762003195286": 8, "579007995129": 8, "486649608612": 8, "557331204414": 8, "443033593893": 8, "396070402861": 8, "457075202465": 8, "369555193186": 8, "wish": 8, "modifi": [8, 16], "tile_size_j": 8, "fixed_param": [8, 10], "ceil": [8, 10], "zip": [8, 10], "transfer": [8, 9, 11], "20": [8, 17], "21": 8, "618": 8, "2231903076172": 8, "0x7f887c3d2358": 8, "incorpor": 8, "ifndef": 8, "kerenel": 8, "psedo": 8, "endif": 8, "bypass": 8, "usecas": 9, "test_vector_add": 9, "test_vector_add_parameter": 9, "illustr": 9, "dimension": [9, 10, 21], "clean": [9, 14], "center": [9, 10], "lock": [9, 16], "overlap": [9, 11], "shuffl": 9, "pipelin": 9, "consist": [9, 14, 21], "scipi": 9, "algorithm": [9, 12, 17, 21], "cub": 9, "gaussian": 10, "delv": 10, "hand": [10, 14], "sum_": 10, "exp": 10, "beta": [10, 17], "sqrt": 10, "y_i": 10, "z_i": 10, "vector": [10, 11, 18], "coordin": 10, "forget": 10, "linalg": 10, "la": 10, "compute_grid": 10, "xgrid": 10, "ygrid": 10, "zgrid": 10, "x0": 10, "y0": 10, "z0": 10, "themselv": 10, "meshgrid": 10, "send": 10, "interv": 10, "256": [10, 12, 18], "suffici": [10, 15], "100": [10, 17, 21], "randomli": [10, 17], "distribut": [10, 14], "linspac": 10, "cpu_grid": 10, "npt": 10, "rand": 10, "xyz": [10, 21], "52320": 10, "160627": 10, "might": [10, 15], "nz": 10, "bz": 10, "kernel_cod": 10, "math": 10, "__host__": 10, "__device__": [10, 20], "b": [10, 12, 14, 17, 18, 20], "addgrid": 10, "xvect": 10, "yvect": 10, "zvect": 10, "dx": 10, "dy": 10, "dz": 10, "assign": 10, "explor": 10, "middl": 10, "henc": [10, 19], "mention": 10, "56833920479": 10, "80796158314": 10, "940044796467": 10, "855628800392": 10, "855359995365": 10, "16174077988": 10, "11877760887": 10, "01592960358": 10, "849273598194": 10, "849235200882": 10, "19029750824": 10, "16199679375": 10, "40401918888": 10, "39618558884": 10, "39508478642": 10, "31647996902": 10, "31470079422": 10, "50787198544": 10, "53760001659": 10, "56709756851": 10, "34500494003": 10, "25130877495": 10, "50662400723": 10, "55267841816": 10, "17987194061": 10, "12309756279": 10, "01125121117": 10, "849631989002": 10, "853708791733": 10, "17051515579": 10, "15584001541": 10, "40074241161": 10, "39547519684": 10, "39331197739": 10, "30295038223": 10, "28725762367": 10, "39589118958": 10, "38867840767": 10, "37724158764": 10, "34344320297": 10, "26213116646": 10, "38793599606": 10, "3775359869": 10, "74003200531": 10, "13276162148": 10, "37233917713": 10, "18835201263": 10, "15777277946": 10, "40247042179": 10, "39366400242": 10, "39439997673": 10, "23719043732": 10, "28542718887": 10, "39207677841": 10, "38956804276": 10, "3778496027": 10, "29814395905": 10, "26398081779": 10, "38625922203": 10, "3754431963": 10, "72981758118": 10, "12483196259": 10, "37322881222": 10, "61618566513": 10, "2194111824": 10, "17600002289": 10, "27082881927": 10, "38787200451": 10, "3835711956": 10, "37543039322": 10, "30227203369": 10, "23127679825": 10, "38627202511": 10, "37677440643": 10, "64358406067": 10, "12255358696": 10, "37474560738": 10, "61655673981": 10, "19179515839": 10, "99912958145": 10, "213971138": 10, "16430072784": 10, "38772480488": 10, "3735104084": 10, "54432649612": 10, "05524477959": 10, "36935677528": 10, "42449922562": 10, "10455036163": 10, "67516155243": 10, "programmat": 10, "With": [10, 11], "30": 10, "minimum": 10, "84": 10, "suit": [10, 21], "grid_dim": 10, "associ": 10, "substitut": 10, "ourselv": 10, "extract": 10, "manual": [10, 13], "exlicitli": 10, "accur": [10, 16], "xgpu": 10, "ygpu": 10, "zgpu": 10, "grid_gpu": 10, "80": 10, "133200": 10, "lower": [10, 16, 17], "roughli": [10, 14], "40000": 10, "across": [11, 14], "handl": [11, 21], "backend": [11, 16], "qualiti": 11, "itself": [11, 12, 21], "precis": 11, "plain": 11, "omp_get_wtim": 11, "openmp": 11, "convolution_stream": 11, "complex": [11, 14], "behind": 11, "spread": 11, "back": [11, 21], "split": 11, "chunk": 11, "slightli": [11, 14, 20], "account": [11, 14], "border": [11, 21], "latter": 11, "cudastreamwaitev": 11, "num_stream": 11, "clarifi": 11, "fit": [11, 17], "choic": [11, 13], "grid_size_x": 11, "grid_size_i": 11, "cudamemcpytosymbol": 11, "upload": 11, "yourself": [11, 21], "spent": [11, 21], "relat": [12, 15, 22], "famili": 12, "launcher": 12, "kt": [12, 19], "easiest": 12, "toolkit": [12, 13], "intend": 12, "hip": [12, 21], "Or": [12, 13], "vector_add": [12, 17, 18, 20], "10000000": 12, "512": [12, 18], "research": 12, "cite": 12, "articl": [12, 18], "kerneltun": 12, "author": 12, "ben": 12, "van": 12, "werkhoven": 12, "titl": 12, "auto": [12, 14, 16, 17, 20, 21, 22], "journal": 12, "system": [12, 13, 16], "year": 12, "2019": 12, "volum": 12, "90": 12, "347": 12, "358": 12, "url": 12, "http": [12, 13, 16], "www": 12, "sciencedirect": 12, "com": [12, 13], "scienc": 12, "pii": 12, "s0167739x18313359": 12, "doi": 12, "org": 12, "1016": 12, "2018": 12, "08": 12, "004": 12, "willemsen2021bayesian": 12, "willemsen": [12, 17], "flori": 12, "jan": 12, "nieuwpoort": 12, "rob": 12, "bayesian": [12, 17, 21], "workshop": 12, "pmb": 12, "supercomput": 12, "sc21": 12, "2021": 12, "arxiv": 12, "ab": 12, "2111": 12, "14991": 12, "schoonhoven2022benchmark": 12, "schoonhoven": 12, "richard": 12, "batenburg": 12, "joost": 12, "ieee": 12, "transact": 12, "evolutionari": 12, "2022": 12, "schoonhoven2022go": 12, "veenboer": 12, "bram": 12, "green": 12, "energi": [12, 16, 17, 22], "effici": [12, 14, 16], "steer": 12, "sc22": 12, "2211": 12, "07260": 12, "recommend": [13, 19], "miniconda": 13, "linux": 13, "download": 13, "wget": 13, "repo": 13, "continuum": 13, "io": 13, "miniconda3": 13, "x86_64": 13, "sh": 13, "bash": 13, "nativ": 13, "virtual": 13, "g": [13, 15, 16], "prefix": 13, "home": 13, "pythonpath": 13, "bind": [13, 16], "older": 13, "troubl": 13, "retri": 13, "dir": 13, "wiki": 13, "tiker": 13, "net": 13, "amd": [13, 16], "app": 13, "sdk": 13, "intel": 13, "appl": 13, "beignet": 13, "rocm": [13, 16], "stack": 13, "altern": [13, 21], "jatinx": 13, "navig": 13, "benvanwerkhoven": 13, "cd": 13, "chanc": [13, 17, 20], "known": 14, "algebra": 14, "frequent": 14, "programm": [14, 16], "row": 14, "column": 14, "squar": 14, "matric": 14, "matmul_na": 14, "width": 14, "matmul_kernel": 14, "height": 14, "Of": 14, "solut": [14, 16], "realiti": 14, "contant": 14, "denot": [14, 18, 21], "sensibl": 14, "pick": 14, "word": 14, "warpsiz": 14, "correctli": 14, "namelijk": 14, "stand": 14, "briefli": 14, "figur": 14, "fifth": 14, "fourth": 14, "dramat": 14, "profil": 14, "util": 14, "pretti": 14, "opportun": 14, "realiz": 14, "collabor": 14, "bandwidth": 14, "techniqu": 14, "submatric": 14, "proce": 14, "matmul_shar": 14, "sa": 14, "sb": 14, "kb": 14, "outer": 14, "inner": 14, "race": 14, "drastic": 14, "consumpt": [14, 16], "due": [14, 20, 21], "significantli": [14, 16], "fortun": 14, "benefit": 14, "redund": 14, "distinct": 14, "1xn": 14, "usag": [14, 16], "occup": 14, "goe": 14, "down": 14, "matmul": 14, "newli": 14, "coupl": 14, "respect": [14, 16], "independ": 14, "yield": 14, "discontinu": 14, "room": 14, "impos": 14, "report": [15, 16, 21, 22], "possibli": [15, 21], "_flop": 15, "total_flop": 15, "ps_energi": [15, 16, 22], "occur": [15, 21], "exhaust": 15, "brute": [15, 17, 18], "forc": [15, 17, 18, 20], "maxim": [15, 21], "boolean": [15, 16, 21], "facilit": 16, "layer": 16, "act": 16, "hook": 16, "pattern": 16, "subscrib": 16, "benchmarkobserv": 16, "overwritten": [16, 21], "extend": 16, "mandatori": 16, "get_result": 16, "usual": 16, "aggreg": 16, "after_finish": 16, "after_start": 16, "before_start": 16, "register_devic": 16, "variou": [16, 18], "registerobserv": 16, "counter": 16, "num_reg": 16, "current_modul": 16, "powersensor2": 16, "pcie": 16, "intercept": 16, "sensor": 16, "transmit": 16, "usb": 16, "connect": 16, "advantag": 16, "instantan": 16, "frequenc": 16, "khz": 16, "pybind11": 16, "powersensor": [16, 22], "extern": [16, 20], "ps_power": [16, 22], "joul": [16, 22], "watt": [16, 22], "ttyacm0": 16, "core": 16, "voltag": 16, "thin": 16, "wrapper": [16, 20], "intricaci": 16, "friendli": 16, "repeatedli": 16, "downsid": 16, "approach": 16, "save_al": 16, "nvidia_smi_fallback": 16, "use_locked_clock": 16, "continous_dur": 16, "monitor": 16, "clock": [16, 22], "power_read": [16, 22], "nvml_power": [16, 22], "nvml_energi": [16, 22], "core_freq": [16, 22], "mem_freq": [16, 22], "gr_voltag": 16, "ordin": 16, "identifi": 16, "smi": 16, "root": 16, "privileg": 16, "opt": 16, "amper": 16, "continuous_dur": 16, "common": [16, 20], "cap": 16, "popular": 16, "nvml_gr_clock": [16, 22], "nvml_mem_clock": [16, 22], "nvml_pwr_limit": [16, 22], "graphic": [16, 22], "jetson": 16, "rapl": 16, "xilinx": 16, "pmt": 16, "git": 16, "astron": 16, "nl": 16, "rd": 16, "meter": 16, "arduino": 16, "_energi": 16, "_power": 16, "acceler": 17, "prohibit": 17, "slow": 17, "wast": 17, "basin": [17, 21], "hop": [17, 21], "dual": [17, 21], "anneal": [17, 21], "differenti": [17, 21], "evolut": [17, 21], "firefli": [17, 21], "genet": [17, 21], "greedi": [17, 21], "multi": [17, 21], "particl": [17, 21], "swarm": [17, 21], "mechan": 17, "overrid": 17, "time_limit": [17, 21], "uniqu": [17, 21], "count": 17, "searchspac": 17, "runner": 17, "nelder": 17, "mead": 17, "powel": 17, "cg": 17, "bfg": 17, "l": 17, "tnc": 17, "cobyla": 17, "slsqp": 17, "reject": 17, "thesi": 17, "generate_normalized_param_dict": 17, "denorm": 17, "normalize_parameter_spac": 17, "param_spac": 17, "prune_parameter_spac": 17, "normalize_dict": 17, "prune": 17, "hyperparamet": 17, "via": 17, "popul": 17, "best1bin": 17, "best1exp": 17, "rand1exp": 17, "randtobest1exp": 17, "best2exp": 17, "rand2exp": 17, "randtobest1bin": 17, "best2bin": 17, "rand2bin": 17, "rand1bin": 17, "popsiz": 17, "maxit": 17, "constr": 17, "compute_intens": 17, "fun": 17, "intens": 17, "distance_to": 17, "euclidian": 17, "move_toward": 17, "alpha": 17, "toward": 17, "b0": 17, "attract": 17, "gamma": 17, "light": 17, "absorpt": 17, "coeffici": 17, "disruptive_uniform_crossov": 17, "dna1": 17, "dna2": 17, "disrupt": 17, "uniform": 17, "crossov": 17, "uniformli": 17, "gene": 17, "children": 17, "guarante": 17, "parent": 17, "mutat": 17, "dna": 17, "mutation_ch": 17, "single_point_crossov": 17, "index": 17, "single_point": 17, "two_point": 17, "disruptive_uniform": 17, "two_point_crossov": 17, "uniform_crossov": 17, "weighted_choic": 17, "probabl": [17, 21], "il": 17, "neighbor": 17, "node": 17, "ham": 17, "adjac": 17, "greedy": 17, "soon": 17, "no_improv": 17, "exce": 17, "50": 17, "random_walk": 17, "hillclimb": 17, "travers": 17, "inertia": 17, "c1": 17, "cognit": 17, "c2": 17, "social": 17, "fraction": 17, "acceptance_prob": 17, "old_cost": 17, "new_cost": 17, "modif": [17, 19], "po": 17, "t_min": 17, "001": 17, "995": 17, "vector_add_kernel": 18, "wise": 18, "1000000": [18, 20], "recogn": 18, "alright": 18, "issu": 19, "portabl": 19, "stick": 19, "pointer": 19, "primit": 19, "lead": 19, "ineffici": 19, "situat": 19, "scientif": 19, "sens": 19, "experiment": 19, "pack": 19, "consult": 19, "create_receive_spec_struct": 19, "0l": 19, "pad": 19, "8byte": 19, "packstr": 19, "iiiiiiiiiiippi": 19, "fffi": 19, "nsampl": 19, "nsamplesiq": 19, "nslowtimesampl": 19, "nchannel": 19, "ntx": 19, "nrepeat": 19, "nfasttimesampl": 19, "rfsize": 19, "mnrow": 19, "mnrowsiq": 19, "nactivechannel": 19, "isiq": 19, "fsiq": 19, "fc": 19, "nbuffer": 19, "frombuff": 19, "len": 19, "receive_spec": 19, "bf": 19, "rf": 19, "recon": 19, "sync": 19, "length": 19, "slight": 19, "matlab": 20, "typenam": 20, "my_typ": 20, "linkag": 20, "regardless": 20, "demot": 20, "rewrit": 20, "real": 20, "risk": 20, "isol": 20, "nvrtc": 20, "seper": 20, "grid_div_z": 21, "06": 21, "log": 21, "simulation_mod": 21, "auxilliari": 21, "safer": 21, "notat": 21, "divison": 21, "treat": 21, "warp": 21, "empti": 21, "kepler": 21, "plu": 21, "filter_mod": 21, "address_mod": 21, "clamp": 21, "mirror": 21, "axi": 21, "normalized_coordin": 21, "emtpi": 21, "get_local_s": 21, "satisfi": 21, "000001": 21, "ref": 21, "basinhop": 21, "bayes_opt": 21, "diff_evo": 21, "firefly_algorithm": 21, "genetic_algorithm": 21, "greedy_il": 21, "greedy_ml": 21, "ml": 21, "ordered_greedy_ml": 21, "pso": 21, "simulated_ann": 21, "sort": 21, "resourc": 21, "persist": 21, "consol": 21, "info": 21, "summar": 21, "store_result": 21, "results_filenam": 21, "typicali": 21, "percentag": 21, "create_device_target": 21, "header_filenam": 21, "header": 21, "target": 21, "dtarget_gpu": 21, "name_of_gpu": 21, "chosen": 21, "block_size_": 22, "grid_size_": 22, "compiler_opt_": 22, "loop_unroll_factor_": 22, "nvml_": 22, "nvml": 22, "nvmlobserv": 22}, "objects": {"kernel_tuner.backends.c": [[5, 0, 1, "", "CFunctions"]], "kernel_tuner.backends.c.CFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "cleanup_lib"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.backends.cupy": [[5, 0, 1, "", "CupyFunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.backends.hip": [[5, 0, 1, "", "HipFunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.backends.nvcuda": [[5, 0, 1, "", "CudaFunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.backends.opencl": [[5, 0, 1, "", "OpenCLFunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.backends.pycuda": [[5, 0, 1, "", "PyCudaFunctions"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.core": [[5, 0, 1, "", "DeviceInterface"]], "kernel_tuner.core.DeviceInterface": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "benchmark"], [5, 1, 1, "", "benchmark_continuous"], [5, 1, 1, "", "benchmark_default"], [5, 1, 1, "", "check_kernel_output"], [5, 1, 1, "", "compile_and_benchmark"], [5, 1, 1, "", "compile_kernel"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "create_kernel_instance"], [5, 1, 1, "", "get_environment"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"]], "kernel_tuner": [[21, 2, 1, "", "create_device_targets"], [21, 2, 1, "", "run_kernel"], [21, 2, 1, "", "store_results"], [21, 2, 1, "", "tune_kernel"], [5, 3, 0, "-", "util"]], "kernel_tuner.observers": [[16, 0, 1, "", "BenchmarkObserver"]], "kernel_tuner.observers.BenchmarkObserver": [[16, 1, 1, "", "after_finish"], [16, 1, 1, "", "after_start"], [16, 1, 1, "", "before_start"], [16, 1, 1, "", "during"], [16, 1, 1, "", "get_results"], [16, 1, 1, "", "register_device"]], "kernel_tuner.observers.nvml": [[16, 0, 1, "", "NVMLObserver"]], "kernel_tuner.observers.pmt": [[16, 0, 1, "", "PMTObserver"]], "kernel_tuner.observers.powersensor": [[16, 0, 1, "", "PowerSensorObserver"]], "kernel_tuner.runners.sequential": [[5, 0, 1, "", "SequentialRunner"]], "kernel_tuner.runners.sequential.SequentialRunner": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "run"]], "kernel_tuner.runners.simulation": [[5, 0, 1, "", "SimulationRunner"]], "kernel_tuner.runners.simulation.SimulationRunner": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "run"]], "kernel_tuner.strategies": [[17, 3, 0, "-", "basinhopping"], [17, 3, 0, "-", "bayes_opt"], [17, 3, 0, "-", "brute_force"], [5, 3, 0, "-", "common"], [17, 3, 0, "-", "diff_evo"], [17, 3, 0, "-", "dual_annealing"], [17, 3, 0, "-", "firefly_algorithm"], [17, 3, 0, "-", "genetic_algorithm"], [17, 3, 0, "-", "greedy_ils"], [17, 3, 0, "-", "greedy_mls"], [17, 3, 0, "-", "minimize"], [17, 3, 0, "-", "mls"], [17, 3, 0, "-", "ordered_greedy_mls"], [17, 3, 0, "-", "pso"], [17, 3, 0, "-", "random_sample"], [17, 3, 0, "-", "simulated_annealing"]], "kernel_tuner.strategies.basinhopping": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.bayes_opt": [[17, 2, 1, "", "generate_normalized_param_dicts"], [17, 2, 1, "", "normalize_parameter_space"], [17, 2, 1, "", "prune_parameter_space"], [17, 2, 1, "", "tune"]], "kernel_tuner.strategies.brute_force": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.common": [[5, 2, 1, "", "get_options"], [5, 2, 1, "", "get_strategy_docstring"], [5, 2, 1, "", "make_strategy_options_doc"], [5, 2, 1, "", "scale_from_params"], [5, 2, 1, "", "setup_method_arguments"], [5, 2, 1, "", "setup_method_options"], [5, 2, 1, "", "snap_to_nearest_config"], [5, 2, 1, "", "unscale_and_snap_to_nearest"]], "kernel_tuner.strategies.diff_evo": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.dual_annealing": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm": [[17, 0, 1, "", "Firefly"], [17, 2, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm.Firefly": [[17, 1, 1, "", "compute_intensity"], [17, 1, 1, "", "distance_to"], [17, 1, 1, "", "move_towards"]], "kernel_tuner.strategies.genetic_algorithm": [[17, 2, 1, "", "disruptive_uniform_crossover"], [17, 2, 1, "", "mutate"], [17, 2, 1, "", "single_point_crossover"], [17, 2, 1, "", "tune"], [17, 2, 1, "", "two_point_crossover"], [17, 2, 1, "", "uniform_crossover"], [17, 2, 1, "", "weighted_choice"]], "kernel_tuner.strategies.greedy_ils": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.greedy_mls": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.minimize": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.mls": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.ordered_greedy_mls": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.pso": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.random_sample": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.simulated_annealing": [[17, 2, 1, "", "acceptance_prob"], [17, 2, 1, "", "neighbor"], [17, 2, 1, "", "tune"]], "kernel_tuner.util": [[5, 0, 1, "", "CompilationFailedConfig"], [5, 0, 1, "", "ErrorConfig"], [5, 0, 1, "", "InvalidConfig"], [5, 0, 1, "", "MaxProdConstraint"], [5, 0, 1, "", "NpEncoder"], [5, 0, 1, "", "RuntimeFailedConfig"], [5, 4, 1, "", "SkippableFailure"], [5, 4, 1, "", "StopCriterionReached"], [5, 2, 1, "", "check_argument_list"], [5, 2, 1, "", "check_argument_type"], [5, 2, 1, "", "check_restrictions"], [5, 2, 1, "", "check_stop_criterion"], [5, 2, 1, "", "check_thread_block_dimensions"], [5, 2, 1, "", "check_tune_params_list"], [5, 2, 1, "", "compile_restrictions"], [5, 2, 1, "", "config_valid"], [5, 2, 1, "", "convert_constraint_restriction"], [5, 2, 1, "", "cuda_error_check"], [5, 2, 1, "", "delete_temp_file"], [5, 2, 1, "", "detect_language"], [5, 2, 1, "", "dump_cache"], [5, 2, 1, "", "get_best_config"], [5, 2, 1, "", "get_config_string"], [5, 2, 1, "", "get_grid_dimensions"], [5, 2, 1, "", "get_instance_string"], [5, 2, 1, "", "get_kernel_string"], [5, 2, 1, "", "get_problem_size"], [5, 2, 1, "", "get_smem_args"], [5, 2, 1, "", "get_temp_filename"], [5, 2, 1, "", "get_thread_block_dimensions"], [5, 2, 1, "", "get_total_timings"], [5, 2, 1, "", "looks_like_a_filename"], [5, 2, 1, "", "normalize_verify_function"], [5, 2, 1, "", "parse_restrictions"], [5, 2, 1, "", "prepare_kernel_string"], [5, 2, 1, "", "print_config"], [5, 2, 1, "", "print_config_output"], [5, 2, 1, "", "process_cache"], [5, 2, 1, "", "process_metrics"], [5, 2, 1, "", "read_cache"], [5, 2, 1, "", "read_file"], [5, 2, 1, "", "replace_param_occurrences"], [5, 2, 1, "", "setup_block_and_grid"], [5, 2, 1, "", "store_cache"], [5, 2, 1, "", "write_file"]], "kernel_tuner.util.NpEncoder": [[5, 1, 1, "", "default"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:function", "3": "py:module", "4": "py:exception"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "function", "Python function"], "3": ["py", "module", "Python module"], "4": ["py", "exception", "Python exception"]}, "titleterms": {"cach": 0, "file": 0, "The": [1, 12], "kernel": [1, 6, 7, 8, 9, 10, 12, 14, 20], "tuner": [1, 6, 7, 8, 9, 10, 12], "document": [1, 2, 5, 12, 21], "guid": [1, 2, 13], "featur": 1, "refer": 1, "contribut": 2, "report": 2, "issu": 2, "code": [2, 6, 7, 8, 9, 11], "develop": 2, "setup": 2, "run": [2, 8], "test": [2, 3], "build": 2, "convolut": [3, 9], "2d": 3, "exampl": [3, 9, 12, 20], "implement": [3, 6, 7, 8], "tune": [3, 6, 7, 8, 10, 11, 14, 15, 16], "more": 3, "tunabl": 3, "paramet": [3, 8, 10, 16, 22], "correct": 4, "verif": 4, "design": 5, "strategi": [5, 17], "kernel_tun": [5, 17], "common": 5, "runner": 5, "sequenti": 5, "sequentialrunn": 5, "simulationrunn": 5, "devic": 5, "interfac": 5, "core": 5, "deviceinterfac": 5, "backend": [5, 20], "pycuda": [5, 13], "pycudafunct": 5, "cupi": 5, "cupyfunct": 5, "nvcuda": 5, "cudafunct": 5, "opencl": [5, 13], "openclfunct": 5, "c": [5, 8], "cfunction": 5, "hip": [5, 13], "hipfunct": 5, "util": 5, "function": 5, "diffus": [6, 7, 8], "python": [6, 7, 8, 13], "comput": [6, 7, 8], "gpu": [6, 7, 8, 10], "auto": [6, 7, 8], "us": [6, 7, 8, 10, 14, 19], "share": [6, 7, 8, 14], "memori": [6, 7, 8, 14], "tile": [6, 7, 8], "store": [6, 7], "result": [6, 7], "tutori": [7, 8], "from": [7, 8], "physic": [7, 8], "local": 7, "best": 8, "product": 8, "vector": 9, "add": 9, "stencil": 9, "matrix": [9, 14], "multipl": [9, 14], "py": 9, "sepconv": 9, "convolution_correct": 9, "convolution_stream": 9, "reduct": 9, "spars": 9, "point": 9, "polygon": 9, "expdist": 9, "gener": 9, "3d": 10, "grid": 10, "let": 10, "": 10, "start": [10, 18], "cpu": 10, "move": 10, "optim": [10, 17], "host": 11, "number": 11, "stream": 11, "quick": 12, "instal": [12, 13], "usag": 12, "citat": 12, "packag": 13, "cuda": [13, 14], "pyopencl": 13, "pyhip": 13, "git": 13, "version": 13, "depend": 13, "naiv": 14, "increas": 14, "work": 14, "per": 14, "thread": 14, "metric": 15, "object": 15, "observ": 16, "powersensorobserv": 16, "nvmlobserv": 16, "execut": 16, "nvml": 16, "pmtobserv": 16, "basinhop": 17, "bayes_opt": 17, "brute_forc": 17, "diff_evo": 17, "dual_ann": 17, "firefly_algorithm": 17, "genetic_algorithm": 17, "greedy_il": 17, "greedy_ml": 17, "minim": 17, "ml": 17, "ordered_greedy_ml": 17, "pso": 17, "random_sampl": 17, "simulated_ann": 17, "get": 18, "struct": 19, "templat": 20, "select": 20, "api": 21, "vocabulari": 22}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "nbsphinx": 4, "sphinx": 60}, "alltitles": {"Cache files": [[0, "cache-files"]], "The Kernel Tuner documentation": [[1, "the-kernel-tuner-documentation"], [12, "the-kernel-tuner-documentation"]], "Kernel Tuner": [[1, null]], "Guides": [[1, null]], "Features": [[1, null]], "Reference": [[1, null]], "Contribution guide": [[2, "contribution-guide"]], "Reporting Issues": [[2, "reporting-issues"]], "Contributing Code": [[2, "contributing-code"]], "Development setup": [[2, "development-setup"]], "Running tests": [[2, "running-tests"]], "Building documentation": [[2, "building-documentation"]], "Convolution": [[3, "Convolution"], [9, "convolution"]], "2D Convolution example": [[3, "2D-Convolution-example"]], "Implement a test": [[3, "Implement-a-test"]], "Tuning 2D Convolution": [[3, "Tuning-2D-Convolution"]], "More tunable parameters": [[3, "More-tunable-parameters"]], "Correctness Verification": [[4, "correctness-verification"]], "Design documentation": [[5, "design-documentation"]], "Strategies": [[5, "strategies"]], "kernel_tuner.strategies.common": [[5, "module-kernel_tuner.strategies.common"]], "Runners": [[5, "runners"]], "kernel_tuner.runners.sequential.SequentialRunner": [[5, "kernel-tuner-runners-sequential-sequentialrunner"]], "kernel_tuner.runners.sequential.SimulationRunner": [[5, "kernel-tuner-runners-sequential-simulationrunner"]], "Device Interfaces": [[5, "device-interfaces"]], "kernel_tuner.core.DeviceInterface": [[5, "kernel-tuner-core-deviceinterface"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[5, "kernel-tuner-backends-pycuda-pycudafunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[5, "kernel-tuner-backends-cupy-cupyfunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[5, "kernel-tuner-backends-nvcuda-cudafunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[5, "kernel-tuner-backends-opencl-openclfunctions"]], "kernel_tuner.backends.c.CFunctions": [[5, "kernel-tuner-backends-c-cfunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[5, "kernel-tuner-backends-hip-hipfunctions"]], "Util Functions": [[5, "util-functions"]], "kernel_tuner.util": [[5, "module-kernel_tuner.util"]], "Diffusion": [[6, "Diffusion"], [6, "id1"], [7, "Diffusion"], [8, "Diffusion"]], "Python implementation": [[6, "Python-implementation"], [7, "Python-implementation"], [8, "Python-implementation"]], "Computing on the GPU": [[6, "Computing-on-the-GPU"], [7, "Computing-on-the-GPU"], [8, "Computing-on-the-GPU"]], "Auto-Tuning with the Kernel Tuner": [[6, "Auto-Tuning-with-the-Kernel-Tuner"], [7, "Auto-Tuning-with-the-Kernel-Tuner"], [8, "Auto-Tuning-with-the-Kernel-Tuner"]], "Using Shared Memory": [[6, "Using-Shared-Memory"]], "Tiling GPU Code": [[6, "Tiling-GPU-Code"], [7, "Tiling-GPU-Code"], [8, "Tiling-GPU-Code"]], "Storing the results": [[6, "Storing-the-results"], [7, "Storing-the-results"]], "Tutorial: From physics to tuned GPU kernels": [[7, "Tutorial:-From-physics-to-tuned-GPU-kernels"], [8, "Tutorial:-From-physics-to-tuned-GPU-kernels"]], "Using Shared (local) Memory": [[7, "Using-Shared-(local)-Memory"]], "Using shared memory": [[8, "Using-shared-memory"], [14, "Using-shared-memory"]], "Using the best parameters in a production run": [[8, "Using-the-best-parameters-in-a-production-run"]], "Python run": [[8, "Python-run"]], "C run": [[8, "C-run"]], "Kernel Tuner Examples": [[9, "kernel-tuner-examples"]], "Vector Add": [[9, "vector-add"]], "Stencil": [[9, "stencil"]], "Matrix Multiplication": [[9, "matrix-multiplication"]], "convolution.py": [[9, "convolution-py"]], "sepconv.py": [[9, "sepconv-py"]], "convolution_correct.py": [[9, "convolution-correct-py"]], "convolution_streams.py": [[9, "convolution-streams-py"]], "Reduction": [[9, "reduction"]], "Sparse Matrix Vector Multiplication": [[9, "sparse-matrix-vector-multiplication"]], "Point-in-Polygon": [[9, "point-in-polygon"]], "ExpDist": [[9, "expdist"]], "Code Generator": [[9, "code-generator"]], "3D Grid on GPU with Kernel Tuner": [[10, "3D-Grid-on-GPU-with-Kernel-Tuner"]], "Let\u2019s start on the CPU": [[10, "Let's-start-on-the-CPU"]], "Let\u2019s move to the GPU": [[10, "Let's-move-to-the-GPU"]], "Tune the kernel": [[10, "Tune-the-kernel"]], "Using the optimized parameters": [[10, "Using-the-optimized-parameters"]], "Tuning Host Code": [[11, "tuning-host-code"]], "Tuning the number of streams": [[11, "tuning-the-number-of-streams"]], "Quick install": [[12, "quick-install"]], "Example usage": [[12, "example-usage"]], "Citation": [[12, "citation"]], "Installation": [[13, "installation"]], "Python": [[13, "python"]], "Installing Python Packages": [[13, "installing-python-packages"]], "CUDA and PyCUDA": [[13, "cuda-and-pycuda"]], "OpenCL and PyOpenCL": [[13, "opencl-and-pyopencl"]], "HIP and PyHIP": [[13, "hip-and-pyhip"]], "Installing the git version": [[13, "installing-the-git-version"]], "Dependencies for the guides": [[13, "dependencies-for-the-guides"]], "Matrix multiplication": [[14, "Matrix-multiplication"]], "Naive CUDA kernel": [[14, "Naive-CUDA-kernel"]], "Tuning a naive kernel": [[14, "Tuning-a-naive-kernel"]], "Increase work per thread": [[14, "Increase-work-per-thread"]], "Metrics and Objectives": [[15, "metrics-and-objectives"]], "Metrics": [[15, "metrics"]], "Tuning Objectives": [[15, "tuning-objectives"]], "Observers": [[16, "observers"]], "PowerSensorObserver": [[16, "powersensorobserver"]], "NVMLObserver": [[16, "nvmlobserver"]], "Tuning execution parameters with NVML": [[16, "tuning-execution-parameters-with-nvml"]], "PMTObserver": [[16, "pmtobserver"]], "Optimization strategies": [[17, "optimization-strategies"]], "kernel_tuner.strategies.basinhopping": [[17, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[17, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[17, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.diff_evo": [[17, "module-kernel_tuner.strategies.diff_evo"]], "kernel_tuner.strategies.dual_annealing": [[17, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[17, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[17, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[17, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[17, "module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[17, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[17, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[17, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[17, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[17, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[17, "module-kernel_tuner.strategies.simulated_annealing"]], "Getting Started": [[18, "getting-started"]], "Using structs": [[19, "using-structs"]], "Templated kernels": [[20, "templated-kernels"]], "Example": [[20, "example"]], "Selecting a backend": [[20, "selecting-a-backend"]], "API Documentation": [[21, "api-documentation"]], "Parameter Vocabulary": [[22, "parameter-vocabulary"]]}, "indexentries": {"cfunctions (class in kernel_tuner.backends.c)": [[5, "kernel_tuner.backends.c.CFunctions"]], "compilationfailedconfig (class in kernel_tuner.util)": [[5, "kernel_tuner.util.CompilationFailedConfig"]], "cudafunctions (class in kernel_tuner.backends.nvcuda)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions"]], "cupyfunctions (class in kernel_tuner.backends.cupy)": [[5, "kernel_tuner.backends.cupy.CupyFunctions"]], "deviceinterface (class in kernel_tuner.core)": [[5, "kernel_tuner.core.DeviceInterface"]], "errorconfig (class in kernel_tuner.util)": [[5, "kernel_tuner.util.ErrorConfig"]], "hipfunctions (class in kernel_tuner.backends.hip)": [[5, "kernel_tuner.backends.hip.HipFunctions"]], "invalidconfig (class in kernel_tuner.util)": [[5, "kernel_tuner.util.InvalidConfig"]], "maxprodconstraint (class in kernel_tuner.util)": [[5, "kernel_tuner.util.MaxProdConstraint"]], "npencoder (class in kernel_tuner.util)": [[5, "kernel_tuner.util.NpEncoder"]], "openclfunctions (class in kernel_tuner.backends.opencl)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions"]], "pycudafunctions (class in kernel_tuner.backends.pycuda)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions"]], "runtimefailedconfig (class in kernel_tuner.util)": [[5, "kernel_tuner.util.RuntimeFailedConfig"]], "sequentialrunner (class in kernel_tuner.runners.sequential)": [[5, "kernel_tuner.runners.sequential.SequentialRunner"]], "simulationrunner (class in kernel_tuner.runners.simulation)": [[5, "kernel_tuner.runners.simulation.SimulationRunner"]], "skippablefailure": [[5, "kernel_tuner.util.SkippableFailure"]], "stopcriterionreached": [[5, "kernel_tuner.util.StopCriterionReached"]], "__init__() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.__init__"]], "__init__() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.__init__"]], "__init__() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.__init__"]], "__init__() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.__init__"]], "__init__() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.__init__"]], "__init__() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.__init__"]], "__init__() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.__init__"]], "__init__() (kernel_tuner.runners.sequential.sequentialrunner method)": [[5, "kernel_tuner.runners.sequential.SequentialRunner.__init__"]], "__init__() (kernel_tuner.runners.simulation.simulationrunner method)": [[5, "kernel_tuner.runners.simulation.SimulationRunner.__init__"]], "benchmark() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.benchmark"]], "benchmark_continuous() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.benchmark_continuous"]], "benchmark_default() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.benchmark_default"]], "check_argument_list() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_argument_list"]], "check_argument_type() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_argument_type"]], "check_kernel_output() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.check_kernel_output"]], "check_restrictions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_restrictions"]], "check_stop_criterion() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_stop_criterion"]], "check_thread_block_dimensions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_thread_block_dimensions"]], "check_tune_params_list() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_tune_params_list"]], "cleanup_lib() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.cleanup_lib"]], "compile() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.compile"]], "compile() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.compile"]], "compile() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.compile"]], "compile() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.compile"]], "compile() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.compile"]], "compile() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.compile"]], "compile_and_benchmark() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.compile_and_benchmark"]], "compile_kernel() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.compile_kernel"]], "compile_restrictions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.compile_restrictions"]], "config_valid() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.config_valid"]], "convert_constraint_restriction() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.convert_constraint_restriction"]], "copy_constant_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.copy_constant_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.copy_shared_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.copy_texture_memory_args"]], "create_kernel_instance() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.create_kernel_instance"]], "cuda_error_check() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.cuda_error_check"]], "default() (kernel_tuner.util.npencoder method)": [[5, "kernel_tuner.util.NpEncoder.default"]], "delete_temp_file() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.delete_temp_file"]], "detect_language() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.detect_language"]], "dump_cache() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.dump_cache"]], "get_best_config() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_best_config"]], "get_config_string() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_config_string"]], "get_environment() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.get_environment"]], "get_grid_dimensions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_grid_dimensions"]], "get_instance_string() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_instance_string"]], "get_kernel_string() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_kernel_string"]], "get_options() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.get_options"]], "get_problem_size() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_problem_size"]], "get_smem_args() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_smem_args"]], "get_strategy_docstring() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.get_strategy_docstring"]], "get_temp_filename() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_temp_filename"]], "get_thread_block_dimensions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_thread_block_dimensions"]], "get_total_timings() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_total_timings"]], "kernel_finished() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.kernel_finished"]], "kernel_tuner.strategies.common": [[5, "module-kernel_tuner.strategies.common"]], "kernel_tuner.util": [[5, "module-kernel_tuner.util"]], "looks_like_a_filename() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.looks_like_a_filename"]], "make_strategy_options_doc() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.make_strategy_options_doc"]], "memcpy_dtoh() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.memcpy_dtoh"]], "memcpy_htod() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_htod"]], "memset() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.memset"]], "memset() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.memset"]], "memset() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.memset"]], "memset() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.memset"]], "memset() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.memset"]], "memset() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.memset"]], "module": [[5, "module-kernel_tuner.strategies.common"], [5, "module-kernel_tuner.util"], [17, "module-kernel_tuner.strategies.basinhopping"], [17, "module-kernel_tuner.strategies.bayes_opt"], [17, "module-kernel_tuner.strategies.brute_force"], [17, "module-kernel_tuner.strategies.diff_evo"], [17, "module-kernel_tuner.strategies.dual_annealing"], [17, "module-kernel_tuner.strategies.firefly_algorithm"], [17, "module-kernel_tuner.strategies.genetic_algorithm"], [17, "module-kernel_tuner.strategies.greedy_ils"], [17, "module-kernel_tuner.strategies.greedy_mls"], [17, "module-kernel_tuner.strategies.minimize"], [17, "module-kernel_tuner.strategies.mls"], [17, "module-kernel_tuner.strategies.ordered_greedy_mls"], [17, "module-kernel_tuner.strategies.pso"], [17, "module-kernel_tuner.strategies.random_sample"], [17, "module-kernel_tuner.strategies.simulated_annealing"]], "normalize_verify_function() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.normalize_verify_function"]], "parse_restrictions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.parse_restrictions"]], "prepare_kernel_string() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.prepare_kernel_string"]], "print_config() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.print_config"]], "print_config_output() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.print_config_output"]], "process_cache() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.process_cache"]], "process_metrics() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.process_metrics"]], "read_cache() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.read_cache"]], "read_file() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.read_file"]], "ready_argument_list() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.ready_argument_list"]], "replace_param_occurrences() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.replace_param_occurrences"]], "run() (kernel_tuner.runners.sequential.sequentialrunner method)": [[5, "kernel_tuner.runners.sequential.SequentialRunner.run"]], "run() (kernel_tuner.runners.simulation.simulationrunner method)": [[5, "kernel_tuner.runners.simulation.SimulationRunner.run"]], "run_kernel() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.run_kernel"]], "run_kernel() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.run_kernel"]], "scale_from_params() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.scale_from_params"]], "setup_block_and_grid() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.setup_block_and_grid"]], "setup_method_arguments() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.setup_method_arguments"]], "setup_method_options() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.setup_method_options"]], "snap_to_nearest_config() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.snap_to_nearest_config"]], "start_event() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.start_event"]], "start_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.start_event"]], "start_event() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.start_event"]], "start_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.start_event"]], "start_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.start_event"]], "start_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.start_event"]], "stop_event() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.stop_event"]], "store_cache() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.store_cache"]], "synchronize() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.synchronize"]], "unscale_and_snap_to_nearest() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.unscale_and_snap_to_nearest"]], "write_file() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.write_file"]], "benchmarkobserver (class in kernel_tuner.observers)": [[16, "kernel_tuner.observers.BenchmarkObserver"]], "nvmlobserver (class in kernel_tuner.observers.nvml)": [[16, "kernel_tuner.observers.nvml.NVMLObserver"]], "pmtobserver (class in kernel_tuner.observers.pmt)": [[16, "kernel_tuner.observers.pmt.PMTObserver"]], "powersensorobserver (class in kernel_tuner.observers.powersensor)": [[16, "kernel_tuner.observers.powersensor.PowerSensorObserver"]], "after_finish() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.after_finish"]], "after_start() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.after_start"]], "before_start() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.before_start"]], "during() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.during"]], "get_results() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.get_results"]], "register_device() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.register_device"]], "firefly (class in kernel_tuner.strategies.firefly_algorithm)": [[17, "kernel_tuner.strategies.firefly_algorithm.Firefly"]], "acceptance_prob() (in module kernel_tuner.strategies.simulated_annealing)": [[17, "kernel_tuner.strategies.simulated_annealing.acceptance_prob"]], "compute_intensity() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[17, "kernel_tuner.strategies.firefly_algorithm.Firefly.compute_intensity"]], "disruptive_uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.disruptive_uniform_crossover"]], "distance_to() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[17, "kernel_tuner.strategies.firefly_algorithm.Firefly.distance_to"]], "generate_normalized_param_dicts() (in module kernel_tuner.strategies.bayes_opt)": [[17, "kernel_tuner.strategies.bayes_opt.generate_normalized_param_dicts"]], "kernel_tuner.strategies.basinhopping": [[17, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[17, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[17, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.diff_evo": [[17, "module-kernel_tuner.strategies.diff_evo"]], "kernel_tuner.strategies.dual_annealing": [[17, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[17, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[17, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[17, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[17, "module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[17, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[17, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[17, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[17, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[17, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[17, "module-kernel_tuner.strategies.simulated_annealing"]], "move_towards() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[17, "kernel_tuner.strategies.firefly_algorithm.Firefly.move_towards"]], "mutate() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.mutate"]], "neighbor() (in module kernel_tuner.strategies.simulated_annealing)": [[17, "kernel_tuner.strategies.simulated_annealing.neighbor"]], "normalize_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[17, "kernel_tuner.strategies.bayes_opt.normalize_parameter_space"]], "prune_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[17, "kernel_tuner.strategies.bayes_opt.prune_parameter_space"]], "single_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.single_point_crossover"]], "tune() (in module kernel_tuner.strategies.basinhopping)": [[17, "kernel_tuner.strategies.basinhopping.tune"]], "tune() (in module kernel_tuner.strategies.bayes_opt)": [[17, "kernel_tuner.strategies.bayes_opt.tune"]], "tune() (in module kernel_tuner.strategies.brute_force)": [[17, "kernel_tuner.strategies.brute_force.tune"]], "tune() (in module kernel_tuner.strategies.diff_evo)": [[17, "kernel_tuner.strategies.diff_evo.tune"]], "tune() (in module kernel_tuner.strategies.dual_annealing)": [[17, "kernel_tuner.strategies.dual_annealing.tune"]], "tune() (in module kernel_tuner.strategies.firefly_algorithm)": [[17, "kernel_tuner.strategies.firefly_algorithm.tune"]], "tune() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.tune"]], "tune() (in module kernel_tuner.strategies.greedy_ils)": [[17, "kernel_tuner.strategies.greedy_ils.tune"]], "tune() (in module kernel_tuner.strategies.greedy_mls)": [[17, "kernel_tuner.strategies.greedy_mls.tune"]], "tune() (in module kernel_tuner.strategies.minimize)": [[17, "kernel_tuner.strategies.minimize.tune"]], "tune() (in module kernel_tuner.strategies.mls)": [[17, "kernel_tuner.strategies.mls.tune"]], "tune() (in module kernel_tuner.strategies.ordered_greedy_mls)": [[17, "kernel_tuner.strategies.ordered_greedy_mls.tune"]], "tune() (in module kernel_tuner.strategies.pso)": [[17, "kernel_tuner.strategies.pso.tune"]], "tune() (in module kernel_tuner.strategies.random_sample)": [[17, "kernel_tuner.strategies.random_sample.tune"]], "tune() (in module kernel_tuner.strategies.simulated_annealing)": [[17, "kernel_tuner.strategies.simulated_annealing.tune"]], "two_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.two_point_crossover"]], "uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.uniform_crossover"]], "weighted_choice() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.weighted_choice"]], "create_device_targets() (in module kernel_tuner)": [[21, "kernel_tuner.create_device_targets"]], "run_kernel() (in module kernel_tuner)": [[21, "kernel_tuner.run_kernel"]], "store_results() (in module kernel_tuner)": [[21, "kernel_tuner.store_results"]], "tune_kernel() (in module kernel_tuner)": [[21, "kernel_tuner.tune_kernel"]]}}) \ No newline at end of file +Search.setIndex({"docnames": ["cache_files", "contents", "contributing", "convolution", "correctness", "design", "diffusion", "diffusion_opencl", "diffusion_use_optparam", "examples", "grid3d", "hostcode", "index", "install", "matrix_multiplication", "metrics", "observers", "optimization", "quickstart", "structs", "templates", "user-api", "vocabulary"], "filenames": ["cache_files.rst", "contents.rst", "contributing.rst", "convolution.ipynb", "correctness.rst", "design.rst", "diffusion.ipynb", "diffusion_opencl.ipynb", "diffusion_use_optparam.ipynb", "examples.rst", "grid3d.ipynb", "hostcode.rst", "index.rst", "install.rst", "matrix_multiplication.ipynb", "metrics.rst", "observers.rst", "optimization.rst", "quickstart.rst", "structs.rst", "templates.rst", "user-api.rst", "vocabulary.rst"], "titles": ["Cache files", "The Kernel Tuner documentation", "Contribution guide", "Convolution", "Correctness Verification", "Design documentation", "Diffusion", "Tutorial: From physics to tuned GPU kernels", "Tutorial: From physics to tuned GPU kernels", "Kernel Tuner Examples", "3D Grid on GPU with Kernel Tuner", "Tuning Host Code", "The Kernel Tuner documentation", "Installation", "Matrix multiplication", "Metrics and Objectives", "Observers", "Optimization strategies", "Getting Started", "Using structs", "Templated kernels", "API Documentation", "Parameter Vocabulary"], "terms": {"A": [0, 3, 5, 12, 13, 14, 16, 17, 21], "veri": [0, 4, 6, 7, 8, 11, 13, 14, 16, 19, 20], "us": [0, 1, 2, 3, 4, 5, 9, 11, 12, 13, 15, 16, 17, 18, 20, 21, 22], "featur": [0, 3, 4, 9, 13, 15, 16, 18, 20, 21], "kernel": [0, 2, 3, 4, 5, 11, 13, 15, 16, 17, 18, 19, 21, 22], "tuner": [0, 2, 3, 4, 5, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "i": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "abil": 0, "store": [0, 2, 3, 5, 8, 14, 16, 18, 21], "benchmark": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 17, 18, 21, 22], "result": [0, 2, 3, 4, 5, 8, 10, 14, 15, 16, 17, 18, 21, 22], "dure": [0, 5, 6, 7, 8, 10, 16, 21], "tune": [0, 1, 4, 5, 9, 12, 13, 17, 18, 20, 21, 22], "you": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22], "can": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "enabl": [0, 16, 17, 19, 20], "pass": [0, 2, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18, 20, 21], "ani": [0, 2, 3, 5, 6, 7, 8, 11, 14, 15, 16, 17, 19, 20, 21, 22], "filenam": [0, 3, 5, 9, 14, 18, 21], "option": [0, 3, 4, 5, 6, 7, 8, 9, 11, 13, 14, 15, 16, 17, 20, 21, 22], "argument": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 21], "tune_kernel": [0, 3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 15, 17, 18, 19, 20, 21], "The": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21], "individu": [0, 16, 17], "configur": [0, 3, 5, 6, 7, 8, 9, 10, 14, 15, 16, 17, 21], "ar": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "append": [0, 5, 21], "run": [0, 3, 4, 5, 6, 7, 10, 11, 13, 14, 16, 17, 21], "thi": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "also": [0, 2, 3, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "allow": [0, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 20, 21], "restart": [0, 6, 7, 8, 17], "session": [0, 5, 17], "from": [0, 3, 4, 5, 6, 9, 10, 11, 13, 14, 16, 17, 19, 20, 21], "an": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], "exist": [0, 5, 21], "should": [0, 2, 3, 4, 5, 6, 7, 8, 11, 14, 15, 16, 18, 21], "someth": [0, 3, 6, 7, 8, 14], "have": [0, 2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21, 22], "termin": [0, 13], "previou": [0, 6, 7, 8, 17, 21], "befor": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 16, 17, 21], "had": [0, 3], "complet": [0, 3], "happen": [0, 2, 3, 14, 18], "quit": [0, 6, 7, 8, 10, 14, 20], "often": [0, 6, 7, 8, 16], "hpc": 0, "environ": [0, 3, 5, 13, 17, 21], "when": [0, 2, 3, 5, 6, 7, 8, 11, 13, 14, 15, 16, 17, 19, 20, 21, 22], "job": 0, "reserv": [0, 7, 22], "out": [0, 3, 4, 10, 14], "number": [0, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 15, 16, 17, 18, 19, 21, 22], "other": [0, 3, 5, 6, 7, 8, 11, 14, 15, 16, 17, 21, 22], "simul": [0, 5, 8, 12, 17, 19, 21], "visual": [0, 14], "optim": [0, 1, 3, 4, 5, 6, 7, 8, 11, 12, 14, 15, 16, 21], "strategi": [0, 1, 3, 15, 21], "start": [0, 1, 3, 4, 5, 6, 7, 8, 11, 13, 14, 16, 17, 21], "call": [0, 3, 4, 5, 6, 7, 8, 10, 11, 14, 16, 17, 18, 19, 20, 21], "contain": [0, 3, 5, 6, 7, 8, 10, 11, 14, 16, 17, 20, 21], "full": [0, 5, 13, 16, 18], "search": [0, 3, 5, 9, 12, 14, 15, 17, 21], "space": [0, 3, 4, 5, 10, 11, 14, 15, 17, 21], "true": [0, 3, 4, 5, 6, 7, 8, 11, 14, 16, 17, 21], "creat": [0, 2, 3, 5, 6, 7, 8, 10, 14, 16, 18, 19, 21], "even": [0, 2, 6, 7, 8, 11, 14, 17], "work": [0, 2, 3, 5, 6, 7, 8, 13, 15, 17, 20, 21], "while": [0, 3, 5, 6, 7, 8, 9, 14, 16, 17], "still": [0, 2, 4, 14], "As": [0, 3, 6, 7, 8, 10, 13, 14, 16], "new": [0, 2, 5, 6, 7, 8, 17, 21], "come": [0, 5, 6, 7, 8, 14, 16, 20], "thei": [0, 5, 6, 7, 8, 9, 14, 15], "stream": [0, 5, 6, 7, 8], "pleas": [0, 2, 3, 12, 13, 16, 18, 19, 21], "see": [0, 2, 3, 5, 6, 7, 8, 10, 11, 13, 14, 16, 18, 20, 21], "dashboard": [0, 12], "introduct": 1, "instal": [1, 2, 3, 6, 7, 8, 10, 11, 14, 16, 18], "get": [1, 3, 5, 6, 7, 8, 10, 13, 14], "convolut": [1, 4, 11, 14], "diffus": 1, "matrix": 1, "multipl": [1, 5, 11, 16, 20, 21], "exampl": [1, 2, 4, 5, 6, 7, 8, 11, 13, 14, 15, 16, 17, 18, 19, 21], "cach": [1, 5, 6, 7, 8, 13, 14, 17, 21], "file": [1, 2, 3, 5, 6, 7, 9, 11, 14, 17, 18, 20, 21], "correct": [1, 11, 19, 21], "verif": [1, 9, 21], "host": [1, 2, 5, 7, 8, 9, 16, 19, 20, 21], "code": [1, 3, 5, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "struct": 1, "templat": [1, 10], "metric": [1, 3, 5, 9, 14, 21], "object": [1, 3, 4, 5, 6, 7, 8, 17, 21], "observ": [1, 5, 15, 21, 22], "api": [1, 3, 5], "paramet": [1, 4, 5, 6, 7, 9, 11, 14, 15, 17, 18, 19, 20, 21], "vocabulari": [1, 16, 18], "design": [1, 2, 6, 7, 8, 16], "contribut": 1, "thank": 2, "consid": [2, 10, 12, 14, 21], "Not": 2, "all": [2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 21], "help": [2, 20], "u": [2, 3, 6, 7, 8], "improv": [2, 5, 6, 7, 8, 14, 17, 21], "about": [2, 3, 5, 6, 7, 8, 12, 14, 16, 17, 18, 21], "problem": [2, 3, 5, 6, 7, 8, 9, 10, 11, 14, 21], "ensur": [2, 4, 6, 7, 8, 11, 13, 16, 19], "follow": [2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 20, 21], "describ": [2, 3, 5, 11, 16, 19], "what": [2, 3, 4, 5, 6, 7, 8, 11, 14, 16, 18, 19, 20, 21, 22], "expect": [2, 3, 4, 5, 6, 7, 8, 14, 16, 21], "If": [2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 19, 21], "possibl": [2, 3, 4, 6, 7, 8, 10, 11, 14, 16, 17, 18, 19, 21], "includ": [2, 3, 4, 6, 7, 8, 10, 11, 13, 14, 16, 20, 21], "minim": [2, 15, 20, 21], "reproduc": 2, "actual": [2, 3, 4, 5, 6, 7, 8, 10, 14, 20], "output": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 18, 21, 22], "error": [2, 3, 4, 5, 11, 14, 20], "print": [2, 3, 5, 6, 7, 8, 10, 14, 21], "list": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 21], "version": [2, 3, 14, 16, 21], "python": [2, 3, 5, 9, 10, 11, 14, 16, 18, 19, 20, 21], "cuda": [2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 16, 18, 19, 20, 21], "opencl": [2, 3, 6, 7, 8, 9, 11, 12, 14, 21], "c": [2, 3, 9, 11, 12, 13, 14, 18, 20, 21], "compil": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21, 22], "applic": [2, 3, 6, 7, 8, 9, 10, 11, 12, 15, 16, 19, 20, 21], "For": [2, 3, 4, 5, 6, 7, 8, 10, 13, 16, 18, 19, 21], "select": [2, 3, 5, 6, 7, 8, 10, 14, 16, 17, 21], "propos": 2, "chang": [2, 10, 16, 21], "addit": [2, 3, 6, 7, 8, 13, 15, 18], "signific": 2, "requir": [2, 3, 5, 6, 7, 8, 10, 11, 13, 14, 16, 20], "first": [2, 3, 4, 6, 7, 8, 10, 11, 12, 13, 14, 15, 17, 19, 20, 21], "discuss": [2, 5], "Then": [2, 6, 7, 8, 10, 12, 13, 20], "fork": 2, "repositori": [2, 3, 6, 7, 8, 10, 12, 13, 14], "branch": 2, "one": [2, 3, 5, 6, 7, 8, 10, 13, 14, 16, 17, 21], "per": [2, 3, 6, 7, 8, 10, 15, 16, 21], "pull": 2, "request": [2, 16, 21], "googl": 2, "style": 2, "sphinxdoc": 2, "docstr": [2, 5], "modul": [2, 5, 11, 16], "public": [2, 12], "function": [2, 3, 4, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 21], "pylint": 2, "check": [2, 4, 5, 6, 7, 8, 11, 14], "your": [2, 3, 6, 7, 8, 10, 11, 12, 13, 16, 19, 21], "written": [2, 20], "unit": [2, 5], "produc": [2, 4], "same": [2, 3, 4, 5, 6, 7, 8, 10, 11, 16, 18, 21], "better": [2, 6, 7, 8], "compat": [2, 5, 13], "3": [2, 4, 6, 7, 8, 10, 11, 13, 14, 17, 21], "5": [2, 6, 7, 8, 10, 17], "newer": [2, 13, 16], "entri": [2, 5, 6, 7], "changelog": 2, "md": 2, "match": [2, 3, 4, 5], "roadmap": 2, "updat": [2, 5], "remov": [2, 17], "doubt": 2, "where": [2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 19, 20, 21], "put": [2, 5, 6, 7, 8], "look": [2, 3, 5, 6, 7, 8, 10, 13, 14, 20], "regard": [2, 5, 17], "packag": 2, "pip": [2, 3, 6, 7, 12, 13, 14], "e": [2, 13, 15, 16, 17, 21], "dev": [2, 13, 16], "after": [2, 3, 4, 5, 6, 7, 8, 11, 13, 14, 16, 21], "command": [2, 13], "abl": [2, 3, 5, 6, 7, 8], "below": [2, 8, 9, 10, 11, 13, 14, 15, 16, 17, 19], "how": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 18, 19, 20, 21], "do": [2, 3, 5, 6, 7, 8, 10, 11, 14, 21], "flag": 2, "mode": [2, 16], "mean": [2, 3, 11, 14, 15, 17, 19, 20, 22], "copi": [2, 5, 6, 7, 8, 18, 21], "link": 2, "track": [2, 16], "sourc": [2, 3, 5, 6, 7, 8, 10, 11, 13, 14, 16, 20, 21], "To": [2, 4, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21], "pytest": 2, "v": [2, 5, 6, 7, 8, 10], "top": [2, 5, 10, 16, 21], "level": [2, 5, 16], "directori": [2, 3, 6, 7, 8, 10, 13, 14], "note": [2, 3, 5, 6, 7, 8, 10, 13, 14, 16, 19, 21], "pycuda": [2, 6, 8, 10, 11, 16, 20], "capabl": [2, 5, 6, 7, 14, 21], "gpu": [2, 3, 4, 5, 9, 11, 12, 14, 16, 18, 19, 21, 22], "skip": [2, 3, 6, 7, 8, 21], "present": [2, 14], "hold": [2, 6, 7, 14, 18, 19, 21], "pyopencl": [2, 5, 7, 16], "cupi": [2, 16, 20, 21], "nvidia": [2, 5, 13, 14, 16, 20], "make": [2, 3, 6, 7, 8, 10, 12, 13, 14, 16, 19, 20], "break": [2, 20], "cannot": [2, 6, 7, 8, 16], "them": [2, 3, 8, 10, 11, 14], "local": [2, 17, 21], "seen": [2, 3, 5, 14], "integr": [2, 20], "locat": [2, 4, 10, 16], "doc": [2, 3, 6, 7, 8, 10, 13, 14], "type": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 16, 17, 18, 19, 20, 21], "html": 2, "gener": [2, 3, 5, 6, 7, 8, 12, 14, 16, 17, 19, 21, 22], "page": [2, 3, 6, 7, 8, 9, 10, 12, 14, 15], "inspect": [2, 5, 16], "commit": 2, "brows": 2, "through": [2, 5, 6, 7, 8, 10, 12, 15, 16, 17, 21], "sure": [2, 3, 6, 7, 8, 12, 13, 14], "depend": [2, 3, 4, 8, 9, 10, 12, 15, 21], "extra": [2, 20], "pandoc": 2, "ubuntu": 2, "sudo": [2, 13], "apt": 2, "differ": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 21], "": [2, 3, 5, 6, 7, 8, 9, 11, 13, 14, 15, 16, 18, 19, 20, 21], "onlin": 2, "built": [2, 16, 17, 19, 21], "automat": [2, 3, 6, 7, 8, 10, 14, 20, 21], "github": [2, 3, 6, 7, 8, 10, 13, 14], "action": 2, "correspond": [2, 3, 6, 7, 8, 10, 16, 17, 18], "master": 2, "latest": [2, 13], "last": [2, 5, 19], "releas": [2, 5], "stabl": 2, "publish": [2, 12], "point": [2, 3, 5, 6, 7, 8, 10, 11, 14, 15, 16, 18, 21], "process": [2, 3, 5, 6, 7, 8, 14, 15, 16, 17, 20], "again": [2, 3, 6, 7, 8, 10, 14], "fulli": [2, 13], "autom": 2, "guid": [3, 6, 14, 15, 18], "meant": 3, "write": [3, 9, 10, 14, 20, 21], "script": [3, 5, 14, 19, 20], "we": [3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 19, 20], "ll": [3, 6, 7, 8, 13, 14], "simpl": [3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 17, 18, 19], "find": [3, 11, 14, 17, 21], "shortli": 3, "much": [3, 6, 7, 8, 10, 16, 20, 21], "reus": [3, 6, 7, 8, 14], "read": [3, 4, 5, 6, 7, 8, 10, 11, 14, 16, 21], "document": [3, 4, 6, 7, 8, 10, 13, 14, 19, 22], "jupyt": [3, 6, 7, 8, 10, 13, 14], "notebook": [3, 6, 7, 8, 10, 13, 14], "just": [3, 4, 5, 6, 7, 8, 10, 11, 13, 14], "clone": [3, 6, 7, 8, 10, 13, 14], "tutori": [3, 6, 10, 12, 13, 14], "re": [3, 6, 7, 8, 10, 14], "readi": [3, 5, 6, 7, 8, 10, 14], "go": [3, 6, 7, 8, 10, 12, 14, 18], "kernel_tun": [3, 4, 6, 7, 8, 10, 11, 12, 13, 14, 16, 18, 19, 20, 21, 22], "oper": [3, 6, 7, 8, 10, 11, 14, 15], "essenti": 3, "signal": [3, 22], "imag": [3, 6, 7, 8], "main": [3, 5, 10, 16, 18], "neural": 3, "network": 3, "deep": 3, "learn": 3, "comput": [3, 4, 5, 9, 10, 11, 12, 14, 17, 21], "linear": [3, 14, 21], "combin": [3, 5, 6, 7, 8, 9, 10, 14, 16, 17, 18, 21], "weight": [3, 17], "filter": [3, 4, 9, 11], "rang": [3, 4, 6, 7, 8, 10, 11, 20], "pixel": 3, "input": [3, 4, 6, 7, 8, 9, 11, 14, 15, 18, 19, 21], "each": [3, 4, 5, 6, 7, 10, 14, 16, 17, 21], "size": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 17, 18, 20, 21], "w": [3, 6, 7, 15, 17], "time": [3, 5, 6, 7, 8, 10, 11, 14, 15, 16, 17, 20, 21, 22], "h": [3, 10, 21], "f": [3, 4, 10, 11, 19], "f_w": 3, "f_h": 3, "o": [3, 5], "begin": [3, 6, 7, 8, 10], "equat": [3, 6, 7, 8, 10, 17], "nonumb": [3, 10], "x": [3, 4, 5, 6, 7, 8, 10, 12, 14, 18, 20, 21], "y": [3, 5, 6, 7, 8, 10, 11, 14, 21], "sum": [3, 4, 5, 14], "limits_": 3, "j": [3, 6, 7, 8, 12, 14], "0": [3, 4, 5, 6, 7, 8, 10, 11, 14, 16, 17, 19, 21], "end": [3, 5, 6, 7, 8, 10, 14, 16, 17, 19], "naiv": [3, 4, 6, 7, 8], "parallel": [3, 6, 7, 8], "thread": [3, 5, 6, 7, 8, 9, 10, 15, 16, 18, 21, 22], "avoid": [3, 14, 22], "confus": 3, "around": [3, 9], "term": 3, "refer": [3, 4, 5, 6, 7, 8, 9, 11, 16, 21], "shown": [3, 5, 16], "block": [3, 5, 6, 7, 8, 9, 10, 14, 15, 18, 21, 22], "execut": [3, 5, 6, 7, 8, 9, 10, 11, 14, 15, 17, 21], "press": [3, 6, 7, 8, 10, 14], "shift": [3, 6, 7, 8, 10, 14], "enter": [3, 6, 7, 8, 10, 14], "writefil": [3, 14], "convolution_na": [3, 4], "cu": [3, 4, 11, 14, 18, 20], "__global__": [3, 6, 8, 10, 12, 14, 18, 20], "void": [3, 6, 7, 8, 10, 12, 14, 18, 19, 20], "convolution_kernel": [3, 4], "float": [3, 5, 6, 7, 8, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21], "int": [3, 5, 6, 7, 8, 10, 12, 14, 18, 20, 21], "blockidx": [3, 6, 7, 8, 10, 12, 14, 18, 20], "blockdim": [3, 18, 21], "threadidx": [3, 6, 7, 8, 10, 12, 14, 18, 20], "image_height": 3, "image_width": 3, "filter_height": 3, "filter_width": 3, "input_width": 3, "run_kernel": [3, 4, 5, 9, 21], "our": [3, 6, 7, 8, 10, 14, 18, 19], "But": [3, 6, 7, 8, 10, 18], "some": [3, 5, 6, 7, 8, 13, 14, 15, 16, 17, 18, 19, 20, 21], "data": [3, 5, 6, 7, 8, 10, 11, 14, 15, 16, 18, 19, 21], "which": [3, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18, 19, 20, 21, 22], "import": [3, 4, 6, 7, 8, 10, 13, 14, 15, 18, 19, 20], "numpi": [3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 18, 19, 20, 21], "np": [3, 5, 10, 14, 18, 19], "filter_s": 3, "17": [3, 4, 6, 7, 8, 11], "output_s": 3, "4096": [3, 4, 6, 7, 8, 11, 14], "prod": [3, 4, 11], "border_s": 3, "2": [3, 4, 6, 7, 8, 9, 10, 11, 14, 16, 17, 21], "1": [3, 4, 6, 7, 8, 10, 11, 14, 16, 17, 21], "input_s": [3, 4, 11], "output_imag": 3, "zero": [3, 4, 10, 11, 14], "astyp": [3, 4, 6, 7, 8, 10, 11, 12, 14, 18, 20], "float32": [3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 18, 20, 21], "input_imag": 3, "random": [3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 17, 18, 20, 21], "randn": [3, 4, 11, 12, 14, 18, 20], "conv_filt": 3, "now": [3, 5, 6, 7, 8, 10, 11, 14, 18], "structur": [3, 5, 6, 7, 14, 18], "ha": [3, 5, 6, 7, 8, 11, 14, 16, 17, 21], "signatur": [3, 5], "kernel_nam": [3, 5, 11, 19, 20, 21], "kernel_sourc": [3, 5, 19, 21], "problem_s": [3, 4, 5, 6, 7, 8, 10, 11, 14, 18, 19, 21, 22], "param": [3, 4, 5, 16, 17, 21], "ellipsi": 3, "here": [3, 10, 11, 13, 14, 16, 21], "indic": [3, 17, 22], "mani": [3, 5, 6, 7, 8, 14, 15, 16, 17, 21], "won": 3, "t": [3, 5, 6, 7, 8, 10, 11, 13, 17, 20, 21], "need": [3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 18, 19, 20, 21], "right": [3, 6, 7, 8, 10, 13], "interest": [3, 19], "found": [3, 5, 12, 16, 17], "five": [3, 5, 18], "name": [3, 4, 5, 6, 7, 8, 10, 14, 15, 16, 17, 18, 21, 22], "string": [3, 5, 6, 7, 8, 9, 14, 15, 16, 18, 19, 21], "domain": [3, 6, 7, 8, 9, 10, 21], "up": [3, 5, 6, 7, 8, 14, 18, 21], "three": [3, 4, 14], "dimens": [3, 5, 6, 7, 8, 9, 10, 11, 14, 15, 17, 18, 21, 22], "dictionari": [3, 5, 6, 7, 8, 10, 14, 16, 17, 18, 21], "simpli": [3, 4, 5, 6, 7, 8, 10, 17, 18, 21], "cell": [3, 6, 7, 8, 10, 14], "wrote": 3, "determin": [3, 6, 7, 8, 10, 16, 17], "grid": [3, 5, 6, 7, 8, 9, 11, 14, 21, 22], "defin": [3, 4, 5, 6, 7, 8, 9, 10, 14, 15, 16, 20, 21], "abov": [3, 5, 6, 7, 8, 10, 13, 14, 18, 19], "divid": [3, 6, 7, 8, 10, 11, 14, 21], "divisor": [3, 5, 6, 7, 8, 14, 21], "default": [3, 4, 5, 6, 7, 8, 10, 14, 15, 16, 17, 20, 21], "so": [3, 5, 6, 7, 8, 10, 11, 13, 14, 16, 17, 18, 20, 21], "specifi": [3, 4, 5, 6, 7, 8, 10, 11, 14, 15, 16, 17, 18, 19, 20, 21, 22], "arrai": [3, 4, 5, 6, 7, 8, 10, 18, 19, 21], "scalar": [3, 6, 7, 8, 10, 21], "therefor": [3, 4, 6, 7, 8, 10, 11, 14], "exactli": [3, 5, 6, 7, 8, 14, 16], "order": [3, 4, 5, 6, 7, 8, 10, 11, 14, 15, 17, 18, 21], "32": [3, 5, 6, 7, 8, 10, 12, 14, 18, 21], "bit": [3, 5, 6, 7, 8, 10, 11, 14], "final": [3, 4, 6, 7, 8, 10], "user": [3, 4, 5, 7, 9, 13, 14, 15, 16, 17, 20, 21], "rememb": [3, 6, 7, 8, 14], "anyth": 3, "insert": [3, 4, 5, 8, 10, 11, 14, 18, 20, 21, 22], "preprocessor": [3, 5, 21], "statement": [3, 8, 10, 14, 20], "valu": [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 21], "were": [3, 6, 7, 8, 10, 14, 21], "like": [3, 5, 6, 7, 8, 9, 10, 14, 17, 18, 19, 20, 21], "i_like_convolut": 3, "42": 3, "line": [3, 6, 7, 8], "definit": [3, 10, 21], "effect": [3, 6, 7, 8, 21], "perform": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 21], "unless": 3, "cours": [3, 6, 7, 8, 13, 14], "somewher": 3, "token": 3, "In": [3, 4, 5, 6, 7, 8, 10, 11, 14, 15, 16, 18, 19, 21, 22], "freeli": 3, "few": [3, 6, 7, 8, 10, 11, 20], "special": [3, 6, 7, 8, 16, 18, 22], "mai": [3, 4, 5, 6, 7, 8, 11, 13, 14, 15, 16, 17, 18, 19, 21], "notic": [3, 6, 7, 8], "haven": [3, 13], "yet": [3, 10, 11, 18], "basic": [3, 5, 6, 7, 8, 18], "block_size_x": [3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 18, 20, 21], "block_size_i": [3, 4, 6, 7, 8, 10, 11, 14, 21], "block_size_z": [3, 6, 7, 8, 10, 21], "interpret": 3, "z": [3, 5, 10, 21], "prefer": [3, 5, 6, 8, 16, 21], "block_size_nam": [3, 5, 21], "let": [3, 5, 6, 7, 8, 18, 20], "continu": [3, 5, 6, 7, 8, 13, 16, 17, 21], "creation": [3, 12, 17], "trusti": 3, "old": 3, "16": [3, 4, 6, 7, 8, 10, 11, 14], "dict": [3, 4, 5, 8, 11, 12, 16, 17, 18, 20, 21], "current": [3, 4, 5, 6, 7, 8, 14, 16, 17, 21], "undefin": [3, 5, 6, 7, 8, 14], "constant": [3, 5, 6, 7, 8, 9, 11, 14, 17, 21], "filter_heigth": 3, "those": [3, 9, 16], "could": [3, 4, 5, 6, 7, 8, 11, 13, 14, 16, 17, 20, 21], "runtim": [3, 5, 6, 7, 8, 12, 13, 16, 20], "setup": [3, 6, 7, 8, 11, 13, 16, 19], "everyth": [3, 5, 6, 7, 8, 13], "answer": [3, 4, 5, 6, 7, 8, 9, 21], "done": [3, 13, 15, 16], "alloc": [3, 5, 6, 7, 8, 9, 11, 21], "memori": [3, 5, 9, 11, 16, 19, 21, 22], "move": [3, 5, 6, 11, 14, 17, 21], "content": [3, 5, 21], "deriv": [3, 5, 6, 7, 8, 15], "retriev": [3, 5, 21], "free": [3, 6, 7, 8, 11, 13, 14], "return": [3, 4, 5, 6, 7, 8, 10, 11, 14, 16, 17, 18, 19, 21], "contrast": 3, "wa": [3, 5, 6, 7, 8, 16, 21], "finish": [3, 5, 7, 10, 11, 16], "particularli": [3, 15], "compar": [3, 4, 6, 7, 8, 10, 14, 15, 16], "case": [3, 4, 5, 6, 7, 8, 10, 14, 15, 16, 18, 19, 21], "than": [3, 6, 7, 8, 10, 15, 16, 17, 21, 22], "highli": [3, 12, 14], "parametr": 3, "It": [3, 5, 6, 7, 8, 11, 13, 14, 16, 20, 21], "long": [3, 6, 7, 8, 10, 11, 14, 19], "instead": [3, 5, 9, 14, 21], "adjust": 3, "path": [3, 16], "littl": [3, 6, 7, 8, 14], "ve": [3, 6, 7, 8, 13, 14], "interfac": [3, 4, 11, 13, 16, 17, 19, 21], "familiar": [3, 14], "becaus": [3, 4, 6, 7, 8, 11, 13, 14, 15, 20, 22], "kernel_str": [3, 4, 5, 6, 7, 8, 11, 12, 17, 21], "tune_param": [3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 17, 18, 19, 20, 21], "onli": [3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16, 17, 19, 21], "replac": [3, 4, 5, 6, 7, 8, 10, 14, 21], "similarli": 3, "singl": [3, 4, 5, 6, 7, 8, 11, 14, 16, 20, 21], "wai": [3, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 21], "64": [3, 6, 7, 8, 12, 14, 18, 20], "128": [3, 6, 7, 8, 12, 18, 20], "8": [3, 5, 6, 7, 8, 10, 14, 16], "try": [3, 5, 6, 7, 8, 13, 14, 17, 21], "env": [3, 5, 17, 18, 21], "take": [3, 5, 6, 7, 8, 10, 14, 16, 17, 18, 20, 21], "cartesian": [3, 10], "product": [3, 5, 6, 7, 21], "small": [3, 6, 7, 8, 14], "set": [3, 4, 5, 6, 7, 8, 9, 10, 14, 16, 17, 18, 20, 21, 22], "realli": [3, 6, 7, 8, 13], "howev": [3, 4, 6, 7, 8, 11, 13, 14, 16, 19, 20, 21], "lot": [3, 6, 7, 8, 14, 16, 18, 19, 21], "problemat": 3, "support": [3, 5, 6, 7, 8, 11, 13, 16, 17, 20, 21, 22], "explain": [3, 5, 6, 7, 8, 11, 13, 14, 15, 18, 20, 21], "illeg": 3, "2048": 3, "limit": [3, 5, 6, 7, 8, 9, 14, 16, 17, 20, 21, 22], "1024": [3, 6, 7, 8, 18], "devic": [3, 4, 6, 7, 8, 9, 11, 16, 20, 21], "fail": [3, 5, 13, 21], "reason": [3, 5, 19, 21], "too": [3, 6, 7, 8, 10, 11, 14, 21], "share": [3, 5, 21], "regist": [3, 6, 7, 8, 14, 16], "avail": [3, 6, 7, 8, 9, 10, 13, 16], "silent": 3, "verbos": [3, 4, 5, 6, 7, 8, 11, 21], "bound": [3, 5, 14, 17], "access": [3, 6, 7, 8, 10, 16, 19], "ignor": [3, 5, 6, 7, 8, 21], "two": [3, 5, 6, 7, 8, 9, 14, 15, 17, 21], "thing": [3, 11, 14], "record": [3, 5, 6, 16, 21], "show": [3, 6, 7, 8, 9, 12, 15, 19], "specif": [3, 5, 6, 7, 8, 10, 15, 16, 17, 21], "secondli": [3, 14], "experi": 3, "took": [3, 6, 8, 17, 18, 21], "place": [3, 6, 7, 8, 16, 17, 18, 21], "That": [3, 6, 7, 8, 11, 14, 15, 18], "softwar": [3, 6, 7, 8, 12, 13, 16, 17, 18], "along": [3, 5, 13, 18, 22], "inform": [3, 5, 6, 7, 8, 12, 16, 17, 18, 21, 22], "second": [3, 4, 5, 6, 7, 8, 10, 14, 15, 16, 17, 21], "alwai": [3, 5, 6, 7, 8], "under": [3, 12, 21], "circumst": 3, "obtain": [3, 6, 7, 8, 10, 16], "promis": 3, "would": [3, 6, 7, 8, 20], "tile": [3, 9, 14], "factor": [3, 6, 7, 8, 9, 10, 14, 22], "amount": [3, 5, 6, 7, 8, 14, 15, 21], "particular": [3, 5, 6, 7, 9, 11, 14, 16, 19], "increas": [3, 6, 7, 8, 16], "certain": [3, 5, 6, 7, 8, 16, 22], "tile_size_x": [3, 4, 6, 7, 8, 11, 14], "4": [3, 6, 7, 8, 10, 14, 16], "tile_size_i": [3, 4, 6, 7, 8, 11, 14, 21], "understand": 3, "everi": [3, 4, 6, 7, 8, 9, 16, 18], "fewer": [3, 6, 7, 8], "total": [3, 5, 6, 7, 8, 14, 15, 18], "stai": 3, "tell": [3, 6, 7, 8, 9, 11, 14, 18, 19], "influenc": 3, "alreadi": [3, 5, 6, 7, 8, 13, 14, 21], "did": [3, 6, 7, 8, 14], "mimick": 3, "behavior": [3, 14, 16, 21], "been": [3, 5, 6, 7, 8, 11, 14, 17], "assum": [3, 5, 6, 7, 8, 14, 21], "far": [3, 6, 7, 8, 14, 18], "grid_div_x": [3, 4, 6, 7, 8, 11, 14, 21], "grid_div_i": [3, 4, 6, 7, 8, 11, 14, 21], "add": [3, 5, 6, 7, 8, 11, 14, 16, 17], "decreas": [3, 14], "correspondingli": 3, "displai": 3, "commonli": [3, 6, 7, 8, 13, 14], "gflop": [3, 5, 9, 14, 15], "giga": [3, 14], "compos": [3, 5, 14, 15], "lambda": [3, 5, 6, 7, 14, 15, 21], "collect": [3, 5, 6, 7, 8, 10, 14, 16, 19], "ordereddict": [3, 5, 6, 7, 8, 10, 14, 15, 21], "p": [3, 5, 14, 15, 19, 21], "1e9": [3, 14], "1e3": [3, 6, 7, 8, 14, 15], "expand": [3, 14, 16], "longer": [3, 5, 15], "sinc": [3, 8, 10, 14, 20], "9": [3, 4, 6, 7, 8, 11], "And": [3, 6, 7, 8, 17, 20, 21], "know": [3, 6, 7, 8, 14, 15], "enough": [3, 4, 14], "own": [3, 8, 11, 13, 15, 16], "whenev": 4, "program": [4, 6, 7, 8, 11, 14, 19, 20], "good": [4, 6, 7, 8, 22], "fast": [4, 6, 7, 8], "verifi": [4, 5, 9, 21], "instanc": [4, 5, 6, 7, 8, 11, 16, 21], "none": [4, 5, 16, 17, 21], "onc": [4, 5, 6, 7, 8, 10, 16, 21], "against": [4, 5], "comparison": 4, "implement": [4, 5, 9, 10, 15, 16, 17, 21], "allclos": [4, 21], "maximum": [4, 5, 10, 17, 21], "absolut": [4, 21], "1e": [4, 21], "6": [4, 6, 7, 8, 10, 11, 13, 21], "want": [4, 8, 10, 11, 13, 14, 16, 18, 21, 22], "toler": 4, "atol": [4, 5, 21], "convolution_correct": 4, "py": [4, 11, 13], "demonstr": [4, 8, 9, 14], "open": [4, 6, 7, 11, 14], "r": [4, 11], "cmem_arg": [4, 5, 21], "d_filter": 4, "arg": [4, 5, 6, 7, 8, 10, 11, 12, 14, 17, 18, 19, 20], "non": 4, "field": [4, 6, 7, 8], "its": [4, 5, 6, 7, 8, 10, 12, 13, 14, 15, 16, 21], "almost": [4, 6, 7, 8, 16], "whose": [4, 21], "trust": [4, 17], "construct": [4, 14], "There": [4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 21, 22], "precomput": 4, "more": [4, 5, 6, 7, 8, 12, 13, 14, 15, 16, 18, 20, 21], "flexibl": [4, 6, 7, 14], "necessari": [4, 5, 6, 7, 8, 21], "callabl": [4, 5, 21], "accept": [4, 5, 17, 21], "cpu_result": 4, "gpu_result": [4, 6, 8], "although": 4, "semant": 4, "posit": [4, 5, 10, 17, 20, 21], "reflect": [4, 16], "reduct": [4, 15, 21], "snippet": 4, "sum_x": 4, "n": [4, 6, 7, 8, 10, 11, 12, 14, 17, 18, 20], "custom": [4, 9, 15, 16, 19], "def": [4, 5, 6, 7, 8, 10, 16, 19], "verify_partial_reduc": 4, "isclos": 4, "first_kernel": 4, "_": [4, 6, 7, 8], "sum_float": 4, "map": [4, 9, 10], "provid": [4, 5, 6, 7, 8, 11, 20, 21], "third": [4, 14], "partial": [4, 6, 7, 8, 9], "cpu": [4, 7, 8, 11], "achiev": [4, 8], "element": [4, 6, 7, 8, 14, 15, 18, 19, 21], "doe": [4, 5, 6, 7, 8, 10, 11, 14, 16, 20, 21], "necessarili": [4, 11], "section": [5, 6, 7, 8], "detail": [5, 13, 21], "intern": [5, 12, 17, 20], "mostli": [5, 12, 21], "relev": [5, 12, 16], "develop": [5, 12, 13], "extens": 5, "architectur": [5, 16], "At": [5, 10, 21], "expos": 5, "respons": 5, "iter": [5, 6, 7, 8, 10, 14, 16, 17, 18, 21], "over": [5, 6, 7, 8, 13, 14, 16, 17], "brute_forc": [5, 21], "valid": [5, 9, 14, 21], "random_sampl": [5, 21], "sampl": [5, 17, 21], "advanc": [5, 20, 21], "being": [5, 6, 7, 8, 14, 16, 17, 21], "strategy_opt": [5, 17, 21], "sai": [5, 6, 7, 8, 18, 20], "foreseen": 5, "futur": [5, 12, 21, 22], "high": [5, 6, 7, 8, 12, 14, 16], "wrap": [5, 18, 20, 21], "base": [5, 15, 16, 20, 21], "low": [5, 6, 7, 8, 14], "abstract": [5, 16], "ready_argument_list": 5, "build": [5, 6, 7, 8, 13], "bottom": 5, "pyhip": 5, "either": [5, 10, 17, 20, 21], "typic": [5, 13, 14, 21], "nvcc": 5, "gcc": 5, "fortran": [5, 9, 20], "turn": 5, "launch": [5, 6, 7, 8, 11, 16, 21], "rest": [5, 6, 7, 8], "helper": [5, 16], "get_opt": 5, "suppli": [5, 11, 14, 17, 20, 21], "get_strategy_docstr": 5, "method": [5, 6, 7, 8, 11, 14, 16, 17], "make_strategy_options_doc": 5, "scale_from_param": 5, "ep": [5, 17], "func": [5, 16, 21], "invers": 5, "unscal": 5, "setup_method_argu": 5, "prepar": [5, 6, 7, 8], "setup_method_opt": 5, "tuning_opt": [5, 17], "snap_to_nearest_config": 5, "closest": 5, "unscale_and_snap_to_nearest": 5, "snap": 5, "scale": 5, "variabl": [5, 10, 13, 17, 21], "nearest": [5, 21], "config": 5, "class": [5, 16, 17], "kernel_opt": 5, "device_opt": 5, "__init__": 5, "instanti": [5, 20], "kernelsourc": 5, "parameter_spac": [5, 17], "entir": [5, 6, 7, 8, 14, 17, 21], "iterfac": 5, "platform": [5, 12, 13, 16, 21], "quiet": [5, 21], "fals": [5, 16, 17, 21], "compiler_opt": [5, 21], "7": [5, 6, 7, 8, 10, 21], "offer": 5, "languag": [5, 8, 11, 14, 19, 21], "lang": [5, 9, 11, 20, 21], "bool": [5, 19, 21], "gpu_arg": 5, "benchmark_continu": 5, "durat": [5, 16], "least": 5, "benchmark_default": 5, "check_kernel_output": 5, "compile_kernel": 5, "copy_constant_memory_arg": 5, "most": [5, 6, 7, 8, 9, 11, 12, 14, 16, 17, 18, 19, 21], "recent": [5, 13, 16], "copy_shared_memory_arg": 5, "smem_arg": [5, 21], "copy_texture_memory_arg": 5, "texmem_arg": [5, 21], "textur": [5, 21], "create_kernel_inst": 5, "get_environ": 5, "memcpy_dtoh": [5, 6], "dest": 5, "src": 5, "mem": 5, "group": [5, 6, 7, 8, 21], "maintain": 5, "state": [5, 6, 7, 8, 16, 21], "interact": [5, 16], "properti": [5, 14, 21], "context": [5, 6, 8, 10], "kernel_inst": 5, "lookup": 5, "directli": [5, 6, 7, 8, 11, 14, 16, 20, 21], "driver": [5, 6, 8, 10], "ndarrai": [5, 10], "format": [5, 6, 7, 19], "kei": [5, 6, 7, 8, 14, 17, 18, 21], "symbol": [5, 21], "similar": [5, 11, 14, 21], "regular": [5, 8, 16], "int32": [5, 12, 18, 20, 21], "kernel_finish": 5, "otherwis": [5, 14, 21], "devicealloc": 5, "memcpy_htod": [5, 6], "memset": 5, "unsign": [5, 7], "byte": [5, 19, 21], "global": [5, 6, 7, 8, 17], "tupl": [5, 8, 10, 17, 21], "start_ev": 5, "event": [5, 6, 11, 16], "mark": 5, "measur": [5, 6, 7, 8, 10, 11, 14, 15, 16, 21, 22], "stop_ev": 5, "synchron": [5, 6, 8, 10, 14, 15], "halt": [5, 11], "until": [5, 11], "task": 5, "rawkernel": 5, "static": 5, "cudeviceptr": 5, "cufunct": 5, "id": [5, 16], "must": [5, 15, 21], "dynam": [5, 21], "buffer": [5, 7, 19], "fill": [5, 14], "item": [5, 6, 7, 8, 10], "ndrang": 5, "cleanup_lib": 5, "unload": 5, "previous": [5, 6, 7, 8, 14], "load": 5, "librari": [5, 9, 16, 19], "kernelinst": 5, "repres": [5, 6, 7, 8], "tunabl": [5, 6, 7, 8, 9, 10, 14, 15, 16, 17, 18, 20, 21, 22], "ctype": 5, "_funcptr": 5, "asynchron": 5, "memcpi": [5, 11], "c_arg": 5, "whatev": [5, 11, 17], "left": [5, 6, 7, 8, 10, 15], "robust": 5, "averag": [5, 6, 7, 8, 11, 16], "ptr": 5, "pionter": 5, "compilationfailedconfig": 5, "errorconfig": 5, "invalidconfig": 5, "maxprodconstraint": 5, "maxprod": 5, "constraint": 5, "enforc": 5, "given": [5, 6, 7, 8, 10, 16, 17, 21], "npencod": 5, "skipkei": 5, "ensure_ascii": 5, "check_circular": 5, "allow_nan": 5, "sort_kei": 5, "indent": 5, "separ": [5, 9, 11, 20], "dump": [5, 6, 7], "json": [5, 6, 7, 9, 21], "obj": 5, "subclass": 5, "serializ": 5, "rais": 5, "typeerror": 5, "arbitrari": 5, "self": [5, 16, 17], "except": [5, 9], "els": 5, "jsonencod": 5, "runtimefailedconfig": 5, "skippablefailur": 5, "stopcriterionreach": 5, "thrown": 5, "stop": [5, 17], "criterion": [5, 17], "reach": 5, "check_argument_list": 5, "check_argument_typ": 5, "dtype": [5, 19], "kernel_argu": 5, "check_restrict": 5, "restrict": [5, 9, 14, 20, 21], "whether": [5, 15, 17, 21], "meet": 5, "check_stop_criterion": 5, "max_fev": [5, 17, 21], "exceed": 5, "check_thread_block_dimens": 5, "max_thread": 5, "check_tune_params_list": 5, "forbidden": 5, "compile_restrict": 5, "pars": [5, 6, 7], "config_valid": 5, "max": 5, "convert_constraint_restrict": 5, "convert": [5, 6, 7], "backward": 5, "cuda_error_check": 5, "statu": 5, "delete_temp_fil": 5, "delet": 5, "temporari": 5, "don": [5, 6, 8, 10, 11, 21], "complain": 5, "detect_languag": 5, "attempt": [5, 20], "detect": [5, 17, 20, 21], "dump_cach": 5, "str": [5, 6, 7, 8, 10], "omit": 5, "sever": [5, 6, 7, 8, 9, 10, 13, 14, 20, 21], "store_cach": 5, "speed": 5, "great": [5, 6, 7, 8, 18], "power": [5, 14, 16, 22], "get_best_config": 5, "objective_higher_is_bett": [5, 15, 21], "best": [5, 6, 7, 10, 14, 17, 20, 21, 22], "accord": [5, 21], "get_config_str": 5, "compact": 5, "represent": [5, 19], "get_grid_dimens": 5, "current_problem_s": 5, "grid_div": 5, "dim": 5, "get_instance_str": 5, "debug": 5, "advis": 5, "get_kernel_str": [5, 6, 7, 8], "One": [5, 6, 7, 8, 16, 19], "get_problem_s": 5, "get_smem_arg": 5, "get_temp_filenam": 5, "suffix": [5, 21], "form": [5, 14, 16, 17], "temp_x": 5, "larg": [5, 6, 7, 8, 10, 21], "integ": [5, 16, 19, 21], "get_thread_block_dimens": 5, "convent": [5, 11, 21], "get_total_tim": 5, "overhead_tim": 5, "looks_like_a_filenam": 5, "normalize_verify_funct": 5, "normal": [5, 17, 21], "result_host": 5, "keyword": 5, "behaviour": 5, "parse_restrict": 5, "prepare_kernel_str": 5, "prepend": [5, 8], "seri": [5, 10], "By": [5, 11, 14, 17, 21], "macro": 5, "made": 5, "print_config": 5, "print_config_output": 5, "process_cach": 5, "device_nam": [5, 21], "tune_params_kei": 5, "x1": 5, "x2": 5, "xn": 5, "234342": 5, "y1": 5, "y2": 5, "yn": 5, "134233": 5, "close": [5, 6, 7, 8], "bracket": 5, "miss": [5, 21], "earlier": [5, 6, 7, 8, 10], "abruptli": 5, "process_metr": 5, "calcul": [5, 10], "express": [5, 6, 7, 8, 9, 11, 14, 21], "10000": 5, "read_cach": 5, "open_cach": 5, "cachefil": [5, 21], "read_fil": 5, "replace_param_occurr": 5, "occurr": 5, "setup_block_and_grid": 5, "write_fil": 5, "whole": [6, 7, 8, 14, 17], "model": [6, 7, 8, 12], "physic": 6, "numer": [6, 7, 8], "introduc": [6, 7, 8, 14, 16], "redistribut": [6, 7, 8], "region": [6, 7, 8], "concentr": [6, 7, 8], "without": [6, 7, 8, 10, 11, 16, 17], "bulk": [6, 7, 8], "motion": [6, 7, 8], "concept": [6, 7, 8], "wide": [6, 7, 8, 13, 14], "chemistri": [6, 7, 8], "biologi": [6, 7, 8], "suppos": [6, 7, 8], "metal": [6, 7, 8], "sheet": [6, 7, 8], "temperatur": [6, 7, 8, 16, 17, 22], "equal": [6, 7, 8, 14, 21], "degre": [6, 7, 8], "everywher": [6, 7, 8], "heat": [6, 7, 8], "thousand": [6, 7, 8], "instant": [6, 7, 8, 10], "hotspot": [6, 7, 8], "cooler": [6, 7, 8], "area": [6, 7, 8, 14], "melt": [6, 7, 8], "loss": [6, 7, 8], "radiat": [6, 7, 8], "caus": [6, 7, 8], "frac": [6, 7, 8], "d": [6, 7, 8, 10, 17, 18], "spatial": [6, 7, 8], "descret": [6, 7, 8], "2d": [6, 7, 8, 9], "quantiti": [6, 7, 8, 15, 16, 21], "nx": [6, 7, 8, 10], "equi": [6, 7, 8], "distant": [6, 7, 8], "direct": [6, 7, 8, 11, 14, 15, 21], "ny": [6, 7, 8, 10], "Be": [6, 7, 8], "distanc": [6, 7, 8, 17], "delta": [6, 7, 8], "between": [6, 7, 8, 11, 14, 15, 17, 21], "central": [6, 7, 8], "approxim": [6, 7, 8], "x_i": [6, 7, 8, 10], "x_": [6, 7, 8], "approx": [6, 7, 8], "u_": [6, 7, 8], "2u_": [6, 7, 8], "y_": [6, 7, 8], "estim": [6, 7, 8], "next": [6, 7, 8, 14, 19], "step": [6, 7, 8, 14, 15, 17, 20], "simplifi": [6, 7, 8], "formula": [6, 7, 8], "further": [6, 7, 8, 13, 14], "4u_": [6, 7, 8], "simplic": [6, 7, 8, 10], "assumpt": [6, 7, 8], "boundari": [6, 7, 8], "condit": [6, 7, 8, 14], "dt": [6, 7, 8], "225": [6, 7, 8], "give": [6, 7, 8, 17], "test": [6, 7, 8, 9, 14, 16, 21], "initi": [6, 7, 8, 19], "hot": [6, 7, 8], "plot": [6, 7, 8], "anoth": [6, 7, 8, 11, 14, 15, 17, 21], "color": [6, 7, 8], "matplotlib": [6, 7, 8, 13], "pyplot": [6, 7, 8], "inlin": [6, 7, 8], "get_initial_condit": [6, 7, 8], "ones": [6, 7, 8, 22], "randint": [6, 7, 8], "10": [6, 7, 8, 12, 17], "1000": [6, 7, 8, 10], "2000": [6, 7, 8], "fig": [6, 7, 8], "ax1": [6, 7, 8], "ax2": [6, 7, 8], "subplot": [6, 7, 8], "imshow": [6, 7, 8], "lt": [6, 7, 8], "axesimag": [6, 7, 8], "0x2aaab952f240": 6, "gt": [6, 7, 8], "quick": [6, 7, 8], "save": [6, 7], "later": [6, 7, 8, 10, 21], "field_copi": [6, 7], "m": [6, 7, 8, 10], "4164": 6, "018869400024": 6, "0x2aab1c98b3c8": 6, "worri": [6, 8], "appli": [6, 7, 8], "terminologi": [6, 8], "text": [6, 8, 14], "225f": [6, 7, 8], "diffuse_kernel": [6, 7, 8], "u_new": [6, 7, 8], "0f": [6, 7, 8], "togeth": [6, 7, 8, 13, 21], "choos": [6, 7, 8, 14, 17, 21], "impact": [6, 7, 8, 11], "fix": [6, 7, 8, 17, 21], "unrol": [6, 7, 8, 9, 14, 22], "loop": [6, 7, 8, 9, 14, 22], "drv": 6, "sourcemodul": [6, 8, 10], "init": 6, "make_context": 6, "devprop": 6, "k": [6, 7, 8, 10, 12, 14, 18], "get_devic": 6, "get_attribut": 6, "cc": 6, "compute_capability_major": 6, "compute_capability_minor": 6, "u_old": [6, 8], "mem_alloc": 6, "nbyte": 6, "block_size_str": [6, 8], "arch": 6, "sm_": 6, "get_funct": [6, 8, 10], "boilerpl": [6, 7, 8], "moment": [6, 7, 8, 21], "These": [6, 7, 8, 10, 13, 14, 16, 20, 21], "serv": [6, 7, 8, 15, 17], "guess": [6, 7, 8], "pair": [6, 7, 8], "500": [6, 7, 8], "time_sinc": 6, "zeros_lik": [6, 10, 12, 14, 18, 20], "set_titl": [6, 7, 8], "53": [6, 7, 8], "423038482666016": 6, "0x2aaabbdcb2e8": 6, "faster": [6, 7, 8, 14], "cleanup": 6, "pop": 6, "think": [6, 7, 8], "messi": [6, 7, 8], "got": [6, 7, 8], "cleaner": [6, 7, 8], "plai": [6, 7, 8], "difficult": [6, 7, 8, 19, 20], "rather": [6, 7, 8, 21], "underutil": [6, 7, 8], "purpos": [6, 7, 8, 11, 14, 21, 22], "feel": [6, 7, 8], "48": [6, 7, 8], "care": [6, 7, 8], "11": [6, 7, 8], "appropi": [6, 7, 8], "fly": [6, 7, 8], "12": [6, 7, 8], "13": [6, 7, 8], "geforc": [6, 7, 8, 10], "gtx": [6, 7, 8, 10], "titan": [6, 7, 8], "22305920124": 6, "779033613205": 6, "824838399887": 6, "900499212742": 6, "999763202667": 6, "727967989445": 6, "752479994297": 6, "797900807858": 6, "876627194881": 6, "93347837925": 6, "766662418842": 6, "803033602238": 6, "853574407101": 6, "971545600891": 6, "763775992393": 6, "791257584095": 6, "848044800758": 6, "922745585442": 6, "792595207691": 6, "822137594223": 6, "893279993534": 6, "well": [6, 7, 8, 10, 14, 16, 21], "millisecond": [6, 7, 8], "matter": [6, 7, 8, 11], "conveni": [6, 7, 8, 11, 21], "analyz": [6, 7, 8], "seem": [6, 7, 8], "hardwar": [6, 7, 8, 10, 16, 17, 18], "vari": [6, 7, 8, 10, 14, 15], "addtion": [6, 7, 8], "among": [6, 7, 8, 12, 17], "128x32": [6, 7, 8], "likewis": [6, 7, 8], "becom": [6, 7, 8, 16, 17], "affect": [6, 7, 8, 14], "within": [6, 7, 8, 10, 14, 17, 21], "exchang": [6, 7, 8], "fact": [6, 7, 8, 11], "commun": [6, 7, 8], "idea": [6, 7, 8, 11, 14, 22], "control": [6, 7, 8, 16, 17, 21], "l2": [6, 7, 8], "closer": [6, 7, 8], "multiprocessor": [6, 7, 8], "l1": [6, 7, 8], "fine": [6, 7, 8], "grain": [6, 7, 8], "manag": [6, 7, 8, 14, 16], "cost": [6, 7, 8, 17], "instruct": [6, 7, 8, 9, 14], "overhead": [6, 7, 8, 14], "degrad": [6, 7, 8], "intermedi": [6, 7, 8], "mind": [6, 7, 8], "14": [6, 7, 8], "tx": [6, 7, 8, 14], "ty": [6, 7, 8, 14], "bx": [6, 7, 8, 10], "__shared__": [6, 8, 14], "sh_u": [6, 7, 8], "pragma": [6, 7, 8, 14], "__syncthread": [6, 7, 8, 14], "75041918755": 6, "18713598251": 6, "09015038013": 6, "06844799519": 6, "09730558395": 6, "14420480728": 6, "05957758427": 6, "07508480549": 6, "0731967926": 6, "14729599953": 6, "08389122486": 6, "10700161457": 6, "10125439167": 6, "31661438942": 6, "0629119873": 6, "04807043076": 6, "054880023": 6, "12033278942": 6, "06672639847": 6, "05816960335": 6, "12000002861": 6, "sometim": [6, 7, 8, 19], "merg": [6, 7, 8, 14], "half": [6, 7, 8], "doubl": [6, 7, 8, 19, 20], "cover": [6, 7, 8, 17], "part": [6, 7, 8, 12, 13, 14, 15, 19, 21], "beyond": [6, 7, 8, 21], "reduc": [6, 7, 8, 14], "condens": [6, 7, 8], "keep": [6, 7, 8, 14, 19], "importantli": [6, 7, 8], "worst": [6, 7, 8], "both": [6, 7, 8, 9, 14], "15": [6, 7, 8, 20], "tj": [6, 7, 8], "ti": [6, 7, 8, 10], "ad": [6, 7, 8, 11, 21], "somehow": [6, 7, 8], "larger": [6, 7, 8, 11, 17, 20], "insid": [6, 7, 8, 11, 14, 20, 21], "round": [6, 7, 8, 21], "arithmet": [6, 7, 8, 21], "evalu": [6, 7, 8, 14, 17, 21], "759308815": 6, "29789438248": 6, "06983039379": 6, "2634239912": 6, "997139203548": 6, "843692803383": 6, "05549435616": 6, "862348806858": 6, "750636804104": 6, "19084160328": 6, "876377594471": 6, "714169609547": 6, "875001597404": 6, "691116797924": 6, "575859189034": 6, "759679996967": 6, "622867202759": 6, "650336003304": 6, "09794559479": 6, "826515209675": 6, "692665600777": 6, "78363519907": 6, "646092808247": 6, "554745602608": 6, "716115188599": 6, "581280004978": 6, "662566399574": 6, "07386879921": 6, "833420813084": 6, "705055999756": 6, "840755212307": 6, "652575993538": 6, "569388794899": 6, "689356791973": 6, "597267186642": 6, "675232005119": 6, "10033922195": 6, "860332798958": 6, "731891202927": 6, "867276787758": 6, "68781440258": 6, "595276796818": 6, "735436797142": 6, "60216319561": 6, "852166390419": 6, "15089921951": 6, "852575981617": 6, "705932807922": 6, "888671982288": 6, "673248004913": 6, "563417613506": 6, "761139214039": 6, "621254396439": 6, "676595199108": 6, "06709122658": 6, "804953610897": 6, "685670387745": 6, "801798415184": 6, "632006394863": 6, "542387211323": 6, "722668802738": 6, "578745603561": 6, "618598401546": 6, "08220798969": 6, "821881604195": 6, "687955200672": 6, "77759360075": 6, "618003201485": 6, "539891195297": 6, "705900788307": 6, "568556785583": 6, "624492788315": 6, "0799423933": 6, "832300806046": 6, "70140799284": 6, "835481595993": 6, "638348805904": 6, "550105595589": 6, "667251205444": 6, "576044797897": 6, "732409596443": 6, "15916161537": 6, "869497597218": 6, "733248019218": 6, "890803205967": 6, "677363204956": 6, "577215993404": 6, "730982398987": 6, "58035838604": 6, "10066559315": 6, "837804794312": 6, "691385602951": 6, "851040017605": 6, "666656005383": 6, "560505592823": 6, "771103990078": 6, "626163220406": 6, "694451200962": 6, "11514236927": 6, "837299215794": 6, "703302407265": 6, "806828796864": 6, "648620784283": 6, "562521612644": 6, "760915207863": 6, "605760002136": 6, "690009605885": 6, "10740480423": 6, "841631996632": 6, "700883197784": 6, "838195204735": 6, "649779188633": 6, "56585599184": 6, "7168192029": 6, "59088640213": 6, "69627519846": 6, "3269824028": 6, "02665598392": 6, "840908801556": 6, "03752319813": 6, "788345599174": 6, "662041604519": 6, "85437438488": 6, "680422389507": 6, "0759360075": 6, "801996803284": 6, "666003203392": 6, "808000004292": 6, "643359994888": 6, "544691193104": 6, "741964805126": 6, "60942081213": 6, "681350398064": 6, "05262081623": 6, "792108798027": 6, "66344319582": 6, "768064010143": 6, "625260794163": 6, "540352010727": 6, "721862399578": 6, "579411196709": 6, "626976013184": 6, "06332798004": 6, "808211183548": 6, "679372787476": 6, "803718411922": 6, "627136015892": 6, "538227200508": 6, "682188808918": 6, "573836791515": 6, "725548803806": 6, "13023357391": 6, "843411195278": 6, "713843202591": 6, "85886080265": 6, "657920002937": 6, "565254402161": 6, "697094392776": 6, "579904007912": 6, "07484800816": 6, "801119995117": 6, "667347204685": 6, "799059200287": 6, "643820810318": 6, "542937588692": 6, "740518403053": 6, "615148806572": 6, "731334400177": 6, "07002239227": 6, "805299210548": 6, "675923216343": 6, "782060790062": 6, "631142401695": 6, "540383994579": 6, "723999989033": 6, "578681600094": 6, "726335990429": 6, "13297917843": 6, "844428789616": 6, "710278391838": 6, "835494399071": 6, "637958395481": 6, "567417597771": 6, "699366402626": 6, "588492810726": 6, "tri": [6, 7, 8, 17], "grow": [6, 7, 8], "quickli": [6, 7, 8], "On": [6, 7, 8, 21], "went": [6, 7, 8, 10], "72": [6, 7, 8], "26": [6, 7, 8], "32x2": [6, 7, 8], "64x4": [6, 7, 8], "four": [6, 7, 8], "best_tim": [6, 7], "min": [6, 7], "05": [6, 7], "join": [6, 7], "nice": [6, 7], "stdout": [6, 7], "why": [6, 7, 11, 15], "easili": [6, 7, 16], "easi": [6, 7, 15, 16, 21], "csv": [6, 7, 9], "analysi": [6, 7], "panda": [6, 7, 9, 13], "18": [6, 7, 8], "fp": [6, 7], "datafram": [6, 7], "df": [6, 7], "to_csv": [6, 7], "0x2aab1de088d0": 7, "01": 7, "sy": 7, "140": 7, "wall": 7, "98": 7, "__kernel": 7, "get_group_id": 7, "get_local_id": 7, "cl": 7, "ctx": 7, "create_some_context": 7, "mf": 7, "mem_flag": 7, "a_h": 7, "a_d": 7, "read_writ": 7, "copy_host_ptr": 7, "hostbuf": 7, "b_d": 7, "kernel_src": 7, "prg": 7, "queue": 7, "commandqueu": 7, "run_gpu": 7, "444": 7, "154": 7, "598": 7, "985": 7, "enqueue_copi": 7, "1748096": 7, "7284544": 7, "7707904": 7, "8573184": 7, "8380288": 7, "686528": 7, "69648": 7, "7461632": 7, "818304": 7, "771072": 7, "7190464": 7, "7522432": 7, "7982208": 7, "9624512": 7, "7214464": 7, "7453312": 7, "8028416": 7, "8922624": 7, "747328": 7, "7860736": 7, "8637184": 7, "__local": 7, "barrier": 7, "clk_local_mem_f": 7, "8449472": 7, "1912576": 7, "1035136": 7, "0927808": 7, "1140736": 7, "1790336": 7, "0808192": 7, "0809792": 7, "0836928": 7, "1545856": 7, "1249984": 7, "1264": 7, "1230336": 7, "4015104": 7, "0873216": 7, "0626496": 7, "0692224": 7, "140192": 7, "0801344": 7, "0688128": 7, "1428928": 7, "8844544": 7, "3245952": 7, "0911808": 7, "3039616": 7, "0079296": 7, "84848": 7, "0708288": 7, "857728": 7, "7561792": 7, "231072": 7, "8774336": 7, "7087296": 7, "8772672": 7, "6911872": 7, "5715968": 7, "7584896": 7, "6292032": 7, "6498688": 7, "1145664": 7, "8252928": 7, "6757568": 7, "7881152": 7, "6237696": 7, "544224": 7, "6951168": 7, "5648128": 7, "6452736": 7, "1065792": 7, "8313792": 7, "6905984": 7, "8302656": 7, "6367488": 7, "5478592": 7, "6660672": 7, "5719744": 7, "6551744": 7, "1384064": 7, "8531072": 7, "7078976": 7, "8516672": 7, "6677696": 7, "5685632": 7, "7074048": 7, "5753152": 7, "8228864": 7, "2124736": 7, "8633344": 7, "6921216": 7, "8896384": 7, "6659904": 7, "5582144": 7, "7522624": 7, "6081536": 7, "6664448": 7, "1095936": 7, "8063424": 7, "6717888": 7, "7982848": 7, "6263552": 7, "5289728": 7, "7008832": 7, "567456": 7, "5968704": 7, "1018432": 7, "8117248": 7, "6724736": 7, "7728576": 7, "6038336": 7, "5172352": 7, "6796352": 7, "5470016": 7, "5968448": 7, "1107712": 7, "8237248": 7, "6810944": 7, "821952": 7, "620352": 7, "5230208": 7, "6415552": 7, "5476864": 7, "7168192": 7, "1942016": 7, "8626304": 7, "7099712": 7, "9123328": 7, "6608448": 7, "5631168": 7, "7113024": 7, "556576": 7, "1583104": 7, "8384832": 7, "67856": 7, "845856": 7, "6581248": 7, "54944": 7, "7520064": 7, "6076224": 7, "6842112": 7, "1547072": 7, "8422016": 7, "6895552": 7, "8037312": 7, "6387072": 7, "5383296": 7, "7326656": 7, "5863488": 7, "6813376": 7, "1493952": 7, "8444928": 7, "6929216": 7, "832768": 7, "6389312": 7, "5412672": 7, "698336": 7, "5717568": 7, "676096": 7, "4303104": 7, "0341696": 7, "8365184": 7, "0398656": 7, "7786496": 7, "648928": 7, "8479232": 7, "6508544": 7, "1219392": 7, "7994048": 7, "6492288": 7, "8068416": 7, "6343168": 7, "5235328": 7, "7268928": 7, "5898432": 7, "6633536": 7, "0849664": 7, "7869632": 7, "6458624": 7, "7611968": 7, "613088": 7, "50912": 7, "6972928": 7, "5620608": 7, "601856": 7, "095232": 7, "7967488": 7, "6601472": 7, "7952896": 7, "6047296": 7, "5108224": 7, "6607744": 7, "5492416": 7, "7091136": 7, "171552": 7, "8473408": 7, "6962112": 7, "8663936": 7, "6466816": 7, "5475584": 7, "6754048": 7, "5591744": 7, "108896": 7, "7907264": 7, "6459328": 7, "7965888": 7, "6250816": 7, "5188416": 7, "721408": 7, "5920832": 7, "7068608": 7, "0909248": 7, "7930752": 7, "6524544": 7, "7745216": 7, "6146176": 7, "5116928": 7, "6975872": 7, "5548416": 7, "7075136": 7, "174624": 7, "8384512": 7, "69104": 7, "8335488": 7, "6264192": 7, "5445248": 7, "6719104": 7, "5592064": 7, "19": [7, 8], "solv": 8, "0x7f888f8cd7b8": 8, "4152": 8, "086019515991": 8, "0x7f8865b51f28": 8, "gpuarrai": [8, 10], "tool": [8, 10, 12], "autoinit": [8, 10], "to_gpu": [8, 10], "mod": [8, 10], "t0": [8, 10], "ona": 8, "33": 8, "46109390258789": 8, "0x7f8858b873c8": 8, "1080": [8, 10], "916985595226": 8, "489004802704": 8, "500524806976": 8, "513356792927": 8, "545715200901": 8, "486515200138": 8, "449055999517": 8, "44974719882": 8, "457427197695": 8, "492915201187": 8, "464863997698": 8, "466118401289": 8, "475264000893": 8, "513632011414": 8, "458412796259": 8, "457715201378": 8, "461017608643": 8, "475987195969": 8, "460032004118": 8, "457779198885": 8, "462649595737": 8, "kernel_string_shar": 8, "22673916817": 8, "826361596584": 8, "793516802788": 8, "782112002373": 8, "776639997959": 8, "795135998726": 8, "722777605057": 8, "762777590752": 8, "75422719717": 8, "804876792431": 8, "778656005859": 8, "769734406471": 8, "782495999336": 8, "932281601429": 8, "734028804302": 8, "721625590324": 8, "736511993408": 8, "800019192696": 8, "724966406822": 8, "722969603539": 8, "759430396557": 8, "kernel_string_til": 8, "22200961113": 8, "91601279974": 8, "752838408947": 8, "873651194572": 8, "69833599329": 8, "586931192875": 8, "516473591328": 8, "411392003298": 8, "384262400866": 8, "82159358263": 8, "632607996464": 8, "506457602978": 8, "618758392334": 8, "500288009644": 8, "429862397909": 8, "44995200038": 8, "366150397062": 8, "342201602459": 8, "793542397022": 8, "58026239872": 8, "494163197279": 8, "546316814423": 8, "467059195042": 8, "404249596596": 8, "440895992517": 8, "341376006603": 8, "339692795277": 8, "783923208714": 8, "597920000553": 8, "50277120471": 8, "615475213528": 8, "470937597752": 8, "418393599987": 8, "443519997597": 8, "343961596489": 8, "342540800571": 8, "780352008343": 8, "611705589294": 8, "515667212009": 8, "622534394264": 8, "502195191383": 8, "437388807535": 8, "45568639636": 8, "359289598465": 8, "426995199919": 8, "788947200775": 8, "616556799412": 8, "496121603251": 8, "629164803028": 8, "474841600657": 8, "407667201757": 8, "47406719923": 8, "371507203579": 8, "352531200647": 8, "72023679018": 8, "574816000462": 8, "481817597151": 8, "580928003788": 8, "455724793673": 8, "394975996017": 8, "464659202099": 8, "357107198238": 8, "324083191156": 8, "759910392761": 8, "569177603722": 8, "481279999018": 8, "528115200996": 8, "441734397411": 8, "393126398325": 8, "455404800177": 8, "350457596779": 8, "322547197342": 8, "754201591015": 8, "579827189445": 8, "491852802038": 8, "582751989365": 8, "451283198595": 8, "391807991266": 8, "456275194883": 8, "356716805696": 8, "362937599421": 8, "809894394875": 8, "60433280468": 8, "507142400742": 8, "655827200413": 8, "474092799425": 8, "408166396618": 8, "480531209707": 8, "346707201004": 8, "780134403706": 8, "601049602032": 8, "493900799751": 8, "620384001732": 8, "494553589821": 8, "425414395332": 8, "467033600807": 8, "375468802452": 8, "346079999208": 8, "771052801609": 8, "593977594376": 8, "49723520875": 8, "583270406723": 8, "478079998493": 8, "416320002079": 8, "443942397833": 8, "359744000435": 8, "343545603752": 8, "780960011482": 8, "598758399487": 8, "498617601395": 8, "57678719759": 8, "46561280489": 8, "41324160099": 8, "431225597858": 8, "351263999939": 8, "34440960288": 8, "933260798454": 8, "715257608891": 8, "586604809761": 8, "711615991592": 8, "558771193027": 8, "466284793615": 8, "44043520093": 8, "361823999882": 8, "731839990616": 8, "57044479847": 8, "470220798254": 8, "608800005913": 8, "472665601969": 8, "416352003813": 8, "481376004219": 8, "380812799931": 8, "351923197508": 8, "719257593155": 8, "55171200037": 8, "466758400202": 8, "568435204029": 8, "459654402733": 8, "394380801916": 8, "463052803278": 8, "36409599781": 8, "328998398781": 8, "73579518795": 8, "564575994015": 8, "472236800194": 8, "549024009705": 8, "438406395912": 8, "389945602417": 8, "455193603039": 8, "364051198959": 8, "375519996881": 8, "798195195198": 8, "588998401165": 8, "49552000761": 8, "595462405682": 8, "460972803831": 8, "400672000647": 8, "465132802725": 8, "364627194405": 8, "729363203049": 8, "558815991879": 8, "466655993462": 8, "600819194317": 8, "460281592607": 8, "404908800125": 8, "478739196062": 8, "386668801308": 8, "385510402918": 8, "720915210247": 8, "550668799877": 8, "466937589645": 8, "564921605587": 8, "447974395752": 8, "394271999598": 8, "46233600378": 8, "365190398693": 8, "387827193737": 8, "762003195286": 8, "579007995129": 8, "486649608612": 8, "557331204414": 8, "443033593893": 8, "396070402861": 8, "457075202465": 8, "369555193186": 8, "wish": 8, "modifi": [8, 16], "tile_size_j": 8, "fixed_param": [8, 10], "ceil": [8, 10], "zip": [8, 10], "transfer": [8, 9, 11], "20": [8, 17], "21": 8, "618": 8, "2231903076172": 8, "0x7f887c3d2358": 8, "incorpor": 8, "ifndef": 8, "kerenel": 8, "psedo": 8, "endif": 8, "bypass": 8, "usecas": 9, "test_vector_add": 9, "test_vector_add_parameter": 9, "illustr": 9, "dimension": [9, 10, 21], "clean": [9, 14], "center": [9, 10], "lock": [9, 16], "overlap": [9, 11], "shuffl": 9, "pipelin": 9, "consist": [9, 14, 21], "scipi": 9, "algorithm": [9, 12, 17, 21], "cub": 9, "gaussian": 10, "delv": 10, "hand": [10, 14], "sum_": 10, "exp": 10, "beta": [10, 17], "sqrt": 10, "y_i": 10, "z_i": 10, "vector": [10, 11, 18], "coordin": 10, "forget": 10, "linalg": 10, "la": 10, "compute_grid": 10, "xgrid": 10, "ygrid": 10, "zgrid": 10, "x0": 10, "y0": 10, "z0": 10, "themselv": 10, "meshgrid": 10, "send": 10, "interv": 10, "256": [10, 12, 18], "suffici": [10, 15], "100": [10, 17, 21], "randomli": [10, 17], "distribut": [10, 14], "linspac": 10, "cpu_grid": 10, "npt": 10, "rand": 10, "xyz": [10, 21], "52320": 10, "160627": 10, "might": [10, 15], "nz": 10, "bz": 10, "kernel_cod": 10, "math": 10, "__host__": 10, "__device__": [10, 20], "b": [10, 12, 14, 17, 18, 20], "addgrid": 10, "xvect": 10, "yvect": 10, "zvect": 10, "dx": 10, "dy": 10, "dz": 10, "assign": 10, "explor": 10, "middl": 10, "henc": [10, 19], "mention": 10, "56833920479": 10, "80796158314": 10, "940044796467": 10, "855628800392": 10, "855359995365": 10, "16174077988": 10, "11877760887": 10, "01592960358": 10, "849273598194": 10, "849235200882": 10, "19029750824": 10, "16199679375": 10, "40401918888": 10, "39618558884": 10, "39508478642": 10, "31647996902": 10, "31470079422": 10, "50787198544": 10, "53760001659": 10, "56709756851": 10, "34500494003": 10, "25130877495": 10, "50662400723": 10, "55267841816": 10, "17987194061": 10, "12309756279": 10, "01125121117": 10, "849631989002": 10, "853708791733": 10, "17051515579": 10, "15584001541": 10, "40074241161": 10, "39547519684": 10, "39331197739": 10, "30295038223": 10, "28725762367": 10, "39589118958": 10, "38867840767": 10, "37724158764": 10, "34344320297": 10, "26213116646": 10, "38793599606": 10, "3775359869": 10, "74003200531": 10, "13276162148": 10, "37233917713": 10, "18835201263": 10, "15777277946": 10, "40247042179": 10, "39366400242": 10, "39439997673": 10, "23719043732": 10, "28542718887": 10, "39207677841": 10, "38956804276": 10, "3778496027": 10, "29814395905": 10, "26398081779": 10, "38625922203": 10, "3754431963": 10, "72981758118": 10, "12483196259": 10, "37322881222": 10, "61618566513": 10, "2194111824": 10, "17600002289": 10, "27082881927": 10, "38787200451": 10, "3835711956": 10, "37543039322": 10, "30227203369": 10, "23127679825": 10, "38627202511": 10, "37677440643": 10, "64358406067": 10, "12255358696": 10, "37474560738": 10, "61655673981": 10, "19179515839": 10, "99912958145": 10, "213971138": 10, "16430072784": 10, "38772480488": 10, "3735104084": 10, "54432649612": 10, "05524477959": 10, "36935677528": 10, "42449922562": 10, "10455036163": 10, "67516155243": 10, "programmat": 10, "With": [10, 11], "30": 10, "minimum": 10, "84": 10, "suit": [10, 21], "grid_dim": 10, "associ": 10, "substitut": 10, "ourselv": 10, "extract": 10, "manual": [10, 13], "exlicitli": 10, "accur": [10, 16], "xgpu": 10, "ygpu": 10, "zgpu": 10, "grid_gpu": 10, "80": 10, "133200": 10, "lower": [10, 16, 17], "roughli": [10, 14], "40000": 10, "across": [11, 14], "handl": [11, 21], "backend": [11, 16], "qualiti": 11, "itself": [11, 12, 21], "precis": 11, "plain": 11, "omp_get_wtim": 11, "openmp": 11, "convolution_stream": 11, "complex": [11, 14], "behind": 11, "spread": 11, "back": [11, 21], "split": 11, "chunk": 11, "slightli": [11, 14, 20], "account": [11, 14], "border": [11, 21], "latter": 11, "cudastreamwaitev": 11, "num_stream": 11, "clarifi": 11, "fit": [11, 17], "choic": [11, 13], "grid_size_x": 11, "grid_size_i": 11, "cudamemcpytosymbol": 11, "upload": 11, "yourself": [11, 21], "spent": [11, 21], "relat": [12, 15, 22], "famili": 12, "launcher": 12, "kt": [12, 19], "easiest": 12, "toolkit": [12, 13], "intend": 12, "hip": [12, 21], "Or": [12, 13], "vector_add": [12, 17, 18, 20], "10000000": 12, "512": [12, 18], "research": 12, "cite": 12, "articl": [12, 18], "kerneltun": 12, "author": 12, "ben": 12, "van": 12, "werkhoven": 12, "titl": 12, "auto": [12, 14, 16, 17, 20, 21, 22], "journal": 12, "system": [12, 13, 16], "year": 12, "2019": 12, "volum": 12, "90": 12, "347": 12, "358": 12, "url": 12, "http": [12, 13, 16], "www": 12, "sciencedirect": 12, "com": [12, 13], "scienc": 12, "pii": 12, "s0167739x18313359": 12, "doi": 12, "org": 12, "1016": 12, "2018": 12, "08": 12, "004": 12, "willemsen2021bayesian": 12, "willemsen": [12, 17], "flori": 12, "jan": 12, "nieuwpoort": 12, "rob": 12, "bayesian": [12, 17, 21], "workshop": 12, "pmb": 12, "supercomput": 12, "sc21": 12, "2021": 12, "arxiv": 12, "ab": 12, "2111": 12, "14991": 12, "schoonhoven2022benchmark": 12, "schoonhoven": 12, "richard": 12, "batenburg": 12, "joost": 12, "ieee": 12, "transact": 12, "evolutionari": 12, "2022": 12, "schoonhoven2022go": 12, "veenboer": 12, "bram": 12, "green": 12, "energi": [12, 16, 17, 22], "effici": [12, 14, 16], "steer": 12, "sc22": 12, "2211": 12, "07260": 12, "recommend": [13, 19], "miniconda": 13, "linux": 13, "download": 13, "wget": 13, "repo": 13, "continuum": 13, "io": 13, "miniconda3": 13, "x86_64": 13, "sh": 13, "bash": 13, "nativ": 13, "virtual": 13, "g": [13, 15, 16], "prefix": 13, "home": 13, "pythonpath": 13, "bind": [13, 16], "older": 13, "troubl": 13, "retri": 13, "dir": 13, "wiki": 13, "tiker": 13, "net": 13, "amd": [13, 16], "app": 13, "sdk": 13, "intel": 13, "appl": 13, "beignet": 13, "rocm": [13, 16], "stack": 13, "altern": [13, 21], "jatinx": 13, "navig": 13, "benvanwerkhoven": 13, "cd": 13, "chanc": [13, 17, 20], "known": 14, "algebra": 14, "frequent": 14, "programm": [14, 16], "row": 14, "column": 14, "squar": 14, "matric": 14, "matmul_na": 14, "width": 14, "matmul_kernel": 14, "height": 14, "Of": 14, "solut": [14, 16], "realiti": 14, "contant": 14, "denot": [14, 18, 21], "sensibl": 14, "pick": 14, "word": 14, "warpsiz": 14, "correctli": 14, "namelijk": 14, "stand": 14, "briefli": 14, "figur": 14, "fifth": 14, "fourth": 14, "dramat": 14, "profil": 14, "util": 14, "pretti": 14, "opportun": 14, "realiz": 14, "collabor": 14, "bandwidth": 14, "techniqu": 14, "submatric": 14, "proce": 14, "matmul_shar": 14, "sa": 14, "sb": 14, "kb": 14, "outer": 14, "inner": 14, "race": 14, "drastic": 14, "consumpt": [14, 16], "due": [14, 20, 21], "significantli": [14, 16], "fortun": 14, "benefit": 14, "redund": 14, "distinct": 14, "1xn": 14, "usag": [14, 16], "occup": 14, "goe": 14, "down": 14, "matmul": 14, "newli": 14, "coupl": 14, "respect": [14, 16], "independ": 14, "yield": 14, "discontinu": 14, "room": 14, "impos": 14, "report": [15, 16, 21, 22], "possibli": [15, 21], "_flop": 15, "total_flop": 15, "ps_energi": [15, 16, 22], "occur": [15, 21], "exhaust": 15, "brute": [15, 17, 18], "forc": [15, 17, 18, 20], "maxim": [15, 21], "boolean": [15, 16, 21], "facilit": 16, "layer": 16, "act": 16, "hook": 16, "pattern": 16, "subscrib": 16, "benchmarkobserv": 16, "overwritten": [16, 21], "extend": 16, "mandatori": 16, "get_result": 16, "usual": 16, "aggreg": 16, "after_finish": 16, "after_start": 16, "before_start": 16, "register_configur": 16, "register_devic": 16, "variou": [16, 18], "registerobserv": 16, "counter": 16, "num_reg": 16, "current_modul": 16, "powersensor2": 16, "pcie": 16, "intercept": 16, "sensor": 16, "transmit": 16, "usb": 16, "connect": 16, "advantag": 16, "instantan": 16, "frequenc": 16, "khz": 16, "pybind11": 16, "powersensor": [16, 22], "extern": [16, 20], "ps_power": [16, 22], "joul": [16, 22], "watt": [16, 22], "ttyacm0": 16, "core": 16, "voltag": 16, "thin": 16, "wrapper": [16, 20], "intricaci": 16, "friendli": 16, "repeatedli": 16, "downsid": 16, "approach": 16, "save_al": 16, "nvidia_smi_fallback": 16, "use_locked_clock": 16, "continous_dur": 16, "monitor": 16, "clock": [16, 22], "power_read": [16, 22], "nvml_power": [16, 22], "nvml_energi": [16, 22], "core_freq": [16, 22], "mem_freq": [16, 22], "gr_voltag": 16, "ordin": 16, "identifi": 16, "smi": 16, "root": 16, "privileg": 16, "opt": 16, "amper": 16, "continuous_dur": 16, "common": [16, 20], "cap": 16, "popular": 16, "nvml_gr_clock": [16, 22], "nvml_mem_clock": [16, 22], "nvml_pwr_limit": [16, 22], "graphic": [16, 22], "jetson": 16, "rapl": 16, "xilinx": 16, "pmt": 16, "git": 16, "astron": 16, "nl": 16, "rd": 16, "meter": 16, "arduino": 16, "_energi": 16, "_power": 16, "acceler": 17, "prohibit": 17, "slow": 17, "wast": 17, "basin": [17, 21], "hop": [17, 21], "dual": [17, 21], "anneal": [17, 21], "differenti": [17, 21], "evolut": [17, 21], "firefli": [17, 21], "genet": [17, 21], "greedi": [17, 21], "multi": [17, 21], "particl": [17, 21], "swarm": [17, 21], "mechan": 17, "overrid": 17, "time_limit": [17, 21], "uniqu": [17, 21], "count": 17, "searchspac": 17, "runner": 17, "nelder": 17, "mead": 17, "powel": 17, "cg": 17, "bfg": 17, "l": 17, "tnc": 17, "cobyla": 17, "slsqp": 17, "reject": 17, "thesi": 17, "generate_normalized_param_dict": 17, "denorm": 17, "normalize_parameter_spac": 17, "param_spac": 17, "prune_parameter_spac": 17, "normalize_dict": 17, "prune": 17, "hyperparamet": 17, "via": 17, "popul": 17, "best1bin": 17, "best1exp": 17, "rand1exp": 17, "randtobest1exp": 17, "best2exp": 17, "rand2exp": 17, "randtobest1bin": 17, "best2bin": 17, "rand2bin": 17, "rand1bin": 17, "popsiz": 17, "maxit": 17, "constr": 17, "compute_intens": 17, "fun": 17, "intens": 17, "distance_to": 17, "euclidian": 17, "move_toward": 17, "alpha": 17, "toward": 17, "b0": 17, "attract": 17, "gamma": 17, "light": 17, "absorpt": 17, "coeffici": 17, "disruptive_uniform_crossov": 17, "dna1": 17, "dna2": 17, "disrupt": 17, "uniform": 17, "crossov": 17, "uniformli": 17, "gene": 17, "children": 17, "guarante": 17, "parent": 17, "mutat": 17, "dna": 17, "mutation_ch": 17, "single_point_crossov": 17, "index": 17, "single_point": 17, "two_point": 17, "disruptive_uniform": 17, "two_point_crossov": 17, "uniform_crossov": 17, "weighted_choic": 17, "probabl": [17, 21], "il": 17, "neighbor": 17, "node": 17, "ham": 17, "adjac": 17, "greedy": 17, "soon": 17, "no_improv": 17, "exce": 17, "50": 17, "random_walk": 17, "hillclimb": 17, "travers": 17, "inertia": 17, "c1": 17, "cognit": 17, "c2": 17, "social": 17, "fraction": 17, "acceptance_prob": 17, "old_cost": 17, "new_cost": 17, "modif": [17, 19], "po": 17, "t_min": 17, "001": 17, "995": 17, "vector_add_kernel": 18, "wise": 18, "1000000": [18, 20], "recogn": 18, "alright": 18, "issu": 19, "portabl": 19, "stick": 19, "pointer": 19, "primit": 19, "lead": 19, "ineffici": 19, "situat": 19, "scientif": 19, "sens": 19, "experiment": 19, "pack": 19, "consult": 19, "create_receive_spec_struct": 19, "0l": 19, "pad": 19, "8byte": 19, "packstr": 19, "iiiiiiiiiiippi": 19, "fffi": 19, "nsampl": 19, "nsamplesiq": 19, "nslowtimesampl": 19, "nchannel": 19, "ntx": 19, "nrepeat": 19, "nfasttimesampl": 19, "rfsize": 19, "mnrow": 19, "mnrowsiq": 19, "nactivechannel": 19, "isiq": 19, "fsiq": 19, "fc": 19, "nbuffer": 19, "frombuff": 19, "len": 19, "receive_spec": 19, "bf": 19, "rf": 19, "recon": 19, "sync": 19, "length": 19, "slight": 19, "matlab": 20, "typenam": 20, "my_typ": 20, "linkag": 20, "regardless": 20, "demot": 20, "rewrit": 20, "real": 20, "risk": 20, "isol": 20, "nvrtc": 20, "seper": 20, "grid_div_z": 21, "06": 21, "log": 21, "simulation_mod": 21, "auxilliari": 21, "safer": 21, "notat": 21, "divison": 21, "treat": 21, "warp": 21, "empti": 21, "kepler": 21, "plu": 21, "filter_mod": 21, "address_mod": 21, "clamp": 21, "mirror": 21, "axi": 21, "normalized_coordin": 21, "emtpi": 21, "get_local_s": 21, "satisfi": 21, "000001": 21, "ref": 21, "basinhop": 21, "bayes_opt": 21, "diff_evo": 21, "firefly_algorithm": 21, "genetic_algorithm": 21, "greedy_il": 21, "greedy_ml": 21, "ml": 21, "ordered_greedy_ml": 21, "pso": 21, "simulated_ann": 21, "sort": 21, "resourc": 21, "persist": 21, "consol": 21, "info": 21, "summar": 21, "store_result": 21, "results_filenam": 21, "typicali": 21, "percentag": 21, "create_device_target": 21, "header_filenam": 21, "header": 21, "target": 21, "dtarget_gpu": 21, "name_of_gpu": 21, "chosen": 21, "block_size_": 22, "grid_size_": 22, "compiler_opt_": 22, "loop_unroll_factor_": 22, "nvml_": 22, "nvml": 22, "nvmlobserv": 22}, "objects": {"kernel_tuner.backends.c": [[5, 0, 1, "", "CFunctions"]], "kernel_tuner.backends.c.CFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "cleanup_lib"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.backends.cupy": [[5, 0, 1, "", "CupyFunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.backends.hip": [[5, 0, 1, "", "HipFunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.backends.nvcuda": [[5, 0, 1, "", "CudaFunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.backends.opencl": [[5, 0, 1, "", "OpenCLFunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.backends.pycuda": [[5, 0, 1, "", "PyCudaFunctions"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "compile"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "kernel_finished"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "memcpy_htod"], [5, 1, 1, "", "memset"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"], [5, 1, 1, "", "start_event"], [5, 1, 1, "", "stop_event"], [5, 1, 1, "", "synchronize"]], "kernel_tuner.core": [[5, 0, 1, "", "DeviceInterface"]], "kernel_tuner.core.DeviceInterface": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "benchmark"], [5, 1, 1, "", "benchmark_continuous"], [5, 1, 1, "", "benchmark_default"], [5, 1, 1, "", "check_kernel_output"], [5, 1, 1, "", "compile_kernel"], [5, 1, 1, "", "copy_constant_memory_args"], [5, 1, 1, "", "copy_shared_memory_args"], [5, 1, 1, "", "copy_texture_memory_args"], [5, 1, 1, "", "create_kernel_instance"], [5, 1, 1, "", "get_environment"], [5, 1, 1, "", "memcpy_dtoh"], [5, 1, 1, "", "ready_argument_list"], [5, 1, 1, "", "run_kernel"]], "kernel_tuner": [[21, 2, 1, "", "create_device_targets"], [21, 2, 1, "", "run_kernel"], [21, 2, 1, "", "store_results"], [21, 2, 1, "", "tune_kernel"], [5, 3, 0, "-", "util"]], "kernel_tuner.observers": [[16, 0, 1, "", "BenchmarkObserver"]], "kernel_tuner.observers.BenchmarkObserver": [[16, 1, 1, "", "after_finish"], [16, 1, 1, "", "after_start"], [16, 1, 1, "", "before_start"], [16, 1, 1, "", "during"], [16, 1, 1, "", "get_results"], [16, 1, 1, "", "register_configuration"], [16, 1, 1, "", "register_device"]], "kernel_tuner.observers.nvml": [[16, 0, 1, "", "NVMLObserver"]], "kernel_tuner.observers.pmt": [[16, 0, 1, "", "PMTObserver"]], "kernel_tuner.observers.powersensor": [[16, 0, 1, "", "PowerSensorObserver"]], "kernel_tuner.runners.sequential": [[5, 0, 1, "", "SequentialRunner"]], "kernel_tuner.runners.sequential.SequentialRunner": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "run"]], "kernel_tuner.runners.simulation": [[5, 0, 1, "", "SimulationRunner"]], "kernel_tuner.runners.simulation.SimulationRunner": [[5, 1, 1, "", "__init__"], [5, 1, 1, "", "run"]], "kernel_tuner.strategies": [[17, 3, 0, "-", "basinhopping"], [17, 3, 0, "-", "bayes_opt"], [17, 3, 0, "-", "brute_force"], [5, 3, 0, "-", "common"], [17, 3, 0, "-", "diff_evo"], [17, 3, 0, "-", "dual_annealing"], [17, 3, 0, "-", "firefly_algorithm"], [17, 3, 0, "-", "genetic_algorithm"], [17, 3, 0, "-", "greedy_ils"], [17, 3, 0, "-", "greedy_mls"], [17, 3, 0, "-", "minimize"], [17, 3, 0, "-", "mls"], [17, 3, 0, "-", "ordered_greedy_mls"], [17, 3, 0, "-", "pso"], [17, 3, 0, "-", "random_sample"], [17, 3, 0, "-", "simulated_annealing"]], "kernel_tuner.strategies.basinhopping": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.bayes_opt": [[17, 2, 1, "", "generate_normalized_param_dicts"], [17, 2, 1, "", "normalize_parameter_space"], [17, 2, 1, "", "prune_parameter_space"], [17, 2, 1, "", "tune"]], "kernel_tuner.strategies.brute_force": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.common": [[5, 2, 1, "", "get_options"], [5, 2, 1, "", "get_strategy_docstring"], [5, 2, 1, "", "make_strategy_options_doc"], [5, 2, 1, "", "scale_from_params"], [5, 2, 1, "", "setup_method_arguments"], [5, 2, 1, "", "setup_method_options"], [5, 2, 1, "", "snap_to_nearest_config"], [5, 2, 1, "", "unscale_and_snap_to_nearest"]], "kernel_tuner.strategies.diff_evo": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.dual_annealing": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm": [[17, 0, 1, "", "Firefly"], [17, 2, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm.Firefly": [[17, 1, 1, "", "compute_intensity"], [17, 1, 1, "", "distance_to"], [17, 1, 1, "", "move_towards"]], "kernel_tuner.strategies.genetic_algorithm": [[17, 2, 1, "", "disruptive_uniform_crossover"], [17, 2, 1, "", "mutate"], [17, 2, 1, "", "single_point_crossover"], [17, 2, 1, "", "tune"], [17, 2, 1, "", "two_point_crossover"], [17, 2, 1, "", "uniform_crossover"], [17, 2, 1, "", "weighted_choice"]], "kernel_tuner.strategies.greedy_ils": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.greedy_mls": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.minimize": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.mls": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.ordered_greedy_mls": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.pso": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.random_sample": [[17, 2, 1, "", "tune"]], "kernel_tuner.strategies.simulated_annealing": [[17, 2, 1, "", "acceptance_prob"], [17, 2, 1, "", "neighbor"], [17, 2, 1, "", "tune"]], "kernel_tuner.util": [[5, 0, 1, "", "CompilationFailedConfig"], [5, 0, 1, "", "ErrorConfig"], [5, 0, 1, "", "InvalidConfig"], [5, 0, 1, "", "MaxProdConstraint"], [5, 0, 1, "", "NpEncoder"], [5, 0, 1, "", "RuntimeFailedConfig"], [5, 4, 1, "", "SkippableFailure"], [5, 4, 1, "", "StopCriterionReached"], [5, 2, 1, "", "check_argument_list"], [5, 2, 1, "", "check_argument_type"], [5, 2, 1, "", "check_restrictions"], [5, 2, 1, "", "check_stop_criterion"], [5, 2, 1, "", "check_thread_block_dimensions"], [5, 2, 1, "", "check_tune_params_list"], [5, 2, 1, "", "compile_restrictions"], [5, 2, 1, "", "config_valid"], [5, 2, 1, "", "convert_constraint_restriction"], [5, 2, 1, "", "cuda_error_check"], [5, 2, 1, "", "delete_temp_file"], [5, 2, 1, "", "detect_language"], [5, 2, 1, "", "dump_cache"], [5, 2, 1, "", "get_best_config"], [5, 2, 1, "", "get_config_string"], [5, 2, 1, "", "get_grid_dimensions"], [5, 2, 1, "", "get_instance_string"], [5, 2, 1, "", "get_kernel_string"], [5, 2, 1, "", "get_problem_size"], [5, 2, 1, "", "get_smem_args"], [5, 2, 1, "", "get_temp_filename"], [5, 2, 1, "", "get_thread_block_dimensions"], [5, 2, 1, "", "get_total_timings"], [5, 2, 1, "", "looks_like_a_filename"], [5, 2, 1, "", "normalize_verify_function"], [5, 2, 1, "", "parse_restrictions"], [5, 2, 1, "", "prepare_kernel_string"], [5, 2, 1, "", "print_config"], [5, 2, 1, "", "print_config_output"], [5, 2, 1, "", "process_cache"], [5, 2, 1, "", "process_metrics"], [5, 2, 1, "", "read_cache"], [5, 2, 1, "", "read_file"], [5, 2, 1, "", "replace_param_occurrences"], [5, 2, 1, "", "setup_block_and_grid"], [5, 2, 1, "", "store_cache"], [5, 2, 1, "", "write_file"]], "kernel_tuner.util.NpEncoder": [[5, 1, 1, "", "default"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:function", "3": "py:module", "4": "py:exception"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "function", "Python function"], "3": ["py", "module", "Python module"], "4": ["py", "exception", "Python exception"]}, "titleterms": {"cach": 0, "file": 0, "The": [1, 12], "kernel": [1, 6, 7, 8, 9, 10, 12, 14, 20], "tuner": [1, 6, 7, 8, 9, 10, 12], "document": [1, 2, 5, 12, 21], "guid": [1, 2, 13], "featur": 1, "refer": 1, "contribut": 2, "report": 2, "issu": 2, "code": [2, 6, 7, 8, 9, 11], "develop": 2, "setup": 2, "run": [2, 8], "test": [2, 3], "build": 2, "convolut": [3, 9], "2d": 3, "exampl": [3, 9, 12, 20], "implement": [3, 6, 7, 8], "tune": [3, 6, 7, 8, 10, 11, 14, 15, 16], "more": 3, "tunabl": 3, "paramet": [3, 8, 10, 16, 22], "correct": 4, "verif": 4, "design": 5, "strategi": [5, 17], "kernel_tun": [5, 17], "common": 5, "runner": 5, "sequenti": 5, "sequentialrunn": 5, "simulationrunn": 5, "devic": 5, "interfac": 5, "core": 5, "deviceinterfac": 5, "backend": [5, 20], "pycuda": [5, 13], "pycudafunct": 5, "cupi": 5, "cupyfunct": 5, "nvcuda": 5, "cudafunct": 5, "opencl": [5, 13], "openclfunct": 5, "c": [5, 8], "cfunction": 5, "hip": [5, 13], "hipfunct": 5, "util": 5, "function": 5, "diffus": [6, 7, 8], "python": [6, 7, 8, 13], "comput": [6, 7, 8], "gpu": [6, 7, 8, 10], "auto": [6, 7, 8], "us": [6, 7, 8, 10, 14, 19], "share": [6, 7, 8, 14], "memori": [6, 7, 8, 14], "tile": [6, 7, 8], "store": [6, 7], "result": [6, 7], "tutori": [7, 8], "from": [7, 8], "physic": [7, 8], "local": 7, "best": 8, "product": 8, "vector": 9, "add": 9, "stencil": 9, "matrix": [9, 14], "multipl": [9, 14], "py": 9, "sepconv": 9, "convolution_correct": 9, "convolution_stream": 9, "reduct": 9, "spars": 9, "point": 9, "polygon": 9, "expdist": 9, "gener": 9, "3d": 10, "grid": 10, "let": 10, "": 10, "start": [10, 18], "cpu": 10, "move": 10, "optim": [10, 17], "host": 11, "number": 11, "stream": 11, "quick": 12, "instal": [12, 13], "usag": 12, "citat": 12, "packag": 13, "cuda": [13, 14], "pyopencl": 13, "pyhip": 13, "git": 13, "version": 13, "depend": 13, "naiv": 14, "increas": 14, "work": 14, "per": 14, "thread": 14, "metric": 15, "object": 15, "observ": 16, "powersensorobserv": 16, "nvmlobserv": 16, "execut": 16, "nvml": 16, "pmtobserv": 16, "basinhop": 17, "bayes_opt": 17, "brute_forc": 17, "diff_evo": 17, "dual_ann": 17, "firefly_algorithm": 17, "genetic_algorithm": 17, "greedy_il": 17, "greedy_ml": 17, "minim": 17, "ml": 17, "ordered_greedy_ml": 17, "pso": 17, "random_sampl": 17, "simulated_ann": 17, "get": 18, "struct": 19, "templat": 20, "select": 20, "api": 21, "vocabulari": 22}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "nbsphinx": 4, "sphinx": 60}, "alltitles": {"Cache files": [[0, "cache-files"]], "The Kernel Tuner documentation": [[1, "the-kernel-tuner-documentation"], [12, "the-kernel-tuner-documentation"]], "Kernel Tuner": [[1, null]], "Guides": [[1, null]], "Features": [[1, null]], "Reference": [[1, null]], "Contribution guide": [[2, "contribution-guide"]], "Reporting Issues": [[2, "reporting-issues"]], "Contributing Code": [[2, "contributing-code"]], "Development setup": [[2, "development-setup"]], "Running tests": [[2, "running-tests"]], "Building documentation": [[2, "building-documentation"]], "Convolution": [[3, "Convolution"], [9, "convolution"]], "2D Convolution example": [[3, "2D-Convolution-example"]], "Implement a test": [[3, "Implement-a-test"]], "Tuning 2D Convolution": [[3, "Tuning-2D-Convolution"]], "More tunable parameters": [[3, "More-tunable-parameters"]], "Correctness Verification": [[4, "correctness-verification"]], "Design documentation": [[5, "design-documentation"]], "Strategies": [[5, "strategies"]], "kernel_tuner.strategies.common": [[5, "module-kernel_tuner.strategies.common"]], "Runners": [[5, "runners"]], "kernel_tuner.runners.sequential.SequentialRunner": [[5, "kernel-tuner-runners-sequential-sequentialrunner"]], "kernel_tuner.runners.sequential.SimulationRunner": [[5, "kernel-tuner-runners-sequential-simulationrunner"]], "Device Interfaces": [[5, "device-interfaces"]], "kernel_tuner.core.DeviceInterface": [[5, "kernel-tuner-core-deviceinterface"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[5, "kernel-tuner-backends-pycuda-pycudafunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[5, "kernel-tuner-backends-cupy-cupyfunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[5, "kernel-tuner-backends-nvcuda-cudafunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[5, "kernel-tuner-backends-opencl-openclfunctions"]], "kernel_tuner.backends.c.CFunctions": [[5, "kernel-tuner-backends-c-cfunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[5, "kernel-tuner-backends-hip-hipfunctions"]], "Util Functions": [[5, "util-functions"]], "kernel_tuner.util": [[5, "module-kernel_tuner.util"]], "Diffusion": [[6, "Diffusion"], [6, "id1"], [7, "Diffusion"], [8, "Diffusion"]], "Python implementation": [[6, "Python-implementation"], [7, "Python-implementation"], [8, "Python-implementation"]], "Computing on the GPU": [[6, "Computing-on-the-GPU"], [7, "Computing-on-the-GPU"], [8, "Computing-on-the-GPU"]], "Auto-Tuning with the Kernel Tuner": [[6, "Auto-Tuning-with-the-Kernel-Tuner"], [7, "Auto-Tuning-with-the-Kernel-Tuner"], [8, "Auto-Tuning-with-the-Kernel-Tuner"]], "Using Shared Memory": [[6, "Using-Shared-Memory"]], "Tiling GPU Code": [[6, "Tiling-GPU-Code"], [7, "Tiling-GPU-Code"], [8, "Tiling-GPU-Code"]], "Storing the results": [[6, "Storing-the-results"], [7, "Storing-the-results"]], "Tutorial: From physics to tuned GPU kernels": [[7, "Tutorial:-From-physics-to-tuned-GPU-kernels"], [8, "Tutorial:-From-physics-to-tuned-GPU-kernels"]], "Using Shared (local) Memory": [[7, "Using-Shared-(local)-Memory"]], "Using shared memory": [[8, "Using-shared-memory"], [14, "Using-shared-memory"]], "Using the best parameters in a production run": [[8, "Using-the-best-parameters-in-a-production-run"]], "Python run": [[8, "Python-run"]], "C run": [[8, "C-run"]], "Kernel Tuner Examples": [[9, "kernel-tuner-examples"]], "Vector Add": [[9, "vector-add"]], "Stencil": [[9, "stencil"]], "Matrix Multiplication": [[9, "matrix-multiplication"]], "convolution.py": [[9, "convolution-py"]], "sepconv.py": [[9, "sepconv-py"]], "convolution_correct.py": [[9, "convolution-correct-py"]], "convolution_streams.py": [[9, "convolution-streams-py"]], "Reduction": [[9, "reduction"]], "Sparse Matrix Vector Multiplication": [[9, "sparse-matrix-vector-multiplication"]], "Point-in-Polygon": [[9, "point-in-polygon"]], "ExpDist": [[9, "expdist"]], "Code Generator": [[9, "code-generator"]], "3D Grid on GPU with Kernel Tuner": [[10, "3D-Grid-on-GPU-with-Kernel-Tuner"]], "Let\u2019s start on the CPU": [[10, "Let's-start-on-the-CPU"]], "Let\u2019s move to the GPU": [[10, "Let's-move-to-the-GPU"]], "Tune the kernel": [[10, "Tune-the-kernel"]], "Using the optimized parameters": [[10, "Using-the-optimized-parameters"]], "Tuning Host Code": [[11, "tuning-host-code"]], "Tuning the number of streams": [[11, "tuning-the-number-of-streams"]], "Quick install": [[12, "quick-install"]], "Example usage": [[12, "example-usage"]], "Citation": [[12, "citation"]], "Installation": [[13, "installation"]], "Python": [[13, "python"]], "Installing Python Packages": [[13, "installing-python-packages"]], "CUDA and PyCUDA": [[13, "cuda-and-pycuda"]], "OpenCL and PyOpenCL": [[13, "opencl-and-pyopencl"]], "HIP and PyHIP": [[13, "hip-and-pyhip"]], "Installing the git version": [[13, "installing-the-git-version"]], "Dependencies for the guides": [[13, "dependencies-for-the-guides"]], "Matrix multiplication": [[14, "Matrix-multiplication"]], "Naive CUDA kernel": [[14, "Naive-CUDA-kernel"]], "Tuning a naive kernel": [[14, "Tuning-a-naive-kernel"]], "Increase work per thread": [[14, "Increase-work-per-thread"]], "Metrics and Objectives": [[15, "metrics-and-objectives"]], "Metrics": [[15, "metrics"]], "Tuning Objectives": [[15, "tuning-objectives"]], "Observers": [[16, "observers"]], "PowerSensorObserver": [[16, "powersensorobserver"]], "NVMLObserver": [[16, "nvmlobserver"]], "Tuning execution parameters with NVML": [[16, "tuning-execution-parameters-with-nvml"]], "PMTObserver": [[16, "pmtobserver"]], "Optimization strategies": [[17, "optimization-strategies"]], "kernel_tuner.strategies.basinhopping": [[17, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[17, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[17, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.diff_evo": [[17, "module-kernel_tuner.strategies.diff_evo"]], "kernel_tuner.strategies.dual_annealing": [[17, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[17, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[17, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[17, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[17, "module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[17, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[17, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[17, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[17, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[17, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[17, "module-kernel_tuner.strategies.simulated_annealing"]], "Getting Started": [[18, "getting-started"]], "Using structs": [[19, "using-structs"]], "Templated kernels": [[20, "templated-kernels"]], "Example": [[20, "example"]], "Selecting a backend": [[20, "selecting-a-backend"]], "API Documentation": [[21, "api-documentation"]], "Parameter Vocabulary": [[22, "parameter-vocabulary"]]}, "indexentries": {"cfunctions (class in kernel_tuner.backends.c)": [[5, "kernel_tuner.backends.c.CFunctions"]], "compilationfailedconfig (class in kernel_tuner.util)": [[5, "kernel_tuner.util.CompilationFailedConfig"]], "cudafunctions (class in kernel_tuner.backends.nvcuda)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions"]], "cupyfunctions (class in kernel_tuner.backends.cupy)": [[5, "kernel_tuner.backends.cupy.CupyFunctions"]], "deviceinterface (class in kernel_tuner.core)": [[5, "kernel_tuner.core.DeviceInterface"]], "errorconfig (class in kernel_tuner.util)": [[5, "kernel_tuner.util.ErrorConfig"]], "hipfunctions (class in kernel_tuner.backends.hip)": [[5, "kernel_tuner.backends.hip.HipFunctions"]], "invalidconfig (class in kernel_tuner.util)": [[5, "kernel_tuner.util.InvalidConfig"]], "maxprodconstraint (class in kernel_tuner.util)": [[5, "kernel_tuner.util.MaxProdConstraint"]], "npencoder (class in kernel_tuner.util)": [[5, "kernel_tuner.util.NpEncoder"]], "openclfunctions (class in kernel_tuner.backends.opencl)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions"]], "pycudafunctions (class in kernel_tuner.backends.pycuda)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions"]], "runtimefailedconfig (class in kernel_tuner.util)": [[5, "kernel_tuner.util.RuntimeFailedConfig"]], "sequentialrunner (class in kernel_tuner.runners.sequential)": [[5, "kernel_tuner.runners.sequential.SequentialRunner"]], "simulationrunner (class in kernel_tuner.runners.simulation)": [[5, "kernel_tuner.runners.simulation.SimulationRunner"]], "skippablefailure": [[5, "kernel_tuner.util.SkippableFailure"]], "stopcriterionreached": [[5, "kernel_tuner.util.StopCriterionReached"]], "__init__() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.__init__"]], "__init__() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.__init__"]], "__init__() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.__init__"]], "__init__() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.__init__"]], "__init__() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.__init__"]], "__init__() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.__init__"]], "__init__() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.__init__"]], "__init__() (kernel_tuner.runners.sequential.sequentialrunner method)": [[5, "kernel_tuner.runners.sequential.SequentialRunner.__init__"]], "__init__() (kernel_tuner.runners.simulation.simulationrunner method)": [[5, "kernel_tuner.runners.simulation.SimulationRunner.__init__"]], "benchmark() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.benchmark"]], "benchmark_continuous() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.benchmark_continuous"]], "benchmark_default() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.benchmark_default"]], "check_argument_list() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_argument_list"]], "check_argument_type() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_argument_type"]], "check_kernel_output() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.check_kernel_output"]], "check_restrictions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_restrictions"]], "check_stop_criterion() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_stop_criterion"]], "check_thread_block_dimensions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_thread_block_dimensions"]], "check_tune_params_list() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.check_tune_params_list"]], "cleanup_lib() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.cleanup_lib"]], "compile() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.compile"]], "compile() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.compile"]], "compile() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.compile"]], "compile() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.compile"]], "compile() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.compile"]], "compile() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.compile"]], "compile_kernel() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.compile_kernel"]], "compile_restrictions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.compile_restrictions"]], "config_valid() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.config_valid"]], "convert_constraint_restriction() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.convert_constraint_restriction"]], "copy_constant_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.copy_constant_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.copy_shared_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.copy_texture_memory_args"]], "create_kernel_instance() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.create_kernel_instance"]], "cuda_error_check() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.cuda_error_check"]], "default() (kernel_tuner.util.npencoder method)": [[5, "kernel_tuner.util.NpEncoder.default"]], "delete_temp_file() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.delete_temp_file"]], "detect_language() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.detect_language"]], "dump_cache() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.dump_cache"]], "get_best_config() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_best_config"]], "get_config_string() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_config_string"]], "get_environment() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.get_environment"]], "get_grid_dimensions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_grid_dimensions"]], "get_instance_string() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_instance_string"]], "get_kernel_string() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_kernel_string"]], "get_options() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.get_options"]], "get_problem_size() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_problem_size"]], "get_smem_args() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_smem_args"]], "get_strategy_docstring() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.get_strategy_docstring"]], "get_temp_filename() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_temp_filename"]], "get_thread_block_dimensions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_thread_block_dimensions"]], "get_total_timings() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.get_total_timings"]], "kernel_finished() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.kernel_finished"]], "kernel_tuner.strategies.common": [[5, "module-kernel_tuner.strategies.common"]], "kernel_tuner.util": [[5, "module-kernel_tuner.util"]], "looks_like_a_filename() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.looks_like_a_filename"]], "make_strategy_options_doc() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.make_strategy_options_doc"]], "memcpy_dtoh() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.memcpy_dtoh"]], "memcpy_htod() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_htod"]], "memset() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.memset"]], "memset() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.memset"]], "memset() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.memset"]], "memset() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.memset"]], "memset() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.memset"]], "memset() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.memset"]], "module": [[5, "module-kernel_tuner.strategies.common"], [5, "module-kernel_tuner.util"], [17, "module-kernel_tuner.strategies.basinhopping"], [17, "module-kernel_tuner.strategies.bayes_opt"], [17, "module-kernel_tuner.strategies.brute_force"], [17, "module-kernel_tuner.strategies.diff_evo"], [17, "module-kernel_tuner.strategies.dual_annealing"], [17, "module-kernel_tuner.strategies.firefly_algorithm"], [17, "module-kernel_tuner.strategies.genetic_algorithm"], [17, "module-kernel_tuner.strategies.greedy_ils"], [17, "module-kernel_tuner.strategies.greedy_mls"], [17, "module-kernel_tuner.strategies.minimize"], [17, "module-kernel_tuner.strategies.mls"], [17, "module-kernel_tuner.strategies.ordered_greedy_mls"], [17, "module-kernel_tuner.strategies.pso"], [17, "module-kernel_tuner.strategies.random_sample"], [17, "module-kernel_tuner.strategies.simulated_annealing"]], "normalize_verify_function() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.normalize_verify_function"]], "parse_restrictions() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.parse_restrictions"]], "prepare_kernel_string() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.prepare_kernel_string"]], "print_config() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.print_config"]], "print_config_output() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.print_config_output"]], "process_cache() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.process_cache"]], "process_metrics() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.process_metrics"]], "read_cache() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.read_cache"]], "read_file() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.read_file"]], "ready_argument_list() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.ready_argument_list"]], "replace_param_occurrences() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.replace_param_occurrences"]], "run() (kernel_tuner.runners.sequential.sequentialrunner method)": [[5, "kernel_tuner.runners.sequential.SequentialRunner.run"]], "run() (kernel_tuner.runners.simulation.simulationrunner method)": [[5, "kernel_tuner.runners.simulation.SimulationRunner.run"]], "run_kernel() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.run_kernel"]], "run_kernel() (kernel_tuner.core.deviceinterface method)": [[5, "kernel_tuner.core.DeviceInterface.run_kernel"]], "scale_from_params() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.scale_from_params"]], "setup_block_and_grid() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.setup_block_and_grid"]], "setup_method_arguments() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.setup_method_arguments"]], "setup_method_options() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.setup_method_options"]], "snap_to_nearest_config() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.snap_to_nearest_config"]], "start_event() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.start_event"]], "start_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.start_event"]], "start_event() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.start_event"]], "start_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.start_event"]], "start_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.start_event"]], "start_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.start_event"]], "stop_event() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.stop_event"]], "store_cache() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.store_cache"]], "synchronize() (kernel_tuner.backends.c.cfunctions method)": [[5, "kernel_tuner.backends.c.CFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.cupy.cupyfunctions method)": [[5, "kernel_tuner.backends.cupy.CupyFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.hip.hipfunctions method)": [[5, "kernel_tuner.backends.hip.HipFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[5, "kernel_tuner.backends.nvcuda.CudaFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.opencl.openclfunctions method)": [[5, "kernel_tuner.backends.opencl.OpenCLFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[5, "kernel_tuner.backends.pycuda.PyCudaFunctions.synchronize"]], "unscale_and_snap_to_nearest() (in module kernel_tuner.strategies.common)": [[5, "kernel_tuner.strategies.common.unscale_and_snap_to_nearest"]], "write_file() (in module kernel_tuner.util)": [[5, "kernel_tuner.util.write_file"]], "benchmarkobserver (class in kernel_tuner.observers)": [[16, "kernel_tuner.observers.BenchmarkObserver"]], "nvmlobserver (class in kernel_tuner.observers.nvml)": [[16, "kernel_tuner.observers.nvml.NVMLObserver"]], "pmtobserver (class in kernel_tuner.observers.pmt)": [[16, "kernel_tuner.observers.pmt.PMTObserver"]], "powersensorobserver (class in kernel_tuner.observers.powersensor)": [[16, "kernel_tuner.observers.powersensor.PowerSensorObserver"]], "after_finish() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.after_finish"]], "after_start() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.after_start"]], "before_start() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.before_start"]], "during() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.during"]], "get_results() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.get_results"]], "register_configuration() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.register_configuration"]], "register_device() (kernel_tuner.observers.benchmarkobserver method)": [[16, "kernel_tuner.observers.BenchmarkObserver.register_device"]], "firefly (class in kernel_tuner.strategies.firefly_algorithm)": [[17, "kernel_tuner.strategies.firefly_algorithm.Firefly"]], "acceptance_prob() (in module kernel_tuner.strategies.simulated_annealing)": [[17, "kernel_tuner.strategies.simulated_annealing.acceptance_prob"]], "compute_intensity() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[17, "kernel_tuner.strategies.firefly_algorithm.Firefly.compute_intensity"]], "disruptive_uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.disruptive_uniform_crossover"]], "distance_to() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[17, "kernel_tuner.strategies.firefly_algorithm.Firefly.distance_to"]], "generate_normalized_param_dicts() (in module kernel_tuner.strategies.bayes_opt)": [[17, "kernel_tuner.strategies.bayes_opt.generate_normalized_param_dicts"]], "kernel_tuner.strategies.basinhopping": [[17, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[17, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[17, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.diff_evo": [[17, "module-kernel_tuner.strategies.diff_evo"]], "kernel_tuner.strategies.dual_annealing": [[17, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[17, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[17, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[17, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[17, "module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[17, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[17, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[17, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[17, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[17, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[17, "module-kernel_tuner.strategies.simulated_annealing"]], "move_towards() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[17, "kernel_tuner.strategies.firefly_algorithm.Firefly.move_towards"]], "mutate() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.mutate"]], "neighbor() (in module kernel_tuner.strategies.simulated_annealing)": [[17, "kernel_tuner.strategies.simulated_annealing.neighbor"]], "normalize_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[17, "kernel_tuner.strategies.bayes_opt.normalize_parameter_space"]], "prune_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[17, "kernel_tuner.strategies.bayes_opt.prune_parameter_space"]], "single_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.single_point_crossover"]], "tune() (in module kernel_tuner.strategies.basinhopping)": [[17, "kernel_tuner.strategies.basinhopping.tune"]], "tune() (in module kernel_tuner.strategies.bayes_opt)": [[17, "kernel_tuner.strategies.bayes_opt.tune"]], "tune() (in module kernel_tuner.strategies.brute_force)": [[17, "kernel_tuner.strategies.brute_force.tune"]], "tune() (in module kernel_tuner.strategies.diff_evo)": [[17, "kernel_tuner.strategies.diff_evo.tune"]], "tune() (in module kernel_tuner.strategies.dual_annealing)": [[17, "kernel_tuner.strategies.dual_annealing.tune"]], "tune() (in module kernel_tuner.strategies.firefly_algorithm)": [[17, "kernel_tuner.strategies.firefly_algorithm.tune"]], "tune() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.tune"]], "tune() (in module kernel_tuner.strategies.greedy_ils)": [[17, "kernel_tuner.strategies.greedy_ils.tune"]], "tune() (in module kernel_tuner.strategies.greedy_mls)": [[17, "kernel_tuner.strategies.greedy_mls.tune"]], "tune() (in module kernel_tuner.strategies.minimize)": [[17, "kernel_tuner.strategies.minimize.tune"]], "tune() (in module kernel_tuner.strategies.mls)": [[17, "kernel_tuner.strategies.mls.tune"]], "tune() (in module kernel_tuner.strategies.ordered_greedy_mls)": [[17, "kernel_tuner.strategies.ordered_greedy_mls.tune"]], "tune() (in module kernel_tuner.strategies.pso)": [[17, "kernel_tuner.strategies.pso.tune"]], "tune() (in module kernel_tuner.strategies.random_sample)": [[17, "kernel_tuner.strategies.random_sample.tune"]], "tune() (in module kernel_tuner.strategies.simulated_annealing)": [[17, "kernel_tuner.strategies.simulated_annealing.tune"]], "two_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.two_point_crossover"]], "uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.uniform_crossover"]], "weighted_choice() (in module kernel_tuner.strategies.genetic_algorithm)": [[17, "kernel_tuner.strategies.genetic_algorithm.weighted_choice"]], "create_device_targets() (in module kernel_tuner)": [[21, "kernel_tuner.create_device_targets"]], "run_kernel() (in module kernel_tuner)": [[21, "kernel_tuner.run_kernel"]], "store_results() (in module kernel_tuner)": [[21, "kernel_tuner.store_results"]], "tune_kernel() (in module kernel_tuner)": [[21, "kernel_tuner.tune_kernel"]]}}) \ No newline at end of file