Merge pull request #25 from firedrakeproject/dham/check_upstream

Dham/check upstream
firedrakeproject · Jul 11, 2024 · 87c1cd8
2 parents 8600e53 + 7ac42bf
commit 87c1cd8
Show file tree

Hide file tree

Showing 125 changed files with 2,454 additions and 1,766 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -8,20 +8,17 @@ on:
         - cron:  '17 3 * * 0'
 
 jobs:
-    flake8:
-        name: Flake8
+    ruff:
+        name: Ruff
         runs-on: ubuntu-latest
         steps:
         -   uses: actions/checkout@v4
-        -
-            uses: actions/setup-python@v5
             with:
-                # matches compat target in setup.py
-                python-version: '3.8'
+                submodules: true
         -   name: "Main Script"
             run: |
-                curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh
-                . ./prepare-and-run-flake8.sh "$(basename $GITHUB_REPOSITORY)" ./test examples proto-tests contrib
+                pipx install ruff
+                ruff check
 
     pylint:
         name: Pylint

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -147,12 +147,12 @@ Documentation:
   tags:
   - python3
 
-Flake8:
+Ruff:
   script:
-  - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh
-  - . ./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test examples
+  - pipx install ruff
+  - ruff check
   tags:
-  - python3
+  - docker-runner
   except:
   - tags
 

diff --git a/.gitmodules b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "loopy/target/c/compyte"]
 	path = loopy/target/c/compyte
-	url = https://github.com/inducer/compyte
+	url = https://github.com/inducer/compyte.git
diff --git a/contrib/c-integer-semantics.py b/contrib/c-integer-semantics.py
@@ -1,7 +1,8 @@
 #!/usr/bin/env python
 
-from os import system
 import ctypes
+from os import system
+
 
 C_SRC = """
 #include <stdlib.h>
@@ -139,7 +140,7 @@ def main():
             if cresult != presult:
                 print(a, b, cresult, presult)
 
-    #print(int_mod(552, -918), 552 % -918)
+    # print(int_mod(552, -918), 552 % -918)
     print(cmod(23, -11), 23 % -11)
 
 

diff --git a/contrib/mem-pattern-explorer/pattern_vis.py b/contrib/mem-pattern-explorer/pattern_vis.py
@@ -1,5 +1,6 @@
 import numpy as np
 
+
 # Inspired by a visualization used in the Halide tutorial
 # https://www.youtube.com/watch?v=3uiEyEKji0M
 
@@ -9,8 +10,8 @@ def div_ceil(nr, dr):
 
 
 def product(iterable):
-    from operator import mul
     from functools import reduce
+    from operator import mul
     return reduce(mul, iterable, 1)
 
 
@@ -42,8 +43,8 @@ def nsubgroups(self):
         return div_ceil(product(self.lsize), self.subgroup_size)
 
     def animate(self, f, interval=200):
-        import matplotlib.pyplot as plt
         import matplotlib.animation as animation
+        import matplotlib.pyplot as plt
 
         fig = plt.figure()
 

diff --git a/doc/conf.py b/doc/conf.py
@@ -1,6 +1,7 @@
 import os
 from urllib.request import urlopen
 
+
 _conf_url = "https://raw.githubusercontent.com/inducer/sphinxconfig/main/sphinxconfig.py"  # noqa
 with urlopen(_conf_url) as _inf:
     exec(compile(_inf.read(), _conf_url, "exec"), globals())
@@ -31,7 +32,6 @@
         "pyopencl": ("https://documen.tician.de/pyopencl", None),
         "cgen": ("https://documen.tician.de/cgen", None),
         "pymbolic": ("https://documen.tician.de/pymbolic", None),
-        "pytools": ("https://documen.tician.de/pytools", None),
         "pyrsistent": ("https://pyrsistent.readthedocs.io/en/latest/", None),
         }
 
@@ -45,6 +45,8 @@
 # this needs a setting of the same name across all packages involved, that's
 # why this name is as global-sounding as it is.
 import sys
+
+
 sys._BUILDING_SPHINX_DOCS = True
 
 nitpicky = True

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
@@ -242,6 +242,7 @@ inspect that code, too, using :attr:`loopy.Options.write_wrapper`:
         if n is None:
             if a is not None:
                 n = a.shape[0]
+    <BLANKLINE>
             elif out is not None:
                 n = out.shape[0]
     <BLANKLINE>
@@ -505,7 +506,7 @@ ambiguous.
     ...
       for (int j = 0; j <= -1 + n; ++j)
         for (int i = 0; i <= -1 + n; ++i)
-          a[n * i + j] = 0.0f;
+          a[n * i + j] = (float) (0.0f);
     ...
 
 No more warnings! Loop nesting is also reflected in the dependency graph:
@@ -563,7 +564,7 @@ Consider this example:
     ...
       for (int i_outer = 0; i_outer <= -1 + (15 + n) / 16; ++i_outer)
         for (int i_inner = 0; i_inner <= ((-17 + n + -16 * i_outer >= 0) ? 15 : -1 + n + -16 * i_outer); ++i_inner)
-          a[16 * i_outer + i_inner] = 0.0f;
+          a[16 * i_outer + i_inner] = (float) (0.0f);
     ...
 
 By default, the new, split inames are named *OLD_outer* and *OLD_inner*,
@@ -594,7 +595,7 @@ relation to loop nesting. For example, it's perfectly possible to request
     ...
       for (int i_inner = 0; i_inner <= ((-17 + n >= 0) ? 15 : -1 + n); ++i_inner)
         for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + (15 + n + 15 * i_inner) / 16; ++i_outer)
-          a[16 * i_outer + i_inner] = 0.0f;
+          a[16 * i_outer + i_inner] = (float) (0.0f);
     ...
 
 Notice how loopy has automatically generated guard conditionals to make
@@ -662,10 +663,10 @@ loop's tag to ``"unr"``:
     ...
       for (int i_outer = 0; i_outer <= loopy_floor_div_pos_b_int32(-4 + n, 4); ++i_outer)
       {
-        a[4 * i_outer] = 0.0f;
-        a[1 + 4 * i_outer] = 0.0f;
-        a[2 + 4 * i_outer] = 0.0f;
-        a[3 + 4 * i_outer] = 0.0f;
+        a[4 * i_outer] = (float) (0.0f);
+        a[1 + 4 * i_outer] = (float) (0.0f);
+        a[2 + 4 * i_outer] = (float) (0.0f);
+        a[3 + 4 * i_outer] = (float) (0.0f);
       }
     ...
 
@@ -737,7 +738,7 @@ Let's try this out on our vector fill kernel by creating workgroups of size
     __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *__restrict__ a, int const n)
     {
       if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0)
-        a[128 * gid(0) + lid(0)] = 0.0f;
+        a[128 * gid(0) + lid(0)] = (float) (0.0f);
     }
 
 Loopy requires that workgroup sizes are fixed and constant at compile time.
@@ -782,13 +783,13 @@ assumption:
     ...
       for (int i_outer = 0; i_outer <= -1 + (3 + n) / 4; ++i_outer)
       {
-        a[4 * i_outer] = 0.0f;
+        a[4 * i_outer] = (float) (0.0f);
         if (-2 + -4 * i_outer + n >= 0)
-          a[1 + 4 * i_outer] = 0.0f;
+          a[1 + 4 * i_outer] = (float) (0.0f);
         if (-3 + -4 * i_outer + n >= 0)
-          a[2 + 4 * i_outer] = 0.0f;
+          a[2 + 4 * i_outer] = (float) (0.0f);
         if (-4 + -4 * i_outer + n >= 0)
-          a[3 + 4 * i_outer] = 0.0f;
+          a[3 + 4 * i_outer] = (float) (0.0f);
       }
     ...
 
@@ -812,24 +813,24 @@ enabling some cost savings:
       /* bulk slab for 'i_outer' */
       for (int i_outer = 0; i_outer <= -2 + (3 + n) / 4; ++i_outer)
       {
-        a[4 * i_outer] = 0.0f;
-        a[1 + 4 * i_outer] = 0.0f;
-        a[2 + 4 * i_outer] = 0.0f;
-        a[3 + 4 * i_outer] = 0.0f;
+        a[4 * i_outer] = (float) (0.0f);
+        a[1 + 4 * i_outer] = (float) (0.0f);
+        a[2 + 4 * i_outer] = (float) (0.0f);
+        a[3 + 4 * i_outer] = (float) (0.0f);
       }
       /* final slab for 'i_outer' */
       {
         int const i_outer = -1 + n + -1 * ((3 * n) / 4);
     <BLANKLINE>
         if (-1 + n >= 0)
         {
-          a[4 * i_outer] = 0.0f;
+          a[4 * i_outer] = (float) (0.0f);
           if (-2 + -4 * i_outer + n >= 0)
-            a[1 + 4 * i_outer] = 0.0f;
+            a[1 + 4 * i_outer] = (float) (0.0f);
           if (-3 + -4 * i_outer + n >= 0)
-            a[2 + 4 * i_outer] = 0.0f;
+            a[2 + 4 * i_outer] = (float) (0.0f);
           if (4 + 4 * i_outer + -1 * n == 0)
-            a[3 + 4 * i_outer] = 0.0f;
+            a[3 + 4 * i_outer] = (float) (0.0f);
         }
       }
     ...
@@ -1630,7 +1631,6 @@ together into keys containing only the specified fields:
     >>> op_map_dtype = op_map.group_by('dtype')
     >>> print(op_map_dtype)
     Op(np:dtype('float32'), None, None): ...
-    <BLANKLINE>
     >>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32)
     ...                           ].eval_with_dict(param_dict)
     >>> print(f32op_count)
@@ -1656,7 +1656,6 @@ we'll continue using the kernel from the previous example:
     >>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32)
     >>> print(mem_map)
     MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, 'stats_knl'): ...
-    <BLANKLINE>
 
 Each line of output will look roughly like::
 
@@ -1727,13 +1726,11 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
     >>> bytes_map = mem_map.to_bytes()
     >>> print(bytes_map)
     MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, 'stats_knl'): ...
-    <BLANKLINE>
     >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global']
     ...                                         ).group_by('direction')
     >>> print(global_ld_st_bytes)
     MemAccess(None, None, None, None, load, None, None, None, None): ...
     MemAccess(None, None, None, None, store, None, None, None, None): ...
-    <BLANKLINE>
     >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load')
     ...                            ].eval_with_dict(param_dict)
     >>> stored = global_ld_st_bytes[lp.MemAccess(direction='store')
@@ -1775,7 +1772,6 @@ this time.
     MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem, 'stats_knl'): ...
     MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem, 'stats_knl'): ...
     MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem, 'stats_knl'): ...
-    <BLANKLINE>
 
 With this parallelization, consecutive work-items will access consecutive array
 elements in memory. The polynomials are a bit more complicated now due to the
@@ -1820,7 +1816,6 @@ we'll switch the inner and outer tags in our parallelization of the kernel:
     MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem, 'stats_knl'): ...
     MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem, 'stats_knl'): ...
     MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem, 'stats_knl'): ...
-    <BLANKLINE>
 
 With this parallelization, consecutive work-items will access *nonconsecutive*
 array elements in memory. The total number of array accesses still has not
@@ -1873,7 +1868,6 @@ kernel from the previous example:
     >>> sync_map = lp.get_synchronization_map(knl)
     >>> print(sync_map)
     Sync(kernel_launch, stats_knl): [l, m, n] -> { 1 }
-    <BLANKLINE>
 
 We can evaluate this polynomial using :meth:`islpy.PwQPolynomial.eval_with_dict`:
 
@@ -1934,7 +1928,6 @@ count the barriers using :func:`loopy.get_synchronization_map`:
     >>> print(sync_map)
     Sync(barrier_local, loopy_kernel): { 1000 }
     Sync(kernel_launch, loopy_kernel): { 1 }
-    <BLANKLINE>
 
 Based on the kernel code printed above, we would expect each work-item to
 encounter 50x10x2 barriers, which matches the result from

diff --git a/examples/fortran/matmul-driver.py b/examples/fortran/matmul-driver.py
@@ -1,8 +1,10 @@
 import numpy as np
 import numpy.linalg as la
+
 import pyopencl as cl
 import pyopencl.array
 import pyopencl.clrandom
+
 import loopy as lp
 
 

diff --git a/examples/python/call-external.py b/examples/python/call-external.py
@@ -1,5 +1,6 @@
-import loopy as lp
 import numpy as np
+
+import loopy as lp
 from loopy.diagnostic import LoopyError
 from loopy.target.c import CTarget
 from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa: F401

diff --git a/examples/python/find-centers.py b/examples/python/find-centers.py
@@ -1,9 +1,11 @@
 import numpy as np
+
 import pyopencl as cl
 
 import loopy as lp
 from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa: F401
 
+
 cl_ctx = cl.create_some_context()
 
 knl = lp.make_kernel(

diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py
@@ -1,7 +1,9 @@
 import numpy as np
+
 import loopy as lp
 from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa: F401
 
+
 knl = lp.make_kernel(
         "{ [i,k]: 0<=i<n and 0<=k<3 }",
         """
@@ -20,9 +22,13 @@
 
 # schedule
 from loopy.preprocess import preprocess_kernel
+
+
 knl = preprocess_kernel(knl)
 
 from loopy.schedule import get_one_linearized_kernel
+
+
 knl = knl.with_kernel(get_one_linearized_kernel(knl["loopy_kernel"],
                                                 knl.callables_table))
 

diff --git a/examples/python/hello-loopy.py b/examples/python/hello-loopy.py
@@ -1,9 +1,12 @@
 import numpy as np
-import loopy as lp
+
 import pyopencl as cl
 import pyopencl.array
+
+import loopy as lp
 from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa: F401
 
+
 # setup
 # -----
 ctx = cl.create_some_context()