Skip to content

Commit

Permalink
Merge pull request #25 from firedrakeproject/dham/check_upstream
Browse files Browse the repository at this point in the history
Dham/check upstream
  • Loading branch information
dham authored Jul 11, 2024
2 parents 8600e53 + 7ac42bf commit 87c1cd8
Show file tree
Hide file tree
Showing 125 changed files with 2,454 additions and 1,766 deletions.
13 changes: 5 additions & 8 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,17 @@ on:
- cron: '17 3 * * 0'

jobs:
flake8:
name: Flake8
ruff:
name: Ruff
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
-
uses: actions/setup-python@v5
with:
# matches compat target in setup.py
python-version: '3.8'
submodules: true
- name: "Main Script"
run: |
curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh
. ./prepare-and-run-flake8.sh "$(basename $GITHUB_REPOSITORY)" ./test examples proto-tests contrib
pipx install ruff
ruff check
pylint:
name: Pylint
Expand Down
8 changes: 4 additions & 4 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -147,12 +147,12 @@ Documentation:
tags:
- python3

Flake8:
Ruff:
script:
- curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh
- . ./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test examples
- pipx install ruff
- ruff check
tags:
- python3
- docker-runner
except:
- tags

Expand Down
2 changes: 1 addition & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[submodule "loopy/target/c/compyte"]
path = loopy/target/c/compyte
url = https://github.com/inducer/compyte
url = https://github.com/inducer/compyte.git
5 changes: 3 additions & 2 deletions contrib/c-integer-semantics.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#!/usr/bin/env python

from os import system
import ctypes
from os import system


C_SRC = """
#include <stdlib.h>
Expand Down Expand Up @@ -139,7 +140,7 @@ def main():
if cresult != presult:
print(a, b, cresult, presult)

#print(int_mod(552, -918), 552 % -918)
# print(int_mod(552, -918), 552 % -918)
print(cmod(23, -11), 23 % -11)


Expand Down
5 changes: 3 additions & 2 deletions contrib/mem-pattern-explorer/pattern_vis.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np


# Inspired by a visualization used in the Halide tutorial
# https://www.youtube.com/watch?v=3uiEyEKji0M

Expand All @@ -9,8 +10,8 @@ def div_ceil(nr, dr):


def product(iterable):
from operator import mul
from functools import reduce
from operator import mul
return reduce(mul, iterable, 1)


Expand Down Expand Up @@ -42,8 +43,8 @@ def nsubgroups(self):
return div_ceil(product(self.lsize), self.subgroup_size)

def animate(self, f, interval=200):
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import matplotlib.pyplot as plt

fig = plt.figure()

Expand Down
4 changes: 3 additions & 1 deletion doc/conf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
from urllib.request import urlopen


_conf_url = "https://raw.githubusercontent.com/inducer/sphinxconfig/main/sphinxconfig.py" # noqa
with urlopen(_conf_url) as _inf:
exec(compile(_inf.read(), _conf_url, "exec"), globals())
Expand Down Expand Up @@ -31,7 +32,6 @@
"pyopencl": ("https://documen.tician.de/pyopencl", None),
"cgen": ("https://documen.tician.de/cgen", None),
"pymbolic": ("https://documen.tician.de/pymbolic", None),
"pytools": ("https://documen.tician.de/pytools", None),
"pyrsistent": ("https://pyrsistent.readthedocs.io/en/latest/", None),
}

Expand All @@ -45,6 +45,8 @@
# this needs a setting of the same name across all packages involved, that's
# why this name is as global-sounding as it is.
import sys


sys._BUILDING_SPHINX_DOCS = True

nitpicky = True
Expand Down
49 changes: 21 additions & 28 deletions doc/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ inspect that code, too, using :attr:`loopy.Options.write_wrapper`:
if n is None:
if a is not None:
n = a.shape[0]
<BLANKLINE>
elif out is not None:
n = out.shape[0]
<BLANKLINE>
Expand Down Expand Up @@ -505,7 +506,7 @@ ambiguous.
...
for (int j = 0; j <= -1 + n; ++j)
for (int i = 0; i <= -1 + n; ++i)
a[n * i + j] = 0.0f;
a[n * i + j] = (float) (0.0f);
...

No more warnings! Loop nesting is also reflected in the dependency graph:
Expand Down Expand Up @@ -563,7 +564,7 @@ Consider this example:
...
for (int i_outer = 0; i_outer <= -1 + (15 + n) / 16; ++i_outer)
for (int i_inner = 0; i_inner <= ((-17 + n + -16 * i_outer >= 0) ? 15 : -1 + n + -16 * i_outer); ++i_inner)
a[16 * i_outer + i_inner] = 0.0f;
a[16 * i_outer + i_inner] = (float) (0.0f);
...

By default, the new, split inames are named *OLD_outer* and *OLD_inner*,
Expand Down Expand Up @@ -594,7 +595,7 @@ relation to loop nesting. For example, it's perfectly possible to request
...
for (int i_inner = 0; i_inner <= ((-17 + n >= 0) ? 15 : -1 + n); ++i_inner)
for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + (15 + n + 15 * i_inner) / 16; ++i_outer)
a[16 * i_outer + i_inner] = 0.0f;
a[16 * i_outer + i_inner] = (float) (0.0f);
...

Notice how loopy has automatically generated guard conditionals to make
Expand Down Expand Up @@ -662,10 +663,10 @@ loop's tag to ``"unr"``:
...
for (int i_outer = 0; i_outer <= loopy_floor_div_pos_b_int32(-4 + n, 4); ++i_outer)
{
a[4 * i_outer] = 0.0f;
a[1 + 4 * i_outer] = 0.0f;
a[2 + 4 * i_outer] = 0.0f;
a[3 + 4 * i_outer] = 0.0f;
a[4 * i_outer] = (float) (0.0f);
a[1 + 4 * i_outer] = (float) (0.0f);
a[2 + 4 * i_outer] = (float) (0.0f);
a[3 + 4 * i_outer] = (float) (0.0f);
}
...

Expand Down Expand Up @@ -737,7 +738,7 @@ Let's try this out on our vector fill kernel by creating workgroups of size
__kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *__restrict__ a, int const n)
{
if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0)
a[128 * gid(0) + lid(0)] = 0.0f;
a[128 * gid(0) + lid(0)] = (float) (0.0f);
}

Loopy requires that workgroup sizes are fixed and constant at compile time.
Expand Down Expand Up @@ -782,13 +783,13 @@ assumption:
...
for (int i_outer = 0; i_outer <= -1 + (3 + n) / 4; ++i_outer)
{
a[4 * i_outer] = 0.0f;
a[4 * i_outer] = (float) (0.0f);
if (-2 + -4 * i_outer + n >= 0)
a[1 + 4 * i_outer] = 0.0f;
a[1 + 4 * i_outer] = (float) (0.0f);
if (-3 + -4 * i_outer + n >= 0)
a[2 + 4 * i_outer] = 0.0f;
a[2 + 4 * i_outer] = (float) (0.0f);
if (-4 + -4 * i_outer + n >= 0)
a[3 + 4 * i_outer] = 0.0f;
a[3 + 4 * i_outer] = (float) (0.0f);
}
...

Expand All @@ -812,24 +813,24 @@ enabling some cost savings:
/* bulk slab for 'i_outer' */
for (int i_outer = 0; i_outer <= -2 + (3 + n) / 4; ++i_outer)
{
a[4 * i_outer] = 0.0f;
a[1 + 4 * i_outer] = 0.0f;
a[2 + 4 * i_outer] = 0.0f;
a[3 + 4 * i_outer] = 0.0f;
a[4 * i_outer] = (float) (0.0f);
a[1 + 4 * i_outer] = (float) (0.0f);
a[2 + 4 * i_outer] = (float) (0.0f);
a[3 + 4 * i_outer] = (float) (0.0f);
}
/* final slab for 'i_outer' */
{
int const i_outer = -1 + n + -1 * ((3 * n) / 4);
<BLANKLINE>
if (-1 + n >= 0)
{
a[4 * i_outer] = 0.0f;
a[4 * i_outer] = (float) (0.0f);
if (-2 + -4 * i_outer + n >= 0)
a[1 + 4 * i_outer] = 0.0f;
a[1 + 4 * i_outer] = (float) (0.0f);
if (-3 + -4 * i_outer + n >= 0)
a[2 + 4 * i_outer] = 0.0f;
a[2 + 4 * i_outer] = (float) (0.0f);
if (4 + 4 * i_outer + -1 * n == 0)
a[3 + 4 * i_outer] = 0.0f;
a[3 + 4 * i_outer] = (float) (0.0f);
}
}
...
Expand Down Expand Up @@ -1630,7 +1631,6 @@ together into keys containing only the specified fields:
>>> op_map_dtype = op_map.group_by('dtype')
>>> print(op_map_dtype)
Op(np:dtype('float32'), None, None): ...
<BLANKLINE>
>>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32)
... ].eval_with_dict(param_dict)
>>> print(f32op_count)
Expand All @@ -1656,7 +1656,6 @@ we'll continue using the kernel from the previous example:
>>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32)
>>> print(mem_map)
MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, 'stats_knl'): ...
<BLANKLINE>

Each line of output will look roughly like::

Expand Down Expand Up @@ -1727,13 +1726,11 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
>>> bytes_map = mem_map.to_bytes()
>>> print(bytes_map)
MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, 'stats_knl'): ...
<BLANKLINE>
>>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global']
... ).group_by('direction')
>>> print(global_ld_st_bytes)
MemAccess(None, None, None, None, load, None, None, None, None): ...
MemAccess(None, None, None, None, store, None, None, None, None): ...
<BLANKLINE>
>>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load')
... ].eval_with_dict(param_dict)
>>> stored = global_ld_st_bytes[lp.MemAccess(direction='store')
Expand Down Expand Up @@ -1775,7 +1772,6 @@ this time.
MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem, 'stats_knl'): ...
<BLANKLINE>

With this parallelization, consecutive work-items will access consecutive array
elements in memory. The polynomials are a bit more complicated now due to the
Expand Down Expand Up @@ -1820,7 +1816,6 @@ we'll switch the inner and outer tags in our parallelization of the kernel:
MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem, 'stats_knl'): ...
MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem, 'stats_knl'): ...
<BLANKLINE>

With this parallelization, consecutive work-items will access *nonconsecutive*
array elements in memory. The total number of array accesses still has not
Expand Down Expand Up @@ -1873,7 +1868,6 @@ kernel from the previous example:
>>> sync_map = lp.get_synchronization_map(knl)
>>> print(sync_map)
Sync(kernel_launch, stats_knl): [l, m, n] -> { 1 }
<BLANKLINE>

We can evaluate this polynomial using :meth:`islpy.PwQPolynomial.eval_with_dict`:

Expand Down Expand Up @@ -1934,7 +1928,6 @@ count the barriers using :func:`loopy.get_synchronization_map`:
>>> print(sync_map)
Sync(barrier_local, loopy_kernel): { 1000 }
Sync(kernel_launch, loopy_kernel): { 1 }
<BLANKLINE>

Based on the kernel code printed above, we would expect each work-item to
encounter 50x10x2 barriers, which matches the result from
Expand Down
2 changes: 2 additions & 0 deletions examples/fortran/matmul-driver.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import numpy as np
import numpy.linalg as la

import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom

import loopy as lp


Expand Down
3 changes: 2 additions & 1 deletion examples/python/call-external.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import loopy as lp
import numpy as np

import loopy as lp
from loopy.diagnostic import LoopyError
from loopy.target.c import CTarget
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
Expand Down
2 changes: 2 additions & 0 deletions examples/python/find-centers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import numpy as np

import pyopencl as cl

import loopy as lp
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401


cl_ctx = cl.create_some_context()

knl = lp.make_kernel(
Expand Down
6 changes: 6 additions & 0 deletions examples/python/global_barrier_removal.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import numpy as np

import loopy as lp
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401


knl = lp.make_kernel(
"{ [i,k]: 0<=i<n and 0<=k<3 }",
"""
Expand All @@ -20,9 +22,13 @@

# schedule
from loopy.preprocess import preprocess_kernel


knl = preprocess_kernel(knl)

from loopy.schedule import get_one_linearized_kernel


knl = knl.with_kernel(get_one_linearized_kernel(knl["loopy_kernel"],
knl.callables_table))

Expand Down
5 changes: 4 additions & 1 deletion examples/python/hello-loopy.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import numpy as np
import loopy as lp

import pyopencl as cl
import pyopencl.array

import loopy as lp
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401


# setup
# -----
ctx = cl.create_some_context()
Expand Down
Loading

0 comments on commit 87c1cd8

Please sign in to comment.