From b73cdcfca9e1570f11120c82b4ae31c7b7c0d9b7 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Fri, 15 Nov 2024 08:59:24 +0100
Subject: [PATCH 01/31] Add AUTO option to config, use it for ghost layers

---
 .../backend/kernelcreation/iteration_space.py | 41 ++++++---
 src/pystencils/config.py                      | 51 +++++++----
 src/pystencils/kernelcreation.py              | 87 +++++++++++--------
 src/pystencils/simp/simplificationstrategy.py |  2 +-
 4 files changed, 117 insertions(+), 64 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/iteration_space.py b/src/pystencils/backend/kernelcreation/iteration_space.py
index 05e7153bf..c3c9eaa7a 100644
--- a/src/pystencils/backend/kernelcreation/iteration_space.py
+++ b/src/pystencils/backend/kernelcreation/iteration_space.py
@@ -6,6 +6,7 @@ from functools import reduce
 from operator import mul
 
 from ...defaults import DEFAULTS
+from ...config import _AUTO_TYPE, AUTO
 from ...simp import AssignmentCollection
 from ...field import Field, FieldType
 
@@ -60,6 +61,7 @@ class FullIterationSpace(IterationSpace):
 
     @dataclass
     class Dimension:
+        """One dimension of a dense iteration space"""
         start: PsExpression
         stop: PsExpression
         step: PsExpression
@@ -180,7 +182,7 @@ class FullIterationSpace(IterationSpace):
     def __init__(
         self,
         ctx: KernelCreationContext,
-        dimensions: Sequence[Dimension],
+        dimensions: Sequence[FullIterationSpace.Dimension],
         archetype_field: Field | None = None,
     ):
         super().__init__(tuple(dim.counter for dim in dimensions))
@@ -192,22 +194,27 @@ class FullIterationSpace(IterationSpace):
 
     @property
     def dimensions(self):
+        """The dimensions of this iteration space"""
         return self._dimensions
 
     @property
     def lower(self):
+        """Lower limits of each dimension"""
         return (dim.start for dim in self._dimensions)
 
     @property
     def upper(self):
+        """Upper limits of each dimension"""
         return (dim.stop for dim in self._dimensions)
 
     @property
     def steps(self):
+        """Iteration steps of each dimension"""
         return (dim.step for dim in self._dimensions)
 
     @property
     def archetype_field(self) -> Field | None:
+        """Field whose shape and memory layout act as archetypes for this iteration space's dimensions."""
         return self._archetype_field
     
     @property
@@ -230,6 +237,13 @@ class FullIterationSpace(IterationSpace):
     def actual_iterations(
         self, dimension: int | FullIterationSpace.Dimension | None = None
     ) -> PsExpression:
+        """Construct an expression representing the actual number of unique points inside the iteration space.
+        
+        Args:
+            dimension: If an integer or a `Dimension` object is given, the number of iterations in that
+                dimension is computed. If `None`, the total number of iterations inside the entire space
+                is computed.
+        """
         from .typification import Typifier
         from ..transformations import EliminateConstants
 
@@ -399,7 +413,7 @@ def create_sparse_iteration_space(
 def create_full_iteration_space(
     ctx: KernelCreationContext,
     assignments: AssignmentCollection,
-    ghost_layers: None | int | Sequence[int | tuple[int, int]] = None,
+    ghost_layers: None | _AUTO_TYPE | int | Sequence[int | tuple[int, int]] = None,
     iteration_slice: None | int | slice | tuple[int | slice, ...] = None,
 ) -> IterationSpace:
     assert not ctx.fields.index_fields
@@ -439,16 +453,7 @@ def create_full_iteration_space(
     # Otherwise, if an iteration slice was specified, use that
     # Otherwise, use the inferred ghost layers
 
-    if ghost_layers is not None:
-        ctx.metadata["ghost_layers"] = ghost_layers
-        return FullIterationSpace.create_with_ghost_layers(
-            ctx, ghost_layers, archetype_field
-        )
-    elif iteration_slice is not None:
-        return FullIterationSpace.create_from_slice(
-            ctx, iteration_slice, archetype_field
-        )
-    else:
+    if ghost_layers is AUTO:
         if len(domain_field_accesses) > 0:
             inferred_gls = max(
                 [fa.required_ghost_layers for fa in domain_field_accesses]
@@ -460,3 +465,15 @@ def create_full_iteration_space(
         return FullIterationSpace.create_with_ghost_layers(
             ctx, inferred_gls, archetype_field
         )
+    elif ghost_layers is not None:
+        assert not isinstance(ghost_layers, _AUTO_TYPE)
+        ctx.metadata["ghost_layers"] = ghost_layers
+        return FullIterationSpace.create_with_ghost_layers(
+            ctx, ghost_layers, archetype_field
+        )
+    elif iteration_slice is not None:
+        return FullIterationSpace.create_from_slice(
+            ctx, iteration_slice, archetype_field
+        )
+    else:
+        assert False, "unreachable code"
diff --git a/src/pystencils/config.py b/src/pystencils/config.py
index 9e2af1b7e..c688530ae 100644
--- a/src/pystencils/config.py
+++ b/src/pystencils/config.py
@@ -28,6 +28,14 @@ class PsOptionsError(Exception):
     """Indicates an option clash in the `CreateKernelConfig`."""
 
 
+class _AUTO_TYPE:
+    ...
+
+
+AUTO = _AUTO_TYPE()
+"""Special value that can be passed to some options for invoking automatic behaviour."""
+
+
 @dataclass
 class OpenMpConfig:
     """Parameters controlling kernel parallelization using OpenMP."""
@@ -68,8 +76,8 @@ class CpuOptimConfig:
     openmp: bool | OpenMpConfig = False
     """Enable OpenMP parallelization.
     
-    If set to `True`, the kernel will be parallelized using OpenMP according to the default settings in `OpenMpParams`.
-    To customize OpenMP parallelization, pass an instance of `OpenMpParams` instead.
+    If set to `True`, the kernel will be parallelized using OpenMP according to the default settings in `OpenMpConfig`.
+    To customize OpenMP parallelization, pass an instance of `OpenMpConfig` instead.
     """
 
     vectorize: bool | VectorizationConfig = False
@@ -188,11 +196,11 @@ class GpuIndexingConfig:
     If set to `True`, the kernel is generated for execution via
     `parallel_for <https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#_parallel_for_invoke>`_
     -dispatch using
-    a flat `sycl::range`. In this case, the GPU block size will be inferred by the SYCL runtime.
+    a flat ``sycl::range``. In this case, the GPU block size will be inferred by the SYCL runtime.
 
-    If set to `False`, the kernel will receive an `nd_item` and has to be executed using
+    If set to `False`, the kernel will receive an ``nd_item`` and has to be executed using
     `parallel_for <https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#_parallel_for_invoke>`_
-    with an `nd_range`. This allows manual specification of the block size.
+    with an ``nd_range``. This allows manual specification of the block size.
     """
 
 
@@ -207,38 +215,49 @@ class CreateKernelConfig:
     """Just-in-time compiler used to compile and load the kernel for invocation from the current Python environment.
     
     If left at `None`, a default just-in-time compiler will be inferred from the `target` parameter.
-    To explicitly disable JIT compilation, pass `pystencils.nbackend.jit.no_jit`.
+    To explicitly disable JIT compilation, pass `pystencils.backend.jit.no_jit`.
     """
 
     function_name: str = "kernel"
     """Name of the generated function"""
 
-    ghost_layers: None | int | Sequence[int | tuple[int, int]] = None
+    ghost_layers: None | _AUTO_TYPE | int | Sequence[int | tuple[int, int]] = None
     """Specifies the number of ghost layers of the iteration region.
     
     Options:
-     - `None`: Required ghost layers are inferred from field accesses
+     - :py:data:`AUTO <pystencils.config.AUTO>`: Required ghost layers are inferred from field accesses
      - `int`:  A uniform number of ghost layers in each spatial coordinate is applied
      - ``Sequence[int, tuple[int, int]]``: Ghost layers are specified for each spatial coordinate.
         In each coordinate, a single integer specifies the ghost layers at both the lower and upper iteration limit,
         while a pair of integers specifies the lower and upper ghost layers separately.
 
     When manually specifying ghost layers, it is the user's responsibility to avoid out-of-bounds memory accesses.
-    If ``ghost_layers=None`` is specified, the iteration region may otherwise be set using the `iteration_slice` option.
+
+    .. note::
+        At most one of `ghost_layers`, `iteration_slice`, and `index_field` may be set.
     """
 
-    iteration_slice: None | Sequence[slice] = None
+    iteration_slice: None | int | slice | tuple[int | slice] = None
     """Specifies the kernel's iteration slice.
-    
-    `iteration_slice` may only be set if ``ghost_layers=None``.
-    If it is set, a slice must be specified for each spatial coordinate.
-    TODO: Specification of valid slices and their behaviour
+
+    Example:
+        >>> cfg = CreateKernelConfig(
+        ...     iteration_slice=ps.make_slice[3:14, 2:-2]
+        ... )
+        >>> cfg.iteration_slice
+        (slice(3, 14, None), slice(2, -2, None))
+
+    .. note::
+        At most one of `ghost_layers`, `iteration_slice`, and `index_field` may be set.
     """
 
     index_field: Field | None = None
     """Index field for a sparse kernel.
     
     If this option is set, a sparse kernel with the given field as index field will be generated.
+
+    .. note::
+        At most one of `ghost_layers`, `iteration_slice`, and `index_field` may be set.
     """
 
     """Data Types"""
@@ -288,10 +307,10 @@ class CreateKernelConfig:
     """Deprecated; use `default_dtype` instead"""
 
     cpu_openmp: InitVar[bool | int | None] = None
-    """Deprecated; use `cpu_optim.openmp` instead."""
+    """Deprecated; use `cpu_optim.openmp <CpuOptimConfig.openmp>` instead."""
 
     cpu_vectorize_info: InitVar[dict | None] = None
-    """Deprecated; use `cpu_optim.vectorize` instead."""
+    """Deprecated; use `cpu_optim.vectorize <CpuOptimConfig.vectorize>` instead."""
 
     gpu_indexing_params: InitVar[dict | None] = None
     """Deprecated; use `gpu_indexing` instead."""
diff --git a/src/pystencils/kernelcreation.py b/src/pystencils/kernelcreation.py
index 651a67cf2..548fbc9bb 100644
--- a/src/pystencils/kernelcreation.py
+++ b/src/pystencils/kernelcreation.py
@@ -6,6 +6,7 @@ from .config import (
     CreateKernelConfig,
     OpenMpConfig,
     VectorizationConfig,
+    AUTO
 )
 from .backend import KernelFunction
 from .types import create_numeric_type, PsIntegerType, PsScalarType
@@ -91,49 +92,18 @@ class DefaultKernelCreationDriver:
         self,
         assignments: AssignmentCollection | Sequence[AssignmentBase] | AssignmentBase,
     ):
-        if isinstance(assignments, AssignmentBase):
-            assignments = [assignments]
-
-        if not isinstance(assignments, AssignmentCollection):
-            assignments = AssignmentCollection(assignments)  # type: ignore
-
-        _ = _parse_simplification_hints(assignments)
-
-        analysis = KernelAnalysis(
-            self._ctx,
-            not self._cfg.skip_independence_check,
-            not self._cfg.allow_double_writes,
+        kernel_body = self.parse_kernel_body(
+            assignments
         )
-        analysis(assignments)
-
-        if len(self._ctx.fields.index_fields) > 0 or self._cfg.index_field is not None:
-            ispace = create_sparse_iteration_space(
-                self._ctx, assignments, index_field=self._cfg.index_field
-            )
-        else:
-            ispace = create_full_iteration_space(
-                self._ctx,
-                assignments,
-                ghost_layers=self._cfg.ghost_layers,
-                iteration_slice=self._cfg.iteration_slice,
-            )
-
-        self._ctx.set_iteration_space(ispace)
-
-        freeze = FreezeExpressions(self._ctx)
-        kernel_body = freeze(assignments)
-
-        typify = Typifier(self._ctx)
-        kernel_body = typify(kernel_body)
 
         match self._platform:
             case GenericCpu():
                 kernel_ast = self._platform.materialize_iteration_space(
-                    kernel_body, ispace
+                    kernel_body, self._ctx.get_iteration_space()
                 )
             case GenericGpu():
                 kernel_ast, gpu_threads = self._platform.materialize_iteration_space(
-                    kernel_body, ispace
+                    kernel_body, self._ctx.get_iteration_space()
                 )
 
         #   Fold and extract constants
@@ -179,6 +149,53 @@ class DefaultKernelCreationDriver:
                 self._cfg.get_jit(),
             )
 
+    def parse_kernel_body(
+        self,
+        assignments: AssignmentCollection | Sequence[AssignmentBase] | AssignmentBase,
+    ) -> PsBlock:
+        if isinstance(assignments, AssignmentBase):
+            assignments = [assignments]
+
+        if not isinstance(assignments, AssignmentCollection):
+            assignments = AssignmentCollection(assignments)  # type: ignore
+
+        _ = _parse_simplification_hints(assignments)
+
+        analysis = KernelAnalysis(
+            self._ctx,
+            not self._cfg.skip_independence_check,
+            not self._cfg.allow_double_writes,
+        )
+        analysis(assignments)
+
+        if self._cfg.index_field is not None:
+            ispace = create_sparse_iteration_space(
+                self._ctx, assignments, index_field=self._cfg.index_field
+            )
+        else:
+            gls = self._cfg.ghost_layers
+            islice = self._cfg.iteration_slice
+
+            if gls is None and islice is None:
+                gls = AUTO
+
+            ispace = create_full_iteration_space(
+                self._ctx,
+                assignments,
+                ghost_layers=gls,
+                iteration_slice=islice,
+            )
+
+        self._ctx.set_iteration_space(ispace)
+
+        freeze = FreezeExpressions(self._ctx)
+        kernel_body = freeze(assignments)
+
+        typify = Typifier(self._ctx)
+        kernel_body = typify(kernel_body)
+
+        return kernel_body
+
     def _transform_for_cpu(self, kernel_ast: PsBlock):
         canonicalize = CanonicalizeSymbols(self._ctx, True)
         kernel_ast = cast(PsBlock, canonicalize(kernel_ast))
diff --git a/src/pystencils/simp/simplificationstrategy.py b/src/pystencils/simp/simplificationstrategy.py
index 22ffa34d0..7cba94f8b 100644
--- a/src/pystencils/simp/simplificationstrategy.py
+++ b/src/pystencils/simp/simplificationstrategy.py
@@ -57,7 +57,7 @@ class SimplificationStrategy:
 
             def __str__(self):
                 try:
-                    import tabulate
+                    from tabulate import tabulate
                     return tabulate(self.elements, headers=['Name', 'Runtime', 'Adds', 'Muls', 'Divs', 'Total'])
                 except ImportError:
                     result = "Name, Adds, Muls, Divs, Runtime\n"
-- 
GitLab


From f6769587fe9d3124d6bcd3027694af6da094eaf9 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Fri, 15 Nov 2024 10:41:39 +0100
Subject: [PATCH 02/31] Add dependency checks to GpuThreadsRange creation; add
 option to manually specify launch grid

---
 src/pystencils/backend/jit/gpu_cupy.py        | 52 ++++++----
 .../backend/kernelcreation/iteration_space.py | 16 ++--
 src/pystencils/backend/kernelfunction.py      | 12 ++-
 src/pystencils/backend/platforms/cuda.py      | 96 +++++++++++++++++--
 .../backend/platforms/generic_gpu.py          | 16 +++-
 src/pystencils/config.py                      |  8 ++
 6 files changed, 166 insertions(+), 34 deletions(-)

diff --git a/src/pystencils/backend/jit/gpu_cupy.py b/src/pystencils/backend/jit/gpu_cupy.py
index 563a9c06a..1dd187671 100644
--- a/src/pystencils/backend/jit/gpu_cupy.py
+++ b/src/pystencils/backend/jit/gpu_cupy.py
@@ -41,6 +41,7 @@ class CupyKernelWrapper(KernelWrapper):
         self._kfunc: GpuKernelFunction = kfunc
         self._raw_kernel = raw_kernel
         self._block_size = block_size
+        self._num_blocks: tuple[int, int, int] | None = None
         self._args_cache: dict[Any, tuple] = dict()
 
     @property
@@ -59,6 +60,14 @@ class CupyKernelWrapper(KernelWrapper):
     def block_size(self, bs: tuple[int, int, int]):
         self._block_size = bs
 
+    @property
+    def num_blocks(self) -> tuple[int, int, int] | None:
+        return self._num_blocks
+
+    @num_blocks.setter
+    def num_blocks(self, nb: tuple[int, int, int] | None):
+        self._num_blocks = nb
+
     def __call__(self, **kwargs: Any):
         kernel_args, launch_grid = self._get_cached_args(**kwargs)
         device = self._get_device(kernel_args)
@@ -72,7 +81,7 @@ class CupyKernelWrapper(KernelWrapper):
         return devices.pop()
 
     def _get_cached_args(self, **kwargs):
-        key = (self._block_size,) + tuple((k, id(v)) for k, v in kwargs.items())
+        key = (self._block_size, self._num_blocks) + tuple((k, id(v)) for k, v in kwargs.items())
 
         if key not in self._args_cache:
             args = self._get_args(**kwargs)
@@ -185,25 +194,36 @@ class CupyKernelWrapper(KernelWrapper):
 
         symbolic_threads_range = self._kfunc.threads_range
 
-        threads_range: list[int] = [
-            evaluate_expression(expr, valuation)
-            for expr in symbolic_threads_range.num_work_items
-        ]
+        if self._num_blocks is not None:
+            launch_grid = LaunchGrid(self._num_blocks, self._block_size)
 
-        if symbolic_threads_range.dim < 3:
-            threads_range += [1] * (3 - symbolic_threads_range.dim)
+        elif symbolic_threads_range is not None:
+            threads_range: list[int] = [
+                evaluate_expression(expr, valuation)
+                for expr in symbolic_threads_range.num_work_items
+            ]
 
-        def div_ceil(a, b):
-            return a // b if a % b == 0 else a // b + 1
+            if symbolic_threads_range.dim < 3:
+                threads_range += [1] * (3 - symbolic_threads_range.dim)
 
-        #   TODO: Refine this?
-        grid_size = tuple(
-            div_ceil(threads, tpb)
-            for threads, tpb in zip(threads_range, self._block_size)
-        )
-        assert len(grid_size) == 3
+            def div_ceil(a, b):
+                return a // b if a % b == 0 else a // b + 1
+
+            #   TODO: Refine this?
+            num_blocks = tuple(
+                div_ceil(threads, tpb)
+                for threads, tpb in zip(threads_range, self._block_size)
+            )
+            assert len(num_blocks) == 3
+
+            launch_grid = LaunchGrid(num_blocks, self._block_size)
 
-        launch_grid = LaunchGrid(grid_size, self._block_size)
+        else:
+            raise JitError(
+                "Unable to determine launch grid for GPU kernel invocation: "
+                "No manual grid size was specified, and the number of threads could not "
+                "be determined automatically."
+            )
 
         return tuple(args), launch_grid
 
diff --git a/src/pystencils/backend/kernelcreation/iteration_space.py b/src/pystencils/backend/kernelcreation/iteration_space.py
index c3c9eaa7a..9df9883ce 100644
--- a/src/pystencils/backend/kernelcreation/iteration_space.py
+++ b/src/pystencils/backend/kernelcreation/iteration_space.py
@@ -196,21 +196,25 @@ class FullIterationSpace(IterationSpace):
     def dimensions(self):
         """The dimensions of this iteration space"""
         return self._dimensions
+    
+    @property
+    def counters(self) -> tuple[PsSymbol, ...]:
+        return tuple(dim.counter for dim in self._dimensions)
 
     @property
-    def lower(self):
+    def lower(self) -> tuple[PsExpression, ...]:
         """Lower limits of each dimension"""
-        return (dim.start for dim in self._dimensions)
+        return tuple(dim.start for dim in self._dimensions)
 
     @property
-    def upper(self):
+    def upper(self) -> tuple[PsExpression, ...]:
         """Upper limits of each dimension"""
-        return (dim.stop for dim in self._dimensions)
+        return tuple(dim.stop for dim in self._dimensions)
 
     @property
-    def steps(self):
+    def steps(self) -> tuple[PsExpression, ...]:
         """Iteration steps of each dimension"""
-        return (dim.step for dim in self._dimensions)
+        return tuple(dim.step for dim in self._dimensions)
 
     @property
     def archetype_field(self) -> Field | None:
diff --git a/src/pystencils/backend/kernelfunction.py b/src/pystencils/backend/kernelfunction.py
index 0118c4f40..8f599d57e 100644
--- a/src/pystencils/backend/kernelfunction.py
+++ b/src/pystencils/backend/kernelfunction.py
@@ -261,7 +261,7 @@ class GpuKernelFunction(KernelFunction):
     def __init__(
         self,
         body: PsBlock,
-        threads_range: GpuThreadsRange,
+        threads_range: GpuThreadsRange | None,
         target: Target,
         name: str,
         parameters: Sequence[KernelParameter],
@@ -275,7 +275,7 @@ class GpuKernelFunction(KernelFunction):
         self._threads_range = threads_range
 
     @property
-    def threads_range(self) -> GpuThreadsRange:
+    def threads_range(self) -> GpuThreadsRange | None:
         return self._threads_range
 
 
@@ -283,14 +283,16 @@ def create_gpu_kernel_function(
     ctx: KernelCreationContext,
     platform: Platform,
     body: PsBlock,
-    threads_range: GpuThreadsRange,
+    threads_range: GpuThreadsRange | None,
     function_name: str,
     target_spec: Target,
     jit: JitBase,
 ):
     undef_symbols = collect_undefined_symbols(body)
-    for threads in threads_range.num_work_items:
-        undef_symbols |= collect_undefined_symbols(threads)
+
+    if threads_range is not None:
+        for threads in threads_range.num_work_items:
+            undef_symbols |= collect_undefined_symbols(threads)
 
     params = _get_function_params(ctx, undef_symbols)
     req_headers = _get_headers(ctx, platform, body)
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 323dcc5a9..5c8b5b504 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -1,3 +1,5 @@
+from warnings import warn
+
 from ...types import constify
 from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu, GpuThreadsRange
@@ -7,7 +9,7 @@ from ..kernelcreation import (
     IterationSpace,
     FullIterationSpace,
     SparseIterationSpace,
-    AstFactory
+    AstFactory,
 )
 
 from ..kernelcreation.context import KernelCreationContext
@@ -43,6 +45,7 @@ GRID_DIM = [
 
 
 class CudaPlatform(GenericGpu):
+    """Platform for CUDA-based GPUs."""
 
     def __init__(
         self, ctx: KernelCreationContext, indexing_cfg: GpuIndexingConfig | None = None
@@ -57,7 +60,7 @@ class CudaPlatform(GenericGpu):
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> tuple[PsBlock, GpuThreadsRange | None]:
         if isinstance(ispace, FullIterationSpace):
             return self._prepend_dense_translation(body, ispace)
         elif isinstance(ispace, SparseIterationSpace):
@@ -123,13 +126,25 @@ class CudaPlatform(GenericGpu):
 
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> tuple[PsBlock, GpuThreadsRange | None]:
         dimensions = ispace.dimensions_in_loop_order()
-        launch_config = GpuThreadsRange.from_ispace(ispace)
+
+        if not self._cfg.manual_launch_grid:
+            try:
+                threads_range = GpuThreadsRange.from_ispace(ispace)
+            except MaterializationError as e:
+                warn(
+                    str(e.args[0])
+                    + "\nIf this is intended, set `manual_launch_grid=True` in the code generator configuration.",
+                    UserWarning,
+                )
+                threads_range = None
+        else:
+            threads_range = None
 
         indexing_decls = []
         conds = []
-        for i, dim in enumerate(dimensions[::-1]):
+        for i, dim in enumerate(dimensions):
             dim.counter.dtype = constify(dim.counter.get_dtype())
 
             ctr = PsExpression.make(dim.counter)
@@ -155,7 +170,7 @@ class CudaPlatform(GenericGpu):
             body.statements = indexing_decls + body.statements
             ast = body
 
-        return ast, launch_config
+        return ast, threads_range
 
     def _prepend_sparse_translation(
         self, body: PsBlock, ispace: SparseIterationSpace
@@ -199,3 +214,72 @@ class CudaPlatform(GenericGpu):
         block_idx = BLOCK_IDX[coord]
         thread_idx = THREAD_IDX[coord]
         return block_idx * block_size + thread_idx
+
+
+# class LinearIndexing:
+#     """Linear GPU thread indexing.
+
+#     This indexing scheme maps GPU threads to iteration space points in the following way:
+#     - Starting from the slowest coordinate, each coordinate is mapped to a dimension
+#       of the GPU grid until just one dimension is left
+#     - All remaining dimensions of the iteration space are linearly mapped
+#       onto the fastest launch grid dimension
+#     """
+
+#     def __init__(
+#         self,
+#         ctx: KernelCreationContext,
+#         launch_grid_dimensions: int,
+#         ispace: FullIterationSpace,
+#     ) -> None:
+#         if not (0 < launch_grid_dimensions <= 3):
+#             raise ValueError(
+#                 f"Invalid number of launch grid dimensions: {launch_grid_dimensions}"
+#             )
+
+#         self._ctx = ctx
+
+#         self._grid_dims = launch_grid_dimensions
+#         self._ispace = ispace
+#         self._ispace_dims = len(ispace.dimensions)
+
+#         self._typify = Typifier(ctx)
+
+#     def get_counter_declarations(self) -> Sequence[PsDeclaration]:
+#         num_slower_dimensions = min(self._grid_dims, self._ispace_dims) - 1
+#         num_fast_dimensions = self._ispace_dims - num_slower_dimensions
+
+#         decls = []
+
+#         #   Slower n dimensions
+#         for i in range(num_slower_dimensions, 0, -1):
+#             thread_idx = BLOCK_IDX[i] * BLOCK_DIM[i] + THREAD_IDX[i]
+#             decls.append(self._make_ctr_decl(self._ispace.dimensions[num_fast_dimensions + i], thread_idx))
+
+#         #   Fastest dimensions
+#         thread_idx = BLOCK_IDX[0] * BLOCK_DIM[0] + THREAD_IDX[0]
+
+#         if num_fast_dimensions == 1:
+#             decls.append(self._make_ctr_decl(self._ispace.dimensions[0], thread_idx))
+#         else:
+#             for i in range(num_fast_dimensions, 0, -1):
+#                 decls.append(
+#                     self._make_ctr_decl(
+#                         self._ispace.dimensions[i],
+#                         #   ergh... need actual iterations here...
+#                     )
+#                 )
+
+
+#     def _make_ctr_decl(
+#         self, dim: FullIterationSpace.Dimension, thread_idx: PsExpression
+#     ):
+#         dim.counter.dtype = constify(dim.counter.get_dtype())
+
+#         ctr = PsExpression.make(dim.counter)
+#         return self._typify(
+#             PsDeclaration(
+#                 ctr,
+#                 dim.start + dim.step * PsCast(ctr.get_dtype(), thread_idx),
+#             )
+#         )
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index 774b9405c..f6b888a49 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -10,6 +10,7 @@ from ..kernelcreation.iteration_space import (
     SparseIterationSpace,
 )
 from .platform import Platform
+from ..exceptions import MaterializationError
 
 
 class GpuThreadsRange:
@@ -56,6 +57,19 @@ class GpuThreadsRange:
             raise NotImplementedError(
                 f"Cannot create a GPU threads range for an {len(dimensions)}-dimensional iteration space"
             )
+        
+        from ..ast.analysis import collect_undefined_symbols as collect
+
+        for dim in dimensions:
+            symbs = collect(dim.start) | collect(dim.stop) | collect(dim.step)
+            for ctr in ispace.counters:
+                if ctr in symbs:
+                    raise MaterializationError(
+                        "Unable to construct GPU threads range for iteration space: "
+                        f"Limits of dimension counter {dim.counter.name} "
+                        f"depend on another dimension's counter {ctr.name}"
+                    )
+
         work_items = [ispace.actual_iterations(dim) for dim in dimensions]
         return GpuThreadsRange(work_items)
 
@@ -64,5 +78,5 @@ class GenericGpu(Platform):
     @abstractmethod
     def materialize_iteration_space(
         self, block: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> tuple[PsBlock, GpuThreadsRange | None]:
         pass
diff --git a/src/pystencils/config.py b/src/pystencils/config.py
index c688530ae..c08ddc161 100644
--- a/src/pystencils/config.py
+++ b/src/pystencils/config.py
@@ -190,6 +190,14 @@ class GpuIndexingConfig:
     block_size: tuple[int, int, int] | None = None
     """Desired block size for the execution of GPU kernels. May be overridden later by the runtime system."""
 
+    manual_launch_grid: bool = False
+    """Always require a manually specified launch grid when running this kernel.
+    
+    If set to `True`, the code generator will not attempt to infer the size of
+    the launch grid from the kernel.
+    The launch grid will then have to be specified manually at runtime.
+    """
+
     sycl_automatic_block_size: bool = True
     """If set to `True` while generating for `Target.SYCL`, let the SYCL runtime decide on the block size.
 
-- 
GitLab


From 617a9282e178de64ba6adde1e882c7a89c5c5159 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Fri, 15 Nov 2024 14:20:28 +0100
Subject: [PATCH 03/31] Add test cases for various sliced iterations. Fix
 trailing iters in loop vectorizer when simd loop is never entered. Extract
 default gen configs to fixtures.

---
 .flake8                                       |   2 +-
 conftest.py                                   |   5 +
 src/pystencils/backend/platforms/cuda.py      |  73 +------
 .../transformations/loop_vectorizer.py        |  48 +++--
 tests/fixtures.py                             |  71 +++++++
 tests/kernelcreation/test_domain_kernels.py   |  29 ---
 tests/kernelcreation/test_iteration_slices.py | 190 ++++++++++++++++++
 tests/kernelcreation/test_sliced_iteration.py |  29 ---
 tests/nbackend/test_vectorization.py          |  34 +++-
 9 files changed, 328 insertions(+), 153 deletions(-)
 create mode 100644 tests/fixtures.py
 create mode 100644 tests/kernelcreation/test_iteration_slices.py
 delete mode 100644 tests/kernelcreation/test_sliced_iteration.py

diff --git a/.flake8 b/.flake8
index 3f946922a..05758405a 100644
--- a/.flake8
+++ b/.flake8
@@ -4,4 +4,4 @@ exclude=src/pystencils/jupyter.py,
         src/pystencils/plot.py
         src/pystencils/session.py
         src/pystencils/old
-ignore = W293 W503 W291 C901 E741
+ignore = W293 W503 W291 C901 E741 E704
diff --git a/conftest.py b/conftest.py
index 0c6c49153..4e8e2b73a 100644
--- a/conftest.py
+++ b/conftest.py
@@ -203,3 +203,8 @@ else:
                 return IPyNbFile.from_parent(fspath=path, parent=parent)
             else:
                 return IPyNbFile(path, parent)
+
+
+#   Fixtures
+
+from tests.fixtures import *
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 5c8b5b504..dbade47d1 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -144,7 +144,7 @@ class CudaPlatform(GenericGpu):
 
         indexing_decls = []
         conds = []
-        for i, dim in enumerate(dimensions):
+        for i, dim in enumerate(dimensions[::-1]):
             dim.counter.dtype = constify(dim.counter.get_dtype())
 
             ctr = PsExpression.make(dim.counter)
@@ -161,6 +161,8 @@ class CudaPlatform(GenericGpu):
             if not self._cfg.omit_range_check:
                 conds.append(PsLt(ctr, dim.stop))
 
+        indexing_decls = indexing_decls[::-1]
+
         if conds:
             condition: PsExpression = conds[0]
             for cond in conds[1:]:
@@ -214,72 +216,3 @@ class CudaPlatform(GenericGpu):
         block_idx = BLOCK_IDX[coord]
         thread_idx = THREAD_IDX[coord]
         return block_idx * block_size + thread_idx
-
-
-# class LinearIndexing:
-#     """Linear GPU thread indexing.
-
-#     This indexing scheme maps GPU threads to iteration space points in the following way:
-#     - Starting from the slowest coordinate, each coordinate is mapped to a dimension
-#       of the GPU grid until just one dimension is left
-#     - All remaining dimensions of the iteration space are linearly mapped
-#       onto the fastest launch grid dimension
-#     """
-
-#     def __init__(
-#         self,
-#         ctx: KernelCreationContext,
-#         launch_grid_dimensions: int,
-#         ispace: FullIterationSpace,
-#     ) -> None:
-#         if not (0 < launch_grid_dimensions <= 3):
-#             raise ValueError(
-#                 f"Invalid number of launch grid dimensions: {launch_grid_dimensions}"
-#             )
-
-#         self._ctx = ctx
-
-#         self._grid_dims = launch_grid_dimensions
-#         self._ispace = ispace
-#         self._ispace_dims = len(ispace.dimensions)
-
-#         self._typify = Typifier(ctx)
-
-#     def get_counter_declarations(self) -> Sequence[PsDeclaration]:
-#         num_slower_dimensions = min(self._grid_dims, self._ispace_dims) - 1
-#         num_fast_dimensions = self._ispace_dims - num_slower_dimensions
-
-#         decls = []
-
-#         #   Slower n dimensions
-#         for i in range(num_slower_dimensions, 0, -1):
-#             thread_idx = BLOCK_IDX[i] * BLOCK_DIM[i] + THREAD_IDX[i]
-#             decls.append(self._make_ctr_decl(self._ispace.dimensions[num_fast_dimensions + i], thread_idx))
-
-#         #   Fastest dimensions
-#         thread_idx = BLOCK_IDX[0] * BLOCK_DIM[0] + THREAD_IDX[0]
-
-#         if num_fast_dimensions == 1:
-#             decls.append(self._make_ctr_decl(self._ispace.dimensions[0], thread_idx))
-#         else:
-#             for i in range(num_fast_dimensions, 0, -1):
-#                 decls.append(
-#                     self._make_ctr_decl(
-#                         self._ispace.dimensions[i],
-#                         #   ergh... need actual iterations here...
-#                     )
-#                 )
-
-
-#     def _make_ctr_decl(
-#         self, dim: FullIterationSpace.Dimension, thread_idx: PsExpression
-#     ):
-#         dim.counter.dtype = constify(dim.counter.get_dtype())
-
-#         ctr = PsExpression.make(dim.counter)
-#         return self._typify(
-#             PsDeclaration(
-#                 ctr,
-#                 dim.start + dim.step * PsCast(ctr.get_dtype(), thread_idx),
-#             )
-#         )
diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py
index c89698193..e1e4fea50 100644
--- a/src/pystencils/backend/transformations/loop_vectorizer.py
+++ b/src/pystencils/backend/transformations/loop_vectorizer.py
@@ -8,7 +8,7 @@ from ..kernelcreation import KernelCreationContext
 from ..constants import PsConstant
 from ..ast import PsAstNode
 from ..ast.structural import PsLoop, PsBlock, PsDeclaration
-from ..ast.expressions import PsExpression
+from ..ast.expressions import PsExpression, PsTernary, PsGt
 from ..ast.vector import PsVecBroadcast
 from ..ast.analysis import collect_undefined_symbols
 
@@ -18,7 +18,7 @@ from .rewrite import substitute_symbols
 
 class LoopVectorizer:
     """Vectorize loops.
-    
+
     The loop vectorizer provides methods to vectorize single loops inside an AST
     using a given number of vector lanes.
     During vectorization, the loop body is transformed using the `AstVectorizer`,
@@ -64,29 +64,26 @@ class LoopVectorizer:
     @overload
     def vectorize_select_loops(
         self, node: PsBlock, predicate: Callable[[PsLoop], bool]
-    ) -> PsBlock:
-        ...
+    ) -> PsBlock: ...
 
     @overload
     def vectorize_select_loops(
         self, node: PsLoop, predicate: Callable[[PsLoop], bool]
-    ) -> PsLoop | PsBlock:
-        ...
+    ) -> PsLoop | PsBlock: ...
 
     @overload
     def vectorize_select_loops(
         self, node: PsAstNode, predicate: Callable[[PsLoop], bool]
-    ) -> PsAstNode:
-        ...
+    ) -> PsAstNode: ...
 
     def vectorize_select_loops(
         self, node: PsAstNode, predicate: Callable[[PsLoop], bool]
     ) -> PsAstNode:
         """Select and vectorize loops from a syntax tree according to a predicate.
-        
+
         Finds each loop inside a subtree and evaluates ``predicate`` on them.
         If ``predicate(loop)`` evaluates to `True`, the loop is vectorized.
-        
+
         Loops nested inside a vectorized loop will not be processed.
 
         Args:
@@ -139,7 +136,7 @@ class LoopVectorizer:
 
         #   Generate vectorized loop body
         simd_body = self._vectorize_ast(loop.body, vc)
-        
+
         if vector_ctr in collect_undefined_symbols(simd_body):
             simd_body.statements.insert(0, vector_counter_decl)
 
@@ -186,20 +183,31 @@ class LoopVectorizer:
                 trailing_start = self._ctx.get_new_symbol(
                     f"__{scalar_ctr.name}_trailing_start", scalar_ctr.get_dtype()
                 )
+
                 trailing_start_decl = self._type_fold(
                     PsDeclaration(
                         PsExpression.make(trailing_start),
-                        (
+                        PsTernary(
+                            #   If at least one vectorized iteration took place...
+                            PsGt(
+                                PsExpression.make(simd_stop),
+                                simd_start.clone(),
+                            ),
+                            #   start from the smallest non-valid multiple of simd_step, offset from simd_start
                             (
-                                PsExpression.make(simd_stop)
-                                - simd_start.clone()
-                                - PsExpression.make(PsConstant(1))
+                                (
+                                    PsExpression.make(simd_stop)
+                                    - simd_start.clone()
+                                    - PsExpression.make(PsConstant(1))
+                                )
+                                / PsExpression.make(simd_step)
+                                + PsExpression.make(PsConstant(1))
                             )
-                            / PsExpression.make(simd_step)
-                            + PsExpression.make(PsConstant(1))
-                        )
-                        * PsExpression.make(simd_step)
-                        + simd_start.clone(),
+                            * PsExpression.make(simd_step)
+                            + simd_start.clone(),
+                            #   otherwise start at simd_start
+                            simd_start.clone(),
+                        ),
                     )
                 )
 
diff --git a/tests/fixtures.py b/tests/fixtures.py
new file mode 100644
index 000000000..7c9521614
--- /dev/null
+++ b/tests/fixtures.py
@@ -0,0 +1,71 @@
+"""Fixtures for the pystencils test suite
+
+This module provides a number of fixtures used by the pystencils test suite.
+Use these fixtures wherever applicable to extend the code surface area covered
+by your tests:
+
+- All tests that should work for every target should use the `target` fixture
+- All tests that should work with the highest optimization level for every target
+  should use the `gen_config` fixture
+- Use the `xp` fixture to access the correct array module (numpy or cupy) depending
+  on the target
+
+"""
+
+import pytest
+
+from types import ModuleType
+from dataclasses import replace
+
+import pystencils as ps
+
+AVAILABLE_TARGETS = [ps.Target.GenericCPU]
+
+try:
+    import cupy
+
+    AVAILABLE_TARGETS += [ps.Target.CUDA]
+except ImportError:
+    pass
+
+AVAILABLE_TARGETS += ps.Target.available_vector_cpu_targets()
+TARGET_IDS = [t.name for t in AVAILABLE_TARGETS]
+
+@pytest.fixture(params=AVAILABLE_TARGETS, ids=TARGET_IDS)
+def target(request) -> ps.Target:
+    """Provides all code generation targets available on the current hardware"""
+    return request.param
+
+@pytest.fixture
+def gen_config(target: ps.Target):
+    """Default codegen configuration for the current target.
+    
+    For vector-CPU targets, enable vectorization with `assume_inner_stride_one=True`.
+    All other targets (including GPU targets) use the plain `CreateKernelConfig` defaults.
+    """
+
+    gen_config = ps.CreateKernelConfig(target=target)
+
+    if target.is_vector_cpu():
+        gen_config = replace(
+            gen_config,
+            cpu_optim=ps.CpuOptimConfig(
+                vectorize=ps.VectorizationConfig(assume_inner_stride_one=True)
+            ),
+        )
+
+    return gen_config
+
+@pytest.fixture()
+def xp(target: ps.Target) -> ModuleType:
+    """Primary array module for the current target.
+    
+    Returns:
+        `cupy` if `target == Target.CUDA`, and `numpy` otherwise
+    """
+    if target == ps.Target.CUDA:
+        import cupy as xp
+        return xp
+    else:
+        import numpy as np
+        return np
diff --git a/tests/kernelcreation/test_domain_kernels.py b/tests/kernelcreation/test_domain_kernels.py
index b9cebb354..d02bfd8e4 100644
--- a/tests/kernelcreation/test_domain_kernels.py
+++ b/tests/kernelcreation/test_domain_kernels.py
@@ -18,35 +18,6 @@ from pystencils.assignment import assignment_from_stencil
 from pystencils.kernelcreation import create_kernel, KernelFunction
 from pystencils.backend.emission import emit_code
 
-AVAILABLE_TARGETS = [Target.GenericCPU]
-
-try:
-    import cupy
-
-    AVAILABLE_TARGETS += [Target.CUDA]
-except ImportError:
-    pass
-
-AVAILABLE_TARGETS += Target.available_vector_cpu_targets()
-TEST_IDS = [t.name for t in AVAILABLE_TARGETS]
-
-
-@pytest.fixture(params=AVAILABLE_TARGETS, ids=TEST_IDS)
-def gen_config(request):
-    target: Target = request.param
-
-    gen_config = CreateKernelConfig(target=target)
-
-    if Target._VECTOR in target:
-        gen_config = replace(
-            gen_config,
-            cpu_optim=CpuOptimConfig(
-                vectorize=VectorizationConfig(assume_inner_stride_one=True)
-            ),
-        )
-
-    return gen_config
-
 
 def inspect_dp_kernel(kernel: KernelFunction, gen_config: CreateKernelConfig):
     code = emit_code(kernel)
diff --git a/tests/kernelcreation/test_iteration_slices.py b/tests/kernelcreation/test_iteration_slices.py
new file mode 100644
index 000000000..94ed02954
--- /dev/null
+++ b/tests/kernelcreation/test_iteration_slices.py
@@ -0,0 +1,190 @@
+import numpy as np
+import sympy as sp
+import pytest
+
+from dataclasses import replace
+
+from pystencils import (
+    DEFAULTS,
+    Assignment,
+    Field,
+    TypedSymbol,
+    create_kernel,
+    make_slice,
+    Target,
+    CreateKernelConfig,
+    GpuIndexingConfig,
+    DynamicType,
+)
+from pystencils.sympyextensions.integer_functions import int_rem
+from pystencils.simp import sympy_cse_on_assignment_list
+from pystencils.slicing import normalize_slice
+from pystencils.backend.jit.gpu_cupy import CupyKernelWrapper
+
+
+def test_sliced_iteration():
+    size = (4, 4)
+    src_arr = np.ones(size)
+    dst_arr = np.zeros_like(src_arr)
+    src_field = Field.create_from_numpy_array("src", src_arr)
+    dst_field = Field.create_from_numpy_array("dst", dst_arr)
+
+    a, b = sp.symbols("a b")
+    update_rule = Assignment(
+        dst_field[0, 0],
+        (
+            a * src_field[0, 1]
+            + a * src_field[0, -1]
+            + b * src_field[1, 0]
+            + b * src_field[-1, 0]
+        )
+        / 4,
+    )
+
+    x_end = TypedSymbol("x_end", "int")
+    s = make_slice[1:x_end, 1]
+    x_end_value = size[1] - 1
+    kernel = create_kernel(
+        sympy_cse_on_assignment_list([update_rule]), iteration_slice=s
+    ).compile()
+
+    kernel(src=src_arr, dst=dst_arr, a=1.0, b=1.0, x_end=x_end_value)
+
+    expected_result = np.zeros(size)
+    expected_result[1:x_end_value, 1] = 1
+    np.testing.assert_almost_equal(expected_result, dst_arr)
+
+
+@pytest.mark.parametrize(
+    "islice",
+    [
+        make_slice[1:-1, 1:-1],
+        make_slice[3, 2:-2],
+        make_slice[2:-2:2, ::3],
+        make_slice[10:, :-5:2],
+    ],
+)
+def test_numerical_slices(gen_config: CreateKernelConfig, xp, islice):
+    shape = (16, 16)
+
+    f_arr = xp.zeros(shape)
+    expected = xp.zeros_like(f_arr)
+    expected[islice] = 1.0
+
+    f = Field.create_from_numpy_array("f", f_arr)
+
+    update = Assignment(f.center(), 1)
+    gen_config = replace(gen_config, iteration_slice=islice)
+
+    try:
+        kernel = create_kernel(update, gen_config).compile()
+    except NotImplementedError:
+        if not gen_config.target.is_vector_cpu():
+            raise  # only vector CPUs may fail here (TODO: Gather/Scatter)
+        pytest.xfail("Gather/Scatter not available yet")
+
+    kernel(f=f_arr)
+
+    xp.testing.assert_array_equal(f_arr, expected)
+
+
+def test_symbolic_slice(gen_config: CreateKernelConfig, xp):
+    shape = (16, 16)
+
+    sx, sy, ex, ey = [
+        TypedSymbol(n, DynamicType.INDEX_TYPE) for n in ("sx", "sy", "ex", "ey")
+    ]
+
+    f_arr = xp.zeros(shape)
+
+    f = Field.create_from_numpy_array("f", f_arr)
+
+    update = Assignment(f.center(), 1)
+    islice = make_slice[sy:ey, sx:ex]
+    gen_config = replace(gen_config, iteration_slice=islice)
+    kernel = create_kernel(update, gen_config).compile()
+
+    for slic in [make_slice[:, :], make_slice[1:-1, 2:-2], make_slice[8:14, 7:11]]:
+        slic = normalize_slice(slic, shape)
+        expected = xp.zeros_like(f_arr)
+        expected[slic] = 1.0
+
+        f_arr[:] = 0.0
+
+        kernel(
+            f=f_arr,
+            sy=slic[0].start,
+            ey=slic[0].stop,
+            sx=slic[1].start,
+            ex=slic[1].stop,
+        )
+
+        xp.testing.assert_array_equal(f_arr, expected)
+
+
+def test_triangle_pattern(gen_config: CreateKernelConfig, xp):
+    shape = (16, 16)
+
+    f_arr = xp.zeros(shape)
+    f = Field.create_from_numpy_array("f", f_arr)
+
+    expected = xp.zeros_like(f_arr)
+    for r in range(shape[0]):
+        expected[r, r:] = 1.0
+
+    update = Assignment(f.center(), 1)
+    outer_counter = DEFAULTS.spatial_counters[0]
+    islice = make_slice[:, outer_counter:]
+    gen_config = replace(gen_config, iteration_slice=islice)
+
+    if gen_config.target == Target.CUDA:
+        gen_config = replace(
+            gen_config, gpu_indexing=GpuIndexingConfig(manual_launch_grid=True)
+        )
+
+    kernel = create_kernel(update, gen_config).compile()
+
+    if isinstance(kernel, CupyKernelWrapper):
+        kernel.block_size = shape + (1,)
+        kernel.num_blocks = (1, 1, 1)
+
+    kernel(f=f_arr)
+
+    xp.testing.assert_array_equal(f_arr, expected)
+
+
+def test_red_black_pattern(gen_config: CreateKernelConfig, xp):
+    shape = (16, 16)
+
+    f_arr = xp.zeros(shape)
+    f = Field.create_from_numpy_array("f", f_arr)
+
+    expected = xp.zeros_like(f_arr)
+    for r in range(shape[0]):
+        expected[r, (r % 2)::2] = 1.0  # even rows start at column 0, odd rows at 1
+
+    update = Assignment(f.center(), 1)
+    outer_counter = DEFAULTS.spatial_counters[0]
+    start = sp.Piecewise((0, sp.Eq(int_rem(outer_counter, 2), 0)), (1, True))
+    islice = make_slice[:, start::2]
+    gen_config = replace(gen_config, iteration_slice=islice)
+
+    if gen_config.target == Target.CUDA:
+        gen_config = replace(
+            gen_config, gpu_indexing=GpuIndexingConfig(manual_launch_grid=True)
+        )
+
+    try:
+        kernel = create_kernel(update, gen_config).compile()
+    except NotImplementedError:
+        if not gen_config.target.is_vector_cpu():
+            raise  # only the vector-CPU Gather/Scatter gap is an expected failure
+        pytest.xfail("Gather/Scatter not implemented yet")
+
+    if isinstance(kernel, CupyKernelWrapper):
+        kernel.block_size = (8, 16, 1)
+        kernel.num_blocks = (1, 1, 1)
+
+    kernel(f=f_arr)
+
+    xp.testing.assert_array_equal(f_arr, expected)
diff --git a/tests/kernelcreation/test_sliced_iteration.py b/tests/kernelcreation/test_sliced_iteration.py
deleted file mode 100644
index 5eff0a89d..000000000
--- a/tests/kernelcreation/test_sliced_iteration.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import numpy as np
-import sympy as sp
-
-from pystencils import Assignment, Field, TypedSymbol, create_kernel, make_slice
-from pystencils.simp import sympy_cse_on_assignment_list
-
-
-def test_sliced_iteration():
-    size = (4, 4)
-    src_arr = np.ones(size)
-    dst_arr = np.zeros_like(src_arr)
-    src_field = Field.create_from_numpy_array('src', src_arr)
-    dst_field = Field.create_from_numpy_array('dst', dst_arr)
-
-    a, b = sp.symbols("a b")
-    update_rule = Assignment(dst_field[0, 0],
-                             (a * src_field[0, 1] + a * src_field[0, -1] +
-                              b * src_field[1, 0] + b * src_field[-1, 0]) / 4)
-
-    x_end = TypedSymbol("x_end", "int")
-    s = make_slice[1:x_end, 1]
-    x_end_value = size[1] - 1
-    kernel = create_kernel(sympy_cse_on_assignment_list([update_rule]), iteration_slice=s).compile()
-
-    kernel(src=src_arr, dst=dst_arr, a=1.0, b=1.0, x_end=x_end_value)
-
-    expected_result = np.zeros(size)
-    expected_result[1:x_end_value, 1] = 1
-    np.testing.assert_almost_equal(expected_result, dst_arr)
diff --git a/tests/nbackend/test_vectorization.py b/tests/nbackend/test_vectorization.py
index aeb0ebaa0..55330c9ee 100644
--- a/tests/nbackend/test_vectorization.py
+++ b/tests/nbackend/test_vectorization.py
@@ -89,6 +89,11 @@ TEST_SETUPS: list[VectorTestSetup] = list(
 TEST_IDS = [t.name for t in TEST_SETUPS]
 
 
+@pytest.fixture(params=TEST_SETUPS, ids=TEST_IDS)
+def vectorization_setup(request) -> VectorTestSetup:
+    return request.param
+
+
 def create_vector_kernel(
     assignments: list[Assignment],
     field: Field,
@@ -139,9 +144,9 @@ def create_vector_kernel(
     return kernel
 
 
-@pytest.mark.parametrize("setup", TEST_SETUPS, ids=TEST_IDS)
 @pytest.mark.parametrize("ghost_layers", [0, 2])
-def test_update_kernel(setup: VectorTestSetup, ghost_layers: int):
+def test_update_kernel(vectorization_setup: VectorTestSetup, ghost_layers: int):
+    setup = vectorization_setup
     src, dst = fields(f"src(2), dst(4): {setup.numeric_dtype}[2D]", layout="fzyx")
 
     x = sp.symbols("x_:4")
@@ -197,8 +202,8 @@ def test_update_kernel(setup: VectorTestSetup, ghost_layers: int):
             np.testing.assert_equal(dst_arr[:, -i, :], 0.0)
 
 
-@pytest.mark.parametrize("setup", TEST_SETUPS, ids=TEST_IDS)
-def test_trailing_iterations(setup: VectorTestSetup):
+def test_trailing_iterations(vectorization_setup: VectorTestSetup):
+    setup = vectorization_setup
     f = fields(f"f(1): {setup.numeric_dtype}[1D]", layout="fzyx")
 
     update = [Assignment(f(0), 2 * f(0))]
@@ -216,3 +221,24 @@ def test_trailing_iterations(setup: VectorTestSetup):
         kernel(f=f_arr)
 
         np.testing.assert_equal(f_arr, 2.0)
+
+
+def test_only_trailing_iterations(vectorization_setup: VectorTestSetup):
+    setup = vectorization_setup
+    f = fields(f"f(1): {setup.numeric_dtype}[1D]", layout="fzyx")
+
+    update = [Assignment(f(0), 2 * f(0))]
+
+    kernel = create_vector_kernel(update, f, setup)
+
+    for trailing_iters in range(1, setup.lanes):
+        shape = (trailing_iters, 1)
+        f_arr = create_numpy_array_with_layout(
+            shape, layout=(1, 0), dtype=setup.numeric_dtype.numpy_dtype
+        )
+
+        f_arr[:] = 1.0
+
+        kernel(f=f_arr)
+
+        np.testing.assert_equal(f_arr, 2.0)
-- 
GitLab


From 63f396e7768d985bcd1c78cfd48220d1239e72f5 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 18 Nov 2024 12:49:51 +0100
Subject: [PATCH 04/31] Minor bugfixes:

 - allow `np.integer` args to float-type constant creation
 - fix target checking in datahandling
 - re-enable OpenMP for boundary handling kernels
---
 src/pystencils/boundaries/boundaryhandling.py      | 2 +-
 src/pystencils/datahandling/serial_datahandling.py | 7 +++++--
 src/pystencils/types/types.py                      | 2 +-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/pystencils/boundaries/boundaryhandling.py b/src/pystencils/boundaries/boundaryhandling.py
index 5c0869c29..fe8dd7d00 100644
--- a/src/pystencils/boundaries/boundaryhandling.py
+++ b/src/pystencils/boundaries/boundaryhandling.py
@@ -314,7 +314,7 @@ class BoundaryHandling:
 
     def _create_boundary_kernel(self, symbolic_field, symbolic_index_field, boundary_obj):
         return create_boundary_kernel(symbolic_field, symbolic_index_field, self.stencil, boundary_obj,
-                                      target=self._target,)  # cpu_openmp=self._openmp) TODO: replace
+                                      target=self._target, cpu_openmp=self._openmp)
 
     def _create_index_fields(self):
         dh = self._data_handling
diff --git a/src/pystencils/datahandling/serial_datahandling.py b/src/pystencils/datahandling/serial_datahandling.py
index 8521dda10..6a5ce5730 100644
--- a/src/pystencils/datahandling/serial_datahandling.py
+++ b/src/pystencils/datahandling/serial_datahandling.py
@@ -291,7 +291,10 @@ class SerialDataHandling(DataHandling):
     def synchronization_function(self, names, stencil=None, target=None, functor=None, **_):
         if target is None:
             target = self.default_target
-        assert target in (Target.CPU, Target.GPU)
+            
+        if not (target.is_cpu() or target == Target.CUDA):
+            raise ValueError(f"Unsupported target: {target}")
+
         if not hasattr(names, '__len__') or type(names) is str:
             names = [names]
 
@@ -325,7 +328,7 @@ class SerialDataHandling(DataHandling):
                 values_per_cell = values_per_cell[0]
 
             if len(filtered_stencil) > 0:
-                if target == Target.CPU:
+                if target.is_cpu():
                     if functor is None:
                         from pystencils.slicing import get_periodic_boundary_functor
                         functor = get_periodic_boundary_functor
diff --git a/src/pystencils/types/types.py b/src/pystencils/types/types.py
index ae751992d..7645a452f 100644
--- a/src/pystencils/types/types.py
+++ b/src/pystencils/types/types.py
@@ -683,7 +683,7 @@ class PsIeeeFloatType(PsScalarType):
     def create_constant(self, value: Any) -> Any:
         np_type = self.NUMPY_TYPES[self._width]
 
-        if isinstance(value, (int, float, np.floating)):
+        if isinstance(value, (int, float, np.integer, np.floating)):
             finfo = np.finfo(np_type)  # type: ignore
             if value < finfo.min or value > finfo.max:
                 raise PsTypeError(
-- 
GitLab


From a86d9e094e9e83f0cf283ef01ba29221c0ff05d4 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 18 Nov 2024 13:36:40 +0100
Subject: [PATCH 05/31] Fix: Parsing of negative integer slices

---
 .../backend/kernelcreation/ast_factory.py     | 33 +++++++++++--------
 .../kernelcreation/test_iteration_space.py    | 30 +++++++++++++++++
 2 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/ast_factory.py b/src/pystencils/backend/kernelcreation/ast_factory.py
index 2462e5e66..5a7084457 100644
--- a/src/pystencils/backend/kernelcreation/ast_factory.py
+++ b/src/pystencils/backend/kernelcreation/ast_factory.py
@@ -138,6 +138,13 @@ class AstFactory:
                 self._typify(self.parse_index(iter_slice) + self.parse_index(1))
             )
             step = self.parse_index(1)
+
+            if normalize_to is not None:
+                upper_limit = self.parse_index(normalize_to)
+                if isinstance(start, PsConstantExpr) and start.constant.value < 0:
+                    start = fold(self._typify(upper_limit.clone() + start))
+                    stop = fold(self._typify(upper_limit.clone() + stop))
+
         else:
             start = self._parse_any_index(
                 iter_slice.start if iter_slice.start is not None else 0
@@ -156,21 +163,21 @@ class AstFactory:
                     f"Invalid value for `slice.step`: {step.constant.value}"
                 )
 
-        if normalize_to is not None:
-            upper_limit = self.parse_index(normalize_to)
-            if isinstance(start, PsConstantExpr) and start.constant.value < 0:
-                start = fold(self._typify(upper_limit.clone() + start))
+            if normalize_to is not None:
+                upper_limit = self.parse_index(normalize_to)
+                if isinstance(start, PsConstantExpr) and start.constant.value < 0:
+                    start = fold(self._typify(upper_limit.clone() + start))
 
-            if stop is None:
-                stop = upper_limit
-            elif isinstance(stop, PsConstantExpr) and stop.constant.value < 0:
-                stop = fold(self._typify(upper_limit.clone() + stop))
+                if stop is None:
+                    stop = upper_limit
+                elif isinstance(stop, PsConstantExpr) and stop.constant.value < 0:
+                    stop = fold(self._typify(upper_limit.clone() + stop))
+
+            elif stop is None:
+                raise ValueError(
+                    "Cannot parse a slice with `stop == None` if no normalization limit is given"
+                )
 
-        elif stop is None:
-            raise ValueError(
-                "Cannot parse a slice with `stop == None` if no normalization limit is given"
-            )
-        
         assert stop is not None  # for mypy
 
         return start, stop, step
diff --git a/tests/nbackend/kernelcreation/test_iteration_space.py b/tests/nbackend/kernelcreation/test_iteration_space.py
index 5d56abd2b..abc1c9820 100644
--- a/tests/nbackend/kernelcreation/test_iteration_space.py
+++ b/tests/nbackend/kernelcreation/test_iteration_space.py
@@ -129,6 +129,36 @@ def test_slices_with_negative_start():
     )
 
 
+def test_negative_singular_slices():
+    ctx = KernelCreationContext()
+    factory = AstFactory(ctx)
+
+    archetype_field = Field.create_generic("f", spatial_dimensions=2, layout="fzyx")
+    ctx.add_field(archetype_field)
+    archetype_arr = ctx.get_buffer(archetype_field)
+
+    islice = (-2, -1)
+    ispace = FullIterationSpace.create_from_slice(ctx, islice, archetype_field)
+
+    dims = ispace.dimensions
+
+    assert dims[0].start.structurally_equal(
+        PsExpression.make(archetype_arr.shape[0]) + factory.parse_index(-2)
+    )
+
+    assert dims[0].stop.structurally_equal(
+        PsExpression.make(archetype_arr.shape[0]) + factory.parse_index(-1)
+    )
+
+    assert dims[1].start.structurally_equal(
+        PsExpression.make(archetype_arr.shape[1]) + factory.parse_index(-1)
+    )
+
+    assert dims[1].stop.structurally_equal(
+        PsExpression.make(archetype_arr.shape[1])
+    )
+
+
 def test_field_independent_slices():
     ctx = KernelCreationContext()
 
-- 
GitLab


From 773c2997a6ea5629234cda9f85fce91afeb0285c Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 18 Nov 2024 13:53:20 +0100
Subject: [PATCH 06/31] add new doc dependencies to pyproject.toml

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 4c629cc7a..5b72d5ef2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,6 +52,7 @@ doc = [
     'sphinx_autodoc_typehints',
     'pandoc',
     'sphinx_design',
+    'myst-nb'
 ]
 tests = [
     'pytest',
-- 
GitLab


From f8dc1882b6dc75288527ba3e58da1082c74746ac Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Thu, 14 Nov 2024 13:55:40 +0100
Subject: [PATCH 07/31] Introduce Myst-NB as a better-than-replacement for
 nbsphinx. WIP kernelcreation reference guide.

---
 docs/source/conf.py                           |  11 +-
 docs/source/reference/kernelcreation.md       | 424 ++++++++++++++++++
 ...nelcreation.rst => kernelcreation.rst.old} |   0
 pyproject.toml                                |   2 +-
 4 files changed, 435 insertions(+), 2 deletions(-)
 create mode 100644 docs/source/reference/kernelcreation.md
 rename docs/source/reference/{kernelcreation.rst => kernelcreation.rst.old} (100%)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 9a4592e2e..0e6fe25db 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -33,9 +33,9 @@ extensions = [
     "sphinx.ext.mathjax",
     "sphinx.ext.napoleon",
     "sphinx.ext.inheritance_diagram",
-    "nbsphinx",
     "sphinxcontrib.bibtex",
     "sphinx_autodoc_typehints",
+    "myst_nb",
 ]
 
 templates_path = ["_templates"]
@@ -57,6 +57,15 @@ inheritance_graph_attrs = {
     "bgcolor": "white",
 }
 
+# -- Options for MyST / MyST-NB ----------------------------------------------
+
+nb_execution_mode = "off" # do not execute notebooks by default
+
+myst_enable_extensions = [
+    "dollarmath",
+    "colon_fence",
+]
+
 # -- Options for HTML output -------------------------------------------------
 
 html_theme = "furo"
diff --git a/docs/source/reference/kernelcreation.md b/docs/source/reference/kernelcreation.md
new file mode 100644
index 000000000..0beeedfd2
--- /dev/null
+++ b/docs/source/reference/kernelcreation.md
@@ -0,0 +1,424 @@
+---
+jupytext:
+  formats: md:myst
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+    jupytext_version: 1.16.4
+kernelspec:
+  display_name: Python 3 (ipykernel)
+  language: python
+  name: python3
+mystnb:
+  execution_mode: cache
+---
+
+```{code-cell} ipython3
+:tags: [remove-cell]
+
+import sympy as sp
+import pystencils as ps
+import numpy as np
+import matplotlib.pyplot as plt
+```
+
+# Kernel Creation
+
+Once a kernel's assignments are fully assembled, they need to be passed through pystencils' code
+generation engine in order to produce the kernel's executable code.
+The code generation engine can be invoked through the `create_kernel` function,
+which takes two arguments: The list of assignments that make up the kernel
+(optionally wrapped as an ``AssignmentCollection``),
+and a configuration object.
+The configuration object, an instance of {any}`CreateKernelConfig <pystencils.config.CreateKernelConfig>`, allows you to set a
+wide variety of options that influence the code generator.
+
+## Code Generator API
+
+### Invocation
+
+```{eval-rst}
+.. module:: pystencils.kernelcreation
+
+.. autosummary::
+  :toctree: autoapi
+  :nosignatures:
+
+  create_kernel
+```
+
+### Configuration
+
+```{eval-rst}
+
+.. module:: pystencils.config
+
+.. autosummary::
+  :toctree: autoapi
+  :nosignatures:
+  :template: autosummary/entire_class.rst
+
+  CreateKernelConfig
+  CpuOptimConfig
+  OpenMpConfig
+  VectorizationConfig
+  GpuIndexingConfig
+
+.. autosummary::
+  :toctree: autoapi
+  :nosignatures:
+
+  AUTO
+```
+
+## Target Specification
+
+Pystencils supports code generation for a variety of CPU and GPU hardware.
+
+```{eval-rst}
+.. currentmodule:: pystencils.config
+
+.. autosummary::
+  :nosignatures:
+
+  CreateKernelConfig.target
+
+.. module:: pystencils.target
+
+.. autosummary::
+  :toctree: autoapi
+  :nosignatures:
+  :template: autosummary/recursive_class.rst
+
+  Target
+```
+
+## Data Types
+
+```{code-cell} ipython3
+:tags: [remove-cell]
+
+from pystencils.kernelcreation import DefaultKernelCreationDriver
+from pystencils.display_utils import show_ir
+
+def _inspect_ir(kernel, cfg=ps.CreateKernelConfig()):
+    body = DefaultKernelCreationDriver(ps.CreateKernelConfig()).parse_kernel_body(assignments)
+    show_ir(body)
+```
+
+To produce valid output code, the code generator has to figure out the data types of each
+symbol, expression, and assignment occurring inside a kernel.
+This happens roughly according to the following rules:
+ - **Field Accesses:** Each field has a fixed data type set at its creation, which is also applied to
+   each access to that field.
+ - **Symbols:** Symbols obtain their data types from two sources. 
+   A symbol occurring first on the left-hand side of an assignment receives the data type that
+   was inferred for the right-hand side expression of that assignment.
+   Symbols occurring first inside some expression on the right-hand side of an assignment, on the other
+   hand, receive the {any}`default_dtype <CreateKernelConfig.default_dtype>` set in the {any}`CreateKernelConfig`.
+
+We can observe this behavior by setting up a kernel including several fields with different data types:
+
+```{code-cell} ipython3
+from pystencils.sympyextensions import CastFunc
+
+f = ps.fields("f: float64[2D]")
+g = ps.fields("g: float32[2D]")
+
+x, y, z = sp.symbols("x, y, z")
+
+assignments = [
+  ps.Assignment(z, 42),
+  ps.Assignment(x, f(0) + z),
+  ps.Assignment(y, g(0)),
+  ps.Assignment(f(0), CastFunc(y, "float64") + x)
+]
+```
+
+Inspecting the above kernel's intermediate representation, we see that `y` has inherited
+its data type from the access to `g`.
+The symbol `z`, on the other hand, has been assigned the {any}`default_dtype <CreateKernelConfig.default_dtype>`,
+which, unless otherwise specified, is `float64`:
+
+```{code-cell} ipython3
+:tags: [remove-input]
+
+_inspect_ir(assignments)
+```
+
+```{eval-rst}
+.. currentmodule:: pystencils.config
+
+.. autosummary::
+  :nosignatures:
+
+  CreateKernelConfig.default_dtype
+  CreateKernelConfig.index_dtype
+```
+
+## The Iteration Space
+
+The *domain fields* a kernel operates on are understood to reside on a common,
+one-, two- or three-dimensional computational grid.
+The grid points may be understood as vertices or cells, depending on the application.
+When executed, the kernel performs a computation and updates values on all, or a specific subset
+of, these grid points.
+The set of points the kernel actually operates on is defined by its *iteration space*.
+
+There are three distinct options to control the iteration space in the code generator,
+only one of which can be specified at a time:
+ - The ``ghost_layers`` option allows you to specify a number of layers of grid points on all
+   domain borders that should be excluded from iteration;
+ - The ``iteration_slice`` option allows you to describe the iteration space using Pythonic slice objects;
+ - The ``index_field`` option can be used to realize a sparse list-based iteration by passing a special
+   *index field* which holds a list of all points that should be processed.
+
+:::{note}
+  The points within a kernel's iteration space are understood to be processed concurrently and in
+  no particular order;
+  the output of any kernel that relies on some specific iteration order is therefore undefined.
+  (When running on a GPU, all grid points might in fact be processed in perfect simultaneity!)
+:::
+
+```{eval-rst}
+.. currentmodule:: pystencils.config
+
+.. autosummary::
+  :nosignatures:
+
+  CreateKernelConfig.ghost_layers
+  CreateKernelConfig.iteration_slice
+  CreateKernelConfig.index_field
+```
+
+```{code-cell} ipython3
+:tags: [remove-cell]
+
+def _draw_ispace(f_arr):
+    n, m = f_arr.shape
+    fig, ax = plt.subplots()
+    
+    ax.set_xticks(np.arange(0, m, 4))
+    ax.set_yticks(np.arange(0, n, 4))
+    # ax.set_xticklabels([])
+    # ax.set_yticklabels([])
+
+    ax.set_xticks(np.arange(-.5, m, 1), minor=True)
+    ax.set_yticks(np.arange(-.5, n, 1), minor=True)
+    
+    ax.grid(which="minor", linewidth=2)
+    ax.tick_params(which='minor', bottom=False, left=False)
+    
+    ax.imshow(f_arr, interpolation="none", aspect="equal", origin="lower")
+```
+
+### Specifying Ghost Layers
+
+One way to alter the iteration space is by introducing ghost layers on the domain borders.
+These layers of grid points are stripped from the iterations, and can be used to hold
+boundary values or exchange data in MPI-parallel simulations.
+
+#### Automatic Ghost Layers
+
+The easiest way to define an iteration space with ghost layers
+is to set `ghost_layers=ps.config.AUTO`, which is also the default
+when no iteration space options are specified.
+In this case, the code generator will examine the kernel to find the maximum range
+of its stencil -- that is, the maximum neighbor offset encountered in any field access.
+If, for instance, a neighbor node in $x$-direction with offset $k$ is accessed by the kernel,
+it cannot legally execute on the outermost $k$ layers of nodes in that direction since it would
+access memory out-of-bounds.
+Therefore, an automatic number of $k$ ghost layers is inferred.
+
+```{code-cell} ipython3
+:tags: [remove-cell]
+
+u, v = ps.fields("u, v: [2D]")
+```
+
+To illustrate, the following kernel accesses neighbor nodes with a maximum offset of two:
+
+```{code-cell} ipython3
+assignments = [
+  ps.Assignment(u.center(), v[-2, -2] + v[2, 2])
+]
+```
+
+With `ghost_layers=ps.config.AUTO`, its iteration space will look like this (yellow cells are included, purple cells excluded):
+
+```{code-cell} ipython3
+:tags: [remove-input]
+
+f = ps.fields("f: float64[2D]")
+assignments += [
+    ps.Assignment(f(0), 1)
+]
+kernel = ps.create_kernel(assignments).compile()
+
+f_arr = np.zeros((16, 16))
+u_arr = np.zeros_like(f_arr)
+v_arr = np.zeros_like(f_arr)
+
+kernel(f=f_arr, u=u_arr, v=v_arr)
+
+_draw_ispace(f_arr)
+```
+
+#### Uniform and Nonuniform Ghost Layers
+
+```{code-cell} ipython3
+:tags: [remove-cell]
+
+def _show_ispace(cfg):
+    f = ps.fields("f: float64[2D]")
+    assignments = [
+        ps.Assignment(f(0), 1)
+    ]
+    kernel = ps.create_kernel(assignments, cfg).compile()
+
+    f_arr = np.zeros((16, 16))
+    kernel(f=f_arr)
+
+    _draw_ispace(f_arr)
+```
+
+Setting `ghost_layers` to a number will remove that many layers from the iteration space in each dimension:
+
+```{code-cell} ipython3
+cfg = ps.CreateKernelConfig(
+    ghost_layers=1
+)
+```
+
+```{code-cell} ipython3
+:tags: [remove-input]
+
+_show_ispace(cfg)
+```
+
+Ghost layers can also be specified individually for each dimension and lower/upper borders,
+by passing a sequence with either a single integer or a pair of integers per dimension:
+
+```{code-cell} ipython3
+cfg = ps.CreateKernelConfig(
+    ghost_layers=[(2, 1), 3]
+)
+```
+
+```{code-cell} ipython3
+:tags: [remove-input]
+
+_show_ispace(cfg)
+```
+
+### Iteration Slices
+
+Using the `iteration_slice` option, we can exert much finer control over the kernel's iteration space
+by specifying it using sequences of Python `slice` objects.
+
+We can quickly create those using `ps.make_slice`, using the `start:stop:step` slice notation.
+The easiest case is to set the iteration space with fixed numerical limits:
+
+```{code-cell} ipython3
+cfg = ps.CreateKernelConfig(
+    iteration_slice=ps.make_slice[3:-4, 9:14]
+)
+```
+
+```{code-cell} ipython3
+:tags: [remove-input]
+
+_show_ispace(cfg)
+```
+
+#### Strided Iteration
+
+It is also possible to set up a strided iteration that skips over a fixed number of elements.
+The following example processes only every second line in $y$-direction, using the slice `::2`:
+
+```{code-cell} ipython3
+cfg = ps.CreateKernelConfig(
+    iteration_slice=ps.make_slice[::2, 3:-3]
+)
+```
+
+```{code-cell} ipython3
+:tags: [remove-input]
+
+_show_ispace(cfg)
+```
+
+#### Triangular Iteration
+
+Iteration slices are not limited to constant numerical values; they can be arbitrarily complex
+*SymPy* expressions.
+By using the counter symbol for the first dimension to define the iteration limits of the second,
+we can produce a triangular iteration pattern:
+
+```{code-cell} ipython3
+y = ps.DEFAULTS.spatial_counters[0]
+cfg = ps.CreateKernelConfig(
+    iteration_slice=ps.make_slice[:, y:]
+)
+```
+
+:::{warning}
+On CPU, the above will only work if the loop over the second dimension is nested
+*inside* the loop over the first dimension!
+The loop order depends on the memory layout of the fields used in the kernel;
+depending on the loop order, the above might have to be adapted such that the first dimension
+depends on the second.
+:::
+
+```{code-cell} ipython3
+:tags: [remove-input]
+
+_show_ispace(cfg)
+```
+
+#### Red-Black Iteration
+
+Using a case distinction for the second dimension's start index, we can even produce
+a checkerboard pattern, as required for e.g. red-black Gauss-Seidel-type smoothers.
+We use the integer remainder (`int_rem`) to distinguish between even- and odd-numbered rows,
+set the start value accordingly, and use a step size of two:
+
+```{code-cell} ipython3
+from pystencils.sympyextensions.integer_functions import int_rem
+
+y = ps.DEFAULTS.spatial_counters[0]
+start = sp.Piecewise(
+    (0, sp.Eq(int_rem(y, 2), 0)),
+    (1, True)
+)
+cfg = ps.CreateKernelConfig(
+    iteration_slice=ps.make_slice[:, start::2]
+)
+```
+
+:::{warning}
+As with the triangular iteration pattern described above, this specification of the red-black
+pattern also depends on the second dimension being mapped to the inner loop (for CPU kernels).
+:::
+
+```{code-cell} ipython3
+:tags: [remove-input]
+
+_show_ispace(cfg)
+```
+
+## Kernel Parameters and Function Objects
+
+```{eval-rst}
+.. module:: pystencils.backend.kernelfunction
+
+.. autosummary::
+  :toctree: autoapi
+  :nosignatures:
+  :template: autosummary/entire_class.rst
+
+  KernelParameter
+  KernelFunction
+  GpuKernelFunction
+```
\ No newline at end of file
diff --git a/docs/source/reference/kernelcreation.rst b/docs/source/reference/kernelcreation.rst.old
similarity index 100%
rename from docs/source/reference/kernelcreation.rst
rename to docs/source/reference/kernelcreation.rst.old
diff --git a/pyproject.toml b/pyproject.toml
index 5b72d5ef2..3626bba74 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,7 +47,7 @@ use_cython = [
 doc = [
     'sphinx',
     'furo',
-    'nbsphinx',
+    'myst-nb',
     'sphinxcontrib-bibtex',
     'sphinx_autodoc_typehints',
     'pandoc',
-- 
GitLab


From 57e5b7a373e6619fe0d1536e8ad398da794feec8 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 18 Nov 2024 14:19:53 +0100
Subject: [PATCH 08/31] Fix doc deps in pyproject.toml. Revert to C code
 printing in dtype explanation.

---
 docs/source/index.rst                        |  2 +-
 docs/source/reference/kernelcreation.md      | 47 ++++++++++-----
 docs/source/reference/kernelcreation.rst.old | 61 --------------------
 docs/source/reference/types.rst              |  2 +-
 pyproject.toml                               |  1 -
 5 files changed, 33 insertions(+), 80 deletions(-)
 delete mode 100644 docs/source/reference/kernelcreation.rst.old

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 868fbc036..31334af4f 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -39,7 +39,7 @@ Its features include:
   and take control of numerical precision using the `versatile type system <page_type_system>`.
 - **Kernel Description:** Derive and optimize stencil-based update rules using a symbolic abstraction
   of numerical `fields <page_symbolic_language>`.
-- **Code Generation:** `Generate and compile <page_kernel_creation>` high-performance parallel kernels for CPUs and GPUs.
+- **Code Generation:** `Generate and compile <guide_kernelcreation>` high-performance parallel kernels for CPUs and GPUs.
   Accelerate your kernels on multicore CPUs using the automatic OpenMP parallelization
   and make full use of your cores' SIMD units through the highly configurable vectorizer.
 - **Rapid Prototyping:** Run your numerical solvers on `NumPy <https://numpy.org>`_ and `CuPy <https://cupy.dev>`_ arrays
diff --git a/docs/source/reference/kernelcreation.md b/docs/source/reference/kernelcreation.md
index 0beeedfd2..0761f2532 100644
--- a/docs/source/reference/kernelcreation.md
+++ b/docs/source/reference/kernelcreation.md
@@ -23,6 +23,7 @@ import numpy as np
 import matplotlib.pyplot as plt
 ```
 
+(guide_kernelcreation)=
 # Kernel Creation
 
 Once a kernel's assignments are fully assembled, they need to be passed through pystencils' code
@@ -99,12 +100,12 @@ Pystencils supports code generation for a variety of CPU and GPU hardware.
 ```{code-cell} ipython3
 :tags: [remove-cell]
 
-from pystencils.kernelcreation import DefaultKernelCreationDriver
-from pystencils.display_utils import show_ir
+# from pystencils.kernelcreation import DefaultKernelCreationDriver
+# from pystencils.display_utils import show_ir
 
-def _inspect_ir(kernel, cfg=ps.CreateKernelConfig()):
-    body = DefaultKernelCreationDriver(ps.CreateKernelConfig()).parse_kernel_body(assignments)
-    show_ir(body)
+# def _inspect_ir(kernel, cfg=ps.CreateKernelConfig()):
+#     body = DefaultKernelCreationDriver(ps.CreateKernelConfig()).parse_kernel_body(assignments)
+#     show_ir(body)
 ```
 
 To produce valid output code, the code generator has to figure out the data types of each
@@ -123,28 +124,42 @@ We can observe this behavior by setting up a kernel including several fields wit
 ```{code-cell} ipython3
 from pystencils.sympyextensions import CastFunc
 
-f = ps.fields("f: float64[2D]")
-g = ps.fields("g: float32[2D]")
+f = ps.fields("f: float32[2D]")
+g = ps.fields("g: float16[2D]")
 
 x, y, z = sp.symbols("x, y, z")
 
 assignments = [
-  ps.Assignment(z, 42),
-  ps.Assignment(x, f(0) + z),
-  ps.Assignment(y, g(0)),
-  ps.Assignment(f(0), CastFunc(y, "float64") + x)
+  ps.Assignment(x, 42),
+  ps.Assignment(y, f(0) + x),
+  ps.Assignment(z, g(0))
 ]
+
+cfg = ps.CreateKernelConfig(
+  default_dtype="float32",
+  index_dtype="int32"
+)
+
+kernel = ps.create_kernel(assignments, cfg)
 ```
 
-Inspecting the above kernel's intermediate representation, we see that `y` has inherited
-its data type from the access to `g`.
-The symbol `z`, on the other hand, has been assigned the {any}`default_dtype <CreateKernelConfig.default_dtype>`,
-which, unless otherwise specified, is `float64`:
+Inspecting the above kernel's output code, we see that `x` has received the `float` type,
+which was specified via the {py:data}`default_dtype <CreateKernelConfig.default_dtype>` option.
+The symbol `z`, on the other hand, has inherited its data type `half` from the access to the field `g`
+on its declaration's right-hand side.
+Also, we can observe that the loop counters and symbols related to the field's memory layout
+are using the `int32` data type, as specified in {py:data}`index_dtype <CreateKernelConfig.index_dtype>`:
+
+:::{admonition} Developer TODO
+
+Don't show the final C code here, but inspect the codegen IR.
+That might make it clearer.
+:::
 
 ```{code-cell} ipython3
 :tags: [remove-input]
 
-_inspect_ir(assignments)
+ps.show_code(kernel)
 ```
 
 ```{eval-rst}
diff --git a/docs/source/reference/kernelcreation.rst.old b/docs/source/reference/kernelcreation.rst.old
deleted file mode 100644
index 7ea20d6b6..000000000
--- a/docs/source/reference/kernelcreation.rst.old
+++ /dev/null
@@ -1,61 +0,0 @@
-.. _page_kernel_creation:
-
-***************
-Kernel Creation
-***************
-
-Targets
-=======
-
-.. module:: pystencils.target
-
-.. autosummary::
-    :toctree: autoapi
-    :nosignatures:
-    :template: autosummary/recursive_class.rst
-
-    Target
-
-
-Configuration
-=============
-
-.. module:: pystencils.config
-
-.. autosummary::
-    :toctree: autoapi
-    :nosignatures:
-    :template: autosummary/entire_class.rst
-
-    CreateKernelConfig
-    CpuOptimConfig
-    OpenMpConfig
-    VectorizationConfig
-    GpuIndexingConfig
-
-
-Creation
-========
-
-.. module:: pystencils.kernelcreation
-
-.. autosummary::
-    :toctree: autoapi
-    :nosignatures:
-
-    create_kernel
-
-
-Kernel Parameters and Function Objects
-======================================
-
-.. module:: pystencils.backend.kernelfunction
-
-.. autosummary::
-    :toctree: autoapi
-    :nosignatures:
-    :template: autosummary/entire_class.rst
-
-    KernelParameter
-    KernelFunction
-    GpuKernelFunction
diff --git a/docs/source/reference/types.rst b/docs/source/reference/types.rst
index 041b0cb2a..a795cdaec 100644
--- a/docs/source/reference/types.rst
+++ b/docs/source/reference/types.rst
@@ -85,7 +85,7 @@ Exceptions
     :toctree: autoapi
     :nosignatures:
 
-    pystencils.types.PsTypeError
+    PsTypeError
 
 
 Implementation Details
diff --git a/pyproject.toml b/pyproject.toml
index 3626bba74..affddafd2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,7 +47,6 @@ use_cython = [
 doc = [
     'sphinx',
     'furo',
-    'myst-nb',
     'sphinxcontrib-bibtex',
     'sphinx_autodoc_typehints',
     'pandoc',
-- 
GitLab


From c43ea450ab6576da09dfc8a65375c439e7455e45 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 18 Nov 2024 16:33:11 +0100
Subject: [PATCH 09/31] Start writing GPU kernels reference guide.

---
 .gitlab-ci.yml                                |   1 +
 docs/source/conf.py                           |   1 +
 docs/source/reference/gpu_kernels.md          | 105 ++++++++++++++++++
 docs/source/reference/index.rst               |   4 +-
 docs/source/reference/kernelcreation.md       |  10 +-
 src/pystencils/backend/kernelfunction.py      |   3 +
 .../backend/platforms/generic_gpu.py          |   9 ++
 7 files changed, 128 insertions(+), 5 deletions(-)
 create mode 100644 docs/source/reference/gpu_kernels.md

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9eea3a2e6..97342f47a 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -328,6 +328,7 @@ build-documentation:
     - make html SPHINXOPTS="-W --keep-going"
   tags:
     - docker
+    - cuda11
   artifacts:
     paths:
       - docs/build/html
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 0e6fe25db..c38894118 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -49,6 +49,7 @@ intersphinx_mapping = {
     "numpy": ("https://docs.scipy.org/doc/numpy/", None),
     "matplotlib": ("https://matplotlib.org/", None),
     "sympy": ("https://docs.sympy.org/latest/", None),
+    "cupy": ("https://docs.cupy.dev/en/stable/", None),
 }
 
 # -- Options for inheritance diagrams-----------------------------------------
diff --git a/docs/source/reference/gpu_kernels.md b/docs/source/reference/gpu_kernels.md
new file mode 100644
index 000000000..735118e9a
--- /dev/null
+++ b/docs/source/reference/gpu_kernels.md
@@ -0,0 +1,105 @@
+---
+jupytext:
+  formats: md:myst
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+    jupytext_version: 1.16.4
+kernelspec:
+  display_name: Python 3 (ipykernel)
+  language: python
+  name: python3
+mystnb:
+  execution_mode: cache
+---
+
+```{code-cell} ipython3
+:tags: [remove-cell]
+
+import sympy as sp
+import pystencils as ps
+import numpy as np
+import matplotlib.pyplot as plt
+```
+
+(guide_gpukernels)=
+# CUDA Code Generation for GPUs
+
+Pystencils offers code generation for Nvidia GPUs using the CUDA programming model,
+as well as just-in-time compilation and execution of CUDA kernels from within Python
+based on the [cupy] library.
+This section's objective is to give a detailed introduction into the creation of
+GPU kernels with pystencils.
+
+## Generate, Compile and Run CUDA Kernels
+
+In order to obtain a CUDA implementation of a symbolic kernel, naught more is required
+than setting the {any}`target <CreateKernelConfig.target>` code generator option to
+{any}`Target.CUDA`:
+
+```{code-cell} ipython3
+f, g = ps.fields("f, g: float64[3D]")
+update = ps.Assignment(f.center(), 2 * g.center())
+
+cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+kernel = ps.create_kernel(update, cfg)
+
+ps.show_code(kernel)
+```
+
+The `kernel` object returned by the code generator in the above snippet is an instance
+of the {py:class}`GpuKernelFunction` class.
+It extends {py:class}`KernelFunction` with some GPU-specific information.
+In particular, it defines the {any}`threads_range <GpuKernelFunction.threads_range>`
+property, which tells us how many threads the kernel is expecting to be executed with:
+
+```{code-cell} ipython3
+kernel.threads_range
+```
+
+If a GPU is available and [cupy] is installed in the current environment,
+the kernel can be compiled and run immediately.
+To execute the kernel, a {any}`cupy.ndarray` has to be passed for each field;
+this is the GPU analogue to {any}`numpy.ndarray`:
+
+```{code-cell} ipython3
+:tags: [raises-exception]
+import cupy as cp
+
+rng = cp.random.default_rng(seed=42)
+f_arr = rng.random((16, 16, 16))
+g_arr = cp.zeros_like(f_arr)
+
+kfunc = kernel.compile()
+kfunc(f=f_arr, g=g_arr)
+```
+
+### Modifying the Launch Grid
+
+The `kernel.compile()` invocation in the above code produces a {any}`CupyKernelWrapper` callable object.
+Its interface allows us to customize the GPU launch grid.
+
+## API Reference
+
+```{eval-rst}
+.. autosummary::
+  :toctree: autoapi
+  :nosignatures:
+  :template: autosummary/recursive_class.rst
+
+  pystencils.backend.kernelfunction.GpuKernelFunction
+  pystencils.backend.jit.gpu_cupy.CupyKernelWrapper
+```
+
+:::{admonition} To Do:
+
+- GPU kernels in general: Selecting the CUDA target, compiling and running on cupy arrays
+- Setting the launch grid
+- Indexing options and iteration spaces
+- Fast approximation functions
+- Fp16 on GPU
+:::
+
+
+[cupy]: https://cupy.dev "CuPy Homepage"
diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst
index 7da7b5795..314ec3b14 100644
--- a/docs/source/reference/index.rst
+++ b/docs/source/reference/index.rst
@@ -7,8 +7,10 @@ Reference Guide
 These pages list the public APIs of pystencils, with advice on how to use them.
 
 .. toctree::
-    :maxdepth: 2
+    :maxdepth: 1
 
     symbolic_language
     kernelcreation
     types
+    gpu_kernels
+
diff --git a/docs/source/reference/kernelcreation.md b/docs/source/reference/kernelcreation.md
index 0761f2532..f9437dcd2 100644
--- a/docs/source/reference/kernelcreation.md
+++ b/docs/source/reference/kernelcreation.md
@@ -255,9 +255,10 @@ u, v = ps.fields("u, v: [2D]")
 To illustrate, the following kernel accesses neighbor nodes with a maximum offset of two:
 
 ```{code-cell} ipython3
-assignments = [
-  ps.Assignment(u.center(), v[-2, -2] + v[2, 2])
-]
+ranged_update = ps.Assignment(u.center(), v[-2, -2] + v[2, 2])
+
+cfg = ps.CreateKernelConfig(ghost_layers=ps.config.AUTO)
+kernel = ps.create_kernel(ranged_update, cfg)
 ```
 
 With `ghost_layers=ps.config.AUTO`, its iteration space will look like this (yellow cells are included, purple cells excluded):
@@ -266,7 +267,8 @@ With `ghost_layers=ps.config.AUTO`, its iteration space will look like this (yel
 :tags: [remove-input]
 
 f = ps.fields("f: float64[2D]")
-assignments += [
+assignments = [
+    ranged_update,
     ps.Assignment(f(0), 1)
 ]
 kernel = ps.create_kernel(assignments).compile()
diff --git a/src/pystencils/backend/kernelfunction.py b/src/pystencils/backend/kernelfunction.py
index afd54c110..e2161590e 100644
--- a/src/pystencils/backend/kernelfunction.py
+++ b/src/pystencils/backend/kernelfunction.py
@@ -259,6 +259,8 @@ def create_cpu_kernel_function(
 
 
 class GpuKernelFunction(KernelFunction):
+    """Internal representation of a kernel function targeted at CUDA GPUs."""
+
     def __init__(
         self,
         body: PsBlock,
@@ -277,6 +279,7 @@ class GpuKernelFunction(KernelFunction):
 
     @property
     def threads_range(self) -> GpuThreadsRange | None:
+        """Object exposing the total size of the launch grid this kernel expects to be executed with."""
         return self._threads_range
 
 
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index f6b888a49..0512351cd 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -49,6 +49,15 @@ class GpuThreadsRange:
     @property
     def dim(self) -> int:
         return self._dim
+    
+    def __str__(self) -> str:
+        rep = "GpuThreadsRange { "
+        rep += "; ".join(f"{x}: {w}" for x, w in zip("xyz", self._num_work_items))
+        rep += " }"
+        return rep
+    
+    def _repr_html_(self) -> str:
+        return str(self)
 
     @staticmethod
     def _from_full_ispace(ispace: FullIterationSpace) -> GpuThreadsRange:
-- 
GitLab


From 4331efb2a60d20cd8179ba1411aedbc89239551c Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Tue, 19 Nov 2024 13:45:54 +0100
Subject: [PATCH 10/31] documentation refactoring

- Theme: replace furo by sphinx-book-theme
- Refactor primary ToC; split into rubrics
- Add installation guide
- Move some apis to the "API" subpage
---
 docs/.gitignore                               |  2 +-
 docs/source/backend/ast.rst                   | 10 +--
 docs/source/conf.py                           |  4 +-
 docs/source/index.rst                         | 69 +++++++++++--------
 docs/source/installation.md                   | 54 +++++++++++++++
 docs/source/reference/{ => api}/field.rst     |  8 +--
 docs/source/reference/api/index.rst           | 12 ++++
 .../reference/{ => api}/sympyextensions.rst   | 10 +--
 docs/source/reference/gpu_kernels.md          |  2 +-
 docs/source/reference/index.rst               | 16 -----
 docs/source/reference/kernelcreation.md       | 10 +--
 docs/source/reference/symbolic_language.rst   | 10 +--
 docs/source/reference/types.rst               |  6 +-
 pyproject.toml                                |  2 +-
 14 files changed, 136 insertions(+), 79 deletions(-)
 create mode 100644 docs/source/installation.md
 rename docs/source/reference/{ => api}/field.rst (92%)
 create mode 100644 docs/source/reference/api/index.rst
 rename docs/source/reference/{ => api}/sympyextensions.rst (93%)
 delete mode 100644 docs/source/reference/index.rst

diff --git a/docs/.gitignore b/docs/.gitignore
index be765e19a..87cf4fd00 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -1,4 +1,4 @@
 build
 
 #   sphinx.ext.autosummary generated files
-**/autoapi
+**/generated
diff --git a/docs/source/backend/ast.rst b/docs/source/backend/ast.rst
index 84ccb01f3..409ae164a 100644
--- a/docs/source/backend/ast.rst
+++ b/docs/source/backend/ast.rst
@@ -20,7 +20,7 @@ Base Classes
 .. module:: pystencils.backend.ast.astnode
 
 .. autosummary::
-    :toctree: autoapi
+    :toctree: generated
     :nosignatures:
     :template: autosummary/entire_class.rst
 
@@ -34,7 +34,7 @@ Structural Nodes
 .. module:: pystencils.backend.ast.structural
 
 .. autosummary::
-    :toctree: autoapi
+    :toctree: generated
     :nosignatures:
     :template: autosummary/entire_class.rst
 
@@ -55,7 +55,7 @@ Expressions
 .. module:: pystencils.backend.ast.expressions
 
 .. autosummary::
-    :toctree: autoapi
+    :toctree: generated
     :nosignatures:
     :template: autosummary/entire_class.rst
 
@@ -108,7 +108,7 @@ SIMD Nodes
 .. module:: pystencils.backend.ast.vector
 
 .. autosummary::
-    :toctree: autoapi
+    :toctree: generated
     :nosignatures:
     :template: autosummary/entire_class.rst
 
@@ -123,7 +123,7 @@ Utility
 .. currentmodule:: pystencils.backend.ast
 
 .. autosummary::
-    :toctree: autoapi
+    :toctree: generated
     :nosignatures:
 
     expressions.evaluate_expression
diff --git a/docs/source/conf.py b/docs/source/conf.py
index c38894118..03d43b19b 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -60,7 +60,7 @@ inheritance_graph_attrs = {
 
 # -- Options for MyST / MyST-NB ----------------------------------------------
 
-nb_execution_mode = "off" # do not execute notebooks by default
+nb_execution_mode = "off"  # do not execute notebooks by default
 
 myst_enable_extensions = [
     "dollarmath",
@@ -69,7 +69,7 @@ myst_enable_extensions = [
 
 # -- Options for HTML output -------------------------------------------------
 
-html_theme = "furo"
+html_theme = "sphinx_book_theme"
 html_static_path = ["_static"]
 html_css_files = [
     'css/fixtables.css',
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 31334af4f..66582cb4b 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -2,15 +2,6 @@
 pystencils v2.0-dev Documentation
 #################################
 
-.. toctree::
-   :maxdepth: 1
-   :hidden:
-   
-   tutorials/index
-   reference/index
-   migration
-   backend/index
-
 .. note::
    You are currently viewing the documentation pages for the development revision |release|
    of pystencils 2.0.
@@ -50,32 +41,56 @@ Its features include:
   such as `waLBerla`_ to build massively parallel simulations.
 
 
-Contents
---------
+.. .. card:: Getting Started: Our Tutorials
+..    :link: page_tutorials
+..    :link-type: ref
 
-.. card:: Getting Started: Our Tutorials
-   :link: page_tutorials
-   :link-type: ref
+..    New to *pystencils*? Check out our set of tutorials to quickly and interactively learn the basics.
 
-   New to *pystencils*? Check out our set of tutorials to quickly and interactively learn the basics.
+.. .. card:: Reference Guide and APIs
+..    :link: page_api
+..    :link-type: ref
 
-.. card:: Reference Guide and APIs
-   :link: page_api
-   :link-type: ref
+..    Get an overview of *pystencils*' APIs for mathematical modelling and code generation.
 
-   Get an overview of *pystencils*' APIs for mathematical modelling and code generation.
+.. .. card:: Migration Guide: 1.3.x to 2.0
+..    :link: page_v2_migration
+..    :link-type: ref
 
-.. card:: Migration Guide: 1.3.x to 2.0
-   :link: page_v2_migration
-   :link-type: ref
+..    Find advice on migrating your code from *pystencils 1.3.x* to *pystencils 2.0*
 
-   Find advice on migrating your code from *pystencils 1.3.x* to *pystencils 2.0*
+.. .. card:: Developers's Reference: Code Generation Backend
+..    :link: page_codegen_backend
+..    :link-type: ref
 
-.. card:: Developers's Reference: Code Generation Backend
-   :link: page_codegen_backend
-   :link-type: ref
+..    Dive deep into the core of pystencils' code generation engine.
 
-   Dive deep into the core of pystencils' code generation engine.
+Topics
+------
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Getting Started
+
+   installation
+   tutorials/index
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Reference Guides
+
+   reference/symbolic_language
+   reference/kernelcreation
+   reference/gpu_kernels
+   reference/types
+   reference/api/index
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Advanced
+
+   migration
+   backend/index
 
 Projects using pystencils
 -------------------------
diff --git a/docs/source/installation.md b/docs/source/installation.md
new file mode 100644
index 000000000..dcc5b0474
--- /dev/null
+++ b/docs/source/installation.md
@@ -0,0 +1,54 @@
+(_installation)=
+# Setup and Installation
+
+## Install pystencils
+
+The latest development version of *pystencils 2.0* can be installed in one of two ways.
+You can either install it directly from our git repository:
+
+```bash
+pip install "git+https://i10git.cs.fau.de/pycodegen/pystencils.git@v2.0-dev"
+```
+
+Or clone the repository locally and perform an editable install:
+
+```bash
+git clone -b v2.0-dev https://i10git.cs.fau.de/pycodegen/pystencils.git
+pip install -e pystencils
+```
+
+### Feature Groups
+
+In both cases, you can add a set of optional features to your installation by listing them
+in square brackets (e.g. `pip install -e pystencils[feature1,feature2]`).
+The following feature sets are available:
+
+- `interactive` (**recommended**): Install dependencies for using pystencils interactively from
+  within Jupyter notebooks
+- `alltrafos` (**recommended**): Install dependencies to enable a wider set of code transformations.
+  These include [islpy](https://pypi.org/project/islpy/) for polyhedral loop transformations,
+  and [py-cpuinfo](https://pypi.org/project/py-cpuinfo/) for detecting the current hardware in order
+  to select optimal vector instructions.
+- `use_cython`: Install [Cython](https://cython.org/), which is used internally by pystencils
+  to accelerate the setup of boundary conditions
+
+:::{dropdown} For Developers
+
+If you are developing pystencils, we recommend you perform an editable install of your
+local clone of the repository, with all optional features:
+```bash
+pip install -e pystencils[alltrafos,interactive,use_cython,doc,tests]
+```
+
+This includes the additional feature groups `doc`, which contains all dependencies required
+to build this documentation, and `tests`, which adds `flake8` for code style checking,
+`mypy` for static type checking, and `pytest` plus plugins for running the test suite.
+:::
+
+### For Nvidia GPUs
+
+If you have an Nvidia graphics processor and CUDA installed, you can use pystencils to directly compile
+and execute kernels running on your GPU.
+This requires a working installation of [cupy](https://cupy.dev).
+Please refer to cupy's [installation manual](https://docs.cupy.dev/en/stable/install.html)
+for details about installing cupy.
diff --git a/docs/source/reference/field.rst b/docs/source/reference/api/field.rst
similarity index 92%
rename from docs/source/reference/field.rst
rename to docs/source/reference/api/field.rst
index 7ad7dafee..79cc12a3a 100644
--- a/docs/source/reference/field.rst
+++ b/docs/source/reference/api/field.rst
@@ -14,7 +14,7 @@ Creating Fields
 ---------------
 
 .. autosummary::
-    :toctree: autoapi
+    :toctree: generated
     :nosignatures:
 
     fields
@@ -30,7 +30,7 @@ Name and Element Type
 ^^^^^^^^^^^^^^^^^^^^^
 
 .. autosummary::
-    :toctree: autoapi
+    :toctree: generated
 
     Field.name
     Field.dtype
@@ -40,7 +40,7 @@ Dimensionality, Shape, and Memory Layout
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 .. autosummary::
-    :toctree: autoapi
+    :toctree: generated
     
     Field.ndim
     Field.values_per_cell
@@ -58,7 +58,7 @@ Accessing Field Entries
 -----------------------
 
 .. autosummary::
-    :toctree: autoapi
+    :toctree: generated
     :nosignatures:
 
     Field.center
diff --git a/docs/source/reference/api/index.rst b/docs/source/reference/api/index.rst
new file mode 100644
index 000000000..7a824422e
--- /dev/null
+++ b/docs/source/reference/api/index.rst
@@ -0,0 +1,12 @@
+***
+API
+***
+
+Modules
+=======
+
+.. toctree::
+    :maxdepth: 1
+
+    field
+    sympyextensions
diff --git a/docs/source/reference/sympyextensions.rst b/docs/source/reference/api/sympyextensions.rst
similarity index 93%
rename from docs/source/reference/sympyextensions.rst
rename to docs/source/reference/api/sympyextensions.rst
index 98b6e3593..d377f998e 100644
--- a/docs/source/reference/sympyextensions.rst
+++ b/docs/source/reference/api/sympyextensions.rst
@@ -7,7 +7,7 @@ Symbol Factory
 --------------
 
 .. autosummary::
-    :toctree: autoapi
+    :toctree: generated
     :nosignatures:
 
     SymbolCreator
@@ -17,7 +17,7 @@ Functions
 ---------
 
 .. autosummary::
-    :toctree: autoapi
+    :toctree: generated
     :nosignatures:
 
     math.prod
@@ -31,7 +31,7 @@ Expression Analysis
 -------------------
 
 .. autosummary::
-    :toctree: autoapi
+    :toctree: generated
     :nosignatures:
 
     math.is_constant
@@ -46,7 +46,7 @@ Expression Rewriting and Simplifications
 ----------------------------------------
 
 .. autosummary::
-    :toctree: autoapi
+    :toctree: generated
     :nosignatures:
 
     math.remove_small_floats
@@ -78,7 +78,7 @@ Integer Operations
 ------------------
 
 .. autosummary::
-    :toctree: autoapi
+    :toctree: generated
     :nosignatures:
     :template: autosummary/sympy_class.rst
 
diff --git a/docs/source/reference/gpu_kernels.md b/docs/source/reference/gpu_kernels.md
index 735118e9a..7747901f0 100644
--- a/docs/source/reference/gpu_kernels.md
+++ b/docs/source/reference/gpu_kernels.md
@@ -84,7 +84,7 @@ Its interface allows us to customize the GPU launch grid.
 
 ```{eval-rst}
 .. autosummary::
-  :toctree: autoapi
+  :toctree: generated
   :nosignatures:
   :template: autosummary/recursive_class.rst
 
diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst
deleted file mode 100644
index 314ec3b14..000000000
--- a/docs/source/reference/index.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-.. _page_api:
-
-###############
-Reference Guide
-###############
-
-These pages list the public APIs of pystencils, with advice on how to use them.
-
-.. toctree::
-    :maxdepth: 1
-
-    symbolic_language
-    kernelcreation
-    types
-    gpu_kernels
-
diff --git a/docs/source/reference/kernelcreation.md b/docs/source/reference/kernelcreation.md
index f9437dcd2..ba27b1e76 100644
--- a/docs/source/reference/kernelcreation.md
+++ b/docs/source/reference/kernelcreation.md
@@ -43,7 +43,7 @@ wide variety of options that influence the code generator.
 .. module:: pystencils.kernelcreation
 
 .. autosummary::
-  :toctree: autoapi
+  :toctree: generated
   :nosignatures:
 
   create_kernel
@@ -56,7 +56,7 @@ wide variety of options that influence the code generator.
 .. module:: pystencils.config
 
 .. autosummary::
-  :toctree: autoapi
+  :toctree: generated
   :nosignatures:
   :template: autosummary/entire_class.rst
 
@@ -67,7 +67,7 @@ wide variety of options that influence the code generator.
   GpuIndexingConfig
 
 .. autosummary::
-  :toctree: autoapi
+  :toctree: generated
   :nosignatures:
 
   AUTO
@@ -88,7 +88,7 @@ Pystencils supports code generation for a variety of CPU and GPU hardware.
 .. module:: pystencils.target
 
 .. autosummary::
-  :toctree: autoapi
+  :toctree: generated
   :nosignatures:
   :template: autosummary/recursive_class.rst
 
@@ -431,7 +431,7 @@ _show_ispace(cfg)
 .. module:: pystencils.backend.kernelfunction
 
 .. autosummary::
-  :toctree: autoapi
+  :toctree: generated
   :nosignatures:
   :template: autosummary/entire_class.rst
 
diff --git a/docs/source/reference/symbolic_language.rst b/docs/source/reference/symbolic_language.rst
index f56b8e372..63b94e04d 100644
--- a/docs/source/reference/symbolic_language.rst
+++ b/docs/source/reference/symbolic_language.rst
@@ -4,14 +4,6 @@
 Symbolic Language
 *****************
 
-.. toctree::
-    :maxdepth: 2
-    :hidden:
-
-    field
-    sympyextensions
-
-
 Pystencils allows you to define near-arbitrarily complex numerical kernels in its symbolic
 language, which is based on the computer algebra system `SymPy <https://www.sympy.org>`_.
 The pystencils code generator is able to parse and translate a large portion of SymPy's
@@ -64,7 +56,7 @@ An assignment collection contains two separate lists of assignments:
   into fields.
 
 .. autosummary::
-    :toctree: autoapi
+    :toctree: generated
     :nosignatures:
     :template: autosummary/recursive_class.rst
 
diff --git a/docs/source/reference/types.rst b/docs/source/reference/types.rst
index a795cdaec..3df133770 100644
--- a/docs/source/reference/types.rst
+++ b/docs/source/reference/types.rst
@@ -11,7 +11,7 @@ Type Creation and Conversion
 ----------------------------
 
 .. autosummary::
-    :toctree: autoapi
+    :toctree: generated
     :nosignatures:
 
     create_type
@@ -34,7 +34,7 @@ unless you have very particular needs.
     :parts: 1
 
 .. autosummary::
-    :toctree: autoapi
+    :toctree: generated
     :nosignatures:
     :template: autosummary/entire_class.rst
 
@@ -82,7 +82,7 @@ Exceptions
 .. currentmodule:: pystencils.types
 
 .. autosummary::
-    :toctree: autoapi
+    :toctree: generated
     :nosignatures:
 
     PsTypeError
diff --git a/pyproject.toml b/pyproject.toml
index affddafd2..f0ec014a5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,7 +46,7 @@ use_cython = [
 ]
 doc = [
     'sphinx',
-    'furo',
+    'sphinx-book-theme',
     'sphinxcontrib-bibtex',
     'sphinx_autodoc_typehints',
     'pandoc',
-- 
GitLab


From 9c509a902ef5dcdd36308e9c4d7ef10135496b76 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 20 Nov 2024 15:11:30 +0100
Subject: [PATCH 11/31] Implement interactive IR inspection

---
 src/pystencils/backend/emission/__init__.py   |   3 +-
 .../backend/emission/base_printer.py          |   2 +-
 src/pystencils/backend/emission/c_printer.py  |  13 +-
 src/pystencils/inspect.py                     | 152 ++++++++++++++++++
 src/pystencils/kernelcreation.py              |  40 ++++-
 5 files changed, 201 insertions(+), 9 deletions(-)
 create mode 100644 src/pystencils/inspect.py

diff --git a/src/pystencils/backend/emission/__init__.py b/src/pystencils/backend/emission/__init__.py
index b38b0c459..8d413d4c2 100644
--- a/src/pystencils/backend/emission/__init__.py
+++ b/src/pystencils/backend/emission/__init__.py
@@ -1,4 +1,5 @@
+from .base_printer import EmissionError
 from .c_printer import emit_code, CAstPrinter
 from .ir_printer import emit_ir, IRAstPrinter
 
-__all__ = ["emit_code", "CAstPrinter", "emit_ir", "IRAstPrinter"]
+__all__ = ["emit_code", "CAstPrinter", "emit_ir", "IRAstPrinter", "EmissionError"]
diff --git a/src/pystencils/backend/emission/base_printer.py b/src/pystencils/backend/emission/base_printer.py
index 9b1d5481a..50cd1bfea 100644
--- a/src/pystencils/backend/emission/base_printer.py
+++ b/src/pystencils/backend/emission/base_printer.py
@@ -189,7 +189,7 @@ class BasePrinter(ABC):
                 pc.indent_level += self._indent_width
                 interior = "\n".join(self.visit(stmt, pc) for stmt in statements) + "\n"
                 pc.indent_level -= self._indent_width
-                return pc.indent("{\n") + interior + pc.indent("}\n")
+                return pc.indent("{\n") + interior + pc.indent("}")
 
             case PsStatement(expr):
                 return pc.indent(f"{self.visit(expr, pc)};")
diff --git a/src/pystencils/backend/emission/c_printer.py b/src/pystencils/backend/emission/c_printer.py
index 0efe87588..487fe17bc 100644
--- a/src/pystencils/backend/emission/c_printer.py
+++ b/src/pystencils/backend/emission/c_printer.py
@@ -5,7 +5,7 @@ from pystencils.backend.memory import PsSymbol
 from .base_printer import BasePrinter
 
 from ..kernelfunction import KernelFunction
-from ...types import PsType, PsArrayType, PsScalarType
+from ...types import PsType, PsArrayType, PsScalarType, PsTypeError
 from ..ast.expressions import PsBufferAcc
 from ..ast.vector import PsVecMemAcc
 
@@ -33,7 +33,7 @@ class CAstPrinter(BasePrinter):
 
             case _:
                 return super().visit(node, pc)
-            
+
     def _symbol_decl(self, symb: PsSymbol):
         dtype = symb.get_dtype()
 
@@ -52,11 +52,12 @@ class CAstPrinter(BasePrinter):
     def _constant_literal(self, constant: PsConstant):
         dtype = constant.get_dtype()
         if not isinstance(dtype, PsScalarType):
-            raise EmissionError(
-                "Cannot print literals for non-scalar constants."
-            )
+            raise EmissionError("Cannot print literals for non-scalar constants.")
 
         return dtype.create_literal(constant.value)
 
     def _type_str(self, dtype: PsType):
-        return dtype.c_string()
+        try:
+            return dtype.c_string()
+        except PsTypeError:
+            raise EmissionError(f"Unable to print type {dtype} as a C data type.")
diff --git a/src/pystencils/inspect.py b/src/pystencils/inspect.py
new file mode 100644
index 000000000..d90155593
--- /dev/null
+++ b/src/pystencils/inspect.py
@@ -0,0 +1,152 @@
+from dataclasses import dataclass
+from typing import ClassVar
+
+from .backend.ast import PsAstNode
+from .backend.emission import CAstPrinter, IRAstPrinter, EmissionError
+
+_UNABLE_TO_DISPLAY_CPP = """
+<div>
+    <b>Unable to display C code for this abstract syntax tree</b>
+    <p>
+    This intermediate abstract syntax tree contains nodes that cannot be
+    printed as valid C code.
+    </p>
+</div>
+"""
+
+_ERR_MSG = """
+<div style="font-family: monospace; background-color: #EEEEEE; white-space: nowrap; overflow-x: scroll">
+    {}
+</div>
+"""
+
+
+class AstInspection:
+    """Inspect an abstract syntax tree produced by the code generation backend.
+
+    **Interactive:** This class can be used in Jupyter notebooks to interactively
+    explore an abstract syntax tree.
+    """
+
+    def __init__(self, ast: PsAstNode):
+        self._ast = ast
+        self._ir_printer = IRAstPrinter(annotate_constants=False)
+        self._c_printer = CAstPrinter()
+
+    def _ipython_display_(self):
+        from IPython.display import display
+
+        display(self._widget())
+
+    def _widget(self):
+        import ipywidgets as widgets
+
+        ir = self._ir_printer(self._ast)
+        ir_tab = widgets.HTML(self._highlight_as_cpp(ir))
+
+        try:
+            cpp = self._c_printer(self._ast)
+            cpp_tab = widgets.HTML(self._highlight_as_cpp(cpp))
+        except EmissionError as e:
+            cpp_tab = widgets.VBox(
+                children=[
+                    widgets.HTML(_UNABLE_TO_DISPLAY_CPP),
+                    widgets.Accordion(
+                        children=[widgets.HTML(_ERR_MSG.format(e.args[0]))],
+                        titles=["Error Details"],
+                    ),
+                ]
+            )
+
+        tabs = widgets.Tab()
+        tabs.children = [ir_tab, cpp_tab]
+
+        for t in tabs.children:
+            t.layout.display = "inline-block"
+            t.layout.padding = "0 15pt 0 0"
+
+        tabs.titles = ["IR Code", "C Code"]
+
+        tabs.layout.width = "100%"
+        tabs.layout.height = "250pt"
+
+        return tabs
+
+    def _highlight_as_cpp(self, code: str) -> str:
+        from pygments import highlight
+        from pygments.formatters import HtmlFormatter
+        from pygments.lexers import CppLexer
+
+        formatter = HtmlFormatter(
+            prestyles="white-space: pre;",
+        )
+        html_code = highlight(code, CppLexer(), formatter)
+        return html_code
+
+
+@dataclass
+class IntermediatesInspection:
+    """Inspect intermediate results produced by the code generator."""
+
+    parsed_body: PsAstNode | None = None
+    materialized_ispace: PsAstNode | None = None
+    constants_eliminated: PsAstNode | None = None
+    cpu_canonicalize: PsAstNode | None = None
+    cpu_hoist_invariants: PsAstNode | None = None
+    cpu_vectorize: PsAstNode | None = None
+    cpu_select_intrins: PsAstNode | None = None
+    cpu_openmp: PsAstNode | None = None
+    lowered: PsAstNode | None = None
+
+    DESCR: ClassVar = {
+        "parsed_body": "Freeze & Type Deduction",
+        "materialized_ispace": "Iteration Space Materialization",
+        "constants_eliminated": "Constant Elimination",
+        "cpu_canonicalize": "CPU: Symbol Canonicalization",
+        "cpu_hoist_invariants": "CPU: Hoisting of Loop Invariants",
+        "cpu_vectorize": "CPU: Vectorization",
+        "cpu_select_intrins": "CPU: Intrinsics Selection",
+        "cpu_openmp": "CPU: OpenMP Instrumentation",
+        "lowered": "C Language Lowering",
+    }
+
+    def _ipython_display_(self):
+        from IPython.display import display
+        import ipywidgets as widgets
+
+        from dataclasses import fields, asdict
+
+        all_fields = fields(IntermediatesInspection)
+        fields_dict = asdict(self)
+
+        all_asts = [
+            (self.DESCR.get(f.name, f.name), fields_dict[f.name])
+            for f in all_fields
+            if fields_dict[f.name] is not None
+        ]
+
+        labels = [t[0] for t in all_asts]
+
+        code_views = [AstInspection(t[1])._widget() for t in all_asts]
+
+        select_label = widgets.HTML("<div><b>Code Generator Stages</b></div>")
+        select = widgets.Select(options=labels)
+        select.layout.height = "250pt"
+
+        selection_box = widgets.VBox([select_label, select])
+        selection_box.layout.overflow = "visible"
+
+        preview_label = widgets.HTML("<div><b>Preview</b></div>")
+        preview_stack = widgets.Stack(children=code_views)
+        preview_stack.layout.overflow = "hidden"
+
+        preview_box = widgets.VBox([preview_label, preview_stack])
+
+        widgets.jslink((select, "index"), (preview_stack, "selected_index"))
+
+        grid = widgets.GridBox(
+            [selection_box, preview_box],
+            layout=widgets.Layout(grid_template_columns="max-content auto"),
+        )
+
+        display(grid)
diff --git a/src/pystencils/kernelcreation.py b/src/pystencils/kernelcreation.py
index 548fbc9bb..a9191aa8a 100644
--- a/src/pystencils/kernelcreation.py
+++ b/src/pystencils/kernelcreation.py
@@ -41,6 +41,8 @@ from .backend.kernelfunction import (
 from .simp import AssignmentCollection
 from sympy.codegen.ast import AssignmentBase
 
+from .inspect import IntermediatesInspection
+
 
 __all__ = ["create_kernel"]
 
@@ -74,7 +76,7 @@ def create_kernel(
 
 
 class DefaultKernelCreationDriver:
-    def __init__(self, cfg: CreateKernelConfig):
+    def __init__(self, cfg: CreateKernelConfig, inspect_intermediates: bool = False):
         self._cfg = cfg
 
         idx_dtype = create_numeric_type(self._cfg.index_dtype)
@@ -88,6 +90,15 @@ class DefaultKernelCreationDriver:
         self._target = self._cfg.get_target()
         self._platform = self._get_platform()
 
+        if inspect_intermediates:
+            self._inspect = IntermediatesInspection()
+        else:
+            self._inspect = None
+
+    @property
+    def intermediates(self) -> IntermediatesInspection | None:
+        return self._inspect
+
     def __call__(
         self,
         assignments: AssignmentCollection | Sequence[AssignmentBase] | AssignmentBase,
@@ -106,10 +117,16 @@ class DefaultKernelCreationDriver:
                     kernel_body, self._ctx.get_iteration_space()
                 )
 
+        if self._inspect is not None:
+            self._inspect.materialized_ispace = kernel_ast.clone()
+
         #   Fold and extract constants
         elim_constants = EliminateConstants(self._ctx, extract_constant_exprs=True)
         kernel_ast = cast(PsBlock, elim_constants(kernel_ast))
 
+        if self._inspect is not None:
+            self._inspect.constants_eliminated = kernel_ast.clone()
+
         #   Target-Specific optimizations
         if self._cfg.target.is_cpu():
             kernel_ast = self._transform_for_cpu(kernel_ast)
@@ -124,6 +141,9 @@ class DefaultKernelCreationDriver:
         select_functions = SelectFunctions(self._platform)
         kernel_ast = cast(PsBlock, select_functions(kernel_ast))
 
+        if self._inspect is not None:
+            self._inspect.lowered = kernel_ast.clone()
+
         #   Late canonicalization pass: Canonicalize new symbols introduced by LowerToC
 
         canonicalize = CanonicalizeSymbols(self._ctx, True)
@@ -194,15 +214,24 @@ class DefaultKernelCreationDriver:
         typify = Typifier(self._ctx)
         kernel_body = typify(kernel_body)
 
+        if self._inspect is not None:
+            self._inspect.parsed_body = kernel_body.clone()
+
         return kernel_body
 
     def _transform_for_cpu(self, kernel_ast: PsBlock):
         canonicalize = CanonicalizeSymbols(self._ctx, True)
         kernel_ast = cast(PsBlock, canonicalize(kernel_ast))
 
+        if self._inspect is not None:
+            self._inspect.cpu_canonicalize = kernel_ast.clone()
+
         hoist_invariants = HoistLoopInvariantDeclarations(self._ctx)
         kernel_ast = cast(PsBlock, hoist_invariants(kernel_ast))
 
+        if self._inspect is not None:
+            self._inspect.cpu_hoist_invariants = kernel_ast.clone()
+
         cpu_cfg = self._cfg.cpu_optim
 
         if cpu_cfg is None:
@@ -224,6 +253,9 @@ class DefaultKernelCreationDriver:
             add_omp = AddOpenMP(self._ctx, params)
             kernel_ast = cast(PsBlock, add_omp(kernel_ast))
 
+            if self._inspect is not None:
+                self._inspect.cpu_openmp = kernel_ast.clone()
+
         if cpu_cfg.use_cacheline_zeroing:
             raise NotImplementedError("CL-zeroing not implemented yet")
 
@@ -279,9 +311,15 @@ class DefaultKernelCreationDriver:
 
         kernel_ast = vectorizer.vectorize_select_loops(kernel_ast, loop_predicate)
 
+        if self._inspect is not None:
+            self._inspect.cpu_vectorize = kernel_ast.clone()
+
         select_intrin = SelectIntrinsics(self._ctx, self._platform)
         kernel_ast = cast(PsBlock, select_intrin(kernel_ast))
 
+        if self._inspect is not None:
+            self._inspect.cpu_select_intrins = kernel_ast.clone()
+
         return kernel_ast
 
     def _get_platform(self) -> Platform:
-- 
GitLab


From 8eb51b159110968d9b9065ecd1e7b5a2c823083c Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 20 Nov 2024 15:39:12 +0100
Subject: [PATCH 12/31] use descriptor magic instead of dataclasses

---
 src/pystencils/inspect.py | 93 ++++++++++++++++++++++++---------------
 1 file changed, 57 insertions(+), 36 deletions(-)

diff --git a/src/pystencils/inspect.py b/src/pystencils/inspect.py
index d90155593..c830c8702 100644
--- a/src/pystencils/inspect.py
+++ b/src/pystencils/inspect.py
@@ -1,6 +1,3 @@
-from dataclasses import dataclass
-from typing import ClassVar
-
 from .backend.ast import PsAstNode
 from .backend.emission import CAstPrinter, IRAstPrinter, EmissionError
 
@@ -84,50 +81,74 @@ class AstInspection:
         return html_code
 
 
-@dataclass
+class IntermediateAst:
+    def __init__(self, description: str | None = None):
+        self._description = description
+        self._name: str
+        self._lookup: str
+
+    @property
+    def description(self) -> str:
+        if self._description is not None:
+            return self._description
+        else:
+            return self._name
+
+    def __set_name__(self, owner, name: str):
+        self._name = name
+        self._lookup = f"_{name}"
+
+    def __get__(self, obj, objtype=None) -> AstInspection | None:
+        if obj is None:
+            return None
+
+        ast = getattr(obj, self._lookup, None)
+        if ast is not None:
+            return AstInspection(ast)
+        else:
+            return None
+
+    def __set__(self, obj, val: PsAstNode | None):
+        setattr(obj, self._lookup, val)
+
+
 class IntermediatesInspection:
     """Inspect intermediate results produced by the code generator."""
 
-    parsed_body: PsAstNode | None = None
-    materialized_ispace: PsAstNode | None = None
-    constants_eliminated: PsAstNode | None = None
-    cpu_canonicalize: PsAstNode | None = None
-    cpu_hoist_invariants: PsAstNode | None = None
-    cpu_vectorize: PsAstNode | None = None
-    cpu_select_intrins: PsAstNode | None = None
-    cpu_openmp: PsAstNode | None = None
-    lowered: PsAstNode | None = None
-
-    DESCR: ClassVar = {
-        "parsed_body": "Freeze & Type Deduction",
-        "materialized_ispace": "Iteration Space Materialization",
-        "constants_eliminated": "Constant Elimination",
-        "cpu_canonicalize": "CPU: Symbol Canonicalization",
-        "cpu_hoist_invariants": "CPU: Hoisting of Loop Invariants",
-        "cpu_vectorize": "CPU: Vectorization",
-        "cpu_select_intrins": "CPU: Intrinsics Selection",
-        "cpu_openmp": "CPU: OpenMP Instrumentation",
-        "lowered": "C Language Lowering",
-    }
+    parsed_body: IntermediateAst = IntermediateAst("Freeze & Type Deduction")
+    materialized_ispace: IntermediateAst = IntermediateAst(
+        "Iteration Space Materialization"
+    )
+    constants_eliminated: IntermediateAst = IntermediateAst("Constant Elimination")
+    cpu_canonicalize: IntermediateAst = IntermediateAst("CPU: Symbol Canonicalization")
+    cpu_hoist_invariants: IntermediateAst = IntermediateAst(
+        "CPU: Hoisting of Loop Invariants"
+    )
+    cpu_vectorize: IntermediateAst = IntermediateAst("CPU: Vectorization")
+    cpu_select_intrins: IntermediateAst = IntermediateAst("CPU: Intrinsics Selection")
+    cpu_openmp: IntermediateAst = IntermediateAst("CPU: OpenMP Instrumentation")
+    lowered: IntermediateAst = IntermediateAst("C Language Lowering")
 
     def _ipython_display_(self):
         from IPython.display import display
         import ipywidgets as widgets
 
-        from dataclasses import fields, asdict
-
-        all_fields = fields(IntermediatesInspection)
-        fields_dict = asdict(self)
-
-        all_asts = [
-            (self.DESCR.get(f.name, f.name), fields_dict[f.name])
-            for f in all_fields
-            if fields_dict[f.name] is not None
+        all_fields = [
+            (name, obj.description)
+            for name, obj in IntermediatesInspection.__dict__.items()
+            if isinstance(obj, IntermediateAst)
         ]
 
-        labels = [t[0] for t in all_asts]
+        previews: list[AstInspection] = []
+        labels: list[str] = []
+
+        for name, descr in all_fields:
+            preview = getattr(self, name)
+            if preview is not None:
+                previews.append(preview)
+                labels.append(descr)
 
-        code_views = [AstInspection(t[1])._widget() for t in all_asts]
+        code_views = [p._widget() for p in previews]
 
         select_label = widgets.HTML("<div><b>Code Generator Stages</b></div>")
         select = widgets.Select(options=labels)
-- 
GitLab


From 24478097a3f16a6abe6998ed8f9b92e68523c895 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Thu, 21 Nov 2024 16:00:22 +0100
Subject: [PATCH 13/31] refactor `inspect` to `inspection`; add ipython output
 for KernelFunction

---
 src/pystencils/__init__.py                   |   3 +-
 src/pystencils/{inspect.py => inspection.py} | 145 +++++++++++++------
 src/pystencils/kernelcreation.py             |   2 +-
 3 files changed, 103 insertions(+), 47 deletions(-)
 rename src/pystencils/{inspect.py => inspection.py} (63%)

diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py
index 66533a0b7..b2cdeca07 100644
--- a/src/pystencils/__init__.py
+++ b/src/pystencils/__init__.py
@@ -5,6 +5,7 @@ from .defaults import DEFAULTS
 from . import fd
 from . import stencil as stencil
 from .display_utils import get_code_obj, get_code_str, show_code, to_dot
+from .inspection import inspect
 from .field import Field, FieldType, fields
 from .types import create_type, create_numeric_type
 from .cache import clear_cache
@@ -37,7 +38,6 @@ from .sympyextensions.typed_sympy import TypedSymbol, DynamicType
 from .sympyextensions import SymbolCreator
 from .datahandling import create_data_handling
 
-
 __all__ = [
     "Field",
     "FieldType",
@@ -63,6 +63,7 @@ __all__ = [
     "to_dot",
     "get_code_obj",
     "get_code_str",
+    "inspect",
     "AssignmentCollection",
     "Assignment",
     "AddAugmentedAssignment",
diff --git a/src/pystencils/inspect.py b/src/pystencils/inspection.py
similarity index 63%
rename from src/pystencils/inspect.py
rename to src/pystencils/inspection.py
index c830c8702..759baddfb 100644
--- a/src/pystencils/inspect.py
+++ b/src/pystencils/inspection.py
@@ -1,5 +1,9 @@
+from typing import overload
+
 from .backend.ast import PsAstNode
 from .backend.emission import CAstPrinter, IRAstPrinter, EmissionError
+from .backend.kernelfunction import KernelFunction
+from abc import ABC, abstractmethod
 
 _UNABLE_TO_DISPLAY_CPP = """
 <div>
@@ -18,31 +22,24 @@ _ERR_MSG = """
 """
 
 
-class AstInspection:
-    """Inspect an abstract syntax tree produced by the code generation backend.
-
-    **Interactive:** This class can be used in Jupyter notebooks to interactively
-    explore an abstract syntax tree.
-    """
-
-    def __init__(self, ast: PsAstNode):
-        self._ast = ast
+class CodeInspectionBase(ABC):
+    def __init__(self) -> None:
         self._ir_printer = IRAstPrinter(annotate_constants=False)
         self._c_printer = CAstPrinter()
 
-    def _ipython_display_(self):
-        from IPython.display import display
-
-        display(self._widget())
-
-    def _widget(self):
+    def _ir_tab(self, ir_obj: PsAstNode | KernelFunction):
         import ipywidgets as widgets
 
-        ir = self._ir_printer(self._ast)
+        ir = self._ir_printer(ir_obj)
         ir_tab = widgets.HTML(self._highlight_as_cpp(ir))
+        self._apply_tab_layout(ir_tab)
+        return ir_tab
+
+    def _cpp_tab(self, ir_obj: PsAstNode | KernelFunction):
+        import ipywidgets as widgets
 
         try:
-            cpp = self._c_printer(self._ast)
+            cpp = self._c_printer(ir_obj)
             cpp_tab = widgets.HTML(self._highlight_as_cpp(cpp))
         except EmissionError as e:
             cpp_tab = widgets.VBox(
@@ -54,20 +51,12 @@ class AstInspection:
                     ),
                 ]
             )
+        self._apply_tab_layout(cpp_tab)
+        return cpp_tab
 
-        tabs = widgets.Tab()
-        tabs.children = [ir_tab, cpp_tab]
-
-        for t in tabs.children:
-            t.layout.display = "inline-block"
-            t.layout.padding = "0 15pt 0 0"
-
-        tabs.titles = ["IR Code", "C Code"]
-
-        tabs.layout.width = "100%"
-        tabs.layout.height = "250pt"
-
-        return tabs
+    def _apply_tab_layout(self, tab):
+        tab.layout.display = "inline-block"
+        tab.layout.padding = "0 15pt 0 0"
 
     def _highlight_as_cpp(self, code: str) -> str:
         from pygments import highlight
@@ -80,8 +69,56 @@ class AstInspection:
         html_code = highlight(code, CppLexer(), formatter)
         return html_code
 
+    def _ipython_display_(self):
+        from IPython.display import display
 
-class IntermediateAst:
+        display(self._widget())
+
+    @abstractmethod
+    def _widget(self): ...
+
+
+class AstInspection(CodeInspectionBase):
+    """Inspect an abstract syntax tree produced by the code generation backend.
+
+    **Interactive:** This class can be used in Jupyter notebooks to interactively
+    explore an abstract syntax tree.
+    """
+
+    def __init__(self, ast: PsAstNode):
+        super().__init__()
+        self._ast = ast
+
+    def _widget(self):
+        import ipywidgets as widgets
+
+        tabs = widgets.Tab(children=[self._ir_tab(self._ast), self._cpp_tab(self._ast)])
+        tabs.titles = ["IR Code", "C Code"]
+
+        tabs.layout.height = "250pt"
+
+        return tabs
+
+
+class KernelInspection(CodeInspectionBase):
+    def __init__(self, kernel: KernelFunction) -> None:
+        super().__init__()
+        self._kernel = kernel
+
+    def _widget(self):
+        import ipywidgets as widgets
+
+        tabs = widgets.Tab(
+            children=[self._ir_tab(self._kernel), self._cpp_tab(self._kernel)]
+        )
+        tabs.titles = ["IR Code", "C Code"]
+
+        tabs.layout.height = "250pt"
+
+        return tabs
+
+
+class StageResult:
     def __init__(self, description: str | None = None):
         self._description = description
         self._name: str
@@ -115,19 +152,15 @@ class IntermediateAst:
 class IntermediatesInspection:
     """Inspect intermediate results produced by the code generator."""
 
-    parsed_body: IntermediateAst = IntermediateAst("Freeze & Type Deduction")
-    materialized_ispace: IntermediateAst = IntermediateAst(
-        "Iteration Space Materialization"
-    )
-    constants_eliminated: IntermediateAst = IntermediateAst("Constant Elimination")
-    cpu_canonicalize: IntermediateAst = IntermediateAst("CPU: Symbol Canonicalization")
-    cpu_hoist_invariants: IntermediateAst = IntermediateAst(
-        "CPU: Hoisting of Loop Invariants"
-    )
-    cpu_vectorize: IntermediateAst = IntermediateAst("CPU: Vectorization")
-    cpu_select_intrins: IntermediateAst = IntermediateAst("CPU: Intrinsics Selection")
-    cpu_openmp: IntermediateAst = IntermediateAst("CPU: OpenMP Instrumentation")
-    lowered: IntermediateAst = IntermediateAst("C Language Lowering")
+    parsed_body = StageResult("Freeze & Type Deduction")
+    materialized_ispace = StageResult("Iteration Space Materialization")
+    constants_eliminated = StageResult("Constant Elimination")
+    cpu_canonicalize = StageResult("CPU: Symbol Canonicalization")
+    cpu_hoist_invariants = StageResult("CPU: Hoisting of Loop Invariants")
+    cpu_vectorize = StageResult("CPU: Vectorization")
+    cpu_select_intrins = StageResult("CPU: Intrinsics Selection")
+    cpu_openmp = StageResult("CPU: OpenMP Instrumentation")
+    lowered = StageResult("C Language Lowering")
 
     def _ipython_display_(self):
         from IPython.display import display
@@ -136,7 +169,7 @@ class IntermediatesInspection:
         all_fields = [
             (name, obj.description)
             for name, obj in IntermediatesInspection.__dict__.items()
-            if isinstance(obj, IntermediateAst)
+            if isinstance(obj, StageResult)
         ]
 
         previews: list[AstInspection] = []
@@ -149,6 +182,8 @@ class IntermediatesInspection:
                 labels.append(descr)
 
         code_views = [p._widget() for p in previews]
+        for v in code_views:
+            v.layout.width = "100%"
 
         select_label = widgets.HTML("<div><b>Code Generator Stages</b></div>")
         select = widgets.Select(options=labels)
@@ -171,3 +206,23 @@ class IntermediatesInspection:
         )
 
         display(grid)
+
+
+@overload
+def inspect(obj: PsAstNode): ...
+
+
+@overload
+def inspect(obj: KernelFunction): ...
+
+
+def inspect(obj: PsAstNode | KernelFunction):
+    from IPython.display import display
+
+    match obj:
+        case PsAstNode():
+            preview = AstInspection(obj)
+        case KernelFunction():
+            preview = KernelInspection(obj)
+
+    display(preview)
diff --git a/src/pystencils/kernelcreation.py b/src/pystencils/kernelcreation.py
index a9191aa8a..a4653b883 100644
--- a/src/pystencils/kernelcreation.py
+++ b/src/pystencils/kernelcreation.py
@@ -41,7 +41,7 @@ from .backend.kernelfunction import (
 from .simp import AssignmentCollection
 from sympy.codegen.ast import AssignmentBase
 
-from .inspect import IntermediatesInspection
+from .inspection import IntermediatesInspection
 
 
 __all__ = ["create_kernel"]
-- 
GitLab


From 56f7d51ec6592385c176d18f073180aa0fade9ac Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Thu, 21 Nov 2024 17:09:29 +0100
Subject: [PATCH 14/31] Extend kernel creation reference guide using code
 inspection

---
 docs/source/reference/gpu_kernels.md          |   2 +-
 docs/source/reference/kernelcreation.md       | 169 +++++++++++++-----
 src/pystencils/backend/emission/c_printer.py  |   5 +-
 src/pystencils/backend/emission/ir_printer.py |   2 +-
 src/pystencils/inspection.py                  |  34 +++-
 src/pystencils/kernelcreation.py              |  61 +++----
 6 files changed, 193 insertions(+), 80 deletions(-)

diff --git a/docs/source/reference/gpu_kernels.md b/docs/source/reference/gpu_kernels.md
index 7747901f0..c76215117 100644
--- a/docs/source/reference/gpu_kernels.md
+++ b/docs/source/reference/gpu_kernels.md
@@ -45,7 +45,7 @@ update = ps.Assignment(f.center(), 2 * g.center())
 cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
 kernel = ps.create_kernel(update, cfg)
 
-ps.show_code(kernel)
+ps.inspect(kernel)
 ```
 
 The `kernel` object returned by the code generator in above snippet is an instance
diff --git a/docs/source/reference/kernelcreation.md b/docs/source/reference/kernelcreation.md
index ba27b1e76..5f1c82279 100644
--- a/docs/source/reference/kernelcreation.md
+++ b/docs/source/reference/kernelcreation.md
@@ -28,16 +28,16 @@ import matplotlib.pyplot as plt
 
 Once a kernel's assignments are fully assembled, they need to be passed through pystencils' code
 generation engine in order to produce the kernel's executable code.
-The code generation engine can be invoked through the `create_kernel` function,
-which takes two arguments: The list of assignment that make up the kernel
-(optionally wrapped as an ``AssignmentCollection``),
-and a configuration object.
-The configuration object, an instance of {any}`CreateKernelConfig <pystencils.config.CreateKernelConfig>`, allows to set a
-wide variety of options that influence the code generator.
+The goal of this chapter is to shed light on pystencils' main code generation pipeline.
+Here, we show how to invoke the code generator and discuss its various configuration options
+and their effects on the generated kernel.
 
-## Code Generator API
+## Running the Code Generator
 
-### Invocation
+The primary way to invoke the code generation engine is through the `create_kernel` function.
+It takes two arguments:
+- the list of assignments that make up the kernel (optionally wrapped as an ``AssignmentCollection``),
+- and a configuration object, an instance of {any}`CreateKernelConfig <pystencils.config.CreateKernelConfig>`.
 
 ```{eval-rst}
 .. module:: pystencils.kernelcreation
@@ -49,7 +49,59 @@ wide variety of options that influence the code generator.
   create_kernel
 ```
 
-### Configuration
+For a simple kernel, an invocation of the code generator might look like this:
+
+```{code-cell} ipython3
+u_src, u_dst, f = ps.fields("u_src, u_dst, f: float32[2D]")
+h = sp.Symbol("h")
+
+update = [
+  ps.Assignment(
+    u_dst[0,0], (h**2 * f[0, 0] + u_src[1, 0] + u_src[-1, 0] + u_src[0, 1] + u_src[0, -1]) / 4
+  )
+]
+
+cfg = ps.CreateKernelConfig(
+  target=ps.Target.CUDA,
+  default_dtype="float32",
+  ghost_layers=1
+)
+
+kernel = ps.create_kernel(update, cfg)
+```
+
+The above snippet defines a five-point-stencil Jacobi update. A few noteworthy things are going on:
+- The target data type of the kernel is to be `float32`. This is explicitly specified for the three fields `u_src`, `u_dst` and `f`;
+  but left implicit for the symbol `h`. To make sure the data type is correctly applied to the whole kernel,
+  it must also be passed to the configuration as its `default_dtype`.
+- The target hardware for this kernel are Nvidia GPUs; this is reflected by the `target` property being set to `Target.CUDA`.
+- As the five-point stencil reads data from neighbors offset by one cell, it can not be legally executed on the outermost
+  layer of nodes of the fields' 2D arrays. Here, we ensure that these outer layers are excluded by setting `ghost_layers=1`.
+  This is not strictly necessary, since the code generator could infer that information by itself.
+
+## Inspecting the Generated Code
+
+The object returned by the code generator, here named `kernel`, is an instance of the {any}`KernelFunction` class.
+This object stores the kernel's name, its list of parameters, the set of fields it operates on, and its hardware target.
+Also, it of course holds the kernel itself, in the form of an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree) (AST).
+This tree can be printed out as compilable code in the target language (C++ or, in this case, CUDA),
+but holds much more information than the printed-out code string.
+When working in a Jupyter notebook, we can interactively inspect the kernel using pystencils' `inspect` function.
+This reveals a widget that allows us to investigate various details about the kernel:
+- its general properties, such as name, parameters, fields, target, ...;
+- its code, represented in the *pystencils IR syntax*;
+- the same code, in native C++/CUDA syntax;
+- and a visual representation of its abstract syntax tree.
+
+```{code-cell} ipython3
+ps.inspect(kernel)
+```
+
+## Configuring the Code Generator
+
+The code generation engine can be configured using a wide range of options.
+This section aims at explaining the majority of these options,
+their interaction and effects, use cases and caveats.
 
 ```{eval-rst}
 
@@ -73,7 +125,8 @@ wide variety of options that influence the code generator.
   AUTO
 ```
 
-## Target Specification
+
+### Target Specification
 
 Pystencils supports code generation for a variety of CPU and GPU hardware.
 
@@ -95,18 +148,7 @@ Pystencils supports code generation for a variety of CPU and GPU hardware.
   Target
 ```
 
-## Data Types
-
-```{code-cell} ipython3
-:tags: [remove-cell]
-
-# from pystencils.kernelcreation import DefaultKernelCreationDriver
-# from pystencils.display_utils import show_ir
-
-# def _inspect_ir(kernel, cfg=ps.CreateKernelConfig()):
-#     body = DefaultKernelCreationDriver(ps.CreateKernelConfig()).parse_kernel_body(assignments)
-#     show_ir(body)
-```
+### Data Types
 
 To produce valid output code, the code generator has to figure out the data types of each
 symbol, expression, and assignment occuring inside a kernel.
@@ -143,25 +185,27 @@ cfg = ps.CreateKernelConfig(
 kernel = ps.create_kernel(assignments, cfg)
 ```
 
-Inspecting the above kernel's output code, we see that `x` has received the `float` type,
+We can take a look at the result produced by the code generator after parsing the above kernel.
+Inspecting the internal representation of the kernel's body and loop nest,
+we see that `x` has received the `float32` type,
 which was specified via the {py:data}`default_dtype <CreateKernelConfig.default_dtype>` option.
-The symbol `y`, on the other hand, has inherited its data type `half` from the access to the field `g`
+The symbol `y`, on the other hand, has inherited its data type `float16` from the access to the field `g`
 on its declaration's right-hand side.
 Also, we can observe that the loop counters and symbols related to the field's memory layout
 are using the `int32` data type, as specified in {py:data}`index_dtype <CreateKernelConfig.index_dtype>`:
 
-:::{admonition} Developer TODO
-
-Don't show the final C code here, but inspect the codegen IR.
-That might make it clearer.
-:::
-
 ```{code-cell} ipython3
 :tags: [remove-input]
 
-ps.show_code(kernel)
+driver = ps.kernelcreation.get_driver(cfg, retain_intermediates=True)
+kernel = driver(assignments)
+driver.intermediates.materialized_ispace
 ```
 
+:::{note}
+To learn more about inspecting code after different stages of the code generator, refer to [this section](#section_codegen_stages).
+:::
+
 ```{eval-rst}
 .. currentmodule:: pystencils.config
 
@@ -172,7 +216,7 @@ ps.show_code(kernel)
   CreateKernelConfig.index_dtype
 ```
 
-## The Iteration Space
+### The Iteration Space
 
 The *domain fields* a kernel operates on are understood to reside on a common,
 one-, two- or three-dimensional computational grid.
@@ -228,13 +272,13 @@ def _draw_ispace(f_arr):
     ax.imshow(f_arr, interpolation="none", aspect="equal", origin="lower")
 ```
 
-### Specifying Ghost Layers
+#### Specifying Ghost Layers
 
 One way to alter the iteration space is by introducing ghost layers on the domain borders.
 These layers of grid points are stripped from the iterations, and can be used to hold
 boundary values or exchange data in MPI-parallel simulations.
 
-#### Automatic Ghost Layers
+##### Automatic Ghost Layers
 
 The easiest way to define an iteration space with ghost layers
 is to set `ghost_layers=ps.config.AUTO`, which is also the default
@@ -282,7 +326,7 @@ kernel(f=f_arr, u=u_arr, v=v_arr)
 _draw_ispace(f_arr)
 ```
 
-#### Uniform and Nonuniform Ghost Layers
+##### Uniform and Nonuniform Ghost Layers
 
 ```{code-cell} ipython3
 :tags: [remove-cell]
@@ -329,7 +373,7 @@ cfg = ps.CreateKernelConfig(
 _show_ispace(cfg)
 ```
 
-### Iteration Slices
+#### Iteration Slices
 
 Using the `iteration_slice` option, we can assert much finer control on the kernel's iteration space
 by specifying it using sequences of Python `slice` objects.
@@ -349,7 +393,7 @@ cfg = ps.CreateKernelConfig(
 _show_ispace(cfg)
 ```
 
-#### Strided Iteration
+##### Strided Iteration
 
 It is also possible to set up a strided iteration that skips over a fixed number of elements.
 The following example processes only every second line in $y$-direction, using the slice `::2`:
@@ -366,7 +410,7 @@ cfg = ps.CreateKernelConfig(
 _show_ispace(cfg)
 ```
 
-#### Triangular Iteration
+##### Triangular Iteration
 
 Iteration slices are not limited to constant numerical values; they can be arbitrarily complex
 *SymPy* expressions.
@@ -394,7 +438,7 @@ depends on the second.
 _show_ispace(cfg)
 ```
 
-#### Red-Black Iteration
+##### Red-Black Iteration
 
 Using a case distinction for the second dimension's start index, we can even produce
 a checkerboard pattern, as required for e.g. red-black Gauss-Seidel-type smoothers.
@@ -425,7 +469,50 @@ pattern also depends on the second dimension being mapped to the inner loop (for
 _show_ispace(cfg)
 ```
 
-## Kernel Parameters and Function Objects
+(section_codegen_stages)=
+## Advanced: Understanding the Stages of the Code Generator
+
+While translating a set of symbolic definitions to a kernel, the code generator of pystencils
+goes through a number of stages, gradually extending and transforming the AST.
+Pystencils allows you to retrieve and inspect the intermediate results produced by the
+code generator, in order to better understand the process of kernel translation.
+This can be immensely helpful when tracking down bugs or trying to explain unexpected
+output code.
+
+To get access to the intermediate results, the code generator has to be invoked in a slightly different way.
+Instead of just calling `create_kernel`, we directly create the so-called *driver* and instruct it to
+store its intermediate ASTs:
+
+```{code-cell} ipython3
+:tags: [remove-cell]
+u_src, u_dst, f = ps.fields("u_src, u_dst, f: float32[2D]")
+h = sp.Symbol("h")
+
+cfg = ps.CreateKernelConfig(
+  target=ps.Target.X86_AVX512,
+  default_dtype="float32",
+  cpu_optim=ps.CpuOptimConfig(
+    openmp=True,
+    vectorize=ps.VectorizationConfig(
+        assume_inner_stride_one=True
+    )
+  )
+)
+
+assignments = [
+  ps.Assignment(
+    u_dst[0,0], (h**2 * f[0, 0] + u_src[1, 0] + u_src[-1, 0] + u_src[0, 1] + u_src[0, -1]) / 4
+  )
+]
+```
+
+```{code-cell} ipython3
+driver = ps.kernelcreation.get_driver(cfg, retain_intermediates=True)
+kernel = driver(assignments)
+driver.intermediates
+```
+
+## API: Kernel Parameters and Function Objects
 
 ```{eval-rst}
 .. module:: pystencils.backend.kernelfunction
@@ -438,4 +525,4 @@ _show_ispace(cfg)
   KernelParameter
   KernelFunction
   GpuKernelFunction
-```
\ No newline at end of file
+```
diff --git a/src/pystencils/backend/emission/c_printer.py b/src/pystencils/backend/emission/c_printer.py
index 487fe17bc..95e27bd66 100644
--- a/src/pystencils/backend/emission/c_printer.py
+++ b/src/pystencils/backend/emission/c_printer.py
@@ -23,7 +23,10 @@ class CAstPrinter(BasePrinter):
     def visit(self, node: PsAstNode, pc: PrinterCtx) -> str:
         match node:
             case PsVecMemAcc():
-                raise EmissionError("Cannot print vectorized array accesses to C code.")
+                raise EmissionError(
+                    f"Unable to print C code for vector memory access {node}.\n"
+                    f"Vectorized memory accesses must be mapped to intrinsics before emission."
+                )
 
             case PsBufferAcc():
                 raise EmissionError(
diff --git a/src/pystencils/backend/emission/ir_printer.py b/src/pystencils/backend/emission/ir_printer.py
index 4986f1a7f..124ce200d 100644
--- a/src/pystencils/backend/emission/ir_printer.py
+++ b/src/pystencils/backend/emission/ir_printer.py
@@ -59,7 +59,7 @@ class IRAstPrinter(BasePrinter):
 
                 stride_code = "" if stride is None else f", stride={stride}"
 
-                code = f"vec_load< {lanes}{stride_code} >({ptr_code}, {offset_code})"
+                code = f"vec_memacc< {lanes}{stride_code} >({ptr_code}, {offset_code})"
                 return pc.parenthesize(code, Ops.Subscript)
 
             case PsVecBroadcast(lanes, operand):
diff --git a/src/pystencils/inspection.py b/src/pystencils/inspection.py
index 759baddfb..59be8c786 100644
--- a/src/pystencils/inspection.py
+++ b/src/pystencils/inspection.py
@@ -15,6 +15,15 @@ _UNABLE_TO_DISPLAY_CPP = """
 </div>
 """
 
+_GRAPHVIZ_NOT_IMPLEMENTED = """
+<div>
+    <b>AST Visualization Unavailable</b>
+    <p>
+    AST visualization using GraphViz is not implemented yet.
+    </p>
+</div>
+"""
+
 _ERR_MSG = """
 <div style="font-family: monospace; background-color: #EEEEEE; white-space: nowrap; overflow-x: scroll">
     {}
@@ -54,6 +63,13 @@ class CodeInspectionBase(ABC):
         self._apply_tab_layout(cpp_tab)
         return cpp_tab
 
+    def _graphviz_tab(self, ir_obj: PsAstNode | KernelFunction):
+        import ipywidgets as widgets
+
+        graphviz_tab = widgets.HTML(_GRAPHVIZ_NOT_IMPLEMENTED)
+        self._apply_tab_layout(graphviz_tab)
+        return graphviz_tab
+
     def _apply_tab_layout(self, tab):
         tab.layout.display = "inline-block"
         tab.layout.padding = "0 15pt 0 0"
@@ -92,8 +108,14 @@ class AstInspection(CodeInspectionBase):
     def _widget(self):
         import ipywidgets as widgets
 
-        tabs = widgets.Tab(children=[self._ir_tab(self._ast), self._cpp_tab(self._ast)])
-        tabs.titles = ["IR Code", "C Code"]
+        tabs = widgets.Tab(
+            children=[
+                self._ir_tab(self._ast),
+                self._cpp_tab(self._ast),
+                self._graphviz_tab(self._ast),
+            ]
+        )
+        tabs.titles = ["IR Code", "C Code", "AST Visualization"]
 
         tabs.layout.height = "250pt"
 
@@ -109,9 +131,13 @@ class KernelInspection(CodeInspectionBase):
         import ipywidgets as widgets
 
         tabs = widgets.Tab(
-            children=[self._ir_tab(self._kernel), self._cpp_tab(self._kernel)]
+            children=[
+                self._ir_tab(self._kernel),
+                self._cpp_tab(self._kernel),
+                self._graphviz_tab(self._kernel),
+            ]
         )
-        tabs.titles = ["IR Code", "C Code"]
+        tabs.titles = ["IR Code", "C Code", "AST Visualization"]
 
         tabs.layout.height = "250pt"
 
diff --git a/src/pystencils/kernelcreation.py b/src/pystencils/kernelcreation.py
index a4653b883..54b9f4694 100644
--- a/src/pystencils/kernelcreation.py
+++ b/src/pystencils/kernelcreation.py
@@ -2,12 +2,7 @@ from typing import cast, Sequence
 from dataclasses import replace
 
 from .target import Target
-from .config import (
-    CreateKernelConfig,
-    OpenMpConfig,
-    VectorizationConfig,
-    AUTO
-)
+from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO
 from .backend import KernelFunction
 from .types import create_numeric_type, PsIntegerType, PsScalarType
 from .backend.ast.structural import PsBlock, PsLoop
@@ -75,8 +70,12 @@ def create_kernel(
     return driver(assignments)
 
 
+def get_driver(cfg: CreateKernelConfig, *, retain_intermediates: bool = False):
+    return DefaultKernelCreationDriver(cfg, retain_intermediates)
+
+
 class DefaultKernelCreationDriver:
-    def __init__(self, cfg: CreateKernelConfig, inspect_intermediates: bool = False):
+    def __init__(self, cfg: CreateKernelConfig, retain_intermediates: bool = False):
         self._cfg = cfg
 
         idx_dtype = create_numeric_type(self._cfg.index_dtype)
@@ -90,22 +89,20 @@ class DefaultKernelCreationDriver:
         self._target = self._cfg.get_target()
         self._platform = self._get_platform()
 
-        if inspect_intermediates:
-            self._inspect = IntermediatesInspection()
+        if retain_intermediates:
+            self._intermediates = IntermediatesInspection()
         else:
-            self._inspect = None
+            self._intermediates = None
 
     @property
     def intermediates(self) -> IntermediatesInspection | None:
-        return self._inspect
+        return self._intermediates
 
     def __call__(
         self,
         assignments: AssignmentCollection | Sequence[AssignmentBase] | AssignmentBase,
     ):
-        kernel_body = self.parse_kernel_body(
-            assignments
-        )
+        kernel_body = self.parse_kernel_body(assignments)
 
         match self._platform:
             case GenericCpu():
@@ -117,15 +114,15 @@ class DefaultKernelCreationDriver:
                     kernel_body, self._ctx.get_iteration_space()
                 )
 
-        if self._inspect is not None:
-            self._inspect.materialized_ispace = kernel_ast.clone()
+        if self._intermediates is not None:
+            self._intermediates.materialized_ispace = kernel_ast.clone()
 
         #   Fold and extract constants
         elim_constants = EliminateConstants(self._ctx, extract_constant_exprs=True)
         kernel_ast = cast(PsBlock, elim_constants(kernel_ast))
 
-        if self._inspect is not None:
-            self._inspect.constants_eliminated = kernel_ast.clone()
+        if self._intermediates is not None:
+            self._intermediates.constants_eliminated = kernel_ast.clone()
 
         #   Target-Specific optimizations
         if self._cfg.target.is_cpu():
@@ -141,8 +138,8 @@ class DefaultKernelCreationDriver:
         select_functions = SelectFunctions(self._platform)
         kernel_ast = cast(PsBlock, select_functions(kernel_ast))
 
-        if self._inspect is not None:
-            self._inspect.lowered = kernel_ast.clone()
+        if self._intermediates is not None:
+            self._intermediates.lowered = kernel_ast.clone()
 
         #   Late canonicalization pass: Canonicalize new symbols introduced by LowerToC
 
@@ -214,8 +211,8 @@ class DefaultKernelCreationDriver:
         typify = Typifier(self._ctx)
         kernel_body = typify(kernel_body)
 
-        if self._inspect is not None:
-            self._inspect.parsed_body = kernel_body.clone()
+        if self._intermediates is not None:
+            self._intermediates.parsed_body = kernel_body.clone()
 
         return kernel_body
 
@@ -223,14 +220,14 @@ class DefaultKernelCreationDriver:
         canonicalize = CanonicalizeSymbols(self._ctx, True)
         kernel_ast = cast(PsBlock, canonicalize(kernel_ast))
 
-        if self._inspect is not None:
-            self._inspect.cpu_canonicalize = kernel_ast.clone()
+        if self._intermediates is not None:
+            self._intermediates.cpu_canonicalize = kernel_ast.clone()
 
         hoist_invariants = HoistLoopInvariantDeclarations(self._ctx)
         kernel_ast = cast(PsBlock, hoist_invariants(kernel_ast))
 
-        if self._inspect is not None:
-            self._inspect.cpu_hoist_invariants = kernel_ast.clone()
+        if self._intermediates is not None:
+            self._intermediates.cpu_hoist_invariants = kernel_ast.clone()
 
         cpu_cfg = self._cfg.cpu_optim
 
@@ -253,8 +250,8 @@ class DefaultKernelCreationDriver:
             add_omp = AddOpenMP(self._ctx, params)
             kernel_ast = cast(PsBlock, add_omp(kernel_ast))
 
-            if self._inspect is not None:
-                self._inspect.cpu_openmp = kernel_ast.clone()
+            if self._intermediates is not None:
+                self._intermediates.cpu_openmp = kernel_ast.clone()
 
         if cpu_cfg.use_cacheline_zeroing:
             raise NotImplementedError("CL-zeroing not implemented yet")
@@ -311,14 +308,14 @@ class DefaultKernelCreationDriver:
 
         kernel_ast = vectorizer.vectorize_select_loops(kernel_ast, loop_predicate)
 
-        if self._inspect is not None:
-            self._inspect.cpu_vectorize = kernel_ast.clone()
+        if self._intermediates is not None:
+            self._intermediates.cpu_vectorize = kernel_ast.clone()
 
         select_intrin = SelectIntrinsics(self._ctx, self._platform)
         kernel_ast = cast(PsBlock, select_intrin(kernel_ast))
 
-        if self._inspect is not None:
-            self._inspect.cpu_select_intrins = kernel_ast.clone()
+        if self._intermediates is not None:
+            self._intermediates.cpu_select_intrins = kernel_ast.clone()
 
         return kernel_ast
 
-- 
GitLab


From df7d2fae247c5ef0f5433320b567867537bf4230 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Fri, 22 Nov 2024 10:29:09 +0100
Subject: [PATCH 15/31] factor out stage results into kernelcreation module and
 separate display utility from data.

---
 docs/source/reference/kernelcreation.md |  4 +-
 src/pystencils/inspection.py            | 87 +++++++++----------------
 src/pystencils/kernelcreation.py        | 65 ++++++++++++++++--
 3 files changed, 93 insertions(+), 63 deletions(-)

diff --git a/docs/source/reference/kernelcreation.md b/docs/source/reference/kernelcreation.md
index 5f1c82279..a5be087fe 100644
--- a/docs/source/reference/kernelcreation.md
+++ b/docs/source/reference/kernelcreation.md
@@ -199,7 +199,7 @@ are using the `int32` data type, as specified in {py:data}`index_dtype <CreateKe
 
 driver = ps.kernelcreation.get_driver(cfg, retain_intermediates=True)
 kernel = driver(assignments)
-driver.intermediates.materialized_ispace
+ps.inspect(driver.intermediates.materialized_ispace)
 ```
 
 :::{note}
@@ -509,7 +509,7 @@ assignments = [
 ```{code-cell} ipython3
 driver = ps.kernelcreation.get_driver(cfg, retain_intermediates=True)
 kernel = driver(assignments)
-driver.intermediates
+ps.inspect(driver.intermediates)
 ```
 
 ## API: Kernel Parameters and Function Objects
diff --git a/src/pystencils/inspection.py b/src/pystencils/inspection.py
index 59be8c786..7fa3047c6 100644
--- a/src/pystencils/inspection.py
+++ b/src/pystencils/inspection.py
@@ -3,6 +3,7 @@ from typing import overload
 from .backend.ast import PsAstNode
 from .backend.emission import CAstPrinter, IRAstPrinter, EmissionError
 from .backend.kernelfunction import KernelFunction
+from .kernelcreation import StageResult, CodegenIntermediates
 from abc import ABC, abstractmethod
 
 _UNABLE_TO_DISPLAY_CPP = """
@@ -144,68 +145,18 @@ class KernelInspection(CodeInspectionBase):
         return tabs
 
 
-class StageResult:
-    def __init__(self, description: str | None = None):
-        self._description = description
-        self._name: str
-        self._lookup: str
-
-    @property
-    def description(self) -> str:
-        if self._description is not None:
-            return self._description
-        else:
-            return self._name
-
-    def __set_name__(self, owner, name: str):
-        self._name = name
-        self._lookup = f"_{name}"
-
-    def __get__(self, obj, objtype=None) -> AstInspection | None:
-        if obj is None:
-            return None
-
-        ast = getattr(obj, self._lookup, None)
-        if ast is not None:
-            return AstInspection(ast)
-        else:
-            return None
-
-    def __set__(self, obj, val: PsAstNode | None):
-        setattr(obj, self._lookup, val)
-
-
 class IntermediatesInspection:
-    """Inspect intermediate results produced by the code generator."""
-
-    parsed_body = StageResult("Freeze & Type Deduction")
-    materialized_ispace = StageResult("Iteration Space Materialization")
-    constants_eliminated = StageResult("Constant Elimination")
-    cpu_canonicalize = StageResult("CPU: Symbol Canonicalization")
-    cpu_hoist_invariants = StageResult("CPU: Hoisting of Loop Invariants")
-    cpu_vectorize = StageResult("CPU: Vectorization")
-    cpu_select_intrins = StageResult("CPU: Intrinsics Selection")
-    cpu_openmp = StageResult("CPU: OpenMP Instrumentation")
-    lowered = StageResult("C Language Lowering")
+    def __init__(self, intermediates: CodegenIntermediates):
+        self._intermediates = intermediates
 
     def _ipython_display_(self):
         from IPython.display import display
         import ipywidgets as widgets
 
-        all_fields = [
-            (name, obj.description)
-            for name, obj in IntermediatesInspection.__dict__.items()
-            if isinstance(obj, StageResult)
-        ]
+        stages = self._intermediates.available_stages
 
-        previews: list[AstInspection] = []
-        labels: list[str] = []
-
-        for name, descr in all_fields:
-            preview = getattr(self, name)
-            if preview is not None:
-                previews.append(preview)
-                labels.append(descr)
+        previews: list[AstInspection] = [AstInspection(stage.ast) for stage in stages]
+        labels: list[str] = [stage.label for stage in stages]
 
         code_views = [p._widget() for p in previews]
         for v in code_views:
@@ -242,7 +193,25 @@ def inspect(obj: PsAstNode): ...
 def inspect(obj: KernelFunction): ...
 
 
-def inspect(obj: PsAstNode | KernelFunction):
+@overload
+def inspect(obj: StageResult): ...
+
+
+@overload
+def inspect(obj: CodegenIntermediates): ...
+
+
+def inspect(obj):
+    """Interactively inspect various products of the code generator.
+    
+    When run inside a Jupyter notebook, this function displays an inspection widget
+    for the following types of objects:
+    - `PsAstNode`
+    - `KernelFunction`
+    - `StageResult`
+    - `CodegenIntermediates`
+    """
+
     from IPython.display import display
 
     match obj:
@@ -250,5 +219,11 @@ def inspect(obj: PsAstNode | KernelFunction):
             preview = AstInspection(obj)
         case KernelFunction():
             preview = KernelInspection(obj)
+        case StageResult(ast, _):
+            preview = AstInspection(ast)
+        case CodegenIntermediates():
+            preview = IntermediatesInspection(obj)
+        case _:
+            raise ValueError(f"Cannot inspect object of type {type(obj)}")
 
     display(preview)
diff --git a/src/pystencils/kernelcreation.py b/src/pystencils/kernelcreation.py
index 54b9f4694..e718f1617 100644
--- a/src/pystencils/kernelcreation.py
+++ b/src/pystencils/kernelcreation.py
@@ -1,10 +1,13 @@
+from __future__ import annotations
+
 from typing import cast, Sequence
-from dataclasses import replace
+from dataclasses import dataclass, replace
 
 from .target import Target
 from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO
 from .backend import KernelFunction
 from .types import create_numeric_type, PsIntegerType, PsScalarType
+from .backend.ast import PsAstNode
 from .backend.ast.structural import PsBlock, PsLoop
 from .backend.kernelcreation import (
     KernelCreationContext,
@@ -36,8 +39,6 @@ from .backend.kernelfunction import (
 from .simp import AssignmentCollection
 from sympy.codegen.ast import AssignmentBase
 
-from .inspection import IntermediatesInspection
-
 
 __all__ = ["create_kernel"]
 
@@ -90,12 +91,12 @@ class DefaultKernelCreationDriver:
         self._platform = self._get_platform()
 
         if retain_intermediates:
-            self._intermediates = IntermediatesInspection()
+            self._intermediates = CodegenIntermediates()
         else:
             self._intermediates = None
 
     @property
-    def intermediates(self) -> IntermediatesInspection | None:
+    def intermediates(self) -> CodegenIntermediates | None:
         return self._intermediates
 
     def __call__(
@@ -362,6 +363,60 @@ class DefaultKernelCreationDriver:
         )
 
 
+@dataclass
+class StageResult:
+    ast: PsAstNode
+    label: str
+
+
+class StageResultSlot:
+    def __init__(self, description: str | None = None):
+        self._description = description
+        self._name: str
+        self._lookup: str
+
+    def __set_name__(self, owner, name: str):
+        self._name = name
+        self._lookup = f"_{name}"
+
+    def __get__(self, obj, objtype=None) -> StageResult | None:
+        if obj is None:
+            return None
+
+        ast = getattr(obj, self._lookup, None)
+        if ast is not None:
+            descr = self._name if self._description is None else self._description
+            return StageResult(ast, descr)
+        else:
+            return None
+
+    def __set__(self, obj, val: PsAstNode | None):
+        setattr(obj, self._lookup, val)
+
+
+class CodegenIntermediates:
+    """Intermediate results produced by the code generator."""
+
+    parsed_body = StageResultSlot("Freeze & Type Deduction")
+    materialized_ispace = StageResultSlot("Iteration Space Materialization")
+    constants_eliminated = StageResultSlot("Constant Elimination")
+    cpu_canonicalize = StageResultSlot("CPU: Symbol Canonicalization")
+    cpu_hoist_invariants = StageResultSlot("CPU: Hoisting of Loop Invariants")
+    cpu_vectorize = StageResultSlot("CPU: Vectorization")
+    cpu_select_intrins = StageResultSlot("CPU: Intrinsics Selection")
+    cpu_openmp = StageResultSlot("CPU: OpenMP Instrumentation")
+    lowered = StageResultSlot("C Language Lowering")
+
+    @property
+    def available_stages(self) -> Sequence[StageResult]:
+        all_results: list[StageResult | None] = [
+            getattr(self, name)
+            for name, slot in CodegenIntermediates.__dict__.items()
+            if isinstance(slot, StageResultSlot)
+        ]
+        return tuple(filter(lambda r: r is not None, all_results))  # type: ignore
+
+
 def create_staggered_kernel(
     assignments, target: Target = Target.CPU, gpu_exclusive_conditions=False, **kwargs
 ):
-- 
GitLab


From 0b5cd939ac1d14b09a04b02f12d7c5bb8932f513 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Fri, 22 Nov 2024 11:45:25 +0100
Subject: [PATCH 16/31] Add manual launch grids, triangular iteration to GPU
 guide.

---
 docs/source/reference/gpu_kernels.md    | 112 ++++++++++++++++++++++--
 docs/source/reference/kernelcreation.md |  23 +++--
 2 files changed, 123 insertions(+), 12 deletions(-)

diff --git a/docs/source/reference/gpu_kernels.md b/docs/source/reference/gpu_kernels.md
index c76215117..c3fa70ec2 100644
--- a/docs/source/reference/gpu_kernels.md
+++ b/docs/source/reference/gpu_kernels.md
@@ -24,7 +24,7 @@ import matplotlib.pyplot as plt
 ```
 
 (guide_gpukernels)=
-# CUDA Code Generation for GPUs
+# Pystencils for GPUs
 
 Pystencils offers code generation for Nvidia GPUs using the CUDA programming model,
 as well as just-in-time compilation and execution of CUDA kernels from within Python
@@ -79,6 +79,111 @@ kfunc(f=f_arr, g=g_arr)
 
 The `kernel.compile()` invocation in the above code produces a {any}`CupyKernelWrapper` callable object.
 Its interface allows us to customize the GPU launch grid.
+We can manually set both the number of threads per block, and the number of blocks on the grid:
+
+```{code-cell} ipython3
+kfunc.block_size = (16, 8, 8)
+kfunc.num_blocks = (1, 2, 2)
+```
+
+In most cases, the number of blocks is automatically inferred from the block size
+in order to cover the entire iteration space, so it does not need to be specified.
+Setting a launch grid that is larger than the iteration space is also possible,
+but will cause any threads working outside of the iteration bounds to idle.
+
+### Manual Launch Grids and Non-Cuboid Iteration Patterns
+
+In some cases, it will be unavoidable to set the launch grid size manually,
+especially if the code generator is unable to automatically determine the size of the
+iteration space.
+An example of this is the triangular iteration previously described in the [Kernel Creation Guide](#example_triangular_iteration).
+Let's set it up once more:
+
+```{code-cell} ipython3
+:tags: [remove-cell]
+
+def _draw_ispace(f_arr):
+    n, m = f_arr.shape
+    fig, ax = plt.subplots()
+    
+    ax.set_xticks(np.arange(0, m, 4))
+    ax.set_yticks(np.arange(0, n, 4))
+    # ax.set_xticklabels([])
+    # ax.set_yticklabels([])
+
+    ax.set_xticks(np.arange(-.5, m, 1), minor=True)
+    ax.set_yticks(np.arange(-.5, n, 1), minor=True)
+    
+    ax.grid(which="minor", linewidth=2)
+    ax.tick_params(which='minor', bottom=False, left=False)
+    
+    ax.imshow(f_arr, interpolation="none", aspect="equal", origin="lower")
+```
+
+```{code-cell} ipython3
+:tags: [remove-cell]
+
+f = ps.fields("f: float64[2D]")
+assignments = [
+    ps.Assignment(f(0), 1)
+]
+```
+
+```{code-cell} ipython3
+y = ps.DEFAULTS.spatial_counters[0]
+cfg = ps.CreateKernelConfig(
+    target=ps.Target.CUDA,
+    iteration_slice=ps.make_slice[:, y:]
+)
+    
+kernel = ps.create_kernel(assignments, cfg).compile()
+```
+
+This warns us that the threads range could not be determined automatically.
+We can disable this warning by setting `manual_launch_grid` in the GPU indexing options:
+
+```{code-cell} ipython3
+cfg = ps.CreateKernelConfig(
+    # ... other options ...
+    gpu_indexing=ps.GpuIndexingConfig(
+        manual_launch_grid=True
+    )
+)
+```
+
+Now, to execute our kernel, we have to manually specify its launch grid:
+
+```{code-cell} ipython3
+kernel.block_size = (8, 8)
+kernel.num_blocks = (2, 2)
+```
+
+This way the kernel will cover this iteration space:
+
+```{code-cell} ipython3
+:tags: [remove-input]
+f_arr = cp.zeros((16, 16))
+kernel(f=f_arr)
+_draw_ispace(cp.asnumpy(f_arr))
+```
+
+We can also observe the effect of decreasing the launch grid size:
+
+```{code-cell} ipython3
+kernel.block_size = (4, 4)
+kernel.num_blocks = (2, 3)
+```
+
+```{code-cell} ipython3
+:tags: [remove-input]
+f_arr = cp.zeros((16, 16))
+kernel(f=f_arr)
+_draw_ispace(cp.asnumpy(f_arr))
+```
+
+Here, since there are only eight threads operating in the $x$-direction
+and twelve threads in the $y$-direction,
+only a part of the triangle is being processed.
 
 ## API Reference
 
@@ -92,11 +197,8 @@ Its interface allows us to customize the GPU launch grid.
   pystencils.backend.jit.gpu_cupy.CupyKernelWrapper
 ```
 
-:::{admonition} To Do:
+:::{admonition} Developers To Do:
 
-- GPU kernels in general: Selecting the CUDA target, compiling and running on cupy arrays
-- Setting the launch grid
-- Indexing options and iteration spaces
 - Fast approximation functions
 - Fp16 on GPU
 :::
diff --git a/docs/source/reference/kernelcreation.md b/docs/source/reference/kernelcreation.md
index a5be087fe..be60fb28e 100644
--- a/docs/source/reference/kernelcreation.md
+++ b/docs/source/reference/kernelcreation.md
@@ -410,6 +410,7 @@ cfg = ps.CreateKernelConfig(
 _show_ispace(cfg)
 ```
 
+(example_triangular_iteration)=
 ##### Triangular Iteration
 
 Iteration slices are not limited to constant numerical values; they can be arbitrarily complex
@@ -425,11 +426,11 @@ cfg = ps.CreateKernelConfig(
 ```
 
 :::{warning}
-On CPU, the above will only work if the loop over the second dimension is nested
-*inside* the loop over the first dimension!
-The loop order depends on the memory layout of the fields used in the kernel;
-depending on the loop order, the above might have to be adapted such that the first dimension
-depends on the second.
+This type of dependency is restricted by the ordering of the iteration space dimensions:
+The limits of a dimension can only depend on the counters of dimensions that are *slower*
+than itself.
+The ordering of dimensions is determined by the memory layout of the kernel's fields;
+see also the [section on memory layouts](#section_memory_layout).
 :::
 
 ```{code-cell} ipython3
@@ -459,8 +460,8 @@ cfg = ps.CreateKernelConfig(
 ```
 
 :::{warning}
-As with the triangular iteration pattern described above, this specification of the red-black
-pattern also depends on the second dimension being mapped to the inner loop (for CPU kernels).
+The restrictions on dimension ordering of the triangular iteration example apply
+to the checkerboard iteration as well.
 :::
 
 ```{code-cell} ipython3
@@ -469,6 +470,14 @@ pattern also depends on the second dimension being mapped to the inner loop (for
 _show_ispace(cfg)
 ```
 
+(section_memory_layout)=
+## Memory Layout and Dimension Ordering
+
+:::{admonition} Developer To Do
+Briefly explain field memory layouts, cache locality, coalesced memory accesses (on GPU and vector CPUs),
+and the need for correct ordering of the dimensions (loop order on CPU, thread indexing order on GPU).
+:::
+
 (section_codegen_stages)=
 ## Advanced: Understanding the Stages of the Code Generator
 
-- 
GitLab


From bb19ebcbc5eff0ed6fd25c884682ad75b2323251 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Fri, 22 Nov 2024 11:53:18 +0100
Subject: [PATCH 17/31] use CUDA image for building docs

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 97342f47a..9f80cd261 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -318,7 +318,7 @@ tests-and-coverage:
 
 
 build-documentation:
-  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/documentation
+  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full
   stage: docs
   needs: []
   before_script:
-- 
GitLab


From 098f5b9e5c4536fd955985f7bc5d4c90f58939b8 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 25 Nov 2024 10:07:55 +0100
Subject: [PATCH 18/31] fix pystencils logo geometry. Add light and dark mode
 logos for new doc theme. Pin pydata theme dependency.

The pydata theme dependency is pinned to 0.15.4 to work around an issue with the sidebar overlay in 0.16; see [this discussion](https://github.com/executablebooks/sphinx-book-theme/issues/865)
---
 docs/source/_static/img/logo copy.png         | Bin 10116 -> 0 bytes
 ...ogo_large.svg => pystencils-logo-dark.svg} | 411 ++++++++--------
 .../_static/img/pystencils-logo-light.svg     | 465 ++++++++++++++++++
 docs/source/conf.py                           |  15 +-
 pyproject.toml                                |   3 +-
 5 files changed, 691 insertions(+), 203 deletions(-)
 delete mode 100644 docs/source/_static/img/logo copy.png
 rename docs/source/_static/img/{logo_large.svg => pystencils-logo-dark.svg} (52%)
 create mode 100644 docs/source/_static/img/pystencils-logo-light.svg

diff --git a/docs/source/_static/img/logo copy.png b/docs/source/_static/img/logo copy.png
deleted file mode 100644
index 784183cd932787cfce3cf6c38e77622f40699e8f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 10116
zcmeAS@N?(olHy`uVBq!ia0y~yV3-EN9Bd2>3^t5~j~EyjSc;uILpV4%IBGajIv5xj
zI14-?iy0WW?tw6432)>l1_lPn64!{5;QX|b^2DN4hVt@qz0ADq;^f4FRK5J7^x5xh
zq!<_!7(87ZLn`LHom)Lc<m=Sq_FrGT`u@eHifxADg+{kH<F@JRRT?_^r>w|m%2CK$
zR9$x0*0bm4<i}Y>n=I4MMxM{}NxHV?W~!^tA{B{Nl`iioya_!Dr?MqnCF9N+*gQMv
z<G+9P>Q~$AL&^;gpE=OLC$P)@`Inh5-&Ort^(*@Oy<J~_-<^NNlSA=I?C!F=Y&ug^
zUS4*XDsZUwQ;{o|xaTAzhuH#;N*|b)mX_|HuHyM6a@qIUbFIt$rl<atRCJkp`>k2i
z!3D31c1}qB8MAHo3D&;U$QQf(bi|BHV#8HEvnHheJRQ8u=jO!tV}jSt&9%;-ocgnJ
z`|Y)1;o;hK`~Uyhvg<^=NT*BFwQFI{*GrjyIK{@^l{&$xoxbuO4-3<+8HUMA7PV>>
zsYOh4EiT^Nas07l<h1oOjZ#nXMRF*f-t?#E_S>-TMOsB>CpMj#VVJDqdpYE?$$T*<
zk058~!%PXKxw*Qo-@~6Q7U(~2S@3|tspEF)1HJ9Hb46TDIi|7leAu++^Q|@96CD&9
zmSwViU=5wL#Gkd<@oSCwE2l%7bW$WY=;)kwIQO@7LaKwC8*9q*T0suB<}9<>bJsG3
z>P>%Lv;8^KA*W*pj7khHMYu7Y_DP(XDWJw><W#1p9j7y$UqpY?t42E|?HChZW{H(t
zigzZd%+34RJ%7*X%_Ta@eN!#_rsf>Ja@*nkp)+3^Rw|$5+Og+EmS)#_p*vGgyffG&
z<`h<I!K59a;_1ZkP;Zw*L5$pzjhE&|Y!utnSvzCn-z}!^pZ(>(w_N>}uh!gSPlG=0
z*xMs#JBxGj>yK0W-M?L5#HR0D;ij;&$&x#4$0MfSpR|;s6I29enw@#af1>Q|qRi`T
zX8Cp8ws)OZ^4H{j$WUN;!I^QVaDut{-jh3SMcJ8tXJZPz9C&*9q94a^y$iUeky0qy
znkpILCNwe2`F)<s?Qdf1*VZ;VpZ&d3v;C8xz=wuqx2go0e7G-Jxl9(zeOq1mPF(j+
zfiw5?YfCTaoQ@0NR6HGZ@K_A1o(}ivOB=uFZL;-!@isc*>!0(6!N+HqNF7prP~Pm#
z>T37#&uO8Ah=51U6Zrc|Qhyx26~5!3kz{$R><w>6aj{Lh>zI~2Xl<S7vZO!Z+n&?%
z*Vie{wmJ4AFd#B1S}T+_^40GxaSW>%d^hUOT-Bfwxh8shUqI#zFMn3OaL!fhuis5-
z46Ja!!+rX~#xELfUl`oK=RZ8$TrC-#*{${BY^0W^`kFZX$W6L4R^2EJUn%c4)iKoI
z^s%q64HnBQd`XUQ+ms-@l2unnM^^iC<IJ3Az4m(#o(OWZ;Hc}@owa8HoAV@xsRti#
zI%>K|q;I?Y!x!z=hQjR|dsq^9_BD3TI3zahgo07*i;HP}<=cYm*cP2q_FSOxpv>_E
z|5McqH+BeXm-1}dxKHvo)1nQnQ+<<Dciq_hz4lMuY0fL{TB~e?)OEs7u)FigAN44|
zA+}m$%^~L*k{g0tcK1CloZGST+!x0dl}43j?ev{I%FPBs38@?8qr~M}b#-)fw7ch>
zykU8^rlC&k_M{U#T_wM)Y1E1Ia1v?$?&RX+HZ3+o&U{+xa>14j|L1A`EY)S4<?YHL
zsp!c%TS52C;&X~3;wR#~oKu%<o?v>whC7s#@ley6%=9Osq0V7{6?SW0367cJKG!+S
z#$%np)`_P*Q&NOp&e@?8t>f&PkpAd$<O@&7uO}vY%=<KF2bVE>?1Mv}l&ZB0rpGvC
zMzS7%Ji((m@s?h$#-mgA)@M={s&mdYoMEz*@2auTO;Hy;uGJqlr10Jh?b9yMod55M
zx~0%m3%T}`V`AZUc~U$RtJ7wjbTZ*QzH8x@92V2Z%M>f!8(%FCm_JK&%Z8LtX5AL4
z9a93gh6e=~OP+mu;U4oxlNXWJ&rU_Jzo&nB)6wbI6lXOojxE^Yp876*oja$W!?!+$
zo54KyxP>JnxflG%Dwyj3v@P1M|H7d+_F3FZtrMg5ZL`^*xxHH|y=`jO(%kFUFEO(B
zojfzwdisYO*@0fi7FTI;?eY&xyp`D=vt4X0&r+49+owLXx{AGyt8Bd6ZntC^(<$a}
z$#S~WCyF;@K4bc-BGc`Ze&=ZV`u{vz!_OQJjq882n(b!zhL55mdp5jTD7@*JbE*o%
zq$iJ_oo}xwzy1H?w`t`TH8zeK*PBB^-_$J6bI@VExa7=?jmZ%;x4*KiN?$bf>Oz*|
zP1-D352Lg-xK8U;BsDJmHDRxEpsILR*r_Yo?uIMhMMZskpY*%$?Uu5#gqdYxwh=*2
zDW@vbeD^guSTu>OI4Ziv^;=(4(){uT8amREZrefvm`~X8o|ZlEcCE`4mleAEn=Gbp
zc)Ip$g-WAp!lj<<8D`mLo3?YE3~qhHwYcikhLrwV=YR)`nig&Fn;>7Jb^64nt~Eb4
z-bqxsFY)Nz<C=mkf{m>zo>x|t?pS5Vswcnd(eX*96HHUOn8QDdv|i<x<Q9z#ofyi$
zDa^@j-II(v?JArqo=VF74&SamX)3l1w_l^N;+{yWDTBJ!yBke{lV<m<w75Tel62(P
z1pX76Tv-ZThpc2b<$SLZP`A2#<NxW_S-)k3js^#(L}+lm*3r1O%fi+*k(aShtu~{J
z!8K7-QmyNd6?fOV4_`!DLqUpGpV$x}axF5Tq;>i8CXWeG>b-k+U)Xe0{8wz^`_JVI
z=j{Hhro(+&^7P9;`pnb*Zt**3-@JJZ_p_8(jfkDalf<u_6i>@PcYUYz^Fyb)J+||d
zoOasIv+0n_c_{_&6CN2|UEY&EtdN?uMk#vQ-z)15uwILHXiH3xU~}TwbaTz;ou{8V
zO;SmTEIW7or$zpApNMIxiHY1=nG!BvRqoz2xqFjmW6rE83zb=Rq)5(xq_(6|<LaF$
zl{GA}c{ZEcW^GP+K8yRLs8pMy*=*PArA9M(X8N#QHrc4N+q-|&svR+UF?!;8+ogRk
z2ZVl|)BR=+$LUQvJUWxw{KBoyPAuV<SKF1*>25jc&9ek=?pWKvO;ytz4xB%6N-#!f
zUD6i&BQqQ%lbfD~WNkZO66vaIa5^<oY+CBXvuU1clU0N`AAPhqnKCK)c;Cf~7aeWn
z+%KDWd3&#1x$@xq?~gzK<X~(5QDbK#$G>WoRbI&s$+E<!uA=58ean(eO1FQGE0@lf
zub6*!w%O%t7yDK|@!IM!$#$LB`A6l|lIH8Cy|(+Qxoldvv0=!`h<eu4S>pXwbw^9}
z-gD`mK4idC600uPFWl>vnV<jstoi+v`}=B_T+Y0cw_SN!j#+o+tdP*q!lzTijq>hT
z9C~Q5Ec4OVs*5IX=X7L8v`#IrFN}R&pml29WB1D4duN8O-{bRnVwlSb_Qi+G-padf
zYfZiBDj>9yhnM}^rsvJJXXe?qOWPZEuMz0sZ&^8EZ<+pirW4bisLy@#^5w$Kn<u}$
zy?t}xV>c6DWf3mJq9-1oCQs#3H1M75qvmWQmwtAZX-If@@oC-d0imI;7BXxW>W`%2
z5*a<6mn+XsDBsd+FMI27mw27?6rC#}4hOQkwc?%l-+T<6#u_QlQoCd^XWZ=%^)AV_
z9XIDY3JR?XYSjAucGE`jmC<L|U){`^R$=2;U2VO7|39nS+j18#TGSMv(Q^HDsms*u
zi>9R}&Qd#=Fd@OfBRySRJ8VtD!$YkaU5{A4uCm&5F6+j&yhyROR6%3oH~nQB{{Bde
zt9bd{B~^IUDa9Ae+q14m+U(SrDo`eBGM#HyS3v5voLctz=bb-fnXBfR#~t-`xO%Sp
ztX25B3kE!sPO8Ml#=g0+u{rnlHp8SN9H&m5`cSpkFW}y8TPKbd0Y8TKMLRR{^V7e+
zx~dbmM<O)zyr+SE&i-!JCA!VqQkraSW%ul<lYM(qZUxUx)=PV%)?F)o-<5u;^HW7v
z4)^)@jhhq}ZE7+9_l3>+R#b+kOQ)n{PjSeVH#K)epT#EF$UVQYF?mh={&@u!K9P}<
zcJujtF9(RQdQSSarz%=y{dI3oPtJ!0GeWeE9_y8!yWoGw&h`Dxi6YZn51lEnxOuiZ
zxBceddwIJ*WnZ{%zD(-Hqw~{`pS~XdW#>W@hil6}9=FeZ;;?4&xe%GZO6%TdMotsf
z&b_lkaR2@L$Nzp6yL|GAeYgm#?_~}arYX~>f4*1!{^GT3Zdy}cP5v##*XA41mMVDp
zvW(}^it6KLJnK)dJMq4Z_su^xr$b`XS2bMcRjuQbv+yWh(|yioQb{An>YY1hCK$Zf
zvHu{;)!B_lo_?&ju`!vw(}n4<LEF_V-qjm~s-}Gl>3KFS?e@~jEyB~XR(RZaXOzCP
zgmYFD-=(RZ6AjNi%&L<=*RE~C)9j;m_}<=XMmC-+?M`-?ugZ3R{QXzPrs9I<2EOGR
z+H@<-b-Jgl6gJ*;M5u5D*W8^eK6%_Ij991hadv**qU_gsZbvpvS<?33-M-?3g0HXd
zx!>yZDw^*9eOG>Xm;d(Lz6KI1lTSV<+kM_Eq_V1e_1?X6L$q229_a^1h>58Q?Mh9M
zSf;BLX}9;c^n|#^y;_s~mcK9)d2)_r<&6)=xEL;GOz}~ByyyM9cgNnA@xQifKB%xs
zCt2c{yO8LY((a!(P8qbB_ehw9XWTy)<#Fle{0WSLH#X0S5-;DsVvp<39sP#;_x_Q6
zQeaqf$my7>c7+J*^hLL(p8jOI{``!6sb54Q-8Skh*{oUkN8HIyeyK*6aK`-HHPgj2
z&1RRbnZHP5dHhFZgPe5%Y<s%gJiZ2m#d&E>-g5NmwxY;yigBGkofa%hoo?dy@=nR|
z=Mg4frP5kful(57_F6zDY;Dw0<MTF;_w0Ya@AnM*`hOeV8Bg`Pc<tJ;b+Nl6_Ewqh
zT9-U)*<?d|Vee_;p=y&Cu35v=fBf@wZJp3*oI6b1et1j7ThBZEeh+KvF58DaO3BB4
zoo@#Dh3sPNG^sm#_($O4l^=e6Sb1y1e1!$^HN}4Ge#A6wUbc+Q_p*oDWELhy^<q1b
zPhsZ`crLv(`B*VWd;OkEg4g1gXosZE6xu$0;yi`$SVP|VYaDbkuSb}iS?J3+asDgC
z>{~}27IQAU;+vSWV9ti){qB1!()B|_MSU-S`2E*sp3Tg6@A8B?S#IW(dE{-}zQSlG
z&&CLzhF!a?_Qd7SmlHgEaKo{PIM2i><_B5=R5z!c+jLs+yGqd`qc<;QSZZ0P)z2@l
zuezqUug=CzK)g6T_UH`<&D2@^Z<T5pc(;Xq`B^h%%9Mh$rq>z1-OVtXS#l>S&StOc
zwcnffEbVezTvAq6^z~|Za9CK|{rB=^-PS#anEF=*zSLF>5_<OQWw_zLOB>4!zr2ev
zEWNuy`zY7xZ9YGDT@PFT(>Y1=MaQBt)gO1BKMM{IJUhd|vhHYWRbSWfdHpPz_wKSW
zF=}aP?fCoc_RDwgmMvb~9HHZ;Hu<ISmFL#X&vu`CvCGeB=91mJXMg)vrnmczQQDaq
zjs_AXR)2Zb4~y?PZKNd-a;N9xRKvB8gn~ZjJ-wb?cmAKVZotf~^QEgye^2b4bh+aF
zO1VjrFS;K0y}na&+U8l$>@!afdL7EMmkZtdqB{BM<XH;?I&R&HdUkg9@zv}1btOtn
znKtcPwFIxa72C2P&7_SIva+&0lE!Vb^Y_jC^r?ucv7tEKVok36x!%(aYNtagYJ3!U
zHnl$grvG7o^u2`J*3l=wsJyj{l)K&;xmWi4^u~EC%BQqv{6A{FW0#s!?N5zkELt2Z
zq8z^{oc$ie!NSCF<@$ADuGWgTTd!wiW-eT^<cMJSzQytH9qt<Nq@A60G&DTcH8pi<
z@>;#FbtjD;1e>~5Fn#;qHT#q$W18Q}=jZeDpFL<l>vG%Lpp_xgX%kD{^lm|A_oAa)
zi)UEY2?={^OUkcVa!Xw0$+q8n=X*!aJ%3Q7a{AQsM|R)MTeWIcMpo9Mg$o;#kM~VH
zn>JB}Q}IpEw1oj3oWg1b#m{^WJv3Olbm^2SQ@%(?7_NCM_fGc7akgI}y@LA~?558)
zn%-{sPoVT+pp3Wb%X4a-ODCmV%Jgzl^fECQoi#rzzC$X0Lf}22Q#N)?tJp*B{>x4}
zUwX`5zpduZ)6YLObahidJvpf|+0*B;id65TvfZ-W&%XcuShcrrYLH0kjmN9(a_z*e
zCl)7tI4Ktrvd6L2%fI1Pf~BFY_oXwFHkRmci$oq<t-%svuDE^rf>#HZuj-p>b$^l7
z{3mx89e;fA`Da0{))Q%){RGx)bbUUxT>I0<D?3F*=R9Uy{d0!*RNEgRy%QcD`_Pmq
zS)1k;mMSSKscSCfuGf($ynI=Ccv9rFiza6UubgC7{&afI`Hu|GYg9_^RYi)em=&8-
zzu=wpxevd%1A?Caepp^tQCqj=tiyfgzbxf?mib$y{q`MVi+mxo_1Tpe59NiQc~T9I
z{M-^DeoR07$)d#?Ym^Q<1x7eOa4KmPw*obA#Xt?*bKheh#xuDl@-{Bgwkhs9q-CTd
z(poAe68rF_CfDi{8wwn!Jl+>@GQ3D^Qp<~2Esse>qVu~&=dv%>Si`tsgKKJpSX<&m
zBUPT8lV87A@kn9yJbc1d=X_|A)&1Um2Q;`=i&q-HoaP#2?(srwV~=CDlhQLiR}Y~{
z0y16^GH-kYzi9lpyXx**_p`pHaUl``BGZ~<weBd*Gkf7BE%VZ|P-WJ$h?w+rpJy}b
zb(kgy?76HLz%P^Ta=^8;bGjg-d8(c0{AG(`8Ds_ZS-K7>h1Ujb??~Gu^U7=UcH?97
zwy8wlzkhu9`?t%!259e^=o+CmNvHV9)vtTL#tDBd4gPUthha>dXi80L!>a`fFTy@#
zi2rx!za04C&?eW^3sa45T%Vb6eWJyB^LLN?Uz=YN>}}dE;^o%#ipkTBqkGE2%OyW7
ze1FG1I=jj%@5dXNb)7v6_w8yqtyNRzZnJZPy0U&oV&t^eJVzaEiwjeeUN6+p-@kM6
z_4s<(!_iz*A0K$k9?I;waWyv=w~lZRPs%AvA^UUd!`3Z&yLrB1_0^CRl_?usR{Fbc
z)w{Rq-15a6T3J;}n-5<U>2<HJNWIK=E&oP?#@a>))*YW0P4d6;Txh4`)>-T(`a5b;
z9?S0e^=kEw7_GSLDK&YLzF!425~fF7ICP1(JL6VDe00-U`7N6gAG^6^nobF6vcBLQ
zvN!D1^UW<u(T0(Xk;jCZuE*8(8eUy2Ay^{(>BLS+lXIsR&MoKqvLW|Un`^Yq;eG2=
z{%@~SZd)$$M^xQK^|^O6--k&myFJ#w+BThk%H*)xJ=^M(e_I91_|IJOwR`)IcP}+E
z5|{c!N65>(TJrUU;D_lq4W~XjrMjdo;9BIgw+HSCGt^7|{`U5%gL}`nh-p0<chb99
zb=SDudJ>^?_+0F6fp-iU=}N0?CcWho<-JqqbJg4VF?)%M@sZ%e@$z4tmM&<Wdc}89
zX0x(;f77$mJMEw9Xjwl$vUHc@=ZlwaIfVzQ7Yk0`WFyO}yJn5R5}wH+T9Y@2aHWS{
zv$UVwwx;2c(nOC*r)Ir)zJl*>tA``^=}kI5hYy+b3LB@+DUDx!r)pA5rc}f<2M^co
z&<QIx^(?z#x?;np(7ETscn*2atE#m#ncfw2K$eL+Dk9Hy*^$>uAwf(xc_Q|n3ZKWx
zRBda_?bfy_prb+UabWxY5ARyN9Jx=w57YRr>7wSMAj@pao8}ke92R~qXlC2qI;&$(
zPEO`zx2f2=Xakqf{$)m+j=2AAiQ2<8;ay|St+MV%Pgd<_p16KyR>Qoe5dNi(_QF}K
zII2VUtUF}8x3&67gZorT!2)ADnUrH|A}?I#Te(74@3KYjj!&CYpXh{cl1_RXQ+;;1
zg!s2$(Jv1)uROOD*|ouBW>>?pA6oMB_f|=nDxXrUIwR(3mU`T0+SKmdF6*aHeVw<e
zd0xwr!VWp*qMNBTtLm7hPp#Jcx=Zg>(Zl773;Jb#ue<Pml7#T5kDarE-bEE?i^g&G
zZFgG39Ap@6(U#Y(^ZyjL*45<bo$JEfrvCbI=y~3feY;v_O_<%VY?0W8qgpTS@s{aU
z&FpG8#d6vG+6srim&7X{F8%*B;d47jy=8^>fv1eFwik1cov|{UD)K(%(Np0apPxz3
zu-rApW9fXZ{nEa=Z&q(x(8~IA)j6%Jin|q-9$y-;t$=ZM`TNw%%X~}9-W(6Q^X;*e
z=cc<07A~G)6*^V4=GQUnnWaB7mi1|+#<cdIin;%t$t=tvFk)vXpH9sxpVAY(w^ail
z78`FnbUJi$cIL~T&C4_G?_QJ(TFDZ(etuV%fUvTl+w>~Q=!eNUC;WY9wokY%Hq|H3
zA&D{g!=a|QN0(AUA`h=W8S*QpXZ44Rrn{!B^AOcMd}zxXPLn^YoMxW)of*FH&U2+F
z7e8bg&C+_V6Pnn~)pe-og!tX5p*QCV`CndPz+<#bVeM=Qrt|}=mfeUf-R|u;+3~dR
z{-6Wqjej+`UaO>7h8x}4#F`wLP#L@P)sb%{NBTHYtEX*VwBT2;$Tfv0uFW~$n-0v(
zeABX{i^nf8!g;~|j;=*kvPX?WQX(`!rFuX~Yki}PWl;;Sw3$Kmx16l3EVkIZqg|qV
z{{Q=3o3*0KMsAu;<e{b0<E+-??s?HN)o*#>&Y0&ZWxqck-TVFC>3y}oSFBr?_V?FU
z=4tCpc$(eiDp@YSx&Qv&^Uoj8=GPTR+^z`ZeEs3!;X4I~d3|OWG+tdDKL7nQ9_gDo
z+s@x_3<#b2d|q|mvSn(|H&4|LzjWh<#ey4ld3sifshcCey}w_-Z?@4Io!DJd!s9Ai
zrStcEoYGph+<*SLFE208{hq(?r&@1s@AE~S>FR5I)QbJm-t-)QTvA%f7!VuVtKwNw
zQW6jvYWm=l{8qW$nX|TR->&Q_^swMZ&)rNLx$<ZEa#|}xy3U>RV{q$}v0PFd6&-#0
z`o3>ljc=X5nIyqBckbL5yZjUc7OYv*lVQ@eEVE+oJ;&?PUo`6ezK;Ja|No<W-_E_C
z&sncrwdzs${$HZ)e6marPHC@y(#S6NqB!);&CTumfA8L}Jki5qey@aK(^T#7Bnh@|
zas6d$*Pi{UC&BaS%VmG-gDGaSQ(s<Mx+ZFC*R*M3>6Y`)A3rfs`R4w5dw<)nBAe6C
zvnAZ!Ra*1D`o8n^Ql7)>rcUr%KGQTiZ2n_Sef{S<pU+d4;rniVZQVM(c@>X1<Emb&
zF1o0}keZsR6SHH&`ns>HYyBSF|8w^Kg$oxL8kS|+d^({F8utc;XNr-hrzfZRy$WSe
zPmSI7qZ3Ei`na>3&)eNT-n1h|Z{4m}TKoS0eIFbf`?luZy?f{GmfwHNVQ@Y)Joc!6
z{jbZ6-#-T4+?Ff-|L6SwC#UH~pGYw}_x$nk{^u7KI@dfBp8r66|Bs^!Uh`|MnKw^v
z*REYM^?Pc57CpCKf4zI%>zsRgW~zEm`|x6M|E7e4ObjKl%U7@N4$*q5^S)@Ofv@mk
zgBLGe6x7*lXkGGp*XMuI_kXaxzP6Tm!IC8{pFR~`dTElIo7>lYnp@g0V(Xq22mU>*
zdar!mVWVqtv2pyrPt!AVb6FQ`+9b50{>)PE>6>bQm(~6IJl}fSlPBN)|Gocz@8|vJ
zpD$j%yj?u5f-yAo=k)rY)9Zd4^EB80IbNUg_SV*h+Ua4z!Oe*hTefWZa7;S?!(#cr
z9mnOW*POR}_2$i+a-ZmJIWM>W?Qbr>SJ}S*-|PLy40u99m+smn<$F0JJ3IOPy}b;z
zdcPPmGcp8ZWo45TjAr&!d^)M_sv;yf(Zl6V&F8a8$NOXtZPK}T_pa`~e_z*kd#ETa
z^O>pi?cLqiYtm;ei`ZYcx5Gs#>Ea^SzuSIqum8PWNvN|UJiaz`#<G_+_by#hQkvK?
z!!Wt+#-`Nn4LYZPu3Nd%(`UAsuZxo4$2XhLf4ezv-RniWci+C5UR%5O#k+U!K1*>N
zHaPO<=Vy>3K0iAv%*)GruKe}&_2Q+ar69wXdQblrsV~;uS^4>y<@2Yn<NxonD0?$Q
zz^NmBer=eGlAvMnvonm}e@>qN#|6YIeHHTYPVxE9605g0<;P{qJ3LgB<SL&`WW0V>
z{<QqR5ADJ4k`3PN|6eEA>2id>?n861-uK@88fTCT_r9<7|MKOF#r$+5$&VMD`8z#S
zl%AcNYyDuCtgP&!g$o_$7&)%Wl#=UbXK)St@&5n6{B=7XajAGJiEw?KJpWJ2uU}Oq
zvCr%OJ+GhhH6=3b+#JguS?g(UZ*Tt|Sgtko(c10z7Oh#+^LpLxb>|;X+B3Cy_uU>D
z%T4|XR&{?Wj{luIXLsSwm=-6+KXv;*M{0<?`YhEkJ+7+Lw))$H@4q9r<w%y6mYypw
zetz!hzTfYdAN+hi|NZ7T1%ZM`>pdr{eLO7x??UJAb^HHSF)X^cLRkJ;r`uwk=xsa=
z?Q&HeN4v#uH>VcsL~rYf|MN)P=Ff-2%-hWCb?oNnA9!!CH|eAbLuhDd?YE_wQr^>a
z949Raes=x=Pt*60kB=9hRGt3ir_UsncYD9b$yGdHG|Rbh;r;GbZt+hylKVGSea!-e
zCWBed4F|2MkFG`MzZGB6CvQLR<>lp@b8nmF?RdzhxBpL(n%|rgPp8M9dvbDe<(G@@
zKcCnCOFwn`bo;G0!IR2&e*OAY#-?IIGduscpZk_9VQJvE|1;s;yF4#%@87wPuSMq{
zeOhGs{<FA}gVMwWYu22p{C`Y3e}dOi4SoIehlg7KZVL?yJ9afZ{_n)TmyaK;+x;%-
z@v&Z;e?K1o`80ih%Bw3Y&*d8?ALB90z4c_y?1K`UHn4HKsQb;~IPmfDanGeewbOQg
zK4)F|=i~9nty!Y+zi(Zikb1LI_u2XR_w%DCseHT=?Emq}WdBdU@Bg27cX#>b>hF2a
ze}DQ^1geJY{~Wfj(J5c@b4S+YWxhGLw!EBj`dHFN#@5ilK*J}0Ykxd!Z>T+I;%nVJ
zZC*4edZy{co(lH2J!<`a&*5Xe(l>V$DkoGg&R(~(tzEuu2B)wZ&#qp@|5nYak3TtX
z);V3fE~cTt;>_3U@#k-C&Az#(()jW+-{iw>ye)?l*X{XqigEk(J9o}JI@<m7+V*`%
z=h;?Q{QvtsGbbnI^|iHc?(Llo8ue0k@B6U+|F8AVxw|f(n|yRex%2ecvYEw=JT9)T
zuJd<q*{qs>w(9<xn2wV2a$ydZif1#^JKPqVn3-*={+_oj|NgmaYomXDp8x-h`2HW$
zzw72OFtB|Etp#uemCZ8@liBVZo6uVN;eq3&mnHJS{2S{2RvjoyW{#Y=``xbYt=ZQ%
zWnI-u+9<Iyq|5sKp5g;C7w6elM{G!7oH}*tpO5|ZDUXhH`pmJoxbtyNM#h78yWc-M
zDjxsjAiKQGg(e;sXJ_VaG2Ntpe}2}!+Xl*T+j4KWty%?&U1l}u`_=Dl)$_jx&a|oA
zba~#jYu6YbPszQ#?cvv|nI@T+-0$7L`RPG3|E9#lY@lkR_<C%)VfD8h^?klegZ^B$
z|0|q*ecjWa&*wjnzW?i5!)=M8_iZjp2fIYIf7O(Ic;NW&_5Od{4d?6seP&)){&wqi
zqpB|%akXDV>psuE&ron$cRSCH933&{H_?AV1^nNA^Gz~^8XXuEUFMg#&ATVIu>2jU
zsW)TUO?6+!4H*}eL|hlHT-jOw|9k!W?R#r~fBP6-|214EYD>rC<Nf?Mo*5@zTN4>m
z^>XQpUFQz-+rMEsSt%tY6|pIWv;1!9b<4^(MiRERwhT5upG^LHZu9lmsjsfAWcYjQ
z{%m8ekh%}e@)JB%X4utQ_4f9H+9xhb2cMjr9J#0BW5eqco6a0+<^K7xzy8eY_51xo
zLq(0#&RD$NCs+N(uusnR)Evv=f}Js-iseza{-t~O{Bm=37rXVo+74pK^|zN;oh{$@
zSvGI~-!hQ$V1HZI1E$wwF6aMbm#b(1m8+n#Wo`8KO$85~^maZG;*+sBaK7%F@uin0
zmzVpO=Sh6ZTey3-bi?+0Rov50A6;($H}l(@n}>J5-}gG_|I_L5X@^=k&&;(JkDL}L
z(&{c(*|NUosWzxWzMZ$b_xHQq{oZ;zTb3<TtNXTje&Q^(>#^mr^4v51mPhWd+xxls
zd9mlNb(@QxdL1ZxJ^#-W^|>}O2l`~KGqSQym9Dm%Sbb}6_4dv47HM3$ew|-JU)HW>
zhHJN2;rqScjk2z2Jo;#nzyGh<dAr{_3DUg>CrC+2xs;ZEUEuiQ)6>(S!s_Gize_H^
ztP8)iCG+x+m;Uv)o;Ua{Tf4T`viMoTEHzNf|2$W|&-g)+b@{s#@EnEF3(@Gv$di|r
zdV^MgXx^?iy1d*!{pY8r%=cP!;`hyw|NFwdc1_GYPGPkTIX8{mdL$m^sLS<#kIp+e
zN!8~2onq!~nMb=s&&;vBT)98-Fq>S>2S@igdw2f%bUJchP33%b7VYqLPj=t``>x^k
zyR)+8cMk6VdwV})nNMVRxVg&d@ZjLbEhY@*_iN4b_kO*mS>Fx{&6?l0@4x$AnVzoR
z&Mz;w>*y?_R4#tI9}4>Wekkec>VnFX-S4dOcD-D7`P=_>t5zl5*-;2;GS1H5S9#&7
z(nJpPyCuS)rrBb*-lVg$Ot);?mL$RUuwcf^%gc*juigIbWcb`Ow%_j@-jaE_;^R?q
z%bFhs4+~~k7C$q{zqiNncIA27?**UFnt$GX|8L)1>+(rHYM_Sdt?czri|7B2nSbNs
z!*=;HxtC8qR@i{@#H?9T&(G=mM@G&Rj($4Rx?Jz|wY878@Bga{imi<iJ*Q52t=s$U
z7UN}u;%8?b-m8B9_VbK4DKUED+wYdm{u~z?y6^AX{LSg-=Q&<y{O$MW$?<q@F&zaN
zzRwq&`8TGWl`4C8$J0dW==0Bpd3P-GcD+<PF3WdnrO|sQj*nk1`-9p(({v(>+#>JY
z+p=w27pJgV$IZ>@-%l<R-}yr#=-J;KL*L6M1si3vITUBu*Vh%=J!MLaXes^w_j~c(
z((4}|%m1Gdp>wQDRQrNS?gfVZCypLpV0i7?+UUrgMXK-j|F=uH-&yl{_WhJ+XJ+a|
zZhA6B`s{JZ`hTD2|GYf^-;sxh+kd_a-+$`G#l`l=mWl72AbBNeZRF;*`~SXe@7vHP
zZ+~u<X?Db}l9!q0zn1FPa(Yh6$jDf*apOc*Zm~I=`!Zeic-VZ+k0cr1tNZ=dZu^5x
zI>woomN;JivA6R1-13Z^98jfLo42()=FXGLyvxrfi)t3BJ!uV<*$@Tog(-MXnj$D)
zSmHfVsj@}DW73p~joX5GYjlF1_?-A^YOTes82A3=OHHlSCyqPmh#5EZ8hwzCy&k)~
z>Oka@69FB^AHUeOE^6A>rkL>X@711@%C>FWCiSz<+O=zr=Css{w{B%I+<x`?b^J6{
zPoYgZ^{&@T7w_B|DRMOQiO)uz?q|=^ysv*<qvHA9V^T@dtgo@V%g(aLT`=)oyKY_B
z`7LaY6A!7#^<TbqE9=Fsd93>a;^Y5w%`sv8wEO+O-<+3C=Cdu!oR!AX>==4^OOoPs
zrP>WT+_Bd?k0!OPd#$y>chZvTSFc{pk(=<>%AHejNn2Z6w5n&6hUcZ_jT4<XzMniT
ysu`o=d5XbZ_jHx)58v9_zo#c%DQ9}~pWPs9hP`FjBUJ_l1_n=8KbLh*2~7Y-#Qp~W

diff --git a/docs/source/_static/img/logo_large.svg b/docs/source/_static/img/pystencils-logo-dark.svg
similarity index 52%
rename from docs/source/_static/img/logo_large.svg
rename to docs/source/_static/img/pystencils-logo-dark.svg
index 8b4155d3a..28c8bcaab 100644
--- a/docs/source/_static/img/logo_large.svg
+++ b/docs/source/_static/img/pystencils-logo-dark.svg
@@ -2,443 +2,460 @@
 <!-- Created with Inkscape (http://www.inkscape.org/) -->
 
 <svg
-   xmlns:dc="http://purl.org/dc/elements/1.1/"
-   xmlns:cc="http://creativecommons.org/ns#"
-   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-   xmlns:svg="http://www.w3.org/2000/svg"
-   xmlns="http://www.w3.org/2000/svg"
-   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
-   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
-   width="379.82614mm"
-   height="189.91307mm"
-   viewBox="0 0 1345.8407 672.92033"
-   id="svg2"
+   width="49.310894mm"
+   height="48.976913mm"
+   viewBox="0 0 49.310894 48.976913"
    version="1.1"
-   inkscape:version="0.92.3 (2405546, 2018-03-11)"
-   sodipodi:docname="logo_large.svg"
-   inkscape:export-filename="/home/martin/code/pycodegen/pystencils/doc/img/github_repo_card.png"
-   inkscape:export-xdpi="85.599998"
-   inkscape:export-ydpi="85.599998">
+   id="svg1"
+   inkscape:version="1.4 (e7c3feb100, 2024-10-09)"
+   sodipodi:docname="pystencils-logo-dark.svg"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:svg="http://www.w3.org/2000/svg">
+  <sodipodi:namedview
+     id="namedview1"
+     pagecolor="#ffffff"
+     bordercolor="#111111"
+     borderopacity="1"
+     inkscape:showpageshadow="0"
+     inkscape:pageopacity="0"
+     inkscape:pagecheckerboard="1"
+     inkscape:deskcolor="#d1d1d1"
+     inkscape:document-units="mm"
+     showguides="true"
+     inkscape:lockguides="false"
+     showgrid="false"
+     inkscape:zoom="4"
+     inkscape:cx="73.125"
+     inkscape:cy="102.5"
+     inkscape:window-width="1920"
+     inkscape:window-height="1039"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="1"
+     inkscape:current-layer="layer1">
+    <inkscape:grid
+       id="grid4"
+       units="mm"
+       originx="4.6554451"
+       originy="-0.83749382"
+       spacingx="0.99999998"
+       spacingy="1"
+       empcolor="#0099e5"
+       empopacity="0.30196078"
+       color="#0099e5"
+       opacity="0.14901961"
+       empspacing="5"
+       enabled="true"
+       visible="false" />
+    <sodipodi:guide
+       position="38.829266,-5.2751542"
+       orientation="1,0"
+       id="guide2"
+       inkscape:locked="false" />
+    <sodipodi:guide
+       position="10.481264,-5.2751542"
+       orientation="1,0"
+       id="guide3"
+       inkscape:locked="false" />
+    <sodipodi:guide
+       position="24.655266,8.8988453"
+       orientation="0,-1"
+       id="guide4"
+       inkscape:locked="false" />
+    <sodipodi:guide
+       position="24.655266,-19.449154"
+       orientation="0,-1"
+       id="guide5"
+       inkscape:locked="false" />
+  </sodipodi:namedview>
   <defs
-     id="defs4">
-    <inkscape:path-effect
-       effect="spiro"
-       id="path-effect4188"
-       is_visible="true" />
+     id="defs1">
     <inkscape:path-effect
        effect="spiro"
-       id="path-effect4188-5"
-       is_visible="true" />
+       id="path-effect5"
+       is_visible="true"
+       lpeversion="0" />
     <filter
-       y="-0.25"
-       height="1.5"
+       y="-0.17469697"
+       height="1.3493938"
        inkscape:menu-tooltip="Darkens the edge with an inner blur and adds a flexible glow"
        inkscape:menu="Shadows and Glows"
        inkscape:label="Dark And Glow"
        style="color-interpolation-filters:sRGB"
-       id="filter4596">
+       id="filter4608-0"
+       x="-0.17469697"
+       width="1.3493938">
       <feGaussianBlur
          stdDeviation="5"
          result="result6"
-         id="feGaussianBlur4598" />
+         id="feGaussianBlur4610-2" />
       <feComposite
          result="result8"
          in="SourceGraphic"
          operator="atop"
          in2="result6"
-         id="feComposite4600" />
+         id="feComposite4612-5" />
       <feComposite
          result="result9"
          operator="over"
          in2="SourceAlpha"
          in="result8"
-         id="feComposite4602" />
+         id="feComposite4614-7" />
       <feColorMatrix
          values="1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 "
          result="result10"
-         id="feColorMatrix4604" />
+         id="feColorMatrix4616-6" />
       <feBlend
          in="result10"
          mode="normal"
          in2="result6"
-         id="feBlend4606" />
+         id="feBlend4618-9" />
     </filter>
     <filter
-       y="-0.25"
-       height="1.5"
+       y="-0.17469697"
+       height="1.3493938"
        inkscape:menu-tooltip="Darkens the edge with an inner blur and adds a flexible glow"
        inkscape:menu="Shadows and Glows"
        inkscape:label="Dark And Glow"
        style="color-interpolation-filters:sRGB"
-       id="filter4608">
+       id="filter4632-1"
+       x="-0.17469697"
+       width="1.3493938">
       <feGaussianBlur
          stdDeviation="5"
          result="result6"
-         id="feGaussianBlur4610" />
+         id="feGaussianBlur4634-9" />
       <feComposite
          result="result8"
          in="SourceGraphic"
          operator="atop"
          in2="result6"
-         id="feComposite4612" />
+         id="feComposite4636-8" />
       <feComposite
          result="result9"
          operator="over"
          in2="SourceAlpha"
          in="result8"
-         id="feComposite4614" />
+         id="feComposite4638-7" />
       <feColorMatrix
          values="1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 "
          result="result10"
-         id="feColorMatrix4616" />
+         id="feColorMatrix4640-6" />
       <feBlend
          in="result10"
          mode="normal"
          in2="result6"
-         id="feBlend4618" />
+         id="feBlend4642-5" />
     </filter>
     <filter
-       y="-0.25"
-       height="1.5"
+       y="-0.17469697"
+       height="1.3493938"
        inkscape:menu-tooltip="Darkens the edge with an inner blur and adds a flexible glow"
        inkscape:menu="Shadows and Glows"
        inkscape:label="Dark And Glow"
        style="color-interpolation-filters:sRGB"
-       id="filter4620">
+       id="filter4620-1"
+       x="-0.17469697"
+       width="1.3493938">
       <feGaussianBlur
          stdDeviation="5"
          result="result6"
-         id="feGaussianBlur4622" />
+         id="feGaussianBlur4622-1" />
       <feComposite
          result="result8"
          in="SourceGraphic"
          operator="atop"
          in2="result6"
-         id="feComposite4624" />
+         id="feComposite4624-4" />
       <feComposite
          result="result9"
          operator="over"
          in2="SourceAlpha"
          in="result8"
-         id="feComposite4626" />
+         id="feComposite4626-8" />
       <feColorMatrix
          values="1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 "
          result="result10"
-         id="feColorMatrix4628" />
+         id="feColorMatrix4628-5" />
       <feBlend
          in="result10"
          mode="normal"
          in2="result6"
-         id="feBlend4630" />
+         id="feBlend4630-7" />
     </filter>
     <filter
-       y="-0.25"
-       height="1.5"
+       y="-0.17469697"
+       height="1.3493938"
        inkscape:menu-tooltip="Darkens the edge with an inner blur and adds a flexible glow"
        inkscape:menu="Shadows and Glows"
        inkscape:label="Dark And Glow"
        style="color-interpolation-filters:sRGB"
-       id="filter4632">
+       id="filter4596-6"
+       x="-0.17469697"
+       width="1.3493938">
       <feGaussianBlur
          stdDeviation="5"
          result="result6"
-         id="feGaussianBlur4634" />
+         id="feGaussianBlur4598-6" />
       <feComposite
          result="result8"
          in="SourceGraphic"
          operator="atop"
          in2="result6"
-         id="feComposite4636" />
+         id="feComposite4600-9" />
       <feComposite
          result="result9"
          operator="over"
          in2="SourceAlpha"
          in="result8"
-         id="feComposite4638" />
+         id="feComposite4602-1" />
       <feColorMatrix
          values="1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 "
          result="result10"
-         id="feColorMatrix4640" />
+         id="feColorMatrix4604-4" />
       <feBlend
          in="result10"
          mode="normal"
          in2="result6"
-         id="feBlend4642" />
+         id="feBlend4606-3" />
     </filter>
+    <inkscape:path-effect
+       effect="spiro"
+       id="path-effect4188-5-6"
+       is_visible="true"
+       lpeversion="0" />
     <inkscape:path-effect
        effect="spiro"
        id="path-effect4188-7"
-       is_visible="true" />
+       is_visible="true"
+       lpeversion="0" />
     <inkscape:path-effect
        effect="spiro"
-       id="path-effect4188-5-6"
-       is_visible="true" />
+       id="path-effect4188-5-6-3"
+       is_visible="true"
+       lpeversion="0" />
+    <inkscape:path-effect
+       effect="spiro"
+       id="path-effect4188-7-2"
+       is_visible="true"
+       lpeversion="0" />
     <filter
-       y="-0.25"
-       height="1.5"
+       y="-0.17469696"
+       height="1.3493939"
        inkscape:menu-tooltip="Darkens the edge with an inner blur and adds a flexible glow"
        inkscape:menu="Shadows and Glows"
        inkscape:label="Dark And Glow"
        style="color-interpolation-filters:sRGB"
-       id="filter4596-6">
+       id="filter4608-0-5"
+       x="-0.17469696"
+       width="1.3493939">
       <feGaussianBlur
          stdDeviation="5"
          result="result6"
-         id="feGaussianBlur4598-6" />
+         id="feGaussianBlur4610-2-5" />
       <feComposite
          result="result8"
          in="SourceGraphic"
          operator="atop"
          in2="result6"
-         id="feComposite4600-9" />
+         id="feComposite4612-5-4" />
       <feComposite
          result="result9"
          operator="over"
          in2="SourceAlpha"
          in="result8"
-         id="feComposite4602-1" />
+         id="feComposite4614-7-7" />
       <feColorMatrix
          values="1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 "
          result="result10"
-         id="feColorMatrix4604-4" />
+         id="feColorMatrix4616-6-6" />
       <feBlend
          in="result10"
          mode="normal"
          in2="result6"
-         id="feBlend4606-3" />
+         id="feBlend4618-9-5" />
     </filter>
     <filter
-       y="-0.25"
-       height="1.5"
+       y="-0.17469696"
+       height="1.3493939"
        inkscape:menu-tooltip="Darkens the edge with an inner blur and adds a flexible glow"
        inkscape:menu="Shadows and Glows"
        inkscape:label="Dark And Glow"
        style="color-interpolation-filters:sRGB"
-       id="filter4620-1">
+       id="filter4620-1-7"
+       x="-0.17469696"
+       width="1.3493939">
       <feGaussianBlur
          stdDeviation="5"
          result="result6"
-         id="feGaussianBlur4622-1" />
+         id="feGaussianBlur4622-1-4" />
       <feComposite
          result="result8"
          in="SourceGraphic"
          operator="atop"
          in2="result6"
-         id="feComposite4624-4" />
+         id="feComposite4624-4-5" />
       <feComposite
          result="result9"
          operator="over"
          in2="SourceAlpha"
          in="result8"
-         id="feComposite4626-8" />
+         id="feComposite4626-8-2" />
       <feColorMatrix
          values="1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 "
          result="result10"
-         id="feColorMatrix4628-5" />
+         id="feColorMatrix4628-5-5" />
       <feBlend
          in="result10"
          mode="normal"
          in2="result6"
-         id="feBlend4630-7" />
+         id="feBlend4630-7-4" />
     </filter>
     <filter
-       y="-0.25"
-       height="1.5"
+       y="-0.17469696"
+       height="1.3493939"
        inkscape:menu-tooltip="Darkens the edge with an inner blur and adds a flexible glow"
        inkscape:menu="Shadows and Glows"
        inkscape:label="Dark And Glow"
        style="color-interpolation-filters:sRGB"
-       id="filter4632-1">
+       id="filter4596-6-3"
+       x="-0.17469696"
+       width="1.3493939">
       <feGaussianBlur
          stdDeviation="5"
          result="result6"
-         id="feGaussianBlur4634-9" />
+         id="feGaussianBlur4598-6-0" />
       <feComposite
          result="result8"
          in="SourceGraphic"
          operator="atop"
          in2="result6"
-         id="feComposite4636-8" />
+         id="feComposite4600-9-7" />
       <feComposite
          result="result9"
          operator="over"
          in2="SourceAlpha"
          in="result8"
-         id="feComposite4638-7" />
+         id="feComposite4602-1-8" />
       <feColorMatrix
          values="1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 "
          result="result10"
-         id="feColorMatrix4640-6" />
+         id="feColorMatrix4604-4-6" />
       <feBlend
          in="result10"
          mode="normal"
          in2="result6"
-         id="feBlend4642-5" />
+         id="feBlend4606-3-8" />
     </filter>
     <filter
-       y="-0.25"
-       height="1.5"
+       y="-0.17469696"
+       height="1.3493939"
        inkscape:menu-tooltip="Darkens the edge with an inner blur and adds a flexible glow"
        inkscape:menu="Shadows and Glows"
        inkscape:label="Dark And Glow"
        style="color-interpolation-filters:sRGB"
-       id="filter4608-0">
+       id="filter4632-1-4"
+       x="-0.17469696"
+       width="1.3493939">
       <feGaussianBlur
          stdDeviation="5"
          result="result6"
-         id="feGaussianBlur4610-2" />
+         id="feGaussianBlur4634-9-9" />
       <feComposite
          result="result8"
          in="SourceGraphic"
          operator="atop"
          in2="result6"
-         id="feComposite4612-5" />
+         id="feComposite4636-8-2" />
       <feComposite
          result="result9"
          operator="over"
          in2="SourceAlpha"
          in="result8"
-         id="feComposite4614-7" />
+         id="feComposite4638-7-0" />
       <feColorMatrix
          values="1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 "
          result="result10"
-         id="feColorMatrix4616-6" />
+         id="feColorMatrix4640-6-6" />
       <feBlend
          in="result10"
          mode="normal"
          in2="result6"
-         id="feBlend4618-9" />
+         id="feBlend4642-5-8" />
     </filter>
   </defs>
-  <sodipodi:namedview
-     id="base"
-     pagecolor="#ffffff"
-     bordercolor="#666666"
-     borderopacity="1.0"
-     inkscape:pageopacity="0.0"
-     inkscape:pageshadow="2"
-     inkscape:zoom="0.70000001"
-     inkscape:cx="545.01294"
-     inkscape:cy="35.725386"
-     inkscape:document-units="px"
-     inkscape:current-layer="layer1"
-     showgrid="false"
-     inkscape:window-width="3840"
-     inkscape:window-height="2061"
-     inkscape:window-x="0"
-     inkscape:window-y="0"
-     inkscape:window-maximized="1"
-     fit-margin-top="0"
-     fit-margin-left="0"
-     fit-margin-right="0"
-     fit-margin-bottom="0">
-    <inkscape:grid
-       type="xygrid"
-       id="grid4176"
-       originx="267.20477"
-       originy="315.17846" />
-  </sodipodi:namedview>
-  <metadata
-     id="metadata7">
-    <rdf:RDF>
-      <cc:Work
-         rdf:about="">
-        <dc:format>image/svg+xml</dc:format>
-        <dc:type
-           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-        <dc:title></dc:title>
-      </cc:Work>
-    </rdf:RDF>
-  </metadata>
   <g
      inkscape:label="Layer 1"
      inkscape:groupmode="layer"
      id="layer1"
-     transform="translate(267.20477,-694.6203)">
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:34.78659058px;line-height:125%;font-family:'Latin Modern Mono Light';-inkscape-font-specification:'Latin Modern Mono Light, ';letter-spacing:0px;word-spacing:0px;fill:#252525;fill-opacity:1;stroke:none;stroke-width:4.34832382px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
-       x="159.99139"
-       y="964.43109"
-       id="text1392-1"
-       inkscape:export-xdpi="70.669998"
-       inkscape:export-ydpi="70.669998"><tspan
-         sodipodi:role="line"
-         id="tspan1390-1"
-         x="159.99139"
-         y="964.43109"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:156.53968811px;font-family:'Latin Modern Mono Light';-inkscape-font-specification:'Latin Modern Mono Light, Bold';fill:#252525;fill-opacity:1;stroke-width:4.34832382px">pystencils</tspan></text>
+     transform="translate(4.6554452,-0.83749467)">
     <g
-       id="g9986"
-       transform="matrix(4.1201463,0,0,4.1201463,-399.75066,866.02979)"
-       inkscape:export-xdpi="70.669998"
-       inkscape:export-ydpi="70.669998">
-      <path
-         inkscape:connector-curvature="0"
-         inkscape:original-d="M 60.891002,27.333516 H 118.64865"
-         inkscape:path-effect="#path-effect4188-7"
-         id="path4186-6"
-         d="M 60.891002,27.333516 H 118.64865"
-         style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.78799796;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.70388345" />
-      <path
-         sodipodi:nodetypes="cc"
-         inkscape:connector-curvature="0"
-         inkscape:original-d="M 89.922623,-0.47572315 C 31.237244,132.88729 89.846228,36.88339 89.846228,56.13594"
-         inkscape:path-effect="#path-effect4188-5-6"
-         id="path4186-3-9"
-         d="M 89.922623,-0.47572315 89.846228,56.13594"
-         style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.78799796;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.70388345" />
+       id="g24"
+       transform="translate(1.812973e-4,-8.5216229e-5)">
+      <rect
+         style="display:inline;opacity:1;fill:#ffffff;fill-opacity:0.701961;stroke-width:0.409154"
+         id="rect1"
+         width="28.348"
+         height="1.396094"
+         x="5.825819"
+         y="19.277994"
+         inkscape:label="axis-H" />
+      <rect
+         style="opacity:1;fill:#ffffff;fill-opacity:0.701961;stroke-width:0.409154"
+         id="rect2"
+         width="28.348"
+         height="1.396094"
+         x="5.8020415"
+         y="-20.697866"
+         transform="rotate(90)"
+         inkscape:label="axis-V" />
       <circle
-         transform="matrix(0.21391721,0,0,0.21391721,27.733834,-23.442344)"
+         transform="matrix(0.10711925,0,0,0.10711925,-10.838025,-5.3822253)"
          r="34.345188"
-         cy="108.02044"
-         cx="291.42902"
-         id="path4136-76"
-         style="opacity:1;fill:#e69f00;fill-opacity:1;stroke:none;stroke-width:3;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;filter:url(#filter4596-6)" />
+         cy="236.72931"
+         cx="155.56349"
+         id="path4136-7-0"
+         style="fill:#009e73;fill-opacity:1;stroke:none;stroke-width:3;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;filter:url(#filter4608-0-5)"
+         inkscape:label="circle-L" />
       <circle
-         transform="matrix(0.21391721,0,0,0.21391721,27.733834,-23.442344)"
+         transform="matrix(0.10711925,0,0,0.10711925,-11.109448,-4.9954233)"
          r="34.345188"
          cy="365.43817"
          cx="290.41885"
          id="path4136-6-0"
-         style="opacity:1;fill:#0072b2;fill-opacity:1;stroke:none;stroke-width:3;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;filter:url(#filter4620-1)" />
+         style="fill:#0072b2;fill-opacity:1;stroke:none;stroke-width:3;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;filter:url(#filter4620-1-7)"
+         inkscape:label="circle-B" />
       <circle
-         transform="matrix(0.21391721,0,0,0.21391721,27.733834,-23.442344)"
+         transform="matrix(0.10711925,0,0,0.10711925,-11.20464,-5.7690267)"
          r="34.345188"
-         cy="236.72931"
-         cx="422.24377"
-         id="path4136-3-9"
-         style="opacity:1;fill:#999999;fill-opacity:1;stroke:none;stroke-width:3;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;filter:url(#filter4632-1)" />
+         cy="108.02044"
+         cx="291.42902"
+         id="path4136-76"
+         style="fill:#e69f00;fill-opacity:1;stroke:none;stroke-width:3;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;filter:url(#filter4596-6-3)"
+         inkscape:label="circle-T" />
       <circle
-         transform="matrix(0.21391721,0,0,0.21391721,27.733834,-23.442344)"
+         transform="matrix(0.10711925,0,0,0.10711925,-11.056616,-5.2185227)"
          r="34.345188"
          cy="236.72931"
-         cx="155.56349"
-         id="path4136-7-0"
-         style="opacity:1;fill:#009e73;fill-opacity:1;stroke:none;stroke-width:3;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;filter:url(#filter4608-0)" />
+         cx="422.24377"
+         id="path4136-3-9"
+         style="fill:#999999;fill-opacity:1;stroke:none;stroke-width:3;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;filter:url(#filter4632-1-4)"
+         inkscape:label="circle-R" />
     </g>
     <text
        xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:8.7668047px;line-height:125%;font-family:'Latin Modern Mono Light';-inkscape-font-specification:'Latin Modern Mono Light, ';letter-spacing:0px;word-spacing:0px;fill:#252525;fill-opacity:0.70629368;stroke:none;stroke-width:1.09585059px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
-       x="229.96391"
-       y="1071.713"
-       id="text1392-1-3"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:2.11667px;line-height:125%;font-family:'Latin Modern Mono Light';-inkscape-font-specification:'Latin Modern Mono Light, ';letter-spacing:0px;word-spacing:0px;fill:#cccccc;fill-opacity:1;stroke:none;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       x="-4.7506952"
+       y="47.69986"
+       id="text1392-1"
        inkscape:export-xdpi="70.669998"
        inkscape:export-ydpi="70.669998"><tspan
          sodipodi:role="line"
-         id="tspan1390-1-6"
-         x="229.96391"
-         y="1071.713"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:62.0406723px;line-height:105.99999428%;font-family:'Latin Modern Mono Light';-inkscape-font-specification:'Latin Modern Mono Light, Bold';fill:#252525;fill-opacity:0.70629368;stroke-width:1.09585059px">speed up stencil </tspan><tspan
-         sodipodi:role="line"
-         x="229.96391"
-         y="1137.4761"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:62.0406723px;line-height:105.99999428%;font-family:'Latin Modern Mono Light';-inkscape-font-specification:'Latin Modern Mono Light, Bold';fill:#252525;fill-opacity:0.70629368;stroke-width:1.09585059px"
-         id="tspan109">computations on</tspan><tspan
-         sodipodi:role="line"
-         x="229.96391"
-         y="1203.2393"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:62.0406723px;line-height:105.99999428%;font-family:'Latin Modern Mono Light';-inkscape-font-specification:'Latin Modern Mono Light, Bold';fill:#252525;fill-opacity:0.70629368;stroke-width:1.09585059px"
-         id="tspan107">numpy arrays</tspan></text>
+         id="tspan1390-1"
+         x="-4.7506952"
+         y="47.69986"
+         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:9.525px;font-family:'Latin Modern Mono Light';-inkscape-font-specification:'Latin Modern Mono Light, Bold';fill:#cccccc;fill-opacity:1;stroke-width:0.264583px">pystencils</tspan></text>
   </g>
 </svg>
diff --git a/docs/source/_static/img/pystencils-logo-light.svg b/docs/source/_static/img/pystencils-logo-light.svg
new file mode 100644
index 000000000..519a4816a
--- /dev/null
+++ b/docs/source/_static/img/pystencils-logo-light.svg
@@ -0,0 +1,465 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   width="49.310894mm"
+   height="48.976913mm"
+   viewBox="0 0 49.310894 48.976913"
+   version="1.1"
+   id="svg1"
+   inkscape:version="1.4 (e7c3feb100, 2024-10-09)"
+   sodipodi:docname="pystencils-logo-light.svg"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:svg="http://www.w3.org/2000/svg">
+  <sodipodi:namedview
+     id="namedview1"
+     pagecolor="#ffffff"
+     bordercolor="#111111"
+     borderopacity="1"
+     inkscape:showpageshadow="0"
+     inkscape:pageopacity="0"
+     inkscape:pagecheckerboard="1"
+     inkscape:deskcolor="#d1d1d1"
+     inkscape:document-units="mm"
+     showguides="true"
+     inkscape:lockguides="false"
+     showgrid="false"
+     inkscape:zoom="2.8284271"
+     inkscape:cx="33.410795"
+     inkscape:cy="140.00714"
+     inkscape:window-width="1920"
+     inkscape:window-height="1039"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="1"
+     inkscape:current-layer="layer1">
+    <inkscape:grid
+       id="grid4"
+       units="mm"
+       originx="4.6554451"
+       originy="-0.837491"
+       spacingx="0.99999998"
+       spacingy="1"
+       empcolor="#0099e5"
+       empopacity="0.30196078"
+       color="#0099e5"
+       opacity="0.14901961"
+       empspacing="5"
+       enabled="true"
+       visible="false" />
+    <sodipodi:guide
+       position="38.829267,-5.2751543"
+       orientation="1,0"
+       id="guide2"
+       inkscape:locked="false" />
+    <sodipodi:guide
+       position="10.481264,-5.2751543"
+       orientation="1,0"
+       id="guide3"
+       inkscape:locked="false" />
+    <sodipodi:guide
+       position="24.655266,8.8988446"
+       orientation="0,-1"
+       id="guide4"
+       inkscape:locked="false" />
+    <sodipodi:guide
+       position="24.655266,-19.449154"
+       orientation="0,-1"
+       id="guide5"
+       inkscape:locked="false" />
+  </sodipodi:namedview>
+  <defs
+     id="defs1">
+    <inkscape:path-effect
+       effect="spiro"
+       id="path-effect5"
+       is_visible="true"
+       lpeversion="0" />
+    <filter
+       y="-0.17469697"
+       height="1.3493938"
+       inkscape:menu-tooltip="Darkens the edge with an inner blur and adds a flexible glow"
+       inkscape:menu="Shadows and Glows"
+       inkscape:label="Dark And Glow"
+       style="color-interpolation-filters:sRGB"
+       id="filter4608-0"
+       x="-0.17469697"
+       width="1.3493938">
+      <feGaussianBlur
+         stdDeviation="5"
+         result="result6"
+         id="feGaussianBlur4610-2" />
+      <feComposite
+         result="result8"
+         in="SourceGraphic"
+         operator="atop"
+         in2="result6"
+         id="feComposite4612-5" />
+      <feComposite
+         result="result9"
+         operator="over"
+         in2="SourceAlpha"
+         in="result8"
+         id="feComposite4614-7" />
+      <feColorMatrix
+         values="1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 "
+         result="result10"
+         id="feColorMatrix4616-6" />
+      <feBlend
+         in="result10"
+         mode="normal"
+         in2="result6"
+         id="feBlend4618-9" />
+    </filter>
+    <filter
+       y="-0.17469697"
+       height="1.3493938"
+       inkscape:menu-tooltip="Darkens the edge with an inner blur and adds a flexible glow"
+       inkscape:menu="Shadows and Glows"
+       inkscape:label="Dark And Glow"
+       style="color-interpolation-filters:sRGB"
+       id="filter4632-1"
+       x="-0.17469697"
+       width="1.3493938">
+      <feGaussianBlur
+         stdDeviation="5"
+         result="result6"
+         id="feGaussianBlur4634-9" />
+      <feComposite
+         result="result8"
+         in="SourceGraphic"
+         operator="atop"
+         in2="result6"
+         id="feComposite4636-8" />
+      <feComposite
+         result="result9"
+         operator="over"
+         in2="SourceAlpha"
+         in="result8"
+         id="feComposite4638-7" />
+      <feColorMatrix
+         values="1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 "
+         result="result10"
+         id="feColorMatrix4640-6" />
+      <feBlend
+         in="result10"
+         mode="normal"
+         in2="result6"
+         id="feBlend4642-5" />
+    </filter>
+    <filter
+       y="-0.17469697"
+       height="1.3493938"
+       inkscape:menu-tooltip="Darkens the edge with an inner blur and adds a flexible glow"
+       inkscape:menu="Shadows and Glows"
+       inkscape:label="Dark And Glow"
+       style="color-interpolation-filters:sRGB"
+       id="filter4620-1"
+       x="-0.17469697"
+       width="1.3493938">
+      <feGaussianBlur
+         stdDeviation="5"
+         result="result6"
+         id="feGaussianBlur4622-1" />
+      <feComposite
+         result="result8"
+         in="SourceGraphic"
+         operator="atop"
+         in2="result6"
+         id="feComposite4624-4" />
+      <feComposite
+         result="result9"
+         operator="over"
+         in2="SourceAlpha"
+         in="result8"
+         id="feComposite4626-8" />
+      <feColorMatrix
+         values="1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 "
+         result="result10"
+         id="feColorMatrix4628-5" />
+      <feBlend
+         in="result10"
+         mode="normal"
+         in2="result6"
+         id="feBlend4630-7" />
+    </filter>
+    <filter
+       y="-0.17469697"
+       height="1.3493938"
+       inkscape:menu-tooltip="Darkens the edge with an inner blur and adds a flexible glow"
+       inkscape:menu="Shadows and Glows"
+       inkscape:label="Dark And Glow"
+       style="color-interpolation-filters:sRGB"
+       id="filter4596-6"
+       x="-0.17469697"
+       width="1.3493938">
+      <feGaussianBlur
+         stdDeviation="5"
+         result="result6"
+         id="feGaussianBlur4598-6" />
+      <feComposite
+         result="result8"
+         in="SourceGraphic"
+         operator="atop"
+         in2="result6"
+         id="feComposite4600-9" />
+      <feComposite
+         result="result9"
+         operator="over"
+         in2="SourceAlpha"
+         in="result8"
+         id="feComposite4602-1" />
+      <feColorMatrix
+         values="1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 "
+         result="result10"
+         id="feColorMatrix4604-4" />
+      <feBlend
+         in="result10"
+         mode="normal"
+         in2="result6"
+         id="feBlend4606-3" />
+    </filter>
+    <inkscape:path-effect
+       effect="spiro"
+       id="path-effect4188-5-6"
+       is_visible="true"
+       lpeversion="0" />
+    <inkscape:path-effect
+       effect="spiro"
+       id="path-effect4188-7"
+       is_visible="true"
+       lpeversion="0" />
+    <inkscape:path-effect
+       effect="spiro"
+       id="path-effect4188-5-6-3"
+       is_visible="true"
+       lpeversion="0" />
+    <inkscape:path-effect
+       effect="spiro"
+       id="path-effect4188-7-2"
+       is_visible="true"
+       lpeversion="0" />
+    <filter
+       y="-0.17469696"
+       height="1.3493939"
+       inkscape:menu-tooltip="Darkens the edge with an inner blur and adds a flexible glow"
+       inkscape:menu="Shadows and Glows"
+       inkscape:label="Dark And Glow"
+       style="color-interpolation-filters:sRGB"
+       id="filter4608-0-5"
+       x="-0.17469696"
+       width="1.3493939">
+      <feGaussianBlur
+         stdDeviation="5"
+         result="result6"
+         id="feGaussianBlur4610-2-5" />
+      <feComposite
+         result="result8"
+         in="SourceGraphic"
+         operator="atop"
+         in2="result6"
+         id="feComposite4612-5-4" />
+      <feComposite
+         result="result9"
+         operator="over"
+         in2="SourceAlpha"
+         in="result8"
+         id="feComposite4614-7-7" />
+      <feColorMatrix
+         values="1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 "
+         result="result10"
+         id="feColorMatrix4616-6-6" />
+      <feBlend
+         in="result10"
+         mode="normal"
+         in2="result6"
+         id="feBlend4618-9-5" />
+    </filter>
+    <filter
+       y="-0.17469696"
+       height="1.3493939"
+       inkscape:menu-tooltip="Darkens the edge with an inner blur and adds a flexible glow"
+       inkscape:menu="Shadows and Glows"
+       inkscape:label="Dark And Glow"
+       style="color-interpolation-filters:sRGB"
+       id="filter4620-1-7"
+       x="-0.17469696"
+       width="1.3493939">
+      <feGaussianBlur
+         stdDeviation="5"
+         result="result6"
+         id="feGaussianBlur4622-1-4" />
+      <feComposite
+         result="result8"
+         in="SourceGraphic"
+         operator="atop"
+         in2="result6"
+         id="feComposite4624-4-5" />
+      <feComposite
+         result="result9"
+         operator="over"
+         in2="SourceAlpha"
+         in="result8"
+         id="feComposite4626-8-2" />
+      <feColorMatrix
+         values="1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 "
+         result="result10"
+         id="feColorMatrix4628-5-5" />
+      <feBlend
+         in="result10"
+         mode="normal"
+         in2="result6"
+         id="feBlend4630-7-4" />
+    </filter>
+    <filter
+       y="-0.17469696"
+       height="1.3493939"
+       inkscape:menu-tooltip="Darkens the edge with an inner blur and adds a flexible glow"
+       inkscape:menu="Shadows and Glows"
+       inkscape:label="Dark And Glow"
+       style="color-interpolation-filters:sRGB"
+       id="filter4596-6-3"
+       x="-0.17469696"
+       width="1.3493939">
+      <feGaussianBlur
+         stdDeviation="5"
+         result="result6"
+         id="feGaussianBlur4598-6-0" />
+      <feComposite
+         result="result8"
+         in="SourceGraphic"
+         operator="atop"
+         in2="result6"
+         id="feComposite4600-9-7" />
+      <feComposite
+         result="result9"
+         operator="over"
+         in2="SourceAlpha"
+         in="result8"
+         id="feComposite4602-1-8" />
+      <feColorMatrix
+         values="1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 "
+         result="result10"
+         id="feColorMatrix4604-4-6" />
+      <feBlend
+         in="result10"
+         mode="normal"
+         in2="result6"
+         id="feBlend4606-3-8" />
+    </filter>
+    <filter
+       y="-0.17469696"
+       height="1.3493939"
+       inkscape:menu-tooltip="Darkens the edge with an inner blur and adds a flexible glow"
+       inkscape:menu="Shadows and Glows"
+       inkscape:label="Dark And Glow"
+       style="color-interpolation-filters:sRGB"
+       id="filter4632-1-4"
+       x="-0.17469696"
+       width="1.3493939">
+      <feGaussianBlur
+         stdDeviation="5"
+         result="result6"
+         id="feGaussianBlur4634-9-9" />
+      <feComposite
+         result="result8"
+         in="SourceGraphic"
+         operator="atop"
+         in2="result6"
+         id="feComposite4636-8-2" />
+      <feComposite
+         result="result9"
+         operator="over"
+         in2="SourceAlpha"
+         in="result8"
+         id="feComposite4638-7-0" />
+      <feColorMatrix
+         values="1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 "
+         result="result10"
+         id="feColorMatrix4640-6-6" />
+      <feBlend
+         in="result10"
+         mode="normal"
+         in2="result6"
+         id="feBlend4642-5-8" />
+    </filter>
+  </defs>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1"
+     transform="translate(4.6554451,-0.83749467)">
+    <g
+       id="g25"
+       transform="translate(1.812973e-4,-8.5216229e-5)">
+      <g
+         id="g24">
+        <rect
+           style="display:inline;opacity:1;fill:#000000;fill-opacity:0.701961;stroke-width:0.409154"
+           id="rect1"
+           width="28.348"
+           height="1.396094"
+           x="5.825819"
+           y="19.277994"
+           inkscape:label="axis-H" />
+        <rect
+           style="opacity:1;fill:#000000;fill-opacity:0.701961;stroke-width:0.409154"
+           id="rect2"
+           width="28.348"
+           height="1.396094"
+           x="5.8020415"
+           y="-20.697866"
+           transform="rotate(90)"
+           inkscape:label="axis-V" />
+        <circle
+           transform="matrix(0.10711925,0,0,0.10711925,-10.838025,-5.3822253)"
+           r="34.345188"
+           cy="236.72931"
+           cx="155.56349"
+           id="path4136-7-0"
+           style="fill:#009e73;fill-opacity:1;stroke:none;stroke-width:3;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;filter:url(#filter4608-0-5)"
+           inkscape:label="circle-L" />
+        <circle
+           transform="matrix(0.10711925,0,0,0.10711925,-11.109448,-4.9954233)"
+           r="34.345188"
+           cy="365.43817"
+           cx="290.41885"
+           id="path4136-6-0"
+           style="fill:#0072b2;fill-opacity:1;stroke:none;stroke-width:3;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;filter:url(#filter4620-1-7)"
+           inkscape:label="circle-B" />
+        <circle
+           transform="matrix(0.10711925,0,0,0.10711925,-11.20464,-5.7690267)"
+           r="34.345188"
+           cy="108.02044"
+           cx="291.42902"
+           id="path4136-76"
+           style="fill:#e69f00;fill-opacity:1;stroke:none;stroke-width:3;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;filter:url(#filter4596-6-3)"
+           inkscape:label="circle-T" />
+        <circle
+           transform="matrix(0.10711925,0,0,0.10711925,-11.056616,-5.2185227)"
+           r="34.345188"
+           cy="236.72931"
+           cx="422.24377"
+           id="path4136-3-9"
+           style="fill:#999999;fill-opacity:1;stroke:none;stroke-width:3;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;filter:url(#filter4632-1-4)"
+           inkscape:label="circle-R" />
+      </g>
+      <text
+         xml:space="preserve"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:2.11667px;line-height:125%;font-family:'Latin Modern Mono Light';-inkscape-font-specification:'Latin Modern Mono Light, ';letter-spacing:0px;word-spacing:0px;fill:#252525;fill-opacity:1;stroke:none;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         x="-4.7508764"
+         y="47.699944"
+         id="text1392-1"
+         inkscape:export-xdpi="70.669998"
+         inkscape:export-ydpi="70.669998"
+         inkscape:label="text1392-1"><tspan
+           sodipodi:role="line"
+           id="tspan1390-1"
+           x="-4.7508764"
+           y="47.699944"
+           style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:9.525px;font-family:'Latin Modern Mono Light';-inkscape-font-specification:'Latin Modern Mono Light, Bold';fill:#252525;fill-opacity:1;stroke-width:0.264583px">pystencils</tspan></text>
+    </g>
+  </g>
+</svg>
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 03d43b19b..e88859b99 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -4,7 +4,6 @@ import re
 from pystencils import __version__ as pystencils_version
 
 project = "pystencils"
-html_logo = "_static/img/logo.png"
 html_title = "pystencils Documentation"
 
 copyright = (
@@ -72,13 +71,19 @@ myst_enable_extensions = [
 html_theme = "sphinx_book_theme"
 html_static_path = ["_static"]
 html_css_files = [
-    'css/fixtables.css',
+    "css/fixtables.css",
 ]
+html_theme_options = {
+    "logo": {
+        "image_light": "_static/img/pystencils-logo-light.svg",
+        "image_dark": "_static/img/pystencils-logo-dark.svg",
+    }
+}
 
 # NbSphinx configuration
 
-nbsphinx_execute = 'never'
-nbsphinx_codecell_lexer = 'python3'
+nbsphinx_execute = "never"
+nbsphinx_codecell_lexer = "python3"
 
 #   BibTex
-bibtex_bibfiles = ['pystencils.bib']
+bibtex_bibfiles = ["pystencils.bib"]
diff --git a/pyproject.toml b/pyproject.toml
index f0ec014a5..d9a33c9d7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,7 +46,8 @@ use_cython = [
 ]
 doc = [
     'sphinx',
-    'sphinx-book-theme',
+    'pydata-sphinx-theme==0.15.4',
+    'sphinx-book-theme==1.1.3',  # workaround for https://github.com/executablebooks/sphinx-book-theme/issues/865
     'sphinxcontrib-bibtex',
     'sphinx_autodoc_typehints',
     'pandoc',
-- 
GitLab


From 6c8040a989b6cae012e140e577f0908e87ebf186 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 4 Dec 2024 14:30:48 +0100
Subject: [PATCH 19/31] Refactor codegen and jit modules: Initial file
 movements

---
 src/pystencils/__init__.py                    |  20 +-
 src/pystencils/backend/__init__.py            |  14 -
 .../backend/emission/base_printer.py          |  16 +-
 .../backend/kernelcreation/context.py         |   5 +-
 .../backend/kernelcreation/iteration_space.py |   4 +-
 src/pystencils/backend/kernelfunction.py      | 233 +---------
 src/pystencils/backend/memory.py              |   2 +-
 src/pystencils/backend/platforms/cuda.py      |   5 +-
 src/pystencils/backend/platforms/sycl.py      |   6 +-
 .../backend/transformations/add_pragmas.py    |   4 +-
 src/pystencils/boundaries/boundaryhandling.py |   2 +-
 src/pystencils/codegen/__init__.py            |  23 +
 src/pystencils/{ => codegen}/config.py        |  16 +-
 src/pystencils/codegen/driver.py              | 432 ++++++++++++++++++
 src/pystencils/codegen/gpu.py                 |  28 ++
 src/pystencils/codegen/kernel.py              | 102 +++++
 src/pystencils/codegen/parameters.py          | 142 ++++++
 .../{backend => codegen}/properties.py        |   0
 src/pystencils/{ => codegen}/target.py        |   0
 src/pystencils/datahandling/__init__.py       |   2 +-
 .../datahandling/datahandling_interface.py    |   2 +-
 .../datahandling/parallel_datahandling.py     |   2 +-
 .../datahandling/serial_datahandling.py       |   2 +-
 src/pystencils/enums.py                       |   2 +-
 src/pystencils/{backend => }/jit/__init__.py  |   0
 .../{backend => }/jit/cpu_extension_module.py |   2 +-
 src/pystencils/{backend => }/jit/gpu_cupy.py  |   4 +-
 src/pystencils/{backend => }/jit/jit.py       |   2 +-
 .../{backend => }/jit/legacy_cpu.py           |   0
 .../{backend => }/jit/msvc_detection.py       |   0
 src/pystencils/kernel_decorator.py            |   2 +-
 src/pystencils/kernelcreation.py              | 429 +----------------
 tests/_todo/test_vectorization.py             |   9 +-
 tests/_todo/test_vectorization_specific.py    |  17 +-
 tests/frontend/test_simplifications.py        |   3 +-
 tests/kernelcreation/test_sum_prod.py         |   3 +-
 tests/nbackend/kernelcreation/test_context.py |   2 +-
 tests/nbackend/kernelcreation/test_options.py |   2 +-
 38 files changed, 811 insertions(+), 728 deletions(-)
 create mode 100644 src/pystencils/codegen/__init__.py
 rename src/pystencils/{ => codegen}/config.py (98%)
 create mode 100644 src/pystencils/codegen/driver.py
 create mode 100644 src/pystencils/codegen/gpu.py
 create mode 100644 src/pystencils/codegen/kernel.py
 create mode 100644 src/pystencils/codegen/parameters.py
 rename src/pystencils/{backend => codegen}/properties.py (100%)
 rename src/pystencils/{ => codegen}/target.py (100%)
 rename src/pystencils/{backend => }/jit/__init__.py (100%)
 rename src/pystencils/{backend => }/jit/cpu_extension_module.py (99%)
 rename src/pystencils/{backend => }/jit/gpu_cupy.py (99%)
 rename src/pystencils/{backend => }/jit/jit.py (97%)
 rename src/pystencils/{backend => }/jit/legacy_cpu.py (100%)
 rename src/pystencils/{backend => }/jit/msvc_detection.py (100%)

diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py
index b2cdeca07..4374ccda4 100644
--- a/src/pystencils/__init__.py
+++ b/src/pystencils/__init__.py
@@ -1,6 +1,13 @@
 """Module to generate stencil kernels in C or CUDA using sympy expressions and call them as Python functions"""
 
-from .target import Target
+from .codegen import (
+    Target,
+    CreateKernelConfig,
+    CpuOptimConfig,
+    VectorizationConfig,
+    OpenMpConfig,
+    GpuIndexingConfig,
+)
 from .defaults import DEFAULTS
 from . import fd
 from . import stencil as stencil
@@ -9,16 +16,9 @@ from .inspection import inspect
 from .field import Field, FieldType, fields
 from .types import create_type, create_numeric_type
 from .cache import clear_cache
-from .config import (
-    CreateKernelConfig,
-    CpuOptimConfig,
-    VectorizationConfig,
-    OpenMpConfig,
-    GpuIndexingConfig,
-)
 from .kernel_decorator import kernel, kernel_config
 from .kernelcreation import create_kernel, create_staggered_kernel
-from .backend.kernelfunction import KernelFunction
+from .codegen import Kernel
 from .backend.jit import no_jit
 from .backend.exceptions import KernelConstraintsError
 from .slicing import make_slice
@@ -55,7 +55,7 @@ __all__ = [
     "OpenMpConfig",
     "create_kernel",
     "create_staggered_kernel",
-    "KernelFunction",
+    "Kernel",
     "KernelConstraintsError",
     "Target",
     "no_jit",
diff --git a/src/pystencils/backend/__init__.py b/src/pystencils/backend/__init__.py
index b947a112e..e69de29bb 100644
--- a/src/pystencils/backend/__init__.py
+++ b/src/pystencils/backend/__init__.py
@@ -1,14 +0,0 @@
-from .kernelfunction import (
-    KernelParameter,
-    KernelFunction,
-    GpuKernelFunction,
-)
-
-from .constraints import KernelParamsConstraint
-
-__all__ = [
-    "KernelParameter",
-    "KernelFunction",
-    "GpuKernelFunction",
-    "KernelParamsConstraint",
-]
diff --git a/src/pystencils/backend/emission/base_printer.py b/src/pystencils/backend/emission/base_printer.py
index 50cd1bfea..d721b9f89 100644
--- a/src/pystencils/backend/emission/base_printer.py
+++ b/src/pystencils/backend/emission/base_printer.py
@@ -1,8 +1,9 @@
 from __future__ import annotations
 from enum import Enum
 from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING
 
-from ...target import Target
+from ...codegen import Target
 
 from ..ast.structural import (
     PsAstNode,
@@ -59,7 +60,8 @@ from ..memory import PsSymbol
 from ..constants import PsConstant
 from ...types import PsType
 
-from ..kernelfunction import KernelFunction, GpuKernelFunction
+if TYPE_CHECKING:
+    from ...codegen import Kernel, GpuKernel
 
 
 class EmissionError(Exception):
@@ -172,8 +174,8 @@ class BasePrinter(ABC):
     def __init__(self, indent_width=3):
         self._indent_width = indent_width
 
-    def __call__(self, obj: PsAstNode | KernelFunction) -> str:
-        if isinstance(obj, KernelFunction):
+    def __call__(self, obj: PsAstNode | Kernel) -> str:
+        if isinstance(obj, Kernel):
             sig = self.print_signature(obj)
             body_code = self.visit(obj.body, PrinterCtx())
             return f"{sig}\n{body_code}"
@@ -372,7 +374,7 @@ class BasePrinter(ABC):
                     f"BasePrinter does not know how to print {type(node)}"
                 )
 
-    def print_signature(self, func: KernelFunction) -> str:
+    def print_signature(self, func: Kernel) -> str:
         prefix = self._func_prefix(func)
         params_str = ", ".join(
             f"{self._type_str(p.dtype)} {p.name}" for p in func.parameters
@@ -380,8 +382,8 @@ class BasePrinter(ABC):
         signature = " ".join([prefix, "void", func.name, f"({params_str})"])
         return signature
 
-    def _func_prefix(self, func: KernelFunction):
-        if isinstance(func, GpuKernelFunction) and func.target == Target.CUDA:
+    def _func_prefix(self, func: Kernel):
+        if isinstance(func, GpuKernel) and func.target == Target.CUDA:
             return "__global__"
         else:
             return "FUNC_PREFIX"
diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index 1cf159cf4..bb7bd708d 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -10,7 +10,6 @@ from ...field import Field, FieldType
 from ...sympyextensions.typed_sympy import TypedSymbol, DynamicType
 
 from ..memory import PsSymbol, PsBuffer
-from ..properties import FieldShape, FieldStride
 from ..constants import PsConstant
 from ...types import (
     PsType,
@@ -371,6 +370,8 @@ class KernelCreationContext:
             buf_shape += [convert_size(1)]
             buf_strides += [convert_size(1)]
 
+        from ...codegen.properties import FieldShape, FieldStride
+
         for i, size in enumerate(buf_shape):
             if isinstance(size, PsSymbol):
                 size.add_property(FieldShape(field, i))
@@ -410,6 +411,8 @@ class KernelCreationContext:
         buf_shape: list[PsSymbol | PsConstant]
 
         if isinstance(buffer_len, TypedSymbol):
+            from ...codegen.properties import FieldShape
+
             idx_type = self._normalize_type(buffer_len)
             len_symb = self.get_symbol(buffer_len.name, idx_type)
             len_symb.add_property(FieldShape(field, 0))
diff --git a/src/pystencils/backend/kernelcreation/iteration_space.py b/src/pystencils/backend/kernelcreation/iteration_space.py
index 9df9883ce..a7802c931 100644
--- a/src/pystencils/backend/kernelcreation/iteration_space.py
+++ b/src/pystencils/backend/kernelcreation/iteration_space.py
@@ -6,7 +6,6 @@ from functools import reduce
 from operator import mul
 
 from ...defaults import DEFAULTS
-from ...config import _AUTO_TYPE, AUTO
 from ...simp import AssignmentCollection
 from ...field import Field, FieldType
 
@@ -18,6 +17,7 @@ from ...types import PsStructType
 from ..exceptions import PsInputError, KernelConstraintsError
 
 if TYPE_CHECKING:
+    from ...codegen.config import _AUTO_TYPE
     from .context import KernelCreationContext
 
 
@@ -457,6 +457,8 @@ def create_full_iteration_space(
     # Otherwise, if an iteration slice was specified, use that
     # Otherwise, use the inferred ghost layers
 
+    from ...codegen.config import AUTO
+
     if ghost_layers is AUTO:
         if len(domain_field_accesses) > 0:
             inferred_gls = max(
diff --git a/src/pystencils/backend/kernelfunction.py b/src/pystencils/backend/kernelfunction.py
index e2161590e..3c7e103b3 100644
--- a/src/pystencils/backend/kernelfunction.py
+++ b/src/pystencils/backend/kernelfunction.py
@@ -9,7 +9,7 @@ from .._deprecation import _deprecated
 from .ast.structural import PsBlock
 from .ast.analysis import collect_required_headers, collect_undefined_symbols
 from .memory import PsSymbol
-from .properties import (
+from ..codegen.properties import (
     PsSymbolProperty,
     _FieldProperty,
     FieldShape,
@@ -22,7 +22,7 @@ from .platforms import Platform, GpuThreadsRange
 from .constraints import KernelParamsConstraint
 from ..types import PsType
 
-from ..target import Target
+from ..codegen.target import Target
 from ..field import Field
 from ..sympyextensions import TypedSymbol
 
@@ -30,212 +30,6 @@ if TYPE_CHECKING:
     from .jit import JitBase
 
 
-class KernelParameter:
-    """Parameter to a `KernelFunction`."""
-
-    __match_args__ = ("name", "dtype", "properties")
-
-    def __init__(
-        self, name: str, dtype: PsType, properties: Iterable[PsSymbolProperty] = ()
-    ):
-        self._name = name
-        self._dtype = dtype
-        self._properties: frozenset[PsSymbolProperty] = (
-            frozenset(properties) if properties is not None else frozenset()
-        )
-        self._fields: tuple[Field, ...] = tuple(
-            sorted(
-                set(
-                    p.field  # type: ignore
-                    for p in filter(
-                        lambda p: isinstance(p, _FieldProperty), self._properties
-                    )
-                ),
-                key=lambda f: f.name
-            )
-        )
-
-    @property
-    def name(self):
-        return self._name
-
-    @property
-    def dtype(self):
-        return self._dtype
-
-    def _hashable_contents(self):
-        return (self._name, self._dtype, self._properties)
-
-    #   TODO: Need?
-    def __hash__(self) -> int:
-        return hash(self._hashable_contents())
-
-    def __eq__(self, other: object) -> bool:
-        if not isinstance(other, KernelParameter):
-            return False
-
-        return (
-            type(self) is type(other)
-            and self._hashable_contents() == other._hashable_contents()
-        )
-
-    def __str__(self) -> str:
-        return self._name
-
-    def __repr__(self) -> str:
-        return f"{type(self).__name__}(name = {self._name}, dtype = {self._dtype})"
-
-    @property
-    def symbol(self) -> TypedSymbol:
-        return TypedSymbol(self.name, self.dtype)
-
-    @property
-    def fields(self) -> Sequence[Field]:
-        """Set of fields associated with this parameter."""
-        return self._fields
-
-    def get_properties(
-        self, prop_type: type[PsSymbolProperty] | tuple[type[PsSymbolProperty], ...]
-    ) -> set[PsSymbolProperty]:
-        """Retrieve all properties of the given type(s) attached to this parameter"""
-        return set(filter(lambda p: isinstance(p, prop_type), self._properties))
-
-    @property
-    def properties(self) -> frozenset[PsSymbolProperty]:
-        return self._properties
-
-    @property
-    def is_field_parameter(self) -> bool:
-        return bool(self._fields)
-
-    #   Deprecated legacy properties
-    #   These are kept mostly for the legacy waLBerla code generation system
-
-    @property
-    def is_field_pointer(self) -> bool:
-        warn(
-            "`is_field_pointer` is deprecated and will be removed in a future version of pystencils. "
-            "Use `param.get_properties(FieldBasePtr)` instead.",
-            DeprecationWarning,
-        )
-        return bool(self.get_properties(FieldBasePtr))
-
-    @property
-    def is_field_stride(self) -> bool:
-        warn(
-            "`is_field_stride` is deprecated and will be removed in a future version of pystencils. "
-            "Use `param.get_properties(FieldStride)` instead.",
-            DeprecationWarning,
-        )
-        return bool(self.get_properties(FieldStride))
-
-    @property
-    def is_field_shape(self) -> bool:
-        warn(
-            "`is_field_shape` is deprecated and will be removed in a future version of pystencils. "
-            "Use `param.get_properties(FieldShape)` instead.",
-            DeprecationWarning,
-        )
-        return bool(self.get_properties(FieldShape))
-
-    @property
-    def field_name(self) -> str:
-        warn(
-            "`field_name` is deprecated and will be removed in a future version of pystencils. "
-            "Use `param.fields[0].name` instead.",
-            DeprecationWarning,
-        )
-        return self._fields[0].name
-
-
-class KernelFunction:
-    """A pystencils kernel function.
-
-    The kernel function is the final result of the translation process.
-    It is immutable, and its AST should not be altered any more, either, as this
-    might invalidate information about the kernel already stored in the `KernelFunction` object.
-    """
-
-    def __init__(
-        self,
-        body: PsBlock,
-        target: Target,
-        name: str,
-        parameters: Sequence[KernelParameter],
-        required_headers: set[str],
-        constraints: Sequence[KernelParamsConstraint],
-        jit: JitBase,
-    ):
-        self._body: PsBlock = body
-        self._target = target
-        self._name = name
-        self._params = tuple(parameters)
-        self._required_headers = required_headers
-        self._constraints = tuple(constraints)
-        self._jit = jit
-        self._metadata: dict[str, Any] = dict()
-
-    @property
-    def metadata(self) -> dict[str, Any]:
-        return self._metadata
-
-    @property
-    def body(self) -> PsBlock:
-        return self._body
-
-    @property
-    def target(self) -> Target:
-        return self._target
-
-    @property
-    def name(self) -> str:
-        return self._name
-
-    @name.setter
-    def name(self, n: str):
-        self._name = n
-
-    @property
-    def function_name(self) -> str:
-        _deprecated("function_name", "name")
-        return self._name
-
-    @function_name.setter
-    def function_name(self, n: str):
-        _deprecated("function_name", "name")
-        self._name = n
-
-    @property
-    def parameters(self) -> tuple[KernelParameter, ...]:
-        return self._params
-
-    def get_parameters(self) -> tuple[KernelParameter, ...]:
-        _deprecated("KernelFunction.get_parameters", "KernelFunction.parameters")
-        return self.parameters
-
-    def get_fields(self) -> set[Field]:
-        return set(chain.from_iterable(p.fields for p in self._params))
-
-    @property
-    def fields_accessed(self) -> set[Field]:
-        warn(
-            "`fields_accessed` is deprecated and will be removed in a future version of pystencils. "
-            "Use `get_fields` instead.",
-            DeprecationWarning,
-        )
-        return self.get_fields()
-
-    @property
-    def required_headers(self) -> set[str]:
-        return self._required_headers
-
-    @property
-    def constraints(self) -> tuple[KernelParamsConstraint, ...]:
-        return self._constraints
-
-    def compile(self) -> Callable[..., None]:
-        """Invoke the underlying just-in-time compiler to obtain the kernel as an executable Python function."""
-        return self._jit.compile(self)
 
 
 def create_cpu_kernel_function(
@@ -258,29 +52,6 @@ def create_cpu_kernel_function(
     return kfunc
 
 
-class GpuKernelFunction(KernelFunction):
-    """Internal representation of a kernel function targeted at CUDA GPUs."""
-
-    def __init__(
-        self,
-        body: PsBlock,
-        threads_range: GpuThreadsRange | None,
-        target: Target,
-        name: str,
-        parameters: Sequence[KernelParameter],
-        required_headers: set[str],
-        constraints: Sequence[KernelParamsConstraint],
-        jit: JitBase,
-    ):
-        super().__init__(
-            body, target, name, parameters, required_headers, constraints, jit
-        )
-        self._threads_range = threads_range
-
-    @property
-    def threads_range(self) -> GpuThreadsRange | None:
-        """Object exposing the total size of the launch grid this kernel expects to be executed with."""
-        return self._threads_range
 
 
 def create_gpu_kernel_function(
diff --git a/src/pystencils/backend/memory.py b/src/pystencils/backend/memory.py
index fcfae9f90..7a5d62f69 100644
--- a/src/pystencils/backend/memory.py
+++ b/src/pystencils/backend/memory.py
@@ -6,7 +6,7 @@ from dataclasses import dataclass
 from ..types import PsType, PsTypeError, deconstify, PsIntegerType, PsPointerType
 from .exceptions import PsInternalCompilerError
 from .constants import PsConstant
-from .properties import PsSymbolProperty, UniqueSymbolProperty
+from ..codegen.properties import PsSymbolProperty, UniqueSymbolProperty
 
 
 class PsSymbol:
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index dbade47d1..048bcb0d5 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -1,4 +1,5 @@
 from warnings import warn
+from typing import TYPE_CHECKING
 
 from ...types import constify
 from ..exceptions import MaterializationError
@@ -26,7 +27,9 @@ from ..ast.expressions import PsLt, PsAnd
 from ...types import PsSignedIntegerType, PsIeeeFloatType
 from ..literals import PsLiteral
 from ..functions import PsMathFunction, MathFunctions, CFunction
-from ...config import GpuIndexingConfig
+
+if TYPE_CHECKING:
+    from ...codegen.config import GpuIndexingConfig
 
 int32 = PsSignedIntegerType(width=32, const=False)
 
diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py
index ec5e7eda0..56615af24 100644
--- a/src/pystencils/backend/platforms/sycl.py
+++ b/src/pystencils/backend/platforms/sycl.py
@@ -1,3 +1,5 @@
+from typing import TYPE_CHECKING
+
 from ..functions import CFunction, PsMathFunction, MathFunctions
 from ..kernelcreation.iteration_space import (
     IterationSpace,
@@ -25,7 +27,9 @@ from ..constants import PsConstant
 from .generic_gpu import GenericGpu, GpuThreadsRange
 from ..exceptions import MaterializationError
 from ...types import PsCustomType, PsIeeeFloatType, constify, PsIntegerType
-from ...config import GpuIndexingConfig
+
+if TYPE_CHECKING:
+    from ...codegen.config import GpuIndexingConfig
 
 
 class SyclPlatform(GenericGpu):
diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py
index d01f42874..47c008819 100644
--- a/src/pystencils/backend/transformations/add_pragmas.py
+++ b/src/pystencils/backend/transformations/add_pragmas.py
@@ -1,4 +1,5 @@
 from dataclasses import dataclass
+from typing import TYPE_CHECKING
 
 from typing import Sequence
 from collections import defaultdict
@@ -8,7 +9,8 @@ from ..ast import PsAstNode
 from ..ast.structural import PsBlock, PsLoop, PsPragma
 from ..ast.expressions import PsExpression
 
-from ...config import OpenMpConfig
+if TYPE_CHECKING:
+    from ...codegen.config import OpenMpConfig
 
 __all__ = ["InsertPragmasAtLoops", "LoopPragma", "AddOpenMP"]
 
diff --git a/src/pystencils/boundaries/boundaryhandling.py b/src/pystencils/boundaries/boundaryhandling.py
index fe8dd7d00..1f6e3d126 100644
--- a/src/pystencils/boundaries/boundaryhandling.py
+++ b/src/pystencils/boundaries/boundaryhandling.py
@@ -12,7 +12,7 @@ from pystencils.types import PsIntegerType
 from pystencils.types.quick import Arr, SInt
 from pystencils.gpu.gpu_array_handler import GPUArrayHandler
 from pystencils.field import Field, FieldType
-from pystencils.backend.properties import FieldBasePtr
+from pystencils.codegen.properties import FieldBasePtr
 
 try:
     # noinspection PyPep8Naming
diff --git a/src/pystencils/codegen/__init__.py b/src/pystencils/codegen/__init__.py
new file mode 100644
index 000000000..be9fd9510
--- /dev/null
+++ b/src/pystencils/codegen/__init__.py
@@ -0,0 +1,23 @@
+from .target import Target
+from .config import (
+    CreateKernelConfig,
+    CpuOptimConfig,
+    VectorizationConfig,
+    OpenMpConfig,
+    GpuIndexingConfig,
+)
+
+from .kernel import Kernel
+from .driver import create_kernel, get_driver
+
+__all__ = [
+    "Target",
+    "CreateKernelConfig",
+    "CpuOptimConfig",
+    "VectorizationConfig",
+    "OpenMpConfig",
+    "GpuIndexingConfig",
+    "Kernel",
+    "create_kernel",
+    "get_driver",
+]
\ No newline at end of file
diff --git a/src/pystencils/config.py b/src/pystencils/codegen/config.py
similarity index 98%
rename from src/pystencils/config.py
rename to src/pystencils/codegen/config.py
index c08ddc161..05e3ec3de 100644
--- a/src/pystencils/config.py
+++ b/src/pystencils/codegen/config.py
@@ -8,9 +8,9 @@ from typing import Sequence
 from dataclasses import dataclass, InitVar, replace
 
 from .target import Target
-from .field import Field, FieldType
+from ..field import Field, FieldType
 
-from .types import (
+from ..types import (
     PsIntegerType,
     UserTypeSpec,
     PsIeeeFloatType,
@@ -18,10 +18,10 @@ from .types import (
     create_type,
 )
 
-from .defaults import DEFAULTS
+from ..defaults import DEFAULTS
 
 if TYPE_CHECKING:
-    from .backend.jit import JitBase
+    from ..backend.jit import JitBase
 
 
 class PsOptionsError(Exception):
@@ -336,12 +336,12 @@ class CreateKernelConfig:
         """Returns either the user-specified JIT compiler, or infers one from the target if none is given."""
         if self.jit is None:
             if self.target.is_cpu():
-                from .backend.jit import LegacyCpuJit
+                from ..backend.jit import LegacyCpuJit
 
                 return LegacyCpuJit()
             elif self.target == Target.CUDA:
                 try:
-                    from .backend.jit.gpu_cupy import CupyJit
+                    from ..backend.jit.gpu_cupy import CupyJit
 
                     if (
                         self.gpu_indexing is not None
@@ -352,12 +352,12 @@ class CreateKernelConfig:
                         return CupyJit()
 
                 except ImportError:
-                    from .backend.jit import no_jit
+                    from ..backend.jit import no_jit
 
                     return no_jit
 
             elif self.target == Target.SYCL:
-                from .backend.jit import no_jit
+                from ..backend.jit import no_jit
 
                 return no_jit
             else:
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
new file mode 100644
index 000000000..bc690a598
--- /dev/null
+++ b/src/pystencils/codegen/driver.py
@@ -0,0 +1,432 @@
+from __future__ import annotations
+
+from typing import cast, Sequence
+from dataclasses import dataclass, replace
+
+from .target import Target
+from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO
+from .kernel import Kernel
+
+from ..types import create_numeric_type, PsIntegerType, PsScalarType
+from ..backend.ast import PsAstNode
+from ..backend.ast.structural import PsBlock, PsLoop
+from ..backend.kernelcreation import (
+    KernelCreationContext,
+    KernelAnalysis,
+    FreezeExpressions,
+    Typifier,
+)
+from ..backend.constants import PsConstant
+from ..backend.kernelcreation.iteration_space import (
+    create_sparse_iteration_space,
+    create_full_iteration_space,
+    FullIterationSpace,
+)
+from ..backend.platforms import Platform, GenericCpu, GenericVectorCpu, GenericGpu
+from ..backend.exceptions import VectorizationError
+
+from ..backend.transformations import (
+    EliminateConstants,
+    LowerToC,
+    SelectFunctions,
+    CanonicalizeSymbols,
+    HoistLoopInvariantDeclarations,
+)
+
+from ..simp import AssignmentCollection
+from sympy.codegen.ast import AssignmentBase
+
+
+__all__ = ["create_kernel"]
+
+
+def create_kernel(
+    assignments: AssignmentCollection | Sequence[AssignmentBase] | AssignmentBase,
+    config: CreateKernelConfig | None = None,
+    **kwargs,
+) -> Kernel:
+    """Create a kernel function from a set of assignments.
+
+    Args:
+        assignments: The kernel's sequence of assignments, expressed using SymPy
+        config: The configuration for the kernel translator
+        kwargs: If ``config`` is not set, it is created from the keyword arguments;
+            if it is set, its options will be overridden by any keyword arguments.
+
+    Returns:
+        The numerical kernel in pystencils' internal representation, ready to be
+        exported or compiled
+    """
+
+    if not config:
+        config = CreateKernelConfig()
+
+    if kwargs:
+        config = replace(config, **kwargs)
+
+    driver = DefaultKernelCreationDriver(config)
+    return driver(assignments)
+
+
+def get_driver(cfg: CreateKernelConfig, *, retain_intermediates: bool = False):
+    return DefaultKernelCreationDriver(cfg, retain_intermediates)
+
+
+class DefaultKernelCreationDriver:
+    def __init__(self, cfg: CreateKernelConfig, retain_intermediates: bool = False):
+        self._cfg = cfg
+
+        idx_dtype = create_numeric_type(self._cfg.index_dtype)
+        assert isinstance(idx_dtype, PsIntegerType)
+
+        self._ctx = KernelCreationContext(
+            default_dtype=create_numeric_type(self._cfg.default_dtype),
+            index_dtype=idx_dtype,
+        )
+
+        self._target = self._cfg.get_target()
+        self._platform = self._get_platform()
+
+        if retain_intermediates:
+            self._intermediates = CodegenIntermediates()
+        else:
+            self._intermediates = None
+
+    @property
+    def intermediates(self) -> CodegenIntermediates | None:
+        return self._intermediates
+
+    def __call__(
+        self,
+        assignments: AssignmentCollection | Sequence[AssignmentBase] | AssignmentBase,
+    ):
+        kernel_body = self.parse_kernel_body(assignments)
+
+        match self._platform:
+            case GenericCpu():
+                kernel_ast = self._platform.materialize_iteration_space(
+                    kernel_body, self._ctx.get_iteration_space()
+                )
+            case GenericGpu():
+                kernel_ast, gpu_threads = self._platform.materialize_iteration_space(
+                    kernel_body, self._ctx.get_iteration_space()
+                )
+
+        if self._intermediates is not None:
+            self._intermediates.materialized_ispace = kernel_ast.clone()
+
+        #   Fold and extract constants
+        elim_constants = EliminateConstants(self._ctx, extract_constant_exprs=True)
+        kernel_ast = cast(PsBlock, elim_constants(kernel_ast))
+
+        if self._intermediates is not None:
+            self._intermediates.constants_eliminated = kernel_ast.clone()
+
+        #   Target-Specific optimizations
+        if self._cfg.target.is_cpu():
+            kernel_ast = self._transform_for_cpu(kernel_ast)
+
+        #   Note: After this point, the AST may contain intrinsics, so type-dependent
+        #   transformations cannot be run any more
+
+        #   Lowering
+        lower_to_c = LowerToC(self._ctx)
+        kernel_ast = cast(PsBlock, lower_to_c(kernel_ast))
+
+        select_functions = SelectFunctions(self._platform)
+        kernel_ast = cast(PsBlock, select_functions(kernel_ast))
+
+        if self._intermediates is not None:
+            self._intermediates.lowered = kernel_ast.clone()
+
+        #   Late canonicalization pass: Canonicalize new symbols introduced by LowerToC
+
+        canonicalize = CanonicalizeSymbols(self._ctx, True)
+        kernel_ast = cast(PsBlock, canonicalize(kernel_ast))
+
+        if self._cfg.target.is_cpu():
+            return create_cpu_kernel_function(
+                self._ctx,
+                self._platform,
+                kernel_ast,
+                self._cfg.function_name,
+                self._cfg.target,
+                self._cfg.get_jit(),
+            )
+        else:
+            return create_gpu_kernel_function(
+                self._ctx,
+                self._platform,
+                kernel_ast,
+                gpu_threads,
+                self._cfg.function_name,
+                self._cfg.target,
+                self._cfg.get_jit(),
+            )
+
+    def parse_kernel_body(
+        self,
+        assignments: AssignmentCollection | Sequence[AssignmentBase] | AssignmentBase,
+    ) -> PsBlock:
+        if isinstance(assignments, AssignmentBase):
+            assignments = [assignments]
+
+        if not isinstance(assignments, AssignmentCollection):
+            assignments = AssignmentCollection(assignments)  # type: ignore
+
+        _ = _parse_simplification_hints(assignments)
+
+        analysis = KernelAnalysis(
+            self._ctx,
+            not self._cfg.skip_independence_check,
+            not self._cfg.allow_double_writes,
+        )
+        analysis(assignments)
+
+        if self._cfg.index_field is not None:
+            ispace = create_sparse_iteration_space(
+                self._ctx, assignments, index_field=self._cfg.index_field
+            )
+        else:
+            gls = self._cfg.ghost_layers
+            islice = self._cfg.iteration_slice
+
+            if gls is None and islice is None:
+                gls = AUTO
+
+            ispace = create_full_iteration_space(
+                self._ctx,
+                assignments,
+                ghost_layers=gls,
+                iteration_slice=islice,
+            )
+
+        self._ctx.set_iteration_space(ispace)
+
+        freeze = FreezeExpressions(self._ctx)
+        kernel_body = freeze(assignments)
+
+        typify = Typifier(self._ctx)
+        kernel_body = typify(kernel_body)
+
+        if self._intermediates is not None:
+            self._intermediates.parsed_body = kernel_body.clone()
+
+        return kernel_body
+
+    def _transform_for_cpu(self, kernel_ast: PsBlock):
+        canonicalize = CanonicalizeSymbols(self._ctx, True)
+        kernel_ast = cast(PsBlock, canonicalize(kernel_ast))
+
+        if self._intermediates is not None:
+            self._intermediates.cpu_canonicalize = kernel_ast.clone()
+
+        hoist_invariants = HoistLoopInvariantDeclarations(self._ctx)
+        kernel_ast = cast(PsBlock, hoist_invariants(kernel_ast))
+
+        if self._intermediates is not None:
+            self._intermediates.cpu_hoist_invariants = kernel_ast.clone()
+
+        cpu_cfg = self._cfg.cpu_optim
+
+        if cpu_cfg is None:
+            return kernel_ast
+
+        if cpu_cfg.loop_blocking:
+            raise NotImplementedError("Loop blocking not implemented yet.")
+
+        kernel_ast = self._vectorize(kernel_ast)
+
+        if cpu_cfg.openmp is not False:
+            from ..backend.transformations import AddOpenMP  # `backend` is a sibling package of `codegen`
+
+            params = (
+                cpu_cfg.openmp
+                if isinstance(cpu_cfg.openmp, OpenMpConfig)
+                else OpenMpConfig()
+            )
+            add_omp = AddOpenMP(self._ctx, params)
+            kernel_ast = cast(PsBlock, add_omp(kernel_ast))
+
+            if self._intermediates is not None:
+                self._intermediates.cpu_openmp = kernel_ast.clone()
+
+        if cpu_cfg.use_cacheline_zeroing:
+            raise NotImplementedError("CL-zeroing not implemented yet")
+
+        return kernel_ast
+
+    def _vectorize(self, kernel_ast: PsBlock) -> PsBlock:
+        assert self._cfg.cpu_optim is not None
+        vec_config = self._cfg.cpu_optim.get_vectorization_config()
+        if vec_config is None:
+            return kernel_ast
+
+        from ..backend.transformations import LoopVectorizer, SelectIntrinsics  # sibling package of `codegen`
+
+        assert isinstance(self._platform, GenericVectorCpu)
+
+        ispace = self._ctx.get_iteration_space()
+        if not isinstance(ispace, FullIterationSpace):
+            raise VectorizationError(
+                "Unable to vectorize kernel: The kernel is not using a dense iteration space."
+            )
+
+        inner_loop_coord = ispace.loop_order[-1]
+        inner_loop_dim = ispace.dimensions[inner_loop_coord]
+
+        #   Apply stride (TODO: and alignment) assumptions
+        if vec_config.assume_inner_stride_one:
+            for field in self._ctx.fields:
+                buf = self._ctx.get_buffer(field)
+                inner_stride = buf.strides[inner_loop_coord]
+                if isinstance(inner_stride, PsConstant):
+                    if inner_stride.value != 1:
+                        raise VectorizationError(
+                            f"Unable to apply assumption 'assume_inner_stride_one': "
+                            f"Field {field} has fixed stride {inner_stride} "
+                            f"set in the inner coordinate {inner_loop_coord}."
+                        )
+                else:
+                    buf.strides[inner_loop_coord] = PsConstant(1, buf.index_type)
+                    #   TODO: Communicate assumption to runtime system via a precondition
+
+        #   Call loop vectorizer
+        if vec_config.lanes is None:
+            lanes = VectorizationConfig.default_lanes(
+                self._target, cast(PsScalarType, self._ctx.default_dtype)
+            )
+        else:
+            lanes = vec_config.lanes
+
+        vectorizer = LoopVectorizer(self._ctx, lanes)
+
+        def loop_predicate(loop: PsLoop):
+            return loop.counter.symbol == inner_loop_dim.counter
+
+        kernel_ast = vectorizer.vectorize_select_loops(kernel_ast, loop_predicate)
+
+        if self._intermediates is not None:
+            self._intermediates.cpu_vectorize = kernel_ast.clone()
+
+        select_intrin = SelectIntrinsics(self._ctx, self._platform)
+        kernel_ast = cast(PsBlock, select_intrin(kernel_ast))
+
+        if self._intermediates is not None:
+            self._intermediates.cpu_select_intrins = kernel_ast.clone()
+
+        return kernel_ast
+
+    def _get_platform(self) -> Platform:
+        if Target._CPU in self._target:
+            if Target._X86 in self._target:
+                from ..backend.platforms.x86 import X86VectorArch, X86VectorCpu
+
+                arch: X86VectorArch
+
+                if Target._SSE in self._target:
+                    arch = X86VectorArch.SSE
+                elif Target._AVX in self._target:
+                    arch = X86VectorArch.AVX
+                elif Target._AVX512 in self._target:
+                    if Target._FP16 in self._target:
+                        arch = X86VectorArch.AVX512_FP16
+                    else:
+                        arch = X86VectorArch.AVX512
+                else:
+                    assert False, "unreachable code"
+
+                return X86VectorCpu(self._ctx, arch)
+            elif self._target == Target.GenericCPU:
+                return GenericCpu(self._ctx)
+            else:
+                raise NotImplementedError(
+                    f"No platform is currently available for CPU target {self._target}"
+                )
+
+        elif Target._GPU in self._target:
+            match self._target:
+                case Target.SYCL:
+                    from ..backend.platforms import SyclPlatform
+
+                    return SyclPlatform(self._ctx, self._cfg.gpu_indexing)
+                case Target.CUDA:
+                    from ..backend.platforms import CudaPlatform
+
+                    return CudaPlatform(self._ctx, self._cfg.gpu_indexing)
+
+        raise NotImplementedError(
+            f"Code generation for target {self._target} not implemented"
+        )
+
+
+@dataclass
+class StageResult:
+    ast: PsAstNode
+    label: str
+
+
+class StageResultSlot:
+    def __init__(self, description: str | None = None):
+        self._description = description
+        self._name: str
+        self._lookup: str
+
+    def __set_name__(self, owner, name: str):
+        self._name = name
+        self._lookup = f"_{name}"
+
+    def __get__(self, obj, objtype=None) -> StageResult | None:
+        if obj is None:
+            return None
+
+        ast = getattr(obj, self._lookup, None)
+        if ast is not None:
+            descr = self._name if self._description is None else self._description
+            return StageResult(ast, descr)
+        else:
+            return None
+
+    def __set__(self, obj, val: PsAstNode | None):
+        setattr(obj, self._lookup, val)
+
+
+class CodegenIntermediates:
+    """Intermediate results produced by the code generator."""
+
+    parsed_body = StageResultSlot("Freeze & Type Deduction")
+    materialized_ispace = StageResultSlot("Iteration Space Materialization")
+    constants_eliminated = StageResultSlot("Constant Elimination")
+    cpu_canonicalize = StageResultSlot("CPU: Symbol Canonicalization")
+    cpu_hoist_invariants = StageResultSlot("CPU: Hoisting of Loop Invariants")
+    cpu_vectorize = StageResultSlot("CPU: Vectorization")
+    cpu_select_intrins = StageResultSlot("CPU: Intrinsics Selection")
+    cpu_openmp = StageResultSlot("CPU: OpenMP Instrumentation")
+    lowered = StageResultSlot("C Language Lowering")
+
+    @property
+    def available_stages(self) -> Sequence[StageResult]:
+        all_results: list[StageResult | None] = [
+            getattr(self, name)
+            for name, slot in CodegenIntermediates.__dict__.items()
+            if isinstance(slot, StageResultSlot)
+        ]
+        return tuple(filter(lambda r: r is not None, all_results))  # type: ignore
+
+
+def create_staggered_kernel(
+    assignments, target: Target = Target.CPU, gpu_exclusive_conditions=False, **kwargs
+):
+    raise NotImplementedError(
+        "Staggered kernels are not yet implemented for pystencils 2.0"
+    )
+
+
+#   Internals
+
+
+def _parse_simplification_hints(ac: AssignmentCollection):
+    if "split_groups" in ac.simplification_hints:
+        raise NotImplementedError(
+            "Loop splitting was requested, but is not implemented yet"
+        )
diff --git a/src/pystencils/codegen/gpu.py b/src/pystencils/codegen/gpu.py
new file mode 100644
index 000000000..9cce9b55b
--- /dev/null
+++ b/src/pystencils/codegen/gpu.py
@@ -0,0 +1,28 @@
+
+
+from .kernel import Kernel
+
+
+class GpuKernel(Kernel):
+    """Internal representation of a kernel function targeted at CUDA GPUs."""
+
+    def __init__(
+        self,
+        body: PsBlock,
+        threads_range: GpuThreadsRange | None,
+        target: Target,
+        name: str,
+        parameters: Sequence[KernelParameter],
+        required_headers: set[str],
+        constraints: Sequence[KernelParamsConstraint],  # unused: Kernel.__init__ takes no constraints
+        jit: JitBase,
+    ):
+        super().__init__(
+            body, target, name, parameters, required_headers, jit
+        )
+        self._threads_range = threads_range
+
+    @property
+    def threads_range(self) -> GpuThreadsRange | None:
+        """Object exposing the total size of the launch grid this kernel expects to be executed with."""
+        return self._threads_range
diff --git a/src/pystencils/codegen/kernel.py b/src/pystencils/codegen/kernel.py
new file mode 100644
index 000000000..6a0a6d576
--- /dev/null
+++ b/src/pystencils/codegen/kernel.py
@@ -0,0 +1,102 @@
+from __future__ import annotations
+
+from warnings import warn
+from typing import Callable, Sequence, Iterable, Any, TYPE_CHECKING
+from itertools import chain
+
+from .._deprecation import _deprecated
+
+from ..backend.ast.structural import PsBlock
+from ..backend.ast.analysis import collect_required_headers, collect_undefined_symbols
+from ..backend.memory import PsSymbol
+
+from ..types import PsType
+
+from .target import Target
+from .parameters import Parameter
+from ..field import Field
+from ..sympyextensions import TypedSymbol
+
+
+class Kernel:
+    """A pystencils kernel.
+
+    The kernel object is the final result of the translation process.
+    It is immutable, and its AST should not be altered any more, either, as this
+    might invalidate information about the kernel already stored in the `Kernel` object.
+    """
+
+    def __init__(
+        self,
+        body: PsBlock,
+        target: Target,
+        name: str,
+        parameters: Sequence[Parameter],
+        required_headers: set[str],
+        jit: JitBase,
+    ):
+        self._body: PsBlock = body
+        self._target = target
+        self._name = name
+        self._params = tuple(parameters)
+        self._required_headers = required_headers
+        self._jit = jit
+        self._metadata: dict[str, Any] = dict()
+
+    @property
+    def metadata(self) -> dict[str, Any]:
+        return self._metadata
+
+    @property
+    def body(self) -> PsBlock:
+        return self._body
+
+    @property
+    def target(self) -> Target:
+        return self._target
+
+    @property
+    def name(self) -> str:
+        return self._name
+
+    @name.setter
+    def name(self, n: str):
+        self._name = n
+
+    @property
+    def function_name(self) -> str:
+        _deprecated("function_name", "name")
+        return self._name
+
+    @function_name.setter
+    def function_name(self, n: str):
+        _deprecated("function_name", "name")
+        self._name = n
+
+    @property
+    def parameters(self) -> tuple[Parameter, ...]:
+        return self._params
+
+    def get_parameters(self) -> tuple[Parameter, ...]:
+        _deprecated("KernelFunction.get_parameters", "KernelFunction.parameters")
+        return self.parameters
+
+    def get_fields(self) -> set[Field]:
+        return set(chain.from_iterable(p.fields for p in self._params))
+
+    @property
+    def fields_accessed(self) -> set[Field]:
+        warn(
+            "`fields_accessed` is deprecated and will be removed in a future version of pystencils. "
+            "Use `get_fields` instead.",
+            DeprecationWarning,
+        )
+        return self.get_fields()
+
+    @property
+    def required_headers(self) -> set[str]:
+        return self._required_headers
+
+    def compile(self) -> Callable[..., None]:
+        """Invoke the underlying just-in-time compiler to obtain the kernel as an executable Python function."""
+        return self._jit.compile(self)
diff --git a/src/pystencils/codegen/parameters.py b/src/pystencils/codegen/parameters.py
new file mode 100644
index 000000000..1e01e07aa
--- /dev/null
+++ b/src/pystencils/codegen/parameters.py
@@ -0,0 +1,142 @@
+from __future__ import annotations
+
+from warnings import warn
+from typing import Callable, Sequence, Iterable, Any, TYPE_CHECKING
+from itertools import chain
+
+from .._deprecation import _deprecated
+
+from ..backend.ast.structural import PsBlock
+from ..backend.ast.analysis import collect_required_headers, collect_undefined_symbols
+from ..backend.memory import PsSymbol
+from .properties import (
+    PsSymbolProperty,
+    _FieldProperty,
+    FieldShape,
+    FieldStride,
+    FieldBasePtr,
+)
+
+from ..types import PsType
+
+from .target import Target
+from ..field import Field
+from ..sympyextensions import TypedSymbol
+
+
+class Parameter:
+    """Parameter to a `Kernel`."""
+
+    __match_args__ = ("name", "dtype", "properties")
+
+    def __init__(
+        self, name: str, dtype: PsType, properties: Iterable[PsSymbolProperty] = ()
+    ):
+        self._name = name
+        self._dtype = dtype
+        self._properties: frozenset[PsSymbolProperty] = (
+            frozenset(properties) if properties is not None else frozenset()
+        )
+        self._fields: tuple[Field, ...] = tuple(
+            sorted(
+                set(
+                    p.field  # type: ignore
+                    for p in filter(
+                        lambda p: isinstance(p, _FieldProperty), self._properties
+                    )
+                ),
+                key=lambda f: f.name
+            )
+        )
+
+    @property
+    def name(self):
+        return self._name
+
+    @property
+    def dtype(self):
+        return self._dtype
+
+    def _hashable_contents(self):
+        return (self._name, self._dtype, self._properties)
+
+    #   TODO: Need?
+    def __hash__(self) -> int:
+        return hash(self._hashable_contents())
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, Parameter):
+            return False
+
+        return (
+            type(self) is type(other)
+            and self._hashable_contents() == other._hashable_contents()
+        )
+
+    def __str__(self) -> str:
+        return self._name
+
+    def __repr__(self) -> str:
+        return f"{type(self).__name__}(name = {self._name}, dtype = {self._dtype})"
+
+    @property
+    def symbol(self) -> TypedSymbol:
+        return TypedSymbol(self.name, self.dtype)
+
+    @property
+    def fields(self) -> Sequence[Field]:
+        """Fields associated with this parameter, as a tuple sorted by field name."""
+        return self._fields
+
+    def get_properties(
+        self, prop_type: type[PsSymbolProperty] | tuple[type[PsSymbolProperty], ...]
+    ) -> set[PsSymbolProperty]:
+        """Retrieve all properties of the given type(s) attached to this parameter"""
+        return set(filter(lambda p: isinstance(p, prop_type), self._properties))
+
+    @property
+    def properties(self) -> frozenset[PsSymbolProperty]:
+        return self._properties
+
+    @property
+    def is_field_parameter(self) -> bool:
+        return bool(self._fields)
+
+    #   Deprecated legacy properties
+    #   These are kept mostly for the legacy waLBerla code generation system
+
+    @property
+    def is_field_pointer(self) -> bool:
+        warn(
+            "`is_field_pointer` is deprecated and will be removed in a future version of pystencils. "
+            "Use `param.get_properties(FieldBasePtr)` instead.",
+            DeprecationWarning,
+        )
+        return bool(self.get_properties(FieldBasePtr))
+
+    @property
+    def is_field_stride(self) -> bool:
+        warn(
+            "`is_field_stride` is deprecated and will be removed in a future version of pystencils. "
+            "Use `param.get_properties(FieldStride)` instead.",
+            DeprecationWarning,
+        )
+        return bool(self.get_properties(FieldStride))
+
+    @property
+    def is_field_shape(self) -> bool:
+        warn(
+            "`is_field_shape` is deprecated and will be removed in a future version of pystencils. "
+            "Use `param.get_properties(FieldShape)` instead.",
+            DeprecationWarning,
+        )
+        return bool(self.get_properties(FieldShape))
+
+    @property
+    def field_name(self) -> str:
+        warn(
+            "`field_name` is deprecated and will be removed in a future version of pystencils. "
+            "Use `param.fields[0].name` instead.",
+            DeprecationWarning,
+        )
+        return self._fields[0].name
\ No newline at end of file
diff --git a/src/pystencils/backend/properties.py b/src/pystencils/codegen/properties.py
similarity index 100%
rename from src/pystencils/backend/properties.py
rename to src/pystencils/codegen/properties.py
diff --git a/src/pystencils/target.py b/src/pystencils/codegen/target.py
similarity index 100%
rename from src/pystencils/target.py
rename to src/pystencils/codegen/target.py
diff --git a/src/pystencils/datahandling/__init__.py b/src/pystencils/datahandling/__init__.py
index 76a494255..ff1a12c96 100644
--- a/src/pystencils/datahandling/__init__.py
+++ b/src/pystencils/datahandling/__init__.py
@@ -3,7 +3,7 @@ import warnings
 from typing import Tuple, Union
 
 from .datahandling_interface import DataHandling
-from ..target import Target
+from ..codegen.target import Target
 from .serial_datahandling import SerialDataHandling
 
 try:
diff --git a/src/pystencils/datahandling/datahandling_interface.py b/src/pystencils/datahandling/datahandling_interface.py
index f42c4ef13..867bbf062 100644
--- a/src/pystencils/datahandling/datahandling_interface.py
+++ b/src/pystencils/datahandling/datahandling_interface.py
@@ -3,7 +3,7 @@ from typing import Callable, Dict, Iterable, Optional, Sequence, Tuple, Union
 
 import numpy as np
 
-from pystencils.target import Target
+from ..codegen import Target
 from pystencils.field import Field, FieldType
 
 
diff --git a/src/pystencils/datahandling/parallel_datahandling.py b/src/pystencils/datahandling/parallel_datahandling.py
index f3f730522..8c7ce6e62 100644
--- a/src/pystencils/datahandling/parallel_datahandling.py
+++ b/src/pystencils/datahandling/parallel_datahandling.py
@@ -9,7 +9,7 @@ from pystencils.datahandling.blockiteration import block_iteration, sliced_block
 from pystencils.datahandling.datahandling_interface import DataHandling
 from pystencils.field import Field, FieldType
 from pystencils.utils import DotDict
-from pystencils.backend.properties import FieldBasePtr
+from pystencils.codegen.properties import FieldBasePtr
 from pystencils import Target
 
 
diff --git a/src/pystencils/datahandling/serial_datahandling.py b/src/pystencils/datahandling/serial_datahandling.py
index 6a5ce5730..73b749ca4 100644
--- a/src/pystencils/datahandling/serial_datahandling.py
+++ b/src/pystencils/datahandling/serial_datahandling.py
@@ -6,7 +6,7 @@ import numpy as np
 
 from pystencils.datahandling.blockiteration import SerialBlock
 from pystencils.datahandling.datahandling_interface import DataHandling
-from pystencils.target import Target
+from ..codegen import Target
 from pystencils.field import (Field, FieldType, create_numpy_array_with_layout,
                               layout_string_to_tuple, spatial_layout_string_to_tuple)
 from pystencils.gpu.gpu_array_handler import GPUArrayHandler, GPUNotAvailableHandler
diff --git a/src/pystencils/enums.py b/src/pystencils/enums.py
index 86048059d..bcea50e84 100644
--- a/src/pystencils/enums.py
+++ b/src/pystencils/enums.py
@@ -1,4 +1,4 @@
-from .target import Target as _Target
+from .codegen import Target as _Target
 
 from warnings import warn
 
diff --git a/src/pystencils/backend/jit/__init__.py b/src/pystencils/jit/__init__.py
similarity index 100%
rename from src/pystencils/backend/jit/__init__.py
rename to src/pystencils/jit/__init__.py
diff --git a/src/pystencils/backend/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py
similarity index 99%
rename from src/pystencils/backend/jit/cpu_extension_module.py
rename to src/pystencils/jit/cpu_extension_module.py
index 4412f8879..444167f9d 100644
--- a/src/pystencils/backend/jit/cpu_extension_module.py
+++ b/src/pystencils/jit/cpu_extension_module.py
@@ -14,7 +14,7 @@ from ..kernelfunction import (
     KernelFunction,
     KernelParameter,
 )
-from ..properties import FieldBasePtr, FieldShape, FieldStride
+from ...codegen.properties import FieldBasePtr, FieldShape, FieldStride
 from ..constraints import KernelParamsConstraint
 from ...types import (
     PsType,
diff --git a/src/pystencils/backend/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
similarity index 99%
rename from src/pystencils/backend/jit/gpu_cupy.py
rename to src/pystencils/jit/gpu_cupy.py
index 1dd187671..2f5753e05 100644
--- a/src/pystencils/backend/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -8,7 +8,7 @@ try:
 except ImportError:
     HAVE_CUPY = False
 
-from ...target import Target
+from ...codegen import Target
 from ...field import FieldType
 
 from ...types import PsType
@@ -18,7 +18,7 @@ from ..kernelfunction import (
     GpuKernelFunction,
     KernelParameter,
 )
-from ..properties import FieldShape, FieldStride, FieldBasePtr
+from ...codegen.properties import FieldShape, FieldStride, FieldBasePtr
 from ..emission import emit_code
 from ...types import PsStructType
 
diff --git a/src/pystencils/backend/jit/jit.py b/src/pystencils/jit/jit.py
similarity index 97%
rename from src/pystencils/backend/jit/jit.py
rename to src/pystencils/jit/jit.py
index 2d091c4a0..250bba240 100644
--- a/src/pystencils/backend/jit/jit.py
+++ b/src/pystencils/jit/jit.py
@@ -4,7 +4,7 @@ from abc import ABC, abstractmethod
 
 if TYPE_CHECKING:
     from ..kernelfunction import KernelFunction, KernelParameter
-    from ...target import Target
+    from ...codegen.target import Target
 
 
 class JitError(Exception):
diff --git a/src/pystencils/backend/jit/legacy_cpu.py b/src/pystencils/jit/legacy_cpu.py
similarity index 100%
rename from src/pystencils/backend/jit/legacy_cpu.py
rename to src/pystencils/jit/legacy_cpu.py
diff --git a/src/pystencils/backend/jit/msvc_detection.py b/src/pystencils/jit/msvc_detection.py
similarity index 100%
rename from src/pystencils/backend/jit/msvc_detection.py
rename to src/pystencils/jit/msvc_detection.py
diff --git a/src/pystencils/kernel_decorator.py b/src/pystencils/kernel_decorator.py
index a3590d3a4..4e18d7245 100644
--- a/src/pystencils/kernel_decorator.py
+++ b/src/pystencils/kernel_decorator.py
@@ -7,7 +7,7 @@ import sympy as sp
 
 from .assignment import Assignment
 from .sympyextensions import SymbolCreator
-from pystencils.config import CreateKernelConfig
+from .codegen import CreateKernelConfig
 
 __all__ = ['kernel', 'kernel_config']
 
diff --git a/src/pystencils/kernelcreation.py b/src/pystencils/kernelcreation.py
index e718f1617..9bf3eaf67 100644
--- a/src/pystencils/kernelcreation.py
+++ b/src/pystencils/kernelcreation.py
@@ -1,420 +1,16 @@
-from __future__ import annotations
+from .codegen import Target
+from .codegen import create_kernel as _create_kernel
 
-from typing import cast, Sequence
-from dataclasses import dataclass, replace
+from warnings import warn
 
-from .target import Target
-from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO
-from .backend import KernelFunction
-from .types import create_numeric_type, PsIntegerType, PsScalarType
-from .backend.ast import PsAstNode
-from .backend.ast.structural import PsBlock, PsLoop
-from .backend.kernelcreation import (
-    KernelCreationContext,
-    KernelAnalysis,
-    FreezeExpressions,
-    Typifier,
+warn(
+    "Importing anything from `pystencils.kernelcreation` is deprecated and the module will be removed in pystencils 2.1. "
+    "Import from `pystencils` instead.",
+    FutureWarning
 )
-from .backend.constants import PsConstant
-from .backend.kernelcreation.iteration_space import (
-    create_sparse_iteration_space,
-    create_full_iteration_space,
-    FullIterationSpace,
-)
-from .backend.platforms import Platform, GenericCpu, GenericVectorCpu, GenericGpu
-from .backend.exceptions import VectorizationError
-
-from .backend.transformations import (
-    EliminateConstants,
-    LowerToC,
-    SelectFunctions,
-    CanonicalizeSymbols,
-    HoistLoopInvariantDeclarations,
-)
-from .backend.kernelfunction import (
-    create_cpu_kernel_function,
-    create_gpu_kernel_function,
-)
-
-from .simp import AssignmentCollection
-from sympy.codegen.ast import AssignmentBase
-
-
-__all__ = ["create_kernel"]
-
-
-def create_kernel(
-    assignments: AssignmentCollection | Sequence[AssignmentBase] | AssignmentBase,
-    config: CreateKernelConfig | None = None,
-    **kwargs,
-) -> KernelFunction:
-    """Create a kernel function from a set of assignments.
-
-    Args:
-        assignments: The kernel's sequence of assignments, expressed using SymPy
-        config: The configuration for the kernel translator
-        kwargs: If ``config`` is not set, it is created from the keyword arguments;
-            if it is set, its option will be overridden by any keyword arguments.
-
-    Returns:
-        The numerical kernel in pystencil's internal representation, ready to be
-        exported or compiled
-    """
-
-    if not config:
-        config = CreateKernelConfig()
-
-    if kwargs:
-        config = replace(config, **kwargs)
-
-    driver = DefaultKernelCreationDriver(config)
-    return driver(assignments)
-
-
-def get_driver(cfg: CreateKernelConfig, *, retain_intermediates: bool = False):
-    return DefaultKernelCreationDriver(cfg, retain_intermediates)
-
-
-class DefaultKernelCreationDriver:
-    def __init__(self, cfg: CreateKernelConfig, retain_intermediates: bool = False):
-        self._cfg = cfg
-
-        idx_dtype = create_numeric_type(self._cfg.index_dtype)
-        assert isinstance(idx_dtype, PsIntegerType)
-
-        self._ctx = KernelCreationContext(
-            default_dtype=create_numeric_type(self._cfg.default_dtype),
-            index_dtype=idx_dtype,
-        )
-
-        self._target = self._cfg.get_target()
-        self._platform = self._get_platform()
-
-        if retain_intermediates:
-            self._intermediates = CodegenIntermediates()
-        else:
-            self._intermediates = None
-
-    @property
-    def intermediates(self) -> CodegenIntermediates | None:
-        return self._intermediates
-
-    def __call__(
-        self,
-        assignments: AssignmentCollection | Sequence[AssignmentBase] | AssignmentBase,
-    ):
-        kernel_body = self.parse_kernel_body(assignments)
-
-        match self._platform:
-            case GenericCpu():
-                kernel_ast = self._platform.materialize_iteration_space(
-                    kernel_body, self._ctx.get_iteration_space()
-                )
-            case GenericGpu():
-                kernel_ast, gpu_threads = self._platform.materialize_iteration_space(
-                    kernel_body, self._ctx.get_iteration_space()
-                )
-
-        if self._intermediates is not None:
-            self._intermediates.materialized_ispace = kernel_ast.clone()
-
-        #   Fold and extract constants
-        elim_constants = EliminateConstants(self._ctx, extract_constant_exprs=True)
-        kernel_ast = cast(PsBlock, elim_constants(kernel_ast))
-
-        if self._intermediates is not None:
-            self._intermediates.constants_eliminated = kernel_ast.clone()
-
-        #   Target-Specific optimizations
-        if self._cfg.target.is_cpu():
-            kernel_ast = self._transform_for_cpu(kernel_ast)
-
-        #   Note: After this point, the AST may contain intrinsics, so type-dependent
-        #   transformations cannot be run any more
-
-        #   Lowering
-        lower_to_c = LowerToC(self._ctx)
-        kernel_ast = cast(PsBlock, lower_to_c(kernel_ast))
-
-        select_functions = SelectFunctions(self._platform)
-        kernel_ast = cast(PsBlock, select_functions(kernel_ast))
-
-        if self._intermediates is not None:
-            self._intermediates.lowered = kernel_ast.clone()
-
-        #   Late canonicalization pass: Canonicalize new symbols introduced by LowerToC
-
-        canonicalize = CanonicalizeSymbols(self._ctx, True)
-        kernel_ast = cast(PsBlock, canonicalize(kernel_ast))
-
-        if self._cfg.target.is_cpu():
-            return create_cpu_kernel_function(
-                self._ctx,
-                self._platform,
-                kernel_ast,
-                self._cfg.function_name,
-                self._cfg.target,
-                self._cfg.get_jit(),
-            )
-        else:
-            return create_gpu_kernel_function(
-                self._ctx,
-                self._platform,
-                kernel_ast,
-                gpu_threads,
-                self._cfg.function_name,
-                self._cfg.target,
-                self._cfg.get_jit(),
-            )
-
-    def parse_kernel_body(
-        self,
-        assignments: AssignmentCollection | Sequence[AssignmentBase] | AssignmentBase,
-    ) -> PsBlock:
-        if isinstance(assignments, AssignmentBase):
-            assignments = [assignments]
-
-        if not isinstance(assignments, AssignmentCollection):
-            assignments = AssignmentCollection(assignments)  # type: ignore
-
-        _ = _parse_simplification_hints(assignments)
-
-        analysis = KernelAnalysis(
-            self._ctx,
-            not self._cfg.skip_independence_check,
-            not self._cfg.allow_double_writes,
-        )
-        analysis(assignments)
-
-        if self._cfg.index_field is not None:
-            ispace = create_sparse_iteration_space(
-                self._ctx, assignments, index_field=self._cfg.index_field
-            )
-        else:
-            gls = self._cfg.ghost_layers
-            islice = self._cfg.iteration_slice
-
-            if gls is None and islice is None:
-                gls = AUTO
-
-            ispace = create_full_iteration_space(
-                self._ctx,
-                assignments,
-                ghost_layers=gls,
-                iteration_slice=islice,
-            )
-
-        self._ctx.set_iteration_space(ispace)
-
-        freeze = FreezeExpressions(self._ctx)
-        kernel_body = freeze(assignments)
-
-        typify = Typifier(self._ctx)
-        kernel_body = typify(kernel_body)
-
-        if self._intermediates is not None:
-            self._intermediates.parsed_body = kernel_body.clone()
-
-        return kernel_body
 
-    def _transform_for_cpu(self, kernel_ast: PsBlock):
-        canonicalize = CanonicalizeSymbols(self._ctx, True)
-        kernel_ast = cast(PsBlock, canonicalize(kernel_ast))
 
-        if self._intermediates is not None:
-            self._intermediates.cpu_canonicalize = kernel_ast.clone()
-
-        hoist_invariants = HoistLoopInvariantDeclarations(self._ctx)
-        kernel_ast = cast(PsBlock, hoist_invariants(kernel_ast))
-
-        if self._intermediates is not None:
-            self._intermediates.cpu_hoist_invariants = kernel_ast.clone()
-
-        cpu_cfg = self._cfg.cpu_optim
-
-        if cpu_cfg is None:
-            return kernel_ast
-
-        if cpu_cfg.loop_blocking:
-            raise NotImplementedError("Loop blocking not implemented yet.")
-
-        kernel_ast = self._vectorize(kernel_ast)
-
-        if cpu_cfg.openmp is not False:
-            from .backend.transformations import AddOpenMP
-
-            params = (
-                cpu_cfg.openmp
-                if isinstance(cpu_cfg.openmp, OpenMpConfig)
-                else OpenMpConfig()
-            )
-            add_omp = AddOpenMP(self._ctx, params)
-            kernel_ast = cast(PsBlock, add_omp(kernel_ast))
-
-            if self._intermediates is not None:
-                self._intermediates.cpu_openmp = kernel_ast.clone()
-
-        if cpu_cfg.use_cacheline_zeroing:
-            raise NotImplementedError("CL-zeroing not implemented yet")
-
-        return kernel_ast
-
-    def _vectorize(self, kernel_ast: PsBlock) -> PsBlock:
-        assert self._cfg.cpu_optim is not None
-        vec_config = self._cfg.cpu_optim.get_vectorization_config()
-        if vec_config is None:
-            return kernel_ast
-
-        from .backend.transformations import LoopVectorizer, SelectIntrinsics
-
-        assert isinstance(self._platform, GenericVectorCpu)
-
-        ispace = self._ctx.get_iteration_space()
-        if not isinstance(ispace, FullIterationSpace):
-            raise VectorizationError(
-                "Unable to vectorize kernel: The kernel is not using a dense iteration space."
-            )
-
-        inner_loop_coord = ispace.loop_order[-1]
-        inner_loop_dim = ispace.dimensions[inner_loop_coord]
-
-        #   Apply stride (TODO: and alignment) assumptions
-        if vec_config.assume_inner_stride_one:
-            for field in self._ctx.fields:
-                buf = self._ctx.get_buffer(field)
-                inner_stride = buf.strides[inner_loop_coord]
-                if isinstance(inner_stride, PsConstant):
-                    if inner_stride.value != 1:
-                        raise VectorizationError(
-                            f"Unable to apply assumption 'assume_inner_stride_one': "
-                            f"Field {field} has fixed stride {inner_stride} "
-                            f"set in the inner coordinate {inner_loop_coord}."
-                        )
-                else:
-                    buf.strides[inner_loop_coord] = PsConstant(1, buf.index_type)
-                    #   TODO: Communicate assumption to runtime system via a precondition
-
-        #   Call loop vectorizer
-        if vec_config.lanes is None:
-            lanes = VectorizationConfig.default_lanes(
-                self._target, cast(PsScalarType, self._ctx.default_dtype)
-            )
-        else:
-            lanes = vec_config.lanes
-
-        vectorizer = LoopVectorizer(self._ctx, lanes)
-
-        def loop_predicate(loop: PsLoop):
-            return loop.counter.symbol == inner_loop_dim.counter
-
-        kernel_ast = vectorizer.vectorize_select_loops(kernel_ast, loop_predicate)
-
-        if self._intermediates is not None:
-            self._intermediates.cpu_vectorize = kernel_ast.clone()
-
-        select_intrin = SelectIntrinsics(self._ctx, self._platform)
-        kernel_ast = cast(PsBlock, select_intrin(kernel_ast))
-
-        if self._intermediates is not None:
-            self._intermediates.cpu_select_intrins = kernel_ast.clone()
-
-        return kernel_ast
-
-    def _get_platform(self) -> Platform:
-        if Target._CPU in self._target:
-            if Target._X86 in self._target:
-                from .backend.platforms.x86 import X86VectorArch, X86VectorCpu
-
-                arch: X86VectorArch
-
-                if Target._SSE in self._target:
-                    arch = X86VectorArch.SSE
-                elif Target._AVX in self._target:
-                    arch = X86VectorArch.AVX
-                elif Target._AVX512 in self._target:
-                    if Target._FP16 in self._target:
-                        arch = X86VectorArch.AVX512_FP16
-                    else:
-                        arch = X86VectorArch.AVX512
-                else:
-                    assert False, "unreachable code"
-
-                return X86VectorCpu(self._ctx, arch)
-            elif self._target == Target.GenericCPU:
-                return GenericCpu(self._ctx)
-            else:
-                raise NotImplementedError(
-                    f"No platform is currently available for CPU target {self._target}"
-                )
-
-        elif Target._GPU in self._target:
-            match self._target:
-                case Target.SYCL:
-                    from .backend.platforms import SyclPlatform
-
-                    return SyclPlatform(self._ctx, self._cfg.gpu_indexing)
-                case Target.CUDA:
-                    from .backend.platforms import CudaPlatform
-
-                    return CudaPlatform(self._ctx, self._cfg.gpu_indexing)
-
-        raise NotImplementedError(
-            f"Code generation for target {self._target} not implemented"
-        )
-
-
-@dataclass
-class StageResult:
-    ast: PsAstNode
-    label: str
-
-
-class StageResultSlot:
-    def __init__(self, description: str | None = None):
-        self._description = description
-        self._name: str
-        self._lookup: str
-
-    def __set_name__(self, owner, name: str):
-        self._name = name
-        self._lookup = f"_{name}"
-
-    def __get__(self, obj, objtype=None) -> StageResult | None:
-        if obj is None:
-            return None
-
-        ast = getattr(obj, self._lookup, None)
-        if ast is not None:
-            descr = self._name if self._description is None else self._description
-            return StageResult(ast, descr)
-        else:
-            return None
-
-    def __set__(self, obj, val: PsAstNode | None):
-        setattr(obj, self._lookup, val)
-
-
-class CodegenIntermediates:
-    """Intermediate results produced by the code generator."""
-
-    parsed_body = StageResultSlot("Freeze & Type Deduction")
-    materialized_ispace = StageResultSlot("Iteration Space Materialization")
-    constants_eliminated = StageResultSlot("Constant Elimination")
-    cpu_canonicalize = StageResultSlot("CPU: Symbol Canonicalization")
-    cpu_hoist_invariants = StageResultSlot("CPU: Hoisting of Loop Invariants")
-    cpu_vectorize = StageResultSlot("CPU: Vectorization")
-    cpu_select_intrins = StageResultSlot("CPU: Intrinsics Selection")
-    cpu_openmp = StageResultSlot("CPU: OpenMP Instrumentation")
-    lowered = StageResultSlot("C Language Lowering")
-
-    @property
-    def available_stages(self) -> Sequence[StageResult]:
-        all_results: list[StageResult | None] = [
-            getattr(self, name)
-            for name, slot in CodegenIntermediates.__dict__.items()
-            if isinstance(slot, StageResultSlot)
-        ]
-        return tuple(filter(lambda r: r is not None, all_results))  # type: ignore
+create_kernel = _create_kernel
 
 
 def create_staggered_kernel(
@@ -424,12 +20,3 @@ def create_staggered_kernel(
         "Staggered kernels are not yet implemented for pystencils 2.0"
     )
 
-
-#   Internals
-
-
-def _parse_simplification_hints(ac: AssignmentCollection):
-    if "split_groups" in ac.simplification_hints:
-        raise NotImplementedError(
-            "Loop splitting was requested, but is not implemented yet"
-        )
diff --git a/tests/_todo/test_vectorization.py b/tests/_todo/test_vectorization.py
index fd416ab4c..de71209ff 100644
--- a/tests/_todo/test_vectorization.py
+++ b/tests/_todo/test_vectorization.py
@@ -2,7 +2,6 @@ import numpy as np
 
 import pytest
 
-import pystencils.config
 import sympy as sp
 
 import pystencils as ps
@@ -141,7 +140,7 @@ def test_aligned_and_nt_stores(openmp, instruction_set=instruction_set):
            'assume_inner_stride_one': True}
     update_rule = [ps.Assignment(f.center(), 0.25 * (g[-1, 0] + g[1, 0] + g[0, -1] + g[0, 1]))]
     # Without the base pointer spec, the inner store is not aligned
-    config = pystencils.config.CreateKernelConfig(target=dh.default_target, cpu_vectorize_info=opt, cpu_openmp=openmp)
+    config = ps.CreateKernelConfig(target=dh.default_target, cpu_vectorize_info=opt, cpu_openmp=openmp)
     ast = ps.create_kernel(update_rule, config=config)
     if instruction_set in ['sse'] or instruction_set.startswith('avx'):
         assert 'stream' in ast.instruction_set
@@ -166,7 +165,7 @@ def test_nt_stores_symbolic_size(instruction_set=instruction_set):
     update_rule = [ps.Assignment(f.center(), 0.0), ps.Assignment(g.center(), 0.0)]
     opt = {'instruction_set': instruction_set, 'assume_aligned': True, 'nontemporal': True,
            'assume_inner_stride_one': True}
-    config = pystencils.config.CreateKernelConfig(target=Target.CPU, cpu_vectorize_info=opt)
+    config = ps.CreateKernelConfig(target=Target.CPU, cpu_vectorize_info=opt)
     ast = ps.create_kernel(update_rule, config=config)
     # ps.show_code(ast)
     ast.compile()
@@ -187,7 +186,7 @@ def test_inplace_update(instruction_set=instruction_set):
         f1 @= 2 * s.tmp0
         f2 @= 2 * s.tmp0
 
-    config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set})
+    config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set})
     ast = ps.create_kernel(update_rule, config=config)
     kernel = ast.compile()
     kernel(f=arr)
@@ -379,7 +378,7 @@ def test_issue40(*_):
     eq = [ps.Assignment(sp.Symbol('rho'), 1.0),
           ps.Assignment(src[0, 0](0), sp.Rational(4, 9) * sp.Symbol('rho'))]
 
-    config = pystencils.config.CreateKernelConfig(target=Target.CPU, cpu_vectorize_info=opt, data_type='float64')
+    config = ps.CreateKernelConfig(target=Target.CPU, cpu_vectorize_info=opt, data_type='float64')
     ast = ps.create_kernel(eq, config=config)
 
     code = ps.get_code_str(ast)
diff --git a/tests/_todo/test_vectorization_specific.py b/tests/_todo/test_vectorization_specific.py
index d1930a07a..0f2c68a4d 100644
--- a/tests/_todo/test_vectorization_specific.py
+++ b/tests/_todo/test_vectorization_specific.py
@@ -2,7 +2,6 @@ import pytest
 
 import numpy as np
 
-import pystencils.config
 import sympy as sp
 
 import pystencils as ps
@@ -30,7 +29,7 @@ def test_vectorisation_varying_arch(instruction_set):
         f1 @= 2 * s.tmp0
         f2 @= 2 * s.tmp0
 
-    config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set})
+    config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set})
     ast = ps.create_kernel(update_rule, config=config)
     kernel = ast.compile()
     kernel(f=arr)
@@ -49,7 +48,7 @@ def test_vectorized_abs(instruction_set, dtype):
     f, g = ps.fields(f=arr, g=arr)
     update_rule = [ps.Assignment(g.center(), sp.Abs(f.center()))]
 
-    config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set})
+    config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set})
     ast = ps.create_kernel(update_rule, config=config)
 
     func = ast.compile()
@@ -66,20 +65,20 @@ def test_strided(instruction_set, dtype):
     if 'storeS' not in get_vector_instruction_set(dtype, instruction_set) \
             and instruction_set not in ['avx512', 'avx512vl', 'rvv'] and not instruction_set.startswith('sve'):
         with pytest.warns(UserWarning) as warn:
-            config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set},
+            config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set},
                                                           default_number_float=dtype)
             ast = ps.create_kernel(update_rule, config=config)
             assert 'Could not vectorize loop' in warn[0].message.args[0]
     else:
         with pytest.warns(None) as warn:
-            config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set},
+            config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set},
                                                           default_number_float=dtype)
             ast = ps.create_kernel(update_rule, config=config)
             assert len(warn) == 0
 
     # ps.show_code(ast)
     func = ast.compile()
-    ref_config = pystencils.config.CreateKernelConfig(default_number_float=dtype)
+    ref_config = ps.CreateKernelConfig(default_number_float=dtype)
     ref_func = ps.create_kernel(update_rule, config=ref_config).compile()
 
     # For some reason other array creations fail on the emulated ppc pipeline
@@ -115,7 +114,7 @@ def test_alignment_and_correct_ghost_layers(gl_field, gl_kernel, instruction_set
     update_rule = ps.Assignment(dst[0, 0], src[0, 0])
     opt = {'instruction_set': instruction_set, 'assume_aligned': True,
            'nontemporal': True, 'assume_inner_stride_one': True}
-    config = pystencils.config.CreateKernelConfig(target=dh.default_target,
+    config = ps.CreateKernelConfig(target=dh.default_target,
                                                   cpu_vectorize_info=opt, ghost_layers=gl_kernel)
     ast = ps.create_kernel(update_rule, config=config)
     kernel = ast.compile()
@@ -152,7 +151,7 @@ def test_vectorization_other(instruction_set, function):
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
 @pytest.mark.parametrize('field_layout', ('fzyx', 'zyxf'))
 def test_square_root(dtype, instruction_set, field_layout):
-    config = pystencils.config.CreateKernelConfig(data_type=dtype,
+    config = ps.CreateKernelConfig(data_type=dtype,
                                                   default_number_float=dtype,
                                                   cpu_vectorize_info={'instruction_set': instruction_set,
                                                                       'assume_inner_stride_one': True,
@@ -195,7 +194,7 @@ def test_square_root_2(dtype, instruction_set, padding):
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
 @pytest.mark.parametrize('padding', (True, False))
 def test_pow(dtype, instruction_set, padding):
-    config = pystencils.config.CreateKernelConfig(data_type=dtype,
+    config = ps.CreateKernelConfig(data_type=dtype,
                                                   default_number_float=dtype,
                                                   cpu_vectorize_info={'instruction_set': instruction_set,
                                                                       'assume_inner_stride_one': True,
diff --git a/tests/frontend/test_simplifications.py b/tests/frontend/test_simplifications.py
index 5e1bcb8ed..45cde7241 100644
--- a/tests/frontend/test_simplifications.py
+++ b/tests/frontend/test_simplifications.py
@@ -1,7 +1,6 @@
 from sys import version_info as vs
 import pytest
 
-import pystencils.config
 import sympy as sp
 import pystencils as ps
 
@@ -188,7 +187,7 @@ def test_evaluate_constant_terms(target):
         src[0, 0]: -sp.cos(1) + dst[0, 0]
     })
 
-    config = pystencils.config.CreateKernelConfig(target=target)
+    config = ps.CreateKernelConfig(target=target)
     ast = ps.create_kernel(assignments, config=config)
     code = ps.get_code_str(ast)
     assert 'cos(' not in code and 'cosf(' not in code
diff --git a/tests/kernelcreation/test_sum_prod.py b/tests/kernelcreation/test_sum_prod.py
index 9cd638c00..9d61d3bc4 100644
--- a/tests/kernelcreation/test_sum_prod.py
+++ b/tests/kernelcreation/test_sum_prod.py
@@ -10,7 +10,6 @@
 import pytest
 import numpy as np
 
-import pystencils.config
 import sympy as sp
 import sympy.abc
 
@@ -60,7 +59,7 @@ def test_product(dtype):
 
     assignments = ps.AssignmentCollection({x.center(): sum})
 
-    config = pystencils.config.CreateKernelConfig()
+    config = ps.CreateKernelConfig()
 
     ast = ps.create_kernel(assignments, config=config)
     code = ps.get_code_str(ast)
diff --git a/tests/nbackend/kernelcreation/test_context.py b/tests/nbackend/kernelcreation/test_context.py
index 384fc9315..200c1e34e 100644
--- a/tests/nbackend/kernelcreation/test_context.py
+++ b/tests/nbackend/kernelcreation/test_context.py
@@ -6,7 +6,7 @@ from pystencils import Field, TypedSymbol, FieldType, DynamicType
 from pystencils.backend.kernelcreation import KernelCreationContext
 from pystencils.backend.constants import PsConstant
 from pystencils.backend.memory import PsSymbol
-from pystencils.backend.properties import FieldShape, FieldStride
+from pystencils.codegen.properties import FieldShape, FieldStride
 from pystencils.backend.exceptions import KernelConstraintsError
 from pystencils.types.quick import SInt, Fp
 from pystencils.types import deconstify
diff --git a/tests/nbackend/kernelcreation/test_options.py b/tests/nbackend/kernelcreation/test_options.py
index 7fa7fc513..fefcc98fe 100644
--- a/tests/nbackend/kernelcreation/test_options.py
+++ b/tests/nbackend/kernelcreation/test_options.py
@@ -2,7 +2,7 @@ import pytest
 
 from pystencils.field import Field, FieldType
 from pystencils.types.quick import *
-from pystencils.config import (
+from pystencils.codegen.config import (
     CreateKernelConfig,
     PsOptionsError,
 )
-- 
GitLab


From 4b11cdd047c51d3d342c7873976eca8b42f66998 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 4 Dec 2024 15:16:36 +0100
Subject: [PATCH 20/31] code movements complete, all tests run

---
 src/pystencils/__init__.py                    |   2 +-
 src/pystencils/backend/constraints.py         |  22 ---
 .../backend/emission/base_printer.py          |   5 +-
 src/pystencils/backend/emission/c_printer.py  |  11 +-
 src/pystencils/backend/emission/ir_printer.py |   8 +-
 .../backend/kernelcreation/context.py         |  11 --
 .../backend/kernelcreation/iteration_space.py |   2 +-
 src/pystencils/backend/kernelfunction.py      | 113 ----------------
 src/pystencils/backend/platforms/__init__.py  |   3 +-
 src/pystencils/backend/platforms/cuda.py      |  12 +-
 .../backend/platforms/generic_gpu.py          |  68 +++-------
 src/pystencils/backend/platforms/sycl.py      |  12 +-
 .../backend/transformations/add_pragmas.py    |   1 +
 .../transformations/canonicalize_symbols.py   |   1 -
 src/pystencils/codegen/__init__.py            |   9 +-
 src/pystencils/codegen/config.py              |  13 +-
 src/pystencils/codegen/driver.py              | 125 +++++++++++++++++-
 src/pystencils/codegen/gpu.py                 |  28 ----
 src/pystencils/codegen/kernel.py              |  93 +++++++++++--
 src/pystencils/codegen/parameters.py          |  17 +--
 src/pystencils/display_utils.py               |  30 ++---
 src/pystencils/inspection.py                  |  18 +--
 src/pystencils/jit/__init__.py                |   2 +-
 src/pystencils/jit/cpu_extension_module.py    |  71 +++++-----
 src/pystencils/jit/gpu_cupy.py                |  41 +++---
 src/pystencils/jit/jit.py                     |  15 +--
 src/pystencils/jit/legacy_cpu.py              |   8 +-
 src/pystencils/kernel_wrapper.py              |   2 +-
 src/pystencils/kernelcreation.py              |   6 +-
 tests/kernelcreation/test_domain_kernels.py   |   6 +-
 tests/kernelcreation/test_index_kernels.py    |   2 +-
 tests/kernelcreation/test_iteration_slices.py |   2 +-
 tests/nbackend/test_code_printing.py          |   9 +-
 tests/nbackend/test_cpujit.py                 |   7 +-
 tests/nbackend/test_vectorization.py          |   4 +-
 35 files changed, 376 insertions(+), 403 deletions(-)
 delete mode 100644 src/pystencils/backend/constraints.py
 delete mode 100644 src/pystencils/backend/kernelfunction.py
 delete mode 100644 src/pystencils/codegen/gpu.py

diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py
index 4374ccda4..028e4b885 100644
--- a/src/pystencils/__init__.py
+++ b/src/pystencils/__init__.py
@@ -19,7 +19,7 @@ from .cache import clear_cache
 from .kernel_decorator import kernel, kernel_config
 from .kernelcreation import create_kernel, create_staggered_kernel
 from .codegen import Kernel
-from .backend.jit import no_jit
+from .jit import no_jit
 from .backend.exceptions import KernelConstraintsError
 from .slicing import make_slice
 from .spatial_coordinates import (
diff --git a/src/pystencils/backend/constraints.py b/src/pystencils/backend/constraints.py
deleted file mode 100644
index 229f6718c..000000000
--- a/src/pystencils/backend/constraints.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from __future__ import annotations
-
-from typing import Any, TYPE_CHECKING
-from dataclasses import dataclass
-
-if TYPE_CHECKING:
-    from .kernelfunction import KernelParameter
-
-
-@dataclass
-class KernelParamsConstraint:
-    condition: Any  # FIXME Implement conditions
-    message: str = ""
-
-    def to_code(self):
-        raise NotImplementedError()
-
-    def get_parameters(self) -> set[KernelParameter]:
-        raise NotImplementedError()
-
-    def __str__(self) -> str:
-        return f"{self.message} [{self.condition}]"
diff --git a/src/pystencils/backend/emission/base_printer.py b/src/pystencils/backend/emission/base_printer.py
index d721b9f89..a4358bbf3 100644
--- a/src/pystencils/backend/emission/base_printer.py
+++ b/src/pystencils/backend/emission/base_printer.py
@@ -61,7 +61,7 @@ from ..constants import PsConstant
 from ...types import PsType
 
 if TYPE_CHECKING:
-    from ...codegen import Kernel, GpuKernel
+    from ...codegen import Kernel
 
 
 class EmissionError(Exception):
@@ -175,6 +175,7 @@ class BasePrinter(ABC):
         self._indent_width = indent_width
 
     def __call__(self, obj: PsAstNode | Kernel) -> str:
+        from ...codegen import Kernel
         if isinstance(obj, Kernel):
             sig = self.print_signature(obj)
             body_code = self.visit(obj.body, PrinterCtx())
@@ -383,6 +384,8 @@ class BasePrinter(ABC):
         return signature
 
     def _func_prefix(self, func: Kernel):
+        from ...codegen import GpuKernel
+
         if isinstance(func, GpuKernel) and func.target == Target.CUDA:
             return "__global__"
         else:
diff --git a/src/pystencils/backend/emission/c_printer.py b/src/pystencils/backend/emission/c_printer.py
index 95e27bd66..90a7e54e2 100644
--- a/src/pystencils/backend/emission/c_printer.py
+++ b/src/pystencils/backend/emission/c_printer.py
@@ -1,18 +1,23 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+
 from pystencils.backend.ast.astnode import PsAstNode
 from pystencils.backend.constants import PsConstant
 from pystencils.backend.emission.base_printer import PrinterCtx, EmissionError
 from pystencils.backend.memory import PsSymbol
 from .base_printer import BasePrinter
 
-from ..kernelfunction import KernelFunction
 from ...types import PsType, PsArrayType, PsScalarType, PsTypeError
 from ..ast.expressions import PsBufferAcc
 from ..ast.vector import PsVecMemAcc
 
+if TYPE_CHECKING:
+    from ...codegen import Kernel
+
 
-def emit_code(kernel: KernelFunction):
+def emit_code(ast: PsAstNode | Kernel):
     printer = CAstPrinter()
-    return printer(kernel)
+    return printer(ast)
 
 
 class CAstPrinter(BasePrinter):
diff --git a/src/pystencils/backend/emission/ir_printer.py b/src/pystencils/backend/emission/ir_printer.py
index 124ce200d..ffb65181c 100644
--- a/src/pystencils/backend/emission/ir_printer.py
+++ b/src/pystencils/backend/emission/ir_printer.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+
 from pystencils.backend.constants import PsConstant
 from pystencils.backend.emission.base_printer import PrinterCtx
 from pystencils.backend.memory import PsSymbol
@@ -9,8 +12,11 @@ from ..ast import PsAstNode
 from ..ast.expressions import PsBufferAcc
 from ..ast.vector import PsVecMemAcc, PsVecBroadcast
 
+if TYPE_CHECKING:
+    from ...codegen import Kernel
+
 
-def emit_ir(ir: PsAstNode):
+def emit_ir(ir: PsAstNode | Kernel):
     """Emit the IR as C-like pseudo-code for inspection."""
     ir_printer = IRAstPrinter()
     return ir_printer(ir)
diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index bb7bd708d..39fb8ef6d 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -18,7 +18,6 @@ from ...types import (
     PsPointerType,
     deconstify,
 )
-from ..constraints import KernelParamsConstraint
 from ..exceptions import PsInternalCompilerError, KernelConstraintsError
 
 from .iteration_space import IterationSpace, FullIterationSpace, SparseIterationSpace
@@ -81,7 +80,6 @@ class KernelCreationContext:
 
         self._ispace: IterationSpace | None = None
 
-        self._constraints: list[KernelParamsConstraint] = []
         self._req_headers: set[str] = set()
 
         self._metadata: dict[str, Any] = dict()
@@ -96,15 +94,6 @@ class KernelCreationContext:
         """Data type used by default for index expressions"""
         return self._index_dtype
 
-    #   Constraints
-
-    def add_constraints(self, *constraints: KernelParamsConstraint):
-        self._constraints += constraints
-
-    @property
-    def constraints(self) -> tuple[KernelParamsConstraint, ...]:
-        return tuple(self._constraints)
-
     @property
     def metadata(self) -> dict[str, Any]:
         return self._metadata
diff --git a/src/pystencils/backend/kernelcreation/iteration_space.py b/src/pystencils/backend/kernelcreation/iteration_space.py
index a7802c931..031a0d843 100644
--- a/src/pystencils/backend/kernelcreation/iteration_space.py
+++ b/src/pystencils/backend/kernelcreation/iteration_space.py
@@ -457,7 +457,7 @@ def create_full_iteration_space(
     # Otherwise, if an iteration slice was specified, use that
     # Otherwise, use the inferred ghost layers
 
-    from ...codegen.config import AUTO
+    from ...codegen.config import AUTO, _AUTO_TYPE
 
     if ghost_layers is AUTO:
         if len(domain_field_accesses) > 0:
diff --git a/src/pystencils/backend/kernelfunction.py b/src/pystencils/backend/kernelfunction.py
deleted file mode 100644
index 3c7e103b3..000000000
--- a/src/pystencils/backend/kernelfunction.py
+++ /dev/null
@@ -1,113 +0,0 @@
-from __future__ import annotations
-
-from warnings import warn
-from typing import Callable, Sequence, Iterable, Any, TYPE_CHECKING
-from itertools import chain
-
-from .._deprecation import _deprecated
-
-from .ast.structural import PsBlock
-from .ast.analysis import collect_required_headers, collect_undefined_symbols
-from .memory import PsSymbol
-from ..codegen.properties import (
-    PsSymbolProperty,
-    _FieldProperty,
-    FieldShape,
-    FieldStride,
-    FieldBasePtr,
-)
-from .kernelcreation.context import KernelCreationContext
-from .platforms import Platform, GpuThreadsRange
-
-from .constraints import KernelParamsConstraint
-from ..types import PsType
-
-from ..codegen.target import Target
-from ..field import Field
-from ..sympyextensions import TypedSymbol
-
-if TYPE_CHECKING:
-    from .jit import JitBase
-
-
-
-
-def create_cpu_kernel_function(
-    ctx: KernelCreationContext,
-    platform: Platform,
-    body: PsBlock,
-    function_name: str,
-    target_spec: Target,
-    jit: JitBase,
-):
-    undef_symbols = collect_undefined_symbols(body)
-
-    params = _get_function_params(ctx, undef_symbols)
-    req_headers = _get_headers(ctx, platform, body)
-
-    kfunc = KernelFunction(
-        body, target_spec, function_name, params, req_headers, ctx.constraints, jit
-    )
-    kfunc.metadata.update(ctx.metadata)
-    return kfunc
-
-
-
-
-def create_gpu_kernel_function(
-    ctx: KernelCreationContext,
-    platform: Platform,
-    body: PsBlock,
-    threads_range: GpuThreadsRange | None,
-    function_name: str,
-    target_spec: Target,
-    jit: JitBase,
-):
-    undef_symbols = collect_undefined_symbols(body)
-
-    if threads_range is not None:
-        for threads in threads_range.num_work_items:
-            undef_symbols |= collect_undefined_symbols(threads)
-
-    params = _get_function_params(ctx, undef_symbols)
-    req_headers = _get_headers(ctx, platform, body)
-
-    kfunc = GpuKernelFunction(
-        body,
-        threads_range,
-        target_spec,
-        function_name,
-        params,
-        req_headers,
-        ctx.constraints,
-        jit,
-    )
-    kfunc.metadata.update(ctx.metadata)
-    return kfunc
-
-
-def _get_function_params(ctx: KernelCreationContext, symbols: Iterable[PsSymbol]):
-    params: list[KernelParameter] = []
-
-    from pystencils.backend.memory import BufferBasePtr
-
-    for symb in symbols:
-        props: set[PsSymbolProperty] = set()
-        for prop in symb.properties:
-            match prop:
-                case FieldShape() | FieldStride():
-                    props.add(prop)
-                case BufferBasePtr(buf):
-                    field = ctx.find_field(buf.name)
-                    props.add(FieldBasePtr(field))
-        params.append(KernelParameter(symb.name, symb.get_dtype(), props))
-
-    params.sort(key=lambda p: p.name)
-    return params
-
-
-def _get_headers(ctx: KernelCreationContext, platform: Platform, body: PsBlock):
-    req_headers = collect_required_headers(body)
-    req_headers |= platform.required_headers
-    req_headers |= ctx.required_headers
-    return req_headers
diff --git a/src/pystencils/backend/platforms/__init__.py b/src/pystencils/backend/platforms/__init__.py
index 9332453c6..589841db8 100644
--- a/src/pystencils/backend/platforms/__init__.py
+++ b/src/pystencils/backend/platforms/__init__.py
@@ -1,6 +1,6 @@
 from .platform import Platform
 from .generic_cpu import GenericCpu, GenericVectorCpu
-from .generic_gpu import GenericGpu, GpuThreadsRange
+from .generic_gpu import GenericGpu
 from .cuda import CudaPlatform
 from .x86 import X86VectorCpu, X86VectorArch
 from .sycl import SyclPlatform
@@ -12,7 +12,6 @@ __all__ = [
     "X86VectorCpu",
     "X86VectorArch",
     "GenericGpu",
-    "GpuThreadsRange",
     "CudaPlatform",
     "SyclPlatform",
 ]
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 048bcb0d5..b8f5356ae 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -1,9 +1,10 @@
+from __future__ import annotations
 from warnings import warn
 from typing import TYPE_CHECKING
 
 from ...types import constify
 from ..exceptions import MaterializationError
-from .generic_gpu import GenericGpu, GpuThreadsRange
+from .generic_gpu import GenericGpu
 
 from ..kernelcreation import (
     Typifier,
@@ -29,7 +30,7 @@ from ..literals import PsLiteral
 from ..functions import PsMathFunction, MathFunctions, CFunction
 
 if TYPE_CHECKING:
-    from ...codegen.config import GpuIndexingConfig
+    from ...codegen import GpuIndexingConfig, GpuThreadsRange
 
 int32 = PsSignedIntegerType(width=32, const=False)
 
@@ -54,6 +55,9 @@ class CudaPlatform(GenericGpu):
         self, ctx: KernelCreationContext, indexing_cfg: GpuIndexingConfig | None = None
     ) -> None:
         super().__init__(ctx)
+
+        from ...codegen.config import GpuIndexingConfig
+
         self._cfg = indexing_cfg if indexing_cfg is not None else GpuIndexingConfig()
         self._typify = Typifier(ctx)
 
@@ -134,7 +138,7 @@ class CudaPlatform(GenericGpu):
 
         if not self._cfg.manual_launch_grid:
             try:
-                threads_range = GpuThreadsRange.from_ispace(ispace)
+                threads_range = self.threads_from_ispace(ispace)
             except MaterializationError as e:
                 warn(
                     str(e.args[0])
@@ -212,7 +216,7 @@ class CudaPlatform(GenericGpu):
             body.statements = [sparse_idx_decl] + body.statements
             ast = body
 
-        return ast, GpuThreadsRange.from_ispace(ispace)
+        return ast, self.threads_from_ispace(ispace)
 
     def _linear_thread_idx(self, coord: int):
         block_size = BLOCK_DIM[coord]
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index 0512351cd..da8fa64f9 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -1,5 +1,5 @@
 from __future__ import annotations
-from typing import Sequence
+from typing import TYPE_CHECKING
 from abc import abstractmethod
 
 from ..ast.expressions import PsExpression
@@ -12,55 +12,33 @@ from ..kernelcreation.iteration_space import (
 from .platform import Platform
 from ..exceptions import MaterializationError
 
+if TYPE_CHECKING:
+    from ...codegen.kernel import GpuThreadsRange
 
-class GpuThreadsRange:
-    """Number of threads required by a GPU kernel, in order (x, y, z)."""
 
-    @staticmethod
-    def from_ispace(ispace: IterationSpace) -> GpuThreadsRange:
+class GenericGpu(Platform):
+    @abstractmethod
+    def materialize_iteration_space(
+        self, block: PsBlock, ispace: IterationSpace
+    ) -> tuple[PsBlock, GpuThreadsRange | None]:
+        pass
+
+    @classmethod
+    def threads_from_ispace(cls, ispace: IterationSpace) -> GpuThreadsRange:
+        from ...codegen.kernel import GpuThreadsRange
+
         if isinstance(ispace, FullIterationSpace):
-            return GpuThreadsRange._from_full_ispace(ispace)
+            return cls._threads_from_full_ispace(ispace)
         elif isinstance(ispace, SparseIterationSpace):
             work_items = (PsExpression.make(ispace.index_list.shape[0]),)
             return GpuThreadsRange(work_items)
         else:
             assert False
 
-    def __init__(
-        self,
-        num_work_items: Sequence[PsExpression],
-    ):
-        self._dim = len(num_work_items)
-        self._num_work_items = tuple(num_work_items)
-
-    # @property
-    # def grid_size(self) -> tuple[PsExpression, ...]:
-    #     return self._grid_size
-
-    # @property
-    # def block_size(self) -> tuple[PsExpression, ...]:
-    #     return self._block_size
-
-    @property
-    def num_work_items(self) -> tuple[PsExpression, ...]:
-        """Number of work items in (x, y, z)-order."""
-        return self._num_work_items
-
-    @property
-    def dim(self) -> int:
-        return self._dim
-    
-    def __str__(self) -> str:
-        rep = "GpuThreadsRange { "
-        rep += "; ".join(f"{x}: {w}" for x, w in zip("xyz", self._num_work_items))
-        rep += " }"
-        return rep
-    
-    def _repr_html_(self) -> str:
-        return str(self)
-
-    @staticmethod
-    def _from_full_ispace(ispace: FullIterationSpace) -> GpuThreadsRange:
+    @classmethod
+    def _threads_from_full_ispace(cls, ispace: FullIterationSpace) -> GpuThreadsRange:
+        from ...codegen.kernel import GpuThreadsRange
+        
         dimensions = ispace.dimensions_in_loop_order()[::-1]
         if len(dimensions) > 3:
             raise NotImplementedError(
@@ -81,11 +59,3 @@ class GpuThreadsRange:
 
         work_items = [ispace.actual_iterations(dim) for dim in dimensions]
         return GpuThreadsRange(work_items)
-
-
-class GenericGpu(Platform):
-    @abstractmethod
-    def materialize_iteration_space(
-        self, block: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange | None]:
-        pass
diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py
index 56615af24..9c04d6074 100644
--- a/src/pystencils/backend/platforms/sycl.py
+++ b/src/pystencils/backend/platforms/sycl.py
@@ -1,3 +1,4 @@
+from __future__ import annotations
 from typing import TYPE_CHECKING
 
 from ..functions import CFunction, PsMathFunction, MathFunctions
@@ -24,12 +25,12 @@ from ..extensions.cpp import CppMethodCall
 
 from ..kernelcreation import KernelCreationContext, AstFactory
 from ..constants import PsConstant
-from .generic_gpu import GenericGpu, GpuThreadsRange
+from .generic_gpu import GenericGpu
 from ..exceptions import MaterializationError
 from ...types import PsCustomType, PsIeeeFloatType, constify, PsIntegerType
 
 if TYPE_CHECKING:
-    from ...codegen.config import GpuIndexingConfig
+    from ...codegen import GpuIndexingConfig, GpuThreadsRange
 
 
 class SyclPlatform(GenericGpu):
@@ -38,6 +39,9 @@ class SyclPlatform(GenericGpu):
         self, ctx: KernelCreationContext, indexing_cfg: GpuIndexingConfig | None = None
     ):
         super().__init__(ctx)
+
+        from ...codegen.config import GpuIndexingConfig
+
         self._cfg = indexing_cfg if indexing_cfg is not None else GpuIndexingConfig()
 
     @property
@@ -113,7 +117,7 @@ class SyclPlatform(GenericGpu):
         id_decl = self._id_declaration(rank, id_symbol)
 
         dimensions = ispace.dimensions_in_loop_order()
-        launch_config = GpuThreadsRange.from_ispace(ispace)
+        launch_config = self.threads_from_ispace(ispace)
 
         indexing_decls = [id_decl]
         conds = []
@@ -188,7 +192,7 @@ class SyclPlatform(GenericGpu):
             body.statements = [sparse_idx_decl] + body.statements
             ast = body
 
-        return ast, GpuThreadsRange.from_ispace(ispace)
+        return ast, self.threads_from_ispace(ispace)
 
     def _item_type(self, rank: int):
         if not self._cfg.sycl_automatic_block_size:
diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py
index 47c008819..78e721f38 100644
--- a/src/pystencils/backend/transformations/add_pragmas.py
+++ b/src/pystencils/backend/transformations/add_pragmas.py
@@ -1,3 +1,4 @@
+from __future__ import annotations
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
diff --git a/src/pystencils/backend/transformations/canonicalize_symbols.py b/src/pystencils/backend/transformations/canonicalize_symbols.py
index f5b356432..c0406c25d 100644
--- a/src/pystencils/backend/transformations/canonicalize_symbols.py
+++ b/src/pystencils/backend/transformations/canonicalize_symbols.py
@@ -72,7 +72,6 @@ class CanonicalizeSymbols:
                     symb.dtype = constify(symb.dtype)
 
         #   Any symbols still alive now are function params or globals
-        #   Might use that to populate KernelFunction
         self._last_result = cc
 
         return node
diff --git a/src/pystencils/codegen/__init__.py b/src/pystencils/codegen/__init__.py
index be9fd9510..86f7f2940 100644
--- a/src/pystencils/codegen/__init__.py
+++ b/src/pystencils/codegen/__init__.py
@@ -6,8 +6,8 @@ from .config import (
     OpenMpConfig,
     GpuIndexingConfig,
 )
-
-from .kernel import Kernel
+from .parameters import Parameter
+from .kernel import Kernel, GpuKernel, GpuThreadsRange
 from .driver import create_kernel, get_driver
 
 __all__ = [
@@ -17,7 +17,10 @@ __all__ = [
     "VectorizationConfig",
     "OpenMpConfig",
     "GpuIndexingConfig",
+    "Parameter",
     "Kernel",
+    "GpuKernel",
+    "GpuThreadsRange",
     "create_kernel",
     "get_driver",
-]
\ No newline at end of file
+]
diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index 05e3ec3de..b516245fa 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -21,15 +21,14 @@ from ..types import (
 from ..defaults import DEFAULTS
 
 if TYPE_CHECKING:
-    from ..backend.jit import JitBase
+    from ..jit import JitBase
 
 
 class PsOptionsError(Exception):
     """Indicates an option clash in the `CreateKernelConfig`."""
 
 
-class _AUTO_TYPE:
-    ...
+class _AUTO_TYPE: ...  # noqa: E701
 
 
 AUTO = _AUTO_TYPE()
@@ -336,12 +335,12 @@ class CreateKernelConfig:
         """Returns either the user-specified JIT compiler, or infers one from the target if none is given."""
         if self.jit is None:
             if self.target.is_cpu():
-                from ..backend.jit import LegacyCpuJit
+                from ..jit import LegacyCpuJit
 
                 return LegacyCpuJit()
             elif self.target == Target.CUDA:
                 try:
-                    from ..backend.jit.gpu_cupy import CupyJit
+                    from ..jit.gpu_cupy import CupyJit
 
                     if (
                         self.gpu_indexing is not None
@@ -352,12 +351,12 @@ class CreateKernelConfig:
                         return CupyJit()
 
                 except ImportError:
-                    from ..backend.jit import no_jit
+                    from ..jit import no_jit
 
                     return no_jit
 
             elif self.target == Target.SYCL:
-                from ..backend.jit import no_jit
+                from ..jit import no_jit
 
                 return no_jit
             else:
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index bc690a598..0fd49b248 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -1,15 +1,19 @@
 from __future__ import annotations
-
-from typing import cast, Sequence
+from typing import cast, Sequence, Iterable, TYPE_CHECKING
 from dataclasses import dataclass, replace
 
 from .target import Target
 from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO
-from .kernel import Kernel
+from .kernel import Kernel, GpuKernel, GpuThreadsRange
+from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr
+from .parameters import Parameter
 
 from ..types import create_numeric_type, PsIntegerType, PsScalarType
+
+from ..backend.memory import PsSymbol
 from ..backend.ast import PsAstNode
 from ..backend.ast.structural import PsBlock, PsLoop
+from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers
 from ..backend.kernelcreation import (
     KernelCreationContext,
     KernelAnalysis,
@@ -22,7 +26,12 @@ from ..backend.kernelcreation.iteration_space import (
     create_full_iteration_space,
     FullIterationSpace,
 )
-from ..backend.platforms import Platform, GenericCpu, GenericVectorCpu, GenericGpu
+from ..backend.platforms import (
+    Platform,
+    GenericCpu,
+    GenericVectorCpu,
+    GenericGpu,
+)
 from ..backend.exceptions import VectorizationError
 
 from ..backend.transformations import (
@@ -36,6 +45,9 @@ from ..backend.transformations import (
 from ..simp import AssignmentCollection
 from sympy.codegen.ast import AssignmentBase
 
+if TYPE_CHECKING:
+    from ..jit import JitBase
+
 
 __all__ = ["create_kernel"]
 
@@ -238,7 +250,7 @@ class DefaultKernelCreationDriver:
         kernel_ast = self._vectorize(kernel_ast)
 
         if cpu_cfg.openmp is not False:
-            from .backend.transformations import AddOpenMP
+            from ..backend.transformations import AddOpenMP
 
             params = (
                 cpu_cfg.openmp
@@ -262,7 +274,7 @@ class DefaultKernelCreationDriver:
         if vec_config is None:
             return kernel_ast
 
-        from .backend.transformations import LoopVectorizer, SelectIntrinsics
+        from ..backend.transformations import LoopVectorizer, SelectIntrinsics
 
         assert isinstance(self._platform, GenericVectorCpu)
 
@@ -359,6 +371,107 @@ class DefaultKernelCreationDriver:
             f"Code generation for target {self._target} not implemented"
         )
 
+    def _get_function_params(self, symbols: Iterable[PsSymbol]):
+        """Build the kernel's parameter list, sorted by name, from ``symbols``."""
+        from pystencils.backend.memory import BufferBasePtr
+
+        def parameter_of(symbol: PsSymbol) -> Parameter:
+            #   Shape/stride properties are kept; base pointers are re-bound to fields
+            kept: set[PsSymbolProperty] = set()
+            for candidate in symbol.properties:
+                match candidate:
+                    case FieldShape() | FieldStride():
+                        kept.add(candidate)
+                    case BufferBasePtr(buffer):
+                        kept.add(FieldBasePtr(self._ctx.find_field(buffer.name)))
+            return Parameter(symbol.name, symbol.get_dtype(), kept)
+
+        return sorted(
+            (parameter_of(symbol) for symbol in symbols), key=lambda p: p.name
+        )
+
+    def _get_headers(self, body: PsBlock):
+        """Collect headers required by the body, the platform, and the context."""
+        headers = collect_required_headers(body)
+        headers |= self._platform.required_headers | self._ctx.required_headers
+        return headers
+    
+
+def create_cpu_kernel_function(
+    ctx: KernelCreationContext,
+    platform: Platform,
+    body: PsBlock,
+    function_name: str,
+    target_spec: Target,
+    jit: JitBase,
+):
+    """Assemble the `Kernel` object for a CPU target from a finished kernel body.
+
+    Parameters are derived from the symbols left undefined in ``body``.
+    """
+    parameters = _get_function_params(ctx, collect_undefined_symbols(body))
+    headers = _get_headers(ctx, platform, body)
+
+    kernel = Kernel(body, target_spec, function_name, parameters, headers, jit)
+    kernel.metadata.update(ctx.metadata)
+    return kernel
+
+
+def create_gpu_kernel_function(
+    ctx: KernelCreationContext,
+    platform: Platform,
+    body: PsBlock,
+    threads_range: GpuThreadsRange | None,
+    function_name: str,
+    target_spec: Target,
+    jit: JitBase,
+):
+    """Assemble the `GpuKernel` object for a GPU target from a finished kernel body.
+
+    ``threads_range`` may be ``None``; it then contributes no parameters and is
+    passed on to the kernel object unchanged.
+    """
+    undef_symbols = collect_undefined_symbols(body)
+    if threads_range is not None:
+        for threads in threads_range.num_work_items:
+            undef_symbols |= collect_undefined_symbols(threads)
+
+    params = _get_function_params(ctx, undef_symbols)
+    req_headers = _get_headers(ctx, platform, body)
+
+    kfunc = GpuKernel(
+        body, threads_range, target_spec, function_name, params, req_headers, jit
+    )
+    kfunc.metadata.update(ctx.metadata)
+    return kfunc
+
+
+def _get_function_params(ctx: KernelCreationContext, symbols: Iterable[PsSymbol]):
+    """Translate ``symbols`` into kernel `Parameter` objects, sorted by name."""
+    from pystencils.backend.memory import BufferBasePtr
+
+    def parameter_of(symbol: PsSymbol) -> Parameter:
+        kept: set[PsSymbolProperty] = set()
+        for candidate in symbol.properties:
+            match candidate:
+                case FieldShape() | FieldStride():
+                    kept.add(candidate)
+                case BufferBasePtr(buffer):
+                    #   Re-express the base pointer via the field found under its name
+                    owner = ctx.find_field(buffer.name)
+                    kept.add(FieldBasePtr(owner))
+        return Parameter(symbol.name, symbol.get_dtype(), kept)
+
+    unsorted = (parameter_of(symbol) for symbol in symbols)
+    return sorted(unsorted, key=lambda p: p.name)
+
+
+def _get_headers(ctx: KernelCreationContext, platform: Platform, body: PsBlock):
+    """Collect headers required by the body, the platform, and the context."""
+    headers = collect_required_headers(body)
+    headers |= platform.required_headers | ctx.required_headers
+    return headers
+
 
 @dataclass
 class StageResult:
diff --git a/src/pystencils/codegen/gpu.py b/src/pystencils/codegen/gpu.py
deleted file mode 100644
index 9cce9b55b..000000000
--- a/src/pystencils/codegen/gpu.py
+++ /dev/null
@@ -1,28 +0,0 @@
-
-
-from .kernel import Kernel
-
-
-class GpuKernel(Kernel):
-    """Internal representation of a kernel function targeted at CUDA GPUs."""
-
-    def __init__(
-        self,
-        body: PsBlock,
-        threads_range: GpuThreadsRange | None,
-        target: Target,
-        name: str,
-        parameters: Sequence[KernelParameter],
-        required_headers: set[str],
-        constraints: Sequence[KernelParamsConstraint],
-        jit: JitBase,
-    ):
-        super().__init__(
-            body, target, name, parameters, required_headers, constraints, jit
-        )
-        self._threads_range = threads_range
-
-    @property
-    def threads_range(self) -> GpuThreadsRange | None:
-        """Object exposing the total size of the launch grid this kernel expects to be executed with."""
-        return self._threads_range
diff --git a/src/pystencils/codegen/kernel.py b/src/pystencils/codegen/kernel.py
index 6a0a6d576..c4ad860b6 100644
--- a/src/pystencils/codegen/kernel.py
+++ b/src/pystencils/codegen/kernel.py
@@ -1,21 +1,19 @@
 from __future__ import annotations
 
 from warnings import warn
-from typing import Callable, Sequence, Iterable, Any, TYPE_CHECKING
+from typing import Callable, Sequence, Any, TYPE_CHECKING
 from itertools import chain
 
-from .._deprecation import _deprecated
-
-from ..backend.ast.structural import PsBlock
-from ..backend.ast.analysis import collect_required_headers, collect_undefined_symbols
-from ..backend.memory import PsSymbol
-
-from ..types import PsType
-
 from .target import Target
 from .parameters import Parameter
+from ..backend.ast.structural import PsBlock
+from ..backend.ast.expressions import PsExpression
 from ..field import Field
-from ..sympyextensions import TypedSymbol
+
+from .._deprecation import _deprecated
+
+if TYPE_CHECKING:
+    from ..jit import JitBase
 
 
 class Kernel:
@@ -23,7 +21,7 @@ class Kernel:
 
     The kernel object is the final result of the translation process.
     It is immutable, and its AST should not be altered any more, either, as this
-    might invalidate information about the kernel already stored in the `KernelFunction` object.
+    might invalidate information about the kernel already stored in the kernel object.
     """
 
     def __init__(
@@ -78,7 +76,7 @@ class Kernel:
         return self._params
 
     def get_parameters(self) -> tuple[Parameter, ...]:
-        _deprecated("KernelFunction.get_parameters", "KernelFunction.parameters")
+        _deprecated("Kernel.get_parameters", "Kernel.parameters")
         return self.parameters
 
     def get_fields(self) -> set[Field]:
@@ -97,6 +95,77 @@ class Kernel:
     def required_headers(self) -> set[str]:
         return self._required_headers
 
+    def get_c_code(self) -> str:
+        """Print this kernel using a default-constructed `CAstPrinter`."""
+        from ..backend.emission import CAstPrinter
+
+        return CAstPrinter()(self)
+
+    def get_ir_code(self) -> str:
+        """Print this kernel using a default-constructed `IRAstPrinter`."""
+        from ..backend.emission import IRAstPrinter
+
+        return IRAstPrinter()(self)
+
     def compile(self) -> Callable[..., None]:
         """Invoke the underlying just-in-time compiler to obtain the kernel as an executable Python function."""
         return self._jit.compile(self)
+
+
+class GpuKernel(Kernel):
+    """Kernel object for GPU targets; additionally carries its expected thread range."""
+
+    def __init__(
+        self,
+        body: PsBlock,
+        threads_range: GpuThreadsRange | None,
+        target: Target,
+        name: str,
+        parameters: Sequence[Parameter],
+        required_headers: set[str],
+        jit: JitBase,
+    ):
+        super().__init__(body, target, name, parameters, required_headers, jit)
+        self._threads_range = threads_range
+
+    @property
+    def threads_range(self) -> GpuThreadsRange | None:
+        """The launch grid size this kernel expects to be executed with; may be `None`."""
+        return self._threads_range
+
+
+class GpuThreadsRange:
+    """Number of threads required by a GPU kernel, in order (x, y, z).
+
+    Args:
+        num_work_items: Work item counts per dimension, in (x, y, z)-order;
+            the number of entries determines `dim`.
+    """
+
+    def __init__(
+        self,
+        num_work_items: Sequence[PsExpression],
+    ):
+        #   Store an immutable copy, decoupled from the caller's sequence
+        self._dim = len(num_work_items)
+        self._num_work_items = tuple(num_work_items)
+
+    @property
+    def num_work_items(self) -> tuple[PsExpression, ...]:
+        """Number of work items in (x, y, z)-order."""
+        return self._num_work_items
+
+    @property
+    def dim(self) -> int:
+        """Number of dimensions, i.e. the length of `num_work_items`."""
+        return self._dim
+
+    def __str__(self) -> str:
+        #   `zip` stops at the shorter operand, so at most three entries are listed
+        entries = "; ".join(
+            f"{axis}: {count}" for axis, count in zip("xyz", self._num_work_items)
+        )
+        return "GpuThreadsRange { " + entries + " }"
+
+    def _repr_html_(self) -> str:
+        return str(self)
diff --git a/src/pystencils/codegen/parameters.py b/src/pystencils/codegen/parameters.py
index 1e01e07aa..d40eae220 100644
--- a/src/pystencils/codegen/parameters.py
+++ b/src/pystencils/codegen/parameters.py
@@ -1,14 +1,8 @@
 from __future__ import annotations
 
 from warnings import warn
-from typing import Callable, Sequence, Iterable, Any, TYPE_CHECKING
-from itertools import chain
+from typing import Sequence, Iterable
 
-from .._deprecation import _deprecated
-
-from ..backend.ast.structural import PsBlock
-from ..backend.ast.analysis import collect_required_headers, collect_undefined_symbols
-from ..backend.memory import PsSymbol
 from .properties import (
     PsSymbolProperty,
     _FieldProperty,
@@ -16,16 +10,13 @@ from .properties import (
     FieldStride,
     FieldBasePtr,
 )
-
 from ..types import PsType
-
-from .target import Target
 from ..field import Field
 from ..sympyextensions import TypedSymbol
 
 
 class Parameter:
-    """Parameter to a `KernelFunction`."""
+    """Parameter to an output object of the code generator."""
 
     __match_args__ = ("name", "dtype", "properties")
 
@@ -45,7 +36,7 @@ class Parameter:
                         lambda p: isinstance(p, _FieldProperty), self._properties
                     )
                 ),
-                key=lambda f: f.name
+                key=lambda f: f.name,
             )
         )
 
@@ -139,4 +130,4 @@ class Parameter:
             "Use `param.fields[0].name` instead.",
             DeprecationWarning,
         )
-        return self._fields[0].name
\ No newline at end of file
+        return self._fields[0].name
diff --git a/src/pystencils/display_utils.py b/src/pystencils/display_utils.py
index 7f110c9c0..919dea4a8 100644
--- a/src/pystencils/display_utils.py
+++ b/src/pystencils/display_utils.py
@@ -2,9 +2,8 @@ from typing import Any, Dict, Optional
 
 import sympy as sp
 
-from pystencils.backend import KernelFunction
-from pystencils.kernel_wrapper import KernelWrapper as OldKernelWrapper
-from .backend.jit import KernelWrapper
+from .codegen import Kernel
+from .jit import KernelWrapper
 
 
 def to_dot(expr: sp.Expr, graph_style: Optional[Dict[str, Any]] = None, short=True):
@@ -43,32 +42,27 @@ def highlight_cpp(code: str):
     return HTML(highlight(code, CppLexer(), HtmlFormatter()))
 
 
-def get_code_obj(ast: KernelWrapper | KernelFunction, custom_backend=None):
+def get_code_obj(ast: KernelWrapper | Kernel, custom_backend=None):
     """Returns an object to display generated code (C/C++ or CUDA)
 
     Can either be displayed as HTML in Jupyter notebooks or printed as normal string.
     """
-    from pystencils.backend.emission import emit_code
-
-    if isinstance(ast, OldKernelWrapper):
-        ast = ast.ast
-    elif isinstance(ast, KernelWrapper):
-        ast = ast.kernel_function
+    if isinstance(ast, KernelWrapper):
+        func = ast.kernel_function
+    else:
+        func = ast
 
     class CodeDisplay:
-        def __init__(self, ast_input):
-            self.ast = ast_input
-
         def _repr_html_(self):
-            return highlight_cpp(emit_code(self.ast)).__html__()
+            return highlight_cpp(func.get_c_code()).__html__()
 
         def __str__(self):
-            return emit_code(self.ast)
+            return func.get_c_code()
 
         def __repr__(self):
-            return emit_code(self.ast)
+            return func.get_c_code()
 
-    return CodeDisplay(ast)
+    return CodeDisplay()
 
 
 def get_code_str(ast, custom_backend=None):
@@ -88,7 +82,7 @@ def _isnotebook():
         return False
 
 
-def show_code(ast: KernelWrapper | KernelFunction, custom_backend=None):
+def show_code(ast: KernelWrapper | Kernel, custom_backend=None):
     code = get_code_obj(ast, custom_backend)
 
     if _isnotebook():
diff --git a/src/pystencils/inspection.py b/src/pystencils/inspection.py
index 7fa3047c6..7f050c745 100644
--- a/src/pystencils/inspection.py
+++ b/src/pystencils/inspection.py
@@ -2,8 +2,8 @@ from typing import overload
 
 from .backend.ast import PsAstNode
 from .backend.emission import CAstPrinter, IRAstPrinter, EmissionError
-from .backend.kernelfunction import KernelFunction
-from .kernelcreation import StageResult, CodegenIntermediates
+from .codegen import Kernel
+from .codegen.driver import StageResult, CodegenIntermediates
 from abc import ABC, abstractmethod
 
 _UNABLE_TO_DISPLAY_CPP = """
@@ -37,7 +37,7 @@ class CodeInspectionBase(ABC):
         self._ir_printer = IRAstPrinter(annotate_constants=False)
         self._c_printer = CAstPrinter()
 
-    def _ir_tab(self, ir_obj: PsAstNode | KernelFunction):
+    def _ir_tab(self, ir_obj: PsAstNode | Kernel):
         import ipywidgets as widgets
 
         ir = self._ir_printer(ir_obj)
@@ -45,7 +45,7 @@ class CodeInspectionBase(ABC):
         self._apply_tab_layout(ir_tab)
         return ir_tab
 
-    def _cpp_tab(self, ir_obj: PsAstNode | KernelFunction):
+    def _cpp_tab(self, ir_obj: PsAstNode | Kernel):
         import ipywidgets as widgets
 
         try:
@@ -64,7 +64,7 @@ class CodeInspectionBase(ABC):
         self._apply_tab_layout(cpp_tab)
         return cpp_tab
 
-    def _graphviz_tab(self, ir_obj: PsAstNode | KernelFunction):
+    def _graphviz_tab(self, ir_obj: PsAstNode | Kernel):
         import ipywidgets as widgets
 
         graphviz_tab = widgets.HTML(_GRAPHVIZ_NOT_IMPLEMENTED)
@@ -124,7 +124,7 @@ class AstInspection(CodeInspectionBase):
 
 
 class KernelInspection(CodeInspectionBase):
-    def __init__(self, kernel: KernelFunction) -> None:
+    def __init__(self, kernel: Kernel) -> None:
         super().__init__()
         self._kernel = kernel
 
@@ -190,7 +190,7 @@ def inspect(obj: PsAstNode): ...
 
 
 @overload
-def inspect(obj: KernelFunction): ...
+def inspect(obj: Kernel): ...
 
 
 @overload
@@ -207,7 +207,7 @@ def inspect(obj):
     When run inside a Jupyter notebook, this function displays an inspection widget
     for the following types of objects:
     - `PsAstNode`
-    - `KernelFunction`
+    - `Kernel`
     - `StageResult`
     - `CodegenIntermediates`
     """
@@ -217,7 +217,7 @@ def inspect(obj):
     match obj:
         case PsAstNode():
             preview = AstInspection(obj)
-        case KernelFunction():
+        case Kernel():
             preview = KernelInspection(obj)
         case StageResult(ast, _):
             preview = AstInspection(ast)
diff --git a/src/pystencils/jit/__init__.py b/src/pystencils/jit/__init__.py
index f45cb9bff..a47dc4aa6 100644
--- a/src/pystencils/jit/__init__.py
+++ b/src/pystencils/jit/__init__.py
@@ -2,7 +2,7 @@
 JIT compilation is realized by subclasses of `JitBase`.
 A JIT compiler may freely be created and configured by the user.
 It can then be passed to `create_kernel` using the ``jit`` argument of
-`CreateKernelConfig`, in which case it is hooked into the `KernelFunction.compile` method
+`CreateKernelConfig`, in which case it is hooked into the `Kernel.compile` method
 of the generated kernel function::
 
     my_jit = MyJit()
diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py
index 444167f9d..befb033e6 100644
--- a/src/pystencils/jit/cpu_extension_module.py
+++ b/src/pystencils/jit/cpu_extension_module.py
@@ -9,22 +9,19 @@ from textwrap import indent
 
 import numpy as np
 
-from ..exceptions import PsInternalCompilerError
-from ..kernelfunction import (
-    KernelFunction,
-    KernelParameter,
+from ..codegen import (
+    Kernel,
+    Parameter,
 )
-from ...codegen.properties import FieldBasePtr, FieldShape, FieldStride
-from ..constraints import KernelParamsConstraint
-from ...types import (
+from ..codegen.properties import FieldBasePtr, FieldShape, FieldStride
+from ..types import (
     PsType,
     PsUnsignedIntegerType,
     PsSignedIntegerType,
     PsIeeeFloatType,
 )
-from ...types.quick import Fp, SInt, UInt
-from ...field import Field
-from ..emission import emit_code
+from ..types.quick import Fp, SInt, UInt
+from ..field import Field
 
 
 class PsKernelExtensioNModule:
@@ -38,11 +35,11 @@ class PsKernelExtensioNModule:
         self._module_name = module_name
 
         if custom_backend is not None:
-            raise PsInternalCompilerError(
+            raise Exception(
                 "The `custom_backend` parameter exists only for interface compatibility and cannot be set."
             )
 
-        self._kernels: dict[str, KernelFunction] = dict()
+        self._kernels: dict[str, Kernel] = dict()
         self._code_string: str | None = None
         self._code_hash: str | None = None
 
@@ -50,7 +47,7 @@ class PsKernelExtensioNModule:
     def module_name(self) -> str:
         return self._module_name
 
-    def add_function(self, kernel_function: KernelFunction, name: str | None = None):
+    def add_function(self, kernel_function: Kernel, name: str | None = None):
         if name is None:
             name = kernel_function.name
 
@@ -98,7 +95,7 @@ class PsKernelExtensioNModule:
             old_name = kernel.name
             kernel.name = f"kernel_{name}"
 
-            code += emit_code(kernel)
+            code += kernel.get_c_code()
             code += "\n"
             code += emit_call_wrapper(name, kernel)
             code += "\n"
@@ -122,14 +119,14 @@ class PsKernelExtensioNModule:
         print(self._code_string, file=file)
 
 
-def emit_call_wrapper(function_name: str, kernel: KernelFunction) -> str:
+def emit_call_wrapper(function_name: str, kernel: Kernel) -> str:
     builder = CallWrapperBuilder()
 
     for p in kernel.parameters:
         builder.extract_parameter(p)
 
-    for c in kernel.constraints:
-        builder.check_constraint(c)
+    # for c in kernel.constraints:
+    #     builder.check_constraint(c)
 
     builder.call(kernel, kernel.parameters)
 
@@ -206,8 +203,8 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
         self._array_extractions: dict[Field, str] = dict()
         self._array_frees: dict[Field, str] = dict()
 
-        self._array_assoc_var_extractions: dict[KernelParameter, str] = dict()
-        self._scalar_extractions: dict[KernelParameter, str] = dict()
+        self._array_assoc_var_extractions: dict[Parameter, str] = dict()
+        self._scalar_extractions: dict[Parameter, str] = dict()
 
         self._constraint_checks: list[str] = []
 
@@ -223,7 +220,7 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
                 return "PyLong_AsUnsignedLong"
 
             case _:
-                raise PsInternalCompilerError(
+                raise ValueError(
                     f"Don't know how to cast Python objects to {dtype}"
                 )
 
@@ -267,7 +264,7 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
 
         return self._array_buffers[field]
 
-    def extract_scalar(self, param: KernelParameter) -> str:
+    def extract_scalar(self, param: Parameter) -> str:
         if param not in self._scalar_extractions:
             extract_func = self._scalar_extractor(param.dtype)
             code = self.TMPL_EXTRACT_SCALAR.format(
@@ -279,7 +276,7 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
 
         return param.name
 
-    def extract_array_assoc_var(self, param: KernelParameter) -> str:
+    def extract_array_assoc_var(self, param: Parameter) -> str:
         if param not in self._array_assoc_var_extractions:
             field = param.fields[0]
             buffer = self.extract_field(field)
@@ -305,31 +302,31 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
 
         return param.name
 
-    def extract_parameter(self, param: KernelParameter):
+    def extract_parameter(self, param: Parameter):
         if param.is_field_parameter:
             self.extract_array_assoc_var(param)
         else:
             self.extract_scalar(param)
 
-    def check_constraint(self, constraint: KernelParamsConstraint):
-        variables = constraint.get_parameters()
+#     def check_constraint(self, constraint: KernelParamsConstraint):
+#         variables = constraint.get_parameters()
 
-        for var in variables:
-            self.extract_parameter(var)
+#         for var in variables:
+#             self.extract_parameter(var)
 
-        cond = constraint.to_code()
+#         cond = constraint.to_code()
 
-        code = f"""
-if(!({cond}))
-{{
-    PyErr_SetString(PyExc_ValueError, "Violated constraint: {constraint}"); 
-    return NULL;
-}}
-"""
+#         code = f"""
+# if(!({cond}))
+# {{
+#     PyErr_SetString(PyExc_ValueError, "Violated constraint: {constraint}"); 
+#     return NULL;
+# }}
+# """
 
-        self._constraint_checks.append(code)
+#         self._constraint_checks.append(code)
 
-    def call(self, kernel: KernelFunction, params: tuple[KernelParameter, ...]):
+    def call(self, kernel: Kernel, params: tuple[Parameter, ...]):
         param_list = ", ".join(p.name for p in params)
         self._call = f"{kernel.name} ({param_list});"
 
diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index 2f5753e05..c208ac219 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -8,21 +8,20 @@ try:
 except ImportError:
     HAVE_CUPY = False
 
-from ...codegen import Target
-from ...field import FieldType
+from ..codegen import Target
+from ..field import FieldType
 
-from ...types import PsType
+from ..types import PsType
 from .jit import JitBase, JitError, KernelWrapper
-from ..kernelfunction import (
-    KernelFunction,
-    GpuKernelFunction,
-    KernelParameter,
+from ..codegen import (
+    Kernel,
+    GpuKernel,
+    Parameter,
 )
-from ...codegen.properties import FieldShape, FieldStride, FieldBasePtr
-from ..emission import emit_code
-from ...types import PsStructType
+from ..codegen.properties import FieldShape, FieldStride, FieldBasePtr
+from ..types import PsStructType
 
-from ...include import get_pystencils_include_path
+from ..include import get_pystencils_include_path
 
 
 @dataclass
@@ -34,18 +33,18 @@ class LaunchGrid:
 class CupyKernelWrapper(KernelWrapper):
     def __init__(
         self,
-        kfunc: GpuKernelFunction,
+        kfunc: GpuKernel,
         raw_kernel: Any,
         block_size: tuple[int, int, int],
     ):
-        self._kfunc: GpuKernelFunction = kfunc
+        self._kfunc: GpuKernel = kfunc
         self._raw_kernel = raw_kernel
         self._block_size = block_size
         self._num_blocks: tuple[int, int, int] | None = None
         self._args_cache: dict[Any, tuple] = dict()
 
     @property
-    def kernel_function(self) -> GpuKernelFunction:
+    def kernel_function(self) -> GpuKernel:
         return self._kfunc
 
     @property
@@ -105,7 +104,7 @@ class CupyKernelWrapper(KernelWrapper):
         field_shapes = set()
         index_shapes = set()
 
-        def check_shape(field_ptr: KernelParameter, arr: cp.ndarray):
+        def check_shape(field_ptr: Parameter, arr: cp.ndarray):
             field = field_ptr.fields[0]
 
             if field.has_fixed_shape:
@@ -190,7 +189,7 @@ class CupyKernelWrapper(KernelWrapper):
                 add_arg(kparam.name, val, kparam.dtype)
 
         #   Determine launch grid
-        from ..ast.expressions import evaluate_expression
+        from ..backend.ast.expressions import evaluate_expression
 
         symbolic_threads_range = self._kfunc.threads_range
 
@@ -243,13 +242,13 @@ class CupyJit(JitBase):
             tuple(default_block_size) + (1,) * (3 - len(default_block_size)),
         )
 
-    def compile(self, kfunc: KernelFunction) -> KernelWrapper:
+    def compile(self, kfunc: Kernel) -> KernelWrapper:
         if not HAVE_CUPY:
             raise JitError(
                 "`cupy` is not installed: just-in-time-compilation of CUDA kernels is unavailable."
             )
 
-        if not isinstance(kfunc, GpuKernelFunction) or kfunc.target != Target.CUDA:
+        if not isinstance(kfunc, GpuKernel) or kfunc.target != Target.CUDA:
             raise ValueError(
                 "The CupyJit just-in-time compiler only accepts kernels generated for CUDA or HIP"
             )
@@ -269,7 +268,7 @@ class CupyJit(JitBase):
         options.append("-I" + get_pystencils_include_path())
         return tuple(options)
 
-    def _prelude(self, kfunc: GpuKernelFunction) -> str:
+    def _prelude(self, kfunc: GpuKernel) -> str:
         headers = self._runtime_headers
         headers |= kfunc.required_headers
 
@@ -286,6 +285,6 @@ class CupyJit(JitBase):
 
         return code
 
-    def _kernel_code(self, kfunc: GpuKernelFunction) -> str:
-        kernel_code = emit_code(kfunc)
+    def _kernel_code(self, kfunc: GpuKernel) -> str:
+        kernel_code = kfunc.get_c_code()
         return f'extern "C" {kernel_code}'
diff --git a/src/pystencils/jit/jit.py b/src/pystencils/jit/jit.py
index 250bba240..4998c14ad 100644
--- a/src/pystencils/jit/jit.py
+++ b/src/pystencils/jit/jit.py
@@ -3,8 +3,7 @@ from typing import Sequence, TYPE_CHECKING
 from abc import ABC, abstractmethod
 
 if TYPE_CHECKING:
-    from ..kernelfunction import KernelFunction, KernelParameter
-    from ...codegen.target import Target
+    from ..codegen import Kernel, Parameter, Target
 
 
 class JitError(Exception):
@@ -14,7 +13,7 @@ class JitError(Exception):
 class KernelWrapper(ABC):
     """Wrapper around a compiled and executable pystencils kernel."""
 
-    def __init__(self, kfunc: KernelFunction) -> None:
+    def __init__(self, kfunc: Kernel) -> None:
         self._kfunc = kfunc
 
     @abstractmethod
@@ -22,11 +21,11 @@ class KernelWrapper(ABC):
         pass
 
     @property
-    def kernel_function(self) -> KernelFunction:
+    def kernel_function(self) -> Kernel:
         return self._kfunc
     
     @property
-    def ast(self) -> KernelFunction:
+    def ast(self) -> Kernel:
         return self._kfunc
     
     @property
@@ -34,7 +33,7 @@ class KernelWrapper(ABC):
         return self._kfunc.target
     
     @property
-    def parameters(self) -> Sequence[KernelParameter]:
+    def parameters(self) -> Sequence[Parameter]:
         return self._kfunc.parameters
 
     @property
@@ -48,14 +47,14 @@ class JitBase(ABC):
     """Base class for just-in-time compilation interfaces implemented in pystencils."""
 
     @abstractmethod
-    def compile(self, kernel: KernelFunction) -> KernelWrapper:
+    def compile(self, kernel: Kernel) -> KernelWrapper:
         """Compile a kernel function and return a callable object which invokes the kernel."""
 
 
 class NoJit(JitBase):
     """Not a JIT compiler: Used to explicitly disable JIT compilation on an AST."""
 
-    def compile(self, kernel: KernelFunction) -> KernelWrapper:
+    def compile(self, kernel: Kernel) -> KernelWrapper:
         raise JitError(
             "Just-in-time compilation of this kernel was explicitly disabled."
         )
diff --git a/src/pystencils/jit/legacy_cpu.py b/src/pystencils/jit/legacy_cpu.py
index 1acd1b22a..514e9b60e 100644
--- a/src/pystencils/jit/legacy_cpu.py
+++ b/src/pystencils/jit/legacy_cpu.py
@@ -61,7 +61,7 @@ import time
 import warnings
 
 
-from ..kernelfunction import KernelFunction
+from ..codegen import Kernel
 from .jit import JitBase, KernelWrapper
 from .cpu_extension_module import PsKernelExtensioNModule
 
@@ -71,7 +71,7 @@ from pystencils.utils import atomic_file_write, recursive_dict_update
 
 
 class CpuKernelWrapper(KernelWrapper):
-    def __init__(self, kfunc: KernelFunction, compiled_kernel: Callable[..., None]) -> None:
+    def __init__(self, kfunc: Kernel, compiled_kernel: Callable[..., None]) -> None:
         super().__init__(kfunc)
         self._compiled_kernel = compiled_kernel
 
@@ -86,7 +86,7 @@ class CpuKernelWrapper(KernelWrapper):
 class LegacyCpuJit(JitBase):
     """Wrapper around ``pystencils.cpu.cpujit``"""
 
-    def compile(self, kernel: KernelFunction) -> KernelWrapper:
+    def compile(self, kernel: Kernel) -> KernelWrapper:
         return compile_and_load(kernel)
 
 
@@ -436,7 +436,7 @@ def compile_module(code, code_hash, base_dir, compile_flags=None):
     return lib_file
 
 
-def compile_and_load(kernel: KernelFunction, custom_backend=None):
+def compile_and_load(kernel: Kernel, custom_backend=None):
     cache_config = get_cache_config()
 
     compiler_config = get_compiler_config()
diff --git a/src/pystencils/kernel_wrapper.py b/src/pystencils/kernel_wrapper.py
index afce06d77..5095332c1 100644
--- a/src/pystencils/kernel_wrapper.py
+++ b/src/pystencils/kernel_wrapper.py
@@ -1,3 +1,3 @@
-from .backend.jit import KernelWrapper as _KernelWrapper
+from .jit import KernelWrapper as _KernelWrapper
 
 KernelWrapper = _KernelWrapper
diff --git a/src/pystencils/kernelcreation.py b/src/pystencils/kernelcreation.py
index 9bf3eaf67..97965f709 100644
--- a/src/pystencils/kernelcreation.py
+++ b/src/pystencils/kernelcreation.py
@@ -4,9 +4,10 @@ from .codegen import create_kernel as _create_kernel
 from warnings import warn
 
 warn(
-    "Importing anything from `pystencils.kernelcreation` is deprecated and the module will be removed in pystencils 2.1. "
+    "Importing anything from `pystencils.kernelcreation` is deprecated "
+    "and the module will be removed in pystencils 2.1. "
     "Import from `pystencils` instead.",
-    FutureWarning
+    FutureWarning,
 )
 
 
@@ -19,4 +20,3 @@ def create_staggered_kernel(
     raise NotImplementedError(
         "Staggered kernels are not yet implemented for pystencils 2.0"
     )
-
diff --git a/tests/kernelcreation/test_domain_kernels.py b/tests/kernelcreation/test_domain_kernels.py
index d02bfd8e4..da261faec 100644
--- a/tests/kernelcreation/test_domain_kernels.py
+++ b/tests/kernelcreation/test_domain_kernels.py
@@ -10,16 +10,14 @@ from pystencils import (
     AssignmentCollection,
     Target,
     CreateKernelConfig,
-    CpuOptimConfig,
-    VectorizationConfig,
 )
 from pystencils.assignment import assignment_from_stencil
 
-from pystencils.kernelcreation import create_kernel, KernelFunction
+from pystencils import create_kernel, Kernel
 from pystencils.backend.emission import emit_code
 
 
-def inspect_dp_kernel(kernel: KernelFunction, gen_config: CreateKernelConfig):
+def inspect_dp_kernel(kernel: Kernel, gen_config: CreateKernelConfig):
     code = emit_code(kernel)
 
     match gen_config.target:
diff --git a/tests/kernelcreation/test_index_kernels.py b/tests/kernelcreation/test_index_kernels.py
index 5093c43ff..569c0ab6a 100644
--- a/tests/kernelcreation/test_index_kernels.py
+++ b/tests/kernelcreation/test_index_kernels.py
@@ -2,7 +2,7 @@ import numpy as np
 import pytest
 
 from pystencils import Assignment, Field, FieldType, AssignmentCollection, Target
-from pystencils.kernelcreation import create_kernel, CreateKernelConfig
+from pystencils import create_kernel, CreateKernelConfig
 
 
 @pytest.mark.parametrize("target", [Target.CPU, Target.GPU])
diff --git a/tests/kernelcreation/test_iteration_slices.py b/tests/kernelcreation/test_iteration_slices.py
index 94ed02954..47f3b5fac 100644
--- a/tests/kernelcreation/test_iteration_slices.py
+++ b/tests/kernelcreation/test_iteration_slices.py
@@ -19,7 +19,7 @@ from pystencils import (
 from pystencils.sympyextensions.integer_functions import int_rem
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import normalize_slice
-from pystencils.backend.jit.gpu_cupy import CupyKernelWrapper
+from pystencils.jit.gpu_cupy import CupyKernelWrapper
 
 
 def test_sliced_iteration():
diff --git a/tests/nbackend/test_code_printing.py b/tests/nbackend/test_code_printing.py
index ef4806314..109cfdc19 100644
--- a/tests/nbackend/test_code_printing.py
+++ b/tests/nbackend/test_code_printing.py
@@ -1,11 +1,6 @@
-from pystencils import Target
-
 from pystencils.backend.ast.expressions import PsExpression
-from pystencils.backend.ast.structural import PsAssignment, PsLoop, PsBlock
-from pystencils.backend.kernelfunction import KernelFunction
-from pystencils.backend.memory import PsSymbol, PsBuffer
+from pystencils.backend.memory import PsSymbol
 from pystencils.backend.constants import PsConstant
-from pystencils.backend.literals import PsLiteral
 from pystencils.types.quick import Fp, SInt, UInt, Bool
 from pystencils.backend.emission import CAstPrinter
 
@@ -129,7 +124,7 @@ def test_relations_precedence():
 
 def test_ternary():
     from pystencils.backend.ast.expressions import PsTernary
-    from pystencils.backend.ast.expressions import PsNot, PsAnd, PsOr
+    from pystencils.backend.ast.expressions import PsAnd, PsOr
 
     p, q = [PsExpression.make(PsSymbol(x, Bool())) for x in "pq"]
     x, y, z = [PsExpression.make(PsSymbol(x, Fp(32))) for x in "xyz"]
diff --git a/tests/nbackend/test_cpujit.py b/tests/nbackend/test_cpujit.py
index 648112ef9..c053df9a9 100644
--- a/tests/nbackend/test_cpujit.py
+++ b/tests/nbackend/test_cpujit.py
@@ -1,6 +1,6 @@
 import pytest
 
-from pystencils import Target
+from pystencils import Target, Kernel
 
 # from pystencils.backend.constraints import PsKernelParamsConstraint
 from pystencils.backend.memory import PsSymbol, PsBuffer
@@ -8,10 +8,9 @@ from pystencils.backend.constants import PsConstant
 
 from pystencils.backend.ast.expressions import PsBufferAcc, PsExpression
 from pystencils.backend.ast.structural import PsAssignment, PsBlock, PsLoop
-from pystencils.backend.kernelfunction import KernelFunction
 
 from pystencils.types.quick import SInt, Fp
-from pystencils.backend.jit import LegacyCpuJit
+from pystencils.jit import LegacyCpuJit
 
 import numpy as np
 
@@ -45,7 +44,7 @@ def test_pairwise_addition():
         PsBlock([update])
     )
 
-    func = KernelFunction(PsBlock([loop]), Target.CPU, "kernel", set())
+    func = Kernel(PsBlock([loop]), Target.CPU, "kernel", set())
 
     # sizes_constraint = PsKernelParamsConstraint(
     #     u.shape[0].eq(2 * v.shape[0]),
diff --git a/tests/nbackend/test_vectorization.py b/tests/nbackend/test_vectorization.py
index 55330c9ee..a4825669c 100644
--- a/tests/nbackend/test_vectorization.py
+++ b/tests/nbackend/test_vectorization.py
@@ -19,8 +19,8 @@ from pystencils.backend.transformations import (
     LowerToC,
 )
 from pystencils.backend.constants import PsConstant
-from pystencils.backend.kernelfunction import create_cpu_kernel_function
-from pystencils.backend.jit import LegacyCpuJit
+from pystencils.codegen.driver import create_cpu_kernel_function
+from pystencils.jit import LegacyCpuJit
 
 from pystencils import Target, fields, Assignment, Field
 from pystencils.field import create_numpy_array_with_layout
-- 
GitLab


From 8847aa12139c556348dc14175f22ac4d104aca24 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 4 Dec 2024 15:23:07 +0100
Subject: [PATCH 21/31] fix two imports & gpu kernel function creation

---
 .../tutorials/01_tutorial_getting_started.ipynb    |  8 ++++----
 src/pystencils/codegen/driver.py                   | 14 +++++++-------
 src/pystencils/runhelper/db.py                     |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/docs/source/tutorials/01_tutorial_getting_started.ipynb b/docs/source/tutorials/01_tutorial_getting_started.ipynb
index baa3aac6a..5ce765fce 100644
--- a/docs/source/tutorials/01_tutorial_getting_started.ipynb
+++ b/docs/source/tutorials/01_tutorial_getting_started.ipynb
@@ -1140,7 +1140,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 72,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -1270,8 +1270,8 @@
    "source": [
     "ast = ps.create_kernel(\n",
     "    update_rule,\n",
-    "    cpu_optim = ps.config.CpuOptimConfig(\n",
-    "        openmp=ps.config.OpenMpConfig(num_threads=2))\n",
+    "    cpu_optim = ps.CpuOptimConfig(\n",
+    "        openmp=ps.OpenMpConfig(num_threads=2))\n",
     "    )\n",
     "\n",
     "ps.show_code(ast)"
@@ -1472,7 +1472,7 @@
  "metadata": {
   "anaconda-cloud": {},
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": ".venv",
    "language": "python",
    "name": "python3"
   },
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 0fd49b248..f7836ea79 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -395,7 +395,7 @@ class DefaultKernelCreationDriver:
         req_headers |= self._platform.required_headers
         req_headers |= self._ctx.required_headers
         return req_headers
-    
+
 
 def create_cpu_kernel_function(
     ctx: KernelCreationContext,
@@ -410,9 +410,7 @@ def create_cpu_kernel_function(
     params = _get_function_params(ctx, undef_symbols)
     req_headers = _get_headers(ctx, platform, body)
 
-    kfunc = Kernel(
-        body, target_spec, function_name, params, req_headers, jit
-    )
+    kfunc = Kernel(body, target_spec, function_name, params, req_headers, jit)
     kfunc.metadata.update(ctx.metadata)
     return kfunc
 
@@ -421,14 +419,16 @@ def create_gpu_kernel_function(
     ctx: KernelCreationContext,
     platform: Platform,
     body: PsBlock,
-    threads_range: GpuThreadsRange,
+    threads_range: GpuThreadsRange | None,
     function_name: str,
     target_spec: Target,
     jit: JitBase,
 ):
     undef_symbols = collect_undefined_symbols(body)
-    for threads in threads_range.num_work_items:
-        undef_symbols |= collect_undefined_symbols(threads)
+
+    if threads_range is not None:
+        for threads in threads_range.num_work_items:
+            undef_symbols |= collect_undefined_symbols(threads)
 
     params = _get_function_params(ctx, undef_symbols)
     req_headers = _get_headers(ctx, platform, body)
diff --git a/src/pystencils/runhelper/db.py b/src/pystencils/runhelper/db.py
index dd413a5e4..e19982958 100644
--- a/src/pystencils/runhelper/db.py
+++ b/src/pystencils/runhelper/db.py
@@ -8,7 +8,7 @@ import six
 from blitzdb.backends.file.backend import serializer_classes
 from blitzdb.backends.file.utils import JsonEncoder
 
-from pystencils.backend.jit.legacy_cpu import get_compiler_config
+from pystencils.jit.legacy_cpu import get_compiler_config
 from pystencils import CreateKernelConfig, Target, Field
 
 import json
-- 
GitLab


From 684eb58cfaed5105e54403706d1fc165bb0a568c Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 4 Dec 2024 15:41:20 +0100
Subject: [PATCH 22/31] update API in docs

---
 docs/Makefile                           |  7 +++++-
 docs/source/backend/jit.rst             |  2 +-
 docs/source/backend/objects.rst         |  4 ++--
 docs/source/reference/gpu_kernels.md    | 10 ++++----
 docs/source/reference/kernelcreation.md | 32 ++++++++++++-------------
 src/pystencils/__init__.py              |  2 ++
 src/pystencils/codegen/__init__.py      |  2 ++
 src/pystencils/codegen/config.py        |  2 +-
 8 files changed, 35 insertions(+), 26 deletions(-)

diff --git a/docs/Makefile b/docs/Makefile
index d0c3cbf10..2cfffacd0 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -12,7 +12,12 @@ BUILDDIR      = build
 help:
 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 
-.PHONY: help Makefile
+clean:
+	rm -rf source/reference/generated
+	rm -rf source/backend/generated
+	@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help clean Makefile
 
 # Catch-all target: route all unknown targets to Sphinx using the new
 # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
diff --git a/docs/source/backend/jit.rst b/docs/source/backend/jit.rst
index f7a02dbd4..79776eb67 100644
--- a/docs/source/backend/jit.rst
+++ b/docs/source/backend/jit.rst
@@ -2,5 +2,5 @@
 Just-In-Time Compilation
 ************************
 
-.. automodule:: pystencils.backend.jit
+.. automodule:: pystencils.jit
     :members:
diff --git a/docs/source/backend/objects.rst b/docs/source/backend/objects.rst
index 050568566..942e6070f 100644
--- a/docs/source/backend/objects.rst
+++ b/docs/source/backend/objects.rst
@@ -76,7 +76,7 @@ The above alignment property, for instance, may be added to a pointer symbol by
 to document its assumption that the pointer be properly aligned, in order to emit aligned load and store instructions.
 It then becomes the responsibility of the runtime system embedding the kernel to check this prequesite before calling the kernel.
 To make sure this information becomes visible, any properties attached to symbols exposed as kernel parameters will also
-be added to their respective `KernelParameter` instance.
+be added to their respective `Parameter` instance.
 
 Buffers
 -------
@@ -110,7 +110,7 @@ The context makes sure to avoid name conflicts between buffers.
 API Documentation
 =================
 
-.. automodule:: pystencils.backend.properties
+.. automodule:: pystencils.codegen.properties
     :members:
 
 .. automodule:: pystencils.backend.memory
diff --git a/docs/source/reference/gpu_kernels.md b/docs/source/reference/gpu_kernels.md
index c3fa70ec2..700a9cf7f 100644
--- a/docs/source/reference/gpu_kernels.md
+++ b/docs/source/reference/gpu_kernels.md
@@ -49,9 +49,9 @@ ps.inspect(kernel)
 ```
 
 The `kernel` object returned by the code generator in above snippet is an instance
-of the {py:class}`GpuKernelFunction` class.
-It extends {py:class}`KernelFunction` with some GPU-specific information.
-In particular, it defines the {any}`threads_range <GpuKernelFunction.threads_range>`
+of the {py:class}`GpuKernel` class.
+It extends {py:class}`Kernel` with some GPU-specific information.
+In particular, it defines the {any}`threads_range <GpuKernel.threads_range>`
 property, which tells us how many threads the kernel is expecting to be executed with:
 
 ```{code-cell} ipython3
@@ -193,8 +193,8 @@ only a part of the triangle is being processed.
   :nosignatures:
   :template: autosummary/recursive_class.rst
 
-  pystencils.backend.kernelfunction.GpuKernelFunction
-  pystencils.backend.jit.gpu_cupy.CupyKernelWrapper
+  pystencils.codegen.GpuKernel
+  pystencils.jit.gpu_cupy.CupyKernelWrapper
 ```
 
 :::{admonition} Developers To Do:
diff --git a/docs/source/reference/kernelcreation.md b/docs/source/reference/kernelcreation.md
index be60fb28e..a8d926f2e 100644
--- a/docs/source/reference/kernelcreation.md
+++ b/docs/source/reference/kernelcreation.md
@@ -37,7 +37,7 @@ and their effects on the generated kernel.
 The primary way to invoke the code generation engine is through the `create_kernel` function.
 It takes two arguments:
 - the list of assignment that make up the kernel (optionally wrapped as an ``AssignmentCollection``),
-- and a configuration object, an instance of {any}`CreateKernelConfig <pystencils.config.CreateKernelConfig>`.
+- and a configuration object, an instance of {any}`CreateKernelConfig <pystencils.codegen.config.CreateKernelConfig>`.
 
 ```{eval-rst}
 .. module:: pystencils.kernelcreation
@@ -81,7 +81,7 @@ The above snippet defines a five-point-stencil Jacobi update. A few noteworthy t
 
 ## Inspecting the Generated Code
 
-The object returned by the code generator, here named `kernel`, is an instance of the {any}`KernelFunction` class.
+The object returned by the code generator, here named `kernel`, is an instance of the {any}`Kernel` class.
 This object stores the kernel's name, its list of parameters, the set of fields it operates on, and its hardware target.
 Also, it of course holds the kernel itself, in the form of an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree) (AST).
 This tree can be printed out as compilable code in the target language (C++ or, in this case, CUDA),
@@ -105,7 +105,7 @@ their interaction and effects, use cases and caveats.
 
 ```{eval-rst}
 
-.. module:: pystencils.config
+.. module:: pystencils.codegen.config
 
 .. autosummary::
   :toctree: generated
@@ -131,14 +131,14 @@ their interaction and effects, use cases and caveats.
 Pystencils supports code generation for a variety of CPU and GPU hardware.
 
 ```{eval-rst}
-.. currentmodule:: pystencils.config
+.. currentmodule:: pystencils.codegen.config
 
 .. autosummary::
   :nosignatures:
 
   CreateKernelConfig.target
 
-.. module:: pystencils.target
+.. module:: pystencils.codegen.target
 
 .. autosummary::
   :toctree: generated
@@ -197,7 +197,7 @@ are using the `int32` data type, as specified in {py:data}`index_dtype <CreateKe
 ```{code-cell} ipython3
 :tags: [remove-input]
 
-driver = ps.kernelcreation.get_driver(cfg, retain_intermediates=True)
+driver = ps.codegen.get_driver(cfg, retain_intermediates=True)
 kernel = driver(assignments)
 ps.inspect(driver.intermediates.materialized_ispace)
 ```
@@ -207,7 +207,7 @@ To learn more about inspecting code after different stages of the code generator
 :::
 
 ```{eval-rst}
-.. currentmodule:: pystencils.config
+.. currentmodule:: pystencils.codegen.config
 
 .. autosummary::
   :nosignatures:
@@ -241,7 +241,7 @@ only one of which can be specified at a time:
 :::
 
 ```{eval-rst}
-.. currentmodule:: pystencils.config
+.. currentmodule:: pystencils.codegen
 
 .. autosummary::
   :nosignatures:
@@ -281,7 +281,7 @@ boundary values or exchange data in MPI-parallel simulations.
 ##### Automatic Ghost Layers
 
 The easiest way to define an iteration space with ghost layers
-is to set `ghost_layers=ps.config.AUTO`, which is also the default
+is to set `ghost_layers=ps.AUTO`, which is also the default
 when no iteration space options are specified.
 In this case, the code generator will examine the kernel to find the maximum range
 of its stencil -- that is, the maximum neighbor offset encountered in any field access.
@@ -301,11 +301,11 @@ To illustrate, the following kernel accesses neighbor nodes with a maximum offse
 ```{code-cell} ipython3
 ranged_update = ps.Assignment(u.center(), v[-2, -2] + v[2, 2])
 
-cfg = ps.CreateKernelConfig(ghost_layers=ps.config.AUTO)
+cfg = ps.CreateKernelConfig(ghost_layers=ps.AUTO)
 kernel = ps.create_kernel(ranged_update, cfg)
 ```
 
-With `ghost_layers=ps.config.AUTO`, its iteration space will look like this (yellow cells are included, purple cells excluded):
+With `ghost_layers=ps.AUTO`, its iteration space will look like this (yellow cells are included, purple cells excluded):
 
 ```{code-cell} ipython3
 :tags: [remove-input]
@@ -516,7 +516,7 @@ assignments = [
 ```
 
 ```{code-cell} ipython3
-driver = ps.kernelcreation.get_driver(cfg, retain_intermediates=True)
+driver = ps.codegen.get_driver(cfg, retain_intermediates=True)
 kernel = driver(assignments)
 ps.inspect(driver.intermediates)
 ```
@@ -524,14 +524,14 @@ ps.inspect(driver.intermediates)
 ## API: Kernel Parameters and Function Objects
 
 ```{eval-rst}
-.. module:: pystencils.backend.kernelfunction
+.. module:: pystencils.codegen
 
 .. autosummary::
   :toctree: generated
   :nosignatures:
   :template: autosummary/entire_class.rst
 
-  KernelParameter
-  KernelFunction
-  GpuKernelFunction
+  Parameter
+  Kernel
+  GpuKernel
 ```
diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py
index 028e4b885..4f8b26607 100644
--- a/src/pystencils/__init__.py
+++ b/src/pystencils/__init__.py
@@ -7,6 +7,7 @@ from .codegen import (
     VectorizationConfig,
     OpenMpConfig,
     GpuIndexingConfig,
+    AUTO
 )
 from .defaults import DEFAULTS
 from . import fd
@@ -53,6 +54,7 @@ __all__ = [
     "VectorizationConfig",
     "GpuIndexingConfig",
     "OpenMpConfig",
+    "AUTO",
     "create_kernel",
     "create_staggered_kernel",
     "Kernel",
diff --git a/src/pystencils/codegen/__init__.py b/src/pystencils/codegen/__init__.py
index 86f7f2940..e27b94b9e 100644
--- a/src/pystencils/codegen/__init__.py
+++ b/src/pystencils/codegen/__init__.py
@@ -5,6 +5,7 @@ from .config import (
     VectorizationConfig,
     OpenMpConfig,
     GpuIndexingConfig,
+    AUTO,
 )
 from .parameters import Parameter
 from .kernel import Kernel, GpuKernel, GpuThreadsRange
@@ -17,6 +18,7 @@ __all__ = [
     "VectorizationConfig",
     "OpenMpConfig",
     "GpuIndexingConfig",
+    "AUTO",
     "Parameter",
     "Kernel",
     "GpuKernel",
diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index b516245fa..b6567e74a 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -222,7 +222,7 @@ class CreateKernelConfig:
     """Just-in-time compiler used to compile and load the kernel for invocation from the current Python environment.
     
     If left at `None`, a default just-in-time compiler will be inferred from the `target` parameter.
-    To explicitly disable JIT compilation, pass `pystencils.backend.jit.no_jit`.
+    To explicitly disable JIT compilation, pass `pystencils.no_jit <pystencils.jit.no_jit>`.
     """
 
     function_name: str = "kernel"
-- 
GitLab


From b1725dc19ec7af95ca2522cc4f0cc5b12c7b8e6f Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Tue, 10 Dec 2024 11:49:07 +0100
Subject: [PATCH 23/31] delete redundant code and remove deprecated stuff from
 coverage

---
 pytest.ini                           |  4 ++++
 src/pystencils/_deprecation.py       |  1 +
 src/pystencils/codegen/config.py     |  4 ++--
 src/pystencils/codegen/driver.py     | 25 -------------------------
 src/pystencils/codegen/kernel.py     |  8 ++++----
 src/pystencils/codegen/parameters.py |  8 ++++----
 6 files changed, 15 insertions(+), 35 deletions(-)

diff --git a/pytest.ini b/pytest.ini
index b43b0f00c..281eaa21e 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -40,6 +40,7 @@ omit = doc/*
        src/pystencils/cache.py
        src/pystencils/pacxx/benchmark.py
        src/pystencils/_version.py
+       src/pystencils/_deprecation.py
        src/pystencils/old
        venv/
 
@@ -62,6 +63,9 @@ exclude_lines =
        if False:
        if __name__ == .__main__.:
 
+       # Don't cover type checking imports
+       if TYPE_CHECKING:
+
 skip_covered = True
 fail_under = 80
 
diff --git a/src/pystencils/_deprecation.py b/src/pystencils/_deprecation.py
index 29ee648a7..68218f0ae 100644
--- a/src/pystencils/_deprecation.py
+++ b/src/pystencils/_deprecation.py
@@ -5,4 +5,5 @@ def _deprecated(feature, instead, version="2.1"):
         f"{feature} is deprecated and will be removed in pystencils {version}."
         f"Use {instead} instead.",
         DeprecationWarning,
+        stacklevel=2
     )
diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index b6567e74a..9bd9cc9a3 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -428,7 +428,7 @@ class CreateKernelConfig:
         cpu_openmp: bool | int | None,
         cpu_vectorize_info: dict | None,
         gpu_indexing_params: dict | None,
-    ):
+    ):  # pragma: no cover
         optim: CpuOptimConfig | None = None
 
         if data_type is not None:
@@ -527,7 +527,7 @@ class CreateKernelConfig:
             )
 
 
-def _deprecated_option(name, instead):
+def _deprecated_option(name, instead):  # pragma: no cover
     from warnings import warn
 
     warn(
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index f7836ea79..62a3f44db 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -371,31 +371,6 @@ class DefaultKernelCreationDriver:
             f"Code generation for target {self._target} not implemented"
         )
 
-    def _get_function_params(self, symbols: Iterable[PsSymbol]):
-        params: list[Parameter] = []
-
-        from pystencils.backend.memory import BufferBasePtr
-
-        for symb in symbols:
-            props: set[PsSymbolProperty] = set()
-            for prop in symb.properties:
-                match prop:
-                    case FieldShape() | FieldStride():
-                        props.add(prop)
-                    case BufferBasePtr(buf):
-                        field = self._ctx.find_field(buf.name)
-                        props.add(FieldBasePtr(field))
-            params.append(Parameter(symb.name, symb.get_dtype(), props))
-
-        params.sort(key=lambda p: p.name)
-        return params
-
-    def _get_headers(self, body: PsBlock):
-        req_headers = collect_required_headers(body)
-        req_headers |= self._platform.required_headers
-        req_headers |= self._ctx.required_headers
-        return req_headers
-
 
 def create_cpu_kernel_function(
     ctx: KernelCreationContext,
diff --git a/src/pystencils/codegen/kernel.py b/src/pystencils/codegen/kernel.py
index c4ad860b6..3adc47876 100644
--- a/src/pystencils/codegen/kernel.py
+++ b/src/pystencils/codegen/kernel.py
@@ -62,12 +62,12 @@ class Kernel:
         self._name = n
 
     @property
-    def function_name(self) -> str:
+    def function_name(self) -> str:  # pragma: no cover
         _deprecated("function_name", "name")
         return self._name
 
     @function_name.setter
-    def function_name(self, n: str):
+    def function_name(self, n: str):  # pragma: no cover
         _deprecated("function_name", "name")
         self._name = n
 
@@ -75,7 +75,7 @@ class Kernel:
     def parameters(self) -> tuple[Parameter, ...]:
         return self._params
 
-    def get_parameters(self) -> tuple[Parameter, ...]:
+    def get_parameters(self) -> tuple[Parameter, ...]:  # pragma: no cover
         _deprecated("Kernel.get_parameters", "Kernel.parameters")
         return self.parameters
 
@@ -83,7 +83,7 @@ class Kernel:
         return set(chain.from_iterable(p.fields for p in self._params))
 
     @property
-    def fields_accessed(self) -> set[Field]:
+    def fields_accessed(self) -> set[Field]:  # pragma: no cover
         warn(
             "`fields_accessed` is deprecated and will be removed in a future version of pystencils. "
             "Use `get_fields` instead.",
diff --git a/src/pystencils/codegen/parameters.py b/src/pystencils/codegen/parameters.py
index d40eae220..d8411266e 100644
--- a/src/pystencils/codegen/parameters.py
+++ b/src/pystencils/codegen/parameters.py
@@ -97,7 +97,7 @@ class Parameter:
     #   These are kept mostly for the legacy waLBerla code generation system
 
     @property
-    def is_field_pointer(self) -> bool:
+    def is_field_pointer(self) -> bool:  # pragma: no cover
         warn(
             "`is_field_pointer` is deprecated and will be removed in a future version of pystencils. "
             "Use `param.get_properties(FieldBasePtr)` instead.",
@@ -106,7 +106,7 @@ class Parameter:
         return bool(self.get_properties(FieldBasePtr))
 
     @property
-    def is_field_stride(self) -> bool:
+    def is_field_stride(self) -> bool:  # pragma: no cover
         warn(
             "`is_field_stride` is deprecated and will be removed in a future version of pystencils. "
             "Use `param.get_properties(FieldStride)` instead.",
@@ -115,7 +115,7 @@ class Parameter:
         return bool(self.get_properties(FieldStride))
 
     @property
-    def is_field_shape(self) -> bool:
+    def is_field_shape(self) -> bool:  # pragma: no cover
         warn(
             "`is_field_shape` is deprecated and will be removed in a future version of pystencils. "
             "Use `param.get_properties(FieldShape)` instead.",
@@ -124,7 +124,7 @@ class Parameter:
         return bool(self.get_properties(FieldShape))
 
     @property
-    def field_name(self) -> str:
+    def field_name(self) -> str:  # pragma: no cover
         warn(
             "`field_name` is deprecated and will be removed in a future version of pystencils. "
             "Use `param.fields[0].name` instead.",
-- 
GitLab


From a9ce0ab86be477b2476b15dd68554d5a92f1931b Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 18 Dec 2024 13:40:24 +0100
Subject: [PATCH 24/31] update docs

---
 docs/Makefile                                 |  5 +-
 docs/source/api/codegen.rst                   | 57 +++++++++++++++++++
 docs/source/{reference => }/api/field.rst     |  0
 docs/source/api/jit.rst                       | 41 +++++++++++++
 .../{reference => }/api/sympyextensions.rst   |  0
 docs/source/backend/index.rst                 |  1 -
 docs/source/backend/jit.rst                   |  6 --
 docs/source/index.rst                         | 38 ++++++++-----
 docs/source/reference/api/codegen.rst         | 29 ----------
 docs/source/reference/api/index.rst           | 13 -----
 docs/source/reference/gpu_kernels.md          | 12 ++--
 docs/source/reference/kernelcreation.md       | 42 ++++----------
 src/pystencils/jit/__init__.py                |  4 +-
 13 files changed, 144 insertions(+), 104 deletions(-)
 create mode 100644 docs/source/api/codegen.rst
 rename docs/source/{reference => }/api/field.rst (100%)
 create mode 100644 docs/source/api/jit.rst
 rename docs/source/{reference => }/api/sympyextensions.rst (100%)
 delete mode 100644 docs/source/backend/jit.rst
 delete mode 100644 docs/source/reference/api/codegen.rst
 delete mode 100644 docs/source/reference/api/index.rst

diff --git a/docs/Makefile b/docs/Makefile
index 55c0a3c56..800b4020a 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -21,10 +21,13 @@ clean:
 
 # Catch-all target: route all unknown targets to Sphinx using the new
 # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
+# %: Makefile
+
+html:
 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 
 clean:
 	rm -rf source/reference/generated
+	rm -rf source/api/generated
 	rm -rf source/backend/generated
 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/docs/source/api/codegen.rst b/docs/source/api/codegen.rst
new file mode 100644
index 000000000..cf8db1700
--- /dev/null
+++ b/docs/source/api/codegen.rst
@@ -0,0 +1,57 @@
+pystencils.codegen
+==================
+
+.. module:: pystencils.codegen
+
+Kernel Creation Driver
+----------------------
+
+.. autosummary::
+  :toctree: generated
+  :nosignatures:
+
+  create_kernel
+  get_driver
+  
+Configuration
+-------------
+
+.. autosummary::
+  :toctree: generated
+  :nosignatures:
+  :template: autosummary/entire_class.rst
+
+  CreateKernelConfig
+  CpuOptimConfig
+  OpenMpConfig
+  VectorizationConfig
+  GpuIndexingConfig
+
+.. autosummary::
+  :toctree: generated
+  :nosignatures:
+
+  AUTO
+
+Target Specification
+--------------------
+
+.. autosummary::
+  :toctree: generated
+  :nosignatures:
+  :template: autosummary/recursive_class.rst
+
+  Target
+
+Output Code Objects
+-------------------
+
+.. autosummary::
+  :toctree: generated
+  :nosignatures:
+  :template: autosummary/entire_class.rst
+
+  Kernel
+  GpuKernel
+  Parameter
+  GpuThreadsRange
diff --git a/docs/source/reference/api/field.rst b/docs/source/api/field.rst
similarity index 100%
rename from docs/source/reference/api/field.rst
rename to docs/source/api/field.rst
diff --git a/docs/source/api/jit.rst b/docs/source/api/jit.rst
new file mode 100644
index 000000000..5a6bb2f4f
--- /dev/null
+++ b/docs/source/api/jit.rst
@@ -0,0 +1,41 @@
+pystencils.jit
+==============
+
+.. module:: pystencils.jit
+
+Base Infrastructure
+-------------------
+
+.. autosummary::
+  :toctree: generated
+  :nosignatures:
+  :template: autosummary/entire_class.rst
+
+    KernelWrapper
+    JitBase
+    NoJit
+
+.. autodata:: no_jit
+
+Legacy CPU JIT
+--------------
+
+.. autosummary::
+  :toctree: generated
+  :nosignatures:
+  :template: autosummary/entire_class.rst
+
+  LegacyCpuJit
+  KernelWrapper
+
+CuPy-based GPU JIT
+------------------
+
+.. autosummary::
+  :toctree: generated
+  :nosignatures:
+  :template: autosummary/entire_class.rst
+
+  CupyJit
+  CupyKernelWrapper
+  LaunchGrid
diff --git a/docs/source/reference/api/sympyextensions.rst b/docs/source/api/sympyextensions.rst
similarity index 100%
rename from docs/source/reference/api/sympyextensions.rst
rename to docs/source/api/sympyextensions.rst
diff --git a/docs/source/backend/index.rst b/docs/source/backend/index.rst
index 74b57e27b..5ab8dbd34 100644
--- a/docs/source/backend/index.rst
+++ b/docs/source/backend/index.rst
@@ -18,7 +18,6 @@ who wish to customize or extend the behaviour of the code generator in their app
     platforms
     transformations
     errors
-    jit
     extensions
 
 Internal Representation
diff --git a/docs/source/backend/jit.rst b/docs/source/backend/jit.rst
deleted file mode 100644
index 79776eb67..000000000
--- a/docs/source/backend/jit.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-************************
-Just-In-Time Compilation
-************************
-
-.. automodule:: pystencils.jit
-    :members:
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 66582cb4b..6aa09bdbd 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -69,28 +69,36 @@ Topics
 ------
 
 .. toctree::
-   :maxdepth: 1
-   :caption: Getting Started
+  :maxdepth: 1
+  :caption: Getting Started
 
-   installation
-   tutorials/index
+  installation
+  tutorials/index
 
 .. toctree::
-   :maxdepth: 1
-   :caption: Reference Guides
+  :maxdepth: 1
+  :caption: Reference Guides
 
-   reference/symbolic_language
-   reference/kernelcreation
-   reference/gpu_kernels
-   reference/types
-   reference/api/index
+  reference/symbolic_language
+  reference/kernelcreation
+  reference/gpu_kernels
+  reference/types
 
 .. toctree::
-   :maxdepth: 1
-   :caption: Advanced
+  :maxdepth: 1
+  :caption: API
 
-   migration
-   backend/index
+  api/field
+  api/sympyextensions
+  api/codegen
+  api/jit
+
+.. toctree::
+  :maxdepth: 1
+  :caption: Advanced
+
+  migration
+  backend/index
 
 Projects using pystencils
 -------------------------
diff --git a/docs/source/reference/api/codegen.rst b/docs/source/reference/api/codegen.rst
deleted file mode 100644
index 6418f32f6..000000000
--- a/docs/source/reference/api/codegen.rst
+++ /dev/null
@@ -1,29 +0,0 @@
-Code Generator and Configuration
-================================
-
-.. module:: pystencils.kernelcreation
-
-.. autosummary::
-  :toctree: generated
-  :nosignatures:
-
-  create_kernel
-
-.. module:: pystencils.config
-
-.. autosummary::
-  :toctree: generated
-  :nosignatures:
-  :template: autosummary/entire_class.rst
-
-  CreateKernelConfig
-  CpuOptimConfig
-  OpenMpConfig
-  VectorizationConfig
-  GpuIndexingConfig
-
-.. autosummary::
-  :toctree: generated
-  :nosignatures:
-
-  AUTO
\ No newline at end of file
diff --git a/docs/source/reference/api/index.rst b/docs/source/reference/api/index.rst
deleted file mode 100644
index b19c6303e..000000000
--- a/docs/source/reference/api/index.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-***
-API
-***
-
-Modules
-=======
-
-.. toctree::
-    :maxdepth: 1
-
-    field
-    sympyextensions
-    codegen
diff --git a/docs/source/reference/gpu_kernels.md b/docs/source/reference/gpu_kernels.md
index 7fb0febb4..786840d18 100644
--- a/docs/source/reference/gpu_kernels.md
+++ b/docs/source/reference/gpu_kernels.md
@@ -49,9 +49,9 @@ ps.inspect(kernel)
 ```
 
 The `kernel` object returned by the code generator in above snippet is an instance
-of the {py:class}`GpuKernelFunction` class.
-It extends {py:class}`KernelFunction` with some GPU-specific information.
-In particular, it defines the {any}`threads_range <GpuKernelFunction.threads_range>`
+of the {py:class}`GpuKernel` class.
+It extends {py:class}`Kernel` with some GPU-specific information.
+In particular, it defines the {any}`threads_range <GpuKernel.threads_range>`
 property, which tells us how many threads the kernel is expecting to be executed with:
 
 ```{code-cell} ipython3
@@ -208,12 +208,10 @@ only a part of the triangle is being processed.
 
 ```{eval-rst}
 .. autosummary::
-  :toctree: generated
   :nosignatures:
-  :template: autosummary/recursive_class.rst
 
-  pystencils.backend.kernelfunction.GpuKernelFunction
-  pystencils.backend.jit.gpu_cupy.CupyKernelWrapper
+  pystencils.codegen.GpuKernel
+  pystencils.jit.gpu_cupy.CupyKernelWrapper
 ```
 
 :::{admonition} Developers To Do:
diff --git a/docs/source/reference/kernelcreation.md b/docs/source/reference/kernelcreation.md
index 2a5045785..248855fc1 100644
--- a/docs/source/reference/kernelcreation.md
+++ b/docs/source/reference/kernelcreation.md
@@ -34,17 +34,19 @@ and their effects on the generated kernel.
 
 ## Running the Code Generator
 
-The primary way to invoke the code generation engine is through the `create_kernel` function.
+The primary way to invoke the code generation engine is through the {any}`create_kernel` function.
 It takes two arguments:
 - the list of assignment that make up the kernel (optionally wrapped as an ``AssignmentCollection``),
 - and a configuration object, an instance of {any}`CreateKernelConfig <pystencils.codegen.config.CreateKernelConfig>`.
 
 ```{eval-rst}
+.. currentmodule:: pystencils.codegen
+
 .. autosummary::
   :nosignatures:
 
-  pystencils.kernelcreation.create_kernel
-  pystencils.config.CreateKernelConfig
+  create_kernel
+  CreateKernelConfig
 ```
 
 For a simple kernel, an invocation of the code generator might look like this:
@@ -110,21 +112,14 @@ their interaction and effects, use cases and caveats.
 Pystencils supports code generation for a variety of CPU and GPU hardware.
 
 ```{eval-rst}
-.. currentmodule:: pystencils.codegen.config
+.. currentmodule:: pystencils.codegen
 
 .. autosummary::
   :nosignatures:
 
   CreateKernelConfig.target
-
-.. module:: pystencils.codegen.target
-
-.. autosummary::
-  :toctree: generated
-  :nosignatures:
-  :template: autosummary/recursive_class.rst
-
   Target
+
 ```
 
 ### Data Types
@@ -176,7 +171,7 @@ are using the `int32` data type, as specified in {py:data}`index_dtype <CreateKe
 ```{code-cell} ipython3
 :tags: [remove-input]
 
-driver = ps.kernelcreation.get_driver(cfg, retain_intermediates=True)
+driver = ps.codegen.get_driver(cfg, retain_intermediates=True)
 kernel = driver(assignments)
 ps.inspect(driver.intermediates.materialized_ispace, show_cpp=False)
 ```
@@ -186,7 +181,7 @@ To learn more about inspecting code after different stages of the code generator
 :::
 
 ```{eval-rst}
-.. currentmodule:: pystencils.codegen.config
+.. currentmodule:: pystencils.codegen
 
 .. autosummary::
   :nosignatures:
@@ -220,7 +215,7 @@ only one of which can be specified at a time:
 :::
 
 ```{eval-rst}
-.. currentmodule:: pystencils.codegen.config
+.. currentmodule:: pystencils.codegen
 
 .. autosummary::
   :nosignatures:
@@ -281,7 +276,7 @@ To illustrate, the following kernel accesses neighbor nodes with a maximum offse
 ```{code-cell} ipython3
 ranged_update = ps.Assignment(u.center(), v[-2, -1] + v[2, 1])
 
-cfg = ps.CreateKernelConfig(ghost_layers=ps.config.AUTO)
+cfg = ps.CreateKernelConfig(ghost_layers=ps.AUTO)
 kernel = ps.create_kernel(ranged_update, cfg)
 ```
 
@@ -510,18 +505,3 @@ driver = ps.codegen.get_driver(cfg, retain_intermediates=True)
 kernel = driver(assignments)
 ps.inspect(driver.intermediates)
 ```
-
-## API: Kernel Parameters and Function Objects
-
-```{eval-rst}
-.. module:: pystencils.codegen
-
-.. autosummary::
-  :toctree: generated
-  :nosignatures:
-  :template: autosummary/entire_class.rst
-
-  Parameter
-  Kernel
-  GpuKernel
-```
diff --git a/src/pystencils/jit/__init__.py b/src/pystencils/jit/__init__.py
index a47dc4aa6..1ef8378d3 100644
--- a/src/pystencils/jit/__init__.py
+++ b/src/pystencils/jit/__init__.py
@@ -24,7 +24,7 @@ It is due to be replaced in the near future.
 
 from .jit import JitBase, NoJit, KernelWrapper
 from .legacy_cpu import LegacyCpuJit
-from .gpu_cupy import CupyJit
+from .gpu_cupy import CupyJit, CupyKernelWrapper, LaunchGrid
 
 no_jit = NoJit()
 """Disables just-in-time compilation for a kernel."""
@@ -36,4 +36,6 @@ __all__ = [
     "NoJit",
     "no_jit",
     "CupyJit",
+    "CupyKernelWrapper",
+    "LaunchGrid"
 ]
-- 
GitLab


From c6e2ec20b74fb5a1a4970e6b417c3be7bc655919 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 18 Dec 2024 14:10:59 +0100
Subject: [PATCH 25/31] update codegen module docs

---
 docs/Makefile                    |  7 +------
 docs/source/api/codegen.rst      | 21 ++++++++++++++++++---
 src/pystencils/codegen/driver.py | 15 +++++++++++++++
 3 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/docs/Makefile b/docs/Makefile
index 800b4020a..a293f14ee 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -12,12 +12,7 @@ BUILDDIR      = build
 help:
 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 
-clean:
-	rm -rf source/reference/generated
-	rm -rf source/backend/generated
-	@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help clean Makefile
+.PHONY: help html clean
 
 # Catch-all target: route all unknown targets to Sphinx using the new
 # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
diff --git a/docs/source/api/codegen.rst b/docs/source/api/codegen.rst
index cf8db1700..d65e9a358 100644
--- a/docs/source/api/codegen.rst
+++ b/docs/source/api/codegen.rst
@@ -3,15 +3,14 @@ pystencils.codegen
 
 .. module:: pystencils.codegen
 
-Kernel Creation Driver
-----------------------
+Invocation
+----------
 
 .. autosummary::
   :toctree: generated
   :nosignatures:
 
   create_kernel
-  get_driver
   
 Configuration
 -------------
@@ -43,6 +42,22 @@ Target Specification
 
   Target
 
+Code Generation Drivers
+-----------------------
+
+.. autosummary::
+  :toctree: generated
+  :nosignatures:
+  :template: autosummary/entire_class.rst
+
+  driver.DefaultKernelCreationDriver
+
+.. autosummary::
+  :toctree: generated
+  :nosignatures:
+
+  get_driver
+
 Output Code Objects
 -------------------
 
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 62a3f44db..2ab906366 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -81,10 +81,25 @@ def create_kernel(
 
 
 def get_driver(cfg: CreateKernelConfig, *, retain_intermediates: bool = False):
+    """Create a code generation driver object from the given configuration.
+
+    Args:
+        cfg: Configuration for the code generator
+        retain_intermediates: If `True`, instructs the driver to keep copies of
+            the intermediate results of its stages for later inspection.
+    """
     return DefaultKernelCreationDriver(cfg, retain_intermediates)
 
 
 class DefaultKernelCreationDriver:
+    """Drives the default kernel creation sequence.
+
+    Args:
+        cfg: Configuration for the code generator
+        retain_intermediates: If `True`, instructs the driver to keep copies of
+            the intermediate results of its stages for later inspection.
+    """
+
     def __init__(self, cfg: CreateKernelConfig, retain_intermediates: bool = False):
         self._cfg = cfg
 
-- 
GitLab


From 91b9c7b41e07ed386c0bc687e7fec4f57a368332 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 13 Jan 2025 09:04:11 +0100
Subject: [PATCH 26/31] add missing type annotations

---
 src/pystencils/codegen/driver.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 2ab906366..7bdec96cc 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -80,7 +80,9 @@ def create_kernel(
     return driver(assignments)
 
 
-def get_driver(cfg: CreateKernelConfig, *, retain_intermediates: bool = False):
+def get_driver(
+    cfg: CreateKernelConfig, *, retain_intermediates: bool = False
+) -> DefaultKernelCreationDriver:
     """Create a code generation driver object from the given configuration.
 
     Args:
@@ -126,7 +128,7 @@ class DefaultKernelCreationDriver:
     def __call__(
         self,
         assignments: AssignmentCollection | Sequence[AssignmentBase] | AssignmentBase,
-    ):
+    ) -> Kernel:
         kernel_body = self.parse_kernel_body(assignments)
 
         match self._platform:
@@ -241,7 +243,7 @@ class DefaultKernelCreationDriver:
 
         return kernel_body
 
-    def _transform_for_cpu(self, kernel_ast: PsBlock):
+    def _transform_for_cpu(self, kernel_ast: PsBlock) -> PsBlock:
         canonicalize = CanonicalizeSymbols(self._ctx, True)
         kernel_ast = cast(PsBlock, canonicalize(kernel_ast))
 
@@ -394,7 +396,7 @@ def create_cpu_kernel_function(
     function_name: str,
     target_spec: Target,
     jit: JitBase,
-):
+) -> Kernel:
     undef_symbols = collect_undefined_symbols(body)
 
     params = _get_function_params(ctx, undef_symbols)
@@ -413,7 +415,7 @@ def create_gpu_kernel_function(
     function_name: str,
     target_spec: Target,
     jit: JitBase,
-):
+) -> GpuKernel:
     undef_symbols = collect_undefined_symbols(body)
 
     if threads_range is not None:
@@ -436,7 +438,9 @@ def create_gpu_kernel_function(
     return kfunc
 
 
-def _get_function_params(ctx: KernelCreationContext, symbols: Iterable[PsSymbol]):
+def _get_function_params(
+    ctx: KernelCreationContext, symbols: Iterable[PsSymbol]
+) -> list[Parameter]:
     params: list[Parameter] = []
 
     from pystencils.backend.memory import BufferBasePtr
@@ -456,7 +460,9 @@ def _get_function_params(ctx: KernelCreationContext, symbols: Iterable[PsSymbol]
     return params
 
 
-def _get_headers(ctx: KernelCreationContext, platform: Platform, body: PsBlock):
+def _get_headers(
+    ctx: KernelCreationContext, platform: Platform, body: PsBlock
+) -> set[str]:
     req_headers = collect_required_headers(body)
     req_headers |= platform.required_headers
     req_headers |= ctx.required_headers
-- 
GitLab


From 70ca60e78c14688493a824fda253235549525446 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 13 Jan 2025 09:16:56 +0100
Subject: [PATCH 27/31] fix extraneous autosummary entry

---
 docs/source/api/jit.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/source/api/jit.rst b/docs/source/api/jit.rst
index 5a6bb2f4f..7bcd9989c 100644
--- a/docs/source/api/jit.rst
+++ b/docs/source/api/jit.rst
@@ -26,7 +26,6 @@ Legacy CPU JIT
   :template: autosummary/entire_class.rst
 
   LegacyCpuJit
-  KernelWrapper
 
 CuPy-based GPU JIT
 ------------------
-- 
GitLab


From e9c461ac7cad411d61e4965c645f92e78e8b0b0e Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 13 Jan 2025 09:25:07 +0100
Subject: [PATCH 28/31] try to fix CUDA in CI

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9f80cd261..9ac0cd1aa 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -300,7 +300,7 @@ tests-and-coverage:
     - python -m coverage xml
   tags:
     - docker
-    - cuda11
+    - cuda12
     - AVX
   coverage: /Total coverage:\s\d+.\d+\%/
   artifacts:
-- 
GitLab


From 346b118aa56a41e6320a8416ca0cc7a297a61017 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 13 Jan 2025 09:29:00 +0100
Subject: [PATCH 29/31] revert cuda12 tag

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9ac0cd1aa..9f80cd261 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -300,7 +300,7 @@ tests-and-coverage:
     - python -m coverage xml
   tags:
     - docker
-    - cuda12
+    - cuda11
     - AVX
   coverage: /Total coverage:\s\d+.\d+\%/
   artifacts:
-- 
GitLab


From e075038e22476f2595724f0ec6c1180d4f146d79 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 13 Jan 2025 10:17:03 +0100
Subject: [PATCH 30/31] use cuda11 container

---
 .gitlab-ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9f80cd261..13f4592f3 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -286,7 +286,7 @@ mypy-typecheck:
 tests-and-coverage:
   stage: "Unit Tests"
   needs: []
-  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full
+  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full:cuda11
   before_script:
     - pip install -e .[tests]
   script:
@@ -318,7 +318,7 @@ tests-and-coverage:
 
 
 build-documentation:
-  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full
+  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full:cuda11
   stage: docs
   needs: []
   before_script:
-- 
GitLab


From 4ac580c7930f2167a66b7a379fc5e77d2319cb4c Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 13 Jan 2025 12:38:48 +0100
Subject: [PATCH 31/31] use cupy12.3 in CI

---
 .gitlab-ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 13f4592f3..6c58a26bd 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -286,7 +286,7 @@ mypy-typecheck:
 tests-and-coverage:
   stage: "Unit Tests"
   needs: []
-  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full:cuda11
+  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full:cupy12.3
   before_script:
     - pip install -e .[tests]
   script:
@@ -318,7 +318,7 @@ tests-and-coverage:
 
 
 build-documentation:
-  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full:cuda11
+  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full:cupy12.3
   stage: docs
   needs: []
   before_script:
-- 
GitLab