From 92c29b1887911721a8ff3781978f46fc0e8270db Mon Sep 17 00:00:00 2001
From: markus holzer <markus.holzer@fau.de>
Date: Sun, 20 Feb 2022 07:58:05 +0100
Subject: [PATCH 01/18] Added CUDA benchmarks

---
 pystencils_benchmark/__init__.py              |   1 +
 pystencils_benchmark/benchmark.py             |   8 +-
 pystencils_benchmark/benchmark_gpu.py         | 170 ++++++++++++++++++
 pystencils_benchmark/enums.py                 |   1 +
 pystencils_benchmark/templates/NVCC.mk        |  12 ++
 .../templates/{ => cpu}/kernel.c              |   0
 .../templates/{ => cpu}/kernel.h              |   0
 .../templates/{ => cpu}/main.c                |   0
 pystencils_benchmark/templates/gpu/kernel.cu  |   8 +
 pystencils_benchmark/templates/gpu/kernel.h   |  11 ++
 pystencils_benchmark/templates/gpu/main.c     |  66 +++++++
 tests/test_benchmark.py                       |  17 +-
 12 files changed, 289 insertions(+), 5 deletions(-)
 create mode 100644 pystencils_benchmark/benchmark_gpu.py
 create mode 100644 pystencils_benchmark/templates/NVCC.mk
 rename pystencils_benchmark/templates/{ => cpu}/kernel.c (100%)
 rename pystencils_benchmark/templates/{ => cpu}/kernel.h (100%)
 rename pystencils_benchmark/templates/{ => cpu}/main.c (100%)
 create mode 100644 pystencils_benchmark/templates/gpu/kernel.cu
 create mode 100644 pystencils_benchmark/templates/gpu/kernel.h
 create mode 100644 pystencils_benchmark/templates/gpu/main.c

diff --git a/pystencils_benchmark/__init__.py b/pystencils_benchmark/__init__.py
index 6f5f32c..8142abe 100644
--- a/pystencils_benchmark/__init__.py
+++ b/pystencils_benchmark/__init__.py
@@ -1,2 +1,3 @@
 from .enums import Compiler
 from .benchmark import generate_benchmark, kernel_header, kernel_source
+from .benchmark_gpu import generate_benchmark_gpu
diff --git a/pystencils_benchmark/benchmark.py b/pystencils_benchmark/benchmark.py
index 67cc3d5..a4ce99c 100644
--- a/pystencils_benchmark/benchmark.py
+++ b/pystencils_benchmark/benchmark.py
@@ -6,7 +6,7 @@ from jinja2 import Environment, PackageLoader, StrictUndefined
 import numpy as np
 
 from pystencils.backends.cbackend import generate_c, get_headers
-from pystencils.astnodes import KernelFunction, PragmaBlock
+from pystencils.astnodes import KernelFunction
 from pystencils.enums import Backend
 from pystencils.data_types import get_base_type
 from pystencils.sympyextensions import prod
@@ -147,7 +147,7 @@ def kernel_main(kernels_ast: List[KernelFunction], timing: bool = True):
         'timing': timing,
     }
 
-    main = _env.get_template('main.c').render(**jinja_context)
+    main = _env.get_template('cpu/main.c').render(**jinja_context)
     return main
 
 
@@ -160,7 +160,7 @@ def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> s
         'function_signature': function_signature,
     }
 
-    header = _env.get_template('kernel.h').render(**jinja_context)
+    header = _env.get_template('cpu/kernel.h').render(**jinja_context)
     return header
 
 
@@ -176,5 +176,5 @@ def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> s
         'timing': True,
     }
 
-    source = _env.get_template('kernel.c').render(**jinja_context)
+    source = _env.get_template('cpu/kernel.c').render(**jinja_context)
     return source
diff --git a/pystencils_benchmark/benchmark_gpu.py b/pystencils_benchmark/benchmark_gpu.py
new file mode 100644
index 0000000..67ce0dd
--- /dev/null
+++ b/pystencils_benchmark/benchmark_gpu.py
@@ -0,0 +1,170 @@
+from typing import Union, List
+from collections import namedtuple
+from pathlib import Path
+from jinja2 import Environment, PackageLoader, StrictUndefined
+
+from pystencils.backends.cbackend import generate_c, get_headers
+from pystencils.astnodes import KernelFunction
+from pystencils.enums import Backend
+from pystencils.data_types import get_base_type
+from pystencils.sympyextensions import prod
+from pystencils.transformations import get_common_shape
+from pystencils.gpucuda import BlockIndexing
+
+from pystencils_benchmark.enums import Compiler
+
+_env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=StrictUndefined, keep_trailing_newline=True,
+                   trim_blocks=True, lstrip_blocks=True)
+
+
+def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunction]],
+                           path: Path = None,
+                           *,
+                           compiler: Compiler = Compiler.GCC) -> None:
+    if path is None:
+        path = Path('.')
+    else:
+        path.mkdir(parents=True, exist_ok=True)
+    src_path = path / 'src'
+    src_path.mkdir(parents=True, exist_ok=True)
+    include_path = path / 'include'
+    include_path.mkdir(parents=True, exist_ok=True)
+
+    if isinstance(kernel_asts, KernelFunction):
+        kernel_asts = [kernel_asts]
+
+    for kernel_ast in kernel_asts:
+        kernel_name = kernel_ast.function_name
+
+        header = kernel_header(kernel_ast)
+        with open(include_path / f'{kernel_name}.h', 'w+') as f:
+            f.write(header)
+
+        source = kernel_source(kernel_ast)
+        with open(src_path / f'{kernel_name}.c', 'w+') as f:
+            f.write(source)
+
+    with open(src_path / 'main.c', 'w+') as f:
+        f.write(kernel_main(kernel_asts))
+
+    copy_static_files(path)
+    compiler_toolchain(path, compiler)
+
+
+def compiler_toolchain(path: Path, compiler: Compiler) -> None:
+    name = compiler.name
+    jinja_context = {
+        'compiler': name,
+    }
+
+    files = ['Makefile', f'{name}.mk']
+    for file_name in files:
+        with open(path / file_name, 'w+') as f:
+            template = _env.get_template(file_name).render(**jinja_context)
+            f.write(template)
+
+
+def copy_static_files(path: Path) -> None:
+    src_path = path / 'src'
+    src_path.mkdir(parents=True, exist_ok=True)
+    include_path = path / 'include'
+    include_path.mkdir(parents=True, exist_ok=True)
+
+    files = ['timing.h', 'timing.c']
+    for file_name in files:
+        template = _env.get_template(file_name).render()
+        if file_name[-1] == 'h':
+            target_path = include_path / file_name
+        elif file_name[-1] == 'c':
+            target_path = src_path / file_name
+        else:
+            target_path = path / file_name
+        with open(target_path, 'w+') as f:
+            f.write(template)
+
+
+def kernel_main(kernels_ast: List[KernelFunction], timing: bool = True, cuda_block_size: tuple = (32, 1, 1)):
+    """
+    Return the source of a benchmark main program for the given kernels.
+
+    Args:
+        kernels_ast: list of pystencils AST objects, as returned by create_kernel, to benchmark
+        timing: add timing output to the code; prints the time per iteration to stdout
+        cuda_block_size: CUDA thread block dimensions for launching the kernels
+    Returns:
+        C code as string
+    """
+    Kernel = namedtuple('Kernel', ['name', 'constants', 'fields', 'call_parameters',
+                                   'call_argument_list', 'blocks', 'grid'])
+    kernels = []
+    includes = set()
+    for kernel in kernels_ast:
+        name = kernel.function_name
+        accessed_fields = {f.name: f for f in kernel.fields_accessed}
+        constants = []
+        fields = []
+        call_parameters = []
+        block_and_thread_numbers = dict()
+        for p in kernel.get_parameters():
+            if not p.is_field_parameter:
+                constants.append((p.symbol.name, str(p.symbol.dtype)))
+                call_parameters.append(p.symbol.name)
+            else:
+                assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
+                field = accessed_fields[p.field_name]
+                dtype = str(get_base_type(p.symbol.dtype))
+                elements = prod(field.shape)
+
+                fields.append((p.field_name, dtype, elements))
+                call_parameters.append(p.field_name)
+
+            common_shape = get_common_shape(kernel.fields_accessed)
+            indexing = kernel.indexing
+            block_and_thread_numbers = indexing.call_parameters(common_shape)
+            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
+            block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
+
+        kernels.append(Kernel(name=name, fields=fields, constants=constants, call_parameters=call_parameters,
+                              call_argument_list=",".join(call_parameters),
+                              blocks=block_and_thread_numbers['block'], grid=block_and_thread_numbers['grid']))
+        includes.add(name)
+
+    jinja_context = {
+        'kernels': kernels,
+        'includes': includes,
+        'timing': timing,
+    }
+
+    main = _env.get_template('gpu/main.c').render(**jinja_context)
+    return main
+
+
+def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
+    function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)
+    header_guard = f'_{kernel_ast.function_name.upper()}_H'
+
+    jinja_context = {
+        'header_guard': header_guard,
+        'function_signature': function_signature,
+        'target': 'gpu'
+    }
+
+    header = _env.get_template('gpu/kernel.h').render(**jinja_context)
+    return header
+
+
+def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
+    kernel_name = kernel_ast.function_name
+    function_source = generate_c(kernel_ast, dialect=dialect)
+    headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'}
+    headers.update(get_headers(kernel_ast))
+
+    jinja_context = {
+        'function_source': function_source,
+        'headers': sorted(headers),
+        'timing': True,
+        'target': 'gpu'
+    }
+
+    source = _env.get_template('gpu/kernel.cu').render(**jinja_context)
+    return source
diff --git a/pystencils_benchmark/enums.py b/pystencils_benchmark/enums.py
index ec56c8a..84cf49e 100644
--- a/pystencils_benchmark/enums.py
+++ b/pystencils_benchmark/enums.py
@@ -6,3 +6,4 @@ class Compiler(Enum):
     GCCdebug = auto()
     Clang = auto()
     ICC = auto()
+    NVCC = auto()
diff --git a/pystencils_benchmark/templates/NVCC.mk b/pystencils_benchmark/templates/NVCC.mk
new file mode 100644
index 0000000..71010ab
--- /dev/null
+++ b/pystencils_benchmark/templates/NVCC.mk
@@ -0,0 +1,12 @@
+CC  = nvcc
+LINKER = $(CC)
+
+# More warnings, please (host-compiler flags; nvcc needs -Xcompiler to forward them)
+#CFLAGS   += -Wfloat-equal -Wundef -Wshadow -Wpointer-arith -Wcast-align  -Wstrict-overflow=5 -Wwrite-strings -Waggregate-return
+# Possibly too many warnings
+#CFLAGS   += -Wcast-qual -Wswitch-default -Wconversion -Wunreachable-code
+# Specific C flags
+CFLAGS   := -use_fast_math
+DEFINES  = -D_GNU_SOURCE -DNDEBUG
+INCLUDES =
+LIBS     =
diff --git a/pystencils_benchmark/templates/kernel.c b/pystencils_benchmark/templates/cpu/kernel.c
similarity index 100%
rename from pystencils_benchmark/templates/kernel.c
rename to pystencils_benchmark/templates/cpu/kernel.c
diff --git a/pystencils_benchmark/templates/kernel.h b/pystencils_benchmark/templates/cpu/kernel.h
similarity index 100%
rename from pystencils_benchmark/templates/kernel.h
rename to pystencils_benchmark/templates/cpu/kernel.h
diff --git a/pystencils_benchmark/templates/main.c b/pystencils_benchmark/templates/cpu/main.c
similarity index 100%
rename from pystencils_benchmark/templates/main.c
rename to pystencils_benchmark/templates/cpu/main.c
diff --git a/pystencils_benchmark/templates/gpu/kernel.cu b/pystencils_benchmark/templates/gpu/kernel.cu
new file mode 100644
index 0000000..973369f
--- /dev/null
+++ b/pystencils_benchmark/templates/gpu/kernel.cu
@@ -0,0 +1,8 @@
+{% for header in headers %}
+#include {{header}}
+{% endfor %}
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX __global__
+
+{{function_source}}
diff --git a/pystencils_benchmark/templates/gpu/kernel.h b/pystencils_benchmark/templates/gpu/kernel.h
new file mode 100644
index 0000000..102d9c9
--- /dev/null
+++ b/pystencils_benchmark/templates/gpu/kernel.h
@@ -0,0 +1,11 @@
+#ifndef {{header_guard}}
+#define {{header_guard}}
+
+
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX __global__
+
+{{function_signature}};
+
+#endif
diff --git a/pystencils_benchmark/templates/gpu/main.c b/pystencils_benchmark/templates/gpu/main.c
new file mode 100644
index 0000000..b2f3571
--- /dev/null
+++ b/pystencils_benchmark/templates/gpu/main.c
@@ -0,0 +1,66 @@
+#include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "timing.h"
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX __global__
+#include <cuda_runtime.h>
+
+// kernels
+{% for include in includes %}
+#include "{{ include }}.h"
+{% endfor %}
+
+int main(int argc, char **argv)
+{
+    if(argc < 2) {
+        printf("Usage: %s <n_repeat>\n", argv[0]);
+        return -1;
+    }
+    int n_repeat = atoi(argv[1]);
+    {% for kernel in kernels %}
+
+    { // Kernel: {{kernel.name}}
+        {% for field_name, dataType, elements in kernel.fields %}
+          {{dataType}} *{{field_name}};
+          cudaMalloc(&{{field_name}}, {{elements}}*sizeof({{dataType}}));
+          cudaMemset({{field_name}}, 0, {{elements}}*sizeof({{dataType}}));  /* cudaMemset sets bytes: a fractional fill value is not possible, and the count must be in bytes */
+        {% endfor %}
+
+        dim3 blocks({{kernel.blocks[0]}}, {{kernel.blocks[1]}}, {{kernel.blocks[2]}});
+        dim3 grid({{kernel.grid[0]}}, {{kernel.grid[1]}}, {{kernel.grid[2]}});
+
+        for(int warmup = 1; warmup >= 0; --warmup) {
+            int repeat = 2;
+            if(warmup == 0) {
+                repeat = n_repeat;
+            }
+
+            {% if timing %}
+            double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
+            timing(&wcStartTime, &cpuStartTime);
+            {% endif %}
+
+            for (; repeat > 0; --repeat)
+            {
+                {{kernel.name}}<<<grid, blocks>>>({{kernel.call_argument_list}});
+            }
+
+            {% if timing %}
+            timing(&wcEndTime, &cpuEndTime);
+
+            if (warmup == 0)
+                printf("%s\t%e\n", "{{kernel.name}}", (wcEndTime - wcStartTime) / n_repeat);
+            {% endif %}
+        }
+        {% for field_name, dataType, elements in kernel.fields %}
+        cudaFree({{field_name}});
+        {% endfor %}
+    }
+    {% endfor %}
+}
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 70af02e..4ecd4d8 100755
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -6,7 +6,7 @@ import tempfile
 import pytest
 import pystencils as ps
 from pathlib import Path
-from pystencils_benchmark import generate_benchmark, Compiler
+from pystencils_benchmark import generate_benchmark, Compiler, generate_benchmark_gpu
 
 
 compilers = (Compiler.GCC, Compiler.GCCdebug, Compiler.Clang)
@@ -39,3 +39,18 @@ def test_generate(compiler, config_kwarg):
         subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
 
 
+def test_generate_gpu():
+    compiler = Compiler.NVCC
+    a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
+
+    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
+    def vadd():
+        a[0] @= b[0] + c[0]
+    kernel_vadd = ps.create_kernel(**vadd)
+
+    temp_dir = Path('/home/markus/pystencils_benchmark_testfolder')
+    generate_benchmark_gpu([kernel_vadd], temp_dir, compiler=compiler)
+    subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
+    subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
+
+
-- 
GitLab


From a0a17570d98777f0260b07204d866de7ce796c8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20H=C3=B6nig?= <jan.hoenig@fau.de>
Date: Mon, 21 Feb 2022 22:29:44 +0100
Subject: [PATCH 02/18] Working CUDA benchmark version. CUDA sources need the
 '.cu' suffix; otherwise nvcc hands them to the host C compiler.

---
 pystencils_benchmark/benchmark_gpu.py   | 7 +++++--
 pystencils_benchmark/templates/Makefile | 8 ++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/pystencils_benchmark/benchmark_gpu.py b/pystencils_benchmark/benchmark_gpu.py
index 67ce0dd..d68a31d 100644
--- a/pystencils_benchmark/benchmark_gpu.py
+++ b/pystencils_benchmark/benchmark_gpu.py
@@ -41,10 +41,11 @@ def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunctio
             f.write(header)
 
         source = kernel_source(kernel_ast)
-        with open(src_path / f'{kernel_name}.c', 'w+') as f:
+        # TODO CUDA specific suffix
+        with open(src_path / f'{kernel_name}.cu', 'w+') as f:
             f.write(source)
 
-    with open(src_path / 'main.c', 'w+') as f:
+    with open(src_path / 'main.cu', 'w+') as f:
         f.write(kernel_main(kernel_asts))
 
     copy_static_files(path)
@@ -77,6 +78,8 @@ def copy_static_files(path: Path) -> None:
             target_path = include_path / file_name
         elif file_name[-1] == 'c':
             target_path = src_path / file_name
+            # TODO CUDA specific suffix:
+            target_path = target_path.with_suffix('.cu')
         else:
             target_path = path / file_name
         with open(target_path, 'w+') as f:
diff --git a/pystencils_benchmark/templates/Makefile b/pystencils_benchmark/templates/Makefile
index 98fcaaa..ea38570 100644
--- a/pystencils_benchmark/templates/Makefile
+++ b/pystencils_benchmark/templates/Makefile
@@ -14,6 +14,8 @@ INCLUDES  += -I./include
 VPATH     = $(SRC_DIR)
 ASM       = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
 OBJ       = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c))
+# TODO CUDA specific SUFFIX
+OBJ       += $(patsubst $(SRC_DIR)/%.cu, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cu))
 CFLAGS := $(CFLAGS) $(DEFINES) $(INCLUDES)
 
 
@@ -28,6 +30,12 @@ $(BUILD_DIR)/%.o:  %.c
 	$(Q)$(CC) -c $(CFLAGS) $< -o $@
 	$(Q)$(CC) $(CFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
 
+# TODO CUDA specific SUFFIX
+$(BUILD_DIR)/%.o:  %.cu
+	@echo "===>  COMPILE  $@"
+	$(Q)$(CC) -c $(CFLAGS) $< -o $@
+	$(Q)$(CC) $(CFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
+
 $(BUILD_DIR)/%.s:  %.c
 	@echo "===>  GENERATE ASM  $@"
 	$(Q)$(CC) -S $(CFLAGS) $< -o $@
-- 
GitLab

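At this point the series produces a buildable CUDA benchmark. A minimal driver, mirroring tests/test_benchmark.py from patch 01, might look as follows; this is a sketch, not part of the patch series: the output directory name is illustrative, the field size matches the test, and building requires nvcc on the PATH.

    import subprocess
    from pathlib import Path

    import numpy as np
    import pystencils as ps
    from pystencils_benchmark import generate_benchmark_gpu, Compiler

    # Vector-add kernel, as in the GPU test added by patch 01.
    a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))

    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
    def vadd():
        a[0] @= b[0] + c[0]

    kernel_vadd = ps.create_kernel(**vadd)

    out_dir = Path('benchmark-gpu-out')  # hypothetical output directory
    generate_benchmark_gpu([kernel_vadd], out_dir, compiler=Compiler.NVCC)

    # The generated Makefile places the binary in the project directory;
    # the single argument is the number of repetitions.
    subprocess.run(['make', '-C', str(out_dir)], check=True)
    subprocess.run([str(out_dir / f'benchmark-{Compiler.NVCC.name}'), '10'], check=True)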

From c51eee431ec28df93b22265fe1cd6ac795197956 Mon Sep 17 00:00:00 2001
From: markus holzer <markus.holzer@fau.de>
Date: Sun, 20 Feb 2022 07:58:05 +0100
Subject: [PATCH 03/18] Added CUDA benchmarks

---
 pystencils_benchmark/__init__.py              |   1 +
 pystencils_benchmark/benchmark.py             |   8 +-
 pystencils_benchmark/benchmark_gpu.py         | 170 ++++++++++++++++++
 pystencils_benchmark/enums.py                 |   1 +
 pystencils_benchmark/templates/NVCC.mk        |  12 ++
 .../templates/{ => cpu}/kernel.c              |   0
 .../templates/{ => cpu}/kernel.h              |   0
 .../templates/{ => cpu}/main.c                |   0
 pystencils_benchmark/templates/gpu/kernel.cu  |   8 +
 pystencils_benchmark/templates/gpu/kernel.h   |  11 ++
 pystencils_benchmark/templates/gpu/main.c     |  66 +++++++
 tests/test_benchmark.py                       |  17 +-
 12 files changed, 289 insertions(+), 5 deletions(-)
 create mode 100644 pystencils_benchmark/benchmark_gpu.py
 create mode 100644 pystencils_benchmark/templates/NVCC.mk
 rename pystencils_benchmark/templates/{ => cpu}/kernel.c (100%)
 rename pystencils_benchmark/templates/{ => cpu}/kernel.h (100%)
 rename pystencils_benchmark/templates/{ => cpu}/main.c (100%)
 create mode 100644 pystencils_benchmark/templates/gpu/kernel.cu
 create mode 100644 pystencils_benchmark/templates/gpu/kernel.h
 create mode 100644 pystencils_benchmark/templates/gpu/main.c

diff --git a/pystencils_benchmark/__init__.py b/pystencils_benchmark/__init__.py
index 6f5f32c..8142abe 100644
--- a/pystencils_benchmark/__init__.py
+++ b/pystencils_benchmark/__init__.py
@@ -1,2 +1,3 @@
 from .enums import Compiler
 from .benchmark import generate_benchmark, kernel_header, kernel_source
+from .benchmark_gpu import generate_benchmark_gpu
diff --git a/pystencils_benchmark/benchmark.py b/pystencils_benchmark/benchmark.py
index df6bd9b..3247800 100644
--- a/pystencils_benchmark/benchmark.py
+++ b/pystencils_benchmark/benchmark.py
@@ -6,7 +6,7 @@ from jinja2 import Environment, PackageLoader, StrictUndefined
 import numpy as np
 
 from pystencils.backends.cbackend import generate_c, get_headers
-from pystencils.astnodes import KernelFunction, PragmaBlock
+from pystencils.astnodes import KernelFunction
 from pystencils.enums import Backend
 from pystencils.typing import get_base_type
 from pystencils.sympyextensions import prod
@@ -159,7 +159,7 @@ def kernel_main(kernels_ast: List[KernelFunction], *,
         'likwid': likwid,
     }
 
-    main = _env.get_template('main.c').render(**jinja_context)
+    main = _env.get_template('cpu/main.c').render(**jinja_context)
     return main
 
 
@@ -172,7 +172,7 @@ def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> s
         'function_signature': function_signature,
     }
 
-    header = _env.get_template('kernel.h').render(**jinja_context)
+    header = _env.get_template('cpu/kernel.h').render(**jinja_context)
     return header
 
 
@@ -188,5 +188,5 @@ def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> s
         'timing': True,
     }
 
-    source = _env.get_template('kernel.c').render(**jinja_context)
+    source = _env.get_template('cpu/kernel.c').render(**jinja_context)
     return source
diff --git a/pystencils_benchmark/benchmark_gpu.py b/pystencils_benchmark/benchmark_gpu.py
new file mode 100644
index 0000000..67ce0dd
--- /dev/null
+++ b/pystencils_benchmark/benchmark_gpu.py
@@ -0,0 +1,170 @@
+from typing import Union, List
+from collections import namedtuple
+from pathlib import Path
+from jinja2 import Environment, PackageLoader, StrictUndefined
+
+from pystencils.backends.cbackend import generate_c, get_headers
+from pystencils.astnodes import KernelFunction
+from pystencils.enums import Backend
+from pystencils.data_types import get_base_type
+from pystencils.sympyextensions import prod
+from pystencils.transformations import get_common_shape
+from pystencils.gpucuda import BlockIndexing
+
+from pystencils_benchmark.enums import Compiler
+
+_env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=StrictUndefined, keep_trailing_newline=True,
+                   trim_blocks=True, lstrip_blocks=True)
+
+
+def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunction]],
+                           path: Path = None,
+                           *,
+                           compiler: Compiler = Compiler.GCC) -> None:
+    if path is None:
+        path = Path('.')
+    else:
+        path.mkdir(parents=True, exist_ok=True)
+    src_path = path / 'src'
+    src_path.mkdir(parents=True, exist_ok=True)
+    include_path = path / 'include'
+    include_path.mkdir(parents=True, exist_ok=True)
+
+    if isinstance(kernel_asts, KernelFunction):
+        kernel_asts = [kernel_asts]
+
+    for kernel_ast in kernel_asts:
+        kernel_name = kernel_ast.function_name
+
+        header = kernel_header(kernel_ast)
+        with open(include_path / f'{kernel_name}.h', 'w+') as f:
+            f.write(header)
+
+        source = kernel_source(kernel_ast)
+        with open(src_path / f'{kernel_name}.c', 'w+') as f:
+            f.write(source)
+
+    with open(src_path / 'main.c', 'w+') as f:
+        f.write(kernel_main(kernel_asts))
+
+    copy_static_files(path)
+    compiler_toolchain(path, compiler)
+
+
+def compiler_toolchain(path: Path, compiler: Compiler) -> None:
+    name = compiler.name
+    jinja_context = {
+        'compiler': name,
+    }
+
+    files = ['Makefile', f'{name}.mk']
+    for file_name in files:
+        with open(path / file_name, 'w+') as f:
+            template = _env.get_template(file_name).render(**jinja_context)
+            f.write(template)
+
+
+def copy_static_files(path: Path) -> None:
+    src_path = path / 'src'
+    src_path.mkdir(parents=True, exist_ok=True)
+    include_path = path / 'include'
+    include_path.mkdir(parents=True, exist_ok=True)
+
+    files = ['timing.h', 'timing.c']
+    for file_name in files:
+        template = _env.get_template(file_name).render()
+        if file_name[-1] == 'h':
+            target_path = include_path / file_name
+        elif file_name[-1] == 'c':
+            target_path = src_path / file_name
+        else:
+            target_path = path / file_name
+        with open(target_path, 'w+') as f:
+            f.write(template)
+
+
+def kernel_main(kernels_ast: List[KernelFunction], timing: bool = True, cuda_block_size: tuple = (32, 1, 1)):
+    """
+    Return the source of a benchmark main program for the given kernels.
+
+    Args:
+        kernels_ast: list of pystencils AST objects, as returned by create_kernel, to benchmark
+        timing: add timing output to the code; prints the time per iteration to stdout
+        cuda_block_size: CUDA thread block dimensions for launching the kernels
+    Returns:
+        C code as string
+    """
+    Kernel = namedtuple('Kernel', ['name', 'constants', 'fields', 'call_parameters',
+                                   'call_argument_list', 'blocks', 'grid'])
+    kernels = []
+    includes = set()
+    for kernel in kernels_ast:
+        name = kernel.function_name
+        accessed_fields = {f.name: f for f in kernel.fields_accessed}
+        constants = []
+        fields = []
+        call_parameters = []
+        block_and_thread_numbers = dict()
+        for p in kernel.get_parameters():
+            if not p.is_field_parameter:
+                constants.append((p.symbol.name, str(p.symbol.dtype)))
+                call_parameters.append(p.symbol.name)
+            else:
+                assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
+                field = accessed_fields[p.field_name]
+                dtype = str(get_base_type(p.symbol.dtype))
+                elements = prod(field.shape)
+
+                fields.append((p.field_name, dtype, elements))
+                call_parameters.append(p.field_name)
+
+            common_shape = get_common_shape(kernel.fields_accessed)
+            indexing = kernel.indexing
+            block_and_thread_numbers = indexing.call_parameters(common_shape)
+            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
+            block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
+
+        kernels.append(Kernel(name=name, fields=fields, constants=constants, call_parameters=call_parameters,
+                              call_argument_list=",".join(call_parameters),
+                              blocks=block_and_thread_numbers['block'], grid=block_and_thread_numbers['grid']))
+        includes.add(name)
+
+    jinja_context = {
+        'kernels': kernels,
+        'includes': includes,
+        'timing': timing,
+    }
+
+    main = _env.get_template('gpu/main.c').render(**jinja_context)
+    return main
+
+
+def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
+    function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)
+    header_guard = f'_{kernel_ast.function_name.upper()}_H'
+
+    jinja_context = {
+        'header_guard': header_guard,
+        'function_signature': function_signature,
+        'target': 'gpu'
+    }
+
+    header = _env.get_template('gpu/kernel.h').render(**jinja_context)
+    return header
+
+
+def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
+    kernel_name = kernel_ast.function_name
+    function_source = generate_c(kernel_ast, dialect=dialect)
+    headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'}
+    headers.update(get_headers(kernel_ast))
+
+    jinja_context = {
+        'function_source': function_source,
+        'headers': sorted(headers),
+        'timing': True,
+        'target': 'gpu'
+    }
+
+    source = _env.get_template('gpu/kernel.cu').render(**jinja_context)
+    return source
diff --git a/pystencils_benchmark/enums.py b/pystencils_benchmark/enums.py
index ec56c8a..84cf49e 100644
--- a/pystencils_benchmark/enums.py
+++ b/pystencils_benchmark/enums.py
@@ -6,3 +6,4 @@ class Compiler(Enum):
     GCCdebug = auto()
     Clang = auto()
     ICC = auto()
+    NVCC = auto()
diff --git a/pystencils_benchmark/templates/NVCC.mk b/pystencils_benchmark/templates/NVCC.mk
new file mode 100644
index 0000000..71010ab
--- /dev/null
+++ b/pystencils_benchmark/templates/NVCC.mk
@@ -0,0 +1,12 @@
+CC  = nvcc
+LINKER = $(CC)
+
+# More warnings, please (host-compiler flags; nvcc needs -Xcompiler to forward them)
+#CFLAGS   += -Wfloat-equal -Wundef -Wshadow -Wpointer-arith -Wcast-align  -Wstrict-overflow=5 -Wwrite-strings -Waggregate-return
+# Possibly too many warnings
+#CFLAGS   += -Wcast-qual -Wswitch-default -Wconversion -Wunreachable-code
+# Specific C flags
+CFLAGS   := -use_fast_math
+DEFINES  = -D_GNU_SOURCE -DNDEBUG
+INCLUDES =
+LIBS     =
diff --git a/pystencils_benchmark/templates/kernel.c b/pystencils_benchmark/templates/cpu/kernel.c
similarity index 100%
rename from pystencils_benchmark/templates/kernel.c
rename to pystencils_benchmark/templates/cpu/kernel.c
diff --git a/pystencils_benchmark/templates/kernel.h b/pystencils_benchmark/templates/cpu/kernel.h
similarity index 100%
rename from pystencils_benchmark/templates/kernel.h
rename to pystencils_benchmark/templates/cpu/kernel.h
diff --git a/pystencils_benchmark/templates/main.c b/pystencils_benchmark/templates/cpu/main.c
similarity index 100%
rename from pystencils_benchmark/templates/main.c
rename to pystencils_benchmark/templates/cpu/main.c
diff --git a/pystencils_benchmark/templates/gpu/kernel.cu b/pystencils_benchmark/templates/gpu/kernel.cu
new file mode 100644
index 0000000..973369f
--- /dev/null
+++ b/pystencils_benchmark/templates/gpu/kernel.cu
@@ -0,0 +1,8 @@
+{% for header in headers %}
+#include {{header}}
+{% endfor %}
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX __global__
+
+{{function_source}}
diff --git a/pystencils_benchmark/templates/gpu/kernel.h b/pystencils_benchmark/templates/gpu/kernel.h
new file mode 100644
index 0000000..102d9c9
--- /dev/null
+++ b/pystencils_benchmark/templates/gpu/kernel.h
@@ -0,0 +1,11 @@
+#ifndef {{header_guard}}
+#define {{header_guard}}
+
+
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX __global__
+
+{{function_signature}};
+
+#endif
diff --git a/pystencils_benchmark/templates/gpu/main.c b/pystencils_benchmark/templates/gpu/main.c
new file mode 100644
index 0000000..b2f3571
--- /dev/null
+++ b/pystencils_benchmark/templates/gpu/main.c
@@ -0,0 +1,66 @@
+#include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "timing.h"
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX __global__
+#include <cuda_runtime.h>
+
+// kernels
+{% for include in includes %}
+#include "{{ include }}.h"
+{% endfor %}
+
+int main(int argc, char **argv)
+{
+    if(argc < 2) {
+        printf("Usage: %s <n_repeat>\n", argv[0]);
+        return -1;
+    }
+    int n_repeat = atoi(argv[1]);
+    {% for kernel in kernels %}
+
+    { // Kernel: {{kernel.name}}
+        {% for field_name, dataType, elements in kernel.fields %}
+          {{dataType}} *{{field_name}};
+          cudaMalloc(&{{field_name}}, {{elements}}*sizeof({{dataType}}));
+          cudaMemset({{field_name}}, 0, {{elements}}*sizeof({{dataType}}));  /* cudaMemset sets bytes: a fractional fill value is not possible, and the count must be in bytes */
+        {% endfor %}
+
+        dim3 blocks({{kernel.blocks[0]}}, {{kernel.blocks[1]}}, {{kernel.blocks[2]}});
+        dim3 grid({{kernel.grid[0]}}, {{kernel.grid[1]}}, {{kernel.grid[2]}});
+
+        for(int warmup = 1; warmup >= 0; --warmup) {
+            int repeat = 2;
+            if(warmup == 0) {
+                repeat = n_repeat;
+            }
+
+            {% if timing %}
+            double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
+            timing(&wcStartTime, &cpuStartTime);
+            {% endif %}
+
+            for (; repeat > 0; --repeat)
+            {
+                {{kernel.name}}<<<grid, blocks>>>({{kernel.call_argument_list}});
+            }
+
+            {% if timing %}
+            timing(&wcEndTime, &cpuEndTime);
+
+            if (warmup == 0)
+                printf("%s\t%e\n", "{{kernel.name}}", (wcEndTime - wcStartTime) / n_repeat);
+            {% endif %}
+        }
+        {% for field_name, dataType, elements in kernel.fields %}
+        cudaFree({{field_name}});
+        {% endfor %}
+    }
+    {% endfor %}
+}
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 70af02e..4ecd4d8 100755
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -6,7 +6,7 @@ import tempfile
 import pytest
 import pystencils as ps
 from pathlib import Path
-from pystencils_benchmark import generate_benchmark, Compiler
+from pystencils_benchmark import generate_benchmark, Compiler, generate_benchmark_gpu
 
 
 compilers = (Compiler.GCC, Compiler.GCCdebug, Compiler.Clang)
@@ -39,3 +39,18 @@ def test_generate(compiler, config_kwarg):
         subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
 
 
+def test_generate_gpu():
+    compiler = Compiler.NVCC
+    a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
+
+    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
+    def vadd():
+        a[0] @= b[0] + c[0]
+    kernel_vadd = ps.create_kernel(**vadd)
+
+    temp_dir = Path('/home/markus/pystencils_benchmark_testfolder')
+    generate_benchmark_gpu([kernel_vadd], temp_dir, compiler=compiler)
+    subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
+    subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
+
+
-- 
GitLab


From 96c63098a1c8e5f40d2dfd4157a47e29601eb662 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20H=C3=B6nig?= <jan.hoenig@fau.de>
Date: Mon, 21 Feb 2022 22:29:44 +0100
Subject: [PATCH 04/18] Working CUDA benchmark version. CUDA sources need the
 '.cu' suffix; otherwise nvcc hands them to the host C compiler.

---
 pystencils_benchmark/benchmark_gpu.py   | 7 +++++--
 pystencils_benchmark/templates/Makefile | 8 ++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/pystencils_benchmark/benchmark_gpu.py b/pystencils_benchmark/benchmark_gpu.py
index 67ce0dd..d68a31d 100644
--- a/pystencils_benchmark/benchmark_gpu.py
+++ b/pystencils_benchmark/benchmark_gpu.py
@@ -41,10 +41,11 @@ def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunctio
             f.write(header)
 
         source = kernel_source(kernel_ast)
-        with open(src_path / f'{kernel_name}.c', 'w+') as f:
+        # TODO CUDA specific suffix
+        with open(src_path / f'{kernel_name}.cu', 'w+') as f:
             f.write(source)
 
-    with open(src_path / 'main.c', 'w+') as f:
+    with open(src_path / 'main.cu', 'w+') as f:
         f.write(kernel_main(kernel_asts))
 
     copy_static_files(path)
@@ -77,6 +78,8 @@ def copy_static_files(path: Path) -> None:
             target_path = include_path / file_name
         elif file_name[-1] == 'c':
             target_path = src_path / file_name
+            # TODO CUDA specific suffix:
+            target_path = target_path.with_suffix('.cu')
         else:
             target_path = path / file_name
         with open(target_path, 'w+') as f:
diff --git a/pystencils_benchmark/templates/Makefile b/pystencils_benchmark/templates/Makefile
index 66b68b8..d62c513 100644
--- a/pystencils_benchmark/templates/Makefile
+++ b/pystencils_benchmark/templates/Makefile
@@ -29,6 +29,8 @@ LIBS += -llikwid
 VPATH     = $(SRC_DIR)
 ASM       = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
 OBJ       = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c))
+# TODO CUDA specific SUFFIX
+OBJ       += $(patsubst $(SRC_DIR)/%.cu, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cu))
 CFLAGS := $(CFLAGS) $(DEFINES) $(INCLUDES)
 
 
@@ -43,6 +45,12 @@ $(BUILD_DIR)/%.o:  %.c
 	$(Q)$(CC) -c $(CFLAGS) $< -o $@
 	$(Q)$(CC) $(CFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
 
+# TODO CUDA specific SUFFIX
+$(BUILD_DIR)/%.o:  %.cu
+	@echo "===>  COMPILE  $@"
+	$(Q)$(CC) -c $(CFLAGS) $< -o $@
+	$(Q)$(CC) $(CFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
+
 $(BUILD_DIR)/%.s:  %.c
 	@echo "===>  GENERATE ASM  $@"
 	$(Q)$(CC) -S $(CFLAGS) $< -o $@
-- 
GitLab


From ac4b31c141659373cece8cd63ef6dd465dbd2a4f Mon Sep 17 00:00:00 2001
From: Christoph Alt <christoph.alt@fau.de>
Date: Tue, 8 Aug 2023 10:42:39 +0200
Subject: [PATCH 05/18] Updated imports to the new pystencils API

---
 pystencils_benchmark/benchmark_gpu.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pystencils_benchmark/benchmark_gpu.py b/pystencils_benchmark/benchmark_gpu.py
index d68a31d..1c4e24c 100644
--- a/pystencils_benchmark/benchmark_gpu.py
+++ b/pystencils_benchmark/benchmark_gpu.py
@@ -6,10 +6,10 @@ from jinja2 import Environment, PackageLoader, StrictUndefined
 from pystencils.backends.cbackend import generate_c, get_headers
 from pystencils.astnodes import KernelFunction
 from pystencils.enums import Backend
-from pystencils.data_types import get_base_type
+from pystencils.typing import get_base_type
 from pystencils.sympyextensions import prod
-from pystencils.transformations import get_common_shape
-from pystencils.gpucuda import BlockIndexing
+from pystencils.transformations import get_common_field
+# from pystencils.gpucuda import BlockIndexing
 
 from pystencils_benchmark.enums import Compiler
 
@@ -121,7 +121,7 @@ def kernel_main(kernels_ast: List[KernelFunction], timing: bool = True, cuda_blo
                 fields.append((p.field_name, dtype, elements))
                 call_parameters.append(p.field_name)
 
-            common_shape = get_common_shape(kernel.fields_accessed)
+            common_shape = get_common_field(kernel.fields_accessed).shape
             indexing = kernel.indexing
             block_and_thread_numbers = indexing.call_parameters(common_shape)
             block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
-- 
GitLab

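The replacement call yields the same shape tuple as before. A standalone sketch of the migration performed in kernel_main, where `kernel` stands for any pystencils KernelFunction as in the loop above:

    from pystencils.transformations import get_common_field

    # before this patch:
    #   from pystencils.transformations import get_common_shape
    #   common_shape = get_common_shape(kernel.fields_accessed)
    common_shape = get_common_field(kernel.fields_accessed).shape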

From 39b4029cc112ab29d818c2b95ee2225d23a11431 Mon Sep 17 00:00:00 2001
From: Christoph Alt <christoph.alt@fau.de>
Date: Tue, 8 Aug 2023 10:43:55 +0200
Subject: [PATCH 06/18] Exposing the CUDA block size option through the
 generate_benchmark function

---
 pystencils_benchmark/benchmark.py     |  2 +-
 pystencils_benchmark/benchmark_gpu.py | 12 +++++++++---
 tests/test_benchmark.py               |  6 +++---
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/pystencils_benchmark/benchmark.py b/pystencils_benchmark/benchmark.py
index 3247800..0cc7b11 100644
--- a/pystencils_benchmark/benchmark.py
+++ b/pystencils_benchmark/benchmark.py
@@ -6,7 +6,7 @@ from jinja2 import Environment, PackageLoader, StrictUndefined
 import numpy as np
 
 from pystencils.backends.cbackend import generate_c, get_headers
-from pystencils.astnodes import KernelFunction
+from pystencils.astnodes import KernelFunction, PragmaBlock
 from pystencils.enums import Backend
 from pystencils.typing import get_base_type
 from pystencils.sympyextensions import prod
diff --git a/pystencils_benchmark/benchmark_gpu.py b/pystencils_benchmark/benchmark_gpu.py
index 1c4e24c..befd83d 100644
--- a/pystencils_benchmark/benchmark_gpu.py
+++ b/pystencils_benchmark/benchmark_gpu.py
@@ -20,7 +20,10 @@ _env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=Stric
 def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                            path: Path = None,
                            *,
-                           compiler: Compiler = Compiler.GCC) -> None:
+                           compiler: Compiler = Compiler.GCC,
+                           timing: bool = True,
+                           cuda_block_size: tuple = (32, 1, 1)
+                           ) -> None:
     if path is None:
         path = Path('.')
     else:
@@ -46,7 +49,9 @@ def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunctio
             f.write(source)
 
     with open(src_path / 'main.cu', 'w+') as f:
-        f.write(kernel_main(kernel_asts))
+        f.write(kernel_main(kernel_asts,
+                            timing=timing,
+                            cuda_block_size=cuda_block_size))
 
     copy_static_files(path)
     compiler_toolchain(path, compiler)
@@ -56,6 +61,7 @@ def compiler_toolchain(path: Path, compiler: Compiler) -> None:
     name = compiler.name
     jinja_context = {
         'compiler': name,
+        'likwid': False,
     }
 
     files = ['Makefile', f'{name}.mk']
@@ -86,7 +92,7 @@ def copy_static_files(path: Path) -> None:
             f.write(template)
 
 
-def kernel_main(kernels_ast: List[KernelFunction], timing: bool = True, cuda_block_size: tuple = (32, 1, 1)):
+def kernel_main(kernels_ast: List[KernelFunction], *, timing: bool = True, cuda_block_size: tuple):
     """
     Return C code of a benchmark program for the given kernel.
 
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 4ecd4d8..411eb83 100755
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -48,9 +48,9 @@ def test_generate_gpu():
         a[0] @= b[0] + c[0]
     kernel_vadd = ps.create_kernel(**vadd)
 
-    temp_dir = Path('/home/markus/pystencils_benchmark_testfolder')
+    temp_dir = Path('/tmp/pystencils_benchmark_testfolder')
     generate_benchmark_gpu([kernel_vadd], temp_dir, compiler=compiler)
-    subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
-    subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
+    # subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
+    # subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
 
 
-- 
GitLab

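With the new keyword arguments, callers can choose the launch configuration when generating the benchmark. A sketch reusing kernel_vadd and out_dir from the driver above; note that at this point cuda_block_size is forwarded to kernel_main, but the block and grid sizes emitted into main.cu still come from the kernel's own indexing:

    generate_benchmark_gpu([kernel_vadd], out_dir,
                           compiler=Compiler.NVCC,
                           timing=True,                  # print time per iteration
                           cuda_block_size=(128, 1, 1))  # forwarded to kernel_main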

From 3300460d08ee976674a331b01545207d1511d020 Mon Sep 17 00:00:00 2001
From: Christoph Alt <christoph.alt@fau.de>
Date: Tue, 8 Aug 2023 13:41:28 +0200
Subject: [PATCH 07/18] Streamlined the GPU test to match the CPU tests

---
 tests/test_benchmark.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 6e2dbf8..48207da 100755
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -48,7 +48,8 @@ def test_generate_gpu():
         a[0] @= b[0] + c[0]
     kernel_vadd = ps.create_kernel(**vadd)
 
-    temp_dir = Path('/tmp/pystencils_benchmark_testfolder')
-    generate_benchmark_gpu([kernel_vadd], temp_dir, compiler=compiler)
-    # subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
-    # subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
+    with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir:
+        temp_dir = Path(temp_dir)
+        generate_benchmark_gpu(kernel_vadd, temp_dir, compiler=compiler)
+        # subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
+        # subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
-- 
GitLab


From 3e930f35565af4cd669e63fff149dfce8297fc04 Mon Sep 17 00:00:00 2001
From: Christoph Alt <christoph.alt@fau.de>
Date: Tue, 8 Aug 2023 13:41:53 +0200
Subject: [PATCH 08/18] Removed code duplication between benchmark and
 benchmark_gpu; moved shared functions to a separate file

---
 pystencils_benchmark/benchmark.py     | 80 +++-------------------
 pystencils_benchmark/benchmark_gpu.py | 96 ++++++--------------------
 pystencils_benchmark/common.py        | 97 +++++++++++++++++++++++++++
 3 files changed, 126 insertions(+), 147 deletions(-)
 create mode 100644 pystencils_benchmark/common.py

diff --git a/pystencils_benchmark/benchmark.py b/pystencils_benchmark/benchmark.py
index 0cc7b11..4258ee5 100644
--- a/pystencils_benchmark/benchmark.py
+++ b/pystencils_benchmark/benchmark.py
@@ -1,24 +1,24 @@
 from typing import Union, List
 from collections import namedtuple
 from pathlib import Path
-from jinja2 import Environment, PackageLoader, StrictUndefined
 
 import numpy as np
 
-from pystencils.backends.cbackend import generate_c, get_headers
 from pystencils.astnodes import KernelFunction, PragmaBlock
 from pystencils.enums import Backend
 from pystencils.typing import get_base_type
 from pystencils.sympyextensions import prod
 from pystencils.integer_functions import modulo_ceil
 
+from pystencils_benchmark.common import (_env,
+                                         _kernel_source,
+                                         _kernel_header,
+                                         compiler_toolchain,
+                                         copy_static_files,
+                                         setup_directories)
 from pystencils_benchmark.enums import Compiler
 
 
-_env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=StrictUndefined, keep_trailing_newline=True,
-                   trim_blocks=True, lstrip_blocks=True)
-
-
 def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                        path: Path = None,
                        *,
@@ -26,14 +26,8 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                        timing: bool = True,
                        likwid: bool = False
                        ) -> None:
-    if path is None:
-        path = Path('.')
-    else:
-        path.mkdir(parents=True, exist_ok=True)
-    src_path = path / 'src'
-    src_path.mkdir(parents=True, exist_ok=True)
-    include_path = path / 'include'
-    include_path.mkdir(parents=True, exist_ok=True)
+
+    src_path, include_path = setup_directories(path)
 
     if isinstance(kernel_asts, KernelFunction):
         kernel_asts = [kernel_asts]
@@ -56,39 +50,6 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
     compiler_toolchain(path, compiler, likwid)
 
 
-def compiler_toolchain(path: Path, compiler: Compiler, likwid: bool) -> None:
-    name = compiler.name
-    jinja_context = {
-        'compiler': name,
-        'likwid': likwid,
-    }
-
-    files = ['Makefile', f'{name}.mk']
-    for file_name in files:
-        with open(path / file_name, 'w+') as f:
-            template = _env.get_template(file_name).render(**jinja_context)
-            f.write(template)
-
-
-def copy_static_files(path: Path) -> None:
-    src_path = path / 'src'
-    src_path.mkdir(parents=True, exist_ok=True)
-    include_path = path / 'include'
-    include_path.mkdir(parents=True, exist_ok=True)
-
-    files = ['timing.h', 'timing.c']
-    for file_name in files:
-        template = _env.get_template(file_name).render()
-        if file_name[-1] == 'h':
-            target_path = include_path / file_name
-        elif file_name[-1] == 'c':
-            target_path = src_path / file_name
-        else:
-            target_path = path / file_name
-        with open(target_path, 'w+') as f:
-            f.write(template)
-
-
 def kernel_main(kernels_ast: List[KernelFunction], *,
                 timing: bool = True, likwid: bool = False) -> str:
     """
@@ -164,29 +125,8 @@ def kernel_main(kernels_ast: List[KernelFunction], *,
 
 
 def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
-    function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)
-    header_guard = f'_{kernel_ast.function_name.upper()}_H'
-
-    jinja_context = {
-        'header_guard': header_guard,
-        'function_signature': function_signature,
-    }
-
-    header = _env.get_template('cpu/kernel.h').render(**jinja_context)
-    return header
+    return _kernel_header(kernel_ast, dialect=dialect, template_file='cpu/kernel.h')
 
 
 def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
-    kernel_name = kernel_ast.function_name
-    function_source = generate_c(kernel_ast, dialect=dialect)
-    headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'}
-    headers.update(get_headers(kernel_ast))
-
-    jinja_context = {
-        'function_source': function_source,
-        'headers': sorted(headers),
-        'timing': True,
-    }
-
-    source = _env.get_template('cpu/kernel.c').render(**jinja_context)
-    return source
+    return _kernel_source(kernel_ast, dialect=dialect, template_file='cpu/kernel.c')
diff --git a/pystencils_benchmark/benchmark_gpu.py b/pystencils_benchmark/benchmark_gpu.py
index befd83d..d0ccbe1 100644
--- a/pystencils_benchmark/benchmark_gpu.py
+++ b/pystencils_benchmark/benchmark_gpu.py
@@ -1,37 +1,31 @@
 from typing import Union, List
 from collections import namedtuple
 from pathlib import Path
-from jinja2 import Environment, PackageLoader, StrictUndefined
 
-from pystencils.backends.cbackend import generate_c, get_headers
 from pystencils.astnodes import KernelFunction
 from pystencils.enums import Backend
 from pystencils.typing import get_base_type
 from pystencils.sympyextensions import prod
 from pystencils.transformations import get_common_field
-# from pystencils.gpucuda import BlockIndexing
 
+from pystencils_benchmark.common import (_env,
+                                         _kernel_source,
+                                         _kernel_header,
+                                         compiler_toolchain,
+                                         copy_static_files,
+                                         setup_directories)
 from pystencils_benchmark.enums import Compiler
 
-_env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=StrictUndefined, keep_trailing_newline=True,
-                   trim_blocks=True, lstrip_blocks=True)
-
 
 def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                            path: Path = None,
                            *,
-                           compiler: Compiler = Compiler.GCC,
+                           compiler: Compiler = Compiler.NVCC,
                            timing: bool = True,
                            cuda_block_size: tuple = (32, 1, 1)
                            ) -> None:
-    if path is None:
-        path = Path('.')
-    else:
-        path.mkdir(parents=True, exist_ok=True)
-    src_path = path / 'src'
-    src_path.mkdir(parents=True, exist_ok=True)
-    include_path = path / 'include'
-    include_path.mkdir(parents=True, exist_ok=True)
+
+    src_path, include_path = setup_directories(path)
 
     if isinstance(kernel_asts, KernelFunction):
         kernel_asts = [kernel_asts]
@@ -53,43 +47,8 @@ def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunctio
                             timing=timing,
                             cuda_block_size=cuda_block_size))
 
-    copy_static_files(path)
-    compiler_toolchain(path, compiler)
-
-
-def compiler_toolchain(path: Path, compiler: Compiler) -> None:
-    name = compiler.name
-    jinja_context = {
-        'compiler': name,
-        'likwid': False,
-    }
-
-    files = ['Makefile', f'{name}.mk']
-    for file_name in files:
-        with open(path / file_name, 'w+') as f:
-            template = _env.get_template(file_name).render(**jinja_context)
-            f.write(template)
-
-
-def copy_static_files(path: Path) -> None:
-    src_path = path / 'src'
-    src_path.mkdir(parents=True, exist_ok=True)
-    include_path = path / 'include'
-    include_path.mkdir(parents=True, exist_ok=True)
-
-    files = ['timing.h', 'timing.c']
-    for file_name in files:
-        template = _env.get_template(file_name).render()
-        if file_name[-1] == 'h':
-            target_path = include_path / file_name
-        elif file_name[-1] == 'c':
-            target_path = src_path / file_name
-            # TODO CUDA specific suffix:
-            target_path = target_path.with_suffix('.cu')
-        else:
-            target_path = path / file_name
-        with open(target_path, 'w+') as f:
-            f.write(template)
+    copy_static_files(path, source_file_suffix='.cu')
+    compiler_toolchain(path, compiler, likwid=False)
 
 
 def kernel_main(kernels_ast: List[KernelFunction], *, timing: bool = True, cuda_block_size: tuple):
@@ -149,31 +108,14 @@ def kernel_main(kernels_ast: List[KernelFunction], *, timing: bool = True, cuda_
 
 
 def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
-    function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)
-    header_guard = f'_{kernel_ast.function_name.upper()}_H'
-
-    jinja_context = {
-        'header_guard': header_guard,
-        'function_signature': function_signature,
-        'target': 'gpu'
-    }
-
-    header = _env.get_template('gpu/kernel.h').render(**jinja_context)
-    return header
+    return _kernel_header(kernel_ast,
+                          dialect=dialect,
+                          template_file='gpu/kernel.h',
+                          additional_jinja_context={'target': 'gpu'})
 
 
 def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
-    kernel_name = kernel_ast.function_name
-    function_source = generate_c(kernel_ast, dialect=dialect)
-    headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'}
-    headers.update(get_headers(kernel_ast))
-
-    jinja_context = {
-        'function_source': function_source,
-        'headers': sorted(headers),
-        'timing': True,
-        'target': 'gpu'
-    }
-
-    source = _env.get_template('gpu/kernel.cu').render(**jinja_context)
-    return source
+    return _kernel_source(kernel_ast,
+                          dialect=dialect,
+                          template_file='gpu/kernel.cu',
+                          additional_jinja_context={'target': 'gpu'})
diff --git a/pystencils_benchmark/common.py b/pystencils_benchmark/common.py
new file mode 100644
index 0000000..beeeed6
--- /dev/null
+++ b/pystencils_benchmark/common.py
@@ -0,0 +1,97 @@
+from pystencils.backends.cbackend import generate_c, get_headers
+from pystencils.astnodes import KernelFunction
+from pystencils.enums import Backend
+from jinja2 import Environment, PackageLoader, StrictUndefined
+
+from pystencils_benchmark.enums import Compiler
+from pathlib import Path
+
+_env = Environment(loader=PackageLoader('pystencils_benchmark'),
+                   undefined=StrictUndefined,
+                   keep_trailing_newline=True,
+                   trim_blocks=True, lstrip_blocks=True)
+
+
+def _kernel_header(kernel_ast: KernelFunction,
+                   dialect: Backend = Backend.C,
+                   *,
+                   template_file: str,
+                   additional_jinja_context: dict = {}) -> str:
+    function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)
+    header_guard = f'_{kernel_ast.function_name.upper()}_H'
+
+    jinja_context = {
+        'header_guard': header_guard,
+        'function_signature': function_signature,
+        **additional_jinja_context
+    }
+
+    header = _env.get_template(template_file).render(**jinja_context)
+    return header
+
+
+def _kernel_source(kernel_ast: KernelFunction,
+                   dialect: Backend = Backend.C,
+                   *,
+                   template_file: str,
+                   additional_jinja_context: dict = {}) -> str:
+    kernel_name = kernel_ast.function_name
+    function_source = generate_c(kernel_ast, dialect=dialect)
+    headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'}
+    headers.update(get_headers(kernel_ast))
+
+    jinja_context = {
+        'function_source': function_source,
+        'headers': sorted(headers),
+        'timing': True,
+        **additional_jinja_context,
+    }
+
+    source = _env.get_template(template_file).render(**jinja_context)
+    return source
+
+
+def compiler_toolchain(path: Path, compiler: Compiler, likwid: bool) -> None:
+    name = compiler.name
+    jinja_context = {
+        'compiler': name,
+        'likwid': likwid,
+    }
+
+    files = ['Makefile', f'{name}.mk']
+    for file_name in files:
+        with open(path / file_name, 'w+') as f:
+            template = _env.get_template(file_name).render(**jinja_context)
+            f.write(template)
+
+
+def copy_static_files(path: Path, *, source_file_suffix='.c') -> None:
+    src_path = path / 'src'
+    src_path.mkdir(parents=True, exist_ok=True)
+    include_path = path / 'include'
+    include_path.mkdir(parents=True, exist_ok=True)
+
+    files = ['timing.h', 'timing.c']
+    for file_name in files:
+        template = _env.get_template(file_name).render()
+        if file_name[-1] == 'h':
+            target_path = include_path / file_name
+        elif file_name[-1] == 'c':
+            target_path = src_path / file_name
+            target_path = target_path.with_suffix(source_file_suffix)
+        else:
+            target_path = path / file_name
+        with open(target_path, 'w+') as f:
+            f.write(template)
+
+
+def setup_directories(path: Path):
+    if path is None:
+        path = Path('.')
+    else:
+        path.mkdir(parents=True, exist_ok=True)
+    src_path = path / 'src'
+    src_path.mkdir(parents=True, exist_ok=True)
+    include_path = path / 'include'
+    include_path.mkdir(parents=True, exist_ok=True)
+    return src_path, include_path
-- 
GitLab
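
A minimal sketch of how the new shared helpers are meant to be called; it
mirrors the gpu/kernel.cu call site at the top of this patch, and the field
and kernel setup is only illustrative (any pystencils kernel AST works):

    import numpy as np
    import pystencils as ps
    from pystencils_benchmark.common import _kernel_source

    a, b = ps.fields(a=np.ones(1000), b=np.ones(1000))
    kernel_ast = ps.create_kernel([ps.Assignment(a.center, 2 * b.center)])

    # Render the kernel through the shared template machinery; the template
    # file and the extra jinja context are the only target-specific parts.
    source = _kernel_source(kernel_ast,
                            template_file='gpu/kernel.cu',
                            additional_jinja_context={'target': 'gpu'})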


From 24f81cf6e2c186862df5d9ab641e98a2be419167 Mon Sep 17 00:00:00 2001
From: Christoph Alt <christoph.alt@fau.de>
Date: Tue, 8 Aug 2023 14:11:30 +0200
Subject: [PATCH 09/18] added submodules for cpu and gpu benchmark generation

---
 pystencils_benchmark/__init__.py                   |  4 ++--
 pystencils_benchmark/cpu/__init__.py               |  1 +
 pystencils_benchmark/{ => cpu}/benchmark.py        |  0
 pystencils_benchmark/gpu/__init__.py               |  1 +
 .../{benchmark_gpu.py => gpu/benchmark.py}         | 14 +++++++-------
 tests/test_benchmark.py                            |  8 +++++---
 ve_example/test.py                                 | 14 +++++++-------
 7 files changed, 23 insertions(+), 19 deletions(-)
 create mode 100644 pystencils_benchmark/cpu/__init__.py
 rename pystencils_benchmark/{ => cpu}/benchmark.py (100%)
 create mode 100644 pystencils_benchmark/gpu/__init__.py
 rename pystencils_benchmark/{benchmark_gpu.py => gpu/benchmark.py} (92%)

diff --git a/pystencils_benchmark/__init__.py b/pystencils_benchmark/__init__.py
index 8142abe..86d8f56 100644
--- a/pystencils_benchmark/__init__.py
+++ b/pystencils_benchmark/__init__.py
@@ -1,3 +1,3 @@
 from .enums import Compiler
-from .benchmark import generate_benchmark, kernel_header, kernel_source
-from .benchmark_gpu import generate_benchmark_gpu
+from . import gpu
+from . import cpu
diff --git a/pystencils_benchmark/cpu/__init__.py b/pystencils_benchmark/cpu/__init__.py
new file mode 100644
index 0000000..cfd889e
--- /dev/null
+++ b/pystencils_benchmark/cpu/__init__.py
@@ -0,0 +1 @@
+from .benchmark import generate_benchmark
diff --git a/pystencils_benchmark/benchmark.py b/pystencils_benchmark/cpu/benchmark.py
similarity index 100%
rename from pystencils_benchmark/benchmark.py
rename to pystencils_benchmark/cpu/benchmark.py
diff --git a/pystencils_benchmark/gpu/__init__.py b/pystencils_benchmark/gpu/__init__.py
new file mode 100644
index 0000000..cfd889e
--- /dev/null
+++ b/pystencils_benchmark/gpu/__init__.py
@@ -0,0 +1 @@
+from .benchmark import generate_benchmark
diff --git a/pystencils_benchmark/benchmark_gpu.py b/pystencils_benchmark/gpu/benchmark.py
similarity index 92%
rename from pystencils_benchmark/benchmark_gpu.py
rename to pystencils_benchmark/gpu/benchmark.py
index d0ccbe1..d653d18 100644
--- a/pystencils_benchmark/benchmark_gpu.py
+++ b/pystencils_benchmark/gpu/benchmark.py
@@ -17,13 +17,13 @@ from pystencils_benchmark.common import (_env,
 from pystencils_benchmark.enums import Compiler
 
 
-def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunction]],
-                           path: Path = None,
-                           *,
-                           compiler: Compiler = Compiler.NVCC,
-                           timing: bool = True,
-                           cuda_block_size: tuple = (32, 1, 1)
-                           ) -> None:
+def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
+                       path: Path = None,
+                       *,
+                       compiler: Compiler = Compiler.NVCC,
+                       timing: bool = True,
+                       cuda_block_size: tuple = (32, 1, 1)
+                       ) -> None:
 
     src_path, include_path = setup_directories(path)
 
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 48207da..0c42c79 100755
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -6,7 +6,9 @@ import tempfile
 import pytest
 import pystencils as ps
 from pathlib import Path
-from pystencils_benchmark import generate_benchmark, Compiler, generate_benchmark_gpu
+
+from pystencils_benchmark import Compiler
+import pystencils_benchmark as pb
 
 
 compilers = (Compiler.GCC, Compiler.GCCdebug, Compiler.Clang)
@@ -34,7 +36,7 @@ def test_generate(compiler, config_kwarg):
 
     with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir:
         temp_dir = Path(temp_dir)
-        generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler)
+        pb.cpu.generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler)
         subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
         subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
 
@@ -50,6 +52,6 @@ def test_generate_gpu():
 
     with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir:
         temp_dir = Path(temp_dir)
-        generate_benchmark_gpu(kernel_vadd, temp_dir, compiler=compiler)
+        pb.gpu.generate_benchmark(kernel_vadd, temp_dir, compiler=compiler)
         # subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
         # subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
diff --git a/ve_example/test.py b/ve_example/test.py
index ee5e32b..9bb91d6 100755
--- a/ve_example/test.py
+++ b/ve_example/test.py
@@ -4,15 +4,16 @@ import subprocess
 import numpy as np
 import sympy as sp
 import pystencils as ps
-from pystencils_benchmark import generate_benchmark, Compiler
+import pystencils_benchmark as pb
 from pathlib import Path
 
 
-def generate(path: Path, compiler: Compiler):
+def generate(path: Path, compiler: pb.Compiler):
     a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
     alpha = sp.symbols('alpha')
 
     kernels = []
+
     @ps.kernel_config(ps.CreateKernelConfig())
     def vadd():
         a[0] @= b[0] + c[0]
@@ -33,20 +34,20 @@ def generate(path: Path, compiler: Compiler):
         b[0] @= alpha * a[0] + b[0]
     kernels.append(ps.create_kernel(**daxpy_vector))
 
-    generate_benchmark(kernels, path, compiler=compiler)
+    pb.cpu.generate_benchmark(kernels, path, compiler=compiler)
 
 
 def make(path: Path):
     subprocess.run(['make'], check=True)
 
 
-def execute(path: Path, compiler: Compiler):
+def execute(path: Path, compiler: pb.Compiler):
     subprocess.run([f'./benchmark-{compiler.name}', '100'], check=True)
 
 
 def main():
-    compiler = Compiler.GCCdebug
-    path = Path.cwd()
+    compiler = pb.Compiler.GCCdebug
+    path = Path.cwd() / 'generated'
     generate(path, compiler)
     make(path)
     execute(path, compiler)
@@ -54,4 +55,3 @@ def main():
 
 if __name__ == '__main__':
     main()
-
-- 
GitLab
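
With the package split in place, the public entry points move from
top-level functions to per-target submodules; a short usage sketch of the
new layout (the output path and kernel are placeholders):

    from pathlib import Path

    import numpy as np
    import pystencils as ps
    import pystencils_benchmark as pb

    a, b, c = ps.fields(a=np.ones(1000), b=np.ones(1000), c=np.ones(1000))
    kernel = ps.create_kernel([ps.Assignment(a.center, b.center + c.center)])

    # CPU benchmark generation now lives under pb.cpu; the GPU analogue is
    # pb.gpu.generate_benchmark with a GPU-targeted kernel AST.
    pb.cpu.generate_benchmark([kernel], Path('generated'), compiler=pb.Compiler.GCC)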


From 857f1848adce4d4292362c3366c7ec32204fd4ab Mon Sep 17 00:00:00 2001
From: Christoph Alt <christoph.alt@fau.de>
Date: Tue, 8 Aug 2023 14:16:57 +0200
Subject: [PATCH 10/18] removed the mutable default argument from the
 _kernel_header and _kernel_source functions

---
 pystencils_benchmark/common.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/pystencils_benchmark/common.py b/pystencils_benchmark/common.py
index beeeed6..70cabd6 100644
--- a/pystencils_benchmark/common.py
+++ b/pystencils_benchmark/common.py
@@ -16,15 +16,17 @@ def _kernel_header(kernel_ast: KernelFunction,
                    dialect: Backend = Backend.C,
                    *,
                    template_file: str,
-                   additional_jinja_context: dict = {}) -> str:
+                   additional_jinja_context: dict = None) -> str:
+
     function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)
     header_guard = f'_{kernel_ast.function_name.upper()}_H'
 
     jinja_context = {
         'header_guard': header_guard,
         'function_signature': function_signature,
-        **additional_jinja_context
     }
+    if additional_jinja_context is not None:
+        jinja_context.update(additional_jinja_context)
 
     header = _env.get_template(template_file).render(**jinja_context)
     return header
@@ -34,7 +36,8 @@ def _kernel_source(kernel_ast: KernelFunction,
                    dialect: Backend = Backend.C,
                    *,
                    template_file: str,
-                   additional_jinja_context: dict = {}) -> str:
+                   additional_jinja_context: dict = None) -> str:
+
     kernel_name = kernel_ast.function_name
     function_source = generate_c(kernel_ast, dialect=dialect)
     headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'}
@@ -44,9 +47,11 @@ def _kernel_source(kernel_ast: KernelFunction,
         'function_source': function_source,
         'headers': sorted(headers),
         'timing': True,
-        **additional_jinja_context,
     }
 
+    if additional_jinja_context is not None:
+        jinja_context.update(additional_jinja_context)
+
     source = _env.get_template(template_file).render(**jinja_context)
     return source
 
-- 
GitLab
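
The motivation is the classic Python pitfall this patch title names: a
default value is evaluated once, at definition time, so a mutable default
is shared across every call. The helpers above only read the dict, making
this defensive hardening rather than a live bug; a self-contained
illustration of the failure mode and the None-sentinel fix:

    def render(extra_context: dict = {}):     # one dict shared by all calls
        return extra_context

    first = render()
    first['target'] = 'gpu'                   # mutates the shared default
    print(render())                           # -> {'target': 'gpu'}

    def render_fixed(extra_context: dict = None):
        context = {'timing': True}            # fresh dict on every call
        if extra_context is not None:
            context.update(extra_context)
        return context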


From 9140da63f4446be8b1c9c5f58617654f5fffb485 Mon Sep 17 00:00:00 2001
From: Christoph Alt <christoph.alt@fau.de>
Date: Tue, 8 Aug 2023 16:22:11 +0200
Subject: [PATCH 11/18] Added a parameter to insert launch bounds into the
 kernel

---
 pystencils_benchmark/gpu/benchmark.py | 13 ++++++++++++-
 tests/test_benchmark.py               |  8 ++++++--
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/pystencils_benchmark/gpu/benchmark.py b/pystencils_benchmark/gpu/benchmark.py
index d653d18..5a4852c 100644
--- a/pystencils_benchmark/gpu/benchmark.py
+++ b/pystencils_benchmark/gpu/benchmark.py
@@ -17,12 +17,19 @@ from pystencils_benchmark.common import (_env,
 from pystencils_benchmark.enums import Compiler
 
 
+def _add_launch_bound(code: str, launch_bounds: tuple) -> str:
+    lb_str = f"__launch_bounds__({','.join(str(lb) for lb in launch_bounds)})"
+    splitted = code.split("void")
+    return splitted[0] + lb_str + "".join(splitted[1:])
+
+
 def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                        path: Path = None,
                        *,
                        compiler: Compiler = Compiler.NVCC,
                        timing: bool = True,
-                       cuda_block_size: tuple = (32, 1, 1)
+                       cuda_block_size: tuple = (32, 1, 1),
+                       launch_bounds: tuple = None,
                        ) -> None:
 
     src_path, include_path = setup_directories(path)
@@ -34,10 +41,14 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
         kernel_name = kernel_ast.function_name
 
         header = kernel_header(kernel_ast)
+        if launch_bounds:
+            header = _add_launch_bound(header, launch_bounds)
         with open(include_path / f'{kernel_name}.h', 'w+') as f:
             f.write(header)
 
         source = kernel_source(kernel_ast)
+        if launch_bounds:
+            source = _add_launch_bound(source, launch_bounds)
         # TODO CUDA specific suffix
         with open(src_path / f'{kernel_name}.cu', 'w+') as f:
             f.write(source)
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 0c42c79..929d4f8 100755
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -41,7 +41,11 @@ def test_generate(compiler, config_kwarg):
         subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
 
 
-def test_generate_gpu():
+gpu_kwargs = ({}, {'launch_bounds': (256,)}, {'launch_bounds': (256, 2)})
+
+
+@pytest.mark.parametrize('kwargs', gpu_kwargs)
+def test_generate_gpu(kwargs):
     compiler = Compiler.NVCC
     a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
 
@@ -52,6 +56,6 @@ def test_generate_gpu():
 
     with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir:
         temp_dir = Path(temp_dir)
-        pb.gpu.generate_benchmark(kernel_vadd, temp_dir, compiler=compiler)
+        pb.gpu.generate_benchmark(kernel_vadd, temp_dir, compiler=compiler, **kwargs)
         # subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
         # subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
-- 
GitLab
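
A REPL-style sketch of what this first version of _add_launch_bound does to
a generated signature (the helper is copied from the hunk above); splitting
on "void" consumes the keyword itself, which is the defect patch 13 below
fixes:

    def _add_launch_bound(code: str, launch_bounds: tuple) -> str:
        lb_str = f"__launch_bounds__({','.join(str(lb) for lb in launch_bounds)})"
        splitted = code.split("void")
        return splitted[0] + lb_str + "".join(splitted[1:])

    print(_add_launch_bound("void kernel(double *a)", (256,)))
    # -> __launch_bounds__(256) kernel(double *a)
    #    note the dropped 'void' return type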


From 4ee400e910d835c5c6535970fee423dad1a35798 Mon Sep 17 00:00:00 2001
From: Christoph Alt <typ@ohnebild.com>
Date: Wed, 9 Aug 2023 14:51:12 +0200
Subject: [PATCH 12/18] added the new packages to the setup.cfg and the new
 templates to the manifest so that packaging works correctly

---
 MANIFEST.in | 2 ++
 setup.cfg   | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/MANIFEST.in b/MANIFEST.in
index ef395db..38e6285 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,5 @@
 include README.md
 include LICENSE.md
 include pystencils_benchmark/templates/*
+include pystencils_benchmark/templates/cpu/*
+include pystencils_benchmark/templates/gpu/*
diff --git a/setup.cfg b/setup.cfg
index 4063797..8bc42e9 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -10,7 +10,10 @@ license = AGPLv3
 version = 0.0.1
 
 [options]
-packages = pystencils_benchmark
+packages = 
+    pystencils_benchmark
+    pystencils_benchmark.gpu
+    pystencils_benchmark.cpu
 install_requires =
     jinja2 >= 3.0
     pystencils >= 0.3.4
-- 
GitLab


From 4b1f3f5374dcef0c0e56f44c9679782080278bd7 Mon Sep 17 00:00:00 2001
From: Christoph Alt <christoph.alt@fau.de>
Date: Tue, 15 Aug 2023 09:44:27 +0200
Subject: [PATCH 13/18] fixed the _add_launch_bound helper and added some
 small tests

---
 pystencils_benchmark/gpu/benchmark.py | 10 +++++++---
 tests/test_launch_bounds.py           | 19 +++++++++++++++++++
 2 files changed, 26 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_launch_bounds.py

diff --git a/pystencils_benchmark/gpu/benchmark.py b/pystencils_benchmark/gpu/benchmark.py
index 5a4852c..96fb58c 100644
--- a/pystencils_benchmark/gpu/benchmark.py
+++ b/pystencils_benchmark/gpu/benchmark.py
@@ -18,9 +18,13 @@ from pystencils_benchmark.enums import Compiler
 
 
 def _add_launch_bound(code: str, launch_bounds: tuple) -> str:
-    lb_str = f"__launch_bounds__({','.join(str(lb) for lb in launch_bounds)})"
-    splitted = code.split("void")
-    return splitted[0] + lb_str + "".join(splitted[1:])
+    lb_str = f"__launch_bounds__({', '.join(str(lb) for lb in launch_bounds)}) "
+    splitted = code.split("void ")
+    prefix = splitted[0]
+    if code.startswith("void "):
+        # just in case there is nothing before the first void
+        prefix = ""
+    return prefix + "void " + lb_str + "void ".join(splitted[1:])
 
 
 def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
diff --git a/tests/test_launch_bounds.py b/tests/test_launch_bounds.py
new file mode 100644
index 0000000..48af06d
--- /dev/null
+++ b/tests/test_launch_bounds.py
@@ -0,0 +1,19 @@
+import numpy as np
+import pystencils as ps
+from pystencils_benchmark.gpu.benchmark import kernel_header, _add_launch_bound, kernel_source
+
+
+def test_launch_bounds():
+    a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
+
+    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
+    def vadd():
+        a[0] @= b[0] + c[0]
+    kernel_vadd = ps.create_kernel(**vadd)
+    launch_bounds = (256, 2)
+    header = kernel_header(kernel_vadd)
+    header = _add_launch_bound(header, launch_bounds)
+    assert "void __launch_bounds__(256, 2)" in header
+    source = kernel_source(kernel_vadd)
+    source = _add_launch_bound(source, launch_bounds)
+    assert "void __launch_bounds__(256, 2)" in source 
-- 
GitLab
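
With the reworked helper, the return type is re-inserted and the qualifier
lands between 'void' and the kernel name, which is exactly what the new
test asserts; a quick sketch of the fixed behavior:

    def _add_launch_bound(code: str, launch_bounds: tuple) -> str:
        lb_str = f"__launch_bounds__({', '.join(str(lb) for lb in launch_bounds)}) "
        splitted = code.split("void ")
        prefix = splitted[0]
        if code.startswith("void "):
            # nothing before the first 'void'
            prefix = ""
        return prefix + "void " + lb_str + "void ".join(splitted[1:])

    print(_add_launch_bound("void kernel(double *a)", (256, 2)))
    # -> void __launch_bounds__(256, 2) kernel(double *a)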


From 6e88d389816964f1dea3dfdc40da11e6a28e66c8 Mon Sep 17 00:00:00 2001
From: Christoph Alt <christoph.alt@fau.de>
Date: Tue, 15 Aug 2023 10:30:04 +0200
Subject: [PATCH 14/18] Using cuda as a base for the docker container to also
 test the gpu kernels

---
 .gitlab-ci.yml          | 1 +
 Dockerfile              | 4 ++--
 tests/test_benchmark.py | 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 65e62e3..7a159bd 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -26,6 +26,7 @@ test:
   image: i10git.cs.fau.de:5005/pycodegen/pystencils-benchmark/pystencils-benchmark
   tags:
     - docker
+    - cuda
   script:
     - pip install tox
     - echo $TOX_ENV
diff --git a/Dockerfile b/Dockerfile
index 2b8f8a8..dcb5493 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,7 @@
-FROM ubuntu:latest
+FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
 
 LABEL maintainer="jan.hoenig@fau.de"
-LABEL version="0.1"
+LABEL version="0.2"
 LABEL description="Custom docker image for pystencils-benchmark"
 
 ARG DEBIAN_FRONTEND=noninteractive
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 929d4f8..62881ec 100755
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -57,5 +57,5 @@ def test_generate_gpu(kwargs):
     with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir:
         temp_dir = Path(temp_dir)
         pb.gpu.generate_benchmark(kernel_vadd, temp_dir, compiler=compiler, **kwargs)
-        # subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
-        # subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
+        subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
+        subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
-- 
GitLab


From 82ce1d7dcc28c621889699ef02821549d516ae43 Mon Sep 17 00:00:00 2001
From: Christoph Alt <christoph.alt@fau.de>
Date: Tue, 15 Aug 2023 12:37:10 +0200
Subject: [PATCH 15/18] Fixed the missing constants in the gpu main file and
 added a kernel with a constant to the gpu tests

---
 pystencils_benchmark/templates/gpu/main.c | 6 ++++++
 tests/test_benchmark.py                   | 8 +++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/pystencils_benchmark/templates/gpu/main.c b/pystencils_benchmark/templates/gpu/main.c
index b2f3571..1373048 100644
--- a/pystencils_benchmark/templates/gpu/main.c
+++ b/pystencils_benchmark/templates/gpu/main.c
@@ -32,6 +32,12 @@ int main(int argc, char **argv)
           cudaMemset({{field_name}},     0.23, {{elements}});
         {% endfor %}
 
+        {% for constantName, dataType in kernel.constants %}
+        // Constant {{constantName}}
+        {{dataType}} {{constantName}};
+        {{constantName}} = 0.23;
+        {% endfor %}
+
         dim3 blocks({{kernel.blocks[0]}}, {{kernel.blocks[1]}}, {{kernel.blocks[2]}});
         dim3 grid({{kernel.grid[0]}}, {{kernel.grid[1]}}, {{kernel.grid[2]}});
 
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 62881ec..fe3946b 100755
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -48,14 +48,20 @@ gpu_kwargs = ({}, {'launch_bounds': (256,)}, {'launch_bounds': (256, 2)})
 def test_generate_gpu(kwargs):
     compiler = Compiler.NVCC
     a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
+    alpha = sp.symbols('alpha')
 
     @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
     def vadd():
         a[0] @= b[0] + c[0]
     kernel_vadd = ps.create_kernel(**vadd)
 
+    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
+    def daxpy():
+        b[0] @= alpha * a[0] + b[0]
+    kernel_daxpy = ps.create_kernel(**daxpy)
+
     with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir:
         temp_dir = Path(temp_dir)
-        pb.gpu.generate_benchmark(kernel_vadd, temp_dir, compiler=compiler, **kwargs)
+        pb.gpu.generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler, **kwargs)
         subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
         subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
-- 
GitLab
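
To see what the new template block expands to, a standalone jinja2 render
with a stand-in for kernel.constants (a single (name, type) pair here; the
real context is assembled by the generator):

    from jinja2 import Environment

    env = Environment(trim_blocks=True, lstrip_blocks=True)
    template = env.from_string(
        "{% for constantName, dataType in constants %}"
        "// Constant {{constantName}}\n"
        "{{dataType}} {{constantName}};\n"
        "{{constantName}} = 0.23;\n"
        "{% endfor %}"
    )
    print(template.render(constants=[('alpha', 'double')]))
    # -> // Constant alpha
    #    double alpha;
    #    alpha = 0.23;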


From 1e542f17ab15c0551438a1a73535f3530b2dc16e Mon Sep 17 00:00:00 2001
From: Christoph Alt <christoph.alt@fau.de>
Date: Tue, 15 Aug 2023 13:49:45 +0200
Subject: [PATCH 16/18] Skip compiling and running cuda kernels if cuda or a
 gpu is not available

---
 tests/test_benchmark.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index fe3946b..a45fcdd 100755
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -18,6 +18,14 @@ config_kwargs = ({},
                                          'assume_aligned': True}})
 
 
+def nvidia_gpu_available():
+    return subprocess.call(['nvidia-smi']) == 0
+
+
+def nvcc_available():
+    return subprocess.call(['nvcc', '--version']) == 0
+
+
 @pytest.mark.parametrize('compiler', compilers)
 @pytest.mark.parametrize('config_kwarg', config_kwargs)
 def test_generate(compiler, config_kwarg):
@@ -63,5 +71,9 @@ def test_generate_gpu(kwargs):
     with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir:
         temp_dir = Path(temp_dir)
         pb.gpu.generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler, **kwargs)
+        if not nvcc_available():
+            return
         subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
+        if not nvidia_gpu_available():
+            return
         subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
-- 
GitLab


From d38a932419699bce59df19117cfac64060d1cb97 Mon Sep 17 00:00:00 2001
From: Christoph Alt <typ@ohnebild.com>
Date: Wed, 16 Aug 2023 15:03:18 +0200
Subject: [PATCH 17/18] using pytest.skip if there is no nvcc or gpu available

---
 tests/test_benchmark.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index a45fcdd..b47f9c5 100755
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -19,11 +19,17 @@ config_kwargs = ({},
 
 
 def nvidia_gpu_available():
-    return subprocess.call(['nvidia-smi']) == 0
+    try:
+        return subprocess.call(['nvidia-smi']) == 0
+    except (FileNotFoundError,):
+        return False
 
 
 def nvcc_available():
-    return subprocess.call(['nvcc', '--version']) == 0
+    try:
+        return subprocess.call(['nvcc', '--version']) == 0
+    except (FileNotFoundError,):
+        return False
 
 
 @pytest.mark.parametrize('compiler', compilers)
@@ -72,8 +78,8 @@ def test_generate_gpu(kwargs):
         temp_dir = Path(temp_dir)
         pb.gpu.generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler, **kwargs)
         if not nvcc_available():
-            return
+            pytest.skip("nvcc is not available!")
         subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
         if not nvidia_gpu_available():
-            return
+            pytest.skip("There is no GPU available!")
         subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
-- 
GitLab
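
The two availability helpers simply probe the exit status of the relevant
command-line tools; a compact sketch of the combined pattern from patches
16 and 17, with output suppressed (an assumption on top of the patch, which
lets nvidia-smi print to the console):

    import subprocess
    import pytest

    def tool_available(*cmd) -> bool:
        """True if the command exists and exits with status 0."""
        try:
            return subprocess.call(cmd, stdout=subprocess.DEVNULL,
                                   stderr=subprocess.DEVNULL) == 0
        except FileNotFoundError:
            return False

    def test_gpu_benchmark():
        if not tool_available('nvcc', '--version'):
            pytest.skip("nvcc is not available!")
        # ... build the generated benchmark with make ...
        if not tool_available('nvidia-smi'):
            pytest.skip("There is no GPU available!")
        # ... run the benchmark executable ...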


From 879ee872a994c4b220e7403a361ecc8c5361d102 Mon Sep 17 00:00:00 2001
From: Christoph Alt <christoph.alt@fau.de>
Date: Mon, 11 Sep 2023 14:55:38 +0200
Subject: [PATCH 18/18] removed the unused `cuda_block_size` parameter from
 the `gpu.generate_benchmark` function. For now the only way to set the
 cuda_block_size is to pass it in the `ps.CreateKernelConfig` used to
 generate the kernel

---
 pystencils_benchmark/gpu/benchmark.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/pystencils_benchmark/gpu/benchmark.py b/pystencils_benchmark/gpu/benchmark.py
index 96fb58c..1e9ce37 100644
--- a/pystencils_benchmark/gpu/benchmark.py
+++ b/pystencils_benchmark/gpu/benchmark.py
@@ -32,7 +32,6 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                        *,
                        compiler: Compiler = Compiler.NVCC,
                        timing: bool = True,
-                       cuda_block_size: tuple = (32, 1, 1),
                        launch_bounds: tuple = None,
                        ) -> None:
 
@@ -58,22 +57,19 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
             f.write(source)
 
     with open(src_path / 'main.cu', 'w+') as f:
-        f.write(kernel_main(kernel_asts,
-                            timing=timing,
-                            cuda_block_size=cuda_block_size))
+        f.write(kernel_main(kernel_asts, timing=timing))
 
     copy_static_files(path, source_file_suffix='.cu')
     compiler_toolchain(path, compiler, likwid=False)
 
 
-def kernel_main(kernels_ast: List[KernelFunction], *, timing: bool = True, cuda_block_size: tuple):
+def kernel_main(kernels_ast: List[KernelFunction], *, timing: bool = True):
     """
     Return C code of a benchmark program for the given kernel.
 
     Args:
         kernels_ast: A list of the pystencils AST object as returned by create_kernel for benchmarking
         timing: add timing output to the code, prints time per iteration to stdout
-        cuda_block_size: defines the cuda block grid
     Returns:
         C code as string
     """
-- 
GitLab
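
Since the generator no longer takes a block size, the launch configuration
has to come from the kernel AST itself; a hedged sketch of how that could
look, assuming the pystencils version in use supports the
gpu_indexing_params option of CreateKernelConfig:

    import numpy as np
    import pystencils as ps

    a, b = ps.fields(a=np.ones(4000000), b=np.ones(4000000))
    config = ps.CreateKernelConfig(target=ps.Target.GPU,
                                   gpu_indexing_params={'block_size': (32, 1, 1)})
    kernel_ast = ps.create_kernel([ps.Assignment(a.center, 2 * b.center)],
                                  config=config)
    # kernel_ast now carries its own block size and can be handed to
    # pb.gpu.generate_benchmark as before.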