From 92c29b1887911721a8ff3781978f46fc0e8270db Mon Sep 17 00:00:00 2001 From: markus holzer <markus.holzer@fau.de> Date: Sun, 20 Feb 2022 07:58:05 +0100 Subject: [PATCH 01/18] Added CUDA benchmarks --- pystencils_benchmark/__init__.py | 1 + pystencils_benchmark/benchmark.py | 8 +- pystencils_benchmark/benchmark_gpu.py | 170 ++++++++++++++++++ pystencils_benchmark/enums.py | 1 + pystencils_benchmark/templates/NVCC.mk | 12 ++ .../templates/{ => cpu}/kernel.c | 0 .../templates/{ => cpu}/kernel.h | 0 .../templates/{ => cpu}/main.c | 0 pystencils_benchmark/templates/gpu/kernel.cu | 8 + pystencils_benchmark/templates/gpu/kernel.h | 11 ++ pystencils_benchmark/templates/gpu/main.c | 66 +++++++ tests/test_benchmark.py | 17 +- 12 files changed, 289 insertions(+), 5 deletions(-) create mode 100644 pystencils_benchmark/benchmark_gpu.py create mode 100644 pystencils_benchmark/templates/NVCC.mk rename pystencils_benchmark/templates/{ => cpu}/kernel.c (100%) rename pystencils_benchmark/templates/{ => cpu}/kernel.h (100%) rename pystencils_benchmark/templates/{ => cpu}/main.c (100%) create mode 100644 pystencils_benchmark/templates/gpu/kernel.cu create mode 100644 pystencils_benchmark/templates/gpu/kernel.h create mode 100644 pystencils_benchmark/templates/gpu/main.c diff --git a/pystencils_benchmark/__init__.py b/pystencils_benchmark/__init__.py index 6f5f32c..8142abe 100644 --- a/pystencils_benchmark/__init__.py +++ b/pystencils_benchmark/__init__.py @@ -1,2 +1,3 @@ from .enums import Compiler from .benchmark import generate_benchmark, kernel_header, kernel_source +from .benchmark_gpu import generate_benchmark_gpu diff --git a/pystencils_benchmark/benchmark.py b/pystencils_benchmark/benchmark.py index 67cc3d5..a4ce99c 100644 --- a/pystencils_benchmark/benchmark.py +++ b/pystencils_benchmark/benchmark.py @@ -6,7 +6,7 @@ from jinja2 import Environment, PackageLoader, StrictUndefined import numpy as np from pystencils.backends.cbackend import generate_c, get_headers -from pystencils.astnodes import KernelFunction, PragmaBlock +from pystencils.astnodes import KernelFunction from pystencils.enums import Backend from pystencils.data_types import get_base_type from pystencils.sympyextensions import prod @@ -147,7 +147,7 @@ def kernel_main(kernels_ast: List[KernelFunction], timing: bool = True): 'timing': timing, } - main = _env.get_template('main.c').render(**jinja_context) + main = _env.get_template('cpu/main.c').render(**jinja_context) return main @@ -160,7 +160,7 @@ def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> s 'function_signature': function_signature, } - header = _env.get_template('kernel.h').render(**jinja_context) + header = _env.get_template('cpu/kernel.h').render(**jinja_context) return header @@ -176,5 +176,5 @@ def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> s 'timing': True, } - source = _env.get_template('kernel.c').render(**jinja_context) + source = _env.get_template('cpu/kernel.c').render(**jinja_context) return source diff --git a/pystencils_benchmark/benchmark_gpu.py b/pystencils_benchmark/benchmark_gpu.py new file mode 100644 index 0000000..67ce0dd --- /dev/null +++ b/pystencils_benchmark/benchmark_gpu.py @@ -0,0 +1,170 @@ +from typing import Union, List +from collections import namedtuple +from pathlib import Path +from jinja2 import Environment, PackageLoader, StrictUndefined + +from pystencils.backends.cbackend import generate_c, get_headers +from pystencils.astnodes import KernelFunction +from pystencils.enums import Backend +from pystencils.data_types import get_base_type +from pystencils.sympyextensions import prod +from pystencils.transformations import get_common_shape +from pystencils.gpucuda import BlockIndexing + +from pystencils_benchmark.enums import Compiler + +_env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=StrictUndefined, keep_trailing_newline=True, + trim_blocks=True, lstrip_blocks=True) + + +def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunction]], + path: Path = None, + *, + compiler: Compiler = Compiler.GCC) -> None: + if path is None: + path = Path('.') + else: + path.mkdir(parents=True, exist_ok=True) + src_path = path / 'src' + src_path.mkdir(parents=True, exist_ok=True) + include_path = path / 'include' + include_path.mkdir(parents=True, exist_ok=True) + + if isinstance(kernel_asts, KernelFunction): + kernel_asts = [kernel_asts] + + for kernel_ast in kernel_asts: + kernel_name = kernel_ast.function_name + + header = kernel_header(kernel_ast) + with open(include_path / f'{kernel_name}.h', 'w+') as f: + f.write(header) + + source = kernel_source(kernel_ast) + with open(src_path / f'{kernel_name}.c', 'w+') as f: + f.write(source) + + with open(src_path / 'main.c', 'w+') as f: + f.write(kernel_main(kernel_asts)) + + copy_static_files(path) + compiler_toolchain(path, compiler) + + +def compiler_toolchain(path: Path, compiler: Compiler) -> None: + name = compiler.name + jinja_context = { + 'compiler': name, + } + + files = ['Makefile', f'{name}.mk'] + for file_name in files: + with open(path / file_name, 'w+') as f: + template = _env.get_template(file_name).render(**jinja_context) + f.write(template) + + +def copy_static_files(path: Path) -> None: + src_path = path / 'src' + src_path.mkdir(parents=True, exist_ok=True) + include_path = path / 'include' + include_path.mkdir(parents=True, exist_ok=True) + + files = ['timing.h', 'timing.c'] + for file_name in files: + template = _env.get_template(file_name).render() + if file_name[-1] == 'h': + target_path = include_path / file_name + elif file_name[-1] == 'c': + target_path = src_path / file_name + else: + target_path = path / file_name + with open(target_path, 'w+') as f: + f.write(template) + + +def kernel_main(kernels_ast: List[KernelFunction], timing: bool = True, cuda_block_size: tuple = (32, 1, 1)): + """ + Return C code of a benchmark program for the given kernel. + + Args: + kernels_ast: A list of the pystencils AST object as returned by create_kernel for benchmarking + timing: add timing output to the code, prints time per iteration to stdout + cuda_block_size: defines the cuda block grid + Returns: + C code as string + """ + Kernel = namedtuple('Kernel', ['name', 'constants', 'fields', 'call_parameters', + 'call_argument_list', 'blocks', 'grid']) + kernels = [] + includes = set() + for kernel in kernels_ast: + name = kernel.function_name + accessed_fields = {f.name: f for f in kernel.fields_accessed} + constants = [] + fields = [] + call_parameters = [] + block_and_thread_numbers = dict() + for p in kernel.get_parameters(): + if not p.is_field_parameter: + constants.append((p.symbol.name, str(p.symbol.dtype))) + call_parameters.append(p.symbol.name) + else: + assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size" + field = accessed_fields[p.field_name] + dtype = str(get_base_type(p.symbol.dtype)) + elements = prod(field.shape) + + fields.append((p.field_name, dtype, elements)) + call_parameters.append(p.field_name) + + common_shape = get_common_shape(kernel.fields_accessed) + indexing = kernel.indexing + block_and_thread_numbers = indexing.call_parameters(common_shape) + block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block']) + block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid']) + + kernels.append(Kernel(name=name, fields=fields, constants=constants, call_parameters=call_parameters, + call_argument_list=",".join(call_parameters), + blocks=block_and_thread_numbers['block'], grid=block_and_thread_numbers['grid'])) + includes.add(name) + + jinja_context = { + 'kernels': kernels, + 'includes': includes, + 'timing': timing, + } + + main = _env.get_template('gpu/main.c').render(**jinja_context) + return main + + +def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str: + function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True) + header_guard = f'_{kernel_ast.function_name.upper()}_H' + + jinja_context = { + 'header_guard': header_guard, + 'function_signature': function_signature, + 'target': 'gpu' + } + + header = _env.get_template('gpu/kernel.h').render(**jinja_context) + return header + + +def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str: + kernel_name = kernel_ast.function_name + function_source = generate_c(kernel_ast, dialect=dialect) + headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'} + headers.update(get_headers(kernel_ast)) + + jinja_context = { + 'function_source': function_source, + 'headers': sorted(headers), + 'timing': True, + 'target': 'gpu' + } + + source = _env.get_template('gpu/kernel.cu').render(**jinja_context) + return source diff --git a/pystencils_benchmark/enums.py b/pystencils_benchmark/enums.py index ec56c8a..84cf49e 100644 --- a/pystencils_benchmark/enums.py +++ b/pystencils_benchmark/enums.py @@ -6,3 +6,4 @@ class Compiler(Enum): GCCdebug = auto() Clang = auto() ICC = auto() + NVCC = auto() diff --git a/pystencils_benchmark/templates/NVCC.mk b/pystencils_benchmark/templates/NVCC.mk new file mode 100644 index 0000000..71010ab --- /dev/null +++ b/pystencils_benchmark/templates/NVCC.mk @@ -0,0 +1,12 @@ +CC = nvcc +LINKER = $(CC) + +# More warning pls +#CFLAGS += -Wfloat-equal -Wundef -Wshadow -Wpointer-arith -Wcast-align -Wstrict-overflow=5 -Wwrite-strings -Waggregate-return +# Maybe too much warnings +#CFLAGS += -Wcast-qual -Wswitch-default -Wconversion -Wunreachable-code +# Specific C flags +CFLAGS := -use_fast_math +DEFINES = -D_GNU_SOURCE -DNDEBUG +INCLUDES = +LIBS = diff --git a/pystencils_benchmark/templates/kernel.c b/pystencils_benchmark/templates/cpu/kernel.c similarity index 100% rename from pystencils_benchmark/templates/kernel.c rename to pystencils_benchmark/templates/cpu/kernel.c diff --git a/pystencils_benchmark/templates/kernel.h b/pystencils_benchmark/templates/cpu/kernel.h similarity index 100% rename from pystencils_benchmark/templates/kernel.h rename to pystencils_benchmark/templates/cpu/kernel.h diff --git a/pystencils_benchmark/templates/main.c b/pystencils_benchmark/templates/cpu/main.c similarity index 100% rename from pystencils_benchmark/templates/main.c rename to pystencils_benchmark/templates/cpu/main.c diff --git a/pystencils_benchmark/templates/gpu/kernel.cu b/pystencils_benchmark/templates/gpu/kernel.cu new file mode 100644 index 0000000..973369f --- /dev/null +++ b/pystencils_benchmark/templates/gpu/kernel.cu @@ -0,0 +1,8 @@ +{% for header in headers %} +#include {{header}} +{% endfor %} + +#define RESTRICT __restrict__ +#define FUNC_PREFIX __global__ + +{{function_source}} diff --git a/pystencils_benchmark/templates/gpu/kernel.h b/pystencils_benchmark/templates/gpu/kernel.h new file mode 100644 index 0000000..102d9c9 --- /dev/null +++ b/pystencils_benchmark/templates/gpu/kernel.h @@ -0,0 +1,11 @@ +#ifndef {{header_guard}} +#define {{header_guard}} + + + +#define RESTRICT __restrict__ +#define FUNC_PREFIX __global__ + +{{function_signature}}; + +#endif diff --git a/pystencils_benchmark/templates/gpu/main.c b/pystencils_benchmark/templates/gpu/main.c new file mode 100644 index 0000000..b2f3571 --- /dev/null +++ b/pystencils_benchmark/templates/gpu/main.c @@ -0,0 +1,66 @@ +#include <assert.h> +#include <math.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> + +#include "timing.h" + +#define RESTRICT __restrict__ +#define FUNC_PREFIX __global__ +#include <cuda_runtime.h> + +//kernels +{% for include in includes %} +#include "{{ include }}.h" +{% endfor %} + +int main(int argc, char **argv) +{ + if(argc < 2) { + printf("Usage: %s <n_repeat>\n", argv[0]); + return -1; + } + int n_repeat = atoi(argv[1]); + {% for kernel in kernels %} + + { // Kernel: {{kernel.name}} + {% for field_name, dataType, elements in kernel.fields %} + {{dataType}} *{{field_name}}; + cudaMalloc(&{{field_name}}, {{elements}}*sizeof({{dataType}})); + cudaMemset({{field_name}}, 0.23, {{elements}}); + {% endfor %} + + dim3 blocks({{kernel.blocks[0]}}, {{kernel.blocks[1]}}, {{kernel.blocks[2]}}); + dim3 grid({{kernel.grid[0]}}, {{kernel.grid[1]}}, {{kernel.grid[2]}}); + + for(int warmup = 1; warmup >= 0; --warmup) { + int repeat = 2; + if(warmup == 0) { + repeat = n_repeat; + } + + {% if timing %} + double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime; + timing(&wcStartTime, &cpuStartTime); + {% endif %} + + for (; repeat > 0; --repeat) + { + {{kernel.name}}<<<grid, blocks>>>({{kernel.call_argument_list}}); + } + + {% if timing %} + timing(&wcEndTime, &cpuEndTime); + + if( warmup == 0) + printf("%s\t%e\n", "{{kernel.name}}",(wcEndTime - wcStartTime) / n_repeat ); + {% endif %} + } + {% for field_name, dataType, elements in kernel.fields %} + cudaFree({{field_name}}); + {% endfor %} + } + {% endfor %} +} diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 70af02e..4ecd4d8 100755 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -6,7 +6,7 @@ import tempfile import pytest import pystencils as ps from pathlib import Path -from pystencils_benchmark import generate_benchmark, Compiler +from pystencils_benchmark import generate_benchmark, Compiler, generate_benchmark_gpu compilers = (Compiler.GCC, Compiler.GCCdebug, Compiler.Clang) @@ -39,3 +39,18 @@ def test_generate(compiler, config_kwarg): subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True) +def test_generate_gpu(): + compiler = Compiler.NVCC + a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000)) + + @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU)) + def vadd(): + a[0] @= b[0] + c[0] + kernel_vadd = ps.create_kernel(**vadd) + + temp_dir = Path('/home/markus/pystencils_benchmark_testfolder') + generate_benchmark_gpu([kernel_vadd], temp_dir, compiler=compiler) + subprocess.run(['make', '-C', f'{temp_dir}'], check=True) + subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True) + + -- GitLab From a0a17570d98777f0260b07204d866de7ce796c8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6nig?= <jan.hoenig@fau.de> Date: Mon, 21 Feb 2022 22:29:44 +0100 Subject: [PATCH 02/18] Working CUDA benchmark Version. CUDA needs '.cu' files, otherwise it doesn't work? --- pystencils_benchmark/benchmark_gpu.py | 7 +++++-- pystencils_benchmark/templates/Makefile | 8 ++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/pystencils_benchmark/benchmark_gpu.py b/pystencils_benchmark/benchmark_gpu.py index 67ce0dd..d68a31d 100644 --- a/pystencils_benchmark/benchmark_gpu.py +++ b/pystencils_benchmark/benchmark_gpu.py @@ -41,10 +41,11 @@ def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunctio f.write(header) source = kernel_source(kernel_ast) - with open(src_path / f'{kernel_name}.c', 'w+') as f: + # TODO CUDA specific suffix + with open(src_path / f'{kernel_name}.cu', 'w+') as f: f.write(source) - with open(src_path / 'main.c', 'w+') as f: + with open(src_path / 'main.cu', 'w+') as f: f.write(kernel_main(kernel_asts)) copy_static_files(path) @@ -77,6 +78,8 @@ def copy_static_files(path: Path) -> None: target_path = include_path / file_name elif file_name[-1] == 'c': target_path = src_path / file_name + # TODO CUDA specific suffix: + target_path = target_path.with_suffix('.cu') else: target_path = path / file_name with open(target_path, 'w+') as f: diff --git a/pystencils_benchmark/templates/Makefile b/pystencils_benchmark/templates/Makefile index 98fcaaa..ea38570 100644 --- a/pystencils_benchmark/templates/Makefile +++ b/pystencils_benchmark/templates/Makefile @@ -14,6 +14,8 @@ INCLUDES += -I./include VPATH = $(SRC_DIR) ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c)) OBJ = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)) +# TODO CUDA specific SUFFIX +OBJ += $(patsubst $(SRC_DIR)/%.cu, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cu)) CFLAGS := $(CFLAGS) $(DEFINES) $(INCLUDES) @@ -28,6 +30,12 @@ $(BUILD_DIR)/%.o: %.c $(Q)$(CC) -c $(CFLAGS) $< -o $@ $(Q)$(CC) $(CFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d +# TODO CUDA specific SUFFIX +$(BUILD_DIR)/%.o: %.cu + @echo "===> COMPILE $@" + $(Q)$(CC) -c $(CFLAGS) $< -o $@ + $(Q)$(CC) $(CFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d + $(BUILD_DIR)/%.s: %.c @echo "===> GENERATE ASM $@" $(Q)$(CC) -S $(CFLAGS) $< -o $@ -- GitLab From c51eee431ec28df93b22265fe1cd6ac795197956 Mon Sep 17 00:00:00 2001 From: markus holzer <markus.holzer@fau.de> Date: Sun, 20 Feb 2022 07:58:05 +0100 Subject: [PATCH 03/18] Added CUDA benchmarks --- pystencils_benchmark/__init__.py | 1 + pystencils_benchmark/benchmark.py | 8 +- pystencils_benchmark/benchmark_gpu.py | 170 ++++++++++++++++++ pystencils_benchmark/enums.py | 1 + pystencils_benchmark/templates/NVCC.mk | 12 ++ .../templates/{ => cpu}/kernel.c | 0 .../templates/{ => cpu}/kernel.h | 0 .../templates/{ => cpu}/main.c | 0 pystencils_benchmark/templates/gpu/kernel.cu | 8 + pystencils_benchmark/templates/gpu/kernel.h | 11 ++ pystencils_benchmark/templates/gpu/main.c | 66 +++++++ tests/test_benchmark.py | 17 +- 12 files changed, 289 insertions(+), 5 deletions(-) create mode 100644 pystencils_benchmark/benchmark_gpu.py create mode 100644 pystencils_benchmark/templates/NVCC.mk rename pystencils_benchmark/templates/{ => cpu}/kernel.c (100%) rename pystencils_benchmark/templates/{ => cpu}/kernel.h (100%) rename pystencils_benchmark/templates/{ => cpu}/main.c (100%) create mode 100644 pystencils_benchmark/templates/gpu/kernel.cu create mode 100644 pystencils_benchmark/templates/gpu/kernel.h create mode 100644 pystencils_benchmark/templates/gpu/main.c diff --git a/pystencils_benchmark/__init__.py b/pystencils_benchmark/__init__.py index 6f5f32c..8142abe 100644 --- a/pystencils_benchmark/__init__.py +++ b/pystencils_benchmark/__init__.py @@ -1,2 +1,3 @@ from .enums import Compiler from .benchmark import generate_benchmark, kernel_header, kernel_source +from .benchmark_gpu import generate_benchmark_gpu diff --git a/pystencils_benchmark/benchmark.py b/pystencils_benchmark/benchmark.py index df6bd9b..3247800 100644 --- a/pystencils_benchmark/benchmark.py +++ b/pystencils_benchmark/benchmark.py @@ -6,7 +6,7 @@ from jinja2 import Environment, PackageLoader, StrictUndefined import numpy as np from pystencils.backends.cbackend import generate_c, get_headers -from pystencils.astnodes import KernelFunction, PragmaBlock +from pystencils.astnodes import KernelFunction from pystencils.enums import Backend from pystencils.typing import get_base_type from pystencils.sympyextensions import prod @@ -159,7 +159,7 @@ def kernel_main(kernels_ast: List[KernelFunction], *, 'likwid': likwid, } - main = _env.get_template('main.c').render(**jinja_context) + main = _env.get_template('cpu/main.c').render(**jinja_context) return main @@ -172,7 +172,7 @@ def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> s 'function_signature': function_signature, } - header = _env.get_template('kernel.h').render(**jinja_context) + header = _env.get_template('cpu/kernel.h').render(**jinja_context) return header @@ -188,5 +188,5 @@ def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> s 'timing': True, } - source = _env.get_template('kernel.c').render(**jinja_context) + source = _env.get_template('cpu/kernel.c').render(**jinja_context) return source diff --git a/pystencils_benchmark/benchmark_gpu.py b/pystencils_benchmark/benchmark_gpu.py new file mode 100644 index 0000000..67ce0dd --- /dev/null +++ b/pystencils_benchmark/benchmark_gpu.py @@ -0,0 +1,170 @@ +from typing import Union, List +from collections import namedtuple +from pathlib import Path +from jinja2 import Environment, PackageLoader, StrictUndefined + +from pystencils.backends.cbackend import generate_c, get_headers +from pystencils.astnodes import KernelFunction +from pystencils.enums import Backend +from pystencils.data_types import get_base_type +from pystencils.sympyextensions import prod +from pystencils.transformations import get_common_shape +from pystencils.gpucuda import BlockIndexing + +from pystencils_benchmark.enums import Compiler + +_env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=StrictUndefined, keep_trailing_newline=True, + trim_blocks=True, lstrip_blocks=True) + + +def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunction]], + path: Path = None, + *, + compiler: Compiler = Compiler.GCC) -> None: + if path is None: + path = Path('.') + else: + path.mkdir(parents=True, exist_ok=True) + src_path = path / 'src' + src_path.mkdir(parents=True, exist_ok=True) + include_path = path / 'include' + include_path.mkdir(parents=True, exist_ok=True) + + if isinstance(kernel_asts, KernelFunction): + kernel_asts = [kernel_asts] + + for kernel_ast in kernel_asts: + kernel_name = kernel_ast.function_name + + header = kernel_header(kernel_ast) + with open(include_path / f'{kernel_name}.h', 'w+') as f: + f.write(header) + + source = kernel_source(kernel_ast) + with open(src_path / f'{kernel_name}.c', 'w+') as f: + f.write(source) + + with open(src_path / 'main.c', 'w+') as f: + f.write(kernel_main(kernel_asts)) + + copy_static_files(path) + compiler_toolchain(path, compiler) + + +def compiler_toolchain(path: Path, compiler: Compiler) -> None: + name = compiler.name + jinja_context = { + 'compiler': name, + } + + files = ['Makefile', f'{name}.mk'] + for file_name in files: + with open(path / file_name, 'w+') as f: + template = _env.get_template(file_name).render(**jinja_context) + f.write(template) + + +def copy_static_files(path: Path) -> None: + src_path = path / 'src' + src_path.mkdir(parents=True, exist_ok=True) + include_path = path / 'include' + include_path.mkdir(parents=True, exist_ok=True) + + files = ['timing.h', 'timing.c'] + for file_name in files: + template = _env.get_template(file_name).render() + if file_name[-1] == 'h': + target_path = include_path / file_name + elif file_name[-1] == 'c': + target_path = src_path / file_name + else: + target_path = path / file_name + with open(target_path, 'w+') as f: + f.write(template) + + +def kernel_main(kernels_ast: List[KernelFunction], timing: bool = True, cuda_block_size: tuple = (32, 1, 1)): + """ + Return C code of a benchmark program for the given kernel. + + Args: + kernels_ast: A list of the pystencils AST object as returned by create_kernel for benchmarking + timing: add timing output to the code, prints time per iteration to stdout + cuda_block_size: defines the cuda block grid + Returns: + C code as string + """ + Kernel = namedtuple('Kernel', ['name', 'constants', 'fields', 'call_parameters', + 'call_argument_list', 'blocks', 'grid']) + kernels = [] + includes = set() + for kernel in kernels_ast: + name = kernel.function_name + accessed_fields = {f.name: f for f in kernel.fields_accessed} + constants = [] + fields = [] + call_parameters = [] + block_and_thread_numbers = dict() + for p in kernel.get_parameters(): + if not p.is_field_parameter: + constants.append((p.symbol.name, str(p.symbol.dtype))) + call_parameters.append(p.symbol.name) + else: + assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size" + field = accessed_fields[p.field_name] + dtype = str(get_base_type(p.symbol.dtype)) + elements = prod(field.shape) + + fields.append((p.field_name, dtype, elements)) + call_parameters.append(p.field_name) + + common_shape = get_common_shape(kernel.fields_accessed) + indexing = kernel.indexing + block_and_thread_numbers = indexing.call_parameters(common_shape) + block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block']) + block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid']) + + kernels.append(Kernel(name=name, fields=fields, constants=constants, call_parameters=call_parameters, + call_argument_list=",".join(call_parameters), + blocks=block_and_thread_numbers['block'], grid=block_and_thread_numbers['grid'])) + includes.add(name) + + jinja_context = { + 'kernels': kernels, + 'includes': includes, + 'timing': timing, + } + + main = _env.get_template('gpu/main.c').render(**jinja_context) + return main + + +def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str: + function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True) + header_guard = f'_{kernel_ast.function_name.upper()}_H' + + jinja_context = { + 'header_guard': header_guard, + 'function_signature': function_signature, + 'target': 'gpu' + } + + header = _env.get_template('gpu/kernel.h').render(**jinja_context) + return header + + +def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str: + kernel_name = kernel_ast.function_name + function_source = generate_c(kernel_ast, dialect=dialect) + headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'} + headers.update(get_headers(kernel_ast)) + + jinja_context = { + 'function_source': function_source, + 'headers': sorted(headers), + 'timing': True, + 'target': 'gpu' + } + + source = _env.get_template('gpu/kernel.cu').render(**jinja_context) + return source diff --git a/pystencils_benchmark/enums.py b/pystencils_benchmark/enums.py index ec56c8a..84cf49e 100644 --- a/pystencils_benchmark/enums.py +++ b/pystencils_benchmark/enums.py @@ -6,3 +6,4 @@ class Compiler(Enum): GCCdebug = auto() Clang = auto() ICC = auto() + NVCC = auto() diff --git a/pystencils_benchmark/templates/NVCC.mk b/pystencils_benchmark/templates/NVCC.mk new file mode 100644 index 0000000..71010ab --- /dev/null +++ b/pystencils_benchmark/templates/NVCC.mk @@ -0,0 +1,12 @@ +CC = nvcc +LINKER = $(CC) + +# More warning pls +#CFLAGS += -Wfloat-equal -Wundef -Wshadow -Wpointer-arith -Wcast-align -Wstrict-overflow=5 -Wwrite-strings -Waggregate-return +# Maybe too much warnings +#CFLAGS += -Wcast-qual -Wswitch-default -Wconversion -Wunreachable-code +# Specific C flags +CFLAGS := -use_fast_math +DEFINES = -D_GNU_SOURCE -DNDEBUG +INCLUDES = +LIBS = diff --git a/pystencils_benchmark/templates/kernel.c b/pystencils_benchmark/templates/cpu/kernel.c similarity index 100% rename from pystencils_benchmark/templates/kernel.c rename to pystencils_benchmark/templates/cpu/kernel.c diff --git a/pystencils_benchmark/templates/kernel.h b/pystencils_benchmark/templates/cpu/kernel.h similarity index 100% rename from pystencils_benchmark/templates/kernel.h rename to pystencils_benchmark/templates/cpu/kernel.h diff --git a/pystencils_benchmark/templates/main.c b/pystencils_benchmark/templates/cpu/main.c similarity index 100% rename from pystencils_benchmark/templates/main.c rename to pystencils_benchmark/templates/cpu/main.c diff --git a/pystencils_benchmark/templates/gpu/kernel.cu b/pystencils_benchmark/templates/gpu/kernel.cu new file mode 100644 index 0000000..973369f --- /dev/null +++ b/pystencils_benchmark/templates/gpu/kernel.cu @@ -0,0 +1,8 @@ +{% for header in headers %} +#include {{header}} +{% endfor %} + +#define RESTRICT __restrict__ +#define FUNC_PREFIX __global__ + +{{function_source}} diff --git a/pystencils_benchmark/templates/gpu/kernel.h b/pystencils_benchmark/templates/gpu/kernel.h new file mode 100644 index 0000000..102d9c9 --- /dev/null +++ b/pystencils_benchmark/templates/gpu/kernel.h @@ -0,0 +1,11 @@ +#ifndef {{header_guard}} +#define {{header_guard}} + + + +#define RESTRICT __restrict__ +#define FUNC_PREFIX __global__ + +{{function_signature}}; + +#endif diff --git a/pystencils_benchmark/templates/gpu/main.c b/pystencils_benchmark/templates/gpu/main.c new file mode 100644 index 0000000..b2f3571 --- /dev/null +++ b/pystencils_benchmark/templates/gpu/main.c @@ -0,0 +1,66 @@ +#include <assert.h> +#include <math.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> + +#include "timing.h" + +#define RESTRICT __restrict__ +#define FUNC_PREFIX __global__ +#include <cuda_runtime.h> + +//kernels +{% for include in includes %} +#include "{{ include }}.h" +{% endfor %} + +int main(int argc, char **argv) +{ + if(argc < 2) { + printf("Usage: %s <n_repeat>\n", argv[0]); + return -1; + } + int n_repeat = atoi(argv[1]); + {% for kernel in kernels %} + + { // Kernel: {{kernel.name}} + {% for field_name, dataType, elements in kernel.fields %} + {{dataType}} *{{field_name}}; + cudaMalloc(&{{field_name}}, {{elements}}*sizeof({{dataType}})); + cudaMemset({{field_name}}, 0.23, {{elements}}); + {% endfor %} + + dim3 blocks({{kernel.blocks[0]}}, {{kernel.blocks[1]}}, {{kernel.blocks[2]}}); + dim3 grid({{kernel.grid[0]}}, {{kernel.grid[1]}}, {{kernel.grid[2]}}); + + for(int warmup = 1; warmup >= 0; --warmup) { + int repeat = 2; + if(warmup == 0) { + repeat = n_repeat; + } + + {% if timing %} + double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime; + timing(&wcStartTime, &cpuStartTime); + {% endif %} + + for (; repeat > 0; --repeat) + { + {{kernel.name}}<<<grid, blocks>>>({{kernel.call_argument_list}}); + } + + {% if timing %} + timing(&wcEndTime, &cpuEndTime); + + if( warmup == 0) + printf("%s\t%e\n", "{{kernel.name}}",(wcEndTime - wcStartTime) / n_repeat ); + {% endif %} + } + {% for field_name, dataType, elements in kernel.fields %} + cudaFree({{field_name}}); + {% endfor %} + } + {% endfor %} +} diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 70af02e..4ecd4d8 100755 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -6,7 +6,7 @@ import tempfile import pytest import pystencils as ps from pathlib import Path -from pystencils_benchmark import generate_benchmark, Compiler +from pystencils_benchmark import generate_benchmark, Compiler, generate_benchmark_gpu compilers = (Compiler.GCC, Compiler.GCCdebug, Compiler.Clang) @@ -39,3 +39,18 @@ def test_generate(compiler, config_kwarg): subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True) +def test_generate_gpu(): + compiler = Compiler.NVCC + a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000)) + + @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU)) + def vadd(): + a[0] @= b[0] + c[0] + kernel_vadd = ps.create_kernel(**vadd) + + temp_dir = Path('/home/markus/pystencils_benchmark_testfolder') + generate_benchmark_gpu([kernel_vadd], temp_dir, compiler=compiler) + subprocess.run(['make', '-C', f'{temp_dir}'], check=True) + subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True) + + -- GitLab From 96c63098a1c8e5f40d2dfd4157a47e29601eb662 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6nig?= <jan.hoenig@fau.de> Date: Mon, 21 Feb 2022 22:29:44 +0100 Subject: [PATCH 04/18] Working CUDA benchmark Version. CUDA needs '.cu' files, otherwise it doesn't work? --- pystencils_benchmark/benchmark_gpu.py | 7 +++++-- pystencils_benchmark/templates/Makefile | 8 ++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/pystencils_benchmark/benchmark_gpu.py b/pystencils_benchmark/benchmark_gpu.py index 67ce0dd..d68a31d 100644 --- a/pystencils_benchmark/benchmark_gpu.py +++ b/pystencils_benchmark/benchmark_gpu.py @@ -41,10 +41,11 @@ def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunctio f.write(header) source = kernel_source(kernel_ast) - with open(src_path / f'{kernel_name}.c', 'w+') as f: + # TODO CUDA specific suffix + with open(src_path / f'{kernel_name}.cu', 'w+') as f: f.write(source) - with open(src_path / 'main.c', 'w+') as f: + with open(src_path / 'main.cu', 'w+') as f: f.write(kernel_main(kernel_asts)) copy_static_files(path) @@ -77,6 +78,8 @@ def copy_static_files(path: Path) -> None: target_path = include_path / file_name elif file_name[-1] == 'c': target_path = src_path / file_name + # TODO CUDA specific suffix: + target_path = target_path.with_suffix('.cu') else: target_path = path / file_name with open(target_path, 'w+') as f: diff --git a/pystencils_benchmark/templates/Makefile b/pystencils_benchmark/templates/Makefile index 66b68b8..d62c513 100644 --- a/pystencils_benchmark/templates/Makefile +++ b/pystencils_benchmark/templates/Makefile @@ -29,6 +29,8 @@ LIBS += -llikwid VPATH = $(SRC_DIR) ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c)) OBJ = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)) +# TODO CUDA specific SUFFIX +OBJ += $(patsubst $(SRC_DIR)/%.cu, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cu)) CFLAGS := $(CFLAGS) $(DEFINES) $(INCLUDES) @@ -43,6 +45,12 @@ $(BUILD_DIR)/%.o: %.c $(Q)$(CC) -c $(CFLAGS) $< -o $@ $(Q)$(CC) $(CFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d +# TODO CUDA specific SUFFIX +$(BUILD_DIR)/%.o: %.cu + @echo "===> COMPILE $@" + $(Q)$(CC) -c $(CFLAGS) $< -o $@ + $(Q)$(CC) $(CFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d + $(BUILD_DIR)/%.s: %.c @echo "===> GENERATE ASM $@" $(Q)$(CC) -S $(CFLAGS) $< -o $@ -- GitLab From ac4b31c141659373cece8cd63ef6dd465dbd2a4f Mon Sep 17 00:00:00 2001 From: Christoph Alt <christoph.alt@fau.de> Date: Tue, 8 Aug 2023 10:42:39 +0200 Subject: [PATCH 05/18] Updated import to new pystencils api --- pystencils_benchmark/benchmark_gpu.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pystencils_benchmark/benchmark_gpu.py b/pystencils_benchmark/benchmark_gpu.py index d68a31d..1c4e24c 100644 --- a/pystencils_benchmark/benchmark_gpu.py +++ b/pystencils_benchmark/benchmark_gpu.py @@ -6,10 +6,10 @@ from jinja2 import Environment, PackageLoader, StrictUndefined from pystencils.backends.cbackend import generate_c, get_headers from pystencils.astnodes import KernelFunction from pystencils.enums import Backend -from pystencils.data_types import get_base_type +from pystencils.typing import get_base_type from pystencils.sympyextensions import prod -from pystencils.transformations import get_common_shape -from pystencils.gpucuda import BlockIndexing +from pystencils.transformations import get_common_field +# from pystencils.gpucuda import BlockIndexing from pystencils_benchmark.enums import Compiler @@ -121,7 +121,7 @@ def kernel_main(kernels_ast: List[KernelFunction], timing: bool = True, cuda_blo fields.append((p.field_name, dtype, elements)) call_parameters.append(p.field_name) - common_shape = get_common_shape(kernel.fields_accessed) + common_shape = get_common_field(kernel.fields_accessed).shape indexing = kernel.indexing block_and_thread_numbers = indexing.call_parameters(common_shape) block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block']) -- GitLab From 39b4029cc112ab29d818c2b95ee2225d23a11431 Mon Sep 17 00:00:00 2001 From: Christoph Alt <christoph.alt@fau.de> Date: Tue, 8 Aug 2023 10:43:55 +0200 Subject: [PATCH 06/18] Exposing the cuda block size option to the generate_benchmark function --- pystencils_benchmark/benchmark.py | 2 +- pystencils_benchmark/benchmark_gpu.py | 12 +++++++++--- tests/test_benchmark.py | 6 +++--- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pystencils_benchmark/benchmark.py b/pystencils_benchmark/benchmark.py index 3247800..0cc7b11 100644 --- a/pystencils_benchmark/benchmark.py +++ b/pystencils_benchmark/benchmark.py @@ -6,7 +6,7 @@ from jinja2 import Environment, PackageLoader, StrictUndefined import numpy as np from pystencils.backends.cbackend import generate_c, get_headers -from pystencils.astnodes import KernelFunction +from pystencils.astnodes import KernelFunction, PragmaBlock from pystencils.enums import Backend from pystencils.typing import get_base_type from pystencils.sympyextensions import prod diff --git a/pystencils_benchmark/benchmark_gpu.py b/pystencils_benchmark/benchmark_gpu.py index 1c4e24c..befd83d 100644 --- a/pystencils_benchmark/benchmark_gpu.py +++ b/pystencils_benchmark/benchmark_gpu.py @@ -20,7 +20,10 @@ _env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=Stric def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunction]], path: Path = None, *, - compiler: Compiler = Compiler.GCC) -> None: + compiler: Compiler = Compiler.GCC, + timing: bool = True, + cuda_block_size: tuple = (32, 1, 1) + ) -> None: if path is None: path = Path('.') else: @@ -46,7 +49,9 @@ def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunctio f.write(source) with open(src_path / 'main.cu', 'w+') as f: - f.write(kernel_main(kernel_asts)) + f.write(kernel_main(kernel_asts, + timing=timing, + cuda_block_size=cuda_block_size)) copy_static_files(path) compiler_toolchain(path, compiler) @@ -56,6 +61,7 @@ def compiler_toolchain(path: Path, compiler: Compiler) -> None: name = compiler.name jinja_context = { 'compiler': name, + 'likwid': False, } files = ['Makefile', f'{name}.mk'] @@ -86,7 +92,7 @@ def copy_static_files(path: Path) -> None: f.write(template) -def kernel_main(kernels_ast: List[KernelFunction], timing: bool = True, cuda_block_size: tuple = (32, 1, 1)): +def kernel_main(kernels_ast: List[KernelFunction], *, timing: bool = True, cuda_block_size: tuple): """ Return C code of a benchmark program for the given kernel. diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 4ecd4d8..411eb83 100755 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -48,9 +48,9 @@ def test_generate_gpu(): a[0] @= b[0] + c[0] kernel_vadd = ps.create_kernel(**vadd) - temp_dir = Path('/home/markus/pystencils_benchmark_testfolder') + temp_dir = Path('/tmp/pystencils_benchmark_testfolder') generate_benchmark_gpu([kernel_vadd], temp_dir, compiler=compiler) - subprocess.run(['make', '-C', f'{temp_dir}'], check=True) - subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True) + # subprocess.run(['make', '-C', f'{temp_dir}'], check=True) + # subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True) -- GitLab From 3300460d08ee976674a331b01545207d1511d020 Mon Sep 17 00:00:00 2001 From: Christoph Alt <christoph.alt@fau.de> Date: Tue, 8 Aug 2023 13:41:28 +0200 Subject: [PATCH 07/18] made the gpu test more streamlined with the cpu tests --- tests/test_benchmark.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 6e2dbf8..48207da 100755 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -48,7 +48,8 @@ def test_generate_gpu(): a[0] @= b[0] + c[0] kernel_vadd = ps.create_kernel(**vadd) - temp_dir = Path('/tmp/pystencils_benchmark_testfolder') - generate_benchmark_gpu([kernel_vadd], temp_dir, compiler=compiler) - # subprocess.run(['make', '-C', f'{temp_dir}'], check=True) - # subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True) + with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir: + temp_dir = Path(temp_dir) + generate_benchmark_gpu(kernel_vadd, temp_dir, compiler=compiler) + # subprocess.run(['make', '-C', f'{temp_dir}'], check=True) + # subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True) -- GitLab From 3e930f35565af4cd669e63fff149dfce8297fc04 Mon Sep 17 00:00:00 2001 From: Christoph Alt <christoph.alt@fau.de> Date: Tue, 8 Aug 2023 13:41:53 +0200 Subject: [PATCH 08/18] removed some code duplication between benchmark and benchmark_gpu move shared function to a extra file --- pystencils_benchmark/benchmark.py | 80 +++------------------- pystencils_benchmark/benchmark_gpu.py | 96 ++++++-------------------- pystencils_benchmark/common.py | 97 +++++++++++++++++++++++++++ 3 files changed, 126 insertions(+), 147 deletions(-) create mode 100644 pystencils_benchmark/common.py diff --git a/pystencils_benchmark/benchmark.py b/pystencils_benchmark/benchmark.py index 0cc7b11..4258ee5 100644 --- a/pystencils_benchmark/benchmark.py +++ b/pystencils_benchmark/benchmark.py @@ -1,24 +1,24 @@ from typing import Union, List from collections import namedtuple from pathlib import Path -from jinja2 import Environment, PackageLoader, StrictUndefined import numpy as np -from pystencils.backends.cbackend import generate_c, get_headers from pystencils.astnodes import KernelFunction, PragmaBlock from pystencils.enums import Backend from pystencils.typing import get_base_type from pystencils.sympyextensions import prod from pystencils.integer_functions import modulo_ceil +from pystencils_benchmark.common import (_env, + _kernel_source, + _kernel_header, + compiler_toolchain, + copy_static_files, + setup_directories) from pystencils_benchmark.enums import Compiler -_env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=StrictUndefined, keep_trailing_newline=True, - trim_blocks=True, lstrip_blocks=True) - - def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]], path: Path = None, *, @@ -26,14 +26,8 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]], timing: bool = True, likwid: bool = False ) -> None: - if path is None: - path = Path('.') - else: - path.mkdir(parents=True, exist_ok=True) - src_path = path / 'src' - src_path.mkdir(parents=True, exist_ok=True) - include_path = path / 'include' - include_path.mkdir(parents=True, exist_ok=True) + + src_path, include_path = setup_directories(path) if isinstance(kernel_asts, KernelFunction): kernel_asts = [kernel_asts] @@ -56,39 +50,6 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]], compiler_toolchain(path, compiler, likwid) -def compiler_toolchain(path: Path, compiler: Compiler, likwid: bool) -> None: - name = compiler.name - jinja_context = { - 'compiler': name, - 'likwid': likwid, - } - - files = ['Makefile', f'{name}.mk'] - for file_name in files: - with open(path / file_name, 'w+') as f: - template = _env.get_template(file_name).render(**jinja_context) - f.write(template) - - -def copy_static_files(path: Path) -> None: - src_path = path / 'src' - src_path.mkdir(parents=True, exist_ok=True) - include_path = path / 'include' - include_path.mkdir(parents=True, exist_ok=True) - - files = ['timing.h', 'timing.c'] - for file_name in files: - template = _env.get_template(file_name).render() - if file_name[-1] == 'h': - target_path = include_path / file_name - elif file_name[-1] == 'c': - target_path = src_path / file_name - else: - target_path = path / file_name - with open(target_path, 'w+') as f: - f.write(template) - - def kernel_main(kernels_ast: List[KernelFunction], *, timing: bool = True, likwid: bool = False) -> str: """ @@ -164,29 +125,8 @@ def kernel_main(kernels_ast: List[KernelFunction], *, def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str: - function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True) - header_guard = f'_{kernel_ast.function_name.upper()}_H' - - jinja_context = { - 'header_guard': header_guard, - 'function_signature': function_signature, - } - - header = _env.get_template('cpu/kernel.h').render(**jinja_context) - return header + return _kernel_header(kernel_ast, dialect=dialect, template_file='cpu/kernel.h') def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str: - kernel_name = kernel_ast.function_name - function_source = generate_c(kernel_ast, dialect=dialect) - headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'} - headers.update(get_headers(kernel_ast)) - - jinja_context = { - 'function_source': function_source, - 'headers': sorted(headers), - 'timing': True, - } - - source = _env.get_template('cpu/kernel.c').render(**jinja_context) - return source + return _kernel_source(kernel_ast, dialect=dialect, template_file='cpu/kernel.c') diff --git a/pystencils_benchmark/benchmark_gpu.py b/pystencils_benchmark/benchmark_gpu.py index befd83d..d0ccbe1 100644 --- a/pystencils_benchmark/benchmark_gpu.py +++ b/pystencils_benchmark/benchmark_gpu.py @@ -1,37 +1,31 @@ from typing import Union, List from collections import namedtuple from pathlib import Path -from jinja2 import Environment, PackageLoader, StrictUndefined -from pystencils.backends.cbackend import generate_c, get_headers from pystencils.astnodes import KernelFunction from pystencils.enums import Backend from pystencils.typing import get_base_type from pystencils.sympyextensions import prod from pystencils.transformations import get_common_field -# from pystencils.gpucuda import BlockIndexing +from pystencils_benchmark.common import (_env, + _kernel_source, + _kernel_header, + compiler_toolchain, + copy_static_files, + setup_directories) from pystencils_benchmark.enums import Compiler -_env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=StrictUndefined, keep_trailing_newline=True, - trim_blocks=True, lstrip_blocks=True) - def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunction]], path: Path = None, *, - compiler: Compiler = Compiler.GCC, + compiler: Compiler = Compiler.NVCC, timing: bool = True, cuda_block_size: tuple = (32, 1, 1) ) -> None: - if path is None: - path = Path('.') - else: - path.mkdir(parents=True, exist_ok=True) - src_path = path / 'src' - src_path.mkdir(parents=True, exist_ok=True) - include_path = path / 'include' - include_path.mkdir(parents=True, exist_ok=True) + + src_path, include_path = setup_directories(path) if isinstance(kernel_asts, KernelFunction): kernel_asts = [kernel_asts] @@ -53,43 +47,8 @@ def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunctio timing=timing, cuda_block_size=cuda_block_size)) - copy_static_files(path) - compiler_toolchain(path, compiler) - - -def compiler_toolchain(path: Path, compiler: Compiler) -> None: - name = compiler.name - jinja_context = { - 'compiler': name, - 'likwid': False, - } - - files = ['Makefile', f'{name}.mk'] - for file_name in files: - with open(path / file_name, 'w+') as f: - template = _env.get_template(file_name).render(**jinja_context) - f.write(template) - - -def copy_static_files(path: Path) -> None: - src_path = path / 'src' - src_path.mkdir(parents=True, exist_ok=True) - include_path = path / 'include' - include_path.mkdir(parents=True, exist_ok=True) - - files = ['timing.h', 'timing.c'] - for file_name in files: - template = _env.get_template(file_name).render() - if file_name[-1] == 'h': - target_path = include_path / file_name - elif file_name[-1] == 'c': - target_path = src_path / file_name - # TODO CUDA specific suffix: - target_path = target_path.with_suffix('.cu') - else: - target_path = path / file_name - with open(target_path, 'w+') as f: - f.write(template) + copy_static_files(path, source_file_suffix='.cu') + compiler_toolchain(path, compiler, likwid=False) def kernel_main(kernels_ast: List[KernelFunction], *, timing: bool = True, cuda_block_size: tuple): @@ -149,31 +108,14 @@ def kernel_main(kernels_ast: List[KernelFunction], *, timing: bool = True, cuda_ def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str: - function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True) - header_guard = f'_{kernel_ast.function_name.upper()}_H' - - jinja_context = { - 'header_guard': header_guard, - 'function_signature': function_signature, - 'target': 'gpu' - } - - header = _env.get_template('gpu/kernel.h').render(**jinja_context) - return header + return _kernel_header(kernel_ast, + dialect=dialect, + template_file='gpu/kernel.h', + additional_jinja_context={'target': 'gpu'}) def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str: - kernel_name = kernel_ast.function_name - function_source = generate_c(kernel_ast, dialect=dialect) - headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'} - headers.update(get_headers(kernel_ast)) - - jinja_context = { - 'function_source': function_source, - 'headers': sorted(headers), - 'timing': True, - 'target': 'gpu' - } - - source = _env.get_template('gpu/kernel.cu').render(**jinja_context) - return source + return _kernel_source(kernel_ast, + dialect=dialect, + template_file='gpu/kernel.cu', + additional_jinja_context={'target': 'gpu'}) diff --git a/pystencils_benchmark/common.py b/pystencils_benchmark/common.py new file mode 100644 index 0000000..beeeed6 --- /dev/null +++ b/pystencils_benchmark/common.py @@ -0,0 +1,97 @@ +from pystencils.backends.cbackend import generate_c, get_headers +from pystencils.astnodes import KernelFunction +from pystencils.enums import Backend +from jinja2 import Environment, PackageLoader, StrictUndefined + +from pystencils_benchmark.enums import Compiler +from pathlib import Path + +_env = Environment(loader=PackageLoader('pystencils_benchmark'), + undefined=StrictUndefined, + keep_trailing_newline=True, + trim_blocks=True, lstrip_blocks=True) + + +def _kernel_header(kernel_ast: KernelFunction, + dialect: Backend = Backend.C, + *, + template_file: str, + additional_jinja_context: dict = {}) -> str: + function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True) + header_guard = f'_{kernel_ast.function_name.upper()}_H' + + jinja_context = { + 'header_guard': header_guard, + 'function_signature': function_signature, + **additional_jinja_context + } + + header = _env.get_template(template_file).render(**jinja_context) + return header + + +def _kernel_source(kernel_ast: KernelFunction, + dialect: Backend = Backend.C, + *, + template_file: str, + additional_jinja_context: dict = {}) -> str: + kernel_name = kernel_ast.function_name + function_source = generate_c(kernel_ast, dialect=dialect) + headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'} + headers.update(get_headers(kernel_ast)) + + jinja_context = { + 'function_source': function_source, + 'headers': sorted(headers), + 'timing': True, + **additional_jinja_context, + } + + source = _env.get_template(template_file).render(**jinja_context) + return source + + +def compiler_toolchain(path: Path, compiler: Compiler, likwid: bool) -> None: + name = compiler.name + jinja_context = { + 'compiler': name, + 'likwid': likwid, + } + + files = ['Makefile', f'{name}.mk'] + for file_name in files: + with open(path / file_name, 'w+') as f: + template = _env.get_template(file_name).render(**jinja_context) + f.write(template) + + +def copy_static_files(path: Path, *, source_file_suffix='.c') -> None: + src_path = path / 'src' + src_path.mkdir(parents=True, exist_ok=True) + include_path = path / 'include' + include_path.mkdir(parents=True, exist_ok=True) + + files = ['timing.h', 'timing.c'] + for file_name in files: + template = _env.get_template(file_name).render() + if file_name[-1] == 'h': + target_path = include_path / file_name + elif file_name[-1] == 'c': + target_path = src_path / file_name + target_path = target_path.with_suffix(source_file_suffix) + else: + target_path = path / file_name + with open(target_path, 'w+') as f: + f.write(template) + + +def setup_directories(path: Path): + if path is None: + path = Path('.') + else: + path.mkdir(parents=True, exist_ok=True) + src_path = path / 'src' + src_path.mkdir(parents=True, exist_ok=True) + include_path = path / 'include' + include_path.mkdir(parents=True, exist_ok=True) + return src_path, include_path -- GitLab From 24f81cf6e2c186862df5d9ab641e98a2be419167 Mon Sep 17 00:00:00 2001 From: Christoph Alt <christoph.alt@fau.de> Date: Tue, 8 Aug 2023 14:11:30 +0200 Subject: [PATCH 09/18] added submodules from cpu and gpu benchmark generation --- pystencils_benchmark/__init__.py | 4 ++-- pystencils_benchmark/cpu/__init__.py | 1 + pystencils_benchmark/{ => cpu}/benchmark.py | 0 pystencils_benchmark/gpu/__init__.py | 1 + .../{benchmark_gpu.py => gpu/benchmark.py} | 14 +++++++------- tests/test_benchmark.py | 8 +++++--- ve_example/test.py | 14 +++++++------- 7 files changed, 23 insertions(+), 19 deletions(-) create mode 100644 pystencils_benchmark/cpu/__init__.py rename pystencils_benchmark/{ => cpu}/benchmark.py (100%) create mode 100644 pystencils_benchmark/gpu/__init__.py rename pystencils_benchmark/{benchmark_gpu.py => gpu/benchmark.py} (92%) diff --git a/pystencils_benchmark/__init__.py b/pystencils_benchmark/__init__.py index 8142abe..86d8f56 100644 --- a/pystencils_benchmark/__init__.py +++ b/pystencils_benchmark/__init__.py @@ -1,3 +1,3 @@ from .enums import Compiler -from .benchmark import generate_benchmark, kernel_header, kernel_source -from .benchmark_gpu import generate_benchmark_gpu +from . import gpu +from . import cpu diff --git a/pystencils_benchmark/cpu/__init__.py b/pystencils_benchmark/cpu/__init__.py new file mode 100644 index 0000000..cfd889e --- /dev/null +++ b/pystencils_benchmark/cpu/__init__.py @@ -0,0 +1 @@ +from .benchmark import generate_benchmark diff --git a/pystencils_benchmark/benchmark.py b/pystencils_benchmark/cpu/benchmark.py similarity index 100% rename from pystencils_benchmark/benchmark.py rename to pystencils_benchmark/cpu/benchmark.py diff --git a/pystencils_benchmark/gpu/__init__.py b/pystencils_benchmark/gpu/__init__.py new file mode 100644 index 0000000..cfd889e --- /dev/null +++ b/pystencils_benchmark/gpu/__init__.py @@ -0,0 +1 @@ +from .benchmark import generate_benchmark diff --git a/pystencils_benchmark/benchmark_gpu.py b/pystencils_benchmark/gpu/benchmark.py similarity index 92% rename from pystencils_benchmark/benchmark_gpu.py rename to pystencils_benchmark/gpu/benchmark.py index d0ccbe1..d653d18 100644 --- a/pystencils_benchmark/benchmark_gpu.py +++ b/pystencils_benchmark/gpu/benchmark.py @@ -17,13 +17,13 @@ from pystencils_benchmark.common import (_env, from pystencils_benchmark.enums import Compiler -def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunction]], - path: Path = None, - *, - compiler: Compiler = Compiler.NVCC, - timing: bool = True, - cuda_block_size: tuple = (32, 1, 1) - ) -> None: +def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]], + path: Path = None, + *, + compiler: Compiler = Compiler.NVCC, + timing: bool = True, + cuda_block_size: tuple = (32, 1, 1) + ) -> None: src_path, include_path = setup_directories(path) diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 48207da..0c42c79 100755 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -6,7 +6,9 @@ import tempfile import pytest import pystencils as ps from pathlib import Path -from pystencils_benchmark import generate_benchmark, Compiler, generate_benchmark_gpu + +from pystencils_benchmark import Compiler +import pystencils_benchmark as pb compilers = (Compiler.GCC, Compiler.GCCdebug, Compiler.Clang) @@ -34,7 +36,7 @@ def test_generate(compiler, config_kwarg): with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir: temp_dir = Path(temp_dir) - generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler) + pb.cpu.generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler) subprocess.run(['make', '-C', f'{temp_dir}'], check=True) subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True) @@ -50,6 +52,6 @@ def test_generate_gpu(): with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir: temp_dir = Path(temp_dir) - generate_benchmark_gpu(kernel_vadd, temp_dir, compiler=compiler) + pb.gpu.generate_benchmark(kernel_vadd, temp_dir, compiler=compiler) # subprocess.run(['make', '-C', f'{temp_dir}'], check=True) # subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True) diff --git a/ve_example/test.py b/ve_example/test.py index ee5e32b..9bb91d6 100755 --- a/ve_example/test.py +++ b/ve_example/test.py @@ -4,15 +4,16 @@ import subprocess import numpy as np import sympy as sp import pystencils as ps -from pystencils_benchmark import generate_benchmark, Compiler +import pystencils_benchmark as pb from pathlib import Path -def generate(path: Path, compiler: Compiler): +def generate(path: Path, compiler: pb.Compiler): a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000)) alpha = sp.symbols('alpha') kernels = [] + @ps.kernel_config(ps.CreateKernelConfig()) def vadd(): a[0] @= b[0] + c[0] @@ -33,20 +34,20 @@ def generate(path: Path, compiler: Compiler): b[0] @= alpha * a[0] + b[0] kernels.append(ps.create_kernel(**daxpy_vector)) - generate_benchmark(kernels, path, compiler=compiler) + pb.cpu.generate_benchmark(kernels, path, compiler=compiler) def make(path: Path): subprocess.run(['make'], check=True) -def execute(path: Path, compiler: Compiler): +def execute(path: Path, compiler: pb.Compiler): subprocess.run([f'./benchmark-{compiler.name}', '100'], check=True) def main(): - compiler = Compiler.GCCdebug - path = Path.cwd() + compiler = pb.Compiler.GCCdebug + path = Path.cwd() / 'generated' generate(path, compiler) make(path) execute(path, compiler) @@ -54,4 +55,3 @@ def main(): if __name__ == '__main__': main() - -- GitLab From 857f1848adce4d4292362c3366c7ec32204fd4ab Mon Sep 17 00:00:00 2001 From: Christoph Alt <christoph.alt@fau.de> Date: Tue, 8 Aug 2023 14:16:57 +0200 Subject: [PATCH 10/18] removed the mutable default argument from the _kernel_header and _kernel_source function --- pystencils_benchmark/common.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pystencils_benchmark/common.py b/pystencils_benchmark/common.py index beeeed6..70cabd6 100644 --- a/pystencils_benchmark/common.py +++ b/pystencils_benchmark/common.py @@ -16,15 +16,17 @@ def _kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C, *, template_file: str, - additional_jinja_context: dict = {}) -> str: + additional_jinja_context: dict = None) -> str: + function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True) header_guard = f'_{kernel_ast.function_name.upper()}_H' jinja_context = { 'header_guard': header_guard, 'function_signature': function_signature, - **additional_jinja_context } + if additional_jinja_context is not None: + jinja_context.update(additional_jinja_context) header = _env.get_template(template_file).render(**jinja_context) return header @@ -34,7 +36,8 @@ def _kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C, *, template_file: str, - additional_jinja_context: dict = {}) -> str: + additional_jinja_context: dict = None) -> str: + kernel_name = kernel_ast.function_name function_source = generate_c(kernel_ast, dialect=dialect) headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'} @@ -44,9 +47,11 @@ def _kernel_source(kernel_ast: KernelFunction, 'function_source': function_source, 'headers': sorted(headers), 'timing': True, - **additional_jinja_context, } + if additional_jinja_context is not None: + jinja_context.update(additional_jinja_context) + source = _env.get_template(template_file).render(**jinja_context) return source -- GitLab From 9140da63f4446be8b1c9c5f58617654f5fffb485 Mon Sep 17 00:00:00 2001 From: Christoph Alt <christoph.alt@fau.de> Date: Tue, 8 Aug 2023 16:22:11 +0200 Subject: [PATCH 11/18] Added a parameter to insert a launch bounds to the kernel --- pystencils_benchmark/gpu/benchmark.py | 13 ++++++++++++- tests/test_benchmark.py | 8 ++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/pystencils_benchmark/gpu/benchmark.py b/pystencils_benchmark/gpu/benchmark.py index d653d18..5a4852c 100644 --- a/pystencils_benchmark/gpu/benchmark.py +++ b/pystencils_benchmark/gpu/benchmark.py @@ -17,12 +17,19 @@ from pystencils_benchmark.common import (_env, from pystencils_benchmark.enums import Compiler +def _add_launch_bound(code: str, launch_bounds: tuple) -> str: + lb_str = f"__launch_bounds__({','.join(str(lb) for lb in launch_bounds)})" + splitted = code.split("void") + return splitted[0] + lb_str + "".join(splitted[1:]) + + def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]], path: Path = None, *, compiler: Compiler = Compiler.NVCC, timing: bool = True, - cuda_block_size: tuple = (32, 1, 1) + cuda_block_size: tuple = (32, 1, 1), + launch_bounds: tuple = None, ) -> None: src_path, include_path = setup_directories(path) @@ -34,10 +41,14 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]], kernel_name = kernel_ast.function_name header = kernel_header(kernel_ast) + if launch_bounds: + header = _add_launch_bound(header, launch_bounds) with open(include_path / f'{kernel_name}.h', 'w+') as f: f.write(header) source = kernel_source(kernel_ast) + if launch_bounds: + source = _add_launch_bound(source, launch_bounds) # TODO CUDA specific suffix with open(src_path / f'{kernel_name}.cu', 'w+') as f: f.write(source) diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 0c42c79..929d4f8 100755 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -41,7 +41,11 @@ def test_generate(compiler, config_kwarg): subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True) -def test_generate_gpu(): +gpu_kwargs = ({}, {'launch_bounds': (256,)}, {'launch_bounds': (256, 2)}) + + +@pytest.mark.parametrize('kwargs', gpu_kwargs) +def test_generate_gpu(kwargs): compiler = Compiler.NVCC a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000)) @@ -52,6 +56,6 @@ def test_generate_gpu(): with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir: temp_dir = Path(temp_dir) - pb.gpu.generate_benchmark(kernel_vadd, temp_dir, compiler=compiler) + pb.gpu.generate_benchmark(kernel_vadd, temp_dir, compiler=compiler, **kwargs) # subprocess.run(['make', '-C', f'{temp_dir}'], check=True) # subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True) -- GitLab From 4ee400e910d835c5c6535970fee423dad1a35798 Mon Sep 17 00:00:00 2001 From: Christoph Alt <typ@ohnebild.com> Date: Wed, 9 Aug 2023 14:51:12 +0200 Subject: [PATCH 12/18] added the new packages to the setup.cfg and the new templates to the manifest so that packaging works correctly --- MANIFEST.in | 2 ++ setup.cfg | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index ef395db..38e6285 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,5 @@ include README.md include LICENSE.md include pystencils_benchmark/templates/* +include pystencils_benchmark/templates/cpu/* +include pystencils_benchmark/templates/gpu/* diff --git a/setup.cfg b/setup.cfg index 4063797..8bc42e9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -10,7 +10,10 @@ license = AGPLv3 version = 0.0.1 [options] -packages = pystencils_benchmark +packages = + pystencils_benchmark + pystencils_benchmark.gpu + pystencils_benchmark.cpu install_requires = jinja2 >= 3.0 pystencils >= 0.3.4 -- GitLab From 4b1f3f5374dcef0c0e56f44c9679782080278bd7 Mon Sep 17 00:00:00 2001 From: Christoph Alt <christoph.alt@fau.de> Date: Tue, 15 Aug 2023 09:44:27 +0200 Subject: [PATCH 13/18] fixed the _add_launch_bounds and also added some small tests --- pystencils_benchmark/gpu/benchmark.py | 10 +++++++--- tests/test_launch_bounds.py | 19 +++++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) create mode 100644 tests/test_launch_bounds.py diff --git a/pystencils_benchmark/gpu/benchmark.py b/pystencils_benchmark/gpu/benchmark.py index 5a4852c..96fb58c 100644 --- a/pystencils_benchmark/gpu/benchmark.py +++ b/pystencils_benchmark/gpu/benchmark.py @@ -18,9 +18,13 @@ from pystencils_benchmark.enums import Compiler def _add_launch_bound(code: str, launch_bounds: tuple) -> str: - lb_str = f"__launch_bounds__({','.join(str(lb) for lb in launch_bounds)})" - splitted = code.split("void") - return splitted[0] + lb_str + "".join(splitted[1:]) + lb_str = f"__launch_bounds__({', '.join(str(lb) for lb in launch_bounds)}) " + splitted = code.split("void ") + prefix = splitted[0] + if code.startswith("void "): + # just in case that there is nothing before the first void + prefix = "" + return prefix + "void " + lb_str + "void ".join(splitted[1:]) def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]], diff --git a/tests/test_launch_bounds.py b/tests/test_launch_bounds.py new file mode 100644 index 0000000..48af06d --- /dev/null +++ b/tests/test_launch_bounds.py @@ -0,0 +1,19 @@ +import numpy as np +import pystencils as ps +from pystencils_benchmark.gpu.benchmark import kernel_header, _add_launch_bound, kernel_source + + +def test_launch_bounds(): + a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000)) + + @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU)) + def vadd(): + a[0] @= b[0] + c[0] + kernel_vadd = ps.create_kernel(**vadd) + launch_bounds = (256, 2) + header = kernel_header(kernel_vadd) + header = _add_launch_bound(header, launch_bounds) + assert "void __launch_bounds__(256, 2)" in header + source = kernel_source(kernel_vadd) + source = _add_launch_bound(source, launch_bounds) + assert "void __launch_bounds__(256, 2)" in source -- GitLab From 6e88d389816964f1dea3dfdc40da11e6a28e66c8 Mon Sep 17 00:00:00 2001 From: Christoph Alt <christoph.alt@fau.de> Date: Tue, 15 Aug 2023 10:30:04 +0200 Subject: [PATCH 14/18] Using cuda as a base for the docker container to also test the gpu kernels --- .gitlab-ci.yml | 1 + Dockerfile | 4 ++-- tests/test_benchmark.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 65e62e3..7a159bd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -26,6 +26,7 @@ test: image: i10git.cs.fau.de:5005/pycodegen/pystencils-benchmark/pystencils-benchmark tags: - docker + - cuda script: - pip install tox - echo $TOX_ENV diff --git a/Dockerfile b/Dockerfile index 2b8f8a8..dcb5493 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ -FROM ubuntu:latest +FROM nvidia/cuda:12.1.1-devel-ubuntu22.04 LABEL maintainer="jan.hoenig@fau.de" -LABEL version="0.1" +LABEL version="0.2" LABEL description="Custom docker image for pystencils-benchmark" ARG DEBIAN_FRONTEND=noninteractive diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 929d4f8..62881ec 100755 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -57,5 +57,5 @@ def test_generate_gpu(kwargs): with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir: temp_dir = Path(temp_dir) pb.gpu.generate_benchmark(kernel_vadd, temp_dir, compiler=compiler, **kwargs) - # subprocess.run(['make', '-C', f'{temp_dir}'], check=True) - # subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True) + subprocess.run(['make', '-C', f'{temp_dir}'], check=True) + subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True) -- GitLab From 82ce1d7dcc28c621889699ef02821549d516ae43 Mon Sep 17 00:00:00 2001 From: Christoph Alt <christoph.alt@fau.de> Date: Tue, 15 Aug 2023 12:37:10 +0200 Subject: [PATCH 15/18] Fix the missing constants for the gpu main file and added a kernel with a constant to the gpu tests --- pystencils_benchmark/templates/gpu/main.c | 6 ++++++ tests/test_benchmark.py | 8 +++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pystencils_benchmark/templates/gpu/main.c b/pystencils_benchmark/templates/gpu/main.c index b2f3571..1373048 100644 --- a/pystencils_benchmark/templates/gpu/main.c +++ b/pystencils_benchmark/templates/gpu/main.c @@ -32,6 +32,12 @@ int main(int argc, char **argv) cudaMemset({{field_name}}, 0.23, {{elements}}); {% endfor %} + {% for constantName, dataType in kernel.constants %} + // Constant {{constantName}} + {{dataType}} {{constantName}}; + {{constantName}} = 0.23; + {% endfor %} + dim3 blocks({{kernel.blocks[0]}}, {{kernel.blocks[1]}}, {{kernel.blocks[2]}}); dim3 grid({{kernel.grid[0]}}, {{kernel.grid[1]}}, {{kernel.grid[2]}}); diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 62881ec..fe3946b 100755 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -48,14 +48,20 @@ gpu_kwargs = ({}, {'launch_bounds': (256,)}, {'launch_bounds': (256, 2)}) def test_generate_gpu(kwargs): compiler = Compiler.NVCC a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000)) + alpha = sp.symbols('alpha') @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU)) def vadd(): a[0] @= b[0] + c[0] kernel_vadd = ps.create_kernel(**vadd) + @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU)) + def daxpy(): + b[0] @= alpha * a[0] + b[0] + kernel_daxpy = ps.create_kernel(**daxpy) + with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir: temp_dir = Path(temp_dir) - pb.gpu.generate_benchmark(kernel_vadd, temp_dir, compiler=compiler, **kwargs) + pb.gpu.generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler, **kwargs) subprocess.run(['make', '-C', f'{temp_dir}'], check=True) subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True) -- GitLab From 1e542f17ab15c0551438a1a73535f3530b2dc16e Mon Sep 17 00:00:00 2001 From: Christoph Alt <christoph.alt@fau.de> Date: Tue, 15 Aug 2023 13:49:45 +0200 Subject: [PATCH 16/18] Skipping compiling and running cuda kernels if cuda or gpu is not available --- tests/test_benchmark.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index fe3946b..a45fcdd 100755 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -18,6 +18,14 @@ config_kwargs = ({}, 'assume_aligned': True}}) +def nvidia_gpu_available(): + return subprocess.call(['nvidia-smi']) == 0 + + +def nvcc_available(): + return subprocess.call(['nvcc', '--version']) == 0 + + @pytest.mark.parametrize('compiler', compilers) @pytest.mark.parametrize('config_kwarg', config_kwargs) def test_generate(compiler, config_kwarg): @@ -63,5 +71,9 @@ def test_generate_gpu(kwargs): with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir: temp_dir = Path(temp_dir) pb.gpu.generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler, **kwargs) + if not nvcc_available(): + return subprocess.run(['make', '-C', f'{temp_dir}'], check=True) + if not nvidia_gpu_available(): + return subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True) -- GitLab From d38a932419699bce59df19117cfac64060d1cb97 Mon Sep 17 00:00:00 2001 From: Christoph Alt <typ@ohnebild.com> Date: Wed, 16 Aug 2023 15:03:18 +0200 Subject: [PATCH 17/18] using pytest skip if there is no nvcc or gpu available --- tests/test_benchmark.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index a45fcdd..b47f9c5 100755 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -19,11 +19,17 @@ config_kwargs = ({}, def nvidia_gpu_available(): - return subprocess.call(['nvidia-smi']) == 0 + try: + return subprocess.call(['nvidia-smi']) == 0 + except (FileNotFoundError,): + return False def nvcc_available(): - return subprocess.call(['nvcc', '--version']) == 0 + try: + return subprocess.call(['nvcc', '--version']) == 0 + except (FileNotFoundError,): + return False @pytest.mark.parametrize('compiler', compilers) @@ -72,8 +78,8 @@ def test_generate_gpu(kwargs): temp_dir = Path(temp_dir) pb.gpu.generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler, **kwargs) if not nvcc_available(): - return + pytest.skip("nvcc is not available!") subprocess.run(['make', '-C', f'{temp_dir}'], check=True) if not nvidia_gpu_available(): - return + pytest.skip("There is no GPU available!") subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True) -- GitLab From 879ee872a994c4b220e7403a361ecc8c5361d102 Mon Sep 17 00:00:00 2001 From: Christoph Alt <christoph.alt@fau.de> Date: Mon, 11 Sep 2023 14:55:38 +0200 Subject: [PATCH 18/18] removed the unused `cuda_block_size` for the `gpu.generate_benchmark` function. For now the only way to set the cuda_block_size size is to use pass it in the `ps.KernelConfig` to the generated kernel --- pystencils_benchmark/gpu/benchmark.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pystencils_benchmark/gpu/benchmark.py b/pystencils_benchmark/gpu/benchmark.py index 96fb58c..1e9ce37 100644 --- a/pystencils_benchmark/gpu/benchmark.py +++ b/pystencils_benchmark/gpu/benchmark.py @@ -32,7 +32,6 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]], *, compiler: Compiler = Compiler.NVCC, timing: bool = True, - cuda_block_size: tuple = (32, 1, 1), launch_bounds: tuple = None, ) -> None: @@ -58,22 +57,19 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]], f.write(source) with open(src_path / 'main.cu', 'w+') as f: - f.write(kernel_main(kernel_asts, - timing=timing, - cuda_block_size=cuda_block_size)) + f.write(kernel_main(kernel_asts, timing=timing)) copy_static_files(path, source_file_suffix='.cu') compiler_toolchain(path, compiler, likwid=False) -def kernel_main(kernels_ast: List[KernelFunction], *, timing: bool = True, cuda_block_size: tuple): +def kernel_main(kernels_ast: List[KernelFunction], *, timing: bool = True): """ Return C code of a benchmark program for the given kernel. Args: kernels_ast: A list of the pystencils AST object as returned by create_kernel for benchmarking timing: add timing output to the code, prints time per iteration to stdout - cuda_block_size: defines the cuda block grid Returns: C code as string """ -- GitLab