From d01fc61c4201f11f70828d45b8a5917d6f2e7887 Mon Sep 17 00:00:00 2001 From: Michael Kuron <m.kuron@gmx.de> Date: Mon, 27 Mar 2023 19:56:15 +0200 Subject: [PATCH 1/2] Properly detect and enable vectorization on ARM --- pystencils/backends/simd_instruction_sets.py | 5 ++--- pystencils/cpu/cpujit.py | 4 +--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py index cdb2ee5cf..d8cccf98a 100644 --- a/pystencils/backends/simd_instruction_sets.py +++ b/pystencils/backends/simd_instruction_sets.py @@ -43,8 +43,7 @@ def get_supported_instruction_sets(): return _cache.copy() if 'PYSTENCILS_SIMD' in os.environ: return os.environ['PYSTENCILS_SIMD'].split(',') - if (platform.system() == 'Darwin' or platform.system() == 'Linux') and platform.machine() == 'arm64': - # not supported by cpuinfo + if platform.system() == 'Darwin' and platform.machine() == 'arm64': # not supported by cpuinfo return ['neon'] elif platform.system() == 'Linux' and platform.machine().startswith('riscv'): # not supported by cpuinfo libc = CDLL('libc.so.6') @@ -72,7 +71,7 @@ def get_supported_instruction_sets(): required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'} required_avx_flags = {'avx', 'avx2'} required_avx512_flags = {'avx512f'} - required_neon_flags = {'neon'} + required_neon_flags = {'asimd'} required_sve_flags = {'sve'} flags = set(get_cpu_info()['flags']) if flags.issuperset(required_sse_flags): diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py index aebefec91..c71700d2f 100644 --- a/pystencils/cpu/cpujit.py +++ b/pystencils/cpu/cpujit.py @@ -146,9 +146,7 @@ def read_config(): ('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'), ('restrict_qualifier', '__restrict__') ]) - if platform.machine() == 'arm64': - default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native', '') - elif platform.machine().startswith('ppc64'): + if platform.machine().startswith('ppc64') or platform.machine() == 'arm64': default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native', '-mcpu=native') elif platform.system().lower() == 'windows': -- GitLab From f0e9cd0020f73c629e3997be97bc6e46f4b4752a Mon Sep 17 00:00:00 2001 From: Michael Kuron <m.kuron@gmx.de> Date: Tue, 30 May 2023 19:33:05 +0200 Subject: [PATCH 2/2] Remove cpuinfo dependency for SIMD detection on non-x86 --- .gitlab-ci.yml | 7 +- pystencils/backends/simd_instruction_sets.py | 95 +++++++++----------- pystencils/cpu/cpujit.py | 4 +- 3 files changed, 45 insertions(+), 61 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f333e761d..e4c04dde2 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -156,7 +156,7 @@ arm64v8: extends: .multiarch_template image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64 variables: - PYSTENCILS_SIMD: "neon" + QEMU_CPU: "cortex-a76" before_script: - *multiarch_before_script - sed -i s/march=native/march=armv8-a/g ~/.config/pystencils/config.json @@ -164,8 +164,6 @@ arm64v8: ppc64le: extends: .multiarch_template image: i10git.cs.fau.de:5005/pycodegen/pycodegen/ppc64le - variables: - PYSTENCILS_SIMD: "vsx" before_script: - *multiarch_before_script - sed -i s/mcpu=native/mcpu=power8/g ~/.config/pystencils/config.json @@ -174,8 +172,6 @@ arm64v9: # SVE support is still unreliable in GCC 11 (incorrect code for fixed-width vectors, internal compiler errors). extends: .multiarch_template image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64 - variables: - PYSTENCILS_SIMD: "sve128,sve256,sve512,sve" before_script: - *multiarch_before_script - sed -i s/march=native/march=armv8-a+sve/g ~/.config/pystencils/config.json @@ -187,6 +183,7 @@ riscv64: extends: .multiarch_template image: i10git.cs.fau.de:5005/pycodegen/pycodegen/riscv64 variables: + # explicitly set SIMD as detection does not appear to work on QEMU PYSTENCILS_SIMD: "rvv" QEMU_CPU: "rv64,v=true" before_script: diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py index 81da66884..748417c6a 100644 --- a/pystencils/backends/simd_instruction_sets.py +++ b/pystencils/backends/simd_instruction_sets.py @@ -9,6 +9,7 @@ from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_ from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc from pystencils.backends.riscv_instruction_sets import get_vector_instruction_set_riscv +from pystencils.cache import memorycache from pystencils.typing import numpy_name_to_c @@ -31,79 +32,66 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'): return get_vector_instruction_set_x86(type_name, instruction_set) -_cache = None -_cachelinesize = None - - +@memorycache def get_supported_instruction_sets(): """List of supported instruction sets on current hardware, or None if query failed.""" - global _cache - if _cache is not None: - return _cache.copy() if 'PYSTENCILS_SIMD' in os.environ: return os.environ['PYSTENCILS_SIMD'].split(',') - if platform.system() == 'Darwin' and platform.machine() == 'arm64': # not supported by cpuinfo + if platform.system() == 'Darwin' and platform.machine() == 'arm64': return ['neon'] - elif platform.system() == 'Linux' and platform.machine().startswith('riscv'): # not supported by cpuinfo + elif platform.system() == 'Linux' and platform.machine() == 'aarch64': + result = ['neon'] # Neon is mandatory on 64-bit ARM libc = CDLL('libc.so.6') hwcap = libc.getauxval(16) # AT_HWCAP - hwcap_isa_v = 1 << (ord('V') - ord('A')) # COMPAT_HWCAP_ISA_V - return ['rvv'] if hwcap & hwcap_isa_v else [] - elif platform.machine().startswith('ppc64'): # no flags reported by cpuinfo - import subprocess - import tempfile - from pystencils.cpu.cpujit import get_compiler_config - f = tempfile.NamedTemporaryFile(suffix='.cpp') - command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', f.name] - macros = subprocess.check_output(command, input='', text=True) - if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros: - _cache = ['vsx'] - else: - _cache = [] - return _cache.copy() - try: - from cpuinfo import get_cpu_info - except ImportError: - return None - - result = [] - required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'} - required_avx_flags = {'avx', 'avx2'} - required_avx512_flags = {'avx512f'} - required_neon_flags = {'asimd'} - required_sve_flags = {'sve'} - flags = set(get_cpu_info()['flags']) - if flags.issuperset(required_sse_flags): - result.append("sse") - if flags.issuperset(required_avx_flags): - result.append("avx") - if flags.issuperset(required_avx512_flags): - result.append("avx512") - if flags.issuperset(required_neon_flags): - result.append("neon") - if flags.issuperset(required_sve_flags): - if platform.system() == 'Linux': - libc = CDLL('libc.so.6') + if hwcap & (1 << 22): # HWCAP_SVE length = 8 * libc.prctl(51, 0, 0, 0, 0) # PR_SVE_GET_VL if length < 0: raise OSError("SVE length query failed") - while length > 128: + while length >= 128: result.append(f"sve{length}") length //= 2 - result.append("sve") - return result + result.append("sve") + return result + elif platform.system() == 'Linux' and platform.machine().startswith('riscv'): + libc = CDLL('libc.so.6') + hwcap = libc.getauxval(16) # AT_HWCAP + hwcap_isa_v = 1 << (ord('V') - ord('A')) # COMPAT_HWCAP_ISA_V + return ['rvv'] if hwcap & hwcap_isa_v else [] + elif platform.system() == 'Linux' and platform.machine().startswith('ppc64'): + libc = CDLL('libc.so.6') + hwcap = libc.getauxval(16) # AT_HWCAP + return ['vsx'] if hwcap & 0x00000080 else [] # PPC_FEATURE_HAS_VSX + elif platform.machine() in ['x86_64', 'x86', 'AMD64', 'i386']: + try: + from cpuinfo import get_cpu_info + except ImportError: + return None + + result = [] + required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'} + required_avx_flags = {'avx', 'avx2'} + required_avx512_flags = {'avx512f'} + flags = set(get_cpu_info()['flags']) + if flags.issuperset(required_sse_flags): + result.append("sse") + if flags.issuperset(required_avx_flags): + result.append("avx") + if flags.issuperset(required_avx512_flags): + result.append("avx512") + return result + else: + raise NotImplementedError('Instruction set detection for %s on %s is not implemented' % + (platform.system(), platform.machine())) +@memorycache def get_cacheline_size(instruction_set): """Get the size (in bytes) of a cache block that can be zeroed without memory access. Usually, this is identical to the cache line size.""" - global _cachelinesize instruction_sets = get_vector_instruction_set('double', instruction_set) if 'cachelineSize' not in instruction_sets: return None - if _cachelinesize is not None: - return _cachelinesize import pystencils as ps from pystencils.astnodes import SympyAssignment @@ -116,5 +104,4 @@ def get_cacheline_size(instruction_set): ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set}) kernel = ast.compile() kernel(**{f.name: arr, CachelineSize.symbol.name: 0}) - _cachelinesize = int(arr[0, 0]) - return _cachelinesize + return int(arr[0, 0]) diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py index c71700d2f..98d63aa3e 100644 --- a/pystencils/cpu/cpujit.py +++ b/pystencils/cpu/cpujit.py @@ -172,8 +172,8 @@ def read_config(): default_compiler_config['flags'] += ' ' + libomp break else: - raise ValueError("The detection of the platform with platform.system() did not work. " - "Pystencils is only supported for linux, windows, and darwin platforms.") + raise NotImplementedError('Generation of default compiler flags for %s is not implemented' % + (platform.system(),)) default_cache_config = OrderedDict([ ('object_cache', os.path.join(user_cache_dir('pystencils'), 'objectcache')), -- GitLab