From d01fc61c4201f11f70828d45b8a5917d6f2e7887 Mon Sep 17 00:00:00 2001
From: Michael Kuron <m.kuron@gmx.de>
Date: Mon, 27 Mar 2023 19:56:15 +0200
Subject: [PATCH 1/2] Properly detect and enable vectorization on ARM

---
 pystencils/backends/simd_instruction_sets.py | 5 ++---
 pystencils/cpu/cpujit.py                     | 4 +---
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py
index cdb2ee5cf..d8cccf98a 100644
--- a/pystencils/backends/simd_instruction_sets.py
+++ b/pystencils/backends/simd_instruction_sets.py
@@ -43,8 +43,7 @@ def get_supported_instruction_sets():
         return _cache.copy()
     if 'PYSTENCILS_SIMD' in os.environ:
         return os.environ['PYSTENCILS_SIMD'].split(',')
-    if (platform.system() == 'Darwin' or platform.system() == 'Linux') and platform.machine() == 'arm64':
-        # not supported by cpuinfo
+    if platform.system() == 'Darwin' and platform.machine() == 'arm64':  # not supported by cpuinfo
         return ['neon']
     elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):  # not supported by cpuinfo
         libc = CDLL('libc.so.6')
@@ -72,7 +71,7 @@ def get_supported_instruction_sets():
     required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
     required_avx_flags = {'avx', 'avx2'}
     required_avx512_flags = {'avx512f'}
-    required_neon_flags = {'neon'}
+    required_neon_flags = {'asimd'}
     required_sve_flags = {'sve'}
     flags = set(get_cpu_info()['flags'])
     if flags.issuperset(required_sse_flags):
diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py
index aebefec91..c71700d2f 100644
--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -146,9 +146,7 @@ def read_config():
             ('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'),
             ('restrict_qualifier', '__restrict__')
         ])
-        if platform.machine() == 'arm64':
-            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native', '')
-        elif platform.machine().startswith('ppc64'):
+        if platform.machine().startswith('ppc64') or platform.machine() == 'arm64':
             default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native',
                                                                                         '-mcpu=native')
     elif platform.system().lower() == 'windows':
-- 
GitLab


From f0e9cd0020f73c629e3997be97bc6e46f4b4752a Mon Sep 17 00:00:00 2001
From: Michael Kuron <m.kuron@gmx.de>
Date: Tue, 30 May 2023 19:33:05 +0200
Subject: [PATCH 2/2] Remove cpuinfo dependency for SIMD detection on non-x86

---
 .gitlab-ci.yml                               |  7 +-
 pystencils/backends/simd_instruction_sets.py | 95 +++++++++-----------
 pystencils/cpu/cpujit.py                     |  4 +-
 3 files changed, 45 insertions(+), 61 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index f333e761d..e4c04dde2 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -156,7 +156,7 @@ arm64v8:
   extends: .multiarch_template
   image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
   variables:
-    PYSTENCILS_SIMD: "neon"
+    QEMU_CPU: "cortex-a76"
   before_script:
     - *multiarch_before_script
     - sed -i s/march=native/march=armv8-a/g ~/.config/pystencils/config.json
@@ -164,8 +164,6 @@ arm64v8:
 ppc64le:
   extends: .multiarch_template
   image: i10git.cs.fau.de:5005/pycodegen/pycodegen/ppc64le
-  variables:
-    PYSTENCILS_SIMD: "vsx"
   before_script:
     - *multiarch_before_script
     - sed -i s/mcpu=native/mcpu=power8/g ~/.config/pystencils/config.json
@@ -174,8 +172,6 @@ arm64v9:
   # SVE support is still unreliable in GCC 11 (incorrect code for fixed-width vectors, internal compiler errors).
   extends: .multiarch_template
   image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
-  variables:
-    PYSTENCILS_SIMD: "sve128,sve256,sve512,sve"
   before_script:
     - *multiarch_before_script
     - sed -i s/march=native/march=armv8-a+sve/g ~/.config/pystencils/config.json
@@ -187,6 +183,7 @@ riscv64:
   extends: .multiarch_template
   image: i10git.cs.fau.de:5005/pycodegen/pycodegen/riscv64
   variables:
+    # Set PYSTENCILS_SIMD explicitly: automatic detection does not appear to work under QEMU
     PYSTENCILS_SIMD: "rvv"
     QEMU_CPU: "rv64,v=true"
   before_script:
diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py
index 81da66884..748417c6a 100644
--- a/pystencils/backends/simd_instruction_sets.py
+++ b/pystencils/backends/simd_instruction_sets.py
@@ -9,6 +9,7 @@ from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_
 from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm
 from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc
 from pystencils.backends.riscv_instruction_sets import get_vector_instruction_set_riscv
+from pystencils.cache import memorycache
 from pystencils.typing import numpy_name_to_c
 
 
@@ -31,79 +32,66 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'):
         return get_vector_instruction_set_x86(type_name, instruction_set)
 
 
-_cache = None
-_cachelinesize = None
-
-
+@memorycache
 def get_supported_instruction_sets():
     """List of supported instruction sets on current hardware, or None if query failed."""
-    global _cache
-    if _cache is not None:
-        return _cache.copy()
     if 'PYSTENCILS_SIMD' in os.environ:
         return os.environ['PYSTENCILS_SIMD'].split(',')
-    if platform.system() == 'Darwin' and platform.machine() == 'arm64':  # not supported by cpuinfo
+    if platform.system() == 'Darwin' and platform.machine() == 'arm64':
         return ['neon']
-    elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):  # not supported by cpuinfo
+    elif platform.system() == 'Linux' and platform.machine() == 'aarch64':
+        result = ['neon']  # Neon is mandatory on 64-bit ARM
         libc = CDLL('libc.so.6')
         hwcap = libc.getauxval(16)  # AT_HWCAP
-        hwcap_isa_v = 1 << (ord('V') - ord('A'))  # COMPAT_HWCAP_ISA_V
-        return ['rvv'] if hwcap & hwcap_isa_v else []
-    elif platform.machine().startswith('ppc64'):  # no flags reported by cpuinfo
-        import subprocess
-        import tempfile
-        from pystencils.cpu.cpujit import get_compiler_config
-        f = tempfile.NamedTemporaryFile(suffix='.cpp')
-        command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', f.name]
-        macros = subprocess.check_output(command, input='', text=True)
-        if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros:
-            _cache = ['vsx']
-        else:
-            _cache = []
-        return _cache.copy()
-    try:
-        from cpuinfo import get_cpu_info
-    except ImportError:
-        return None
-
-    result = []
-    required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
-    required_avx_flags = {'avx', 'avx2'}
-    required_avx512_flags = {'avx512f'}
-    required_neon_flags = {'asimd'}
-    required_sve_flags = {'sve'}
-    flags = set(get_cpu_info()['flags'])
-    if flags.issuperset(required_sse_flags):
-        result.append("sse")
-    if flags.issuperset(required_avx_flags):
-        result.append("avx")
-    if flags.issuperset(required_avx512_flags):
-        result.append("avx512")
-    if flags.issuperset(required_neon_flags):
-        result.append("neon")
-    if flags.issuperset(required_sve_flags):
-        if platform.system() == 'Linux':
-            libc = CDLL('libc.so.6')
+        if hwcap & (1 << 22):  # HWCAP_SVE
             length = 8 * libc.prctl(51, 0, 0, 0, 0)  # PR_SVE_GET_VL
             if length < 0:
                 raise OSError("SVE length query failed")
-            while length > 128:
+            while length >= 128:
                 result.append(f"sve{length}")
                 length //= 2
-        result.append("sve")
-    return result
+            result.append("sve")
+        return result
+    elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):
+        libc = CDLL('libc.so.6')
+        hwcap = libc.getauxval(16)  # AT_HWCAP
+        hwcap_isa_v = 1 << (ord('V') - ord('A'))  # COMPAT_HWCAP_ISA_V
+        return ['rvv'] if hwcap & hwcap_isa_v else []
+    elif platform.system() == 'Linux' and platform.machine().startswith('ppc64'):
+        libc = CDLL('libc.so.6')
+        hwcap = libc.getauxval(16)  # AT_HWCAP
+        return ['vsx'] if hwcap & 0x00000080 else []  # PPC_FEATURE_HAS_VSX
+    elif platform.machine() in ['x86_64', 'x86', 'AMD64', 'i386']:
+        try:
+            from cpuinfo import get_cpu_info
+        except ImportError:
+            return None
+
+        result = []
+        required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
+        required_avx_flags = {'avx', 'avx2'}
+        required_avx512_flags = {'avx512f'}
+        flags = set(get_cpu_info()['flags'])
+        if flags.issuperset(required_sse_flags):
+            result.append("sse")
+        if flags.issuperset(required_avx_flags):
+            result.append("avx")
+        if flags.issuperset(required_avx512_flags):
+            result.append("avx512")
+        return result
+    else:
+        raise NotImplementedError('Instruction set detection for %s on %s is not implemented' %
+                                  (platform.system(), platform.machine()))
 
 
+@memorycache
 def get_cacheline_size(instruction_set):
     """Get the size (in bytes) of a cache block that can be zeroed without memory access.
        Usually, this is identical to the cache line size."""
-    global _cachelinesize
     
     instruction_sets = get_vector_instruction_set('double', instruction_set)
     if 'cachelineSize' not in instruction_sets:
         return None
-    if _cachelinesize is not None:
-        return _cachelinesize
     
     import pystencils as ps
     from pystencils.astnodes import SympyAssignment
@@ -116,5 +104,4 @@ def get_cacheline_size(instruction_set):
     ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set})
     kernel = ast.compile()
     kernel(**{f.name: arr, CachelineSize.symbol.name: 0})
-    _cachelinesize = int(arr[0, 0])
-    return _cachelinesize
+    return int(arr[0, 0])
diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py
index c71700d2f..98d63aa3e 100644
--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -172,8 +172,8 @@ def read_config():
                 default_compiler_config['flags'] += ' ' + libomp
                 break
     else:
-        raise ValueError("The detection of the platform with platform.system() did not work. "
-                         "Pystencils is only supported for linux, windows, and darwin platforms.")
+        raise NotImplementedError('Generation of default compiler flags for %s is not implemented' %
+                                  (platform.system(),))
 
     default_cache_config = OrderedDict([
         ('object_cache', os.path.join(user_cache_dir('pystencils'), 'objectcache')),
-- 
GitLab