diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py index 8024d58c3960235611020cd05f3ea3755375cf5b..e9bce873751fb6639cbb77ba2427d6c68c0b3f8f 100644 --- a/pystencils/backends/simd_instruction_sets.py +++ b/pystencils/backends/simd_instruction_sets.py @@ -73,6 +73,7 @@ def get_supported_instruction_sets(): required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'} required_avx_flags = {'avx', 'avx2'} required_avx512_flags = {'avx512f'} + possible_avx512vl_flags = {'avx512vl', 'avx10_1'} flags = set(get_cpu_info()['flags']) if flags.issuperset(required_sse_flags): result.append("sse") @@ -80,6 +81,8 @@ def get_supported_instruction_sets(): result.append("avx") if flags.issuperset(required_avx512_flags): result.append("avx512") + if not flags.isdisjoint(possible_avx512vl_flags): + result.append("avx512vl") return result else: raise NotImplementedError('Instruction set detection for %s on %s is not implemented' % diff --git a/pystencils/backends/x86_instruction_sets.py b/pystencils/backends/x86_instruction_sets.py index 7653c7c69cbfef34a06714bb19b8d7976f53400f..7f05a403d45a8ca890a78b7db92ff147d30dd7dc 100644 --- a/pystencils/backends/x86_instruction_sets.py +++ b/pystencils/backends/x86_instruction_sets.py @@ -57,8 +57,8 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): 'storeU': 'storeu[0,1]', 'storeA': 'store[0,1]', 'stream': 'stream[0,1]', - 'maskStoreA': 'mask_store[0, 2, 1]' if instruction_set == 'avx512' else 'maskstore[0, 2, 1]', - 'maskStoreU': 'mask_storeu[0, 2, 1]' if instruction_set == 'avx512' else 'maskstore[0, 2, 1]', + 'maskStoreA': 'mask_store[0, 2, 1]' if instruction_set.startswith('avx512') else 'maskstore[0, 2, 1]', + 'maskStoreU': 'mask_storeu[0, 2, 1]' if instruction_set.startswith('avx512') else 'maskstore[0, 2, 1]', } for comparison_op, constant in comparisons.items(): @@ -66,6 +66,7 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): headers = { 'avx512': ['<immintrin.h>'], + 'avx512vl': ['<immintrin.h>'], 'avx': ['<immintrin.h>'], 'sse': ['<immintrin.h>', '<xmmintrin.h>', '<emmintrin.h>', '<pmmintrin.h>', '<tmmintrin.h>', '<smmintrin.h>', '<nmmintrin.h>'] @@ -79,6 +80,7 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): prefix = { 'sse': '_mm', 'avx': '_mm256', + 'avx512vl': '_mm256', 'avx512': '_mm512', } @@ -89,6 +91,9 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): ("double", "avx"): 4, ("float", "avx"): 8, ("int", "avx"): 8, + ("double", "avx512vl"): 4, + ("float", "avx512vl"): 8, + ("int", "avx512vl"): 8, ("double", "avx512"): 8, ("float", "avx512"): 16, ("int", "avx512"): 16, @@ -110,7 +115,7 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): suf = suffix[data_type] arg_string = get_argument_string(intrinsic_id, result['width'], function_shortcut) - mask_suffix = '_mask' if instruction_set == 'avx512' and intrinsic_id in comparisons.keys() else '' + mask_suffix = '_mask' if instruction_set.startswith('avx512') and intrinsic_id in comparisons.keys() else '' result[intrinsic_id] = pre + "_" + name + "_" + suf + mask_suffix + arg_string bit_width = result['width'] * (64 if data_type == 'double' else 32) @@ -123,29 +128,45 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): result['any'] = f"{pre}_movemask_{suf}({{0}}) > 0" result['all'] = f"{pre}_movemask_{suf}({{0}}) == {hex(2**result['width']-1)}" - if instruction_set == 'avx512': + setsuf = "x" if bit_width < 512 and bit_width // result['width'] == 64 else "" + + if instruction_set.startswith('avx512'): size = result['width'] - result['&'] = f'_kand_mask{size}({{0}}, {{1}})' - result['|'] = f'_kor_mask{size}({{0}}, {{1}})' - result['any'] = f'!_ktestz_mask{size}_u8({{0}}, {{0}})' - result['all'] = f'_kortestc_mask{size}_u8({{0}}, {{0}})' + masksize = max(size, 8) + result['&'] = f'_kand_mask{masksize}({{0}}, {{1}})' + result['|'] = f'_kor_mask{masksize}({{0}}, {{1}})' + result['any'] = f'!_ktestz_mask{masksize}_u8({{0}}, {{0}})' + result['all'] = f'_kortestc_mask{masksize}_u8({{0}}, {{0}})' result['blendv'] = f'{pre}_mask_blend_{suf}({{2}}, {{0}}, {{1}})' result['rsqrt'] = f"{pre}_rsqrt14_{suf}({{0}})" - result['abs'] = f"{pre}_abs_{suf}({{0}})" - result['bool'] = f"__mmask{size}" + result['bool'] = f"__mmask{masksize}" params = " | ".join(["({{{i}}} ? {power} : 0)".format(i=i, power=2 ** i) for i in range(8)]) result['makeVecBool'] = f"__mmask8(({params}) )" params = " | ".join(["({{0}} ? {power} : 0)".format(power=2 ** i) for i in range(8)]) result['makeVecConstBool'] = f"__mmask8(({params}) )" - vindex = f'{pre}_set_epi{bit_width//size}(' + ', '.join([str(i) for i in range(result['width'])][::-1]) + ')' - vindex = f'{pre}_mullo_epi{bit_width//size}({vindex}, {pre}_set1_epi{bit_width//size}({{0}}))' + vindex = f'{pre}_set_epi{bit_width//size}{setsuf}(' + \ + ', '.join([str(i) for i in range(result['width'])][::-1]) + ')' + vindex = f'{pre}_mullo_epi{bit_width//size}({vindex}, {pre}_set1_epi{bit_width//size}{setsuf}({{0}}))' + scale = bit_width // size // 8 result['storeS'] = f'{pre}_i{bit_width//size}scatter_{suf}({{0}}, ' + vindex.format("{2}") + \ - f', {{1}}, {64//size})' + f', {{1}}, {scale})' result['maskStoreS'] = f'{pre}_mask_i{bit_width//size}scatter_{suf}({{0}}, {{3}}, ' + vindex.format("{2}") + \ - f', {{1}}, {64//size})' - result['loadS'] = f'{pre}_i{bit_width//size}gather_{suf}(' + vindex.format("{1}") + f', {{0}}, {64//size})' + f', {{1}}, {scale})' + if bit_width == 512: + result['loadS'] = f'{pre}_i{bit_width//size}gather_{suf}(' + vindex.format("{1}") + f', {{0}}, {scale})' + else: + result['loadS'] = f'{pre}_i{bit_width//size}gather_{suf}({{0}}, ' + vindex.format("{1}") + f', {scale})' + + # abs intrinsic exists in 512 bits, but expands to a sequence. We generate that same sequence for 128 and 256 bits + if instruction_set == 'avx512': + result['abs'] = f"{pre}_abs_{suf}({{0}})" + else: + result['abs'] = f"{pre}_castsi{bit_width}_{suf}({pre}_and_si{bit_width}(" + \ + f"{pre}_set1_epi{bit_width // result['width']}{setsuf}(0x7" + \ + 'f' * (bit_width // result['width'] // 4 - 1) + "), " + \ + f"{pre}_cast{suf}_si{bit_width}({{0}})))" if instruction_set == 'avx' and data_type == 'float': result['rsqrt'] = f"{pre}_rsqrt_{suf}({{0}})" diff --git a/pystencils/include/aesni_rand.h b/pystencils/include/aesni_rand.h index 6871aa461f9fc433d494bc2bab334c9b4f38fc48..86fb89e9a78e8ae073620e1f1a564e561d760075 100644 --- a/pystencils/include/aesni_rand.h +++ b/pystencils/include/aesni_rand.h @@ -551,7 +551,7 @@ QUALIFIERS void aesni_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr #endif -#ifdef __AVX512F__ +#if defined(__AVX512F__) || defined(__AVX10_512BIT__) QUALIFIERS const std::array<__m512i,11> & aesni_roundkeys(const __m512i & k512) { alignas(64) std::array<uint32,16> a; _mm512_store_si512((__m512i*) a.data(), k512); diff --git a/pystencils/include/myintrin.h b/pystencils/include/myintrin.h index eb1fe4dc41f2851660723a3c2ddd57fafb06a22a..62c549296cb09fa28bbf1c611a396a313c7a31c9 100644 --- a/pystencils/include/myintrin.h +++ b/pystencils/include/myintrin.h @@ -3,7 +3,7 @@ #if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64)) QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v) { -#ifdef __AVX512VL__ +#if defined(__AVX512VL__) || defined(__AVX10_1__) return _mm_cvtepu32_ps(v); #else __m128i v2 = _mm_srli_epi32(v, 1); @@ -29,12 +29,12 @@ QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, _ #endif #if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64)) -#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__) +#if !defined(__AVX512VL__) && !defined(__AVX10_1__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__) __attribute__((optimize("no-associative-math"))) #endif QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x) { -#ifdef __AVX512VL__ +#if defined(__AVX512VL__) || defined(__AVX10_1__) return _mm_cvtepu64_pd(x); #elif defined(__clang__) return __builtin_convertvector((uint64_t __attribute__((__vector_size__(16)))) x, __m128d); @@ -69,7 +69,7 @@ QUALIFIERS __m256d _my256_set_m128d(__m128d hi, __m128d lo) QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v) { -#ifdef __AVX512VL__ +#if defined(__AVX512VL__) || defined(__AVX10_1__) return _mm256_cvtepu32_ps(v); #else __m256i v2 = _mm256_srli_epi32(v, 1); @@ -80,12 +80,12 @@ QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v) #endif } -#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__) +#if !defined(__AVX512VL__) && !defined(__AVX10_1__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__) __attribute__((optimize("no-associative-math"))) #endif QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x) { -#ifdef __AVX512VL__ +#if defined(__AVX512VL__) || defined(__AVX10_1__) return _mm256_cvtepu64_pd(x); #elif defined(__clang__) return __builtin_convertvector((uint64_t __attribute__((__vector_size__(32)))) x, __m256d); @@ -99,7 +99,7 @@ QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x) } #endif -#ifdef __AVX512F__ +#if defined(__AVX512F__) || defined(__AVX10_512BIT__) QUALIFIERS __m512i _my512_set_m128i(__m128i d, __m128i c, __m128i b, __m128i a) { return _mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(a), b, 1), c, 2), d, 3); diff --git a/pystencils/include/philox_rand.h b/pystencils/include/philox_rand.h index cb91b53b96c487b575d9d318a5f58d1460ed59ed..4320a8b9396e9f61060c7d50e512c0919eed6c56 100644 --- a/pystencils/include/philox_rand.h +++ b/pystencils/include/philox_rand.h @@ -1184,7 +1184,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ct } #endif -#ifdef __AVX512F__ +#if defined(__AVX512F__) || defined(__AVX10_512BIT__) QUALIFIERS void _philox4x32round(__m512i* ctr, __m512i* key) { __m512i lohi0a = _mm512_mul_epu32(ctr[0], _mm512_set1_epi32(PHILOX_M4x32_0)); diff --git a/pystencils_tests/test_random.py b/pystencils_tests/test_random.py index 77d06b5133d21cfe3e3742703a6c4eb0106221be..e82bff309e636c3c07f4b1f82b552d67e6ccb3d7 100644 --- a/pystencils_tests/test_random.py +++ b/pystencils_tests/test_random.py @@ -22,6 +22,8 @@ if get_compiler_config()['os'] == 'windows': instruction_sets.remove('avx') if 'avx512' in instruction_sets and '/arch:avx512' not in get_compiler_config()['flags'].lower(): instruction_sets.remove('avx512') + if 'avx512vl' in instruction_sets and '/arch:avx512' not in get_compiler_config()['flags'].lower(): + instruction_sets.remove('avx512vl') @pytest.mark.parametrize('target, rng', ((Target.CPU, 'philox'), (Target.CPU, 'aesni'), (Target.GPU, 'philox'))) diff --git a/pystencils_tests/test_vectorization_specific.py b/pystencils_tests/test_vectorization_specific.py index c6a3bf2210727921f58fd06608c0366132860a2d..46e13c2d7f59bfaa9fa50f5e3d8632da3c1a25ac 100644 --- a/pystencils_tests/test_vectorization_specific.py +++ b/pystencils_tests/test_vectorization_specific.py @@ -64,7 +64,7 @@ def test_strided(instruction_set, dtype): f, g = ps.fields(f"f, g : {dtype}[2D]") update_rule = [ps.Assignment(g[0, 0], f[0, 0] + f[-1, 0] + f[1, 0] + f[0, 1] + f[0, -1] + 42.0)] if 'storeS' not in get_vector_instruction_set(dtype, instruction_set) \ - and instruction_set not in ['avx512', 'rvv'] and not instruction_set.startswith('sve'): + and instruction_set not in ['avx512', 'avx512vl', 'rvv'] and not instruction_set.startswith('sve'): with pytest.warns(UserWarning) as warn: config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set}, default_number_float=dtype) @@ -129,7 +129,7 @@ def test_alignment_and_correct_ghost_layers(gl_field, gl_kernel, instruction_set @pytest.mark.parametrize('instruction_set', supported_instruction_sets) def test_cacheline_size(instruction_set): cacheline_size = get_cacheline_size(instruction_set) - if cacheline_size is None and instruction_set in ['sse', 'avx', 'avx512', 'rvv']: + if cacheline_size is None and instruction_set in ['sse', 'avx', 'avx512', 'avx512vl', 'rvv']: pytest.skip() instruction_set = get_vector_instruction_set('double', instruction_set) vector_size = instruction_set['bytes']