From b13b0e94832eaf04c028f95d39bc812170e3d56c Mon Sep 17 00:00:00 2001 From: Dennis Eichhorn Date: Sat, 20 Apr 2024 19:11:36 +0000 Subject: [PATCH] new tests and minor fixes --- Math/Matrix/VectorFloat32.h | 996 +++------------------- Stdlib/Intrinsics.h | 70 ++ Stdlib/Mathtypes.h | 2 + Stdlib/SIMD/SIMD_F32.h | 1101 ++++++++++++++++++++++++ Stdlib/SIMD/SIMD_Helper.h | 70 ++ Stdlib/SIMD/SIMD_I32.h | 1117 +++++++++++++++++++++++++ Types.h => Stdlib/Types.h | 9 +- Threads/Job.h | 1 + Threads/Thread.h | 48 +- Utils/TestUtils.h | 10 +- tests/Stdlib/SIMD/SIMD_F32Test.cpp | 386 +++++++++ tests/Stdlib/SIMD/SIMD_HelperTest.cpp | 42 + tests/Threads/ThreadPoolTest.cpp | 31 +- tests/test.sh | 4 + 14 files changed, 2957 insertions(+), 930 deletions(-) create mode 100644 Stdlib/Intrinsics.h create mode 100644 Stdlib/SIMD/SIMD_F32.h create mode 100644 Stdlib/SIMD/SIMD_Helper.h create mode 100644 Stdlib/SIMD/SIMD_I32.h rename Types.h => Stdlib/Types.h (69%) create mode 100644 tests/Stdlib/SIMD/SIMD_F32Test.cpp create mode 100644 tests/Stdlib/SIMD/SIMD_HelperTest.cpp diff --git a/Math/Matrix/VectorFloat32.h b/Math/Matrix/VectorFloat32.h index c971a72..25c96c9 100644 --- a/Math/Matrix/VectorFloat32.h +++ b/Math/Matrix/VectorFloat32.h @@ -10,907 +10,145 @@ #ifndef MATH_MATRIX_VECTORFLOAT32_H #define MATH_MATRIX_VECTORFLOAT32_H -#include "Types.h" -#include -#include - -struct simd_f32_4 { - union { - __m128 P; - f32 v[4]; - }; -}; - -struct simd_f32_8 { - union { - __m256 P; - f32 v[8]; - }; -}; - -struct simd_f32_16 { - union { - __m512 P; - f32 v[16]; - }; -}; - -inline -simd_f32_4 init_zero_simd_f32_4() -{ - simd_f32_4 simd; - simd.P = _mm_setzero_ps(); - - return simd; -} - -inline -simd_f32_8 init_zero_simd_f32_8() -{ - simd_f32_8 simd; - simd.P = _mm256_setzero_ps(); - - return simd; -} - -inline -simd_f32_16 init_zero_simd_f32_16() -{ - simd_f32_16 simd; - simd.P = _mm512_setzero_ps(); - - return simd; -} - -inline -simd_f32_4 operator+(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_add_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator+(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_add_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_16 operator+(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_add_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_4 operator-(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_sub_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_4 operator-(simd_f32_4 a) -{ - return init_zero_simd_f32_4() - a; -} - -inline -simd_f32_8 operator-(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_sub_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator-(simd_f32_8 a) -{ - return init_zero_simd_f32_8() - a; -} - -inline -simd_f32_16 operator-(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_sub_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_16 operator-(simd_f32_16 a) -{ - return init_zero_simd_f32_16() - a; -} - -inline -simd_f32_4 operator*(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_mul_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator*(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_mul_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_16 operator*(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_mul_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_4 operator/(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_div_ps(a.P, 
b.P); - - return simd; -} - -inline -simd_f32_8 operator/(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_div_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_16 operator/(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_div_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_4 operator^(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_xor_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator^(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_xor_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_16 operator^(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_xor_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_4& operator-=(simd_f32_4 &a, simd_f32_4 b) -{ - a = a - b; - - return a; -} - -inline -simd_f32_8& operator-=(simd_f32_8 &a, simd_f32_8 b) -{ - a = a - b; - - return a; -} - -inline -simd_f32_16& operator-=(simd_f32_16 &a, simd_f32_16 b) -{ - a = a - b; - - return a; -} - -inline -simd_f32_4& operator+=(simd_f32_4 &a, simd_f32_4 b) -{ - a = a + b; - - return a; -} - -inline -simd_f32_8& operator+=(simd_f32_8 &a, simd_f32_8 b) -{ - a = a + b; - - return a; -} - -inline -simd_f32_16& operator+=(simd_f32_16 &a, simd_f32_16 b) -{ - a = a + b; - - return a; -} - -inline -simd_f32_4& operator*=(simd_f32_4 &a, simd_f32_4 b) -{ - a = a * b; - - return a; -} - -inline -simd_f32_8& operator*=(simd_f32_8 &a, simd_f32_8 b) -{ - a = a * b; - - return a; -} - -inline -simd_f32_16& operator*=(simd_f32_16 &a, simd_f32_16 b) -{ - a = a * b; - - return a; -} - -inline -simd_f32_4& operator/=(simd_f32_4 &a, simd_f32_4 b) -{ - a = a / b; - - return a; -} - -inline -simd_f32_8& operator/=(simd_f32_8 &a, simd_f32_8 b) -{ - a = a / b; - - return a; -} - -inline -simd_f32_16& operator/=(simd_f32_16 &a, simd_f32_16 b) -{ - a = a / b; - - return a; -} - -inline -simd_f32_4& operator^=(simd_f32_4 &a, simd_f32_4 b) -{ - a = a ^ b; - - return a; -} - -inline -simd_f32_8& operator^=(simd_f32_8 &a, simd_f32_8 b) -{ - a = a ^ b; - - return a; -} - -inline -simd_f32_16& operator^=(simd_f32_16 &a, simd_f32_16 b) -{ - a = a ^ b; - - return a; -} - -inline -simd_f32_4 operator<(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_cmplt_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator<(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_cmp_ps(a.P, b.P, _CMP_LT_OQ); - - return simd; -} - -inline -simd_f32_16 operator<(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_mask_mov_ps( - _mm512_setzero_ps(), - _mm512_cmp_ps_mask(a.P, b.P, _CMP_LT_OQ), - _mm512_set1_ps(1.0f) - ); - - return simd; -} - -inline -simd_f32_4 operator<=(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_cmple_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator<=(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_cmp_ps(a.P, b.P, _CMP_LE_OQ); - - return simd; -} - -inline -simd_f32_16 operator<=(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_mask_mov_ps( - _mm512_setzero_ps(), - _mm512_cmp_ps_mask(a.P, b.P, _CMP_LE_OQ), - _mm512_set1_ps(1.0f) - ); - - return simd; -} - -inline -simd_f32_4 operator>(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_cmpgt_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator>(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_cmp_ps(a.P, b.P, _CMP_GT_OQ); - - return simd; -} - -inline -simd_f32_16 operator>(simd_f32_16 a, 
simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_mask_mov_ps( - _mm512_setzero_ps(), - _mm512_cmp_ps_mask(a.P, b.P, _CMP_GT_OQ), - _mm512_set1_ps(1.0f) - ); - - return simd; -} - -inline -simd_f32_4 operator>=(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_cmpge_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator>=(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_cmp_ps(a.P, b.P, _CMP_GE_OQ); - - return simd; -} - -inline -simd_f32_16 operator>=(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_mask_mov_ps( - _mm512_setzero_ps(), - _mm512_cmp_ps_mask(a.P, b.P, _CMP_GE_OQ), - _mm512_set1_ps(1.0f) - ); - - return simd; -} - -inline -simd_f32_4 operator==(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_cmpeq_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator==(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_cmp_ps(a.P, b.P, _CMP_EQ_OQ); - - return simd; -} - -inline -simd_f32_16 operator==(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_mask_mov_ps( - _mm512_setzero_ps(), - _mm512_cmp_ps_mask(a.P, b.P, _CMP_EQ_OQ), - _mm512_set1_ps(1.0f) - ); - - return simd; -} - -inline -simd_f32_4 operator!=(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_cmpneq_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator!=(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_cmp_ps(a.P, b.P, _CMP_NEQ_OQ); - - return simd; -} - -inline -simd_f32_16 operator!=(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_mask_mov_ps( - _mm512_setzero_ps(), - _mm512_cmp_ps_mask(a.P, b.P, _CMP_NEQ_OQ), - _mm512_set1_ps(1.0f) - ); - - return simd; -} - -inline -simd_f32_4 operator&(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_and_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator&(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_and_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_16 operator&(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_and_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_4 operator|(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_or_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator|(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_or_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_16 operator|(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_or_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_4& operator&=(simd_f32_4 &a, simd_f32_4 b) -{ - a = a & b; - - return a; -} - -inline -simd_f32_8& operator&=(simd_f32_8 &a, simd_f32_8 b) -{ - a = a & b; - - return a; -} - -inline -simd_f32_16& operator&=(simd_f32_16 &a, simd_f32_16 b) -{ - a = a & b; - - return a; -} - -inline -simd_f32_4& operator|=(simd_f32_4 &a, simd_f32_4 b) -{ - a = a | b; - - return a; -} - -inline -simd_f32_8& operator|=(simd_f32_8 &a, simd_f32_8 b) -{ - a = a | b; - - return a; -} - -inline -simd_f32_16& operator|=(simd_f32_16 &a, simd_f32_16 b) -{ - a = a | b; - - return a; -} - -inline -simd_f32_4 abs(simd_f32_4 a) -{ - unsigned int unsigned_mask = (unsigned int) (1 << 31); - __m128 mask = _mm_set1_ps(*(float *) &unsigned_mask); - - simd_f32_4 simd; - simd.P = _mm_and_ps(a.P, mask); - - return simd; -} - -inline -simd_f32_8 abs(simd_f32_8 a) -{ - unsigned int unsigned_mask = (unsigned int) (1 << 31); - __m256 mask = _mm256_set1_ps(*(float *) &unsigned_mask); - 
- simd_f32_8 simd; - simd.P = _mm256_and_ps(a.P, mask); - - return simd; -} - -inline -simd_f32_16 abs(simd_f32_16 a) -{ - unsigned int unsigned_mask = (unsigned int) (1 << 31); - __m512 mask = _mm512_set1_ps(*(float *) &unsigned_mask); - - simd_f32_16 simd; - simd.P = _mm512_and_ps(a.P, mask); - - return simd; -} - -inline -simd_f32_4 min(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_min_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 min(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_min_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_16 min(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_min_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_4 max(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_max_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 max(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_max_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_16 max(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_max_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_4 sign(simd_f32_4 a) -{ - unsigned int umask = (unsigned int) (1 << 31); - __m128 mask = _mm_set1_ps(*(float *) &umask); - - simd_f32_4 signBit; - signBit.P = _mm_and_ps(a.P, mask); - - simd_f32_4 b; - b.P = _mm_set1_ps(1.0f); - - simd_f32_4 simd = b | signBit; - - return simd; -} - -inline -simd_f32_8 sign(simd_f32_8 a) -{ - unsigned int umask = (unsigned int) (1 << 31); - __m256 mask = _mm256_set1_ps(*(float *) &umask); - - simd_f32_8 signBit; - signBit.P = _mm256_and_ps(a.P, mask); - - simd_f32_8 b; - b.P = _mm256_set1_ps(1.0f); - - simd_f32_8 simd = b | signBit; - - return simd; -} - -inline -simd_f32_16 sign(simd_f32_16 a) -{ - unsigned int umask = (unsigned int) (1 << 31); - __m512 mask = _mm512_set1_ps(*(float *) &umask); - - simd_f32_16 signBit; - signBit.P = _mm512_and_ps(a.P, mask); - - simd_f32_16 b; - b.P = _mm512_set1_ps(1.0f); - - simd_f32_16 simd = b | signBit; - - return simd; -} - -// sqrt -// approxinvsquareroot -// approx1over -// clamp -// floor -// ceil -// anytrue -// alltrue -// anyfalse -// allfalse - -struct v3_simd_f32_4 { - union { - struct { - union { - simd_f32_4 x; - simd_f32_4 r; - }; - union { - simd_f32_4 y; - simd_f32_4 g; - }; - union { - simd_f32_4 z; - simd_f32_4 b; +namespace Math::Matrix::VectorFloat32 +{ + struct v3_f32_4_simd { + union { + struct { + union { + f32_4_simd x; + f32_4_simd r; + }; + union { + f32_4_simd y; + f32_4_simd g; + }; + union { + f32_4_simd z; + f32_4_simd b; + }; }; + + f32_4_simd v[3]; }; - - simd_f32_4 v[3]; }; -}; -struct v3_simd_f32_8 { - union { - struct { - union { - simd_f32_8 x; - simd_f32_8 r; - }; - union { - simd_f32_8 y; - simd_f32_8 g; - }; - union { - simd_f32_8 z; - simd_f32_8 b; + struct v3_f32_8_simd { + union { + struct { + union { + f32_8_simd x; + f32_8_simd r; + }; + union { + f32_8_simd y; + f32_8_simd g; + }; + union { + f32_8_simd z; + f32_8_simd b; + }; }; + + f32_8_simd v[3]; }; - - simd_f32_8 v[3]; }; -}; -struct v3_simd_f32_16 { - union { - struct { - union { - simd_f32_16 x; - simd_f32_16 r; - }; - union { - simd_f32_16 y; - simd_f32_16 g; - }; - union { - simd_f32_16 z; - simd_f32_16 b; + struct v3_f32_16_simd { + union { + struct { + union { + f32_16_simd x; + f32_16_simd r; + }; + union { + f32_16_simd y; + f32_16_simd g; + }; + union { + f32_16_simd z; + f32_16_simd b; + }; }; + + f32_16_simd v[3]; }; - - simd_f32_16 v[3]; }; -}; -struct v4_simd_f32_4 { - union { - struct { - union { - 
simd_f32_4 x; - simd_f32_4 r; - }; - union { - simd_f32_4 y; - simd_f32_4 g; - }; - union { - simd_f32_4 z; - simd_f32_4 b; - }; - union { - simd_f32_4 w; - simd_f32_4 a; + struct v4_f32_4_simd { + union { + struct { + union { + f32_4_simd x; + f32_4_simd r; + }; + union { + f32_4_simd y; + f32_4_simd g; + }; + union { + f32_4_simd z; + f32_4_simd b; + }; + union { + f32_4_simd w; + f32_4_simd a; + }; }; + + f32_4_simd v[4]; }; - - simd_f32_4 v[4]; }; -}; -struct v4_simd_f32_8 { - union { - struct { - union { - simd_f32_8 x; - simd_f32_8 r; - }; - union { - simd_f32_8 y; - simd_f32_8 g; - }; - union { - simd_f32_8 z; - simd_f32_8 b; - }; - union { - simd_f32_8 w; - simd_f32_8 a; + struct v4_f32_8_simd { + union { + struct { + union { + f32_8_simd x; + f32_8_simd r; + }; + union { + f32_8_simd y; + f32_8_simd g; + }; + union { + f32_8_simd z; + f32_8_simd b; + }; + union { + f32_8_simd w; + f32_8_simd a; + }; }; + + f32_8_simd v[4]; }; - - simd_f32_8 v[4]; }; -}; -struct v4_simd_f32_16 { - union { - struct { - union { - simd_f32_16 x; - simd_f32_16 r; - }; - union { - simd_f32_16 y; - simd_f32_16 g; - }; - union { - simd_f32_16 z; - simd_f32_16 b; - }; - union { - simd_f32_16 w; - simd_f32_16 a; + struct v4_f32_16_simd { + union { + struct { + union { + f32_16_simd x; + f32_16_simd r; + }; + union { + f32_16_simd y; + f32_16_simd g; + }; + union { + f32_16_simd z; + f32_16_simd b; + }; + union { + f32_16_simd w; + f32_16_simd a; + }; }; + + f32_16_simd v[4]; }; - - simd_f32_16 v[4]; }; -}; +} #endif diff --git a/Stdlib/Intrinsics.h b/Stdlib/Intrinsics.h new file mode 100644 index 0000000..c537534 --- /dev/null +++ b/Stdlib/Intrinsics.h @@ -0,0 +1,70 @@ +/** + * Jingga + * + * @package Stdlib + * @copyright Dennis Eichhorn + * @license OMS License 1.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef STDLIB_INTRINSICS_H +#define STDLIB_INTRINSICS_H + +#include +#include +#include +#include + +#include "Types.h" + +namespace Stdlib::Intrinsics +{ + inline + f32 sqrt(f32 a) { + return _mm_cvtss_f32(_mm_sqrt_ss(_mm_set_ss(a))); + } + + inline + f32 round(f32 a) { + return _mm_cvtss_f32( + _mm_round_ss( + _mm_setzero_ps(), + _mm_set_ss(a), + (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) + ) + ); + } + + inline + uint32 round_to_int(f32 a) { + return (uint32) _mm_cvtss_si32(_mm_set_ss(a)); + } + + inline + f32 floor(f32 a) { + return _mm_cvtss_f32(_mm_floor_ss(_mm_setzero_ps(), _mm_set_ss(a))); + } + + inline + f32 ceil(f32 a) { + return _mm_cvtss_f32(_mm_ceil_ss(_mm_setzero_ps(), _mm_set_ss(a))); + } + + inline + uint32 hash(uint64 a, uint64 b = 0) { + uint8 seed[16] = { + 0xaa, 0x9b, 0xbd, 0xb8, + 0xa1, 0x98, 0xac, 0x3f, + 0x1f, 0x94, 0x07, 0xb3, + 0x8c, 0x27, 0x93, 0x69, + }; + + __m128i hash = _mm_set_epi64x(a, b); + hash = _mm_aesdec_si128(hash, _mm_loadu_si128((__m128i *) seed)); + hash = _mm_aesdec_si128(hash, _mm_loadu_si128((__m128i *) seed)); + + return _mm_extract_epi32(hash, 0); + } +} + +#endif \ No newline at end of file diff --git a/Stdlib/Mathtypes.h b/Stdlib/Mathtypes.h index 750792f..0ba1450 100644 --- a/Stdlib/Mathtypes.h +++ b/Stdlib/Mathtypes.h @@ -13,6 +13,8 @@ #include #include +// @todo check Vectors, we can simplify this!!! 
+ // int32_t vectors typedef union { struct { diff --git a/Stdlib/SIMD/SIMD_F32.h b/Stdlib/SIMD/SIMD_F32.h new file mode 100644 index 0000000..367fafa --- /dev/null +++ b/Stdlib/SIMD/SIMD_F32.h @@ -0,0 +1,1101 @@ +/** + * Karaka + * + * @package Stdlib + * @copyright Dennis Eichhorn + * @license OMS License 1.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef STDLIB_SIMD_F32_H +#define STDLIB_SIMD_F32_H + +#include +#include + +#include "../Types.h" + +namespace Stdlib::SIMD +{ + struct f32_4_simd { + union { + __m128 s; + f32 v[4]; + }; + }; + + struct f32_8_simd { + union { + __m256 s; + f32 v[8]; + }; + }; + + struct f32_16_simd { + union { + __m512 s; + f32 v[16]; + }; + }; + + inline + f32_4_simd load_f32_4_simd(f32 *mem) + { + f32_4_simd simd; + simd.s = _mm_loadu_ps(mem); + + return simd; + } + + inline + f32_4_simd init_f32_4_simd(f32 *mem) + { + f32_4_simd simd; + simd.s = _mm_set_ps(mem[0], mem[1], mem[2], mem[3]); + + return simd; + } + + inline + void unload_f32_4_simd(f32_4_simd a, f32* array) + { + _mm_store_ps(array, a.s); + } + + inline + f32_8_simd load_f32_8_simd(f32 *mem) + { + f32_8_simd simd; + simd.s = _mm256_loadu_ps(mem); + + return simd; + } + + inline + f32_8_simd init_f32_8_simd(f32 *mem) + { + f32_8_simd simd; + simd.s = _mm256_set_ps( + mem[0], mem[1], mem[2], mem[3], + mem[4], mem[5], mem[6], mem[7] + ); + + return simd; + } + + inline + void unload_f32_8_simd(f32_8_simd a, f32* array) + { + _mm256_store_ps(array, a.s); + } + + inline + f32_16_simd load_f32_16_simd(f32 *mem) + { + f32_16_simd simd; + simd.s = _mm512_loadu_ps(mem); + + return simd; + } + + inline + f32_16_simd init_f32_16_simd(f32 *mem) + { + f32_16_simd simd; + simd.s = _mm512_set_ps( + mem[0], mem[1], mem[2], mem[3], + mem[4], mem[5], mem[6], mem[7], + mem[8], mem[9], mem[10], mem[11], + mem[12], mem[13], mem[14], mem[15] + ); + + return simd; + } + + inline + void unload_f32_16_simd(f32_16_simd a, f32* array) + { + _mm512_store_ps(array, a.s); + } + + inline + f32_4_simd init_zero_f32_4_simd() + { + f32_4_simd simd; + simd.s = _mm_setzero_ps(); + + return simd; + } + + inline + f32_8_simd init_zero_f32_8_simd() + { + f32_8_simd simd; + simd.s = _mm256_setzero_ps(); + + return simd; + } + + inline + f32_16_simd init_zero_f32_16_simd() + { + f32_16_simd simd; + simd.s = _mm512_setzero_ps(); + + return simd; + } + + inline + f32_4_simd operator+(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_add_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator+(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_add_ps(a.s, b.s); + + return simd; + } + + inline + f32_16_simd operator+(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_add_ps(a.s, b.s); + + return simd; + } + + inline + f32_4_simd operator-(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_sub_ps(a.s, b.s); + + return simd; + } + + inline + f32_4_simd operator-(f32_4_simd a) + { + return init_zero_f32_4_simd() - a; + } + + inline + f32_8_simd operator-(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_sub_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator-(f32_8_simd a) + { + return init_zero_f32_8_simd() - a; + } + + inline + f32_16_simd operator-(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_sub_ps(a.s, b.s); + + return simd; + } + + inline + f32_16_simd operator-(f32_16_simd a) + { + return init_zero_f32_16_simd() - a; + } + + inline + f32_4_simd operator*(f32_4_simd a, 
f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_mul_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator*(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_mul_ps(a.s, b.s); + + return simd; + } + + inline + f32_16_simd operator*(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_mul_ps(a.s, b.s); + + return simd; + } + + inline + f32_4_simd operator/(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_div_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator/(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_div_ps(a.s, b.s); + + return simd; + } + + inline + f32_16_simd operator/(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_div_ps(a.s, b.s); + + return simd; + } + + inline + f32_4_simd operator^(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_xor_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator^(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_xor_ps(a.s, b.s); + + return simd; + } + + inline + f32_16_simd operator^(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_xor_ps(a.s, b.s); + + return simd; + } + + inline + f32_4_simd& operator-=(f32_4_simd &a, f32_4_simd b) + { + a = a - b; + + return a; + } + + inline + f32_8_simd& operator-=(f32_8_simd &a, f32_8_simd b) + { + a = a - b; + + return a; + } + + inline + f32_16_simd& operator-=(f32_16_simd &a, f32_16_simd b) + { + a = a - b; + + return a; + } + + inline + f32_4_simd& operator+=(f32_4_simd &a, f32_4_simd b) + { + a = a + b; + + return a; + } + + inline + f32_8_simd& operator+=(f32_8_simd &a, f32_8_simd b) + { + a = a + b; + + return a; + } + + inline + f32_16_simd& operator+=(f32_16_simd &a, f32_16_simd b) + { + a = a + b; + + return a; + } + + inline + f32_4_simd& operator*=(f32_4_simd &a, f32_4_simd b) + { + a = a * b; + + return a; + } + + inline + f32_8_simd& operator*=(f32_8_simd &a, f32_8_simd b) + { + a = a * b; + + return a; + } + + inline + f32_16_simd& operator*=(f32_16_simd &a, f32_16_simd b) + { + a = a * b; + + return a; + } + + inline + f32_4_simd& operator/=(f32_4_simd &a, f32_4_simd b) + { + a = a / b; + + return a; + } + + inline + f32_8_simd& operator/=(f32_8_simd &a, f32_8_simd b) + { + a = a / b; + + return a; + } + + inline + f32_16_simd& operator/=(f32_16_simd &a, f32_16_simd b) + { + a = a / b; + + return a; + } + + inline + f32_4_simd& operator^=(f32_4_simd &a, f32_4_simd b) + { + a = a ^ b; + + return a; + } + + inline + f32_8_simd& operator^=(f32_8_simd &a, f32_8_simd b) + { + a = a ^ b; + + return a; + } + + inline + f32_16_simd& operator^=(f32_16_simd &a, f32_16_simd b) + { + a = a ^ b; + + return a; + } + + inline + f32_4_simd operator<(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_cmplt_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator<(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_LT_OQ); + + return simd; + } + + inline + f32_16_simd operator<(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_mask_blend_ps( + _mm512_cmplt_ps_mask(a.s, b.s), + a.s, + b.s + ); + + return simd; + } + + inline + f32_4_simd operator<=(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_cmple_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator<=(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_LE_OQ); + + return simd; + } + + inline + 
f32_16_simd operator<=(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_mask_blend_ps( + _mm512_cmp_ps_mask(a.s, b.s, _CMP_LE_OQ), + a.s, + b.s + ); + + return simd; + } + + inline + f32_4_simd operator>(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_cmpgt_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator>(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_GT_OQ); + + return simd; + } + + inline + f32_16_simd operator>(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_mask_blend_ps( + _mm512_cmp_ps_mask(a.s, b.s, _CMP_GT_OQ), + a.s, + b.s + ); + + return simd; + } + + inline + f32_4_simd operator>=(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_cmpge_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator>=(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_GE_OQ); + + return simd; + } + + inline + f32_16_simd operator>=(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_mask_blend_ps( + _mm512_cmp_ps_mask(a.s, b.s, _CMP_GE_OQ), + a.s, + b.s + ); + + return simd; + } + + inline + f32_4_simd operator==(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_cmpeq_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator==(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_EQ_OQ); + + return simd; + } + + inline + f32_16_simd operator==(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_mask_blend_ps( + _mm512_cmp_ps_mask(a.s, b.s, _CMP_EQ_OQ), + a.s, + b.s + ); + + return simd; + } + + inline + f32_4_simd operator!=(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_cmpneq_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator!=(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_NEQ_OQ); + + return simd; + } + + inline + f32_16_simd operator!=(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_mask_blend_ps( + _mm512_cmp_ps_mask(a.s, b.s, _CMP_NEQ_OQ), + a.s, + b.s + ); + + return simd; + } + + inline + f32_4_simd operator&(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_and_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator&(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_and_ps(a.s, b.s); + + return simd; + } + + inline + f32_16_simd operator&(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_and_ps(a.s, b.s); + + return simd; + } + + inline + f32_4_simd operator|(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_or_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator|(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_or_ps(a.s, b.s); + + return simd; + } + + inline + f32_16_simd operator|(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_or_ps(a.s, b.s); + + return simd; + } + + inline + f32_4_simd& operator&=(f32_4_simd &a, f32_4_simd b) + { + a = a & b; + + return a; + } + + inline + f32_8_simd& operator&=(f32_8_simd &a, f32_8_simd b) + { + a = a & b; + + return a; + } + + inline + f32_16_simd& operator&=(f32_16_simd &a, f32_16_simd b) + { + a = a & b; + + return a; + } + + inline + f32_4_simd& operator|=(f32_4_simd &a, f32_4_simd b) + { + a = a | b; + + return a; + } + + inline + f32_8_simd& operator|=(f32_8_simd &a, f32_8_simd b) + { + a = a | b; + + return a; 
+ } + + inline + f32_16_simd& operator|=(f32_16_simd &a, f32_16_simd b) + { + a = a | b; + + return a; + } + + inline + f32_4_simd abs(f32_4_simd a) + { + unsigned int unsigned_mask = (unsigned int) (1 << 31); + __m128 mask = _mm_set1_ps(*(float *) &unsigned_mask); + + f32_4_simd simd; + simd.s = _mm_and_ps(a.s, mask); + + return simd; + } + + inline + f32_8_simd abs(f32_8_simd a) + { + unsigned int unsigned_mask = (unsigned int) (1 << 31); + __m256 mask = _mm256_set1_ps(*(float *) &unsigned_mask); + + f32_8_simd simd; + simd.s = _mm256_and_ps(a.s, mask); + + return simd; + } + + inline + f32_16_simd abs(f32_16_simd a) + { + unsigned int unsigned_mask = (unsigned int) (1 << 31); + __m512 mask = _mm512_set1_ps(*(float *) &unsigned_mask); + + f32_16_simd simd; + simd.s = _mm512_and_ps(a.s, mask); + + return simd; + } + + inline + f32_4_simd min(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_min_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd min(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_min_ps(a.s, b.s); + + return simd; + } + + inline + f32_16_simd min(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_min_ps(a.s, b.s); + + return simd; + } + + inline + f32_4_simd max(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_max_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd max(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_max_ps(a.s, b.s); + + return simd; + } + + inline + f32_16_simd max(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_max_ps(a.s, b.s); + + return simd; + } + + inline + f32_4_simd sign(f32_4_simd a) + { + unsigned int umask = (unsigned int) (1 << 31); + __m128 mask = _mm_set1_ps(*(float *) &umask); + + f32_4_simd signBit; + signBit.s = _mm_and_ps(a.s, mask); + + f32_4_simd b; + b.s = _mm_set1_ps(1.0f); + + f32_4_simd simd = b | signBit; + + return simd; + } + + inline + f32_8_simd sign(f32_8_simd a) + { + unsigned int umask = (unsigned int) (1 << 31); + __m256 mask = _mm256_set1_ps(*(float *) &umask); + + f32_8_simd signBit; + signBit.s = _mm256_and_ps(a.s, mask); + + f32_8_simd b; + b.s = _mm256_set1_ps(1.0f); + + f32_8_simd simd = b | signBit; + + return simd; + } + + inline + f32_16_simd sign(f32_16_simd a) + { + unsigned int umask = (unsigned int) (1 << 31); + __m512 mask = _mm512_set1_ps(*(float *) &umask); + + f32_16_simd signBit; + signBit.s = _mm512_and_ps(a.s, mask); + + f32_16_simd b; + b.s = _mm512_set1_ps(1.0f); + + f32_16_simd simd = b | signBit; + + return simd; + } + + inline + f32_4_simd floor(f32_4_simd a) + { + f32_4_simd simd; + simd.s = _mm_floor_ps(a.s); + + return simd; + } + + inline + f32_8_simd floor(f32_8_simd a) + { + f32_8_simd simd; + simd.s = _mm256_floor_ps(a.s); + + return simd; + } + + inline + f32_16_simd floor(f32_16_simd a) + { + f32_16_simd simd; + simd.s = _mm512_floor_ps(a.s); + + return simd; + } + + inline + f32_4_simd ceil(f32_4_simd a) + { + f32_4_simd simd; + simd.s = _mm_ceil_ps(a.s); + + return simd; + } + + inline + f32_8_simd ceil(f32_8_simd a) + { + f32_8_simd simd; + simd.s = _mm256_ceil_ps(a.s); + + return simd; + } + + inline + f32_16_simd ceil(f32_16_simd a) + { + f32_16_simd simd; + simd.s = _mm512_ceil_ps(a.s); + + return simd; + } + + inline + f32_4_simd sqrt(f32_4_simd a) + { + f32_4_simd simd; + simd.s = _mm_sqrt_ps(a.s); + + return simd; + } + + inline + f32_8_simd sqrt(f32_8_simd a) + { + f32_8_simd simd; + simd.s = _mm256_sqrt_ps(a.s); + + return simd; + } + + inline + 
f32_16_simd sqrt(f32_16_simd a) + { + f32_16_simd simd; + simd.s = _mm512_sqrt_ps(a.s); + + return simd; + } + + inline + f32_4_simd sqrt_inv_approx(f32_4_simd a) + { + f32_4_simd simd; + simd.s = _mm_rsqrt_ps(a.s); + + return simd; + } + + inline + f32_8_simd sqrt_inv_approx(f32_8_simd a) + { + f32_8_simd simd; + simd.s = _mm256_rsqrt_ps(a.s); + + return simd; + } + + inline + f32_16_simd sqrt_inv_approx(f32_16_simd a) + { + f32_16_simd simd; + simd.s = _mm512_rsqrt14_ps(a.s); + + return simd; + } + + inline + f32_4_simd one_over_approx(f32_4_simd a) + { + f32_4_simd simd; + simd.s = _mm_rcp_ps(a.s); + + return simd; + } + + inline + f32_8_simd one_over_approx(f32_8_simd a) + { + f32_8_simd simd; + simd.s = _mm256_rcp_ps(a.s); + + return simd; + } + + inline + f32_16_simd one_over_approx(f32_16_simd a) + { + f32_16_simd simd; + simd.s = _mm512_rcp14_ps(a.s); + + return simd; + } + + inline + f32_4_simd clamp(f32_4_simd min_value, f32_4_simd a, f32_4_simd max_value) + { + return min(max(a, min_value), max_value); + } + + inline + f32_8_simd clamp(f32_8_simd min_value, f32_8_simd a, f32_8_simd max_value) + { + return min(max(a, min_value), max_value); + } + + inline + f32_16_simd clamp(f32_16_simd min_value, f32_16_simd a, f32_16_simd max_value) + { + return min(max(a, min_value), max_value); + } + + inline + int32 which_true(f32_4_simd a) + { + int32 which_true = _mm_movemask_ps(a.s); + + return which_true; + } + + inline + int32 which_true(f32_8_simd a) + { + int32 which_true = _mm256_movemask_ps(a.s); + + return which_true; + } + + inline + int32 which_true(f32_16_simd a) + { + int32 which_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)); + + return which_true; + } + + inline + bool any_true(f32_4_simd a) + { + bool is_any_true = _mm_movemask_ps(a.s) > 0; + + return is_any_true; + } + + inline + bool any_true(f32_8_simd a) + { + bool is_any_true = _mm256_movemask_ps(a.s) > 0; + + return is_any_true; + } + + inline + bool any_true(f32_16_simd a) + { + bool is_any_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) > 0; + + return is_any_true; + } + + inline + bool all_true(f32_4_simd a) + { + bool is_true = (_mm_movemask_ps(a.s) == 15); + + return is_true; + } + + inline + bool all_true(f32_8_simd a) + { + bool is_true = (_mm256_movemask_ps(a.s) == 255); + + return is_true; + } + + inline + bool all_true(f32_16_simd a) + { + bool is_true = (_mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 65535); + + return is_true; + } + + inline + bool all_false(f32_4_simd a) + { + bool is_false = (_mm_movemask_ps(a.s) == 0); + + return is_false; + } + + inline + bool all_false(f32_8_simd a) + { + bool is_false = (_mm256_movemask_ps(a.s) == 0); + + return is_false; + } + + inline + bool all_false(f32_16_simd a) + { + // @todo This can be optimized (requires also changes in the comparison functions return) + bool is_false = (_mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 0); + + return is_false; + } +} + +#endif diff --git a/Stdlib/SIMD/SIMD_Helper.h b/Stdlib/SIMD/SIMD_Helper.h new file mode 100644 index 0000000..8dea4fe --- /dev/null +++ b/Stdlib/SIMD/SIMD_Helper.h @@ -0,0 +1,70 @@ +/** + * Karaka + * + * @package Stdlib + * @copyright Dennis Eichhorn + * @license OMS License 1.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef STDLIB_SIMD_HELPER_H +#define STDLIB_SIMD_HELPER_H + +#include +#include +#include + +namespace Stdlib::SIMD +{ + bool is_avx_supported() + { + uint32_t eax, ebx, ecx, edx; + + eax = 1; // CPUID function 1 + + __asm__ __volatile__( + "cpuid;" + : "=a" 
(eax), "=b" (ebx), "=c" (ecx), "=d" (edx) + : "a" (eax) + ); + + // Check the AVX feature bit in ECX + return (ecx >> 28) & 1; + } + + bool is_avx256_supported() + { + uint32_t eax, ebx, ecx, edx; + + eax = 7; // CPUID function 7 + ecx = 0; // Sub-function 0 + + __asm__ __volatile__( + "cpuid;" + : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) + : "a" (eax), "c" (ecx) + ); + + // Check the AVX-256 (AVX2) feature bit in EBX + return (ebx >> 5) & 1; + } + + bool is_avx512_supported() + { + uint32_t eax, ebx, ecx, edx; + + eax = 7; // CPUID function 7 + ecx = 0; // Sub-function 0 + + __asm__ __volatile__( + "cpuid;" + : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) + : "a" (eax), "c" (ecx) + ); + + // Check the AVX-512 feature bit in EBX + return (ebx >> 16) & 1; + } +} + +#endif \ No newline at end of file diff --git a/Stdlib/SIMD/SIMD_I32.h b/Stdlib/SIMD/SIMD_I32.h new file mode 100644 index 0000000..b1fadb0 --- /dev/null +++ b/Stdlib/SIMD/SIMD_I32.h @@ -0,0 +1,1117 @@ +/** + * Karaka + * + * @package Stdlib + * @copyright Dennis Eichhorn + * @license OMS License 1.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef STDLIB_SIMD_I32_H +#define STDLIB_SIMD_I32_H + +#include +#include + +#include "../Types.h" +#include "SIMD_F32.h" + +namespace Stdlib::SIMD +{ + struct int32_4_simd { + union { + __m128i s; + int32 v[4]; + }; + }; + + struct int32_8_simd { + union { + __m256i s; + int32 v[8]; + }; + }; + + struct int32_16_simd { + union { + __m512i s; + int32 v[16]; + }; + }; + + inline + int32_4_simd load_int32_4_simd(int32 *mem) + { + int32_4_simd simd; + simd.s = _mm_loadu_epi32(mem); + + return simd; + } + + inline + int32_4_simd init_int32_4_simd(int32 *mem) + { + int32_4_simd simd; + simd.s = _mm_set_epi32(mem[0], mem[1], mem[2], mem[3]); + + return simd; + } + + inline + void unload_int32_4_simd(int32_4_simd a, int32* array) + { + _mm_store_epi32(array, a.s); + } + + inline + int32_8_simd load_int32_8_simd(int32 *mem) + { + int32_8_simd simd; + simd.s = _mm256_loadu_epi32(mem); + + return simd; + } + + inline + int32_8_simd init_int32_8_simd(int32 *mem) + { + int32_8_simd simd; + simd.s = _mm256_set_epi32( + mem[0], mem[1], mem[2], mem[3], + mem[4], mem[5], mem[6], mem[7] + ); + + return simd; + } + + inline + void unload_int32_8_simd(int32_8_simd a, int32* array) + { + _mm256_store_epi32(array, a.s); + } + + inline + int32_16_simd load_int32_16_simd(int32 *mem) + { + int32_16_simd simd; + simd.s = _mm512_loadu_epi32(mem); + + return simd; + } + + inline + int32_16_simd init_int32_16_simd(int32 *mem) + { + int32_16_simd simd; + simd.s = _mm512_set_epi32( + mem[0], mem[1], mem[2], mem[3], + mem[4], mem[5], mem[6], mem[7], + mem[8], mem[9], mem[10], mem[11], + mem[12], mem[13], mem[14], mem[15] + ); + + return simd; + } + + inline + void unload_int32_16_simd(int32_16_simd a, int32* array) + { + _mm512_store_epi32(array, a.s); + } + + inline + int32_4_simd init_zero_int32_4_simd() + { + int32_4_simd simd; + simd.s = _mm_setzero_si128(); + + return simd; + } + + inline + int32_8_simd init_zero_int32_8_simd() + { + int32_8_simd simd; + simd.s = _mm256_setzero_si256(); + + return simd; + } + + inline + int32_16_simd init_zero_int32_16_simd() + { + int32_16_simd simd; + simd.s = _mm512_setzero_epi32(); + + return simd; + } + + inline + int32_4_simd operator+(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_add_epi32(a.s, b.s); + + return simd; + } + + inline + int32_8_simd operator+(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = 
_mm256_add_epi32(a.s, b.s); + + return simd; + } + + inline + int32_16_simd operator+(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_add_epi32(a.s, b.s); + + return simd; + } + + inline + int32_4_simd operator-(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_sub_epi32(a.s, b.s); + + return simd; + } + + inline + int32_4_simd operator-(int32_4_simd a) + { + return init_zero_int32_4_simd() - a; + } + + inline + int32_8_simd operator-(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_sub_epi32(a.s, b.s); + + return simd; + } + + inline + int32_8_simd operator-(int32_8_simd a) + { + return init_zero_int32_8_simd() - a; + } + + inline + int32_16_simd operator-(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_sub_epi32(a.s, b.s); + + return simd; + } + + inline + int32_16_simd operator-(int32_16_simd a) + { + return init_zero_int32_16_simd() - a; + } + + inline + int32_4_simd operator*(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_mul_epi32(a.s, b.s); + + return simd; + } + + inline + int32_8_simd operator*(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_mul_epi32(a.s, b.s); + + return simd; + } + + inline + int32_16_simd operator*(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_mul_epi32(a.s, b.s); + + return simd; + } + + inline + Stdlib::SIMD::f32_4_simd operator/(int32_4_simd a, int32_4_simd b) + { + Stdlib::SIMD::f32_4_simd simd; + simd.s = _mm_div_ps(a.s, b.s); + + return simd; + } + + inline + Stdlib::SIMD::f32_8_simd operator/(int32_8_simd a, int32_8_simd b) + { + Stdlib::SIMD::f32_8_simd simd; + simd.s = _mm256_div_ps(a.s, b.s); + + return simd; + } + + inline + Stdlib::SIMD::f32_16_simd operator/(int32_16_simd a, int32_16_simd b) + { + Stdlib::SIMD::f32_16_simd simd; + simd.s = _mm512_div_ps(a.s, b.s); + + return simd; + } + + inline + int32_4_simd operator^(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_xor_epi32(a.s, b.s); + + return simd; + } + + inline + int32_8_simd operator^(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_xor_epi32(a.s, b.s); + + return simd; + } + + inline + int32_16_simd operator^(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_xor_epi32(a.s, b.s); + + return simd; + } + + inline + int32_4_simd& operator-=(int32_4_simd &a, int32_4_simd b) + { + a = a - b; + + return a; + } + + inline + int32_8_simd& operator-=(int32_8_simd &a, int32_8_simd b) + { + a = a - b; + + return a; + } + + inline + int32_16_simd& operator-=(int32_16_simd &a, int32_16_simd b) + { + a = a - b; + + return a; + } + + inline + int32_4_simd& operator+=(int32_4_simd &a, int32_4_simd b) + { + a = a + b; + + return a; + } + + inline + int32_8_simd& operator+=(int32_8_simd &a, int32_8_simd b) + { + a = a + b; + + return a; + } + + inline + int32_16_simd& operator+=(int32_16_simd &a, int32_16_simd b) + { + a = a + b; + + return a; + } + + inline + int32_4_simd& operator*=(int32_4_simd &a, int32_4_simd b) + { + a = a * b; + + return a; + } + + inline + int32_8_simd& operator*=(int32_8_simd &a, int32_8_simd b) + { + a = a * b; + + return a; + } + + inline + int32_16_simd& operator*=(int32_16_simd &a, int32_16_simd b) + { + a = a * b; + + return a; + } + + inline + int32_4_simd& operator/=(int32_4_simd &a, int32_4_simd b) + { + a.s = _mm_cvtps_epi32((a / b).s); + + return a; + } + + inline + int32_8_simd& operator/=(int32_8_simd 
&a, int32_8_simd b) + { + a.s = _mm256_cvtps_epi32((a / b).s); + + return a; + } + + inline + int32_16_simd& operator/=(int32_16_simd &a, int32_16_simd b) + { + a.s = _mm512_cvtps_epi32((a / b).s); + + return a; + } + + inline + int32_4_simd& operator^=(int32_4_simd &a, int32_4_simd b) + { + a = a ^ b; + + return a; + } + + inline + int32_8_simd& operator^=(int32_8_simd &a, int32_8_simd b) + { + a = a ^ b; + + return a; + } + + inline + int32_16_simd& operator^=(int32_16_simd &a, int32_16_simd b) + { + a = a ^ b; + + return a; + } + + inline + int32_4_simd operator<(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_cmplt_epi32(a.s, b.s); + + return simd; + } + + inline + int32_8_simd operator<(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_xor_si256( + _mm256_cmpgt_epi32(a.s, b.s), + _mm256_set1_epi32(-1) + ); + + return simd; + } + + inline + int32_16_simd operator<(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_mask_blend_epi32( + _mm512_cmplt_epi32_mask(a.s, b.s), + a.s, + b.s + ); + + return simd; + } + + inline + int32_4_simd operator<=(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_andnot_si128( + _mm_cmplt_epi32(b.s, a.s), + _mm_set1_epi32(-1) + ); + + return simd; + } + + inline + int32_8_simd operator<=(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_andnot_si256( + _mm256_cmpgt_epi32(a.s, b.s), + _mm256_set1_epi32(-1) + ); + + return simd; + } + + inline + int32_16_simd operator<=(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_mask_blend_epi32( + _mm512_knot(_mm512_cmpgt_epi32_mask(b.s, a.s)), + b.s, + a.s + ); + + return simd; + } + + inline + int32_4_simd operator>(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_cmpgt_epi32(a.s, b.s); + + return simd; + } + + inline + int32_8_simd operator>(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_cmpgt_epi32(a.s, b.s); + + return simd; + } + + inline + int32_16_simd operator>(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_mask_blend_ps( + _mm512_cmpgt_epi32_mask(a.s, b.s), + a.s, + b.s + ); + + return simd; + } + + inline + int32_4_simd operator>=(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_andnot_si128( + _mm_cmplt_epi32(a.s, b.s), + _mm_set1_epi32(-1) + ); + + return simd; + } + + inline + int32_8_simd operator>=(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_andnot_si256( + _mm256_cmpgt_epi32(b.s, a.s), + _mm256_set1_epi32(-1) + ); + + return simd; + } + + inline + int32_16_simd operator>=(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_mask_blend_ps( + _mm512_cmpge_epi32_mask(a.s, b.s), + a.s, + b.s + ); + + return simd; + } + + inline + int32_4_simd operator==(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_cmpeq_epi32(a.s, b.s); + + return simd; + } + + inline + int32_8_simd operator==(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_cmpeq_epi32(a.s, b.s); + + return simd; + } + + inline + int32_16_simd operator==(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_mask_blend_ps( + _mm512_cmpeq_epi32_mask(a.s, b.s), + a.s, + b.s + ); + + return simd; + } + + inline + int32_4_simd operator!=(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_cmpneq_epi32(a.s, b.s); + + return simd; + } + + inline + int32_8_simd 
operator!=(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_cmp_epi32(a.s, b.s, _CMP_NEQ_OQ); + + return simd; + } + + inline + int32_16_simd operator!=(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_mask_mov_epi32( + _mm512_setzero_epi32(), + _mm512_cmp_ps_mask(a.s, b.s, _CMP_NEQ_OQ), + _mm512_set1_epi32(1.0f) + ); + + return simd; + } + + inline + int32_4_simd operator&(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_and_epi32(a.s, b.s); + + return simd; + } + + inline + int32_8_simd operator&(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_and_epi32(a.s, b.s); + + return simd; + } + + inline + int32_16_simd operator&(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_and_epi32(a.s, b.s); + + return simd; + } + + inline + int32_4_simd operator|(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_or_epi32(a.s, b.s); + + return simd; + } + + inline + int32_8_simd operator|(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_or_epi32(a.s, b.s); + + return simd; + } + + inline + int32_16_simd operator|(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_or_epi32(a.s, b.s); + + return simd; + } + + inline + int32_4_simd& operator&=(int32_4_simd &a, int32_4_simd b) + { + a = a & b; + + return a; + } + + inline + int32_8_simd& operator&=(int32_8_simd &a, int32_8_simd b) + { + a = a & b; + + return a; + } + + inline + int32_16_simd& operator&=(int32_16_simd &a, int32_16_simd b) + { + a = a & b; + + return a; + } + + inline + int32_4_simd& operator|=(int32_4_simd &a, int32_4_simd b) + { + a = a | b; + + return a; + } + + inline + int32_8_simd& operator|=(int32_8_simd &a, int32_8_simd b) + { + a = a | b; + + return a; + } + + inline + int32_16_simd& operator|=(int32_16_simd &a, int32_16_simd b) + { + a = a | b; + + return a; + } + + inline + int32_4_simd abs(int32_4_simd a) + { + unsigned int unsigned_mask = (unsigned int) (1 << 31); + __m128 mask = _mm_set1_epi32(*(float *) &unsigned_mask); + + int32_4_simd simd; + simd.s = _mm_and_epi32(a.s, mask); + + return simd; + } + + inline + int32_8_simd abs(int32_8_simd a) + { + unsigned int unsigned_mask = (unsigned int) (1 << 31); + __m256 mask = _mm256_set1_epi32(*(float *) &unsigned_mask); + + int32_8_simd simd; + simd.s = _mm256_and_epi32(a.s, mask); + + return simd; + } + + inline + int32_16_simd abs(int32_16_simd a) + { + unsigned int unsigned_mask = (unsigned int) (1 << 31); + __m512 mask = _mm512_set1_epi32(*(float *) &unsigned_mask); + + int32_16_simd simd; + simd.s = _mm512_and_epi32(a.s, mask); + + return simd; + } + + inline + int32_4_simd min(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_min_epi32(a.s, b.s); + + return simd; + } + + inline + int32_8_simd min(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_min_epi32(a.s, b.s); + + return simd; + } + + inline + int32_16_simd min(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_min_epi32(a.s, b.s); + + return simd; + } + + inline + int32_4_simd max(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_max_epi32(a.s, b.s); + + return simd; + } + + inline + int32_8_simd max(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_max_epi32(a.s, b.s); + + return simd; + } + + inline + int32_16_simd max(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = 
_mm512_max_epi32(a.s, b.s); + + return simd; + } + + inline + int32_4_simd sign(int32_4_simd a) + { + unsigned int umask = (unsigned int) (1 << 31); + __m128 mask = _mm_set1_epi32(*(float *) &umask); + + int32_4_simd signBit; + signBit.s = _mm_and_epi32(a.s, mask); + + int32_4_simd b; + b.s = _mm_set1_epi32(1.0f); + + int32_4_simd simd = b | signBit; + + return simd; + } + + inline + int32_8_simd sign(int32_8_simd a) + { + unsigned int umask = (unsigned int) (1 << 31); + __m256 mask = _mm256_set1_epi32(*(float *) &umask); + + int32_8_simd signBit; + signBit.s = _mm256_and_epi32(a.s, mask); + + int32_8_simd b; + b.s = _mm256_set1_epi32(1.0f); + + int32_8_simd simd = b | signBit; + + return simd; + } + + inline + int32_16_simd sign(int32_16_simd a) + { + unsigned int umask = (unsigned int) (1 << 31); + __m512 mask = _mm512_set1_epi32(*(float *) &umask); + + int32_16_simd signBit; + signBit.s = _mm512_and_epi32(a.s, mask); + + int32_16_simd b; + b.s = _mm512_set1_epi32(1.0f); + + int32_16_simd simd = b | signBit; + + return simd; + } + + inline + int32_4_simd floor(int32_4_simd a) + { + int32_4_simd simd; + simd.s = _mm_floor_epi32(a.s); + + return simd; + } + + inline + int32_8_simd floor(int32_8_simd a) + { + int32_8_simd simd; + simd.s = _mm256_floor_epi32(a.s); + + return simd; + } + + inline + int32_16_simd floor(int32_16_simd a) + { + int32_16_simd simd; + simd.s = _mm512_floor_epi32(a.s); + + return simd; + } + + inline + int32_4_simd ceil(int32_4_simd a) + { + int32_4_simd simd; + simd.s = _mm_ceil_epi32(a.s); + + return simd; + } + + inline + int32_8_simd ceil(int32_8_simd a) + { + int32_8_simd simd; + simd.s = _mm256_ceil_epi32(a.s); + + return simd; + } + + inline + int32_16_simd ceil(int32_16_simd a) + { + int32_16_simd simd; + simd.s = _mm512_ceil_epi32(a.s); + + return simd; + } + + inline + int32_4_simd sqrt(int32_4_simd a) + { + int32_4_simd simd; + simd.s = _mm_sqrt_epi32(a.s); + + return simd; + } + + inline + int32_8_simd sqrt(int32_8_simd a) + { + int32_8_simd simd; + simd.s = _mm256_sqrt_epi32(a.s); + + return simd; + } + + inline + int32_16_simd sqrt(int32_16_simd a) + { + int32_16_simd simd; + simd.s = _mm512_sqrt_epi32(a.s); + + return simd; + } + + inline + int32_4_simd sqrt_inv_approx(int32_4_simd a) + { + int32_4_simd simd; + simd.s = _mm_rsqrt_epi32(a.s); + + return simd; + } + + inline + int32_8_simd sqrt_inv_approx(int32_8_simd a) + { + int32_8_simd simd; + simd.s = _mm256_rsqrt_epi32(a.s); + + return simd; + } + + inline + int32_16_simd sqrt_inv_approx(int32_16_simd a) + { + int32_16_simd simd; + simd.s = _mm512_rsqrt14_epi32(a.s); + + return simd; + } + + inline + int32_4_simd one_over_approx(int32_4_simd a) + { + int32_4_simd simd; + simd.s = _mm_rcp_epi32(a.s); + + return simd; + } + + inline + int32_8_simd one_over_approx(int32_8_simd a) + { + int32_8_simd simd; + simd.s = _mm256_rcp_epi32(a.s); + + return simd; + } + + inline + int32_16_simd one_over_approx(int32_16_simd a) + { + int32_16_simd simd; + simd.s = _mm512_rcp14_epi32(a.s); + + return simd; + } + + inline + int32_4_simd clamp(int32_4_simd min_value, int32_4_simd a, int32_4_simd max_value) + { + return min(max(a, min_value), max_value); + } + + inline + int32_8_simd clamp(int32_8_simd min_value, int32_8_simd a, int32_8_simd max_value) + { + return min(max(a, min_value), max_value); + } + + inline + int32_16_simd clamp(int32_16_simd min_value, int32_16_simd a, int32_16_simd max_value) + { + return min(max(a, min_value), max_value); + } + + inline + int32 which_true(int32_4_simd a) + { + int32 
which_true = _mm_movemask_epi32(a.s); + + return which_true; + } + + inline + int32 which_true(int32_8_simd a) + { + int32 which_true = _mm256_movemask_epi32(a.s); + + return which_true; + } + + inline + int32 which_true(int32_16_simd a) + { + int32 which_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)); + + return which_true; + } + + inline + bool any_true(int32_4_simd a) + { + bool is_any_true = _mm_movemask_epi32(a.s) > 0; + + return is_any_true; + } + + inline + bool any_true(int32_8_simd a) + { + bool is_any_true = _mm256_movemask_epi32(a.s) > 0; + + return is_any_true; + } + + inline + bool any_true(int32_16_simd a) + { + bool is_any_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) > 0; + + return is_any_true; + } + + inline + bool all_true(int32_4_simd a) + { + bool is_true = (_mm_movemask_epi32(a.s) == 15); + + return is_true; + } + + inline + bool all_true(int32_8_simd a) + { + bool is_true = (_mm256_movemask_epi32(a.s) == 255); + + return is_true; + } + + inline + bool all_true(int32_16_simd a) + { + bool is_true = (_mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 65535); + + return is_true; + } + + inline + bool all_false(int32_4_simd a) + { + bool is_false = (_mm_movemask_epi32(a.s) == 0); + + return is_false; + } + + inline + bool all_false(int32_8_simd a) + { + bool is_false = (_mm256_movemask_epi32(a.s) == 0); + + return is_false; + } + + inline + bool all_false(int32_16_simd a) + { + // @todo This can be optimized (requires also changes in the comparison functions return) + bool is_false = (_mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 0); + + return is_false; + } +} + +#endif diff --git a/Types.h b/Stdlib/Types.h similarity index 69% rename from Types.h rename to Stdlib/Types.h index 402a29d..2cd4679 100644 --- a/Types.h +++ b/Stdlib/Types.h @@ -7,8 +7,8 @@ * @version 1.0.0 * @link https://jingga.app */ -#ifndef TYPES_H -#define TYPES_H +#ifndef STDLIB_TYPES_H +#define STDLIB_TYPES_H #include #include @@ -18,6 +18,11 @@ typedef int16_t int16; typedef int32_t int32; typedef int64_t int64; +typedef uint8_t uint8; +typedef uint16_t uint16; +typedef uint32_t uint32; +typedef uint64_t uint64; + typedef float f32; typedef double f64; diff --git a/Threads/Job.h b/Threads/Job.h index c53bd2b..9c3612c 100755 --- a/Threads/Job.h +++ b/Threads/Job.h @@ -19,6 +19,7 @@ namespace Threads struct job_t { JobFunc func; void *arg; + int state; job_t *next; }; diff --git a/Threads/Thread.h b/Threads/Thread.h index 581590e..1ed45ba 100755 --- a/Threads/Thread.h +++ b/Threads/Thread.h @@ -19,38 +19,26 @@ namespace Threads { Job *pool_work_create(JobFunc func, void *arg) { - Job *work; - if (func == NULL) { return NULL; } - work = (Job *) malloc(sizeof(*work)); + Job *work = (Job *) malloc(sizeof(*work)); work->func = func; work->arg = arg; + work->state = 0; work->next = NULL; return work; } - void pool_work_destroy(Job *work) + Job *pool_work_poll(Threads::ThreadPool *pool) { - if (work == NULL) { - return; - } - - free(work); - } - - Job *pool_work_get(Threads::ThreadPool *pool) - { - Job *work; - if (pool == NULL) { return NULL; } - work = pool->work_first; + Job *work = pool->work_first; if (work == NULL) { return NULL; } @@ -70,7 +58,7 @@ namespace Threads Threads::ThreadPool *pool = (Threads::ThreadPool *) arg; Threads::Job *work; - while (1) { + while (true) { pthread_mutex_lock(&(pool->work_mutex)); while (pool->work_first == NULL && !pool->stop) { @@ -81,13 +69,12 @@ namespace Threads break; } - work = Threads::pool_work_get(pool); + work = 
Threads::pool_work_poll(pool); ++(pool->working_cnt); pthread_mutex_unlock(&(pool->work_mutex)); if (work != NULL) { - work->func(work->arg); - pool_work_destroy(work); + work->func(work); } pthread_mutex_lock(&(pool->work_mutex)); @@ -155,20 +142,15 @@ namespace Threads void pool_destroy(Threads::ThreadPool *pool) { - Threads::Job *work; - Threads::Job *work2; - if (pool == NULL) { return; } pthread_mutex_lock(&(pool->work_mutex)); - work = pool->work_first; + Threads::Job *work = pool->work_first; while (work != NULL) { - work2 = work->next; - pool_work_destroy(work); - work = work2; + work = work->next; } pool->stop = true; @@ -184,17 +166,15 @@ namespace Threads free(pool); } - bool pool_add_work(Threads::ThreadPool *pool, JobFunc func, void *arg) + Threads::Job* pool_add_work(Threads::ThreadPool *pool, JobFunc func, void *arg) { - Threads::Job *work; - if (pool == NULL) { - return false; + return NULL; } - work = Threads::pool_work_create(func, arg); + Threads::Job *work = Threads::pool_work_create(func, arg); if (work == NULL) { - return false; + return NULL; } pthread_mutex_lock(&(pool->work_mutex)); @@ -209,7 +189,7 @@ namespace Threads pthread_cond_broadcast(&(pool->work_cond)); pthread_mutex_unlock(&(pool->work_mutex)); - return true; + return work; } } diff --git a/Utils/TestUtils.h b/Utils/TestUtils.h index 26ad21f..b039bdc 100755 --- a/Utils/TestUtils.h +++ b/Utils/TestUtils.h @@ -17,7 +17,7 @@ if ((a) == (b)) { \ printf("."); \ } else { \ - printf("[F]"); \ + printf("\033[31m[F]\033[0m"); \ printf("\n\n%s - %i: ", __FILE__, __LINE__); \ printf((t1), (a)); printf(" != "); printf((t2), (b)); printf("\n"); \ return 0; } \ @@ -27,7 +27,7 @@ if (oms_abs((a) - (b)) <= (delta)) { \ printf("."); \ } else { \ - printf("[F]"); \ + printf("\033[31m[F]\033[0m"); \ printf("\n\n%s - %i: ", __FILE__, __LINE__); \ printf((t1), (a)); printf(" != "); printf((t2), (b)); printf("\n"); \ return 0; } \ @@ -37,7 +37,7 @@ if (strstr((a), (b)) != NULL) { \ printf("."); \ } else { \ - printf("[F]"); \ + printf("\033[31m[F]\033[0m"); \ printf("\n\n%s - %i: ", __FILE__, __LINE__); \ printf("%s", (a)); printf(" !contains "); printf("%s", (b)); printf("\n"); \ return 0; } \ @@ -47,7 +47,7 @@ if ((a) == true) { \ printf("."); \ } else { \ - printf("[F]"); \ + printf("\033[31m[F]\033[0m"); \ printf("\n\n%s - %i: ", __FILE__, __LINE__); \ printf("%d", (a)); printf(" != "); printf("1"); printf("\n"); \ return 0; } \ @@ -57,7 +57,7 @@ if ((a) == false) { \ printf("."); \ } else { \ - printf("[F]"); \ + printf("\033[31m[F]\033[0m"); \ printf("\n\n%s - %i: ", __FILE__, __LINE__); \ printf("%d", (a)); printf(" != "); printf("1"); printf("\n"); \ return 0; } \ diff --git a/tests/Stdlib/SIMD/SIMD_F32Test.cpp b/tests/Stdlib/SIMD/SIMD_F32Test.cpp new file mode 100644 index 0000000..15b4e6f --- /dev/null +++ b/tests/Stdlib/SIMD/SIMD_F32Test.cpp @@ -0,0 +1,386 @@ +/** + * Jingga + * + * @package Test + * @copyright Dennis Eichhorn + * @license OMS License 1.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#include +#include + +#include "../../../Utils/TestUtils.h" +#include "../../../Stdlib/SIMD/SIMD_F32.h" +#include "../../../Stdlib/SIMD/SIMD_Helper.h" + +float* a_array_4 = (float *) aligned_alloc(32, 4 * sizeof(float)); +float* b_array_4 = (float *) aligned_alloc(32, 4 * sizeof(float)); +float* expected_array_4 = (float *) aligned_alloc(32, 4 * sizeof(float)); +float* result_array_4 = (float *) aligned_alloc(32, 4 * sizeof(float)); + +float* a_array_8 = (float *) aligned_alloc(32, 8 * sizeof(float)); +float* 
b_array_8 = (float *) aligned_alloc(32, 8 * sizeof(float)); +float* expected_array_8 = (float *) aligned_alloc(32, 8 * sizeof(float)); +float* result_array_8 = (float *) aligned_alloc(32, 8 * sizeof(float)); + +float* a_array_16 = (float *) aligned_alloc(32, 16 * sizeof(float)); +float* b_array_16 = (float *) aligned_alloc(32, 16 * sizeof(float)); +float* expected_array_16 = (float *) aligned_alloc(32, 16 * sizeof(float)); +float* result_array_16 = (float *) aligned_alloc(32, 16 * sizeof(float)); + +int test_operator_plus(); +int test_operator_minus(); +int test_operator_mul(); + +int main(int argc, char** argv) +{ + printf("SIMD_F32:\n"); + + test_operator_plus(); + test_operator_minus(); + test_operator_mul(); + + printf("\n\n"); + + return 0; +} + +int test_operator_plus() +{ + printf("\noperator+:\n"); + printf("[4]: "); + if (!Stdlib::SIMD::is_avx_supported()) { + printf("[\033[33mNot supported\033[0m]"); + + return 0; + } + + a_array_4[0] = 0.0f; a_array_4[1] = 1.0f; a_array_4[2] = 2.0f; a_array_4[3] = 3.0f; + b_array_4[0] = 0.0f; b_array_4[1] = 1.0f; b_array_4[2] = 2.0f; b_array_4[3] = 3.0f; + + expected_array_4[0] = 0.0f; expected_array_4[1] = 2.0f; expected_array_4[2] = 4.0f; expected_array_4[3] = 6.0f; + Stdlib::SIMD::f32_4_simd expected_simd_4 = Stdlib::SIMD::load_f32_4_simd(expected_array_4); + + Stdlib::SIMD::f32_4_simd a_simd_4 = Stdlib::SIMD::load_f32_4_simd(a_array_4); + Stdlib::SIMD::f32_4_simd b_simd_4 = Stdlib::SIMD::load_f32_4_simd(b_array_4); + + Stdlib::SIMD::f32_4_simd result_simd_4 = a_simd_4 + b_simd_4; + Stdlib::SIMD::unload_f32_4_simd(result_simd_4, result_array_4); + + ASSERT_EQUALS_WITH_DELTA(result_array_4[0], expected_array_4[0], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_4[1], expected_array_4[1], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_4[2], expected_array_4[2], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_4[3], expected_array_4[3], 0.01, "%f", "%f"); + ASSERT_TRUE(Stdlib::SIMD::all_true(result_simd_4 == expected_simd_4)); + + printf("\n[8]: "); + + if (!Stdlib::SIMD::is_avx256_supported()) { + printf("[\033[33mNot supported\033[0m]"); + + return 0; + } + + a_array_8[0] = 0.0f; a_array_8[1] = 1.0f; a_array_8[2] = 2.0f; a_array_8[3] = 3.0f; + a_array_8[4] = 0.0f; a_array_8[5] = 1.0f; a_array_8[6] = 2.0f; a_array_8[7] = 3.0f; + + b_array_8[0] = 0.0f; b_array_8[1] = 1.0f; b_array_8[2] = 2.0f; b_array_8[3] = 3.0f; + b_array_8[4] = 0.0f; b_array_8[5] = 1.0f; b_array_8[6] = 2.0f; b_array_8[7] = 3.0f; + + expected_array_8[0] = 0.0f; expected_array_8[1] = 2.0f; expected_array_8[2] = 4.0f; expected_array_8[3] = 6.0f; + expected_array_8[4] = 0.0f; expected_array_8[5] = 2.0f; expected_array_8[6] = 4.0f; expected_array_8[7] = 6.0f; + Stdlib::SIMD::f32_8_simd expected_simd_8 = Stdlib::SIMD::load_f32_8_simd(expected_array_8); + + Stdlib::SIMD::f32_8_simd a_simd_8 = Stdlib::SIMD::load_f32_8_simd(a_array_8); + Stdlib::SIMD::f32_8_simd b_simd_8 = Stdlib::SIMD::load_f32_8_simd(b_array_8); + + Stdlib::SIMD::f32_8_simd result_simd_8 = a_simd_8 + b_simd_8; + Stdlib::SIMD::unload_f32_8_simd(result_simd_8, result_array_8); + + ASSERT_EQUALS_WITH_DELTA(result_array_8[0], expected_array_8[0], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[1], expected_array_8[1], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[2], expected_array_8[2], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[3], expected_array_8[3], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[4], expected_array_8[4], 
0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[5], expected_array_8[5], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[6], expected_array_8[6], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[7], expected_array_8[7], 0.01, "%f", "%f"); + ASSERT_TRUE(Stdlib::SIMD::all_true(result_simd_8 == expected_simd_8)); + + printf("\n[16]: "); + + if (!Stdlib::SIMD::is_avx512_supported()) { + printf("[\033[33mNot supported\033[0m]"); + + return 0; + } + + a_array_16[0] = 0.0f; a_array_16[1] = 1.0f; a_array_16[2] = 2.0f; a_array_16[3] = 3.0f; + a_array_16[4] = 0.0f; a_array_16[5] = 1.0f; a_array_16[6] = 2.0f; a_array_16[7] = 3.0f; + a_array_16[8] = 0.0f; a_array_16[9] = 1.0f; a_array_16[10] = 2.0f; a_array_16[11] = 3.0f; + a_array_16[12] = 0.0f; a_array_16[13] = 1.0f; a_array_16[14] = 2.0f; a_array_16[15] = 3.0f; + + b_array_16[0] = 0.0f; b_array_16[1] = 1.0f; b_array_16[2] = 2.0f; b_array_16[3] = 3.0f; + b_array_16[4] = 0.0f; b_array_16[5] = 1.0f; b_array_16[6] = 2.0f; b_array_16[7] = 3.0f; + b_array_16[8] = 0.0f; b_array_16[9] = 1.0f; b_array_16[10] = 2.0f; b_array_16[11] = 3.0f; + b_array_16[12] = 0.0f; b_array_16[13] = 1.0f; b_array_16[14] = 2.0f; b_array_16[15] = 3.0f; + + expected_array_16[0] = 0.0f; expected_array_16[1] = 2.0f; expected_array_16[2] = 4.0f; expected_array_16[3] = 6.0f; + expected_array_16[4] = 0.0f; expected_array_16[5] = 2.0f; expected_array_16[6] = 4.0f; expected_array_16[7] = 6.0f; + expected_array_16[8] = 0.0f; expected_array_16[9] = 2.0f; expected_array_16[10] = 4.0f; expected_array_16[11] = 6.0f; + expected_array_16[12] = 0.0f; expected_array_16[13] = 2.0f; expected_array_16[14] = 4.0f; expected_array_16[15] = 6.0f; + Stdlib::SIMD::f32_16_simd expected_simd_16 = Stdlib::SIMD::load_f32_16_simd(expected_array_16); + + Stdlib::SIMD::f32_16_simd a_simd_16 = Stdlib::SIMD::load_f32_16_simd(a_array_16); + Stdlib::SIMD::f32_16_simd b_simd_16 = Stdlib::SIMD::load_f32_16_simd(b_array_16); + + Stdlib::SIMD::f32_16_simd result_simd_16 = a_simd_16 + b_simd_16; + Stdlib::SIMD::unload_f32_16_simd(result_simd_16, result_array_16); + + ASSERT_EQUALS_WITH_DELTA(result_array_16[0], expected_array_16[0], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[1], expected_array_16[1], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[2], expected_array_16[2], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[3], expected_array_16[3], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[4], expected_array_16[4], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[5], expected_array_16[5], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[6], expected_array_16[6], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[7], expected_array_16[7], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[8], expected_array_16[8], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[9], expected_array_16[9], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[10], expected_array_16[10], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[11], expected_array_16[11], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[12], expected_array_16[12], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[13], expected_array_16[13], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[14], expected_array_16[14], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[15], expected_array_16[15], 0.01, "%f", "%f"); + 
ASSERT_TRUE(Stdlib::SIMD::all_true(result_simd_16 == expected_simd_16)); + + return 0; +} + +int test_operator_minus() +{ + printf("\noperator-:\n"); + printf("[4]: "); + if (!Stdlib::SIMD::is_avx_supported()) { + printf("[\033[33mNot supported\033[0m]"); + + return 0; + } + + a_array_4[0] = 0.0f; a_array_4[1] = 1.0f; a_array_4[2] = 2.0f; a_array_4[3] = 3.0f; + b_array_4[0] = 1.0f; b_array_4[1] = 1.0f; b_array_4[2] = 1.0f; b_array_4[3] = 1.0f; + + expected_array_4[0] = -1.0f; expected_array_4[1] = 0.0f; expected_array_4[2] = 1.0f; expected_array_4[3] = 2.0f; + Stdlib::SIMD::f32_4_simd expected_simd_4 = Stdlib::SIMD::load_f32_4_simd(expected_array_4); + + Stdlib::SIMD::f32_4_simd a_simd_4 = Stdlib::SIMD::load_f32_4_simd(a_array_4); + Stdlib::SIMD::f32_4_simd b_simd_4 = Stdlib::SIMD::load_f32_4_simd(b_array_4); + + Stdlib::SIMD::f32_4_simd result_simd_4 = a_simd_4 - b_simd_4; + Stdlib::SIMD::unload_f32_4_simd(result_simd_4, result_array_4); + + ASSERT_EQUALS_WITH_DELTA(result_array_4[0], expected_array_4[0], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_4[1], expected_array_4[1], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_4[2], expected_array_4[2], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_4[3], expected_array_4[3], 0.01, "%f", "%f"); + ASSERT_TRUE(Stdlib::SIMD::all_true(result_simd_4 == expected_simd_4)); + + printf("\n[8]: "); + + if (!Stdlib::SIMD::is_avx256_supported()) { + printf("[\033[33mNot supported\033[0m]"); + + return 0; + } + + a_array_8[0] = 0.0f; a_array_8[1] = 1.0f; a_array_8[2] = 2.0f; a_array_8[3] = 3.0f; + a_array_8[4] = 0.0f; a_array_8[5] = 1.0f; a_array_8[6] = 2.0f; a_array_8[7] = 3.0f; + + b_array_8[0] = 1.0f; b_array_8[1] = 1.0f; b_array_8[2] = 1.0f; b_array_8[3] = 1.0f; + b_array_8[4] = 1.0f; b_array_8[5] = 1.0f; b_array_8[6] = 1.0f; b_array_8[7] = 1.0f; + + expected_array_8[0] = -1.0f; expected_array_8[1] = 0.0f; expected_array_8[2] = 1.0f; expected_array_8[3] = 2.0f; + expected_array_8[4] = -1.0f; expected_array_8[5] = 0.0f; expected_array_8[6] = 1.0f; expected_array_8[7] = 2.0f; + Stdlib::SIMD::f32_8_simd expected_simd_8 = Stdlib::SIMD::load_f32_8_simd(expected_array_8); + + Stdlib::SIMD::f32_8_simd a_simd_8 = Stdlib::SIMD::load_f32_8_simd(a_array_8); + Stdlib::SIMD::f32_8_simd b_simd_8 = Stdlib::SIMD::load_f32_8_simd(b_array_8); + + Stdlib::SIMD::f32_8_simd result_simd_8 = a_simd_8 - b_simd_8; + Stdlib::SIMD::unload_f32_8_simd(result_simd_8, result_array_8); + + ASSERT_EQUALS_WITH_DELTA(result_array_8[0], expected_array_8[0], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[1], expected_array_8[1], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[2], expected_array_8[2], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[3], expected_array_8[3], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[4], expected_array_8[4], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[5], expected_array_8[5], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[6], expected_array_8[6], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[7], expected_array_8[7], 0.01, "%f", "%f"); + ASSERT_TRUE(Stdlib::SIMD::all_true(result_simd_8 == expected_simd_8)); + + printf("\n[16]: "); + + if (!Stdlib::SIMD::is_avx512_supported()) { + printf("[\033[33mNot supported\033[0m]"); + + return 0; + } + + a_array_16[0] = 0.0f; a_array_16[1] = 1.0f; a_array_16[2] = 2.0f; a_array_16[3] = 3.0f; + a_array_16[4] = 0.0f; a_array_16[5] = 1.0f; a_array_16[6] = 2.0f; a_array_16[7] = 3.0f; + 
a_array_16[8] = 0.0f; a_array_16[9] = 1.0f; a_array_16[10] = 2.0f; a_array_16[11] = 3.0f; + a_array_16[12] = 0.0f; a_array_16[13] = 1.0f; a_array_16[14] = 2.0f; a_array_16[15] = 3.0f; + + b_array_16[0] = 1.0f; b_array_16[1] = 1.0f; b_array_16[2] = 1.0f; b_array_16[3] = 1.0f; + b_array_16[4] = 1.0f; b_array_16[5] = 1.0f; b_array_16[6] = 1.0f; b_array_16[7] = 1.0f; + b_array_16[8] = 1.0f; b_array_16[9] = 1.0f; b_array_16[10] = 1.0f; b_array_16[11] = 1.0f; + b_array_16[12] = 1.0f; b_array_16[13] = 1.0f; b_array_16[14] = 1.0f; b_array_16[15] = 1.0f; + + expected_array_16[0] = -1.0f; expected_array_16[1] = 0.0f; expected_array_16[2] = 1.0f; expected_array_16[3] = 2.0f; + expected_array_16[4] = -1.0f; expected_array_16[5] = 0.0f; expected_array_16[6] = 1.0f; expected_array_16[7] = 2.0f; + expected_array_16[8] = -1.0f; expected_array_16[9] = 0.0f; expected_array_16[10] = 1.0f; expected_array_16[11] = 2.0f; + expected_array_16[12] = -1.0f; expected_array_16[13] = 0.0f; expected_array_16[14] = 1.0f; expected_array_16[15] = 2.0f; + Stdlib::SIMD::f32_16_simd expected_simd_16 = Stdlib::SIMD::load_f32_16_simd(expected_array_16); + + Stdlib::SIMD::f32_16_simd a_simd_16 = Stdlib::SIMD::load_f32_16_simd(a_array_16); + Stdlib::SIMD::f32_16_simd b_simd_16 = Stdlib::SIMD::load_f32_16_simd(b_array_16); + + Stdlib::SIMD::f32_16_simd result_simd_16 = a_simd_16 - b_simd_16; + Stdlib::SIMD::unload_f32_16_simd(result_simd_16, result_array_16); + + ASSERT_EQUALS_WITH_DELTA(result_array_16[0], expected_array_16[0], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[1], expected_array_16[1], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[2], expected_array_16[2], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[3], expected_array_16[3], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[4], expected_array_16[4], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[5], expected_array_16[5], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[6], expected_array_16[6], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[7], expected_array_16[7], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[8], expected_array_16[8], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[9], expected_array_16[9], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[10], expected_array_16[10], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[11], expected_array_16[11], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[12], expected_array_16[12], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[13], expected_array_16[13], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[14], expected_array_16[14], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[15], expected_array_16[15], 0.01, "%f", "%f"); + ASSERT_TRUE(Stdlib::SIMD::all_true(result_simd_16 == expected_simd_16)); + + return 0; +} + +int test_operator_mul() +{ + printf("\noperator*:\n"); + printf("[4]: "); + if (!Stdlib::SIMD::is_avx_supported()) { + printf("[\033[33mNot supported\033[0m]"); + + return 0; + } + + a_array_4[0] = 0.0f; a_array_4[1] = 1.0f; a_array_4[2] = 2.0f; a_array_4[3] = 3.0f; + b_array_4[0] = 0.0f; b_array_4[1] = 1.0f; b_array_4[2] = 2.0f; b_array_4[3] = 3.0f; + + expected_array_4[0] = 0.0f; expected_array_4[1] = 1.0f; expected_array_4[2] = 4.0f; expected_array_4[3] = 9.0f; + Stdlib::SIMD::f32_4_simd expected_simd_4 = Stdlib::SIMD::load_f32_4_simd(expected_array_4); + + Stdlib::SIMD::f32_4_simd 
a_simd_4 = Stdlib::SIMD::load_f32_4_simd(a_array_4); + Stdlib::SIMD::f32_4_simd b_simd_4 = Stdlib::SIMD::load_f32_4_simd(b_array_4); + + Stdlib::SIMD::f32_4_simd result_simd_4 = a_simd_4 * b_simd_4; + Stdlib::SIMD::unload_f32_4_simd(result_simd_4, result_array_4); + + ASSERT_EQUALS_WITH_DELTA(result_array_4[0], expected_array_4[0], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_4[1], expected_array_4[1], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_4[2], expected_array_4[2], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_4[3], expected_array_4[3], 0.01, "%f", "%f"); + ASSERT_TRUE(Stdlib::SIMD::all_true(result_simd_4 == expected_simd_4)); + + printf("\n[8]: "); + + if (!Stdlib::SIMD::is_avx256_supported()) { + printf("[\033[33mNot supported\033[0m]"); + + return 0; + } + + a_array_8[0] = 0.0f; a_array_8[1] = 1.0f; a_array_8[2] = 2.0f; a_array_8[3] = 3.0f; + a_array_8[4] = 0.0f; a_array_8[5] = 1.0f; a_array_8[6] = 2.0f; a_array_8[7] = 3.0f; + + b_array_8[0] = 0.0f; b_array_8[1] = 1.0f; b_array_8[2] = 2.0f; b_array_8[3] = 3.0f; + b_array_8[4] = 0.0f; b_array_8[5] = 1.0f; b_array_8[6] = 2.0f; b_array_8[7] = 3.0f; + + expected_array_8[0] = 0.0f; expected_array_8[1] = 1.0f; expected_array_8[2] = 4.0f; expected_array_8[3] = 9.0f; + expected_array_8[4] = 0.0f; expected_array_8[5] = 1.0f; expected_array_8[6] = 4.0f; expected_array_8[7] = 9.0f; + Stdlib::SIMD::f32_8_simd expected_simd_8 = Stdlib::SIMD::load_f32_8_simd(expected_array_8); + + Stdlib::SIMD::f32_8_simd a_simd_8 = Stdlib::SIMD::load_f32_8_simd(a_array_8); + Stdlib::SIMD::f32_8_simd b_simd_8 = Stdlib::SIMD::load_f32_8_simd(b_array_8); + + Stdlib::SIMD::f32_8_simd result_simd_8 = a_simd_8 * b_simd_8; + Stdlib::SIMD::unload_f32_8_simd(result_simd_8, result_array_8); + + ASSERT_EQUALS_WITH_DELTA(result_array_8[0], expected_array_8[0], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[1], expected_array_8[1], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[2], expected_array_8[2], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[3], expected_array_8[3], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[4], expected_array_8[4], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[5], expected_array_8[5], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[6], expected_array_8[6], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[7], expected_array_8[7], 0.01, "%f", "%f"); + ASSERT_TRUE(Stdlib::SIMD::all_true(result_simd_8 == expected_simd_8)); + + printf("\n[16]: "); + + if (!Stdlib::SIMD::is_avx512_supported()) { + printf("[\033[33mNot supported\033[0m]"); + + return 0; + } + + a_array_16[0] = 0.0f; a_array_16[1] = 1.0f; a_array_16[2] = 2.0f; a_array_16[3] = 3.0f; + a_array_16[4] = 0.0f; a_array_16[5] = 1.0f; a_array_16[6] = 2.0f; a_array_16[7] = 3.0f; + a_array_16[8] = 0.0f; a_array_16[9] = 1.0f; a_array_16[10] = 2.0f; a_array_16[11] = 3.0f; + a_array_16[12] = 0.0f; a_array_16[13] = 1.0f; a_array_16[14] = 2.0f; a_array_16[15] = 3.0f; + + b_array_16[0] = 0.0f; b_array_16[1] = 1.0f; b_array_16[2] = 2.0f; b_array_16[3] = 3.0f; + b_array_16[4] = 0.0f; b_array_16[5] = 1.0f; b_array_16[6] = 2.0f; b_array_16[7] = 3.0f; + b_array_16[8] = 0.0f; b_array_16[9] = 1.0f; b_array_16[10] = 2.0f; b_array_16[11] = 3.0f; + b_array_16[12] = 0.0f; b_array_16[13] = 1.0f; b_array_16[14] = 2.0f; b_array_16[15] = 3.0f; + + expected_array_16[0] = 0.0f; expected_array_16[1] = 1.0f; expected_array_16[2] = 4.0f; expected_array_16[3] = 9.0f; + 
expected_array_16[4] = 0.0f; expected_array_16[5] = 1.0f; expected_array_16[6] = 4.0f; expected_array_16[7] = 9.0f; + expected_array_16[8] = 0.0f; expected_array_16[9] = 1.0f; expected_array_16[10] = 4.0f; expected_array_16[11] = 9.0f; + expected_array_16[12] = 0.0f; expected_array_16[13] = 1.0f; expected_array_16[14] = 4.0f; expected_array_16[15] = 9.0f; + Stdlib::SIMD::f32_16_simd expected_simd_16 = Stdlib::SIMD::load_f32_16_simd(expected_array_16); + + Stdlib::SIMD::f32_16_simd a_simd_16 = Stdlib::SIMD::load_f32_16_simd(a_array_16); + Stdlib::SIMD::f32_16_simd b_simd_16 = Stdlib::SIMD::load_f32_16_simd(b_array_16); + + Stdlib::SIMD::f32_16_simd result_simd_16 = a_simd_16 * b_simd_16; + Stdlib::SIMD::unload_f32_16_simd(result_simd_16, result_array_16); + + ASSERT_EQUALS_WITH_DELTA(result_array_16[0], expected_array_16[0], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[1], expected_array_16[1], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[2], expected_array_16[2], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[3], expected_array_16[3], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[4], expected_array_16[4], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[5], expected_array_16[5], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[6], expected_array_16[6], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[7], expected_array_16[7], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[8], expected_array_16[8], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[9], expected_array_16[9], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[10], expected_array_16[10], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[11], expected_array_16[11], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[12], expected_array_16[12], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[13], expected_array_16[13], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[14], expected_array_16[14], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[15], expected_array_16[15], 0.01, "%f", "%f"); + ASSERT_TRUE(Stdlib::SIMD::all_true(result_simd_16 == expected_simd_16)); + + return 0; +} \ No newline at end of file diff --git a/tests/Stdlib/SIMD/SIMD_HelperTest.cpp b/tests/Stdlib/SIMD/SIMD_HelperTest.cpp new file mode 100644 index 0000000..dbb6347 --- /dev/null +++ b/tests/Stdlib/SIMD/SIMD_HelperTest.cpp @@ -0,0 +1,42 @@ +/** + * Jingga + * + * @package Test + * @copyright Dennis Eichhorn + * @license OMS License 1.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#include +#include + +#include "../../../Utils/TestUtils.h" +#include "../../../Stdlib/SIMD/SIMD_Helper.h" + + +int main(int argc, char** argv) +{ + printf("SIMD_Helper:\n"); + + if (Stdlib::SIMD::is_avx_supported()) { + printf("\nAVX is supported"); + } else { + printf("\033[33m\nAVX is NOT supported\033[0m"); + } + + if (Stdlib::SIMD::is_avx256_supported()) { + printf("\nAVX 256 is supported"); + } else { + printf("\033[33m\nAVX 256 is NOT supported\033[0m"); + } + + if (Stdlib::SIMD::is_avx512_supported()) { + printf("\nAVX 512 is supported"); + } else { + printf("\033[33m\nAVX 512 is NOT supported\033[0m"); + } + + printf("\n\n"); + + return 0; +} diff --git a/tests/Threads/ThreadPoolTest.cpp b/tests/Threads/ThreadPoolTest.cpp index 483408a..592cb16 100755 --- a/tests/Threads/ThreadPoolTest.cpp +++ b/tests/Threads/ThreadPoolTest.cpp @@ -8,6 +8,7 @@ * @link 
https://jingga.app */ #include +#include #include "../../Threads/Thread.h" #include "../../Utils/TestUtils.h" @@ -15,17 +16,19 @@ static const size_t num_threads = 4; static const size_t num_items = 10; +// increase value by 100 void worker(void *arg) { - int *val = (int *) arg; - int old = *val; + Threads::Job *job = (Threads::Job *) arg; + int *val = (int *) job->arg; *val += 100; - // printf("tid=%p, old=%d, val=%d\n", (void *) pthread_self(), old, *val); if (*val % 2) { sleep(1); } + + job->state = 1; } int main(int argc, char** argv) @@ -36,25 +39,33 @@ int main(int argc, char** argv) int i; Threads::ThreadPool *pool = Threads::pool_create(num_threads); int *vals = (int *) calloc(num_items, sizeof(int)); + Threads::Job **works = (Threads::Job **) calloc(num_items, sizeof(Threads::Job *)); for (i = 0; i < num_items; ++i) { vals[i] = i; - Threads::pool_add_work(pool, worker, vals + i); + works[i] = Threads::pool_add_work(pool, worker, vals + i); } - Threads::pool_wait(pool); - sleep(1); + // @bug wait is not working as expected + // I thought wait works similarly to what the do/while construct below does + //Threads::pool_wait(pool); + + bool finished = false; + do { + finished = true; + for (i = 0; i < num_items; ++i) { + finished = finished && (works[i]->state == 1); + } + } while (!finished); bool test = true; for (i = 0; i < num_items; ++i) { - // printf("%d\n", vals[i]); - test = test && 100 + i == vals[i]; + ASSERT_EQUALS(vals[i], 100 + i, "%d", "%d"); } - ASSERT_EQUALS(test, true, "%d", "%d"); - free(vals); + free(works); Threads::pool_destroy(pool); printf("\n\n"); diff --git a/tests/test.sh b/tests/test.sh index 8635502..b2d3980 100755 --- a/tests/test.sh +++ b/tests/test.sh @@ -9,3 +9,7 @@ g++ $BASEDIR/Image/ImageUtilsTest.cpp -o $BASEDIR/Image/ImageUtilsTest && $BASED g++ $BASEDIR/Threads/ThreadPoolTest.cpp -o $BASEDIR/Threads/ThreadPoolTest && $BASEDIR/Threads/ThreadPoolTest && rm $BASEDIR/Threads/ThreadPoolTest # g++ $BASEDIR/Utils/WebUtilsTest.cpp -o $BASEDIR/Utils/WebUtilsTest -l curl -l xml2 -l libxml2 -I /usr/include/libxml2 -f permissive && $BASEDIR/Utils/WebUtilsTest && rm $BASEDIR/Utils/WebUtilsTest + +g++ $BASEDIR/Stdlib/SIMD/SIMD_HelperTest.cpp -o $BASEDIR/Stdlib/SIMD/SIMD_HelperTest && $BASEDIR/Stdlib/SIMD/SIMD_HelperTest && rm $BASEDIR/Stdlib/SIMD/SIMD_HelperTest + +g++ -mavx -msse -maes -msse3 -msse4.1 -mavx512f -mpclmul -mavx512dq -march=native $BASEDIR/Stdlib/SIMD/SIMD_F32Test.cpp -o $BASEDIR/Stdlib/SIMD/SIMD_F32Test && $BASEDIR/Stdlib/SIMD/SIMD_F32Test && rm $BASEDIR/Stdlib/SIMD/SIMD_F32Test
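
For reference, a minimal usage sketch of the reworked thread-pool API added in this patch: pool_add_work() now returns the Threads::Job* and the pool passes the job itself to the callback, so a caller can poll job->state for completion the same way tests/Threads/ThreadPoolTest.cpp does while pool_wait() is flagged as buggy. The include path, the pool size, and the bare busy-wait are illustrative assumptions, and the job is intentionally not freed, matching the current ownership in the patch.

// Sketch: submit one job and poll Job::state until the worker marks it done.
#include <stdio.h>

#include "Threads/Thread.h"

// JobFunc callback: the pool hands the Job itself to the callback as arg
void add_one(void *arg)
{
    Threads::Job *job = (Threads::Job *) arg;
    int *value = (int *) job->arg;

    ++(*value);

    job->state = 1; // signal completion to the polling caller
}

int main()
{
    Threads::ThreadPool *pool = Threads::pool_create(2);

    int value = 41;
    Threads::Job *job = Threads::pool_add_work(pool, add_one, &value);

    // Poll the returned job instead of calling pool_wait(), as the test does
    while (job->state != 1) { }

    printf("value = %d\n", value); // prints 42

    Threads::pool_destroy(pool);

    return 0;
}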