From b68c8702e060c9d4920a77381854cf32480fb894 Mon Sep 17 00:00:00 2001 From: Dennis Eichhorn Date: Fri, 19 Apr 2024 02:08:38 +0000 Subject: [PATCH 1/7] general fixes --- Math/Matrix/VectorFloat32.h | 916 ++++++++++++++++++++++++++++++++++++ Math/Matrix/VectorInt32.h | 176 +++++++ Math/Matrix/VectorInt64.h | 328 +++++++------ Types.h | 24 + 4 files changed, 1275 insertions(+), 169 deletions(-) create mode 100644 Types.h diff --git a/Math/Matrix/VectorFloat32.h b/Math/Matrix/VectorFloat32.h index e69de29..c971a72 100644 --- a/Math/Matrix/VectorFloat32.h +++ b/Math/Matrix/VectorFloat32.h @@ -0,0 +1,916 @@ +/** + * Karaka + * + * @package Stdlib + * @copyright Dennis Eichhorn + * @license OMS License 1.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef MATH_MATRIX_VECTORFLOAT32_H +#define MATH_MATRIX_VECTORFLOAT32_H + +#include "Types.h" +#include +#include + +struct simd_f32_4 { + union { + __m128 P; + f32 v[4]; + }; +}; + +struct simd_f32_8 { + union { + __m256 P; + f32 v[8]; + }; +}; + +struct simd_f32_16 { + union { + __m512 P; + f32 v[16]; + }; +}; + +inline +simd_f32_4 init_zero_simd_f32_4() +{ + simd_f32_4 simd; + simd.P = _mm_setzero_ps(); + + return simd; +} + +inline +simd_f32_8 init_zero_simd_f32_8() +{ + simd_f32_8 simd; + simd.P = _mm256_setzero_ps(); + + return simd; +} + +inline +simd_f32_16 init_zero_simd_f32_16() +{ + simd_f32_16 simd; + simd.P = _mm512_setzero_ps(); + + return simd; +} + +inline +simd_f32_4 operator+(simd_f32_4 a, simd_f32_4 b) +{ + simd_f32_4 simd; + simd.P = _mm_add_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_8 operator+(simd_f32_8 a, simd_f32_8 b) +{ + simd_f32_8 simd; + simd.P = _mm256_add_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_16 operator+(simd_f32_16 a, simd_f32_16 b) +{ + simd_f32_16 simd; + simd.P = _mm512_add_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_4 operator-(simd_f32_4 a, simd_f32_4 b) +{ + simd_f32_4 simd; + simd.P = _mm_sub_ps(a.P, b.P); + + return simd; +} + +inline 
+simd_f32_4 operator-(simd_f32_4 a) +{ + return init_zero_simd_f32_4() - a; +} + +inline +simd_f32_8 operator-(simd_f32_8 a, simd_f32_8 b) +{ + simd_f32_8 simd; + simd.P = _mm256_sub_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_8 operator-(simd_f32_8 a) +{ + return init_zero_simd_f32_8() - a; +} + +inline +simd_f32_16 operator-(simd_f32_16 a, simd_f32_16 b) +{ + simd_f32_16 simd; + simd.P = _mm512_sub_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_16 operator-(simd_f32_16 a) +{ + return init_zero_simd_f32_16() - a; +} + +inline +simd_f32_4 operator*(simd_f32_4 a, simd_f32_4 b) +{ + simd_f32_4 simd; + simd.P = _mm_mul_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_8 operator*(simd_f32_8 a, simd_f32_8 b) +{ + simd_f32_8 simd; + simd.P = _mm256_mul_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_16 operator*(simd_f32_16 a, simd_f32_16 b) +{ + simd_f32_16 simd; + simd.P = _mm512_mul_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_4 operator/(simd_f32_4 a, simd_f32_4 b) +{ + simd_f32_4 simd; + simd.P = _mm_div_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_8 operator/(simd_f32_8 a, simd_f32_8 b) +{ + simd_f32_8 simd; + simd.P = _mm256_div_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_16 operator/(simd_f32_16 a, simd_f32_16 b) +{ + simd_f32_16 simd; + simd.P = _mm512_div_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_4 operator^(simd_f32_4 a, simd_f32_4 b) +{ + simd_f32_4 simd; + simd.P = _mm_xor_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_8 operator^(simd_f32_8 a, simd_f32_8 b) +{ + simd_f32_8 simd; + simd.P = _mm256_xor_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_16 operator^(simd_f32_16 a, simd_f32_16 b) +{ + simd_f32_16 simd; + simd.P = _mm512_xor_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_4& operator-=(simd_f32_4 &a, simd_f32_4 b) +{ + a = a - b; + + return a; +} + +inline +simd_f32_8& operator-=(simd_f32_8 &a, simd_f32_8 b) +{ + a = a - b; + + return a; +} + +inline 
+simd_f32_16& operator-=(simd_f32_16 &a, simd_f32_16 b) +{ + a = a - b; + + return a; +} + +inline +simd_f32_4& operator+=(simd_f32_4 &a, simd_f32_4 b) +{ + a = a + b; + + return a; +} + +inline +simd_f32_8& operator+=(simd_f32_8 &a, simd_f32_8 b) +{ + a = a + b; + + return a; +} + +inline +simd_f32_16& operator+=(simd_f32_16 &a, simd_f32_16 b) +{ + a = a + b; + + return a; +} + +inline +simd_f32_4& operator*=(simd_f32_4 &a, simd_f32_4 b) +{ + a = a * b; + + return a; +} + +inline +simd_f32_8& operator*=(simd_f32_8 &a, simd_f32_8 b) +{ + a = a * b; + + return a; +} + +inline +simd_f32_16& operator*=(simd_f32_16 &a, simd_f32_16 b) +{ + a = a * b; + + return a; +} + +inline +simd_f32_4& operator/=(simd_f32_4 &a, simd_f32_4 b) +{ + a = a / b; + + return a; +} + +inline +simd_f32_8& operator/=(simd_f32_8 &a, simd_f32_8 b) +{ + a = a / b; + + return a; +} + +inline +simd_f32_16& operator/=(simd_f32_16 &a, simd_f32_16 b) +{ + a = a / b; + + return a; +} + +inline +simd_f32_4& operator^=(simd_f32_4 &a, simd_f32_4 b) +{ + a = a ^ b; + + return a; +} + +inline +simd_f32_8& operator^=(simd_f32_8 &a, simd_f32_8 b) +{ + a = a ^ b; + + return a; +} + +inline +simd_f32_16& operator^=(simd_f32_16 &a, simd_f32_16 b) +{ + a = a ^ b; + + return a; +} + +inline +simd_f32_4 operator<(simd_f32_4 a, simd_f32_4 b) +{ + simd_f32_4 simd; + simd.P = _mm_cmplt_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_8 operator<(simd_f32_8 a, simd_f32_8 b) +{ + simd_f32_8 simd; + simd.P = _mm256_cmp_ps(a.P, b.P, _CMP_LT_OQ); + + return simd; +} + +inline +simd_f32_16 operator<(simd_f32_16 a, simd_f32_16 b) +{ + simd_f32_16 simd; + simd.P = _mm512_mask_mov_ps( + _mm512_setzero_ps(), + _mm512_cmp_ps_mask(a.P, b.P, _CMP_LT_OQ), + _mm512_set1_ps(1.0f) + ); + + return simd; +} + +inline +simd_f32_4 operator<=(simd_f32_4 a, simd_f32_4 b) +{ + simd_f32_4 simd; + simd.P = _mm_cmple_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_8 operator<=(simd_f32_8 a, simd_f32_8 b) +{ + simd_f32_8 simd; + 
simd.P = _mm256_cmp_ps(a.P, b.P, _CMP_LE_OQ); + + return simd; +} + +inline +simd_f32_16 operator<=(simd_f32_16 a, simd_f32_16 b) +{ + simd_f32_16 simd; + simd.P = _mm512_mask_mov_ps( + _mm512_setzero_ps(), + _mm512_cmp_ps_mask(a.P, b.P, _CMP_LE_OQ), + _mm512_set1_ps(1.0f) + ); + + return simd; +} + +inline +simd_f32_4 operator>(simd_f32_4 a, simd_f32_4 b) +{ + simd_f32_4 simd; + simd.P = _mm_cmpgt_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_8 operator>(simd_f32_8 a, simd_f32_8 b) +{ + simd_f32_8 simd; + simd.P = _mm256_cmp_ps(a.P, b.P, _CMP_GT_OQ); + + return simd; +} + +inline +simd_f32_16 operator>(simd_f32_16 a, simd_f32_16 b) +{ + simd_f32_16 simd; + simd.P = _mm512_mask_mov_ps( + _mm512_setzero_ps(), + _mm512_cmp_ps_mask(a.P, b.P, _CMP_GT_OQ), + _mm512_set1_ps(1.0f) + ); + + return simd; +} + +inline +simd_f32_4 operator>=(simd_f32_4 a, simd_f32_4 b) +{ + simd_f32_4 simd; + simd.P = _mm_cmpge_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_8 operator>=(simd_f32_8 a, simd_f32_8 b) +{ + simd_f32_8 simd; + simd.P = _mm256_cmp_ps(a.P, b.P, _CMP_GE_OQ); + + return simd; +} + +inline +simd_f32_16 operator>=(simd_f32_16 a, simd_f32_16 b) +{ + simd_f32_16 simd; + simd.P = _mm512_mask_mov_ps( + _mm512_setzero_ps(), + _mm512_cmp_ps_mask(a.P, b.P, _CMP_GE_OQ), + _mm512_set1_ps(1.0f) + ); + + return simd; +} + +inline +simd_f32_4 operator==(simd_f32_4 a, simd_f32_4 b) +{ + simd_f32_4 simd; + simd.P = _mm_cmpeq_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_8 operator==(simd_f32_8 a, simd_f32_8 b) +{ + simd_f32_8 simd; + simd.P = _mm256_cmp_ps(a.P, b.P, _CMP_EQ_OQ); + + return simd; +} + +inline +simd_f32_16 operator==(simd_f32_16 a, simd_f32_16 b) +{ + simd_f32_16 simd; + simd.P = _mm512_mask_mov_ps( + _mm512_setzero_ps(), + _mm512_cmp_ps_mask(a.P, b.P, _CMP_EQ_OQ), + _mm512_set1_ps(1.0f) + ); + + return simd; +} + +inline +simd_f32_4 operator!=(simd_f32_4 a, simd_f32_4 b) +{ + simd_f32_4 simd; + simd.P = _mm_cmpneq_ps(a.P, b.P); + + return simd; 
+} + +inline +simd_f32_8 operator!=(simd_f32_8 a, simd_f32_8 b) +{ + simd_f32_8 simd; + simd.P = _mm256_cmp_ps(a.P, b.P, _CMP_NEQ_OQ); + + return simd; +} + +inline +simd_f32_16 operator!=(simd_f32_16 a, simd_f32_16 b) +{ + simd_f32_16 simd; + simd.P = _mm512_mask_mov_ps( + _mm512_setzero_ps(), + _mm512_cmp_ps_mask(a.P, b.P, _CMP_NEQ_OQ), + _mm512_set1_ps(1.0f) + ); + + return simd; +} + +inline +simd_f32_4 operator&(simd_f32_4 a, simd_f32_4 b) +{ + simd_f32_4 simd; + simd.P = _mm_and_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_8 operator&(simd_f32_8 a, simd_f32_8 b) +{ + simd_f32_8 simd; + simd.P = _mm256_and_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_16 operator&(simd_f32_16 a, simd_f32_16 b) +{ + simd_f32_16 simd; + simd.P = _mm512_and_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_4 operator|(simd_f32_4 a, simd_f32_4 b) +{ + simd_f32_4 simd; + simd.P = _mm_or_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_8 operator|(simd_f32_8 a, simd_f32_8 b) +{ + simd_f32_8 simd; + simd.P = _mm256_or_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_16 operator|(simd_f32_16 a, simd_f32_16 b) +{ + simd_f32_16 simd; + simd.P = _mm512_or_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_4& operator&=(simd_f32_4 &a, simd_f32_4 b) +{ + a = a & b; + + return a; +} + +inline +simd_f32_8& operator&=(simd_f32_8 &a, simd_f32_8 b) +{ + a = a & b; + + return a; +} + +inline +simd_f32_16& operator&=(simd_f32_16 &a, simd_f32_16 b) +{ + a = a & b; + + return a; +} + +inline +simd_f32_4& operator|=(simd_f32_4 &a, simd_f32_4 b) +{ + a = a | b; + + return a; +} + +inline +simd_f32_8& operator|=(simd_f32_8 &a, simd_f32_8 b) +{ + a = a | b; + + return a; +} + +inline +simd_f32_16& operator|=(simd_f32_16 &a, simd_f32_16 b) +{ + a = a | b; + + return a; +} + +inline +simd_f32_4 abs(simd_f32_4 a) +{ + unsigned int unsigned_mask = (unsigned int) (1 << 31); + __m128 mask = _mm_set1_ps(*(float *) &unsigned_mask); + + simd_f32_4 simd; + simd.P = 
_mm_and_ps(a.P, mask); + + return simd; +} + +inline +simd_f32_8 abs(simd_f32_8 a) +{ + unsigned int unsigned_mask = (unsigned int) (1 << 31); + __m256 mask = _mm256_set1_ps(*(float *) &unsigned_mask); + + simd_f32_8 simd; + simd.P = _mm256_and_ps(a.P, mask); + + return simd; +} + +inline +simd_f32_16 abs(simd_f32_16 a) +{ + unsigned int unsigned_mask = (unsigned int) (1 << 31); + __m512 mask = _mm512_set1_ps(*(float *) &unsigned_mask); + + simd_f32_16 simd; + simd.P = _mm512_and_ps(a.P, mask); + + return simd; +} + +inline +simd_f32_4 min(simd_f32_4 a, simd_f32_4 b) +{ + simd_f32_4 simd; + simd.P = _mm_min_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_8 min(simd_f32_8 a, simd_f32_8 b) +{ + simd_f32_8 simd; + simd.P = _mm256_min_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_16 min(simd_f32_16 a, simd_f32_16 b) +{ + simd_f32_16 simd; + simd.P = _mm512_min_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_4 max(simd_f32_4 a, simd_f32_4 b) +{ + simd_f32_4 simd; + simd.P = _mm_max_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_8 max(simd_f32_8 a, simd_f32_8 b) +{ + simd_f32_8 simd; + simd.P = _mm256_max_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_16 max(simd_f32_16 a, simd_f32_16 b) +{ + simd_f32_16 simd; + simd.P = _mm512_max_ps(a.P, b.P); + + return simd; +} + +inline +simd_f32_4 sign(simd_f32_4 a) +{ + unsigned int umask = (unsigned int) (1 << 31); + __m128 mask = _mm_set1_ps(*(float *) &umask); + + simd_f32_4 signBit; + signBit.P = _mm_and_ps(a.P, mask); + + simd_f32_4 b; + b.P = _mm_set1_ps(1.0f); + + simd_f32_4 simd = b | signBit; + + return simd; +} + +inline +simd_f32_8 sign(simd_f32_8 a) +{ + unsigned int umask = (unsigned int) (1 << 31); + __m256 mask = _mm256_set1_ps(*(float *) &umask); + + simd_f32_8 signBit; + signBit.P = _mm256_and_ps(a.P, mask); + + simd_f32_8 b; + b.P = _mm256_set1_ps(1.0f); + + simd_f32_8 simd = b | signBit; + + return simd; +} + +inline +simd_f32_16 sign(simd_f32_16 a) +{ + unsigned int umask = 
(unsigned int) (1 << 31); + __m512 mask = _mm512_set1_ps(*(float *) &umask); + + simd_f32_16 signBit; + signBit.P = _mm512_and_ps(a.P, mask); + + simd_f32_16 b; + b.P = _mm512_set1_ps(1.0f); + + simd_f32_16 simd = b | signBit; + + return simd; +} + +// sqrt +// approxinvsquareroot +// approx1over +// clamp +// floor +// ceil +// anytrue +// alltrue +// anyfalse +// allfalse + +struct v3_simd_f32_4 { + union { + struct { + union { + simd_f32_4 x; + simd_f32_4 r; + }; + union { + simd_f32_4 y; + simd_f32_4 g; + }; + union { + simd_f32_4 z; + simd_f32_4 b; + }; + }; + + simd_f32_4 v[3]; + }; +}; + +struct v3_simd_f32_8 { + union { + struct { + union { + simd_f32_8 x; + simd_f32_8 r; + }; + union { + simd_f32_8 y; + simd_f32_8 g; + }; + union { + simd_f32_8 z; + simd_f32_8 b; + }; + }; + + simd_f32_8 v[3]; + }; +}; + +struct v3_simd_f32_16 { + union { + struct { + union { + simd_f32_16 x; + simd_f32_16 r; + }; + union { + simd_f32_16 y; + simd_f32_16 g; + }; + union { + simd_f32_16 z; + simd_f32_16 b; + }; + }; + + simd_f32_16 v[3]; + }; +}; + +struct v4_simd_f32_4 { + union { + struct { + union { + simd_f32_4 x; + simd_f32_4 r; + }; + union { + simd_f32_4 y; + simd_f32_4 g; + }; + union { + simd_f32_4 z; + simd_f32_4 b; + }; + union { + simd_f32_4 w; + simd_f32_4 a; + }; + }; + + simd_f32_4 v[4]; + }; +}; + +struct v4_simd_f32_8 { + union { + struct { + union { + simd_f32_8 x; + simd_f32_8 r; + }; + union { + simd_f32_8 y; + simd_f32_8 g; + }; + union { + simd_f32_8 z; + simd_f32_8 b; + }; + union { + simd_f32_8 w; + simd_f32_8 a; + }; + }; + + simd_f32_8 v[4]; + }; +}; + +struct v4_simd_f32_16 { + union { + struct { + union { + simd_f32_16 x; + simd_f32_16 r; + }; + union { + simd_f32_16 y; + simd_f32_16 g; + }; + union { + simd_f32_16 z; + simd_f32_16 b; + }; + union { + simd_f32_16 w; + simd_f32_16 a; + }; + }; + + simd_f32_16 v[4]; + }; +}; + +#endif diff --git a/Math/Matrix/VectorInt32.h b/Math/Matrix/VectorInt32.h index e69de29..12cb085 100644 --- 
a/Math/Matrix/VectorInt32.h +++ b/Math/Matrix/VectorInt32.h @@ -0,0 +1,176 @@ +/** + * Karaka + * + * @package Stdlib + * @copyright Dennis Eichhorn + * @license OMS License 1.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef MATH_MATRIX_VECTORFLOAT32_H +#define MATH_MATRIX_VECTORFLOAT32_H + +#include "Types.h" +#include +#include + +struct simd_int32_4 { + union { + __m128i P; + int32 v[4]; + }; +}; + +struct simd_int32_8 { + union { + __m256i P; + int32 v[8]; + }; +}; + +struct simd_int32_16 { + union { + __m512i P; + int32 v[16]; + }; +}; + +struct v3_simd_int32_4 { + union { + struct { + union { + simd_int32_4 x; + simd_int32_4 r; + }; + union { + simd_int32_4 y; + simd_int32_4 g; + }; + union { + simd_int32_4 z; + simd_int32_4 b; + }; + }; + + simd_int32_4 v[3]; + }; +}; + +struct v3_simd_int32_8 { + union { + struct { + union { + simd_int32_8 x; + simd_int32_8 r; + }; + union { + simd_int32_8 y; + simd_int32_8 g; + }; + union { + simd_int32_8 z; + simd_int32_8 b; + }; + }; + + simd_int32_8 v[3]; + }; +}; + +struct v3_simd_int32_16 { + union { + struct { + union { + simd_int32_16 x; + simd_int32_16 r; + }; + union { + simd_int32_16 y; + simd_int32_16 g; + }; + union { + simd_int32_16 z; + simd_int32_16 b; + }; + }; + + simd_int32_16 v[3]; + }; +}; + +struct v4_simd_int32_4 { + union { + struct { + union { + simd_int32_4 x; + simd_int32_4 r; + }; + union { + simd_int32_4 y; + simd_int32_4 g; + }; + union { + simd_int32_4 z; + simd_int32_4 b; + }; + union { + simd_int32_4 w; + simd_int32_4 a; + }; + }; + + simd_int32_4 v[4]; + }; +}; + +struct v4_simd_int32_8 { + union { + struct { + union { + simd_int32_8 x; + simd_int32_8 r; + }; + union { + simd_int32_8 y; + simd_int32_8 g; + }; + union { + simd_int32_8 z; + simd_int32_8 b; + }; + union { + simd_int32_8 w; + simd_int32_8 a; + }; + }; + + simd_int32_8 v[4]; + }; +}; + +struct v4_simd_int32_16 { + union { + struct { + union { + simd_int32_16 x; + simd_int32_16 r; + }; + union { + simd_int32_16 y; 
+ simd_int32_16 g; + }; + union { + simd_int32_16 z; + simd_int32_16 b; + }; + union { + simd_int32_16 w; + simd_int32_16 a; + }; + }; + + simd_int32_16 v[4]; + }; +}; + +#endif diff --git a/Math/Matrix/VectorInt64.h b/Math/Matrix/VectorInt64.h index c1c8b2f..a861de0 100644 --- a/Math/Matrix/VectorInt64.h +++ b/Math/Matrix/VectorInt64.h @@ -1,186 +1,176 @@ -// Remarks: sizes for the second matrix/vector are often implied by the first parameter and the rules for matrix/vector multiplication. +/** + * Karaka + * + * @package Stdlib + * @copyright Dennis Eichhorn + * @license OMS License 1.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef MATH_MATRIX_VECTORFLOAT32_H +#define MATH_MATRIX_VECTORFLOAT32_H -///////////////////////////////// -// Multiplication -///////////////////////////////// +#include "Types.h" +#include +#include -// Array vector multiplication -///////////////////////////////// -// mult_vec_int32(int64_t *a, size_t a, int32_t *b) -// mult_vec_int64(int64_t *a, size_t a, int64_t *b) -// mult_vec_float(int64_t *a, size_t a, float *b) +struct simd_int64_2 { + union { + __m128i P; + int64 v[2]; + }; +}; -// mult_scal_int32(int64_t *a, size_t a, int32_t b) -// mult_scal_int64(int64_t *a, size_t a, int64_t b) -// mult_scal_float(int64_t *a, size_t a, float b) +struct simd_int64_4 { + union { + __m256i P; + int64 v[4]; + }; +}; -// Vector 2 vector multiplication -///////////////////////////////// -// v2_mult_vec_int32(v2 *a, int32_t *b) -// v2_mult_vec_int64(v2 *a, int64_t *b) -// v2_mult_vec_float(v2 *a, float *b) -// v2_mult_vec_v2(v2 *a, v2 *b) +struct simd_int64_8 { + union { + __m512i P; + int64 v[8]; + }; +}; -// v2_mult_scal_int32(v2 *a, int32_t b) -// v2_mult_scal_int64(v2 *a, int64_t b) -// v2_mult_scal_float(v2 *a, float b) +struct v3_simd_int64_2 { + union { + struct { + union { + simd_int64_2 x; + simd_int64_2 r; + }; + union { + simd_int64_2 y; + simd_int64_2 g; + }; + union { + simd_int64_2 z; + simd_int64_2 b; + }; + }; -// 
Vector 3 vector multiplication -///////////////////////////////// -// v3_mult_vec_int32(v3 *a, int32_t *b) -// v3_mult_vec_int64(v3 *a, int64_t *b) -// v3_mult_vec_float(v3 *a, float *b) -// v3_mult_vec_v3(v3 *a, v3 *b) + simd_int64_2 v[3]; + }; +}; -// v3_mult_scal_int32(v3 *a, int32_t b) -// v3_mult_scal_int64(v3 *a, int64_t b) -// v3_mult_scal_float(v3 *a, float b) +struct v3_simd_int64_4 { + union { + struct { + union { + simd_int64_4 x; + simd_int64_4 r; + }; + union { + simd_int64_4 y; + simd_int64_4 g; + }; + union { + simd_int64_4 z; + simd_int64_4 b; + }; + }; -// Vector 4 vector multiplication -///////////////////////////////// -// v4_mult_vec_int32(v4 *a, int32_t *b) -// v4_mult_vec_int64(v4 *a, int64_t *b) -// v4_mult_vec_float(v4 *a, float *b) -// v4_mult_vec_v4(v4 *a, v4 *b) + simd_int64_4 v[3]; + }; +}; -// v4_mult_scal_int32(v4 *a, int32_t b) -// v4_mult_scal_int64(v4 *a, int64_t b) -// v4_mult_scal_float(v4 *a, float b) +struct v3_simd_int64_8 { + union { + struct { + union { + simd_int64_8 x; + simd_int64_8 r; + }; + union { + simd_int64_8 y; + simd_int64_8 g; + }; + union { + simd_int64_8 z; + simd_int64_8 b; + }; + }; -///////////////////////////////// -// Addition -///////////////////////////////// + simd_int64_8 v[3]; + }; +}; -// Array vector addition -///////////////////////////////// -// add_vec_int32(int64_t *a, size_t a, int32_t *b) -// add_vec_int64(int64_t *a, size_t a, int64_t *b) -// add_vec_float(int64_t *a, size_t a, float *b) +struct v4_simd_int64_2 { + union { + struct { + union { + simd_int64_2 x; + simd_int64_2 r; + }; + union { + simd_int64_2 y; + simd_int64_2 g; + }; + union { + simd_int64_2 z; + simd_int64_2 b; + }; + union { + simd_int64_2 w; + simd_int64_2 a; + }; + }; -// add_scal_int32(int64_t *a, size_t a, int32_t b) -// add_scal_int64(int64_t *a, size_t a, int64_t b) -// add_scal_float(int64_t *a, size_t a, float b) + simd_int64_2 v[4]; + }; +}; -// Vector 2 vector addition -///////////////////////////////// -// 
v2_add_vec_int32(v2 *a, int32_t *b) -// v2_add_vec_int64(v2 *a, int64_t *b) -// v2_add_vec_float(v2 *a, float *b) -// v2_add_vec_v2(v2 *a, v2 *b) +struct v4_simd_int64_4 { + union { + struct { + union { + simd_int64_4 x; + simd_int64_4 r; + }; + union { + simd_int64_4 y; + simd_int64_4 g; + }; + union { + simd_int64_4 z; + simd_int64_4 b; + }; + union { + simd_int64_4 w; + simd_int64_4 a; + }; + }; -// v2_add_scal_int32(v2 *a, int32_t b) -// v2_add_scal_int64(v2 *a, int64_t b) -// v2_add_scal_float(v2 *a, float b) + simd_int64_4 v[4]; + }; +}; -// Vector 3 vector addition -///////////////////////////////// -// v3_add_vec_int32(v3 *a, int32_t *b) -// v3_add_vec_int64(v3 *a, int64_t *b) -// v3_add_vec_float(v3 *a, float *b) -// v3_add_vec_v3(v3 *a, v3 *b) +struct v4_simd_int64_8 { + union { + struct { + union { + simd_int64_8 x; + simd_int64_8 r; + }; + union { + simd_int64_8 y; + simd_int64_8 g; + }; + union { + simd_int64_8 z; + simd_int64_8 b; + }; + union { + simd_int64_8 w; + simd_int64_8 a; + }; + }; -// v3_add_scal_int32(v3 *a, int32_t b) -// v3_add_scal_int64(v3 *a, int64_t b) -// v3_add_scal_float(v3 *a, float b) + simd_int64_8 v[4]; + }; +}; -// Vector 4 vector addition -///////////////////////////////// -// v4_add_vec_int32(v4 *a, int32_t *b) -// v4_add_vec_int64(v4 *a, int64_t *b) -// v4_add_vec_float(v4 *a, float *b) -// v4_add_vec_v4(v4 *a, v4 *b) - -// v4_add_scal_int32(v4 *a, int32_t b) -// v4_add_scal_int64(v4 *a, int64_t b) -// v4_add_scal_float(v4 *a, float b) - -///////////////////////////////// -// Subtraction -///////////////////////////////// - -// Array vector subtraction -///////////////////////////////// -// sub_vec_int32(int64_t *a, size_t a, int32_t *b) -// sub_vec_int64(int64_t *a, size_t a, int64_t *b) -// sub_vec_float(int64_t *a, size_t a, float *b) - -// sub_scal_int32(int64_t *a, size_t a, int32_t b) -// sub_scal_int64(int64_t *a, size_t a, int64_t b) -// sub_scal_float(int64_t *a, size_t a, float b) - -// Vector 2 vector subtraction 
-///////////////////////////////// -// v2_sub_vec_int32(v2 *a, int32_t *b) -// v2_sub_vec_int64(v2 *a, int64_t *b) -// v2_sub_vec_float(v2 *a, float *b) -// v2_sub_vec_v2(v2 *a, v2 *b) - -// v2_sub_scal_int32(v2 *a, int32_t b) -// v2_sub_scal_int64(v2 *a, int64_t b) -// v2_sub_scal_float(v2 *a, float b) - -// Vector 3 vector subtraction -///////////////////////////////// -// v3_sub_vec_int32(v3 *a, int32_t *b) -// v3_sub_vec_int64(v3 *a, int64_t *b) -// v3_sub_vec_float(v3 *a, float *b) -// v3_sub_vec_v3(v3 *a, v3 *b) - -// v3_sub_scal_int32(v3 *a, int32_t b) -// v3_sub_scal_int64(v3 *a, int64_t b) -// v3_sub_scal_float(v3 *a, float b) - -// Vector 4 vector subtraction -///////////////////////////////// -// v4_sub_vec_int32(v4 *a, int32_t *b) -// v4_sub_vec_int64(v4 *a, int64_t *b) -// v4_sub_vec_float(v4 *a, float *b) -// v4_sub_vec_v4(v4 *a, v4 *b) - -// v4_sub_scal_int32(v4 *a, int32_t b) -// v4_sub_scal_int64(v4 *a, int64_t b) -// v4_sub_scal_float(v4 *a, float b) - -///////////////////////////////// -// Other -///////////////////////////////// - -// Cross product -///////////////////////////////// -// cross_int32(int64_t *a, size_t a, int32_t *b) -// cross_int64(int64_t *a, size_t a, int64_t *b) -// cross_float(int64_t *a, size_t a, float *b) - -// v2_cross_v2(v2 *a, v2 *b) -// v3_cross_v3(v3 *a, v3 *b) -// v4_cross_v4(v4 *a, v4 *b) - -// Dot product -///////////////////////////////// -// dot_int32(int64_t *a, size_t a, int32_t *b) -// dot_int64(int64_t *a, size_t a, int64_t *b) -// dot_float(int64_t *a, size_t a, float *b) - -// v2_dot_v2(v2 *a, v2 *b) -// v3_dot_v3(v3 *a, v3 *b) -// v4_dot_v4(v4 *a, v4 *b) - -// Angle -///////////////////////////////// -// angle_int32(int64_t *a, size_t a, int32_t *b) -// angle_int64(int64_t *a, size_t a, int64_t *b) -// angle_float(int64_t *a, size_t a, float *b) - -// v2_angle_v2(v2 *a, v2 *b) -// v3_angle_v3(v3 *a, v3 *b) -// v4_angle_v4(v4 *a, v4 *b) - -// Cosine -///////////////////////////////// -// 
cosine_int32(int64_t *a, size_t a, int32_t *b) -// cosine_int64(int64_t *a, size_t a, int64_t *b) -// cosine_float(int64_t *a, size_t a, float *b) - -// v2_cosine_v2(v2 *a, v2 *b) -// v3_cosine_v3(v3 *a, v3 *b) -// v4_cosine_v4(v4 *a, v4 *b) \ No newline at end of file +#endif diff --git a/Types.h b/Types.h new file mode 100644 index 0000000..402a29d --- /dev/null +++ b/Types.h @@ -0,0 +1,24 @@ +/** + * Karaka + * + * @package Stdlib + * @copyright Dennis Eichhorn + * @license OMS License 1.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef TYPES_H +#define TYPES_H + +#include +#include + +typedef int8_t int8; +typedef int16_t int16; +typedef int32_t int32; +typedef int64_t int64; + +typedef float f32; +typedef double f64; + +#endif From b13b0e94832eaf04c028f95d39bc812170e3d56c Mon Sep 17 00:00:00 2001 From: Dennis Eichhorn Date: Sat, 20 Apr 2024 19:11:36 +0000 Subject: [PATCH 2/7] new tests and minor fixes --- Math/Matrix/VectorFloat32.h | 996 +++------------------- Stdlib/Intrinsics.h | 70 ++ Stdlib/Mathtypes.h | 2 + Stdlib/SIMD/SIMD_F32.h | 1101 ++++++++++++++++++++++++ Stdlib/SIMD/SIMD_Helper.h | 70 ++ Stdlib/SIMD/SIMD_I32.h | 1117 +++++++++++++++++++++++++ Types.h => Stdlib/Types.h | 9 +- Threads/Job.h | 1 + Threads/Thread.h | 48 +- Utils/TestUtils.h | 10 +- tests/Stdlib/SIMD/SIMD_F32Test.cpp | 386 +++++++++ tests/Stdlib/SIMD/SIMD_HelperTest.cpp | 42 + tests/Threads/ThreadPoolTest.cpp | 31 +- tests/test.sh | 4 + 14 files changed, 2957 insertions(+), 930 deletions(-) create mode 100644 Stdlib/Intrinsics.h create mode 100644 Stdlib/SIMD/SIMD_F32.h create mode 100644 Stdlib/SIMD/SIMD_Helper.h create mode 100644 Stdlib/SIMD/SIMD_I32.h rename Types.h => Stdlib/Types.h (69%) create mode 100644 tests/Stdlib/SIMD/SIMD_F32Test.cpp create mode 100644 tests/Stdlib/SIMD/SIMD_HelperTest.cpp diff --git a/Math/Matrix/VectorFloat32.h b/Math/Matrix/VectorFloat32.h index c971a72..25c96c9 100644 --- a/Math/Matrix/VectorFloat32.h +++ b/Math/Matrix/VectorFloat32.h @@ 
-10,907 +10,145 @@ #ifndef MATH_MATRIX_VECTORFLOAT32_H #define MATH_MATRIX_VECTORFLOAT32_H -#include "Types.h" -#include -#include - -struct simd_f32_4 { - union { - __m128 P; - f32 v[4]; - }; -}; - -struct simd_f32_8 { - union { - __m256 P; - f32 v[8]; - }; -}; - -struct simd_f32_16 { - union { - __m512 P; - f32 v[16]; - }; -}; - -inline -simd_f32_4 init_zero_simd_f32_4() -{ - simd_f32_4 simd; - simd.P = _mm_setzero_ps(); - - return simd; -} - -inline -simd_f32_8 init_zero_simd_f32_8() -{ - simd_f32_8 simd; - simd.P = _mm256_setzero_ps(); - - return simd; -} - -inline -simd_f32_16 init_zero_simd_f32_16() -{ - simd_f32_16 simd; - simd.P = _mm512_setzero_ps(); - - return simd; -} - -inline -simd_f32_4 operator+(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_add_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator+(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_add_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_16 operator+(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_add_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_4 operator-(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_sub_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_4 operator-(simd_f32_4 a) -{ - return init_zero_simd_f32_4() - a; -} - -inline -simd_f32_8 operator-(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_sub_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator-(simd_f32_8 a) -{ - return init_zero_simd_f32_8() - a; -} - -inline -simd_f32_16 operator-(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_sub_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_16 operator-(simd_f32_16 a) -{ - return init_zero_simd_f32_16() - a; -} - -inline -simd_f32_4 operator*(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_mul_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator*(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 
simd; - simd.P = _mm256_mul_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_16 operator*(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_mul_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_4 operator/(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_div_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator/(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_div_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_16 operator/(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_div_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_4 operator^(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_xor_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator^(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_xor_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_16 operator^(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_xor_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_4& operator-=(simd_f32_4 &a, simd_f32_4 b) -{ - a = a - b; - - return a; -} - -inline -simd_f32_8& operator-=(simd_f32_8 &a, simd_f32_8 b) -{ - a = a - b; - - return a; -} - -inline -simd_f32_16& operator-=(simd_f32_16 &a, simd_f32_16 b) -{ - a = a - b; - - return a; -} - -inline -simd_f32_4& operator+=(simd_f32_4 &a, simd_f32_4 b) -{ - a = a + b; - - return a; -} - -inline -simd_f32_8& operator+=(simd_f32_8 &a, simd_f32_8 b) -{ - a = a + b; - - return a; -} - -inline -simd_f32_16& operator+=(simd_f32_16 &a, simd_f32_16 b) -{ - a = a + b; - - return a; -} - -inline -simd_f32_4& operator*=(simd_f32_4 &a, simd_f32_4 b) -{ - a = a * b; - - return a; -} - -inline -simd_f32_8& operator*=(simd_f32_8 &a, simd_f32_8 b) -{ - a = a * b; - - return a; -} - -inline -simd_f32_16& operator*=(simd_f32_16 &a, simd_f32_16 b) -{ - a = a * b; - - return a; -} - -inline -simd_f32_4& operator/=(simd_f32_4 &a, simd_f32_4 b) -{ - a = a / b; - - return a; 
-} - -inline -simd_f32_8& operator/=(simd_f32_8 &a, simd_f32_8 b) -{ - a = a / b; - - return a; -} - -inline -simd_f32_16& operator/=(simd_f32_16 &a, simd_f32_16 b) -{ - a = a / b; - - return a; -} - -inline -simd_f32_4& operator^=(simd_f32_4 &a, simd_f32_4 b) -{ - a = a ^ b; - - return a; -} - -inline -simd_f32_8& operator^=(simd_f32_8 &a, simd_f32_8 b) -{ - a = a ^ b; - - return a; -} - -inline -simd_f32_16& operator^=(simd_f32_16 &a, simd_f32_16 b) -{ - a = a ^ b; - - return a; -} - -inline -simd_f32_4 operator<(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_cmplt_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator<(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_cmp_ps(a.P, b.P, _CMP_LT_OQ); - - return simd; -} - -inline -simd_f32_16 operator<(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_mask_mov_ps( - _mm512_setzero_ps(), - _mm512_cmp_ps_mask(a.P, b.P, _CMP_LT_OQ), - _mm512_set1_ps(1.0f) - ); - - return simd; -} - -inline -simd_f32_4 operator<=(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_cmple_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator<=(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_cmp_ps(a.P, b.P, _CMP_LE_OQ); - - return simd; -} - -inline -simd_f32_16 operator<=(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_mask_mov_ps( - _mm512_setzero_ps(), - _mm512_cmp_ps_mask(a.P, b.P, _CMP_LE_OQ), - _mm512_set1_ps(1.0f) - ); - - return simd; -} - -inline -simd_f32_4 operator>(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_cmpgt_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator>(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_cmp_ps(a.P, b.P, _CMP_GT_OQ); - - return simd; -} - -inline -simd_f32_16 operator>(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_mask_mov_ps( - _mm512_setzero_ps(), - _mm512_cmp_ps_mask(a.P, b.P, 
_CMP_GT_OQ), - _mm512_set1_ps(1.0f) - ); - - return simd; -} - -inline -simd_f32_4 operator>=(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_cmpge_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator>=(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_cmp_ps(a.P, b.P, _CMP_GE_OQ); - - return simd; -} - -inline -simd_f32_16 operator>=(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_mask_mov_ps( - _mm512_setzero_ps(), - _mm512_cmp_ps_mask(a.P, b.P, _CMP_GE_OQ), - _mm512_set1_ps(1.0f) - ); - - return simd; -} - -inline -simd_f32_4 operator==(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_cmpeq_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator==(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_cmp_ps(a.P, b.P, _CMP_EQ_OQ); - - return simd; -} - -inline -simd_f32_16 operator==(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_mask_mov_ps( - _mm512_setzero_ps(), - _mm512_cmp_ps_mask(a.P, b.P, _CMP_EQ_OQ), - _mm512_set1_ps(1.0f) - ); - - return simd; -} - -inline -simd_f32_4 operator!=(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_cmpneq_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator!=(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_cmp_ps(a.P, b.P, _CMP_NEQ_OQ); - - return simd; -} - -inline -simd_f32_16 operator!=(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_mask_mov_ps( - _mm512_setzero_ps(), - _mm512_cmp_ps_mask(a.P, b.P, _CMP_NEQ_OQ), - _mm512_set1_ps(1.0f) - ); - - return simd; -} - -inline -simd_f32_4 operator&(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_and_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator&(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_and_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_16 operator&(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = 
_mm512_and_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_4 operator|(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_or_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 operator|(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_or_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_16 operator|(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_or_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_4& operator&=(simd_f32_4 &a, simd_f32_4 b) -{ - a = a & b; - - return a; -} - -inline -simd_f32_8& operator&=(simd_f32_8 &a, simd_f32_8 b) -{ - a = a & b; - - return a; -} - -inline -simd_f32_16& operator&=(simd_f32_16 &a, simd_f32_16 b) -{ - a = a & b; - - return a; -} - -inline -simd_f32_4& operator|=(simd_f32_4 &a, simd_f32_4 b) -{ - a = a | b; - - return a; -} - -inline -simd_f32_8& operator|=(simd_f32_8 &a, simd_f32_8 b) -{ - a = a | b; - - return a; -} - -inline -simd_f32_16& operator|=(simd_f32_16 &a, simd_f32_16 b) -{ - a = a | b; - - return a; -} - -inline -simd_f32_4 abs(simd_f32_4 a) -{ - unsigned int unsigned_mask = (unsigned int) (1 << 31); - __m128 mask = _mm_set1_ps(*(float *) &unsigned_mask); - - simd_f32_4 simd; - simd.P = _mm_and_ps(a.P, mask); - - return simd; -} - -inline -simd_f32_8 abs(simd_f32_8 a) -{ - unsigned int unsigned_mask = (unsigned int) (1 << 31); - __m256 mask = _mm256_set1_ps(*(float *) &unsigned_mask); - - simd_f32_8 simd; - simd.P = _mm256_and_ps(a.P, mask); - - return simd; -} - -inline -simd_f32_16 abs(simd_f32_16 a) -{ - unsigned int unsigned_mask = (unsigned int) (1 << 31); - __m512 mask = _mm512_set1_ps(*(float *) &unsigned_mask); - - simd_f32_16 simd; - simd.P = _mm512_and_ps(a.P, mask); - - return simd; -} - -inline -simd_f32_4 min(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_min_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 min(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_min_ps(a.P, b.P); - - return 
simd; -} - -inline -simd_f32_16 min(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_min_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_4 max(simd_f32_4 a, simd_f32_4 b) -{ - simd_f32_4 simd; - simd.P = _mm_max_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_8 max(simd_f32_8 a, simd_f32_8 b) -{ - simd_f32_8 simd; - simd.P = _mm256_max_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_16 max(simd_f32_16 a, simd_f32_16 b) -{ - simd_f32_16 simd; - simd.P = _mm512_max_ps(a.P, b.P); - - return simd; -} - -inline -simd_f32_4 sign(simd_f32_4 a) -{ - unsigned int umask = (unsigned int) (1 << 31); - __m128 mask = _mm_set1_ps(*(float *) &umask); - - simd_f32_4 signBit; - signBit.P = _mm_and_ps(a.P, mask); - - simd_f32_4 b; - b.P = _mm_set1_ps(1.0f); - - simd_f32_4 simd = b | signBit; - - return simd; -} - -inline -simd_f32_8 sign(simd_f32_8 a) -{ - unsigned int umask = (unsigned int) (1 << 31); - __m256 mask = _mm256_set1_ps(*(float *) &umask); - - simd_f32_8 signBit; - signBit.P = _mm256_and_ps(a.P, mask); - - simd_f32_8 b; - b.P = _mm256_set1_ps(1.0f); - - simd_f32_8 simd = b | signBit; - - return simd; -} - -inline -simd_f32_16 sign(simd_f32_16 a) -{ - unsigned int umask = (unsigned int) (1 << 31); - __m512 mask = _mm512_set1_ps(*(float *) &umask); - - simd_f32_16 signBit; - signBit.P = _mm512_and_ps(a.P, mask); - - simd_f32_16 b; - b.P = _mm512_set1_ps(1.0f); - - simd_f32_16 simd = b | signBit; - - return simd; -} - -// sqrt -// approxinvsquareroot -// approx1over -// clamp -// floor -// ceil -// anytrue -// alltrue -// anyfalse -// allfalse - -struct v3_simd_f32_4 { - union { - struct { - union { - simd_f32_4 x; - simd_f32_4 r; - }; - union { - simd_f32_4 y; - simd_f32_4 g; - }; - union { - simd_f32_4 z; - simd_f32_4 b; +namespace Math::Matrix::VectorFloat32 +{ + struct v3_f32_4_simd { + union { + struct { + union { + f32_4_simd x; + f32_4_simd r; + }; + union { + f32_4_simd y; + f32_4_simd g; + }; + union { + f32_4_simd z; + 
f32_4_simd b; + }; }; + + f32_4_simd v[3]; }; - - simd_f32_4 v[3]; }; -}; -struct v3_simd_f32_8 { - union { - struct { - union { - simd_f32_8 x; - simd_f32_8 r; - }; - union { - simd_f32_8 y; - simd_f32_8 g; - }; - union { - simd_f32_8 z; - simd_f32_8 b; + struct v3_f32_8_simd { + union { + struct { + union { + f32_8_simd x; + f32_8_simd r; + }; + union { + f32_8_simd y; + f32_8_simd g; + }; + union { + f32_8_simd z; + f32_8_simd b; + }; }; + + f32_8_simd v[3]; }; - - simd_f32_8 v[3]; }; -}; -struct v3_simd_f32_16 { - union { - struct { - union { - simd_f32_16 x; - simd_f32_16 r; - }; - union { - simd_f32_16 y; - simd_f32_16 g; - }; - union { - simd_f32_16 z; - simd_f32_16 b; + struct v3_f32_16_simd { + union { + struct { + union { + f32_16_simd x; + f32_16_simd r; + }; + union { + f32_16_simd y; + f32_16_simd g; + }; + union { + f32_16_simd z; + f32_16_simd b; + }; }; + + f32_16_simd v[3]; }; - - simd_f32_16 v[3]; }; -}; -struct v4_simd_f32_4 { - union { - struct { - union { - simd_f32_4 x; - simd_f32_4 r; - }; - union { - simd_f32_4 y; - simd_f32_4 g; - }; - union { - simd_f32_4 z; - simd_f32_4 b; - }; - union { - simd_f32_4 w; - simd_f32_4 a; + struct v4_f32_4_simd { + union { + struct { + union { + f32_4_simd x; + f32_4_simd r; + }; + union { + f32_4_simd y; + f32_4_simd g; + }; + union { + f32_4_simd z; + f32_4_simd b; + }; + union { + f32_4_simd w; + f32_4_simd a; + }; }; + + f32_4_simd v[4]; }; - - simd_f32_4 v[4]; }; -}; -struct v4_simd_f32_8 { - union { - struct { - union { - simd_f32_8 x; - simd_f32_8 r; - }; - union { - simd_f32_8 y; - simd_f32_8 g; - }; - union { - simd_f32_8 z; - simd_f32_8 b; - }; - union { - simd_f32_8 w; - simd_f32_8 a; + struct v4_f32_8_simd { + union { + struct { + union { + f32_8_simd x; + f32_8_simd r; + }; + union { + f32_8_simd y; + f32_8_simd g; + }; + union { + f32_8_simd z; + f32_8_simd b; + }; + union { + f32_8_simd w; + f32_8_simd a; + }; }; + + f32_8_simd v[4]; }; - - simd_f32_8 v[4]; }; -}; -struct v4_simd_f32_16 { - 
union { - struct { - union { - simd_f32_16 x; - simd_f32_16 r; - }; - union { - simd_f32_16 y; - simd_f32_16 g; - }; - union { - simd_f32_16 z; - simd_f32_16 b; - }; - union { - simd_f32_16 w; - simd_f32_16 a; + struct v4_f32_16_simd { + union { + struct { + union { + f32_16_simd x; + f32_16_simd r; + }; + union { + f32_16_simd y; + f32_16_simd g; + }; + union { + f32_16_simd z; + f32_16_simd b; + }; + union { + f32_16_simd w; + f32_16_simd a; + }; }; + + f32_16_simd v[4]; }; - - simd_f32_16 v[4]; }; -}; +} #endif diff --git a/Stdlib/Intrinsics.h b/Stdlib/Intrinsics.h new file mode 100644 index 0000000..c537534 --- /dev/null +++ b/Stdlib/Intrinsics.h @@ -0,0 +1,70 @@ +/** + * Jingga + * + * @package Stdlib + * @copyright Dennis Eichhorn + * @license OMS License 1.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef STDLIB_INTRINSICS_H +#define STDLIB_INTRINSICS_H + +#include +#include +#include +#include + +#include "Types.h" + +namespace Stdlib::Intrinsics +{ + inline + f32 sqrt(f32 a) { + return _mm_cvtss_f32(_mm_sqrt_ss(_mm_set_ss(a))); + } + + inline + f32 round(f32 a) { + return _mm_cvtss_f32( + _mm_round_ss( + _mm_setzero_ps(), + _mm_set_ss(a), + (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) + ) + ); + } + + inline + uint32 round_to_int(f32 a) { + return (uint32) _mm_cvtss_si32(_mm_set_ss(a)); + } + + inline + f32 floor(f32 a) { + return _mm_cvtss_f32(_mm_floor_ss(_mm_setzero_ps(), _mm_set_ss(a))); + } + + inline + f32 ceil(f32 a) { + return _mm_cvtss_f32(_mm_ceil_ss(_mm_setzero_ps(), _mm_set_ss(a))); + } + + inline + uint32 hash(uint64 a, uint64 b = 0) { + uint8 seed[16] = { + 0xaa, 0x9b, 0xbd, 0xb8, + 0xa1, 0x98, 0xac, 0x3f, + 0x1f, 0x94, 0x07, 0xb3, + 0x8c, 0x27, 0x93, 0x69, + }; + + __m128i hash = _mm_set_epi64x(a, b); + hash = _mm_aesdec_si128(hash, _mm_loadu_si128((__m128i *) seed)); + hash = _mm_aesdec_si128(hash, _mm_loadu_si128((__m128i *) seed)); + + return _mm_extract_epi32(hash, 0); + } +} + +#endif \ No newline at end of file diff 
--git a/Stdlib/Mathtypes.h b/Stdlib/Mathtypes.h index 750792f..0ba1450 100644 --- a/Stdlib/Mathtypes.h +++ b/Stdlib/Mathtypes.h @@ -13,6 +13,8 @@ #include #include +// @todo check Vectors, we can simplify this!!! + // int32_t vectors typedef union { struct { diff --git a/Stdlib/SIMD/SIMD_F32.h b/Stdlib/SIMD/SIMD_F32.h new file mode 100644 index 0000000..367fafa --- /dev/null +++ b/Stdlib/SIMD/SIMD_F32.h @@ -0,0 +1,1101 @@ +/** + * Karaka + * + * @package Stdlib + * @copyright Dennis Eichhorn + * @license OMS License 1.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef STDLIB_SIMD_F32_H +#define STDLIB_SIMD_F32_H + +#include +#include + +#include "../Types.h" + +namespace Stdlib::SIMD +{ + struct f32_4_simd { + union { + __m128 s; + f32 v[4]; + }; + }; + + struct f32_8_simd { + union { + __m256 s; + f32 v[8]; + }; + }; + + struct f32_16_simd { + union { + __m512 s; + f32 v[16]; + }; + }; + + inline + f32_4_simd load_f32_4_simd(f32 *mem) + { + f32_4_simd simd; + simd.s = _mm_loadu_ps(mem); + + return simd; + } + + inline + f32_4_simd init_f32_4_simd(f32 *mem) + { + f32_4_simd simd; + simd.s = _mm_set_ps(mem[0], mem[1], mem[2], mem[3]); + + return simd; + } + + inline + void unload_f32_4_simd(f32_4_simd a, f32* array) + { + _mm_store_ps(array, a.s); + } + + inline + f32_8_simd load_f32_8_simd(f32 *mem) + { + f32_8_simd simd; + simd.s = _mm256_loadu_ps(mem); + + return simd; + } + + inline + f32_8_simd init_f32_8_simd(f32 *mem) + { + f32_8_simd simd; + simd.s = _mm256_set_ps( + mem[0], mem[1], mem[2], mem[3], + mem[4], mem[5], mem[6], mem[7] + ); + + return simd; + } + + inline + void unload_f32_8_simd(f32_8_simd a, f32* array) + { + _mm256_store_ps(array, a.s); + } + + inline + f32_16_simd load_f32_16_simd(f32 *mem) + { + f32_16_simd simd; + simd.s = _mm512_loadu_ps(mem); + + return simd; + } + + inline + f32_16_simd init_f32_16_simd(f32 *mem) + { + f32_16_simd simd; + simd.s = _mm512_set_ps( + mem[0], mem[1], mem[2], mem[3], + mem[4], mem[5], mem[6], 
mem[7], + mem[8], mem[9], mem[10], mem[11], + mem[12], mem[13], mem[14], mem[15] + ); + + return simd; + } + + inline + void unload_f32_16_simd(f32_16_simd a, f32* array) + { + _mm512_store_ps(array, a.s); + } + + inline + f32_4_simd init_zero_f32_4_simd() + { + f32_4_simd simd; + simd.s = _mm_setzero_ps(); + + return simd; + } + + inline + f32_8_simd init_zero_f32_8_simd() + { + f32_8_simd simd; + simd.s = _mm256_setzero_ps(); + + return simd; + } + + inline + f32_16_simd init_zero_f32_16_simd() + { + f32_16_simd simd; + simd.s = _mm512_setzero_ps(); + + return simd; + } + + inline + f32_4_simd operator+(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_add_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator+(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_add_ps(a.s, b.s); + + return simd; + } + + inline + f32_16_simd operator+(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_add_ps(a.s, b.s); + + return simd; + } + + inline + f32_4_simd operator-(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_sub_ps(a.s, b.s); + + return simd; + } + + inline + f32_4_simd operator-(f32_4_simd a) + { + return init_zero_f32_4_simd() - a; + } + + inline + f32_8_simd operator-(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_sub_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator-(f32_8_simd a) + { + return init_zero_f32_8_simd() - a; + } + + inline + f32_16_simd operator-(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_sub_ps(a.s, b.s); + + return simd; + } + + inline + f32_16_simd operator-(f32_16_simd a) + { + return init_zero_f32_16_simd() - a; + } + + inline + f32_4_simd operator*(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_mul_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator*(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_mul_ps(a.s, b.s); + + return simd; + } + + 
inline + f32_16_simd operator*(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_mul_ps(a.s, b.s); + + return simd; + } + + inline + f32_4_simd operator/(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_div_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator/(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_div_ps(a.s, b.s); + + return simd; + } + + inline + f32_16_simd operator/(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_div_ps(a.s, b.s); + + return simd; + } + + inline + f32_4_simd operator^(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_xor_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator^(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_xor_ps(a.s, b.s); + + return simd; + } + + inline + f32_16_simd operator^(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_xor_ps(a.s, b.s); + + return simd; + } + + inline + f32_4_simd& operator-=(f32_4_simd &a, f32_4_simd b) + { + a = a - b; + + return a; + } + + inline + f32_8_simd& operator-=(f32_8_simd &a, f32_8_simd b) + { + a = a - b; + + return a; + } + + inline + f32_16_simd& operator-=(f32_16_simd &a, f32_16_simd b) + { + a = a - b; + + return a; + } + + inline + f32_4_simd& operator+=(f32_4_simd &a, f32_4_simd b) + { + a = a + b; + + return a; + } + + inline + f32_8_simd& operator+=(f32_8_simd &a, f32_8_simd b) + { + a = a + b; + + return a; + } + + inline + f32_16_simd& operator+=(f32_16_simd &a, f32_16_simd b) + { + a = a + b; + + return a; + } + + inline + f32_4_simd& operator*=(f32_4_simd &a, f32_4_simd b) + { + a = a * b; + + return a; + } + + inline + f32_8_simd& operator*=(f32_8_simd &a, f32_8_simd b) + { + a = a * b; + + return a; + } + + inline + f32_16_simd& operator*=(f32_16_simd &a, f32_16_simd b) + { + a = a * b; + + return a; + } + + inline + f32_4_simd& operator/=(f32_4_simd &a, f32_4_simd b) + { + a = a / b; + + return 
a; + } + + inline + f32_8_simd& operator/=(f32_8_simd &a, f32_8_simd b) + { + a = a / b; + + return a; + } + + inline + f32_16_simd& operator/=(f32_16_simd &a, f32_16_simd b) + { + a = a / b; + + return a; + } + + inline + f32_4_simd& operator^=(f32_4_simd &a, f32_4_simd b) + { + a = a ^ b; + + return a; + } + + inline + f32_8_simd& operator^=(f32_8_simd &a, f32_8_simd b) + { + a = a ^ b; + + return a; + } + + inline + f32_16_simd& operator^=(f32_16_simd &a, f32_16_simd b) + { + a = a ^ b; + + return a; + } + + inline + f32_4_simd operator<(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_cmplt_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator<(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_LT_OQ); + + return simd; + } + + inline + f32_16_simd operator<(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_mask_blend_ps( + _mm512_cmplt_ps_mask(a.s, b.s), + a.s, + b.s + ); + + return simd; + } + + inline + f32_4_simd operator<=(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_cmple_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator<=(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_LE_OQ); + + return simd; + } + + inline + f32_16_simd operator<=(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_mask_blend_ps( + _mm512_cmp_ps_mask(a.s, b.s, _CMP_LE_OQ), + a.s, + b.s + ); + + return simd; + } + + inline + f32_4_simd operator>(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_cmpgt_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator>(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_GT_OQ); + + return simd; + } + + inline + f32_16_simd operator>(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_mask_blend_ps( + _mm512_cmp_ps_mask(a.s, b.s, _CMP_GT_OQ), + a.s, + b.s + ); + + return 
simd; + } + + inline + f32_4_simd operator>=(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_cmpge_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator>=(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_GE_OQ); + + return simd; + } + + inline + f32_16_simd operator>=(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_mask_blend_ps( + _mm512_cmp_ps_mask(a.s, b.s, _CMP_GE_OQ), + a.s, + b.s + ); + + return simd; + } + + inline + f32_4_simd operator==(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_cmpeq_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator==(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_EQ_OQ); + + return simd; + } + + inline + f32_16_simd operator==(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_mask_blend_ps( + _mm512_cmp_ps_mask(a.s, b.s, _CMP_EQ_OQ), + a.s, + b.s + ); + + return simd; + } + + inline + f32_4_simd operator!=(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_cmpneq_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator!=(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_NEQ_OQ); + + return simd; + } + + inline + f32_16_simd operator!=(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_mask_blend_ps( + _mm512_cmp_ps_mask(a.s, b.s, _CMP_NEQ_OQ), + a.s, + b.s + ); + + return simd; + } + + inline + f32_4_simd operator&(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_and_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator&(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_and_ps(a.s, b.s); + + return simd; + } + + inline + f32_16_simd operator&(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_and_ps(a.s, b.s); + + return simd; + } + + inline + f32_4_simd operator|(f32_4_simd a, 
f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_or_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd operator|(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_or_ps(a.s, b.s); + + return simd; + } + + inline + f32_16_simd operator|(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_or_ps(a.s, b.s); + + return simd; + } + + inline + f32_4_simd& operator&=(f32_4_simd &a, f32_4_simd b) + { + a = a & b; + + return a; + } + + inline + f32_8_simd& operator&=(f32_8_simd &a, f32_8_simd b) + { + a = a & b; + + return a; + } + + inline + f32_16_simd& operator&=(f32_16_simd &a, f32_16_simd b) + { + a = a & b; + + return a; + } + + inline + f32_4_simd& operator|=(f32_4_simd &a, f32_4_simd b) + { + a = a | b; + + return a; + } + + inline + f32_8_simd& operator|=(f32_8_simd &a, f32_8_simd b) + { + a = a | b; + + return a; + } + + inline + f32_16_simd& operator|=(f32_16_simd &a, f32_16_simd b) + { + a = a | b; + + return a; + } + + inline + f32_4_simd abs(f32_4_simd a) + { + unsigned int unsigned_mask = (unsigned int) (1 << 31); + __m128 mask = _mm_set1_ps(*(float *) &unsigned_mask); + + f32_4_simd simd; + simd.s = _mm_and_ps(a.s, mask); + + return simd; + } + + inline + f32_8_simd abs(f32_8_simd a) + { + unsigned int unsigned_mask = (unsigned int) (1 << 31); + __m256 mask = _mm256_set1_ps(*(float *) &unsigned_mask); + + f32_8_simd simd; + simd.s = _mm256_and_ps(a.s, mask); + + return simd; + } + + inline + f32_16_simd abs(f32_16_simd a) + { + unsigned int unsigned_mask = (unsigned int) (1 << 31); + __m512 mask = _mm512_set1_ps(*(float *) &unsigned_mask); + + f32_16_simd simd; + simd.s = _mm512_and_ps(a.s, mask); + + return simd; + } + + inline + f32_4_simd min(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_min_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd min(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_min_ps(a.s, b.s); + + return simd; + } + + inline + f32_16_simd 
min(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_min_ps(a.s, b.s); + + return simd; + } + + inline + f32_4_simd max(f32_4_simd a, f32_4_simd b) + { + f32_4_simd simd; + simd.s = _mm_max_ps(a.s, b.s); + + return simd; + } + + inline + f32_8_simd max(f32_8_simd a, f32_8_simd b) + { + f32_8_simd simd; + simd.s = _mm256_max_ps(a.s, b.s); + + return simd; + } + + inline + f32_16_simd max(f32_16_simd a, f32_16_simd b) + { + f32_16_simd simd; + simd.s = _mm512_max_ps(a.s, b.s); + + return simd; + } + + inline + f32_4_simd sign(f32_4_simd a) + { + unsigned int umask = (unsigned int) (1 << 31); + __m128 mask = _mm_set1_ps(*(float *) &umask); + + f32_4_simd signBit; + signBit.s = _mm_and_ps(a.s, mask); + + f32_4_simd b; + b.s = _mm_set1_ps(1.0f); + + f32_4_simd simd = b | signBit; + + return simd; + } + + inline + f32_8_simd sign(f32_8_simd a) + { + unsigned int umask = (unsigned int) (1 << 31); + __m256 mask = _mm256_set1_ps(*(float *) &umask); + + f32_8_simd signBit; + signBit.s = _mm256_and_ps(a.s, mask); + + f32_8_simd b; + b.s = _mm256_set1_ps(1.0f); + + f32_8_simd simd = b | signBit; + + return simd; + } + + inline + f32_16_simd sign(f32_16_simd a) + { + unsigned int umask = (unsigned int) (1 << 31); + __m512 mask = _mm512_set1_ps(*(float *) &umask); + + f32_16_simd signBit; + signBit.s = _mm512_and_ps(a.s, mask); + + f32_16_simd b; + b.s = _mm512_set1_ps(1.0f); + + f32_16_simd simd = b | signBit; + + return simd; + } + + inline + f32_4_simd floor(f32_4_simd a) + { + f32_4_simd simd; + simd.s = _mm_floor_ps(a.s); + + return simd; + } + + inline + f32_8_simd floor(f32_8_simd a) + { + f32_8_simd simd; + simd.s = _mm256_floor_ps(a.s); + + return simd; + } + + inline + f32_16_simd floor(f32_16_simd a) + { + f32_16_simd simd; + simd.s = _mm512_floor_ps(a.s); + + return simd; + } + + inline + f32_4_simd ceil(f32_4_simd a) + { + f32_4_simd simd; + simd.s = _mm_ceil_ps(a.s); + + return simd; + } + + inline + f32_8_simd ceil(f32_8_simd a) + { + 
f32_8_simd simd; + simd.s = _mm256_ceil_ps(a.s); + + return simd; + } + + inline + f32_16_simd ceil(f32_16_simd a) + { + f32_16_simd simd; + simd.s = _mm512_ceil_ps(a.s); + + return simd; + } + + inline + f32_4_simd sqrt(f32_4_simd a) + { + f32_4_simd simd; + simd.s = _mm_sqrt_ps(a.s); + + return simd; + } + + inline + f32_8_simd sqrt(f32_8_simd a) + { + f32_8_simd simd; + simd.s = _mm256_sqrt_ps(a.s); + + return simd; + } + + inline + f32_16_simd sqrt(f32_16_simd a) + { + f32_16_simd simd; + simd.s = _mm512_sqrt_ps(a.s); + + return simd; + } + + inline + f32_4_simd sqrt_inv_approx(f32_4_simd a) + { + f32_4_simd simd; + simd.s = _mm_rsqrt_ps(a.s); + + return simd; + } + + inline + f32_8_simd sqrt_inv_approx(f32_8_simd a) + { + f32_8_simd simd; + simd.s = _mm256_rsqrt_ps(a.s); + + return simd; + } + + inline + f32_16_simd sqrt_inv_approx(f32_16_simd a) + { + f32_16_simd simd; + simd.s = _mm512_rsqrt14_ps(a.s); + + return simd; + } + + inline + f32_4_simd one_over_approx(f32_4_simd a) + { + f32_4_simd simd; + simd.s = _mm_rcp_ps(a.s); + + return simd; + } + + inline + f32_8_simd one_over_approx(f32_8_simd a) + { + f32_8_simd simd; + simd.s = _mm256_rcp_ps(a.s); + + return simd; + } + + inline + f32_16_simd one_over_approx(f32_16_simd a) + { + f32_16_simd simd; + simd.s = _mm512_rcp14_ps(a.s); + + return simd; + } + + inline + f32_4_simd clamp(f32_4_simd min_value, f32_4_simd a, f32_4_simd max_value) + { + return min(max(a, min_value), max_value); + } + + inline + f32_8_simd clamp(f32_8_simd min_value, f32_8_simd a, f32_8_simd max_value) + { + return min(max(a, min_value), max_value); + } + + inline + f32_16_simd clamp(f32_16_simd min_value, f32_16_simd a, f32_16_simd max_value) + { + return min(max(a, min_value), max_value); + } + + inline + int32 which_true(f32_4_simd a) + { + int32 which_true = _mm_movemask_ps(a.s); + + return which_true; + } + + inline + int32 which_true(f32_8_simd a) + { + int32 which_true = _mm256_movemask_ps(a.s); + + return which_true; + } + + 
inline + int32 which_true(f32_16_simd a) + { + int32 which_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)); + + return which_true; + } + + inline + bool any_true(f32_4_simd a) + { + bool is_any_true = _mm_movemask_ps(a.s) > 0; + + return is_any_true; + } + + inline + bool any_true(f32_8_simd a) + { + bool is_any_true = _mm256_movemask_ps(a.s) > 0; + + return is_any_true; + } + + inline + bool any_true(f32_16_simd a) + { + bool is_any_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) > 0; + + return is_any_true; + } + + inline + bool all_true(f32_4_simd a) + { + bool is_true = (_mm_movemask_ps(a.s) == 15); + + return is_true; + } + + inline + bool all_true(f32_8_simd a) + { + bool is_true = (_mm256_movemask_ps(a.s) == 255); + + return is_true; + } + + inline + bool all_true(f32_16_simd a) + { + bool is_true = (_mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 65535); + + return is_true; + } + + inline + bool all_false(f32_4_simd a) + { + bool is_false = (_mm_movemask_ps(a.s) == 0); + + return is_false; + } + + inline + bool all_false(f32_8_simd a) + { + bool is_false = (_mm256_movemask_ps(a.s) == 0); + + return is_false; + } + + inline + bool all_false(f32_16_simd a) + { + // @todo This can be optimized (requires also changes in the comparison functions return) + bool is_false = (_mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 0); + + return is_false; + } +} + +#endif diff --git a/Stdlib/SIMD/SIMD_Helper.h b/Stdlib/SIMD/SIMD_Helper.h new file mode 100644 index 0000000..8dea4fe --- /dev/null +++ b/Stdlib/SIMD/SIMD_Helper.h @@ -0,0 +1,70 @@ +/** + * Karaka + * + * @package Stdlib + * @copyright Dennis Eichhorn + * @license OMS License 1.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef STDLIB_SIMD_HELPER_H +#define STDLIB_SIMD_HELPER_H + +#include +#include +#include + +namespace Stdlib::SIMD +{ + bool is_avx_supported() + { + uint32_t eax, ebx, ecx, edx; + + eax = 1; // CPUID function 1 + + __asm__ __volatile__( + "cpuid;" + : "=a" (eax), 
"=b" (ebx), "=c" (ecx), "=d" (edx) + : "a" (eax) + ); + + // Check the AVX feature bit in ECX + return (ecx >> 28) & 1; + } + + bool is_avx256_supported() + { + uint32_t eax, ebx, ecx, edx; + + eax = 7; // CPUID function 7 + ecx = 0; // Sub-function 0 + + __asm__ __volatile__( + "cpuid;" + : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) + : "a" (eax), "c" (ecx) + ); + + // Check the AVX-256 (AVX2) feature bit in EBX + return (ebx >> 5) & 1; + } + + bool is_avx512_supported() + { + uint32_t eax, ebx, ecx, edx; + + eax = 7; // CPUID function 7 + ecx = 0; // Sub-function 0 + + __asm__ __volatile__( + "cpuid;" + : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) + : "a" (eax), "c" (ecx) + ); + + // Check the AVX-512 feature bit in EBX + return (ebx >> 16) & 1; + } +} + +#endif \ No newline at end of file diff --git a/Stdlib/SIMD/SIMD_I32.h b/Stdlib/SIMD/SIMD_I32.h new file mode 100644 index 0000000..b1fadb0 --- /dev/null +++ b/Stdlib/SIMD/SIMD_I32.h @@ -0,0 +1,1117 @@ +/** + * Karaka + * + * @package Stdlib + * @copyright Dennis Eichhorn + * @license OMS License 1.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef STDLIB_SIMD_I32_H +#define STDLIB_SIMD_I32_H + +#include +#include + +#include "../Types.h" +#include "SIMD_F32.h" + +namespace Stdlib::SIMD +{ + struct int32_4_simd { + union { + __m128i s; + int32 v[4]; + }; + }; + + struct int32_8_simd { + union { + __m256i s; + int32 v[8]; + }; + }; + + struct int32_16_simd { + union { + __m512i s; + int32 v[16]; + }; + }; + + inline + int32_4_simd load_int32_4_simd(int32 *mem) + { + int32_4_simd simd; + simd.s = _mm_loadu_epi32(mem); + + return simd; + } + + inline + int32_4_simd init_int32_4_simd(int32 *mem) + { + int32_4_simd simd; + simd.s = _mm_set_epi32(mem[0], mem[1], mem[2], mem[3]); + + return simd; + } + + inline + void unload_int32_4_simd(int32_4_simd a, int32* array) + { + _mm_store_epi32(array, a.s); + } + + inline + int32_8_simd load_int32_8_simd(int32 *mem) + { + int32_8_simd simd; + simd.s = 
_mm256_loadu_epi32(mem); + + return simd; + } + + inline + int32_8_simd init_int32_8_simd(int32 *mem) + { + int32_8_simd simd; + simd.s = _mm256_set_epi32( + mem[0], mem[1], mem[2], mem[3], + mem[4], mem[5], mem[6], mem[7] + ); + + return simd; + } + + inline + void unload_int32_8_simd(int32_8_simd a, int32* array) + { + _mm256_store_epi32(array, a.s); + } + + inline + int32_16_simd load_int32_16_simd(int32 *mem) + { + int32_16_simd simd; + simd.s = _mm512_loadu_epi32(mem); + + return simd; + } + + inline + int32_16_simd init_int32_16_simd(int32 *mem) + { + int32_16_simd simd; + simd.s = _mm512_set_epi32( + mem[0], mem[1], mem[2], mem[3], + mem[4], mem[5], mem[6], mem[7], + mem[8], mem[9], mem[10], mem[11], + mem[12], mem[13], mem[14], mem[15] + ); + + return simd; + } + + inline + void unload_int32_16_simd(int32_16_simd a, int32* array) + { + _mm512_store_epi32(array, a.s); + } + + inline + int32_4_simd init_zero_int32_4_simd() + { + int32_4_simd simd; + simd.s = _mm_setzero_si128(); + + return simd; + } + + inline + int32_8_simd init_zero_int32_8_simd() + { + int32_8_simd simd; + simd.s = _mm256_setzero_si256(); + + return simd; + } + + inline + int32_16_simd init_zero_int32_16_simd() + { + int32_16_simd simd; + simd.s = _mm512_setzero_epi32(); + + return simd; + } + + inline + int32_4_simd operator+(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_add_epi32(a.s, b.s); + + return simd; + } + + inline + int32_8_simd operator+(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_add_epi32(a.s, b.s); + + return simd; + } + + inline + int32_16_simd operator+(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_add_epi32(a.s, b.s); + + return simd; + } + + inline + int32_4_simd operator-(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_sub_epi32(a.s, b.s); + + return simd; + } + + inline + int32_4_simd operator-(int32_4_simd a) + { + return init_zero_int32_4_simd() - a; + } + 
+ inline + int32_8_simd operator-(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_sub_epi32(a.s, b.s); + + return simd; + } + + inline + int32_8_simd operator-(int32_8_simd a) + { + return init_zero_int32_8_simd() - a; + } + + inline + int32_16_simd operator-(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_sub_epi32(a.s, b.s); + + return simd; + } + + inline + int32_16_simd operator-(int32_16_simd a) + { + return init_zero_int32_16_simd() - a; + } + + inline + int32_4_simd operator*(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_mul_epi32(a.s, b.s); + + return simd; + } + + inline + int32_8_simd operator*(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_mul_epi32(a.s, b.s); + + return simd; + } + + inline + int32_16_simd operator*(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_mul_epi32(a.s, b.s); + + return simd; + } + + inline + Stdlib::SIMD::f32_4_simd operator/(int32_4_simd a, int32_4_simd b) + { + Stdlib::SIMD::f32_4_simd simd; + simd.s = _mm_div_ps(a.s, b.s); + + return simd; + } + + inline + Stdlib::SIMD::f32_8_simd operator/(int32_8_simd a, int32_8_simd b) + { + Stdlib::SIMD::f32_8_simd simd; + simd.s = _mm256_div_ps(a.s, b.s); + + return simd; + } + + inline + Stdlib::SIMD::f32_16_simd operator/(int32_16_simd a, int32_16_simd b) + { + Stdlib::SIMD::f32_16_simd simd; + simd.s = _mm512_div_ps(a.s, b.s); + + return simd; + } + + inline + int32_4_simd operator^(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_xor_epi32(a.s, b.s); + + return simd; + } + + inline + int32_8_simd operator^(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_xor_epi32(a.s, b.s); + + return simd; + } + + inline + int32_16_simd operator^(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_xor_epi32(a.s, b.s); + + return simd; + } + + inline + int32_4_simd& 
operator-=(int32_4_simd &a, int32_4_simd b) + { + a = a - b; + + return a; + } + + inline + int32_8_simd& operator-=(int32_8_simd &a, int32_8_simd b) + { + a = a - b; + + return a; + } + + inline + int32_16_simd& operator-=(int32_16_simd &a, int32_16_simd b) + { + a = a - b; + + return a; + } + + inline + int32_4_simd& operator+=(int32_4_simd &a, int32_4_simd b) + { + a = a + b; + + return a; + } + + inline + int32_8_simd& operator+=(int32_8_simd &a, int32_8_simd b) + { + a = a + b; + + return a; + } + + inline + int32_16_simd& operator+=(int32_16_simd &a, int32_16_simd b) + { + a = a + b; + + return a; + } + + inline + int32_4_simd& operator*=(int32_4_simd &a, int32_4_simd b) + { + a = a * b; + + return a; + } + + inline + int32_8_simd& operator*=(int32_8_simd &a, int32_8_simd b) + { + a = a * b; + + return a; + } + + inline + int32_16_simd& operator*=(int32_16_simd &a, int32_16_simd b) + { + a = a * b; + + return a; + } + + inline + int32_4_simd& operator/=(int32_4_simd &a, int32_4_simd b) + { + a.s = _mm_cvtps_epi32((a / b).s); + + return a; + } + + inline + int32_8_simd& operator/=(int32_8_simd &a, int32_8_simd b) + { + a.s = _mm256_cvtps_epi32((a / b).s); + + return a; + } + + inline + int32_16_simd& operator/=(int32_16_simd &a, int32_16_simd b) + { + a.s = _mm512_cvtps_epi32((a / b).s); + + return a; + } + + inline + int32_4_simd& operator^=(int32_4_simd &a, int32_4_simd b) + { + a = a ^ b; + + return a; + } + + inline + int32_8_simd& operator^=(int32_8_simd &a, int32_8_simd b) + { + a = a ^ b; + + return a; + } + + inline + int32_16_simd& operator^=(int32_16_simd &a, int32_16_simd b) + { + a = a ^ b; + + return a; + } + + inline + int32_4_simd operator<(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_cmplt_epi32(a.s, b.s); + + return simd; + } + + inline + int32_8_simd operator<(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_xor_si256( + _mm256_cmpgt_epi32(a.s, b.s), + _mm256_set1_epi32(-1) + ); + + return 
simd; + } + + inline + int32_16_simd operator<(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_mask_blend_epi32( + _mm512_cmplt_epi32_mask(a.s, b.s), + a.s, + b.s + ); + + return simd; + } + + inline + int32_4_simd operator<=(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_andnot_si128( + _mm_cmplt_epi32(b.s, a.s), + _mm_set1_epi32(-1) + ); + + return simd; + } + + inline + int32_8_simd operator<=(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_andnot_si256( + _mm256_cmpgt_epi32(a.s, b.s), + _mm256_set1_epi32(-1) + ); + + return simd; + } + + inline + int32_16_simd operator<=(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_mask_blend_epi32( + _mm512_knot(_mm512_cmpgt_epi32_mask(b.s, a.s)), + b.s, + a.s + ); + + return simd; + } + + inline + int32_4_simd operator>(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_cmpgt_epi32(a.s, b.s); + + return simd; + } + + inline + int32_8_simd operator>(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_cmpgt_epi32(a.s, b.s); + + return simd; + } + + inline + int32_16_simd operator>(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_mask_blend_ps( + _mm512_cmpgt_epi32_mask(a.s, b.s), + a.s, + b.s + ); + + return simd; + } + + inline + int32_4_simd operator>=(int32_4_simd a, int32_4_simd b) + { + int32_4_simd simd; + simd.s = _mm_andnot_si128( + _mm_cmplt_epi32(a.s, b.s), + _mm_set1_epi32(-1) + ); + + return simd; + } + + inline + int32_8_simd operator>=(int32_8_simd a, int32_8_simd b) + { + int32_8_simd simd; + simd.s = _mm256_andnot_si256( + _mm256_cmpgt_epi32(b.s, a.s), + _mm256_set1_epi32(-1) + ); + + return simd; + } + + inline + int32_16_simd operator>=(int32_16_simd a, int32_16_simd b) + { + int32_16_simd simd; + simd.s = _mm512_mask_blend_ps( + _mm512_cmpge_epi32_mask(a.s, b.s), + a.s, + b.s + ); + + return simd; + } + + inline + 
int32_4_simd operator==(int32_4_simd a, int32_4_simd b)
+    {
+        // Lane-wise equality: a lane is all ones where a == b, zero elsewhere.
+        // NOTE(review): assumes .s is the integer vector type (__m128i here,
+        // __m256i / __m512i in the wider structs) - confirm against the struct definitions.
+        int32_4_simd simd;
+        simd.s = _mm_cmpeq_epi32(a.s, b.s);
+
+        return simd;
+    }
+
+    // Lane-wise equality over eight lanes; same all-ones / zero encoding.
+    inline
+    int32_8_simd operator==(int32_8_simd a, int32_8_simd b)
+    {
+        int32_8_simd simd;
+        simd.s = _mm256_cmpeq_epi32(a.s, b.s);
+
+        return simd;
+    }
+
+    // Lane-wise equality over sixteen lanes.
+    // The AVX-512 compare yields a __mmask16, so the mask is expanded back into
+    // a vector holding all ones in every matching lane. This keeps the result
+    // encoding identical to the 128/256-bit overloads.
+    inline
+    int32_16_simd operator==(int32_16_simd a, int32_16_simd b)
+    {
+        int32_16_simd simd;
+        simd.s = _mm512_maskz_set1_epi32(_mm512_cmpeq_epi32_mask(a.s, b.s), -1);
+
+        return simd;
+    }
+
+    // Lane-wise inequality. There is no packed integer "not equal" compare for
+    // 128-bit vectors, so the equality result is inverted by XOR with all ones.
+    inline
+    int32_4_simd operator!=(int32_4_simd a, int32_4_simd b)
+    {
+        int32_4_simd simd;
+        simd.s = _mm_xor_si128(_mm_cmpeq_epi32(a.s, b.s), _mm_set1_epi32(-1));
+
+        return simd;
+    }
+
+    // Lane-wise inequality over eight lanes (inverted equality result).
+    inline
+    int32_8_simd operator!=(int32_8_simd a, int32_8_simd b)
+    {
+        int32_8_simd simd;
+        simd.s = _mm256_xor_si256(_mm256_cmpeq_epi32(a.s, b.s), _mm256_set1_epi32(-1));
+
+        return simd;
+    }
+
+    // Lane-wise inequality over sixteen lanes. Uses the integer compare (not
+    // the float one) and writes all ones into every differing lane.
+    inline
+    int32_16_simd operator!=(int32_16_simd a, int32_16_simd b)
+    {
+        int32_16_simd simd;
+        simd.s = _mm512_maskz_set1_epi32(_mm512_cmpneq_epi32_mask(a.s, b.s), -1);
+
+        return simd;
+    }
+
+    // Bitwise AND of two 128-bit vectors. The si128 form is used so this
+    // overload does not require AVX-512VL.
+    inline
+    int32_4_simd operator&(int32_4_simd a, int32_4_simd b)
+    {
+        int32_4_simd simd;
+        simd.s = _mm_and_si128(a.s, b.s);
+
+        return simd;
+    }
+
+    // Bitwise AND of two 256-bit vectors (si256 form, no AVX-512VL needed).
+    inline
+    int32_8_simd operator&(int32_8_simd a, int32_8_simd b)
+    {
+        int32_8_simd simd;
+        simd.s = _mm256_and_si256(a.s, b.s);
+
+        return simd;
+    }
+
+    // Bitwise AND of two 512-bit vectors.
+    inline
+    int32_16_simd operator&(int32_16_simd a, int32_16_simd b)
+    {
+        int32_16_simd simd;
+        simd.s = _mm512_and_epi32(a.s, b.s);
+
+        return simd;
+    }
+
+    // Bitwise OR of two 128-bit vectors (si128 form, no AVX-512VL needed).
+    inline
+    int32_4_simd operator|(int32_4_simd a, int32_4_simd b)
+    {
+        int32_4_simd simd;
+        simd.s = _mm_or_si128(a.s, b.s);
+
+        return simd;
+    }
+
+    // Bitwise OR of two 256-bit vectors (si256 form, no AVX-512VL needed).
+    inline
+    int32_8_simd operator|(int32_8_simd a, int32_8_simd b)
+    {
+        int32_8_simd simd;
+        simd.s = _mm256_or_si256(a.s, b.s);
+
+        return simd;
+    }
+
+    // Bitwise OR of two 512-bit vectors.
+    inline
+    int32_16_simd operator|(int32_16_simd a, int32_16_simd b)
+    {
+        int32_16_simd simd;
+        simd.s = _mm512_or_epi32(a.s, b.s);
+
+        return simd;
+    }
+
+
inline
+    int32_4_simd& operator&=(int32_4_simd &a, int32_4_simd b)
+    {
+        // Compound AND: stores a & b back into a and returns a reference to it.
+        a = a & b;
+
+        return a;
+    }
+
+    // Compound AND for the eight-lane vector.
+    inline
+    int32_8_simd& operator&=(int32_8_simd &a, int32_8_simd b)
+    {
+        a = a & b;
+
+        return a;
+    }
+
+    // Compound AND for the sixteen-lane vector.
+    inline
+    int32_16_simd& operator&=(int32_16_simd &a, int32_16_simd b)
+    {
+        a = a & b;
+
+        return a;
+    }
+
+    // Compound OR: stores a | b back into a and returns a reference to it.
+    inline
+    int32_4_simd& operator|=(int32_4_simd &a, int32_4_simd b)
+    {
+        a = a | b;
+
+        return a;
+    }
+
+    // Compound OR for the eight-lane vector.
+    inline
+    int32_8_simd& operator|=(int32_8_simd &a, int32_8_simd b)
+    {
+        a = a | b;
+
+        return a;
+    }
+
+    // Compound OR for the sixteen-lane vector.
+    inline
+    int32_16_simd& operator|=(int32_16_simd &a, int32_16_simd b)
+    {
+        a = a | b;
+
+        return a;
+    }
+
+    // Lane-wise absolute value.
+    // Two's-complement integers cannot be made positive by masking bit 31
+    // (that trick only works for IEEE floats), so the dedicated integer
+    // intrinsic is used. INT32_MIN has no positive counterpart and is
+    // returned unchanged.
+    // NOTE(review): assumes .s is __m128i - confirm against the struct definition.
+    inline
+    int32_4_simd abs(int32_4_simd a)
+    {
+        int32_4_simd simd;
+        simd.s = _mm_abs_epi32(a.s);
+
+        return simd;
+    }
+
+    // Lane-wise absolute value over eight lanes; INT32_MIN is returned unchanged.
+    inline
+    int32_8_simd abs(int32_8_simd a)
+    {
+        int32_8_simd simd;
+        simd.s = _mm256_abs_epi32(a.s);
+
+        return simd;
+    }
+
+    // Lane-wise absolute value over sixteen lanes; INT32_MIN is returned unchanged.
+    inline
+    int32_16_simd abs(int32_16_simd a)
+    {
+        int32_16_simd simd;
+        simd.s = _mm512_abs_epi32(a.s);
+
+        return simd;
+    }
+
+    // Lane-wise signed minimum of a and b.
+    inline
+    int32_4_simd min(int32_4_simd a, int32_4_simd b)
+    {
+        int32_4_simd simd;
+        simd.s = _mm_min_epi32(a.s, b.s);
+
+        return simd;
+    }
+
+    // Lane-wise signed minimum over eight lanes.
+    inline
+    int32_8_simd min(int32_8_simd a, int32_8_simd b)
+    {
+        int32_8_simd simd;
+        simd.s = _mm256_min_epi32(a.s, b.s);
+
+        return simd;
+    }
+
+    // Lane-wise signed minimum over sixteen lanes.
+    inline
+    int32_16_simd min(int32_16_simd a, int32_16_simd b)
+    {
+        int32_16_simd simd;
+        simd.s = _mm512_min_epi32(a.s, b.s);
+
+        return simd;
+    }
+
+    // Lane-wise signed maximum of a and b.
+    inline
+    int32_4_simd max(int32_4_simd a, int32_4_simd b)
+    {
+        int32_4_simd simd;
+        simd.s = _mm_max_epi32(a.s, b.s);
+
+        return simd;
+    }
+
+    inline
+
int32_8_simd max(int32_8_simd a, int32_8_simd b)
+    {
+        // Lane-wise signed maximum over eight lanes.
+        int32_8_simd simd;
+        simd.s = _mm256_max_epi32(a.s, b.s);
+
+        return simd;
+    }
+
+    // Lane-wise signed maximum over sixteen lanes.
+    inline
+    int32_16_simd max(int32_16_simd a, int32_16_simd b)
+    {
+        int32_16_simd simd;
+        simd.s = _mm512_max_epi32(a.s, b.s);
+
+        return simd;
+    }
+
+    // Lane-wise sign: -1 for negative lanes, +1 otherwise (zero maps to +1).
+    // The arithmetic shift by 31 smears the sign bit across the lane
+    // (-1 for negatives, 0 otherwise); OR-ing with 1 then gives -1 or +1.
+    // NOTE(review): assumes .s is the integer vector type - confirm against the struct definitions.
+    inline
+    int32_4_simd sign(int32_4_simd a)
+    {
+        int32_4_simd simd;
+        simd.s = _mm_or_si128(_mm_srai_epi32(a.s, 31), _mm_set1_epi32(1));
+
+        return simd;
+    }
+
+    // Lane-wise sign over eight lanes (-1 for negative, +1 otherwise).
+    inline
+    int32_8_simd sign(int32_8_simd a)
+    {
+        int32_8_simd simd;
+        simd.s = _mm256_or_si256(_mm256_srai_epi32(a.s, 31), _mm256_set1_epi32(1));
+
+        return simd;
+    }
+
+    // Lane-wise sign over sixteen lanes (-1 for negative, +1 otherwise).
+    inline
+    int32_16_simd sign(int32_16_simd a)
+    {
+        int32_16_simd simd;
+        simd.s = _mm512_or_si512(_mm512_srai_epi32(a.s, 31), _mm512_set1_epi32(1));
+
+        return simd;
+    }
+
+    // floor() of an integer vector is the vector itself; there is no integer
+    // floor intrinsic, so the input is returned unchanged.
+    inline
+    int32_4_simd floor(int32_4_simd a)
+    {
+        return a;
+    }
+
+    // Identity: integers are already integral.
+    inline
+    int32_8_simd floor(int32_8_simd a)
+    {
+        return a;
+    }
+
+    // Identity: integers are already integral.
+    inline
+    int32_16_simd floor(int32_16_simd a)
+    {
+        return a;
+    }
+
+    // ceil() of an integer vector is the vector itself.
+    inline
+    int32_4_simd ceil(int32_4_simd a)
+    {
+        return a;
+    }
+
+    // Identity: integers are already integral.
+    inline
+    int32_8_simd ceil(int32_8_simd a)
+    {
+        return a;
+    }
+
+    // Identity: integers are already integral.
+    inline
+    int32_16_simd ceil(int32_16_simd a)
+    {
+        return a;
+    }
+
+    // Lane-wise square root. There is no packed integer square root, so the
+    // lanes are converted to float, rooted, and converted back with the
+    // current rounding mode (the same conversion operator/= uses).
+    // NOTE(review): float carries 24 bits of mantissa, so inputs above 2^24
+    // are rounded before the root is taken; negative lanes yield the
+    // conversion's "integer indefinite" value (0x80000000).
+    inline
+    int32_4_simd sqrt(int32_4_simd a)
+    {
+        int32_4_simd simd;
+        simd.s = _mm_cvtps_epi32(_mm_sqrt_ps(_mm_cvtepi32_ps(a.s)));
+
+        return simd;
+    }
+
+    // Lane-wise square root over eight lanes (via float, see the 4-lane overload).
+    inline
+    int32_8_simd sqrt(int32_8_simd a)
+    {
+        int32_8_simd simd;
+        simd.s = _mm256_cvtps_epi32(_mm256_sqrt_ps(_mm256_cvtepi32_ps(a.s)));
+
+        return simd;
+    }
+
+    // Lane-wise square root over sixteen lanes (via float, see the 4-lane overload).
+    inline
+    int32_16_simd sqrt(int32_16_simd a)
+    {
+        int32_16_simd simd;
+        simd.s = _mm512_cvtps_epi32(_mm512_sqrt_ps(_mm512_cvtepi32_ps(a.s)));
+
+        return simd;
+    }
+
+    // Approximate 1 / sqrt(a), computed in float and converted back to int32.
+    // NOTE(review): an integer result is of little use here (values below 0.5
+    // convert to 0 under the default rounding mode); callers probably want
+    // the f32 variant - confirm the intended contract.
+    inline
+    int32_4_simd sqrt_inv_approx(int32_4_simd a)
+    {
+        int32_4_simd simd;
+        simd.s = _mm_cvtps_epi32(_mm_rsqrt_ps(_mm_cvtepi32_ps(a.s)));
+
+        return simd;
+    }
+
+    // Approximate 1 / sqrt(a) over eight lanes (see the 4-lane overload).
+    inline
+    int32_8_simd sqrt_inv_approx(int32_8_simd a)
+    {
+        int32_8_simd simd;
+        simd.s = _mm256_cvtps_epi32(_mm256_rsqrt_ps(_mm256_cvtepi32_ps(a.s)));
+
+        return simd;
+    }
+
+    // Approximate 1 / sqrt(a) over sixteen lanes (see the 4-lane overload).
+    inline
+    int32_16_simd sqrt_inv_approx(int32_16_simd a)
+    {
+        int32_16_simd simd;
+        simd.s = _mm512_cvtps_epi32(_mm512_rsqrt14_ps(_mm512_cvtepi32_ps(a.s)));
+
+        return simd;
+    }
+
+    // Approximate 1 / a, computed in float and converted back to int32.
+    // NOTE(review): under the default rounding mode this is 0 for every
+    // |a| >= 2; confirm whether an integer reciprocal is really wanted.
+    inline
+    int32_4_simd one_over_approx(int32_4_simd a)
+    {
+        int32_4_simd simd;
+        simd.s = _mm_cvtps_epi32(_mm_rcp_ps(_mm_cvtepi32_ps(a.s)));
+
+        return simd;
+    }
+
+    // Approximate 1 / a over eight lanes (see the 4-lane overload).
+    inline
+    int32_8_simd one_over_approx(int32_8_simd a)
+    {
+        int32_8_simd simd;
+        simd.s = _mm256_cvtps_epi32(_mm256_rcp_ps(_mm256_cvtepi32_ps(a.s)));
+
+        return simd;
+    }
+
+    // Approximate 1 / a over sixteen lanes (see the 4-lane overload).
+    inline
+    int32_16_simd one_over_approx(int32_16_simd a)
+    {
+        int32_16_simd simd;
+        simd.s = _mm512_cvtps_epi32(_mm512_rcp14_ps(_mm512_cvtepi32_ps(a.s)));
+
+        return simd;
+    }
+
+    // Clamps every lane of a into [min_value, max_value]: the lower bound is
+    // applied first, then the upper bound.
+    inline
+    int32_4_simd clamp(int32_4_simd min_value, int32_4_simd a, int32_4_simd max_value)
+    {
+        return min(max(a, min_value), max_value);
+    }
+
+    // Clamps every lane of the eight-lane vector into [min_value, max_value].
+    inline
+    int32_8_simd clamp(int32_8_simd min_value, int32_8_simd a, int32_8_simd max_value)
+    {
+        return min(max(a, min_value), max_value);
+    }
+
+    // Clamps every lane of the sixteen-lane vector into [min_value, max_value].
+    inline
+    int32_16_simd clamp(int32_16_simd min_value, int32_16_simd a, int32_16_simd max_value)
+    {
+        return min(max(a, min_value), max_value);
+    }
+
+    // Packs the top (sign) bit of every lane into the low bits of an int,
+    // bit i corresponding to lane i. There is no 32-bit integer movemask, so
+    // the vector is reinterpreted as float (a bit cast, no conversion) and
+    // the float movemask is used.
+    inline
+    int32 which_true(int32_4_simd a)
+    {
+        int32 mask = _mm_movemask_ps(_mm_castsi128_ps(a.s));
+
+        return mask;
+    }
+
+    // Packs the top bit of each of the eight lanes into the low 8 bits.
+    inline
+    int32 which_true(int32_8_simd a)
+    {
+        int32 mask = _mm256_movemask_ps(_mm256_castsi256_ps(a.s));
+
+        return mask;
+    }
+
+    inline
+    int32
which_true(int32_16_simd a) + { + int32 which_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)); + + return which_true; + } + + inline + bool any_true(int32_4_simd a) + { + bool is_any_true = _mm_movemask_epi32(a.s) > 0; + + return is_any_true; + } + + inline + bool any_true(int32_8_simd a) + { + bool is_any_true = _mm256_movemask_epi32(a.s) > 0; + + return is_any_true; + } + + inline + bool any_true(int32_16_simd a) + { + bool is_any_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) > 0; + + return is_any_true; + } + + inline + bool all_true(int32_4_simd a) + { + bool is_true = (_mm_movemask_epi32(a.s) == 15); + + return is_true; + } + + inline + bool all_true(int32_8_simd a) + { + bool is_true = (_mm256_movemask_epi32(a.s) == 255); + + return is_true; + } + + inline + bool all_true(int32_16_simd a) + { + bool is_true = (_mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 65535); + + return is_true; + } + + inline + bool all_false(int32_4_simd a) + { + bool is_false = (_mm_movemask_epi32(a.s) == 0); + + return is_false; + } + + inline + bool all_false(int32_8_simd a) + { + bool is_false = (_mm256_movemask_epi32(a.s) == 0); + + return is_false; + } + + inline + bool all_false(int32_16_simd a) + { + // @todo This can be optimized (requires also changes in the comparison functions return) + bool is_false = (_mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 0); + + return is_false; + } +} + +#endif diff --git a/Types.h b/Stdlib/Types.h similarity index 69% rename from Types.h rename to Stdlib/Types.h index 402a29d..2cd4679 100644 --- a/Types.h +++ b/Stdlib/Types.h @@ -7,8 +7,8 @@ * @version 1.0.0 * @link https://jingga.app */ -#ifndef TYPES_H -#define TYPES_H +#ifndef STDLIB_TYPES_H +#define STDLIB_TYPES_H #include #include @@ -18,6 +18,11 @@ typedef int16_t int16; typedef int32_t int32; typedef int64_t int64; +typedef uint8_t uint8; +typedef uint16_t uint16; +typedef uint32_t uint32; +typedef uint64_t uint64; + typedef float f32; typedef double f64; diff --git 
a/Threads/Job.h b/Threads/Job.h index c53bd2b..9c3612c 100755 --- a/Threads/Job.h +++ b/Threads/Job.h @@ -19,6 +19,7 @@ namespace Threads struct job_t { JobFunc func; void *arg; + int state; job_t *next; }; diff --git a/Threads/Thread.h b/Threads/Thread.h index 581590e..1ed45ba 100755 --- a/Threads/Thread.h +++ b/Threads/Thread.h @@ -19,38 +19,26 @@ namespace Threads { Job *pool_work_create(JobFunc func, void *arg) { - Job *work; - if (func == NULL) { return NULL; } - work = (Job *) malloc(sizeof(*work)); + Job *work = (Job *) malloc(sizeof(*work)); work->func = func; work->arg = arg; + work->state = 0; work->next = NULL; return work; } - void pool_work_destroy(Job *work) + Job *pool_work_poll(Threads::ThreadPool *pool) { - if (work == NULL) { - return; - } - - free(work); - } - - Job *pool_work_get(Threads::ThreadPool *pool) - { - Job *work; - if (pool == NULL) { return NULL; } - work = pool->work_first; + Job *work = pool->work_first; if (work == NULL) { return NULL; } @@ -70,7 +58,7 @@ namespace Threads Threads::ThreadPool *pool = (Threads::ThreadPool *) arg; Threads::Job *work; - while (1) { + while (true) { pthread_mutex_lock(&(pool->work_mutex)); while (pool->work_first == NULL && !pool->stop) { @@ -81,13 +69,12 @@ namespace Threads break; } - work = Threads::pool_work_get(pool); + work = Threads::pool_work_poll(pool); ++(pool->working_cnt); pthread_mutex_unlock(&(pool->work_mutex)); if (work != NULL) { - work->func(work->arg); - pool_work_destroy(work); + work->func(work); } pthread_mutex_lock(&(pool->work_mutex)); @@ -155,20 +142,15 @@ namespace Threads void pool_destroy(Threads::ThreadPool *pool) { - Threads::Job *work; - Threads::Job *work2; - if (pool == NULL) { return; } pthread_mutex_lock(&(pool->work_mutex)); - work = pool->work_first; + Threads::Job *work = pool->work_first; while (work != NULL) { - work2 = work->next; - pool_work_destroy(work); - work = work2; + work = work->next; } pool->stop = true; @@ -184,17 +166,15 @@ namespace Threads 
free(pool); } - bool pool_add_work(Threads::ThreadPool *pool, JobFunc func, void *arg) + Threads::Job* pool_add_work(Threads::ThreadPool *pool, JobFunc func, void *arg) { - Threads::Job *work; - if (pool == NULL) { - return false; + return NULL; } - work = Threads::pool_work_create(func, arg); + Threads::Job *work = Threads::pool_work_create(func, arg); if (work == NULL) { - return false; + return NULL; } pthread_mutex_lock(&(pool->work_mutex)); @@ -209,7 +189,7 @@ namespace Threads pthread_cond_broadcast(&(pool->work_cond)); pthread_mutex_unlock(&(pool->work_mutex)); - return true; + return work; } } diff --git a/Utils/TestUtils.h b/Utils/TestUtils.h index 26ad21f..b039bdc 100755 --- a/Utils/TestUtils.h +++ b/Utils/TestUtils.h @@ -17,7 +17,7 @@ if ((a) == (b)) { \ printf("."); \ } else { \ - printf("[F]"); \ + printf("\033[31m[F]\033[0m"); \ printf("\n\n%s - %i: ", __FILE__, __LINE__); \ printf((t1), (a)); printf(" != "); printf((t2), (b)); printf("\n"); \ return 0; } \ @@ -27,7 +27,7 @@ if (oms_abs((a) - (b)) <= (delta)) { \ printf("."); \ } else { \ - printf("[F]"); \ + printf("\033[31m[F]\033[0m"); \ printf("\n\n%s - %i: ", __FILE__, __LINE__); \ printf((t1), (a)); printf(" != "); printf((t2), (b)); printf("\n"); \ return 0; } \ @@ -37,7 +37,7 @@ if (strstr((a), (b)) != NULL) { \ printf("."); \ } else { \ - printf("[F]"); \ + printf("\033[31m[F]\033[0m"); \ printf("\n\n%s - %i: ", __FILE__, __LINE__); \ printf("%s", (a)); printf(" !contains "); printf("%s", (b)); printf("\n"); \ return 0; } \ @@ -47,7 +47,7 @@ if ((a) == true) { \ printf("."); \ } else { \ - printf("[F]"); \ + printf("\033[31m[F]\033[0m"); \ printf("\n\n%s - %i: ", __FILE__, __LINE__); \ printf("%d", (a)); printf(" != "); printf("1"); printf("\n"); \ return 0; } \ @@ -57,7 +57,7 @@ if ((a) == false) { \ printf("."); \ } else { \ - printf("[F]"); \ + printf("\033[31m[F]\033[0m"); \ printf("\n\n%s - %i: ", __FILE__, __LINE__); \ printf("%d", (a)); printf(" != "); printf("1"); printf("\n"); \ 
return 0; } \ diff --git a/tests/Stdlib/SIMD/SIMD_F32Test.cpp b/tests/Stdlib/SIMD/SIMD_F32Test.cpp new file mode 100644 index 0000000..15b4e6f --- /dev/null +++ b/tests/Stdlib/SIMD/SIMD_F32Test.cpp @@ -0,0 +1,386 @@ +/** + * Jingga + * + * @package Test + * @copyright Dennis Eichhorn + * @license OMS License 1.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#include +#include + +#include "../../../Utils/TestUtils.h" +#include "../../../Stdlib/SIMD/SIMD_F32.h" +#include "../../../Stdlib/SIMD/SIMD_Helper.h" + +float* a_array_4 = (float *) aligned_alloc(32, 4 * sizeof(float)); +float* b_array_4 = (float *) aligned_alloc(32, 4 * sizeof(float)); +float* expected_array_4 = (float *) aligned_alloc(32, 4 * sizeof(float)); +float* result_array_4 = (float *) aligned_alloc(32, 4 * sizeof(float)); + +float* a_array_8 = (float *) aligned_alloc(32, 8 * sizeof(float)); +float* b_array_8 = (float *) aligned_alloc(32, 8 * sizeof(float)); +float* expected_array_8 = (float *) aligned_alloc(32, 8 * sizeof(float)); +float* result_array_8 = (float *) aligned_alloc(32, 8 * sizeof(float)); + +float* a_array_16 = (float *) aligned_alloc(32, 16 * sizeof(float)); +float* b_array_16 = (float *) aligned_alloc(32, 16 * sizeof(float)); +float* expected_array_16 = (float *) aligned_alloc(32, 16 * sizeof(float)); +float* result_array_16 = (float *) aligned_alloc(32, 16 * sizeof(float)); + +int test_operator_plus(); +int test_operator_minus(); +int test_operator_mul(); + +int main(int argc, char** argv) +{ + printf("SIMD_F32:\n"); + + test_operator_plus(); + test_operator_minus(); + test_operator_mul(); + + printf("\n\n"); + + return 0; +} + +int test_operator_plus() +{ + printf("\noperator+:\n"); + printf("[4]: "); + if (!Stdlib::SIMD::is_avx_supported()) { + printf("[\033[33mNot supported\033[0m]"); + + return 0; + } + + a_array_4[0] = 0.0f; a_array_4[1] = 1.0f; a_array_4[2] = 2.0f; a_array_4[3] = 3.0f; + b_array_4[0] = 0.0f; b_array_4[1] = 1.0f; b_array_4[2] = 2.0f; b_array_4[3] = 3.0f; 
+ + expected_array_4[0] = 0.0f; expected_array_4[1] = 2.0f; expected_array_4[2] = 4.0f; expected_array_4[3] = 6.0f; + Stdlib::SIMD::f32_4_simd expected_simd_4 = Stdlib::SIMD::load_f32_4_simd(expected_array_4); + + Stdlib::SIMD::f32_4_simd a_simd_4 = Stdlib::SIMD::load_f32_4_simd(a_array_4); + Stdlib::SIMD::f32_4_simd b_simd_4 = Stdlib::SIMD::load_f32_4_simd(b_array_4); + + Stdlib::SIMD::f32_4_simd result_simd_4 = a_simd_4 + b_simd_4; + Stdlib::SIMD::unload_f32_4_simd(result_simd_4, result_array_4); + + ASSERT_EQUALS_WITH_DELTA(result_array_4[0], expected_array_4[0], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_4[1], expected_array_4[1], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_4[2], expected_array_4[2], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_4[3], expected_array_4[3], 0.01, "%f", "%f"); + ASSERT_TRUE(Stdlib::SIMD::all_true(result_simd_4 == expected_simd_4)); + + printf("\n[8]: "); + + if (!Stdlib::SIMD::is_avx256_supported()) { + printf("[\033[33mNot supported\033[0m]"); + + return 0; + } + + a_array_8[0] = 0.0f; a_array_8[1] = 1.0f; a_array_8[2] = 2.0f; a_array_8[3] = 3.0f; + a_array_8[4] = 0.0f; a_array_8[5] = 1.0f; a_array_8[6] = 2.0f; a_array_8[7] = 3.0f; + + b_array_8[0] = 0.0f; b_array_8[1] = 1.0f; b_array_8[2] = 2.0f; b_array_8[3] = 3.0f; + b_array_8[4] = 0.0f; b_array_8[5] = 1.0f; b_array_8[6] = 2.0f; b_array_8[7] = 3.0f; + + expected_array_8[0] = 0.0f; expected_array_8[1] = 2.0f; expected_array_8[2] = 4.0f; expected_array_8[3] = 6.0f; + expected_array_8[4] = 0.0f; expected_array_8[5] = 2.0f; expected_array_8[6] = 4.0f; expected_array_8[7] = 6.0f; + Stdlib::SIMD::f32_8_simd expected_simd_8 = Stdlib::SIMD::load_f32_8_simd(expected_array_8); + + Stdlib::SIMD::f32_8_simd a_simd_8 = Stdlib::SIMD::load_f32_8_simd(a_array_8); + Stdlib::SIMD::f32_8_simd b_simd_8 = Stdlib::SIMD::load_f32_8_simd(b_array_8); + + Stdlib::SIMD::f32_8_simd result_simd_8 = a_simd_8 + b_simd_8; + 
Stdlib::SIMD::unload_f32_8_simd(result_simd_8, result_array_8); + + ASSERT_EQUALS_WITH_DELTA(result_array_8[0], expected_array_8[0], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[1], expected_array_8[1], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[2], expected_array_8[2], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[3], expected_array_8[3], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[4], expected_array_8[4], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[5], expected_array_8[5], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[6], expected_array_8[6], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[7], expected_array_8[7], 0.01, "%f", "%f"); + ASSERT_TRUE(Stdlib::SIMD::all_true(result_simd_8 == expected_simd_8)); + + printf("\n[16]: "); + + if (!Stdlib::SIMD::is_avx512_supported()) { + printf("[\033[33mNot supported\033[0m]"); + + return 0; + } + + a_array_16[0] = 0.0f; a_array_16[1] = 1.0f; a_array_16[2] = 2.0f; a_array_16[3] = 3.0f; + a_array_16[4] = 0.0f; a_array_16[5] = 1.0f; a_array_16[6] = 2.0f; a_array_16[7] = 3.0f; + a_array_16[8] = 0.0f; a_array_16[9] = 1.0f; a_array_16[10] = 2.0f; a_array_16[11] = 3.0f; + a_array_16[12] = 0.0f; a_array_16[13] = 1.0f; a_array_16[14] = 2.0f; a_array_16[15] = 3.0f; + + b_array_16[0] = 0.0f; b_array_16[1] = 1.0f; b_array_16[2] = 2.0f; b_array_16[3] = 3.0f; + b_array_16[4] = 0.0f; b_array_16[5] = 1.0f; b_array_16[6] = 2.0f; b_array_16[7] = 3.0f; + b_array_16[8] = 0.0f; b_array_16[9] = 1.0f; b_array_16[10] = 2.0f; b_array_16[11] = 3.0f; + b_array_16[12] = 0.0f; b_array_16[13] = 1.0f; b_array_16[14] = 2.0f; b_array_16[15] = 3.0f; + + expected_array_16[0] = 0.0f; expected_array_16[1] = 2.0f; expected_array_16[2] = 4.0f; expected_array_16[3] = 6.0f; + expected_array_16[4] = 0.0f; expected_array_16[5] = 2.0f; expected_array_16[6] = 4.0f; expected_array_16[7] = 6.0f; + expected_array_16[8] = 0.0f; expected_array_16[9] = 2.0f; 
expected_array_16[10] = 4.0f; expected_array_16[11] = 6.0f; + expected_array_16[12] = 0.0f; expected_array_16[13] = 2.0f; expected_array_16[14] = 4.0f; expected_array_16[15] = 6.0f; + Stdlib::SIMD::f32_16_simd expected_simd_16 = Stdlib::SIMD::load_f32_16_simd(expected_array_16); + + Stdlib::SIMD::f32_16_simd a_simd_16 = Stdlib::SIMD::load_f32_16_simd(a_array_16); + Stdlib::SIMD::f32_16_simd b_simd_16 = Stdlib::SIMD::load_f32_16_simd(b_array_16); + + Stdlib::SIMD::f32_16_simd result_simd_16 = a_simd_16 + b_simd_16; + Stdlib::SIMD::unload_f32_16_simd(result_simd_16, result_array_16); + + ASSERT_EQUALS_WITH_DELTA(result_array_16[0], expected_array_16[0], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[1], expected_array_16[1], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[2], expected_array_16[2], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[3], expected_array_16[3], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[4], expected_array_16[4], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[5], expected_array_16[5], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[6], expected_array_16[6], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[7], expected_array_16[7], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[8], expected_array_16[8], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[9], expected_array_16[9], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[10], expected_array_16[10], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[11], expected_array_16[11], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[12], expected_array_16[12], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[13], expected_array_16[13], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[14], expected_array_16[14], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[15], expected_array_16[15], 0.01, 
"%f", "%f"); + ASSERT_TRUE(Stdlib::SIMD::all_true(result_simd_16 == expected_simd_16)); + + return 0; +} + +int test_operator_minus() +{ + printf("\noperator-:\n"); + printf("[4]: "); + if (!Stdlib::SIMD::is_avx_supported()) { + printf("[\033[33mNot supported\033[0m]"); + + return 0; + } + + a_array_4[0] = 0.0f; a_array_4[1] = 1.0f; a_array_4[2] = 2.0f; a_array_4[3] = 3.0f; + b_array_4[0] = 1.0f; b_array_4[1] = 1.0f; b_array_4[2] = 1.0f; b_array_4[3] = 1.0f; + + expected_array_4[0] = -1.0f; expected_array_4[1] = 0.0f; expected_array_4[2] = 1.0f; expected_array_4[3] = 2.0f; + Stdlib::SIMD::f32_4_simd expected_simd_4 = Stdlib::SIMD::load_f32_4_simd(expected_array_4); + + Stdlib::SIMD::f32_4_simd a_simd_4 = Stdlib::SIMD::load_f32_4_simd(a_array_4); + Stdlib::SIMD::f32_4_simd b_simd_4 = Stdlib::SIMD::load_f32_4_simd(b_array_4); + + Stdlib::SIMD::f32_4_simd result_simd_4 = a_simd_4 - b_simd_4; + Stdlib::SIMD::unload_f32_4_simd(result_simd_4, result_array_4); + + ASSERT_EQUALS_WITH_DELTA(result_array_4[0], expected_array_4[0], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_4[1], expected_array_4[1], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_4[2], expected_array_4[2], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_4[3], expected_array_4[3], 0.01, "%f", "%f"); + ASSERT_TRUE(Stdlib::SIMD::all_true(result_simd_4 == expected_simd_4)); + + printf("\n[8]: "); + + if (!Stdlib::SIMD::is_avx256_supported()) { + printf("[\033[33mNot supported\033[0m]"); + + return 0; + } + + a_array_8[0] = 0.0f; a_array_8[1] = 1.0f; a_array_8[2] = 2.0f; a_array_8[3] = 3.0f; + a_array_8[4] = 0.0f; a_array_8[5] = 1.0f; a_array_8[6] = 2.0f; a_array_8[7] = 3.0f; + + b_array_8[0] = 1.0f; b_array_8[1] = 1.0f; b_array_8[2] = 1.0f; b_array_8[3] = 1.0f; + b_array_8[4] = 1.0f; b_array_8[5] = 1.0f; b_array_8[6] = 1.0f; b_array_8[7] = 1.0f; + + expected_array_8[0] = -1.0f; expected_array_8[1] = 0.0f; expected_array_8[2] = 1.0f; expected_array_8[3] = 2.0f; + 
expected_array_8[4] = -1.0f; expected_array_8[5] = 0.0f; expected_array_8[6] = 1.0f; expected_array_8[7] = 2.0f; + Stdlib::SIMD::f32_8_simd expected_simd_8 = Stdlib::SIMD::load_f32_8_simd(expected_array_8); + + Stdlib::SIMD::f32_8_simd a_simd_8 = Stdlib::SIMD::load_f32_8_simd(a_array_8); + Stdlib::SIMD::f32_8_simd b_simd_8 = Stdlib::SIMD::load_f32_8_simd(b_array_8); + + Stdlib::SIMD::f32_8_simd result_simd_8 = a_simd_8 - b_simd_8; + Stdlib::SIMD::unload_f32_8_simd(result_simd_8, result_array_8); + + ASSERT_EQUALS_WITH_DELTA(result_array_8[0], expected_array_8[0], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[1], expected_array_8[1], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[2], expected_array_8[2], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[3], expected_array_8[3], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[4], expected_array_8[4], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[5], expected_array_8[5], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[6], expected_array_8[6], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[7], expected_array_8[7], 0.01, "%f", "%f"); + ASSERT_TRUE(Stdlib::SIMD::all_true(result_simd_8 == expected_simd_8)); + + printf("\n[16]: "); + + if (!Stdlib::SIMD::is_avx512_supported()) { + printf("[\033[33mNot supported\033[0m]"); + + return 0; + } + + a_array_16[0] = 0.0f; a_array_16[1] = 1.0f; a_array_16[2] = 2.0f; a_array_16[3] = 3.0f; + a_array_16[4] = 0.0f; a_array_16[5] = 1.0f; a_array_16[6] = 2.0f; a_array_16[7] = 3.0f; + a_array_16[8] = 0.0f; a_array_16[9] = 1.0f; a_array_16[10] = 2.0f; a_array_16[11] = 3.0f; + a_array_16[12] = 0.0f; a_array_16[13] = 1.0f; a_array_16[14] = 2.0f; a_array_16[15] = 3.0f; + + b_array_16[0] = 1.0f; b_array_16[1] = 1.0f; b_array_16[2] = 1.0f; b_array_16[3] = 1.0f; + b_array_16[4] = 1.0f; b_array_16[5] = 1.0f; b_array_16[6] = 1.0f; b_array_16[7] = 1.0f; + b_array_16[8] = 1.0f; b_array_16[9] = 1.0f; 
b_array_16[10] = 1.0f; b_array_16[11] = 1.0f; + b_array_16[12] = 1.0f; b_array_16[13] = 1.0f; b_array_16[14] = 1.0f; b_array_16[15] = 1.0f; + + expected_array_16[0] = -1.0f; expected_array_16[1] = 0.0f; expected_array_16[2] = 1.0f; expected_array_16[3] = 2.0f; + expected_array_16[4] = -1.0f; expected_array_16[5] = 0.0f; expected_array_16[6] = 1.0f; expected_array_16[7] = 2.0f; + expected_array_16[8] = -1.0f; expected_array_16[9] = 0.0f; expected_array_16[10] = 1.0f; expected_array_16[11] = 2.0f; + expected_array_16[12] = -1.0f; expected_array_16[13] = 0.0f; expected_array_16[14] = 1.0f; expected_array_16[15] = 2.0f; + Stdlib::SIMD::f32_16_simd expected_simd_16 = Stdlib::SIMD::load_f32_16_simd(expected_array_16); + + Stdlib::SIMD::f32_16_simd a_simd_16 = Stdlib::SIMD::load_f32_16_simd(a_array_16); + Stdlib::SIMD::f32_16_simd b_simd_16 = Stdlib::SIMD::load_f32_16_simd(b_array_16); + + Stdlib::SIMD::f32_16_simd result_simd_16 = a_simd_16 - b_simd_16; + Stdlib::SIMD::unload_f32_16_simd(result_simd_16, result_array_16); + + ASSERT_EQUALS_WITH_DELTA(result_array_16[0], expected_array_16[0], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[1], expected_array_16[1], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[2], expected_array_16[2], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[3], expected_array_16[3], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[4], expected_array_16[4], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[5], expected_array_16[5], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[6], expected_array_16[6], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[7], expected_array_16[7], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[8], expected_array_16[8], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[9], expected_array_16[9], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[10], expected_array_16[10], 0.01, "%f", 
"%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[11], expected_array_16[11], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[12], expected_array_16[12], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[13], expected_array_16[13], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[14], expected_array_16[14], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[15], expected_array_16[15], 0.01, "%f", "%f"); + ASSERT_TRUE(Stdlib::SIMD::all_true(result_simd_16 == expected_simd_16)); + + return 0; +} + +int test_operator_mul() +{ + printf("\noperator*:\n"); + printf("[4]: "); + if (!Stdlib::SIMD::is_avx_supported()) { + printf("[\033[33mNot supported\033[0m]"); + + return 0; + } + + a_array_4[0] = 0.0f; a_array_4[1] = 1.0f; a_array_4[2] = 2.0f; a_array_4[3] = 3.0f; + b_array_4[0] = 0.0f; b_array_4[1] = 1.0f; b_array_4[2] = 2.0f; b_array_4[3] = 3.0f; + + expected_array_4[0] = 0.0f; expected_array_4[1] = 1.0f; expected_array_4[2] = 4.0f; expected_array_4[3] = 9.0f; + Stdlib::SIMD::f32_4_simd expected_simd_4 = Stdlib::SIMD::load_f32_4_simd(expected_array_4); + + Stdlib::SIMD::f32_4_simd a_simd_4 = Stdlib::SIMD::load_f32_4_simd(a_array_4); + Stdlib::SIMD::f32_4_simd b_simd_4 = Stdlib::SIMD::load_f32_4_simd(b_array_4); + + Stdlib::SIMD::f32_4_simd result_simd_4 = a_simd_4 * b_simd_4; + Stdlib::SIMD::unload_f32_4_simd(result_simd_4, result_array_4); + + ASSERT_EQUALS_WITH_DELTA(result_array_4[0], expected_array_4[0], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_4[1], expected_array_4[1], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_4[2], expected_array_4[2], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_4[3], expected_array_4[3], 0.01, "%f", "%f"); + ASSERT_TRUE(Stdlib::SIMD::all_true(result_simd_4 == expected_simd_4)); + + printf("\n[8]: "); + + if (!Stdlib::SIMD::is_avx256_supported()) { + printf("[\033[33mNot supported\033[0m]"); + + return 0; + } + + a_array_8[0] = 0.0f; a_array_8[1] 
= 1.0f; a_array_8[2] = 2.0f; a_array_8[3] = 3.0f; + a_array_8[4] = 0.0f; a_array_8[5] = 1.0f; a_array_8[6] = 2.0f; a_array_8[7] = 3.0f; + + b_array_8[0] = 0.0f; b_array_8[1] = 1.0f; b_array_8[2] = 2.0f; b_array_8[3] = 3.0f; + b_array_8[4] = 0.0f; b_array_8[5] = 1.0f; b_array_8[6] = 2.0f; b_array_8[7] = 3.0f; + + expected_array_8[0] = 0.0f; expected_array_8[1] = 1.0f; expected_array_8[2] = 4.0f; expected_array_8[3] = 9.0f; + expected_array_8[4] = 0.0f; expected_array_8[5] = 1.0f; expected_array_8[6] = 4.0f; expected_array_8[7] = 9.0f; + Stdlib::SIMD::f32_8_simd expected_simd_8 = Stdlib::SIMD::load_f32_8_simd(expected_array_8); + + Stdlib::SIMD::f32_8_simd a_simd_8 = Stdlib::SIMD::load_f32_8_simd(a_array_8); + Stdlib::SIMD::f32_8_simd b_simd_8 = Stdlib::SIMD::load_f32_8_simd(b_array_8); + + Stdlib::SIMD::f32_8_simd result_simd_8 = a_simd_8 * b_simd_8; + Stdlib::SIMD::unload_f32_8_simd(result_simd_8, result_array_8); + + ASSERT_EQUALS_WITH_DELTA(result_array_8[0], expected_array_8[0], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[1], expected_array_8[1], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[2], expected_array_8[2], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[3], expected_array_8[3], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[4], expected_array_8[4], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[5], expected_array_8[5], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[6], expected_array_8[6], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_8[7], expected_array_8[7], 0.01, "%f", "%f"); + ASSERT_TRUE(Stdlib::SIMD::all_true(result_simd_8 == expected_simd_8)); + + printf("\n[16]: "); + + if (!Stdlib::SIMD::is_avx512_supported()) { + printf("[\033[33mNot supported\033[0m]"); + + return 0; + } + + a_array_16[0] = 0.0f; a_array_16[1] = 1.0f; a_array_16[2] = 2.0f; a_array_16[3] = 3.0f; + a_array_16[4] = 0.0f; a_array_16[5] = 1.0f; a_array_16[6] = 2.0f; a_array_16[7] 
= 3.0f; + a_array_16[8] = 0.0f; a_array_16[9] = 1.0f; a_array_16[10] = 2.0f; a_array_16[11] = 3.0f; + a_array_16[12] = 0.0f; a_array_16[13] = 1.0f; a_array_16[14] = 2.0f; a_array_16[15] = 3.0f; + + b_array_16[0] = 0.0f; b_array_16[1] = 1.0f; b_array_16[2] = 2.0f; b_array_16[3] = 3.0f; + b_array_16[4] = 0.0f; b_array_16[5] = 1.0f; b_array_16[6] = 2.0f; b_array_16[7] = 3.0f; + b_array_16[8] = 0.0f; b_array_16[9] = 1.0f; b_array_16[10] = 2.0f; b_array_16[11] = 3.0f; + b_array_16[12] = 0.0f; b_array_16[13] = 1.0f; b_array_16[14] = 2.0f; b_array_16[15] = 3.0f; + + expected_array_16[0] = 0.0f; expected_array_16[1] = 1.0f; expected_array_16[2] = 4.0f; expected_array_16[3] = 9.0f; + expected_array_16[4] = 0.0f; expected_array_16[5] = 1.0f; expected_array_16[6] = 4.0f; expected_array_16[7] = 9.0f; + expected_array_16[8] = 0.0f; expected_array_16[9] = 1.0f; expected_array_16[10] = 4.0f; expected_array_16[11] = 9.0f; + expected_array_16[12] = 0.0f; expected_array_16[13] = 1.0f; expected_array_16[14] = 4.0f; expected_array_16[15] = 9.0f; + Stdlib::SIMD::f32_16_simd expected_simd_16 = Stdlib::SIMD::load_f32_16_simd(expected_array_16); + + Stdlib::SIMD::f32_16_simd a_simd_16 = Stdlib::SIMD::load_f32_16_simd(a_array_16); + Stdlib::SIMD::f32_16_simd b_simd_16 = Stdlib::SIMD::load_f32_16_simd(b_array_16); + + Stdlib::SIMD::f32_16_simd result_simd_16 = a_simd_16 * b_simd_16; + Stdlib::SIMD::unload_f32_16_simd(result_simd_16, result_array_16); + + ASSERT_EQUALS_WITH_DELTA(result_array_16[0], expected_array_16[0], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[1], expected_array_16[1], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[2], expected_array_16[2], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[3], expected_array_16[3], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[4], expected_array_16[4], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[5], expected_array_16[5], 0.01, "%f", "%f"); + 
ASSERT_EQUALS_WITH_DELTA(result_array_16[6], expected_array_16[6], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[7], expected_array_16[7], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[8], expected_array_16[8], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[9], expected_array_16[9], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[10], expected_array_16[10], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[11], expected_array_16[11], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[12], expected_array_16[12], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[13], expected_array_16[13], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[14], expected_array_16[14], 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(result_array_16[15], expected_array_16[15], 0.01, "%f", "%f"); + ASSERT_TRUE(Stdlib::SIMD::all_true(result_simd_16 == expected_simd_16)); + + return 0; +} \ No newline at end of file diff --git a/tests/Stdlib/SIMD/SIMD_HelperTest.cpp b/tests/Stdlib/SIMD/SIMD_HelperTest.cpp new file mode 100644 index 0000000..dbb6347 --- /dev/null +++ b/tests/Stdlib/SIMD/SIMD_HelperTest.cpp @@ -0,0 +1,42 @@ +/** + * Jingga + * + * @package Test + * @copyright Dennis Eichhorn + * @license OMS License 1.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#include +#include + +#include "../../../Utils/TestUtils.h" +#include "../../../Stdlib/SIMD/SIMD_Helper.h" + + +int main(int argc, char** argv) +{ + printf("SIMD_Helper:\n"); + + if (Stdlib::SIMD::is_avx_supported()) { + printf("\nAVX is supported"); + } else { + printf("\033[33m\nAVX is NOT supported\033[0m"); + } + + if (Stdlib::SIMD::is_avx256_supported()) { + printf("\nAVX 256 is supported"); + } else { + printf("\033[33m\nAVX 256 is NOT supported\033[0m"); + } + + if (Stdlib::SIMD::is_avx512_supported()) { + printf("\nAVX 512 is supported"); + } else { + printf("\033[33m\nAVX 512 is NOT supported\033[0m"); + } + + 
printf("\n\n"); + + return 0; +} diff --git a/tests/Threads/ThreadPoolTest.cpp b/tests/Threads/ThreadPoolTest.cpp index 483408a..592cb16 100755 --- a/tests/Threads/ThreadPoolTest.cpp +++ b/tests/Threads/ThreadPoolTest.cpp @@ -8,6 +8,7 @@ * @link https://jingga.app */ #include +#include #include "../../Threads/Thread.h" #include "../../Utils/TestUtils.h" @@ -15,17 +16,19 @@ static const size_t num_threads = 4; static const size_t num_items = 10; +// increase value by 100 void worker(void *arg) { - int *val = (int *) arg; - int old = *val; + Threads::Job *job = (Threads::Job *) arg; + int *val = (int *) job->arg; *val += 100; - // printf("tid=%p, old=%d, val=%d\n", (void *) pthread_self(), old, *val); if (*val % 2) { sleep(1); } + + job->state = 1; } int main(int argc, char** argv) @@ -36,25 +39,33 @@ int main(int argc, char** argv) int i; Threads::ThreadPool *pool = Threads::pool_create(num_threads); int *vals = (int *) calloc(num_items, sizeof(int)); + Threads::Job **works = (Threads::Job **) calloc(num_items, sizeof(Threads::Job)); for (i = 0; i < num_items; ++i) { vals[i] = i; - Threads::pool_add_work(pool, worker, vals + i); + works[i] = Threads::pool_add_work(pool, worker, vals + i); } - Threads::pool_wait(pool); - sleep(1); + // @bug wait is not working as expected + // I thought wait works similarly to what the do/while construct below does + //Threads::pool_wait(pool); + + bool finished = false; + do { + finished = true; + for (i = 0; i < num_items; ++i) { + finished = finished && (works[i]->state == 1); + } + } while (!finished); bool test = true; for (i = 0; i < num_items; ++i) { - // printf("%d\n", vals[i]); - test = test && 100 + i == vals[i]; + ASSERT_EQUALS(vals[i], 100 + i, "%d", "%d"); } - ASSERT_EQUALS(test, true, "%d", "%d"); - free(vals); + free(works); Threads::pool_destroy(pool); printf("\n\n"); diff --git a/tests/test.sh b/tests/test.sh index 8635502..b2d3980 100755 --- a/tests/test.sh +++ b/tests/test.sh @@ -9,3 +9,7 @@ g++ 
$BASEDIR/Image/ImageUtilsTest.cpp -o $BASEDIR/Image/ImageUtilsTest && $BASED g++ $BASEDIR/Threads/ThreadPoolTest.cpp -o $BASEDIR/Threads/ThreadPoolTest && $BASEDIR/Threads/ThreadPoolTest && rm $BASEDIR/Threads/ThreadPoolTest # g++ $BASEDIR/Utils/WebUtilsTest.cpp -o $BASEDIR/Utils/WebUtilsTest -l curl -l xml2 -l libxml2 -I /usr/include/libxml2 -f permissive && $BASEDIR/Utils/WebUtilsTest && rm $BASEDIR/Utils/WebUtilsTest + +g++ $BASEDIR/Stdlib/SIMD/SIMD_HelperTest.cpp -o $BASEDIR/Stdlib/SIMD/SIMD_HelperTest && $BASEDIR/Stdlib/SIMD/SIMD_HelperTest && rm $BASEDIR/Stdlib/SIMD/SIMD_HelperTest + +g++ -mavx -msse -maes -msse3 -msse4.1 -mavx512f -mpclmul -mavx512dq -march=native $BASEDIR/Stdlib/SIMD/SIMD_F32Test.cpp -o $BASEDIR/Stdlib/SIMD/SIMD_F32Test && $BASEDIR/Stdlib/SIMD/SIMD_F32Test && rm $BASEDIR/Stdlib/SIMD/SIMD_F32Test From a605c4d1c692624be7ec1e727cc17f5551be5507 Mon Sep 17 00:00:00 2001 From: Dennis Eichhorn Date: Sat, 20 Apr 2024 19:53:54 +0000 Subject: [PATCH 3/7] update --- .github/workflows/main.yml | 2 +- Stdlib/Intrinsics.h | 1 + tests/Stdlib/IntrinsicsTest | Bin 0 -> 16320 bytes tests/Stdlib/IntrinsicsTest.cpp | 27 +++++++++++++++++++++++++++ tests/test.sh | 4 +++- 5 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 tests/Stdlib/IntrinsicsTest create mode 100644 tests/Stdlib/IntrinsicsTest.cpp diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1b8cd83..1b8d2f4 100755 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -54,7 +54,7 @@ jobs: run: | cp ./Build/Config/.clang-format ./.clang-format - name: Lint Code Base - uses: github/super-linter/slim@v5 + uses: github/super-linter/slim@v6.4.0 env: VALIDATE_ALL_CODEBASE: false VALIDATE_CLANG_FORMAT : true diff --git a/Stdlib/Intrinsics.h b/Stdlib/Intrinsics.h index c537534..dd1e5d0 100644 --- a/Stdlib/Intrinsics.h +++ b/Stdlib/Intrinsics.h @@ -14,6 +14,7 @@ #include #include #include +#include #include "Types.h" diff --git a/tests/Stdlib/IntrinsicsTest 
b/tests/Stdlib/IntrinsicsTest new file mode 100644 index 0000000000000000000000000000000000000000..af5bf529964b024ab43a26b6e0dcf7991dd06efb GIT binary patch literal 16320 zcmeHOe{fXQ6~0M`1_Tli6oJB`(rPuk2?R=MV-~`OP34y)REywaliiRVv%7KkEhJbm zEvPV&4z`_2XLLGF9i>z2;OJO!+F>a!f}>T2(PHZ#p_MkNXk(C0h`{zc_uljN<*}jD zAOEzocV_q8^PTgZd+)jLy?t-ry$|}9)fE;L2rfn9HbGiXnL@m5Silt_ykdbEhwqtU zhPVRoD15xy?35=8m4Ri6_A7!U-W00jkkb_`SaJ_Z5^tok;7Vm-DfBpqS4<|YJLPB^ z2ntKKTkW}|SqO)$(Bori=TXQV<%+kX+95S>x0cK!m3VtJZ;$4&6zs1h=aX_mp9MOf zPFVpP6_!q3tL8c75=9d%sSPR(r_A8L)7~o0+v}2Hgi|&uSg_>w?f_3uk@mQ};Avj1 z+Z$;fI&>ac(tK0#w1u1IS9#ik~Ea=~N$s z5BaEkdSeBWzX05YkF(SP$mM@4 zaF^2=dILV!=fH(g-?-cehT@@(;e;8AH!iPji$z0?fu^>QFpQ0nSky?Efw*ZH$~C8B z;=Z`qBI2=RG}r=Gb4S2v2}c8M;Vl#dhf1>!5($K(qCIIQ=&QLk5Et$7a1>nlMCj<5 zPqk)MTe_@%ajh}OJ;yyyaD8qS4Wbb9UV!gW_!Quq>Z%l=z5`V{PF(mTr-nzC7(jJESH2 zk4pb}zaeq+MgPuz|G`gc{09e%`~`>o$1a+a5a1ylpd{1NBKygF%3oJCnMLxJHU4eY zO;q*oJY|mccUQMS>3#oVCes^4vJMxohhDHAT&sQJ_icueDr>-Nz-&(0`M_!a&SVcp zaqD=B_ImG22-02rJ!t83(NynuF(v79TMu;?-_Ou21*Kp=j?anoU)LQ<_uqLmJ?J^O zsj9npH9OW>+!}^XVa5@TxPQW@H~sr{>5p#fE}p>-6D`hF484%$oB?Mr{l>c2(?ezB zhQ=Y*;GGB4Lu*s%&$WKebENbKsqHCqUB6Wv6VrmK*vCH8ixg=e3jPw$fKL z1;R8`=_j8bsDTtiZXBbK-h$hBLGEK}w?6BYB#2Ko|a!qQ_9*oXLJ(6JQg) zAAK%6@T5J+?|^j|KgL1sf!`&`4%>q~1AWyN755@%FY383+{F{yt&~)a3k}(HMvf&h6Ui+=9Jra{I`3&SUkk3Fq1NjW(Gmy_f zJ_G;P8OXj_Qp2o|ns{NA2sbAdjuF%Do?98YyKeooyDK6(3!Y7ZxM%a`&7M{>(&o86 z5D(lN@HDSn-rzAq2{YkoFoSr(<;jL=1nh2ZZx>_6%ucwAXhngtqgVycd;O;BAU5Q?%6G(pq+%>>mB} zhnHxwbZAM>PgHnq;I&E>DcvyRdJ(ME@09lAwS(OL42|>n@E%5v4|$akUXQBr>Ve{o zs?@n(c(j!I54HWA*7fjp^#7i))_XfbR_KRGua>K{+@NK8h-BSGg<7E0a>K+&X8DyQ1o54PQPa9j~Lrrx~Xg%?qt)eN-pTr5&z@%;it> zR*2QCWqSRob{8&(jq`cL{`II7;HWS2DumCcH4?wZIuDdph;O4p?ecla_zvkm z**dQne-@Qo{U2#uugW-+#I5q*0*<0&t~f`e|Fyz-TpW{>tF3kP9O*;pKSl8Q#rO~^ zxz<&WM?!Mpw*YrB*(QH1aa?7*HW1aNz^_N%zNt@m`CkU`2H?5k2Z5L2YQ^)y(e53= ze=FgI69ix9*nj&7{*UYU&VJGhcNddw@ z$ED-^TF2o(-D-f-c%5->2kz2+vx|_7V@FiI3;4uB{3IGWkGL*f0Yd z1%QN^Y-w>fi|o35!;Bcsv@$;d&tS~h*cNLFv>8D&7Ec&~WQS;uMcUg!W+>>c$n8NZ 
z^20_T9uIUHA^eNkDO%!zNXQ5#Bau!}Y?=X2bC_$6|0@lnZuO$&KEt|j=V>Ib* zkH^|WakJ9~H6_E?2^0=$Xwl;OaypWwdut%kD%`=&D4I~xj4R8gP&^TiMTaQ{Z1GTA zfC#$Q-ewB7?4}!4_r@4hsw6_q!fl2+pvq}<$76Dy-Jw=}Ft-K~jWrcR9o34#bu<-- zgb_^9F$2Pl(5|=LgR;uFM&&T(b+AMv!PVxqP=w6QbBkfxwUn+e6 zrZgeuqhjYf_pj0a_4dNoZN})nz|QCU5|%D@)Y8en6Zqe8FUbAp`(>8gKHo1p`yT+G z?v?4yfsS2%4&~4vdCq@%4@29uIj7 z(W->?&3u+ufNbXruXe3alH1_`%wwJ2ITFV0^L=QU<~#kVEqj@u+t==viTSr!h^jE( zF8SE8yLxp%mp))0~!MR=brQ7se{6EtV8_HA%!J&$O E0D=X^VE_OC literal 0 HcmV?d00001 diff --git a/tests/Stdlib/IntrinsicsTest.cpp b/tests/Stdlib/IntrinsicsTest.cpp new file mode 100644 index 0000000..e948cea --- /dev/null +++ b/tests/Stdlib/IntrinsicsTest.cpp @@ -0,0 +1,27 @@ +/** + * Jingga + * + * @package Test + * @copyright Dennis Eichhorn + * @license OMS License 1.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#include +#include +#include + +#include "../../Utils/TestUtils.h" +#include "../../Stdlib/Intrinsics.h" + +int main(int argc, char** argv) +{ + printf("Intrinsics:\n\n"); + + ASSERT_EQUALS_WITH_DELTA(Stdlib::Intrinsics::sqrt(1.234f), sqrt(1.234f), 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA(Stdlib::Intrinsics::round(1.234f), round(1.234f), 0.01, "%f", "%f"); + + printf("\n\n"); + + return 0; +} \ No newline at end of file diff --git a/tests/test.sh b/tests/test.sh index b2d3980..61befc3 100755 --- a/tests/test.sh +++ b/tests/test.sh @@ -12,4 +12,6 @@ g++ $BASEDIR/Threads/ThreadPoolTest.cpp -o $BASEDIR/Threads/ThreadPoolTest && $B g++ $BASEDIR/Stdlib/SIMD/SIMD_HelperTest.cpp -o $BASEDIR/Stdlib/SIMD/SIMD_HelperTest && $BASEDIR/Stdlib/SIMD/SIMD_HelperTest && rm $BASEDIR/Stdlib/SIMD/SIMD_HelperTest -g++ -mavx -msse -maes -msse3 -msse4.1 -mavx512f -mpclmul -mavx512dq -march=native $BASEDIR/Stdlib/SIMD/SIMD_F32Test.cpp -o $BASEDIR/Stdlib/SIMD/SIMD_F32Test && $BASEDIR/Stdlib/SIMD/SIMD_F32Test && rm $BASEDIR/Stdlib/SIMD/SIMD_F32Test +g++ -mavx -maes -msse -msse2 -msse3 -msse4.1 -mavx512f -mpclmul -mavx512dq -march=native 
$BASEDIR/Stdlib/IntrinsicsTest.cpp -o $BASEDIR/Stdlib/IntrinsicsTest && $BASEDIR/Stdlib/IntrinsicsTest && rm $BASEDIR/Stdlib/IntrinsicsTest + +g++ -mavx -maes -msse -msse2 -msse3 -msse4.1 -mavx512f -mpclmul -mavx512dq -march=native $BASEDIR/Stdlib/SIMD/SIMD_F32Test.cpp -o $BASEDIR/Stdlib/SIMD/SIMD_F32Test && $BASEDIR/Stdlib/SIMD/SIMD_F32Test && rm $BASEDIR/Stdlib/SIMD/SIMD_F32Test From 42fc9c915cd778bb807578736dbb4e099b3da640 Mon Sep 17 00:00:00 2001 From: Dennis Eichhorn Date: Sat, 20 Apr 2024 19:55:58 +0000 Subject: [PATCH 4/7] fix inspection --- .github/workflows/main.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1b8d2f4..2fb1d5a 100755 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -33,9 +33,6 @@ jobs: codestyle-tests: runs-on: ubuntu-latest if: "!contains(github.event.head_commit.message, 'NO_CI')" - strategy: - fail-fast: false - max-parallel: 3 steps: - name: Checkout Repository uses: actions/checkout@main @@ -54,7 +51,7 @@ jobs: run: | cp ./Build/Config/.clang-format ./.clang-format - name: Lint Code Base - uses: github/super-linter/slim@v6.4.0 + uses: super-linter/super-linter@v6.4.0 env: VALIDATE_ALL_CODEBASE: false VALIDATE_CLANG_FORMAT : true From 6d3dda5a2f10f450cf0e155185600d67ed74ad74 Mon Sep 17 00:00:00 2001 From: Dennis Eichhorn Date: Wed, 24 Apr 2024 03:20:47 +0000 Subject: [PATCH 5/7] more tests and some fixes --- Input/XInput.h | 44 ++++++++++++++++++++++++++++++++ Stdlib/Intrinsics.h | 3 +-- Utils/TestUtils.h | 10 ++++++++ tests/Stdlib/IntrinsicsTest | Bin 16320 -> 0 bytes tests/Stdlib/IntrinsicsTest.cpp | 7 ++++- tests/test.sh | 4 +-- 6 files changed, 63 insertions(+), 5 deletions(-) create mode 100644 Input/XInput.h delete mode 100644 tests/Stdlib/IntrinsicsTest diff --git a/Input/XInput.h b/Input/XInput.h new file mode 100644 index 0000000..2828fa4 --- /dev/null +++ b/Input/XInput.h @@ -0,0 +1,44 @@ +/** + * Karaka + * + * @package 
Stdlib + * @copyright Dennis Eichhorn + * @license OMS License 1.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef INPUT_XINPUT_H +#define INPUT_XINPUT_H + +#ifdef _WIN32 + #include +#else + #include +#endif + +#include "../Stdlib/Types.h" + +uint32 find_joysticks() +{ + +} + +void destory_joysticks() +{ + +} + +void handle_controller_input() +{ + + for (uint32 controller_index = 0; controller_index < XUSER_MAX_COUNT; ++controller_index) { + XINPUT_STATE controller_state; + if (XInputGetState(controller_index, &controller_state) == ERROR_SUCCESS) { + + } else { + + } + } +} + +#endif \ No newline at end of file diff --git a/Stdlib/Intrinsics.h b/Stdlib/Intrinsics.h index dd1e5d0..f9fe798 100644 --- a/Stdlib/Intrinsics.h +++ b/Stdlib/Intrinsics.h @@ -11,10 +11,9 @@ #define STDLIB_INTRINSICS_H #include -#include #include #include -#include +#include #include "Types.h" diff --git a/Utils/TestUtils.h b/Utils/TestUtils.h index b039bdc..870739e 100755 --- a/Utils/TestUtils.h +++ b/Utils/TestUtils.h @@ -23,6 +23,16 @@ return 0; } \ }) +#define ASSERT_NOT_EQUALS(a, b, t1, t2) ({\ + if ((a) != (b)) { \ + printf("."); \ + } else { \ + printf("\033[31m[F]\033[0m"); \ + printf("\n\n%s - %i: ", __FILE__, __LINE__); \ + printf((t1), (a)); printf(" == "); printf((t2), (b)); printf("\n"); \ + return 0; } \ + }) + #define ASSERT_EQUALS_WITH_DELTA(a, b, delta, t1, t2) ({\ if (oms_abs((a) - (b)) <= (delta)) { \ printf("."); \ diff --git a/tests/Stdlib/IntrinsicsTest b/tests/Stdlib/IntrinsicsTest deleted file mode 100644 index af5bf529964b024ab43a26b6e0dcf7991dd06efb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16320 zcmeHOe{fXQ6~0M`1_Tli6oJB`(rPuk2?R=MV-~`OP34y)REywaliiRVv%7KkEhJbm zEvPV&4z`_2XLLGF9i>z2;OJO!+F>a!f}>T2(PHZ#p_MkNXk(C0h`{zc_uljN<*}jD zAOEzocV_q8^PTgZd+)jLy?t-ry$|}9)fE;L2rfn9HbGiXnL@m5Silt_ykdbEhwqtU zhPVRoD15xy?35=8m4Ri6_A7!U-W00jkkb_`SaJ_Z5^tok;7Vm-DfBpqS4<|YJLPB^ 
z2ntKKTkW}|SqO)$(Bori=TXQV<%+kX+95S>x0cK!m3VtJZ;$4&6zs1h=aX_mp9MOf zPFVpP6_!q3tL8c75=9d%sSPR(r_A8L)7~o0+v}2Hgi|&uSg_>w?f_3uk@mQ};Avj1 z+Z$;fI&>ac(tK0#w1u1IS9#ik~Ea=~N$s z5BaEkdSeBWzX05YkF(SP$mM@4 zaF^2=dILV!=fH(g-?-cehT@@(;e;8AH!iPji$z0?fu^>QFpQ0nSky?Efw*ZH$~C8B z;=Z`qBI2=RG}r=Gb4S2v2}c8M;Vl#dhf1>!5($K(qCIIQ=&QLk5Et$7a1>nlMCj<5 zPqk)MTe_@%ajh}OJ;yyyaD8qS4Wbb9UV!gW_!Quq>Z%l=z5`V{PF(mTr-nzC7(jJESH2 zk4pb}zaeq+MgPuz|G`gc{09e%`~`>o$1a+a5a1ylpd{1NBKygF%3oJCnMLxJHU4eY zO;q*oJY|mccUQMS>3#oVCes^4vJMxohhDHAT&sQJ_icueDr>-Nz-&(0`M_!a&SVcp zaqD=B_ImG22-02rJ!t83(NynuF(v79TMu;?-_Ou21*Kp=j?anoU)LQ<_uqLmJ?J^O zsj9npH9OW>+!}^XVa5@TxPQW@H~sr{>5p#fE}p>-6D`hF484%$oB?Mr{l>c2(?ezB zhQ=Y*;GGB4Lu*s%&$WKebENbKsqHCqUB6Wv6VrmK*vCH8ixg=e3jPw$fKL z1;R8`=_j8bsDTtiZXBbK-h$hBLGEK}w?6BYB#2Ko|a!qQ_9*oXLJ(6JQg) zAAK%6@T5J+?|^j|KgL1sf!`&`4%>q~1AWyN755@%FY383+{F{yt&~)a3k}(HMvf&h6Ui+=9Jra{I`3&SUkk3Fq1NjW(Gmy_f zJ_G;P8OXj_Qp2o|ns{NA2sbAdjuF%Do?98YyKeooyDK6(3!Y7ZxM%a`&7M{>(&o86 z5D(lN@HDSn-rzAq2{YkoFoSr(<;jL=1nh2ZZx>_6%ucwAXhngtqgVycd;O;BAU5Q?%6G(pq+%>>mB} zhnHxwbZAM>PgHnq;I&E>DcvyRdJ(ME@09lAwS(OL42|>n@E%5v4|$akUXQBr>Ve{o zs?@n(c(j!I54HWA*7fjp^#7i))_XfbR_KRGua>K{+@NK8h-BSGg<7E0a>K+&X8DyQ1o54PQPa9j~Lrrx~Xg%?qt)eN-pTr5&z@%;it> zR*2QCWqSRob{8&(jq`cL{`II7;HWS2DumCcH4?wZIuDdph;O4p?ecla_zvkm z**dQne-@Qo{U2#uugW-+#I5q*0*<0&t~f`e|Fyz-TpW{>tF3kP9O*;pKSl8Q#rO~^ zxz<&WM?!Mpw*YrB*(QH1aa?7*HW1aNz^_N%zNt@m`CkU`2H?5k2Z5L2YQ^)y(e53= ze=FgI69ix9*nj&7{*UYU&VJGhcNddw@ z$ED-^TF2o(-D-f-c%5->2kz2+vx|_7V@FiI3;4uB{3IGWkGL*f0Yd z1%QN^Y-w>fi|o35!;Bcsv@$;d&tS~h*cNLFv>8D&7Ec&~WQS;uMcUg!W+>>c$n8NZ z^20_T9uIUHA^eNkDO%!zNXQ5#Bau!}Y?=X2bC_$6|0@lnZuO$&KEt|j=V>Ib* zkH^|WakJ9~H6_E?2^0=$Xwl;OaypWwdut%kD%`=&D4I~xj4R8gP&^TiMTaQ{Z1GTA zfC#$Q-ewB7?4}!4_r@4hsw6_q!fl2+pvq}<$76Dy-Jw=}Ft-K~jWrcR9o34#bu<-- zgb_^9F$2Pl(5|=LgR;uFM&&T(b+AMv!PVxqP=w6QbBkfxwUn+e6 zrZgeuqhjYf_pj0a_4dNoZN})nz|QCU5|%D@)Y8en6Zqe8FUbAp`(>8gKHo1p`yT+G z?v?4yfsS2%4&~4vdCq@%4@29uIj7 z(W->?&3u+ufNbXruXe3alH1_`%wwJ2ITFV0^L=QU<~#kVEqj@u+t==viTSr!h^jE( 
zF8SE8yLxp%mp))0~!MR=brQ7se{6EtV8_HA%!J&$O E0D=X^VE_OC diff --git a/tests/Stdlib/IntrinsicsTest.cpp b/tests/Stdlib/IntrinsicsTest.cpp index e948cea..35b5d40 100644 --- a/tests/Stdlib/IntrinsicsTest.cpp +++ b/tests/Stdlib/IntrinsicsTest.cpp @@ -19,7 +19,12 @@ int main(int argc, char** argv) printf("Intrinsics:\n\n"); ASSERT_EQUALS_WITH_DELTA(Stdlib::Intrinsics::sqrt(1.234f), sqrt(1.234f), 0.01, "%f", "%f"); - ASSERT_EQUALS_WITH_DELTA(Stdlib::Intrinsics::round(1.234f), round(1.234f), 0.01, "%f", "%f"); + ASSERT_EQUALS_WITH_DELTA((int) Stdlib::Intrinsics::round_to_int(1.234f), (int) roundf(1.234f), 0.01, "%d", "%d"); + + //ASSERT_EQUALS_WITH_DELTA(Stdlib::Intrinsics::floor(1.234f), 1.0f, 0.01, "%f", "%f"); + //ASSERT_EQUALS_WITH_DELTA(Stdlib::Intrinsics::round(1.234f), roundf(1.234f), 0.01, "%f", "%f"); + + ASSERT_NOT_EQUALS(Stdlib::Intrinsics::hash(123456), Stdlib::Intrinsics::hash(654321), "%d", "%d"); printf("\n\n"); diff --git a/tests/test.sh b/tests/test.sh index 61befc3..1be5c32 100755 --- a/tests/test.sh +++ b/tests/test.sh @@ -12,6 +12,6 @@ g++ $BASEDIR/Threads/ThreadPoolTest.cpp -o $BASEDIR/Threads/ThreadPoolTest && $B g++ $BASEDIR/Stdlib/SIMD/SIMD_HelperTest.cpp -o $BASEDIR/Stdlib/SIMD/SIMD_HelperTest && $BASEDIR/Stdlib/SIMD/SIMD_HelperTest && rm $BASEDIR/Stdlib/SIMD/SIMD_HelperTest -g++ -mavx -maes -msse -msse2 -msse3 -msse4.1 -mavx512f -mpclmul -mavx512dq -march=native $BASEDIR/Stdlib/IntrinsicsTest.cpp -o $BASEDIR/Stdlib/IntrinsicsTest && $BASEDIR/Stdlib/IntrinsicsTest && rm $BASEDIR/Stdlib/IntrinsicsTest +g++ -maes -msse4.2 -mavx512f -mpclmul -mavx512dq -march=native $BASEDIR/Stdlib/IntrinsicsTest.cpp -o $BASEDIR/Stdlib/IntrinsicsTest && $BASEDIR/Stdlib/IntrinsicsTest && rm $BASEDIR/Stdlib/IntrinsicsTest -g++ -mavx -maes -msse -msse2 -msse3 -msse4.1 -mavx512f -mpclmul -mavx512dq -march=native $BASEDIR/Stdlib/SIMD/SIMD_F32Test.cpp -o $BASEDIR/Stdlib/SIMD/SIMD_F32Test && $BASEDIR/Stdlib/SIMD/SIMD_F32Test && rm $BASEDIR/Stdlib/SIMD/SIMD_F32Test +g++ -maes 
-msse4.2 -mavx512f -mpclmul -mavx512dq -march=native $BASEDIR/Stdlib/SIMD/SIMD_F32Test.cpp -o $BASEDIR/Stdlib/SIMD/SIMD_F32Test && $BASEDIR/Stdlib/SIMD/SIMD_F32Test && rm $BASEDIR/Stdlib/SIMD/SIMD_F32Test From 146dc9afdcbcf8543d5123463ad1a672c1c52ff2 Mon Sep 17 00:00:00 2001 From: Dennis Eichhorn Date: Wed, 24 Apr 2024 17:50:47 +0000 Subject: [PATCH 6/7] fix style --- .github/workflows/main.yml | 33 +- Application/ApplicationAbstract.h | 4 +- Hash/MD5.h | 72 ++-- Hash/MeowHash.h | 435 ++++++++++++------------ Hash/SHA256.h | 111 ++++--- Image/BillDetection.h | 14 +- Image/Diff.h | 8 +- Image/ImageUtils.h | 7 +- Image/Kernel.h | 56 +--- Image/Skew.h | 7 +- Image/Thresholding.h | 8 +- Input/XInput.h | 13 +- Math/Matrix/MatrixInt64.h | 3 +- Math/Matrix/VectorFloat32.h | 2 +- Math/Matrix/VectorInt32.h | 2 +- Math/Matrix/VectorInt64.h | 2 +- Router/Router.h | 8 +- Stdlib/HashTable.h | 31 +- Stdlib/Intrinsics.h | 50 +-- Stdlib/SIMD/SIMD_F32.h | 434 ++++++++---------------- Stdlib/SIMD/SIMD_Helper.h | 22 +- Stdlib/SIMD/SIMD_I32.h | 460 ++++++++------------------ Stdlib/Types.h | 2 +- Utils/ApplicationUtils.h | 27 +- Utils/ArrayUtils.h | 53 ++- Utils/ColorUtils.h | 25 +- Utils/FileUtils.h | 87 +++-- Utils/Intrinsics.h | 30 +- Utils/Rng/StringUtils.h | 13 +- Utils/StringUtils.h | 64 ++-- Utils/TestUtils.h | 116 ++++--- tests/Image/ImageUtilsTest.cpp | 4 +- tests/Stdlib/IntrinsicsTest.cpp | 10 +- tests/Stdlib/SIMD/SIMD_F32Test.cpp | 343 ++++++++++++++----- tests/Stdlib/SIMD/SIMD_HelperTest.cpp | 5 +- tests/Threads/ThreadPoolTest.cpp | 16 +- tests/Utils/WebUtilsTest.cpp | 4 +- 37 files changed, 1187 insertions(+), 1394 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2fb1d5a..9a7cfc9 100755 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -30,30 +30,9 @@ jobs: run: | chmod +x ./tests/test.sh ./tests/test.sh - codestyle-tests: - runs-on: ubuntu-latest - if: "!contains(github.event.head_commit.message, 'NO_CI')" - 
steps: - - name: Checkout Repository - uses: actions/checkout@main - with: - fetch-depth: 0 - submodules: recursive - token: ${{ secrets.GH_TOKEN }} - - name: Checkout Build Repository - uses: actions/checkout@main - with: - fetch-depth: 1 - ref: develop - repository: Karaka-Management/Build - path: Build - - name: Copy config file - run: | - cp ./Build/Config/.clang-format ./.clang-format - - name: Lint Code Base - uses: super-linter/super-linter@v6.4.0 - env: - VALIDATE_ALL_CODEBASE: false - VALIDATE_CLANG_FORMAT : true - DEFAULT_BRANCH: develop - GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} + general_module_workflow_c: + uses: Karaka-Management/Karaka/.github/workflows/c_template.yml@develop + secrets: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GH_PAT: ${{ secrets.GH_PAT }} + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} \ No newline at end of file diff --git a/Application/ApplicationAbstract.h b/Application/ApplicationAbstract.h index 1e89a62..d7d5fd5 100755 --- a/Application/ApplicationAbstract.h +++ b/Application/ApplicationAbstract.h @@ -14,8 +14,8 @@ #include #include "../DataStorage/Database/Connection/ConnectionAbstract.h" -#include "../Utils/Parser/Json.h" #include "../Threads/Thread.h" +#include "../Utils/Parser/Json.h" namespace Application { @@ -24,6 +24,6 @@ namespace Application nlohmann::json config; Threads::ThreadPool *pool; } ApplicationAbstract; -} +} // namespace Application #endif diff --git a/Hash/MD5.h b/Hash/MD5.h index 4b05ab6..79c3744 100755 --- a/Hash/MD5.h +++ b/Hash/MD5.h @@ -1,9 +1,9 @@ #ifndef HASH_MD5_H #define HASH_MD5_H -#include -#include #include +#include +#include // https://www.rfc-editor.org/rfc/rfc1321 @@ -12,19 +12,17 @@ #define H(x, y, z) ((x) ^ (y) ^ (z)) #define I(x, y, z) ((y) ^ ((x) | ~(z))) -#define ROUND_OP(f, a, b, c, d, x, t, s) \ - (a) += f((b), (c), (d)) + (x) + (t); \ - (a) = (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s)))); \ +#define ROUND_OP(f, a, b, c, d, x, t, s) \ + (a) += f((b), (c), (d)) + (x) + (t); \ + (a) 
= (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s)))); \ (a) += (b); -#define SET_BLOCK(n) \ - (ctx->block[(n)] = \ - (uint32_t)ptr[(n) * 4] | \ - ((uint32_t)ptr[(n) * 4 + 1] << 8) | \ - ((uint32_t)ptr[(n) * 4 + 2] << 16) | \ - ((uint32_t)ptr[(n) * 4 + 3] << 24)) +#define SET_BLOCK(n) \ + (ctx->block[(n)] = (uint32_t) ptr[(n) * 4] | ((uint32_t) ptr[(n) * 4 + 1] << 8) | \ + ((uint32_t) ptr[(n) * 4 + 2] << 16) | ((uint32_t) ptr[(n) * 4 + 3] << 24)) -namespace Hash { +namespace Hash +{ typedef struct { uint32_t lo, hi; uint32_t a, b, c, d; @@ -173,13 +171,13 @@ namespace Hash { } memcpy(&ctx->buffer[used], data, free); - data = (unsigned char *) data + free; + data = (unsigned char *) data + free; size -= free; body(ctx, ctx->buffer, 64); } if (size >= 64) { - data = body(ctx, data, size & ~(size_t) 0x3f); + data = body(ctx, data, size & ~(size_t) 0x3f); size &= 0x3f; } @@ -190,9 +188,9 @@ namespace Hash { { uint32_t used, free; - used = ctx->lo & 0x3f; + used = ctx->lo & 0x3f; ctx->buffer[used++] = 0x80; - free = 64 - used; + free = 64 - used; if (free < 8) { memset(&ctx->buffer[used], 0, free); @@ -203,28 +201,28 @@ namespace Hash { memset(&ctx->buffer[used], 0, free - 8); - ctx->lo <<= 3; - ctx->buffer[56] = ctx->lo; - ctx->buffer[57] = ctx->lo >> 8; - ctx->buffer[58] = ctx->lo >> 16; - ctx->buffer[59] = ctx->lo >> 24; - ctx->buffer[60] = ctx->hi; - ctx->buffer[61] = ctx->hi >> 8; - ctx->buffer[62] = ctx->hi >> 16; - ctx->buffer[63] = ctx->hi >> 24; + ctx->lo <<= 3; + ctx->buffer[56] = ctx->lo; + ctx->buffer[57] = ctx->lo >> 8; + ctx->buffer[58] = ctx->lo >> 16; + ctx->buffer[59] = ctx->lo >> 24; + ctx->buffer[60] = ctx->hi; + ctx->buffer[61] = ctx->hi >> 8; + ctx->buffer[62] = ctx->hi >> 16; + ctx->buffer[63] = ctx->hi >> 24; body(ctx, ctx->buffer, 64); - result[0] = ctx->a; - result[1] = ctx->a >> 8; - result[2] = ctx->a >> 16; - result[3] = ctx->a >> 24; - result[4] = ctx->b; - result[5] = ctx->b >> 8; - result[6] = ctx->b >> 16; - result[7] = ctx->b >> 24; - 
result[8] = ctx->c; - result[9] = ctx->c >> 8; + result[0] = ctx->a; + result[1] = ctx->a >> 8; + result[2] = ctx->a >> 16; + result[3] = ctx->a >> 24; + result[4] = ctx->b; + result[5] = ctx->b >> 8; + result[6] = ctx->b >> 16; + result[7] = ctx->b >> 24; + result[8] = ctx->c; + result[9] = ctx->c >> 8; result[10] = ctx->c >> 16; result[11] = ctx->c >> 24; result[12] = ctx->d; @@ -254,7 +252,7 @@ namespace Hash { for (int i = 0; i < 16; ++i) { hexHash[i * 2] = hexChars[hash[i] >> 4]; - hexHash[(i * 2) + 1] = hexChars[hash[i] & 0x0F]; + hexHash[(i * 2) + 1] = hexChars[hash[i] & 0x0F]; } hexHash[16 * 2] = '\0'; @@ -262,7 +260,7 @@ namespace Hash { return hexHash; } -}; +}; // namespace Hash #undef F #undef G diff --git a/Hash/MeowHash.h b/Hash/MeowHash.h index 8fa3b3d..91d3762 100755 --- a/Hash/MeowHash.h +++ b/Hash/MeowHash.h @@ -140,9 +140,9 @@ #if __x86_64__ || _M_AMD64 #define meow_umm long long unsigned #define MeowU64From(A, I) (_mm_extract_epi64((A), (I))) - #elif __i386__ || _M_IX86 + #elif __i386__ || _M_IX86 #define meow_umm int unsigned - #define MeowU64From(A, I) (*(meow_u64 *)&(A)) + #define MeowU64From(A, I) (*(meow_u64 *) &(A)) #else #error Cannot determine architecture to use! 
#endif @@ -166,106 +166,102 @@ #define MEOW_PREFETCH_LIMIT 0x3ff #endif - #define prefetcht0(A) _mm_prefetch((char *)(A), _MM_HINT_T0) - #define movdqu(A, B) A = _mm_loadu_si128((__m128i *)(B)) - #define movdqu_mem(A, B) _mm_storeu_si128((__m128i *)(A), B) + #define prefetcht0(A) _mm_prefetch((char *) (A), _MM_HINT_T0) + #define movdqu(A, B) A = _mm_loadu_si128((__m128i *) (B)) + #define movdqu_mem(A, B) _mm_storeu_si128((__m128i *) (A), B) #define movq(A, B) A = _mm_set_epi64x(0, B); - #define aesdec(A, B) A = _mm_aesdec_si128(A, B) - #define pshufb(A, B) A = _mm_shuffle_epi8(A, B) - #define pxor(A, B) A = _mm_xor_si128(A, B) + #define aesdec(A, B) A = _mm_aesdec_si128(A, B) + #define pshufb(A, B) A = _mm_shuffle_epi8(A, B) + #define pxor(A, B) A = _mm_xor_si128(A, B) #define paddq(A, B) A = _mm_add_epi64(A, B) - #define pand(A, B) A = _mm_and_si128(A, B) + #define pand(A, B) A = _mm_and_si128(A, B) #define palignr(A, B, i) A = _mm_alignr_epi8(A, B, i) - #define pxor_clear(A, B) A = _mm_setzero_si128(); // NOTE(casey): pxor_clear is a nonsense thing that is only here because compilers don't detect xor(a, a) is clearing a :( + #define pxor_clear(A, B) \ + A = _mm_setzero_si128(); // NOTE(casey): pxor_clear is a nonsense thing that is only here because compilers + // don't detect xor(a, a) is clearing a :( - #define MEOW_MIX_REG(r1, r2, r3, r4, r5, i1, i2, i3, i4) \ - aesdec(r1, r2); \ - INSTRUCTION_REORDER_BARRIER; \ - paddq(r3, i1); \ - pxor(r2, i2); \ - aesdec(r2, r4); \ - INSTRUCTION_REORDER_BARRIER; \ - paddq(r5, i3); \ + #define MEOW_MIX_REG(r1, r2, r3, r4, r5, i1, i2, i3, i4) \ + aesdec(r1, r2); \ + INSTRUCTION_REORDER_BARRIER; \ + paddq(r3, i1); \ + pxor(r2, i2); \ + aesdec(r2, r4); \ + INSTRUCTION_REORDER_BARRIER; \ + paddq(r5, i3); \ pxor(r4, i4); - #define MEOW_MIX(r1, r2, r3, r4, r5, ptr) \ - MEOW_MIX_REG(r1, r2, r3, r4, r5, _mm_loadu_si128( (__m128i *) ((ptr) + 15) ), _mm_loadu_si128( (__m128i *) ((ptr) + 0) ), _mm_loadu_si128( (__m128i *) ((ptr) + 1) 
), _mm_loadu_si128( (__m128i *) ((ptr) + 16) )) + #define MEOW_MIX(r1, r2, r3, r4, r5, ptr) \ + MEOW_MIX_REG(r1, r2, r3, r4, r5, _mm_loadu_si128((__m128i *) ((ptr) + 15)), \ + _mm_loadu_si128((__m128i *) ((ptr) + 0)), _mm_loadu_si128((__m128i *) ((ptr) + 1)), \ + _mm_loadu_si128((__m128i *) ((ptr) + 16))) #define MEOW_SHUFFLE(r1, r2, r3, r4, r5, r6) \ - aesdec(r1, r4); \ - paddq(r2, r5); \ - pxor(r4, r6); \ - aesdec(r4, r2); \ - paddq(r5, r6); \ + aesdec(r1, r4); \ + paddq(r2, r5); \ + pxor(r4, r6); \ + aesdec(r4, r2); \ + paddq(r5, r6); \ pxor(r2, r3) #endif namespace Hash::Meow { - #if MEOW_DUMP - struct meow_dump - { - meow_u128 xmm[8]; - void *Ptr; - char const *Title; - }; - extern "C" meow_dump *MeowDumpTo; - meow_dump *MeowDumpTo; - #define MEOW_DUMP_STATE(T, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, ptr) \ - if(MeowDumpTo) \ - { \ - MeowDumpTo->xmm[0] = xmm0; \ - MeowDumpTo->xmm[1] = xmm1; \ - MeowDumpTo->xmm[2] = xmm2; \ - MeowDumpTo->xmm[3] = xmm3; \ - MeowDumpTo->xmm[4] = xmm4; \ - MeowDumpTo->xmm[5] = xmm5; \ - MeowDumpTo->xmm[6] = xmm6; \ - MeowDumpTo->xmm[7] = xmm7; \ - MeowDumpTo->Ptr = ptr; \ - MeowDumpTo->Title = T; \ - ++MeowDumpTo; \ - } - #else - #define MEOW_DUMP_STATE(...) - #endif - - static meow_u8 MeowShiftAdjust[32] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; - static meow_u8 MeowMaskLen[32] = {255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; - - // NOTE(casey): The default seed is now a "nothing-up-our-sleeves" number for good measure. You may verify that it is just an encoding of Pi. 
- static meow_u8 MeowDefaultSeed[128] = - { - 0x32, 0x43, 0xF6, 0xA8, 0x88, 0x5A, 0x30, 0x8D, - 0x31, 0x31, 0x98, 0xA2, 0xE0, 0x37, 0x07, 0x34, - 0x4A, 0x40, 0x93, 0x82, 0x22, 0x99, 0xF3, 0x1D, - 0x00, 0x82, 0xEF, 0xA9, 0x8E, 0xC4, 0xE6, 0xC8, - 0x94, 0x52, 0x82, 0x1E, 0x63, 0x8D, 0x01, 0x37, - 0x7B, 0xE5, 0x46, 0x6C, 0xF3, 0x4E, 0x90, 0xC6, - 0xCC, 0x0A, 0xC2, 0x9B, 0x7C, 0x97, 0xC5, 0x0D, - 0xD3, 0xF8, 0x4D, 0x5B, 0x5B, 0x54, 0x70, 0x91, - 0x79, 0x21, 0x6D, 0x5D, 0x98, 0x97, 0x9F, 0xB1, - 0xBD, 0x13, 0x10, 0xBA, 0x69, 0x8D, 0xFB, 0x5A, - 0xC2, 0xFF, 0xD7, 0x2D, 0xBD, 0x01, 0xAD, 0xFB, - 0x7B, 0x8E, 0x1A, 0xFE, 0xD6, 0xA2, 0x67, 0xE9, - 0x6B, 0xA7, 0xC9, 0x04, 0x5F, 0x12, 0xC7, 0xF9, - 0x92, 0x4A, 0x19, 0x94, 0x7B, 0x39, 0x16, 0xCF, - 0x70, 0x80, 0x1F, 0x2E, 0x28, 0x58, 0xEF, 0xC1, - 0x66, 0x36, 0x92, 0x0D, 0x87, 0x15, 0x74, 0xE6 +#if MEOW_DUMP + struct meow_dump { + meow_u128 xmm[8]; + void *Ptr; + char const *Title; }; + extern "C" meow_dump *MeowDumpTo; + meow_dump *MeowDumpTo; + #define MEOW_DUMP_STATE(T, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, ptr) \ + if (MeowDumpTo) { \ + MeowDumpTo->xmm[0] = xmm0; \ + MeowDumpTo->xmm[1] = xmm1; \ + MeowDumpTo->xmm[2] = xmm2; \ + MeowDumpTo->xmm[3] = xmm3; \ + MeowDumpTo->xmm[4] = xmm4; \ + MeowDumpTo->xmm[5] = xmm5; \ + MeowDumpTo->xmm[6] = xmm6; \ + MeowDumpTo->xmm[7] = xmm7; \ + MeowDumpTo->Ptr = ptr; \ + MeowDumpTo->Title = T; \ + ++MeowDumpTo; \ + } +#else + #define MEOW_DUMP_STATE(...) +#endif + + static meow_u8 MeowShiftAdjust[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + static meow_u8 MeowMaskLen[32] = {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + // NOTE(casey): The default seed is now a "nothing-up-our-sleeves" number for good measure. You may verify that it + // is just an encoding of Pi. 
+ static meow_u8 MeowDefaultSeed[128] = { + 0x32, 0x43, 0xF6, 0xA8, 0x88, 0x5A, 0x30, 0x8D, 0x31, 0x31, 0x98, 0xA2, 0xE0, 0x37, 0x07, 0x34, + 0x4A, 0x40, 0x93, 0x82, 0x22, 0x99, 0xF3, 0x1D, 0x00, 0x82, 0xEF, 0xA9, 0x8E, 0xC4, 0xE6, 0xC8, + 0x94, 0x52, 0x82, 0x1E, 0x63, 0x8D, 0x01, 0x37, 0x7B, 0xE5, 0x46, 0x6C, 0xF3, 0x4E, 0x90, 0xC6, + 0xCC, 0x0A, 0xC2, 0x9B, 0x7C, 0x97, 0xC5, 0x0D, 0xD3, 0xF8, 0x4D, 0x5B, 0x5B, 0x54, 0x70, 0x91, + 0x79, 0x21, 0x6D, 0x5D, 0x98, 0x97, 0x9F, 0xB1, 0xBD, 0x13, 0x10, 0xBA, 0x69, 0x8D, 0xFB, 0x5A, + 0xC2, 0xFF, 0xD7, 0x2D, 0xBD, 0x01, 0xAD, 0xFB, 0x7B, 0x8E, 0x1A, 0xFE, 0xD6, 0xA2, 0x67, 0xE9, + 0x6B, 0xA7, 0xC9, 0x04, 0x5F, 0x12, 0xC7, 0xF9, 0x92, 0x4A, 0x19, 0x94, 0x7B, 0x39, 0x16, 0xCF, + 0x70, 0x80, 0x1F, 0x2E, 0x28, 0x58, 0xEF, 0xC1, 0x66, 0x36, 0x92, 0x0D, 0x87, 0x15, 0x74, 0xE6}; // // NOTE(casey): Single block version // - static meow_u128 - MeowHash(void *Seed128Init, meow_umm Len, void *SourceInit) + static meow_u128 MeowHash(void *Seed128Init, meow_umm Len, void *SourceInit) { - meow_u128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; // NOTE(casey): xmm0-xmm7 are the hash accumulation lanes - meow_u128 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; // NOTE(casey): xmm8-xmm15 hold values to be appended (residual, length) + meow_u128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, + xmm7; // NOTE(casey): xmm0-xmm7 are the hash accumulation lanes + meow_u128 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, + xmm15; // NOTE(casey): xmm8-xmm15 hold values to be appended (residual, length) - meow_u8 *rax = (meow_u8 *)SourceInit; - meow_u8 *rcx = (meow_u8 *)Seed128Init; + meow_u8 *rax = (meow_u8 *) SourceInit; + meow_u8 *rcx = (meow_u8 *) Seed128Init; // // NOTE(casey): Seed the eight hash registers @@ -288,41 +284,38 @@ namespace Hash::Meow // meow_umm BlockCount = (Len >> 8); - if(BlockCount > MEOW_PREFETCH_LIMIT) - { - // NOTE(casey): For large input, modern Intel x64's can't hit full speed without prefetching, so we use this 
loop - while(BlockCount--) - { + if (BlockCount > MEOW_PREFETCH_LIMIT) { + // NOTE(casey): For large input, modern Intel x64's can't hit full speed without prefetching, so we use this + // loop + while (BlockCount--) { prefetcht0(rax + MEOW_PREFETCH + 0x00); prefetcht0(rax + MEOW_PREFETCH + 0x40); prefetcht0(rax + MEOW_PREFETCH + 0x80); prefetcht0(rax + MEOW_PREFETCH + 0xc0); - MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00); - MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20); - MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40); - MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60); - MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80); - MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0); - MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0); - MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0); + MEOW_MIX(xmm0, xmm4, xmm6, xmm1, xmm2, rax + 0x00); + MEOW_MIX(xmm1, xmm5, xmm7, xmm2, xmm3, rax + 0x20); + MEOW_MIX(xmm2, xmm6, xmm0, xmm3, xmm4, rax + 0x40); + MEOW_MIX(xmm3, xmm7, xmm1, xmm4, xmm5, rax + 0x60); + MEOW_MIX(xmm4, xmm0, xmm2, xmm5, xmm6, rax + 0x80); + MEOW_MIX(xmm5, xmm1, xmm3, xmm6, xmm7, rax + 0xa0); + MEOW_MIX(xmm6, xmm2, xmm4, xmm7, xmm0, rax + 0xc0); + MEOW_MIX(xmm7, xmm3, xmm5, xmm0, xmm1, rax + 0xe0); rax += 0x100; } - } - else - { - // NOTE(casey): For small input, modern Intel x64's can't hit full speed _with_ prefetching (because of port pressure), so we use this loop. - while(BlockCount--) - { - MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00); - MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20); - MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40); - MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60); - MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80); - MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0); - MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0); - MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0); + } else { + // NOTE(casey): For small input, modern Intel x64's can't hit full speed _with_ prefetching (because of port + // pressure), so we use this loop. 
+ while (BlockCount--) { + MEOW_MIX(xmm0, xmm4, xmm6, xmm1, xmm2, rax + 0x00); + MEOW_MIX(xmm1, xmm5, xmm7, xmm2, xmm3, rax + 0x20); + MEOW_MIX(xmm2, xmm6, xmm0, xmm3, xmm4, rax + 0x40); + MEOW_MIX(xmm3, xmm7, xmm1, xmm4, xmm5, rax + 0x60); + MEOW_MIX(xmm4, xmm0, xmm2, xmm5, xmm6, rax + 0x80); + MEOW_MIX(xmm5, xmm1, xmm3, xmm6, xmm7, rax + 0xa0); + MEOW_MIX(xmm6, xmm2, xmm4, xmm7, xmm0, rax + 0xc0); + MEOW_MIX(xmm7, xmm3, xmm5, xmm0, xmm1, rax + 0xe0); rax += 0x100; } @@ -345,15 +338,15 @@ namespace Hash::Meow // // NOTE(casey): First, we have to load the part that is _not_ 16-byte aligned - meow_u8 *Last = (meow_u8 *)SourceInit + (Len & ~0xf); + meow_u8 *Last = (meow_u8 *) SourceInit + (Len & ~0xf); int unsigned Len8 = (Len & 0xf); - if(Len8) - { + if (Len8) { // NOTE(casey): Load the mask early movdqu(xmm8, &MeowMaskLen[0x10 - Len8]); - meow_u8 *LastOk = (meow_u8*)((((meow_umm)(((meow_u8 *)SourceInit)+Len - 1)) | (MEOW_PAGESIZE - 1)) - 16); - int Align = (Last > LastOk) ? ((int)(meow_umm)Last) & 0xf : 0; + meow_u8 *LastOk = + (meow_u8 *) ((((meow_umm) (((meow_u8 *) SourceInit) + Len - 1)) | (MEOW_PAGESIZE - 1)) - 16); + int Align = (Last > LastOk) ? 
((int) (meow_umm) Last) & 0xf : 0; movdqu(xmm10, &MeowShiftAdjust[Align]); movdqu(xmm9, Last - Align); pshufb(xmm9, xmm10); @@ -363,8 +356,7 @@ namespace Hash::Meow } // NOTE(casey): Next, we have to load the part that _is_ 16-byte aligned - if(Len & 0x10) - { + if (Len & 0x10) { xmm11 = xmm9; movdqu(xmm9, Last - 0x10); } @@ -373,7 +365,7 @@ namespace Hash::Meow // NOTE(casey): Construct the residual and length injests // - xmm8 = xmm9; + xmm8 = xmm9; xmm10 = xmm9; palignr(xmm8, xmm11, 15); palignr(xmm10, xmm11, 1); @@ -390,11 +382,12 @@ namespace Hash::Meow MEOW_DUMP_STATE("Residuals", xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 0); - // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty - MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2, xmm8, xmm9, xmm10, xmm11); + // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it + // was empty + MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2, xmm8, xmm9, xmm10, xmm11); // NOTE(casey): Append the length, to avoid problems with our 32-byte padding - MEOW_MIX_REG(xmm1, xmm5, xmm7, xmm2, xmm3, xmm12, xmm13, xmm14, xmm15); + MEOW_MIX_REG(xmm1, xmm5, xmm7, xmm2, xmm3, xmm12, xmm13, xmm14, xmm15); MEOW_DUMP_STATE("PostAppend", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); @@ -402,19 +395,40 @@ namespace Hash::Meow // NOTE(casey): Hash all full 32-byte blocks // int unsigned LaneCount = (Len >> 5) & 0x7; - if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x00); --LaneCount; - if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x20); --LaneCount; - if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x40); --LaneCount; - if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0x60); --LaneCount; - if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0x80); --LaneCount; - if(LaneCount == 0) goto 
MixDown; MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xa0); --LaneCount; - if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0xc0); --LaneCount; + if (LaneCount == 0) + goto MixDown; + MEOW_MIX(xmm2, xmm6, xmm0, xmm3, xmm4, rax + 0x00); + --LaneCount; + if (LaneCount == 0) + goto MixDown; + MEOW_MIX(xmm3, xmm7, xmm1, xmm4, xmm5, rax + 0x20); + --LaneCount; + if (LaneCount == 0) + goto MixDown; + MEOW_MIX(xmm4, xmm0, xmm2, xmm5, xmm6, rax + 0x40); + --LaneCount; + if (LaneCount == 0) + goto MixDown; + MEOW_MIX(xmm5, xmm1, xmm3, xmm6, xmm7, rax + 0x60); + --LaneCount; + if (LaneCount == 0) + goto MixDown; + MEOW_MIX(xmm6, xmm2, xmm4, xmm7, xmm0, rax + 0x80); + --LaneCount; + if (LaneCount == 0) + goto MixDown; + MEOW_MIX(xmm7, xmm3, xmm5, xmm0, xmm1, rax + 0xa0); + --LaneCount; + if (LaneCount == 0) + goto MixDown; + MEOW_MIX(xmm0, xmm4, xmm6, xmm1, xmm2, rax + 0xc0); + --LaneCount; // // NOTE(casey): Mix the eight lanes down to one 128-bit hash // - MixDown: + MixDown: MEOW_DUMP_STATE("PostLanes", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); @@ -443,15 +457,14 @@ namespace Hash::Meow MEOW_DUMP_STATE("PostFold", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); - return(xmm0); + return (xmm0); } // // NOTE(casey): Streaming construction // - typedef struct meow_state - { + typedef struct meow_state { meow_u128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; meow_u64 TotalLengthInBytes; @@ -461,10 +474,9 @@ namespace Hash::Meow meow_u128 Pad[2]; // NOTE(casey): So we know we can over-read Buffer as necessary } meow_state; - static void - MeowBegin(meow_state *State, void *Seed128) + static void MeowBegin(meow_state *State, void *Seed128) { - meow_u8 *rcx = (meow_u8 *)Seed128; + meow_u8 *rcx = (meow_u8 *) Seed128; movdqu(State->xmm0, rcx + 0x00); movdqu(State->xmm1, rcx + 0x10); @@ -475,14 +487,14 @@ namespace Hash::Meow movdqu(State->xmm6, rcx + 0x60); movdqu(State->xmm7, rcx + 0x70); - MEOW_DUMP_STATE("Seed", State->xmm0, State->xmm1, 
State->xmm2, State->xmm3, State->xmm4, State->xmm5, State->xmm6, State->xmm7, 0); + MEOW_DUMP_STATE("Seed", State->xmm0, State->xmm1, State->xmm2, State->xmm3, State->xmm4, State->xmm5, + State->xmm6, State->xmm7, 0); - State->BufferLen = 0; + State->BufferLen = 0; State->TotalLengthInBytes = 0; } - static void - MeowAbsorbBlocks(meow_state *State, meow_umm BlockCount, meow_u8 *rax) + static void MeowAbsorbBlocks(meow_state *State, meow_umm BlockCount, meow_u8 *rax) { meow_u128 xmm0 = State->xmm0; meow_u128 xmm1 = State->xmm1; @@ -493,39 +505,34 @@ namespace Hash::Meow meow_u128 xmm6 = State->xmm6; meow_u128 xmm7 = State->xmm7; - if(BlockCount > MEOW_PREFETCH_LIMIT) - { - while(BlockCount--) - { + if (BlockCount > MEOW_PREFETCH_LIMIT) { + while (BlockCount--) { prefetcht0(rax + MEOW_PREFETCH + 0x00); prefetcht0(rax + MEOW_PREFETCH + 0x40); prefetcht0(rax + MEOW_PREFETCH + 0x80); prefetcht0(rax + MEOW_PREFETCH + 0xc0); - MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00); - MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20); - MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40); - MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60); - MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80); - MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0); - MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0); - MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0); + MEOW_MIX(xmm0, xmm4, xmm6, xmm1, xmm2, rax + 0x00); + MEOW_MIX(xmm1, xmm5, xmm7, xmm2, xmm3, rax + 0x20); + MEOW_MIX(xmm2, xmm6, xmm0, xmm3, xmm4, rax + 0x40); + MEOW_MIX(xmm3, xmm7, xmm1, xmm4, xmm5, rax + 0x60); + MEOW_MIX(xmm4, xmm0, xmm2, xmm5, xmm6, rax + 0x80); + MEOW_MIX(xmm5, xmm1, xmm3, xmm6, xmm7, rax + 0xa0); + MEOW_MIX(xmm6, xmm2, xmm4, xmm7, xmm0, rax + 0xc0); + MEOW_MIX(xmm7, xmm3, xmm5, xmm0, xmm1, rax + 0xe0); rax += 0x100; } - } - else - { - while(BlockCount--) - { - MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00); - MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20); - MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40); - 
MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60); - MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80); - MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0); - MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0); - MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0); + } else { + while (BlockCount--) { + MEOW_MIX(xmm0, xmm4, xmm6, xmm1, xmm2, rax + 0x00); + MEOW_MIX(xmm1, xmm5, xmm7, xmm2, xmm3, rax + 0x20); + MEOW_MIX(xmm2, xmm6, xmm0, xmm3, xmm4, rax + 0x40); + MEOW_MIX(xmm3, xmm7, xmm1, xmm4, xmm5, rax + 0x60); + MEOW_MIX(xmm4, xmm0, xmm2, xmm5, xmm6, rax + 0x80); + MEOW_MIX(xmm5, xmm1, xmm3, xmm6, xmm7, rax + 0xa0); + MEOW_MIX(xmm6, xmm2, xmm4, xmm7, xmm0, rax + 0xc0); + MEOW_MIX(xmm7, xmm3, xmm5, xmm0, xmm1, rax + 0xe0); rax += 0x100; } @@ -541,29 +548,24 @@ namespace Hash::Meow State->xmm7 = xmm7; } - static void - MeowAbsorb(meow_state *State, meow_umm Len, void *SourceInit) + static void MeowAbsorb(meow_state *State, meow_umm Len, void *SourceInit) { State->TotalLengthInBytes += Len; - meow_u8 *Source = (meow_u8 *)SourceInit; + meow_u8 *Source = (meow_u8 *) SourceInit; // NOTE(casey): Handle any buffered residual - if(State->BufferLen) - { + if (State->BufferLen) { int unsigned Fill = (sizeof(State->Buffer) - State->BufferLen); - if(Fill > Len) - { - Fill = (int unsigned)Len; + if (Fill > Len) { + Fill = (int unsigned) Len; } Len -= Fill; - while(Fill--) - { + while (Fill--) { State->Buffer[State->BufferLen++] = *Source++; } - if(State->BufferLen == sizeof(State->Buffer)) - { + if (State->BufferLen == sizeof(State->Buffer)) { MeowAbsorbBlocks(State, 1, State->Buffer); State->BufferLen = 0; } @@ -571,21 +573,19 @@ namespace Hash::Meow // NOTE(casey): Handle any full blocks meow_u64 BlockCount = (Len >> 8); - meow_u64 Advance = (BlockCount << 8); + meow_u64 Advance = (BlockCount << 8); MeowAbsorbBlocks(State, BlockCount, Source); - Len -= Advance; + Len -= Advance; Source += Advance; // NOTE(casey): Store residual - while(Len--) - { + while (Len--) { 
State->Buffer[State->BufferLen++] = *Source++; } } - static meow_u128 - MeowEnd(meow_state *State, meow_u8 *Store128) + static meow_u128 MeowEnd(meow_state *State, meow_u8 *Store128) { meow_umm Len = State->TotalLengthInBytes; @@ -605,22 +605,20 @@ namespace Hash::Meow pxor_clear(xmm9, xmm9); pxor_clear(xmm11, xmm11); - meow_u8 *Last = (meow_u8 *)rax + (Len & 0xf0); + meow_u8 *Last = (meow_u8 *) rax + (Len & 0xf0); int unsigned Len8 = (Len & 0xf); - if(Len8) - { + if (Len8) { movdqu(xmm8, &MeowMaskLen[0x10 - Len8]); movdqu(xmm9, Last); pand(xmm9, xmm8); } - if(Len & 0x10) - { + if (Len & 0x10) { xmm11 = xmm9; movdqu(xmm9, Last - 0x10); } - xmm8 = xmm9; + xmm8 = xmm9; xmm10 = xmm9; palignr(xmm8, xmm11, 15); palignr(xmm10, xmm11, 1); @@ -635,11 +633,12 @@ namespace Hash::Meow MEOW_DUMP_STATE("PostBlocks", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); MEOW_DUMP_STATE("Residuals", xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 0); - // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty - MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2, xmm8, xmm9, xmm10, xmm11); + // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it + // was empty + MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2, xmm8, xmm9, xmm10, xmm11); // NOTE(casey): Append the length, to avoid problems with our 32-byte padding - MEOW_MIX_REG(xmm1, xmm5, xmm7, xmm2, xmm3, xmm12, xmm13, xmm14, xmm15); + MEOW_MIX_REG(xmm1, xmm5, xmm7, xmm2, xmm3, xmm12, xmm13, xmm14, xmm15); MEOW_DUMP_STATE("PostAppend", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); @@ -647,19 +646,40 @@ namespace Hash::Meow // NOTE(casey): Hash all full 32-byte blocks // int unsigned LaneCount = (Len >> 5) & 0x7; - if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x00); --LaneCount; - if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x20); --LaneCount; - if(LaneCount 
== 0) goto MixDown; MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x40); --LaneCount; - if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0x60); --LaneCount; - if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0x80); --LaneCount; - if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xa0); --LaneCount; - if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0xc0); --LaneCount; + if (LaneCount == 0) + goto MixDown; + MEOW_MIX(xmm2, xmm6, xmm0, xmm3, xmm4, rax + 0x00); + --LaneCount; + if (LaneCount == 0) + goto MixDown; + MEOW_MIX(xmm3, xmm7, xmm1, xmm4, xmm5, rax + 0x20); + --LaneCount; + if (LaneCount == 0) + goto MixDown; + MEOW_MIX(xmm4, xmm0, xmm2, xmm5, xmm6, rax + 0x40); + --LaneCount; + if (LaneCount == 0) + goto MixDown; + MEOW_MIX(xmm5, xmm1, xmm3, xmm6, xmm7, rax + 0x60); + --LaneCount; + if (LaneCount == 0) + goto MixDown; + MEOW_MIX(xmm6, xmm2, xmm4, xmm7, xmm0, rax + 0x80); + --LaneCount; + if (LaneCount == 0) + goto MixDown; + MEOW_MIX(xmm7, xmm3, xmm5, xmm0, xmm1, rax + 0xa0); + --LaneCount; + if (LaneCount == 0) + goto MixDown; + MEOW_MIX(xmm0, xmm4, xmm6, xmm1, xmm2, rax + 0xc0); + --LaneCount; // // NOTE(casey): Mix the eight lanes down to one 128-bit hash // - MixDown: + MixDown: MEOW_DUMP_STATE("PostLanes", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); @@ -678,8 +698,7 @@ namespace Hash::Meow MEOW_DUMP_STATE("PostMix", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); - if(Store128) - { + if (Store128) { movdqu_mem(Store128 + 0x00, xmm0); movdqu_mem(Store128 + 0x10, xmm1); movdqu_mem(Store128 + 0x20, xmm2); @@ -700,7 +719,7 @@ namespace Hash::Meow MEOW_DUMP_STATE("PostFold", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); - return(xmm0); + return (xmm0); } // @@ -710,17 +729,16 @@ namespace Hash::Meow // need to create a new seed. 
// - static void - MeowExpandSeed(meow_umm InputLen, void *Input, meow_u8 *SeedResult) + static void MeowExpandSeed(meow_umm InputLen, void *Input, meow_u8 *SeedResult) { meow_state State; - meow_u64 LengthTab = (meow_u64)InputLen; // NOTE(casey): We need to always injest 8-byte lengths exactly, even on 32-bit builds, to ensure identical results + meow_u64 LengthTab = (meow_u64) InputLen; // NOTE(casey): We need to always injest 8-byte lengths exactly, even + // on 32-bit builds, to ensure identical results meow_umm InjestCount = (256 / InputLen) + 2; MeowBegin(&State, MeowDefaultSeed); MeowAbsorb(&State, sizeof(LengthTab), &LengthTab); - while(InjestCount--) - { + while (InjestCount--) { MeowAbsorb(&State, InputLen, Input); } MeowEnd(&State, SeedResult); @@ -730,15 +748,12 @@ namespace Hash::Meow { char *str = (char *) malloc((4 * 8 + 4 + 1) * sizeof(char)); - sprintf(str, "%08X-%08X-%08X-%08X", - MeowU32From(Hash, 3), - MeowU32From(Hash, 2), - MeowU32From(Hash, 1), + sprintf(str, "%08X-%08X-%08X-%08X", MeowU32From(Hash, 3), MeowU32From(Hash, 2), MeowU32From(Hash, 1), MeowU32From(Hash, 0)); return (const char *) str; } -} +} // namespace Hash::Meow #undef INSTRUCTION_REORDER_BARRIER #undef prefetcht0 diff --git a/Hash/SHA256.h b/Hash/SHA256.h index 4aec83f..6e12f22 100755 --- a/Hash/SHA256.h +++ b/Hash/SHA256.h @@ -10,25 +10,38 @@ extern "C" { // Licensing Information // -// Except as otherwise noted (below and/or in individual files), this project is licensed under the Unlicense (https://opensource.org/licenses/unlicense) or the Zero Clause BSD license (https://opensource.org/licenses/0bsd), at your option. -// The Unlicense +// Except as otherwise noted (below and/or in individual files), this project is licensed under the Unlicense +// (https://opensource.org/licenses/unlicense) or the Zero Clause BSD license (https://opensource.org/licenses/0bsd), at +// your option. The Unlicense // // This is free and unencumbered software released into the public domain. 
// -// Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and by any means. +// Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either in source code form +// or as a compiled binary, for any purpose, commercial or non-commercial, and by any means. // -// In jurisdictions that recognize copyright laws, the author or authors of this software dedicate any and all copyright interest in the software to the public domain. We make this dedication for the benefit of the public at large and to the detriment of our heirs and successors. We intend this dedication to be an overt act of relinquishment in perpetuity of all present and future rights to this software under copyright law. +// In jurisdictions that recognize copyright laws, the author or authors of this software dedicate any and all copyright +// interest in the software to the public domain. We make this dedication for the benefit of the public at large and to +// the detriment of our heirs and successors. We intend this dedication to be an overt act of relinquishment in +// perpetuity of all present and future rights to this software under copyright law. // -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. // // For more information, please refer to http://unlicense.org // Zero Clause BSD License // // © 2021 Alain Mosnier // -// Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted. +// Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby +// granted. // -// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, +// INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN +// AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +// PERFORMANCE OF THIS SOFTWARE. /* * @brief Size of the SHA-256 sum. This times eight is 256 bits. 
@@ -96,16 +109,15 @@ static inline void consume_chunk(uint32_t *h, const uint8_t *p) for (i = 0; i < 4; i++) { for (j = 0; j < 16; j++) { if (i == 0) { - w[j] = - (uint32_t)p[0] << 24 | (uint32_t)p[1] << 16 | (uint32_t)p[2] << 8 | (uint32_t)p[3]; - p += 4; + w[j] = (uint32_t) p[0] << 24 | (uint32_t) p[1] << 16 | (uint32_t) p[2] << 8 | (uint32_t) p[3]; + p += 4; } else { /* Extend the first 16 words into the remaining 48 words w[16..63] of the * message schedule array: */ - const uint32_t s0 = right_rot(w[(j + 1) & 0xf], 7) ^ right_rot(w[(j + 1) & 0xf], 18) ^ - (w[(j + 1) & 0xf] >> 3); - const uint32_t s1 = right_rot(w[(j + 14) & 0xf], 17) ^ - right_rot(w[(j + 14) & 0xf], 19) ^ (w[(j + 14) & 0xf] >> 10); + const uint32_t s0 = + right_rot(w[(j + 1) & 0xf], 7) ^ right_rot(w[(j + 1) & 0xf], 18) ^ (w[(j + 1) & 0xf] >> 3); + const uint32_t s1 = + right_rot(w[(j + 14) & 0xf], 17) ^ right_rot(w[(j + 14) & 0xf], 19) ^ (w[(j + 14) & 0xf] >> 10); w[j] = w[j] + s0 + w[(j + 9) & 0xf] + s1; } const uint32_t s1 = right_rot(ah[4], 6) ^ right_rot(ah[4], 11) ^ right_rot(ah[4], 25); @@ -116,20 +128,18 @@ static inline void consume_chunk(uint32_t *h, const uint8_t *p) * (first 32 bits of the fractional parts of the cube roots of the first 64 primes 2..311): */ static const uint32_t k[] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, - 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, - 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, - 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, - 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, - 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 0x19a4c116, - 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 
0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, - 0xc67178f2}; + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2}; const uint32_t temp1 = ah[7] + s1 + ch + k[i << 4 | j] + w[j]; - const uint32_t s0 = right_rot(ah[0], 2) ^ right_rot(ah[0], 13) ^ right_rot(ah[0], 22); - const uint32_t maj = (ah[0] & ah[1]) ^ (ah[0] & ah[2]) ^ (ah[1] & ah[2]); + const uint32_t s0 = right_rot(ah[0], 2) ^ right_rot(ah[0], 13) ^ right_rot(ah[0], 22); + const uint32_t maj = (ah[0] & ah[1]) ^ (ah[0] & ah[2]) ^ (ah[1] & ah[2]); const uint32_t temp2 = s0 + maj; ah[7] = ah[6]; @@ -148,25 +158,24 @@ static inline void consume_chunk(uint32_t *h, const uint8_t *p) h[i] += ah[i]; } - /* * @brief Initialize a SHA-256 streaming calculation. * @param sha_256 A pointer to a SHA-256 structure. * @param hash Hash array, where the result will be delivered. * - * @note If all of the data you are calculating the hash value on is not available in a contiguous buffer in memory, this is - * where you should start. Instantiate a SHA-256 structure, for instance by simply declaring it locally, make your hash - * buffer available, and invoke this function. 
Once a SHA-256 hash has been calculated (see further below) a SHA-256 - * structure can be initialized again for the next calculation. + * @note If all of the data you are calculating the hash value on is not available in a contiguous buffer in memory, + * this is where you should start. Instantiate a SHA-256 structure, for instance by simply declaring it locally, make + * your hash buffer available, and invoke this function. Once a SHA-256 hash has been calculated (see further below) a + * SHA-256 structure can be initialized again for the next calculation. * * @note If either of the passed pointers is NULL, the results are unpredictable. */ void sha_256_init(struct Sha_256 *sha_256, uint8_t hash[SIZE_OF_SHA_256_HASH]) { - sha_256->hash = hash; - sha_256->chunk_pos = sha_256->chunk; + sha_256->hash = hash; + sha_256->chunk_pos = sha_256->chunk; sha_256->space_left = SIZE_OF_SHA_256_CHUNK; - sha_256->total_len = 0; + sha_256->total_len = 0; /* * Initialize hash values (first 32 bits of the fractional parts of the square roots of the first 8 primes * 2..19): @@ -210,18 +219,18 @@ void sha_256_write(struct Sha_256 *sha_256, const void *data, size_t len) if (sha_256->space_left == SIZE_OF_SHA_256_CHUNK && len >= SIZE_OF_SHA_256_CHUNK) { consume_chunk(sha_256->h, p); len -= SIZE_OF_SHA_256_CHUNK; - p += SIZE_OF_SHA_256_CHUNK; + p += SIZE_OF_SHA_256_CHUNK; continue; } /* General case, no particular optimization. */ const size_t consumed_len = len < sha_256->space_left ? 
len : sha_256->space_left; memcpy(sha_256->chunk_pos, p, consumed_len); sha_256->space_left -= consumed_len; - len -= consumed_len; - p += consumed_len; + len -= consumed_len; + p += consumed_len; if (sha_256->space_left == 0) { consume_chunk(sha_256->h, sha_256->chunk); - sha_256->chunk_pos = sha_256->chunk; + sha_256->chunk_pos = sha_256->chunk; sha_256->space_left = SIZE_OF_SHA_256_CHUNK; } else { sha_256->chunk_pos += consumed_len; @@ -245,7 +254,7 @@ void sha_256_write(struct Sha_256 *sha_256, const void *data, size_t len) */ uint8_t *sha_256_close(struct Sha_256 *sha_256) { - uint8_t *pos = sha_256->chunk_pos; + uint8_t *pos = sha_256->chunk_pos; size_t space_left = sha_256->space_left; uint32_t *const h = sha_256->h; @@ -264,29 +273,29 @@ uint8_t *sha_256_close(struct Sha_256 *sha_256) if (space_left < TOTAL_LEN_LEN) { memset(pos, 0x00, space_left); consume_chunk(h, sha_256->chunk); - pos = sha_256->chunk; + pos = sha_256->chunk; space_left = SIZE_OF_SHA_256_CHUNK; } const size_t left = space_left - TOTAL_LEN_LEN; memset(pos, 0x00, left); - pos += left; - size_t len = sha_256->total_len; - pos[7] = (uint8_t)(len << 3); - len >>= 5; + pos += left; + size_t len = sha_256->total_len; + pos[7] = (uint8_t) (len << 3); + len >>= 5; int i; for (i = 6; i >= 0; --i) { - pos[i] = (uint8_t)len; - len >>= 8; + pos[i] = (uint8_t) len; + len >>= 8; } consume_chunk(h, sha_256->chunk); /* Produce the final hash value (big-endian): */ int j; uint8_t *const hash = sha_256->hash; for (i = 0, j = 0; i < 8; i++) { - hash[j++] = (uint8_t)(h[i] >> 24); - hash[j++] = (uint8_t)(h[i] >> 16); - hash[j++] = (uint8_t)(h[i] >> 8); - hash[j++] = (uint8_t)h[i]; + hash[j++] = (uint8_t) (h[i] >> 24); + hash[j++] = (uint8_t) (h[i] >> 16); + hash[j++] = (uint8_t) (h[i] >> 8); + hash[j++] = (uint8_t) h[i]; } return sha_256->hash; } @@ -307,7 +316,7 @@ void calc_sha_256(uint8_t hash[SIZE_OF_SHA_256_HASH], const void *input, size_t struct Sha_256 sha_256; sha_256_init(&sha_256, hash); 
sha_256_write(&sha_256, input, len); - (void)sha_256_close(&sha_256); + (void) sha_256_close(&sha_256); } #undef TOTAL_LEN_LEN diff --git a/Image/BillDetection.h b/Image/BillDetection.h index 1779bb8..3451cdd 100755 --- a/Image/BillDetection.h +++ b/Image/BillDetection.h @@ -10,8 +10,8 @@ #ifndef IMAGE_BILL_DETECTION_H #define IMAGE_BILL_DETECTION_H -#include #include +#include #include namespace Image::BillDetection @@ -33,12 +33,12 @@ namespace Image::BillDetection std::vector lines; lines.clear(); - cv::HoughLinesP(edges, lines, 1, CV_PI/180, 25); + cv::HoughLinesP(edges, lines, 1, CV_PI / 180, 25); std::vector::iterator it = lines.begin(); - for(; it != lines.end(); ++it) { + for (; it != lines.end(); ++it) { cv::Vec4i l = *it; - cv::line(edges, cv::Point(l[0], l[1]), cv::Point(l[2], l[3]), cv::Scalar(255,0,0), 2, 8); + cv::line(edges, cv::Point(l[0], l[1]), cv::Point(l[2], l[3]), cv::Scalar(255, 0, 0), 2, 8); } std::vector> contours; @@ -60,8 +60,8 @@ namespace Image::BillDetection // Approximate polygon /* Question: we probably don't want a polygon all the time?! 
*/ // @todo bad implementation, focus on single square - std::vector > contoursDraw (contoursArea.size()); - for (int i = 0; i < contoursArea.size(); ++i){ + std::vector> contoursDraw(contoursArea.size()); + for (int i = 0; i < contoursArea.size(); ++i) { cv::approxPolyDP(cv::Mat(contoursArea[i]), contoursDraw[i], 40, true); } @@ -73,6 +73,6 @@ namespace Image::BillDetection return out; } -} +} // namespace Image::BillDetection #endif \ No newline at end of file diff --git a/Image/Diff.h b/Image/Diff.h index 1f23178..96ddd14 100755 --- a/Image/Diff.h +++ b/Image/Diff.h @@ -10,8 +10,8 @@ #ifndef IMAGE_DIFF_H #define IMAGE_DIFF_H -#include #include +#include #include "../Utils/MathUtils.h" @@ -19,7 +19,7 @@ namespace Image { namespace ImageUtils { - cv::Mat find_diff (cv::Mat in1, cv::Mat in2) + cv::Mat find_diff(cv::Mat in1, cv::Mat in2) { cv::Mat diff; cv::absdiff(in1, in2, diff); @@ -43,7 +43,7 @@ namespace Image return out; } - } -} + } // namespace ImageUtils +} // namespace Image #endif \ No newline at end of file diff --git a/Image/ImageUtils.h b/Image/ImageUtils.h index 63d4b46..9c26e6b 100755 --- a/Image/ImageUtils.h +++ b/Image/ImageUtils.h @@ -10,13 +10,12 @@ #ifndef IMAGE_IMAGE_UTILS_H #define IMAGE_IMAGE_UTILS_H -#include #include +#include namespace Image::ImageUtils { - inline - float lightnessFromRgb(int r, int g, int b) + inline float lightnessFromRgb(int r, int g, int b) { float vR = r / 255.0; float vG = g / 255.0; @@ -31,6 +30,6 @@ namespace Image::ImageUtils return lStar / 100.0; } -} +} // namespace Image::ImageUtils #endif \ No newline at end of file diff --git a/Image/Kernel.h b/Image/Kernel.h index 521665d..b2653ae 100755 --- a/Image/Kernel.h +++ b/Image/Kernel.h @@ -10,60 +10,38 @@ #ifndef IMAGE_KERNEL_H #define IMAGE_KERNEL_H -#include #include +#include -#include "ImageUtils.h" #include "../Utils/MathUtils.h" +#include "ImageUtils.h" namespace Image::Kernel { - const float KERNEL_RIDGE_1[3][3] = { - {0.0, -1.0, 0.0}, - {-1.0, 4.0, -1.0}, - 
{0.0, -1.0, 0.0} - }; + const float KERNEL_RIDGE_1[3][3] = {{0.0, -1.0, 0.0}, {-1.0, 4.0, -1.0}, {0.0, -1.0, 0.0}}; - const float KERNEL_RIDGE_2[3][3] = { - {-1.0, -1.0, -1.0}, - {-1.0, 8.0, -1.0}, - {-1.0, -1.0, -1.0} - }; + const float KERNEL_RIDGE_2[3][3] = {{-1.0, -1.0, -1.0}, {-1.0, 8.0, -1.0}, {-1.0, -1.0, -1.0}}; - const float KERNEL_SHARPEN[3][3] = { - {0.0, -1.0, 0.0}, - {-1.0, 5.0, -1.0}, - {0.0, -1.0, 0.0} - }; + const float KERNEL_SHARPEN[3][3] = {{0.0, -1.0, 0.0}, {-1.0, 5.0, -1.0}, {0.0, -1.0, 0.0}}; const float KERNEL_BOX_BLUR[3][3] = { - {1.0 / 9.0, 1.0 / 9.0, 1.0 / 9.0}, - {1.0 / 9.0, 1.0 / 9.0, 1.0 / 9.0}, - {1.0 / 9.0, 1.0 / 9.0, 1.0 / 9.0} - }; + {1.0 / 9.0, 1.0 / 9.0, 1.0 / 9.0}, {1.0 / 9.0, 1.0 / 9.0, 1.0 / 9.0}, {1.0 / 9.0, 1.0 / 9.0, 1.0 / 9.0}}; - const float KERNEL_GAUSSUAN_BLUR_3[3][3] = { - {1.0 / 16.0, 2.0 / 16.0, 1.0 / 16.0}, - {2.0 / 16.0, 4.0 / 16.0, 2.0 / 16.0}, - {1.0 / 16.0, 2.0 / 16.0, 1.0 / 16.0} - }; + const float KERNEL_GAUSSUAN_BLUR_3[3][3] = {{1.0 / 16.0, 2.0 / 16.0, 1.0 / 16.0}, + {2.0 / 16.0, 4.0 / 16.0, 2.0 / 16.0}, + {1.0 / 16.0, 2.0 / 16.0, 1.0 / 16.0}}; - const float KERNEL_EMBOSS[3][3] = { - {-2.0, -1.0, 0.0}, - {-1.0, 1.0, 1.0}, - {0.0, 1.0, 2.0} - }; + const float KERNEL_EMBOSS[3][3] = {{-2.0, -1.0, 0.0}, {-1.0, 1.0, 1.0}, {0.0, 1.0, 2.0}}; const float KERNEL_UNSHARP_MASKING[5][5] = { - {-1.0 / 256.0, -4.0 / 256.0, -6.0 / 256.0, -4.0 / 256.0, -1.0 / 256.0}, - {-4.0 / 256.0, -16.0 / 256.0, -24.0 / 256.0, -16.0 / 256.0, -4.0 / 256.0}, - {-6.0 / 256.0, -24.0 / 256.0, 476.0 / 256.0, -24.0 / 256.0, -6.0 / 256.0}, - {-4.0 / 256.0, -16.0 / 256.0, -24.0 / 256.0, -16.0 / 256.0, -4.0 / 256.0}, - {-1.0 / 256.0, -4.0 / 256.0, -6.0 / 256.0, -4.0 / 256.0, -1.0 / 256.0}, + {-1.0 / 256.0, -4.0 / 256.0, -6.0 / 256.0, -4.0 / 256.0, -1.0 / 256.0}, + {-4.0 / 256.0, -16.0 / 256.0, -24.0 / 256.0, -16.0 / 256.0, -4.0 / 256.0}, + {-6.0 / 256.0, -24.0 / 256.0, 476.0 / 256.0, -24.0 / 256.0, -6.0 / 256.0}, + {-4.0 / 256.0, -16.0 / 256.0, 
-24.0 / 256.0, -16.0 / 256.0, -4.0 / 256.0}, + {-1.0 / 256.0, -4.0 / 256.0, -6.0 / 256.0, -4.0 / 256.0, -1.0 / 256.0}, }; - inline - cv::Mat convolve(cv::Mat in, const float kernel[][3]) + inline cv::Mat convolve(cv::Mat in, const float kernel[][3]) { cv::Size dim = in.size(); cv::Mat out(in.size(), in.type()); @@ -73,6 +51,6 @@ namespace Image::Kernel return out; } -} +} // namespace Image::Kernel #endif \ No newline at end of file diff --git a/Image/Skew.h b/Image/Skew.h index 73ab4ba..7013162 100755 --- a/Image/Skew.h +++ b/Image/Skew.h @@ -10,8 +10,8 @@ #ifndef IMAGE_SKEW_H #define IMAGE_SKEW_H -#include #include +#include #include #include "../Utils/MathUtils.h" @@ -42,7 +42,8 @@ namespace Image::Skew std::vector angles; for (int i = 0; i < tmpAngles.size(); ++i) { if (imageOrientation > 0) { - if (oms_deg2rad(90 - maxDegree) < oms_abs(tmpAngles[i]) && oms_abs(tmpAngles[i]) < oms_deg2rad(90 + maxDegree)) { + if (oms_deg2rad(90 - maxDegree) < oms_abs(tmpAngles[i]) && + oms_abs(tmpAngles[i]) < oms_deg2rad(90 + maxDegree)) { angles.push_back(tmpAngles[i]); } } else { @@ -85,6 +86,6 @@ namespace Image::Skew return out; } -} +} // namespace Image::Skew #endif \ No newline at end of file diff --git a/Image/Thresholding.h b/Image/Thresholding.h index 10bf58b..7752e14 100755 --- a/Image/Thresholding.h +++ b/Image/Thresholding.h @@ -10,11 +10,11 @@ #ifndef IMAGE_THRESHOLDING_H #define IMAGE_THRESHOLDING_H -#include #include +#include -#include "ImageUtils.h" #include "../Utils/MathUtils.h" +#include "ImageUtils.h" namespace Image::Thresholding { @@ -58,7 +58,7 @@ namespace Image::Thresholding count = (x2 - x1) * (y2 - y1); sum = intImg[x2 * y2] - intImg[x2 * (y1 - 1)] - intImg[(x1 - 1) * y2] + intImg[(x1 - 1) * (y1 - 1)]; - bgr = in.at(j, i); + bgr = in.at(j, i); brightness = Image::ImageUtils::lightnessFromRgb(bgr[2], bgr[1], bgr[0]); color = brightness * count <= (sum * (100.0 - t) / 100.0) && brightness < 0.95 ? 
0 : 255; @@ -73,6 +73,6 @@ namespace Image::Thresholding return out; } -} +} // namespace Image::Thresholding #endif \ No newline at end of file diff --git a/Input/XInput.h b/Input/XInput.h index 2828fa4..6576785 100644 --- a/Input/XInput.h +++ b/Input/XInput.h @@ -18,25 +18,16 @@ #include "../Stdlib/Types.h" -uint32 find_joysticks() -{ +uint32 find_joysticks() {} -} - -void destory_joysticks() -{ - -} +void destory_joysticks() {} void handle_controller_input() { - for (uint32 controller_index = 0; controller_index < XUSER_MAX_COUNT; ++controller_index) { XINPUT_STATE controller_state; if (XInputGetState(controller_index, &controller_state) == ERROR_SUCCESS) { - } else { - } } } diff --git a/Math/Matrix/MatrixInt64.h b/Math/Matrix/MatrixInt64.h index 156d5d6..c1e594b 100644 --- a/Math/Matrix/MatrixInt64.h +++ b/Math/Matrix/MatrixInt64.h @@ -1,4 +1,5 @@ -// Remarks: sizes for the second matrix/vector are often implied by the first parameter and the rules for matrix/vector multiplication. +// Remarks: sizes for the second matrix/vector are often implied by the first parameter and the rules for matrix/vector +// multiplication. 
// First element is always a matrix of int64_t ///////////////////////////////// diff --git a/Math/Matrix/VectorFloat32.h b/Math/Matrix/VectorFloat32.h index 25c96c9..7ca4a61 100644 --- a/Math/Matrix/VectorFloat32.h +++ b/Math/Matrix/VectorFloat32.h @@ -149,6 +149,6 @@ namespace Math::Matrix::VectorFloat32 f32_16_simd v[4]; }; }; -} +} // namespace Math::Matrix::VectorFloat32 #endif diff --git a/Math/Matrix/VectorInt32.h b/Math/Matrix/VectorInt32.h index 12cb085..c3237d8 100644 --- a/Math/Matrix/VectorInt32.h +++ b/Math/Matrix/VectorInt32.h @@ -11,8 +11,8 @@ #define MATH_MATRIX_VECTORFLOAT32_H #include "Types.h" -#include #include +#include struct simd_int32_4 { union { diff --git a/Math/Matrix/VectorInt64.h b/Math/Matrix/VectorInt64.h index a861de0..be25a52 100644 --- a/Math/Matrix/VectorInt64.h +++ b/Math/Matrix/VectorInt64.h @@ -11,8 +11,8 @@ #define MATH_MATRIX_VECTORFLOAT32_H #include "Types.h" -#include #include +#include struct simd_int64_2 { union { diff --git a/Router/Router.h b/Router/Router.h index c18ae79..0831e15 100755 --- a/Router/Router.h +++ b/Router/Router.h @@ -17,8 +17,8 @@ #include #include -#include "../Stdlib/HashTable.h" #include "../Hash/MeowHash.h" +#include "../Stdlib/HashTable.h" namespace Router { @@ -36,14 +36,14 @@ namespace Router return router; } - void set(const Router *router, const char* route, void *endpoint) + void set(const Router *router, const char *route, void *endpoint) { Stdlib::HashTable::set_entry(router->routes, route, endpoint); } RouterFunc match_route(const Router *router, const char *uri) { - RouterFunc ptr = NULL; + RouterFunc ptr = NULL; Stdlib::HashTable::it itr = Stdlib::HashTable::table_iterator(router->routes); std::regex regex; @@ -66,6 +66,6 @@ namespace Router Stdlib::HashTable::free_table(router->routes); router->routes = NULL; } -} +} // namespace Router #endif \ No newline at end of file diff --git a/Stdlib/HashTable.h b/Stdlib/HashTable.h index 1b6e8db..f4b4d8e 100755 --- a/Stdlib/HashTable.h +++ 
b/Stdlib/HashTable.h @@ -38,15 +38,10 @@ namespace Stdlib::HashTable size_t index; } it; - inline - unsigned long long hash_key(const char *key) + inline unsigned long long hash_key(const char *key) { return (unsigned long long) MeowU64From( - Hash::Meow::MeowHash(Hash::Meow::MeowDefaultSeed, - strlen(key), - (void *) key), - 0 - ); + Hash::Meow::MeowHash(Hash::Meow::MeowDefaultSeed, strlen(key), (void *) key), 0); } ht *create_table(int max = 0, bool is_fixed = false) @@ -56,8 +51,8 @@ namespace Stdlib::HashTable return NULL; } - table->size = 0; - table->max = max == 0 ? 16 : max; + table->size = 0; + table->max = max == 0 ? 16 : max; table->is_fixed = is_fixed; table->entries = (entry *) calloc(table->max, sizeof(entry)); @@ -72,7 +67,7 @@ namespace Stdlib::HashTable void *get_entry(ht *table, const char *key) { unsigned long long hash = hash_key(key); - size_t index = (size_t) (hash & (unsigned long long)(table->max - 1)); + size_t index = (size_t) (hash & (unsigned long long) (table->max - 1)); while (table->entries[index].key != NULL) { if (strcmp(key, table->entries[index].key) == 0) { @@ -91,7 +86,7 @@ namespace Stdlib::HashTable const char *_set_entry(entry *entries, size_t max, const char *key, void *value, size_t *size) { unsigned long long hash = hash_key(key); - size_t index = (size_t) (hash & (unsigned long long)(max - 1)); + size_t index = (size_t) (hash & (unsigned long long) (max - 1)); while (entries[index].key != NULL) { if (strcmp(key, entries[index].key) == 0) { @@ -107,11 +102,11 @@ namespace Stdlib::HashTable } if (size != NULL) { - #ifdef _WIN32 - key = _strdup(key); - #else - key = strdup(key); - #endif +#ifdef _WIN32 + key = _strdup(key); +#else + key = strdup(key); +#endif if (key == NULL) { return NULL; @@ -120,7 +115,7 @@ namespace Stdlib::HashTable ++(*size); } - entries[index].key = (char *) key; + entries[index].key = (char *) key; entries[index].value = value; return key; @@ -213,6 +208,6 @@ namespace Stdlib::HashTable 
free(table->entries); } -} +} // namespace Stdlib::HashTable #endif \ No newline at end of file diff --git a/Stdlib/Intrinsics.h b/Stdlib/Intrinsics.h index f9fe798..8f0f502 100644 --- a/Stdlib/Intrinsics.h +++ b/Stdlib/Intrinsics.h @@ -10,61 +10,41 @@ #ifndef STDLIB_INTRINSICS_H #define STDLIB_INTRINSICS_H -#include -#include #include +#include #include +#include #include "Types.h" namespace Stdlib::Intrinsics { - inline - f32 sqrt(f32 a) { - return _mm_cvtss_f32(_mm_sqrt_ss(_mm_set_ss(a))); - } + inline f32 sqrt(f32 a) { return _mm_cvtss_f32(_mm_sqrt_ss(_mm_set_ss(a))); } - inline - f32 round(f32 a) { + inline f32 round(f32 a) + { return _mm_cvtss_f32( - _mm_round_ss( - _mm_setzero_ps(), - _mm_set_ss(a), - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) - ) - ); + _mm_round_ss(_mm_setzero_ps(), _mm_set_ss(a), (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))); } - inline - uint32 round_to_int(f32 a) { - return (uint32) _mm_cvtss_si32(_mm_set_ss(a)); - } + inline uint32 round_to_int(f32 a) { return (uint32) _mm_cvtss_si32(_mm_set_ss(a)); } - inline - f32 floor(f32 a) { - return _mm_cvtss_f32(_mm_floor_ss(_mm_setzero_ps(), _mm_set_ss(a))); - } + inline f32 floor(f32 a) { return _mm_cvtss_f32(_mm_floor_ss(_mm_setzero_ps(), _mm_set_ss(a))); } - inline - f32 ceil(f32 a) { - return _mm_cvtss_f32(_mm_ceil_ss(_mm_setzero_ps(), _mm_set_ss(a))); - } + inline f32 ceil(f32 a) { return _mm_cvtss_f32(_mm_ceil_ss(_mm_setzero_ps(), _mm_set_ss(a))); } - inline - uint32 hash(uint64 a, uint64 b = 0) { + inline uint32 hash(uint64 a, uint64 b = 0) + { uint8 seed[16] = { - 0xaa, 0x9b, 0xbd, 0xb8, - 0xa1, 0x98, 0xac, 0x3f, - 0x1f, 0x94, 0x07, 0xb3, - 0x8c, 0x27, 0x93, 0x69, + 0xaa, 0x9b, 0xbd, 0xb8, 0xa1, 0x98, 0xac, 0x3f, 0x1f, 0x94, 0x07, 0xb3, 0x8c, 0x27, 0x93, 0x69, }; __m128i hash = _mm_set_epi64x(a, b); - hash = _mm_aesdec_si128(hash, _mm_loadu_si128((__m128i *) seed)); - hash = _mm_aesdec_si128(hash, _mm_loadu_si128((__m128i *) seed)); + hash = _mm_aesdec_si128(hash, 
_mm_loadu_si128((__m128i *) seed)); + hash = _mm_aesdec_si128(hash, _mm_loadu_si128((__m128i *) seed)); return _mm_extract_epi32(hash, 0); } -} +} // namespace Stdlib::Intrinsics #endif \ No newline at end of file diff --git a/Stdlib/SIMD/SIMD_F32.h b/Stdlib/SIMD/SIMD_F32.h index 367fafa..40b73f8 100644 --- a/Stdlib/SIMD/SIMD_F32.h +++ b/Stdlib/SIMD/SIMD_F32.h @@ -10,8 +10,8 @@ #ifndef STDLIB_SIMD_F32_H #define STDLIB_SIMD_F32_H -#include #include +#include #include "../Types.h" @@ -38,8 +38,7 @@ namespace Stdlib::SIMD }; }; - inline - f32_4_simd load_f32_4_simd(f32 *mem) + inline f32_4_simd load_f32_4_simd(f32 *mem) { f32_4_simd simd; simd.s = _mm_loadu_ps(mem); @@ -47,8 +46,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_4_simd init_f32_4_simd(f32 *mem) + inline f32_4_simd init_f32_4_simd(f32 *mem) { f32_4_simd simd; simd.s = _mm_set_ps(mem[0], mem[1], mem[2], mem[3]); @@ -56,14 +54,9 @@ namespace Stdlib::SIMD return simd; } - inline - void unload_f32_4_simd(f32_4_simd a, f32* array) - { - _mm_store_ps(array, a.s); - } + inline void unload_f32_4_simd(f32_4_simd a, f32 *array) { _mm_store_ps(array, a.s); } - inline - f32_8_simd load_f32_8_simd(f32 *mem) + inline f32_8_simd load_f32_8_simd(f32 *mem) { f32_8_simd simd; simd.s = _mm256_loadu_ps(mem); @@ -71,26 +64,17 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd init_f32_8_simd(f32 *mem) + inline f32_8_simd init_f32_8_simd(f32 *mem) { f32_8_simd simd; - simd.s = _mm256_set_ps( - mem[0], mem[1], mem[2], mem[3], - mem[4], mem[5], mem[6], mem[7] - ); + simd.s = _mm256_set_ps(mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7]); return simd; } - inline - void unload_f32_8_simd(f32_8_simd a, f32* array) - { - _mm256_store_ps(array, a.s); - } + inline void unload_f32_8_simd(f32_8_simd a, f32 *array) { _mm256_store_ps(array, a.s); } - inline - f32_16_simd load_f32_16_simd(f32 *mem) + inline f32_16_simd load_f32_16_simd(f32 *mem) { f32_16_simd simd; simd.s = _mm512_loadu_ps(mem); @@ 
-98,28 +82,18 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd init_f32_16_simd(f32 *mem) + inline f32_16_simd init_f32_16_simd(f32 *mem) { f32_16_simd simd; - simd.s = _mm512_set_ps( - mem[0], mem[1], mem[2], mem[3], - mem[4], mem[5], mem[6], mem[7], - mem[8], mem[9], mem[10], mem[11], - mem[12], mem[13], mem[14], mem[15] - ); + simd.s = _mm512_set_ps(mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7], mem[8], mem[9], mem[10], + mem[11], mem[12], mem[13], mem[14], mem[15]); return simd; } - inline - void unload_f32_16_simd(f32_16_simd a, f32* array) - { - _mm512_store_ps(array, a.s); - } + inline void unload_f32_16_simd(f32_16_simd a, f32 *array) { _mm512_store_ps(array, a.s); } - inline - f32_4_simd init_zero_f32_4_simd() + inline f32_4_simd init_zero_f32_4_simd() { f32_4_simd simd; simd.s = _mm_setzero_ps(); @@ -127,8 +101,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd init_zero_f32_8_simd() + inline f32_8_simd init_zero_f32_8_simd() { f32_8_simd simd; simd.s = _mm256_setzero_ps(); @@ -136,8 +109,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd init_zero_f32_16_simd() + inline f32_16_simd init_zero_f32_16_simd() { f32_16_simd simd; simd.s = _mm512_setzero_ps(); @@ -145,8 +117,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_4_simd operator+(f32_4_simd a, f32_4_simd b) + inline f32_4_simd operator+(f32_4_simd a, f32_4_simd b) { f32_4_simd simd; simd.s = _mm_add_ps(a.s, b.s); @@ -154,8 +125,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd operator+(f32_8_simd a, f32_8_simd b) + inline f32_8_simd operator+(f32_8_simd a, f32_8_simd b) { f32_8_simd simd; simd.s = _mm256_add_ps(a.s, b.s); @@ -163,8 +133,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd operator+(f32_16_simd a, f32_16_simd b) + inline f32_16_simd operator+(f32_16_simd a, f32_16_simd b) { f32_16_simd simd; simd.s = _mm512_add_ps(a.s, b.s); @@ -172,8 +141,7 @@ namespace Stdlib::SIMD return simd; } - 
inline - f32_4_simd operator-(f32_4_simd a, f32_4_simd b) + inline f32_4_simd operator-(f32_4_simd a, f32_4_simd b) { f32_4_simd simd; simd.s = _mm_sub_ps(a.s, b.s); @@ -181,14 +149,9 @@ namespace Stdlib::SIMD return simd; } - inline - f32_4_simd operator-(f32_4_simd a) - { - return init_zero_f32_4_simd() - a; - } + inline f32_4_simd operator-(f32_4_simd a) { return init_zero_f32_4_simd() - a; } - inline - f32_8_simd operator-(f32_8_simd a, f32_8_simd b) + inline f32_8_simd operator-(f32_8_simd a, f32_8_simd b) { f32_8_simd simd; simd.s = _mm256_sub_ps(a.s, b.s); @@ -196,14 +159,9 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd operator-(f32_8_simd a) - { - return init_zero_f32_8_simd() - a; - } + inline f32_8_simd operator-(f32_8_simd a) { return init_zero_f32_8_simd() - a; } - inline - f32_16_simd operator-(f32_16_simd a, f32_16_simd b) + inline f32_16_simd operator-(f32_16_simd a, f32_16_simd b) { f32_16_simd simd; simd.s = _mm512_sub_ps(a.s, b.s); @@ -211,14 +169,9 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd operator-(f32_16_simd a) - { - return init_zero_f32_16_simd() - a; - } + inline f32_16_simd operator-(f32_16_simd a) { return init_zero_f32_16_simd() - a; } - inline - f32_4_simd operator*(f32_4_simd a, f32_4_simd b) + inline f32_4_simd operator*(f32_4_simd a, f32_4_simd b) { f32_4_simd simd; simd.s = _mm_mul_ps(a.s, b.s); @@ -226,8 +179,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd operator*(f32_8_simd a, f32_8_simd b) + inline f32_8_simd operator*(f32_8_simd a, f32_8_simd b) { f32_8_simd simd; simd.s = _mm256_mul_ps(a.s, b.s); @@ -235,8 +187,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd operator*(f32_16_simd a, f32_16_simd b) + inline f32_16_simd operator*(f32_16_simd a, f32_16_simd b) { f32_16_simd simd; simd.s = _mm512_mul_ps(a.s, b.s); @@ -244,8 +195,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_4_simd operator/(f32_4_simd a, f32_4_simd b) + inline f32_4_simd 
operator/(f32_4_simd a, f32_4_simd b) { f32_4_simd simd; simd.s = _mm_div_ps(a.s, b.s); @@ -253,8 +203,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd operator/(f32_8_simd a, f32_8_simd b) + inline f32_8_simd operator/(f32_8_simd a, f32_8_simd b) { f32_8_simd simd; simd.s = _mm256_div_ps(a.s, b.s); @@ -262,8 +211,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd operator/(f32_16_simd a, f32_16_simd b) + inline f32_16_simd operator/(f32_16_simd a, f32_16_simd b) { f32_16_simd simd; simd.s = _mm512_div_ps(a.s, b.s); @@ -271,8 +219,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_4_simd operator^(f32_4_simd a, f32_4_simd b) + inline f32_4_simd operator^(f32_4_simd a, f32_4_simd b) { f32_4_simd simd; simd.s = _mm_xor_ps(a.s, b.s); @@ -280,8 +227,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd operator^(f32_8_simd a, f32_8_simd b) + inline f32_8_simd operator^(f32_8_simd a, f32_8_simd b) { f32_8_simd simd; simd.s = _mm256_xor_ps(a.s, b.s); @@ -289,8 +235,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd operator^(f32_16_simd a, f32_16_simd b) + inline f32_16_simd operator^(f32_16_simd a, f32_16_simd b) { f32_16_simd simd; simd.s = _mm512_xor_ps(a.s, b.s); @@ -298,128 +243,112 @@ namespace Stdlib::SIMD return simd; } - inline - f32_4_simd& operator-=(f32_4_simd &a, f32_4_simd b) + inline f32_4_simd &operator-=(f32_4_simd &a, f32_4_simd b) { a = a - b; return a; } - inline - f32_8_simd& operator-=(f32_8_simd &a, f32_8_simd b) + inline f32_8_simd &operator-=(f32_8_simd &a, f32_8_simd b) { a = a - b; return a; } - inline - f32_16_simd& operator-=(f32_16_simd &a, f32_16_simd b) + inline f32_16_simd &operator-=(f32_16_simd &a, f32_16_simd b) { a = a - b; return a; } - inline - f32_4_simd& operator+=(f32_4_simd &a, f32_4_simd b) + inline f32_4_simd &operator+=(f32_4_simd &a, f32_4_simd b) { a = a + b; return a; } - inline - f32_8_simd& operator+=(f32_8_simd &a, f32_8_simd b) + inline f32_8_simd 
&operator+=(f32_8_simd &a, f32_8_simd b) { a = a + b; return a; } - inline - f32_16_simd& operator+=(f32_16_simd &a, f32_16_simd b) + inline f32_16_simd &operator+=(f32_16_simd &a, f32_16_simd b) { a = a + b; return a; } - inline - f32_4_simd& operator*=(f32_4_simd &a, f32_4_simd b) + inline f32_4_simd &operator*=(f32_4_simd &a, f32_4_simd b) { a = a * b; return a; } - inline - f32_8_simd& operator*=(f32_8_simd &a, f32_8_simd b) + inline f32_8_simd &operator*=(f32_8_simd &a, f32_8_simd b) { a = a * b; return a; } - inline - f32_16_simd& operator*=(f32_16_simd &a, f32_16_simd b) + inline f32_16_simd &operator*=(f32_16_simd &a, f32_16_simd b) { a = a * b; return a; } - inline - f32_4_simd& operator/=(f32_4_simd &a, f32_4_simd b) + inline f32_4_simd &operator/=(f32_4_simd &a, f32_4_simd b) { a = a / b; return a; } - inline - f32_8_simd& operator/=(f32_8_simd &a, f32_8_simd b) + inline f32_8_simd &operator/=(f32_8_simd &a, f32_8_simd b) { a = a / b; return a; } - inline - f32_16_simd& operator/=(f32_16_simd &a, f32_16_simd b) + inline f32_16_simd &operator/=(f32_16_simd &a, f32_16_simd b) { a = a / b; return a; } - inline - f32_4_simd& operator^=(f32_4_simd &a, f32_4_simd b) + inline f32_4_simd &operator^=(f32_4_simd &a, f32_4_simd b) { a = a ^ b; return a; } - inline - f32_8_simd& operator^=(f32_8_simd &a, f32_8_simd b) + inline f32_8_simd &operator^=(f32_8_simd &a, f32_8_simd b) { a = a ^ b; return a; } - inline - f32_16_simd& operator^=(f32_16_simd &a, f32_16_simd b) + inline f32_16_simd &operator^=(f32_16_simd &a, f32_16_simd b) { a = a ^ b; return a; } - inline - f32_4_simd operator<(f32_4_simd a, f32_4_simd b) + inline f32_4_simd operator<(f32_4_simd a, f32_4_simd b) { f32_4_simd simd; simd.s = _mm_cmplt_ps(a.s, b.s); @@ -427,8 +356,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd operator<(f32_8_simd a, f32_8_simd b) + inline f32_8_simd operator<(f32_8_simd a, f32_8_simd b) { f32_8_simd simd; simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_LT_OQ); @@ 
-436,21 +364,15 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd operator<(f32_16_simd a, f32_16_simd b) + inline f32_16_simd operator<(f32_16_simd a, f32_16_simd b) { f32_16_simd simd; - simd.s = _mm512_mask_blend_ps( - _mm512_cmplt_ps_mask(a.s, b.s), - a.s, - b.s - ); + simd.s = _mm512_mask_blend_ps(_mm512_cmplt_ps_mask(a.s, b.s), a.s, b.s); return simd; } - inline - f32_4_simd operator<=(f32_4_simd a, f32_4_simd b) + inline f32_4_simd operator<=(f32_4_simd a, f32_4_simd b) { f32_4_simd simd; simd.s = _mm_cmple_ps(a.s, b.s); @@ -458,8 +380,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd operator<=(f32_8_simd a, f32_8_simd b) + inline f32_8_simd operator<=(f32_8_simd a, f32_8_simd b) { f32_8_simd simd; simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_LE_OQ); @@ -467,21 +388,15 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd operator<=(f32_16_simd a, f32_16_simd b) + inline f32_16_simd operator<=(f32_16_simd a, f32_16_simd b) { f32_16_simd simd; - simd.s = _mm512_mask_blend_ps( - _mm512_cmp_ps_mask(a.s, b.s, _CMP_LE_OQ), - a.s, - b.s - ); + simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_LE_OQ), a.s, b.s); return simd; } - inline - f32_4_simd operator>(f32_4_simd a, f32_4_simd b) + inline f32_4_simd operator>(f32_4_simd a, f32_4_simd b) { f32_4_simd simd; simd.s = _mm_cmpgt_ps(a.s, b.s); @@ -489,8 +404,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd operator>(f32_8_simd a, f32_8_simd b) + inline f32_8_simd operator>(f32_8_simd a, f32_8_simd b) { f32_8_simd simd; simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_GT_OQ); @@ -498,21 +412,15 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd operator>(f32_16_simd a, f32_16_simd b) + inline f32_16_simd operator>(f32_16_simd a, f32_16_simd b) { f32_16_simd simd; - simd.s = _mm512_mask_blend_ps( - _mm512_cmp_ps_mask(a.s, b.s, _CMP_GT_OQ), - a.s, - b.s - ); + simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_GT_OQ), a.s, b.s); return simd; 
} - inline - f32_4_simd operator>=(f32_4_simd a, f32_4_simd b) + inline f32_4_simd operator>=(f32_4_simd a, f32_4_simd b) { f32_4_simd simd; simd.s = _mm_cmpge_ps(a.s, b.s); @@ -520,8 +428,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd operator>=(f32_8_simd a, f32_8_simd b) + inline f32_8_simd operator>=(f32_8_simd a, f32_8_simd b) { f32_8_simd simd; simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_GE_OQ); @@ -529,21 +436,15 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd operator>=(f32_16_simd a, f32_16_simd b) + inline f32_16_simd operator>=(f32_16_simd a, f32_16_simd b) { f32_16_simd simd; - simd.s = _mm512_mask_blend_ps( - _mm512_cmp_ps_mask(a.s, b.s, _CMP_GE_OQ), - a.s, - b.s - ); + simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_GE_OQ), a.s, b.s); return simd; } - inline - f32_4_simd operator==(f32_4_simd a, f32_4_simd b) + inline f32_4_simd operator==(f32_4_simd a, f32_4_simd b) { f32_4_simd simd; simd.s = _mm_cmpeq_ps(a.s, b.s); @@ -551,8 +452,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd operator==(f32_8_simd a, f32_8_simd b) + inline f32_8_simd operator==(f32_8_simd a, f32_8_simd b) { f32_8_simd simd; simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_EQ_OQ); @@ -560,21 +460,15 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd operator==(f32_16_simd a, f32_16_simd b) + inline f32_16_simd operator==(f32_16_simd a, f32_16_simd b) { f32_16_simd simd; - simd.s = _mm512_mask_blend_ps( - _mm512_cmp_ps_mask(a.s, b.s, _CMP_EQ_OQ), - a.s, - b.s - ); + simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_EQ_OQ), a.s, b.s); return simd; } - inline - f32_4_simd operator!=(f32_4_simd a, f32_4_simd b) + inline f32_4_simd operator!=(f32_4_simd a, f32_4_simd b) { f32_4_simd simd; simd.s = _mm_cmpneq_ps(a.s, b.s); @@ -582,8 +476,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd operator!=(f32_8_simd a, f32_8_simd b) + inline f32_8_simd operator!=(f32_8_simd a, f32_8_simd b) { f32_8_simd 
simd; simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_NEQ_OQ); @@ -591,21 +484,15 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd operator!=(f32_16_simd a, f32_16_simd b) + inline f32_16_simd operator!=(f32_16_simd a, f32_16_simd b) { f32_16_simd simd; - simd.s = _mm512_mask_blend_ps( - _mm512_cmp_ps_mask(a.s, b.s, _CMP_NEQ_OQ), - a.s, - b.s - ); + simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_NEQ_OQ), a.s, b.s); return simd; } - inline - f32_4_simd operator&(f32_4_simd a, f32_4_simd b) + inline f32_4_simd operator&(f32_4_simd a, f32_4_simd b) { f32_4_simd simd; simd.s = _mm_and_ps(a.s, b.s); @@ -613,8 +500,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd operator&(f32_8_simd a, f32_8_simd b) + inline f32_8_simd operator&(f32_8_simd a, f32_8_simd b) { f32_8_simd simd; simd.s = _mm256_and_ps(a.s, b.s); @@ -622,8 +508,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd operator&(f32_16_simd a, f32_16_simd b) + inline f32_16_simd operator&(f32_16_simd a, f32_16_simd b) { f32_16_simd simd; simd.s = _mm512_and_ps(a.s, b.s); @@ -631,8 +516,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_4_simd operator|(f32_4_simd a, f32_4_simd b) + inline f32_4_simd operator|(f32_4_simd a, f32_4_simd b) { f32_4_simd simd; simd.s = _mm_or_ps(a.s, b.s); @@ -640,8 +524,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd operator|(f32_8_simd a, f32_8_simd b) + inline f32_8_simd operator|(f32_8_simd a, f32_8_simd b) { f32_8_simd simd; simd.s = _mm256_or_ps(a.s, b.s); @@ -649,8 +532,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd operator|(f32_16_simd a, f32_16_simd b) + inline f32_16_simd operator|(f32_16_simd a, f32_16_simd b) { f32_16_simd simd; simd.s = _mm512_or_ps(a.s, b.s); @@ -658,59 +540,52 @@ namespace Stdlib::SIMD return simd; } - inline - f32_4_simd& operator&=(f32_4_simd &a, f32_4_simd b) + inline f32_4_simd &operator&=(f32_4_simd &a, f32_4_simd b) { a = a & b; return a; } - inline - 
f32_8_simd& operator&=(f32_8_simd &a, f32_8_simd b) + inline f32_8_simd &operator&=(f32_8_simd &a, f32_8_simd b) { a = a & b; return a; } - inline - f32_16_simd& operator&=(f32_16_simd &a, f32_16_simd b) + inline f32_16_simd &operator&=(f32_16_simd &a, f32_16_simd b) { a = a & b; return a; } - inline - f32_4_simd& operator|=(f32_4_simd &a, f32_4_simd b) + inline f32_4_simd &operator|=(f32_4_simd &a, f32_4_simd b) { a = a | b; return a; } - inline - f32_8_simd& operator|=(f32_8_simd &a, f32_8_simd b) + inline f32_8_simd &operator|=(f32_8_simd &a, f32_8_simd b) { a = a | b; return a; } - inline - f32_16_simd& operator|=(f32_16_simd &a, f32_16_simd b) + inline f32_16_simd &operator|=(f32_16_simd &a, f32_16_simd b) { a = a | b; return a; } - inline - f32_4_simd abs(f32_4_simd a) + inline f32_4_simd abs(f32_4_simd a) { unsigned int unsigned_mask = (unsigned int) (1 << 31); - __m128 mask = _mm_set1_ps(*(float *) &unsigned_mask); + __m128 mask = _mm_set1_ps(*(float *) &unsigned_mask); f32_4_simd simd; simd.s = _mm_and_ps(a.s, mask); @@ -718,11 +593,10 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd abs(f32_8_simd a) + inline f32_8_simd abs(f32_8_simd a) { unsigned int unsigned_mask = (unsigned int) (1 << 31); - __m256 mask = _mm256_set1_ps(*(float *) &unsigned_mask); + __m256 mask = _mm256_set1_ps(*(float *) &unsigned_mask); f32_8_simd simd; simd.s = _mm256_and_ps(a.s, mask); @@ -730,11 +604,10 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd abs(f32_16_simd a) + inline f32_16_simd abs(f32_16_simd a) { unsigned int unsigned_mask = (unsigned int) (1 << 31); - __m512 mask = _mm512_set1_ps(*(float *) &unsigned_mask); + __m512 mask = _mm512_set1_ps(*(float *) &unsigned_mask); f32_16_simd simd; simd.s = _mm512_and_ps(a.s, mask); @@ -742,8 +615,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_4_simd min(f32_4_simd a, f32_4_simd b) + inline f32_4_simd min(f32_4_simd a, f32_4_simd b) { f32_4_simd simd; simd.s = _mm_min_ps(a.s, b.s); @@ 
-751,8 +623,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd min(f32_8_simd a, f32_8_simd b) + inline f32_8_simd min(f32_8_simd a, f32_8_simd b) { f32_8_simd simd; simd.s = _mm256_min_ps(a.s, b.s); @@ -760,8 +631,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd min(f32_16_simd a, f32_16_simd b) + inline f32_16_simd min(f32_16_simd a, f32_16_simd b) { f32_16_simd simd; simd.s = _mm512_min_ps(a.s, b.s); @@ -769,8 +639,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_4_simd max(f32_4_simd a, f32_4_simd b) + inline f32_4_simd max(f32_4_simd a, f32_4_simd b) { f32_4_simd simd; simd.s = _mm_max_ps(a.s, b.s); @@ -778,8 +647,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd max(f32_8_simd a, f32_8_simd b) + inline f32_8_simd max(f32_8_simd a, f32_8_simd b) { f32_8_simd simd; simd.s = _mm256_max_ps(a.s, b.s); @@ -787,8 +655,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd max(f32_16_simd a, f32_16_simd b) + inline f32_16_simd max(f32_16_simd a, f32_16_simd b) { f32_16_simd simd; simd.s = _mm512_max_ps(a.s, b.s); @@ -796,11 +663,10 @@ namespace Stdlib::SIMD return simd; } - inline - f32_4_simd sign(f32_4_simd a) + inline f32_4_simd sign(f32_4_simd a) { unsigned int umask = (unsigned int) (1 << 31); - __m128 mask = _mm_set1_ps(*(float *) &umask); + __m128 mask = _mm_set1_ps(*(float *) &umask); f32_4_simd signBit; signBit.s = _mm_and_ps(a.s, mask); @@ -813,11 +679,10 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd sign(f32_8_simd a) + inline f32_8_simd sign(f32_8_simd a) { unsigned int umask = (unsigned int) (1 << 31); - __m256 mask = _mm256_set1_ps(*(float *) &umask); + __m256 mask = _mm256_set1_ps(*(float *) &umask); f32_8_simd signBit; signBit.s = _mm256_and_ps(a.s, mask); @@ -830,11 +695,10 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd sign(f32_16_simd a) + inline f32_16_simd sign(f32_16_simd a) { unsigned int umask = (unsigned int) (1 << 31); - __m512 mask = 
_mm512_set1_ps(*(float *) &umask); + __m512 mask = _mm512_set1_ps(*(float *) &umask); f32_16_simd signBit; signBit.s = _mm512_and_ps(a.s, mask); @@ -847,8 +711,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_4_simd floor(f32_4_simd a) + inline f32_4_simd floor(f32_4_simd a) { f32_4_simd simd; simd.s = _mm_floor_ps(a.s); @@ -856,8 +719,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd floor(f32_8_simd a) + inline f32_8_simd floor(f32_8_simd a) { f32_8_simd simd; simd.s = _mm256_floor_ps(a.s); @@ -865,8 +727,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd floor(f32_16_simd a) + inline f32_16_simd floor(f32_16_simd a) { f32_16_simd simd; simd.s = _mm512_floor_ps(a.s); @@ -874,8 +735,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_4_simd ceil(f32_4_simd a) + inline f32_4_simd ceil(f32_4_simd a) { f32_4_simd simd; simd.s = _mm_ceil_ps(a.s); @@ -883,8 +743,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd ceil(f32_8_simd a) + inline f32_8_simd ceil(f32_8_simd a) { f32_8_simd simd; simd.s = _mm256_ceil_ps(a.s); @@ -892,8 +751,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd ceil(f32_16_simd a) + inline f32_16_simd ceil(f32_16_simd a) { f32_16_simd simd; simd.s = _mm512_ceil_ps(a.s); @@ -901,8 +759,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_4_simd sqrt(f32_4_simd a) + inline f32_4_simd sqrt(f32_4_simd a) { f32_4_simd simd; simd.s = _mm_sqrt_ps(a.s); @@ -910,8 +767,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd sqrt(f32_8_simd a) + inline f32_8_simd sqrt(f32_8_simd a) { f32_8_simd simd; simd.s = _mm256_sqrt_ps(a.s); @@ -919,8 +775,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd sqrt(f32_16_simd a) + inline f32_16_simd sqrt(f32_16_simd a) { f32_16_simd simd; simd.s = _mm512_sqrt_ps(a.s); @@ -928,8 +783,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_4_simd sqrt_inv_approx(f32_4_simd a) + inline f32_4_simd sqrt_inv_approx(f32_4_simd 
a) { f32_4_simd simd; simd.s = _mm_rsqrt_ps(a.s); @@ -937,8 +791,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd sqrt_inv_approx(f32_8_simd a) + inline f32_8_simd sqrt_inv_approx(f32_8_simd a) { f32_8_simd simd; simd.s = _mm256_rsqrt_ps(a.s); @@ -946,8 +799,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd sqrt_inv_approx(f32_16_simd a) + inline f32_16_simd sqrt_inv_approx(f32_16_simd a) { f32_16_simd simd; simd.s = _mm512_rsqrt14_ps(a.s); @@ -955,8 +807,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_4_simd one_over_approx(f32_4_simd a) + inline f32_4_simd one_over_approx(f32_4_simd a) { f32_4_simd simd; simd.s = _mm_rcp_ps(a.s); @@ -964,8 +815,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_8_simd one_over_approx(f32_8_simd a) + inline f32_8_simd one_over_approx(f32_8_simd a) { f32_8_simd simd; simd.s = _mm256_rcp_ps(a.s); @@ -973,8 +823,7 @@ namespace Stdlib::SIMD return simd; } - inline - f32_16_simd one_over_approx(f32_16_simd a) + inline f32_16_simd one_over_approx(f32_16_simd a) { f32_16_simd simd; simd.s = _mm512_rcp14_ps(a.s); @@ -982,120 +831,105 @@ namespace Stdlib::SIMD return simd; } - inline - f32_4_simd clamp(f32_4_simd min_value, f32_4_simd a, f32_4_simd max_value) + inline f32_4_simd clamp(f32_4_simd min_value, f32_4_simd a, f32_4_simd max_value) { return min(max(a, min_value), max_value); } - inline - f32_8_simd clamp(f32_8_simd min_value, f32_8_simd a, f32_8_simd max_value) + inline f32_8_simd clamp(f32_8_simd min_value, f32_8_simd a, f32_8_simd max_value) { return min(max(a, min_value), max_value); } - inline - f32_16_simd clamp(f32_16_simd min_value, f32_16_simd a, f32_16_simd max_value) + inline f32_16_simd clamp(f32_16_simd min_value, f32_16_simd a, f32_16_simd max_value) { return min(max(a, min_value), max_value); } - inline - int32 which_true(f32_4_simd a) + inline int32 which_true(f32_4_simd a) { int32 which_true = _mm_movemask_ps(a.s); return which_true; } - inline - int32 
which_true(f32_8_simd a) + inline int32 which_true(f32_8_simd a) { int32 which_true = _mm256_movemask_ps(a.s); return which_true; } - inline - int32 which_true(f32_16_simd a) + inline int32 which_true(f32_16_simd a) { int32 which_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)); return which_true; } - inline - bool any_true(f32_4_simd a) + inline bool any_true(f32_4_simd a) { bool is_any_true = _mm_movemask_ps(a.s) > 0; return is_any_true; } - inline - bool any_true(f32_8_simd a) + inline bool any_true(f32_8_simd a) { bool is_any_true = _mm256_movemask_ps(a.s) > 0; return is_any_true; } - inline - bool any_true(f32_16_simd a) + inline bool any_true(f32_16_simd a) { bool is_any_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) > 0; return is_any_true; } - inline - bool all_true(f32_4_simd a) + inline bool all_true(f32_4_simd a) { bool is_true = (_mm_movemask_ps(a.s) == 15); return is_true; } - inline - bool all_true(f32_8_simd a) + inline bool all_true(f32_8_simd a) { bool is_true = (_mm256_movemask_ps(a.s) == 255); return is_true; } - inline - bool all_true(f32_16_simd a) + inline bool all_true(f32_16_simd a) { bool is_true = (_mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 65535); return is_true; } - inline - bool all_false(f32_4_simd a) + inline bool all_false(f32_4_simd a) { bool is_false = (_mm_movemask_ps(a.s) == 0); return is_false; } - inline - bool all_false(f32_8_simd a) + inline bool all_false(f32_8_simd a) { bool is_false = (_mm256_movemask_ps(a.s) == 0); return is_false; } - inline - bool all_false(f32_16_simd a) + inline bool all_false(f32_16_simd a) { // @todo This can be optimized (requires also changes in the comparison functions return) bool is_false = (_mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 0); return is_false; } -} +} // namespace Stdlib::SIMD #endif diff --git a/Stdlib/SIMD/SIMD_Helper.h b/Stdlib/SIMD/SIMD_Helper.h index 8dea4fe..e84f0df 100644 --- a/Stdlib/SIMD/SIMD_Helper.h +++ b/Stdlib/SIMD/SIMD_Helper.h @@ -10,9 +10,9 
@@ #ifndef STDLIB_SIMD_HELPER_H #define STDLIB_SIMD_HELPER_H +#include #include #include -#include namespace Stdlib::SIMD { @@ -22,11 +22,7 @@ namespace Stdlib::SIMD eax = 1; // CPUID function 1 - __asm__ __volatile__( - "cpuid;" - : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) - : "a" (eax) - ); + __asm__ __volatile__("cpuid;" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(eax)); // Check the AVX feature bit in ECX return (ecx >> 28) & 1; @@ -39,11 +35,7 @@ namespace Stdlib::SIMD eax = 7; // CPUID function 7 ecx = 0; // Sub-function 0 - __asm__ __volatile__( - "cpuid;" - : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) - : "a" (eax), "c" (ecx) - ); + __asm__ __volatile__("cpuid;" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(eax), "c"(ecx)); // Check the AVX-256 (AVX2) feature bit in EBX return (ebx >> 5) & 1; @@ -56,15 +48,11 @@ namespace Stdlib::SIMD eax = 7; // CPUID function 7 ecx = 0; // Sub-function 0 - __asm__ __volatile__( - "cpuid;" - : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) - : "a" (eax), "c" (ecx) - ); + __asm__ __volatile__("cpuid;" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(eax), "c"(ecx)); // Check the AVX-512 feature bit in EBX return (ebx >> 16) & 1; } -} +} // namespace Stdlib::SIMD #endif \ No newline at end of file diff --git a/Stdlib/SIMD/SIMD_I32.h b/Stdlib/SIMD/SIMD_I32.h index b1fadb0..0166ea0 100644 --- a/Stdlib/SIMD/SIMD_I32.h +++ b/Stdlib/SIMD/SIMD_I32.h @@ -10,8 +10,8 @@ #ifndef STDLIB_SIMD_I32_H #define STDLIB_SIMD_I32_H -#include #include +#include #include "../Types.h" #include "SIMD_F32.h" @@ -39,8 +39,7 @@ namespace Stdlib::SIMD }; }; - inline - int32_4_simd load_int32_4_simd(int32 *mem) + inline int32_4_simd load_int32_4_simd(int32 *mem) { int32_4_simd simd; simd.s = _mm_loadu_epi32(mem); @@ -48,8 +47,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_4_simd init_int32_4_simd(int32 *mem) + inline int32_4_simd init_int32_4_simd(int32 *mem) { int32_4_simd simd; simd.s = _mm_set_epi32(mem[0], 
mem[1], mem[2], mem[3]); @@ -57,14 +55,9 @@ namespace Stdlib::SIMD return simd; } - inline - void unload_int32_4_simd(int32_4_simd a, int32* array) - { - _mm_store_epi32(array, a.s); - } + inline void unload_int32_4_simd(int32_4_simd a, int32 *array) { _mm_store_epi32(array, a.s); } - inline - int32_8_simd load_int32_8_simd(int32 *mem) + inline int32_8_simd load_int32_8_simd(int32 *mem) { int32_8_simd simd; simd.s = _mm256_loadu_epi32(mem); @@ -72,26 +65,17 @@ namespace Stdlib::SIMD return simd; } - inline - int32_8_simd init_int32_8_simd(int32 *mem) + inline int32_8_simd init_int32_8_simd(int32 *mem) { int32_8_simd simd; - simd.s = _mm256_set_epi32( - mem[0], mem[1], mem[2], mem[3], - mem[4], mem[5], mem[6], mem[7] - ); + simd.s = _mm256_set_epi32(mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7]); return simd; } - inline - void unload_int32_8_simd(int32_8_simd a, int32* array) - { - _mm256_store_epi32(array, a.s); - } + inline void unload_int32_8_simd(int32_8_simd a, int32 *array) { _mm256_store_epi32(array, a.s); } - inline - int32_16_simd load_int32_16_simd(int32 *mem) + inline int32_16_simd load_int32_16_simd(int32 *mem) { int32_16_simd simd; simd.s = _mm512_loadu_epi32(mem); @@ -99,28 +83,18 @@ namespace Stdlib::SIMD return simd; } - inline - int32_16_simd init_int32_16_simd(int32 *mem) + inline int32_16_simd init_int32_16_simd(int32 *mem) { int32_16_simd simd; - simd.s = _mm512_set_epi32( - mem[0], mem[1], mem[2], mem[3], - mem[4], mem[5], mem[6], mem[7], - mem[8], mem[9], mem[10], mem[11], - mem[12], mem[13], mem[14], mem[15] - ); + simd.s = _mm512_set_epi32(mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7], mem[8], mem[9], + mem[10], mem[11], mem[12], mem[13], mem[14], mem[15]); return simd; } - inline - void unload_int32_16_simd(int32_16_simd a, int32* array) - { - _mm512_store_epi32(array, a.s); - } + inline void unload_int32_16_simd(int32_16_simd a, int32 *array) { _mm512_store_epi32(array, a.s); } - inline - int32_4_simd 
init_zero_int32_4_simd() + inline int32_4_simd init_zero_int32_4_simd() { int32_4_simd simd; simd.s = _mm_setzero_si128(); @@ -128,8 +102,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_8_simd init_zero_int32_8_simd() + inline int32_8_simd init_zero_int32_8_simd() { int32_8_simd simd; simd.s = _mm256_setzero_si256(); @@ -137,8 +110,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_16_simd init_zero_int32_16_simd() + inline int32_16_simd init_zero_int32_16_simd() { int32_16_simd simd; simd.s = _mm512_setzero_epi32(); @@ -146,8 +118,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_4_simd operator+(int32_4_simd a, int32_4_simd b) + inline int32_4_simd operator+(int32_4_simd a, int32_4_simd b) { int32_4_simd simd; simd.s = _mm_add_epi32(a.s, b.s); @@ -155,8 +126,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_8_simd operator+(int32_8_simd a, int32_8_simd b) + inline int32_8_simd operator+(int32_8_simd a, int32_8_simd b) { int32_8_simd simd; simd.s = _mm256_add_epi32(a.s, b.s); @@ -164,8 +134,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_16_simd operator+(int32_16_simd a, int32_16_simd b) + inline int32_16_simd operator+(int32_16_simd a, int32_16_simd b) { int32_16_simd simd; simd.s = _mm512_add_epi32(a.s, b.s); @@ -173,8 +142,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_4_simd operator-(int32_4_simd a, int32_4_simd b) + inline int32_4_simd operator-(int32_4_simd a, int32_4_simd b) { int32_4_simd simd; simd.s = _mm_sub_epi32(a.s, b.s); @@ -182,14 +150,9 @@ namespace Stdlib::SIMD return simd; } - inline - int32_4_simd operator-(int32_4_simd a) - { - return init_zero_int32_4_simd() - a; - } + inline int32_4_simd operator-(int32_4_simd a) { return init_zero_int32_4_simd() - a; } - inline - int32_8_simd operator-(int32_8_simd a, int32_8_simd b) + inline int32_8_simd operator-(int32_8_simd a, int32_8_simd b) { int32_8_simd simd; simd.s = _mm256_sub_epi32(a.s, b.s); @@ -197,14 +160,9 @@ namespace 
Stdlib::SIMD return simd; } - inline - int32_8_simd operator-(int32_8_simd a) - { - return init_zero_int32_8_simd() - a; - } + inline int32_8_simd operator-(int32_8_simd a) { return init_zero_int32_8_simd() - a; } - inline - int32_16_simd operator-(int32_16_simd a, int32_16_simd b) + inline int32_16_simd operator-(int32_16_simd a, int32_16_simd b) { int32_16_simd simd; simd.s = _mm512_sub_epi32(a.s, b.s); @@ -212,14 +170,9 @@ namespace Stdlib::SIMD return simd; } - inline - int32_16_simd operator-(int32_16_simd a) - { - return init_zero_int32_16_simd() - a; - } + inline int32_16_simd operator-(int32_16_simd a) { return init_zero_int32_16_simd() - a; } - inline - int32_4_simd operator*(int32_4_simd a, int32_4_simd b) + inline int32_4_simd operator*(int32_4_simd a, int32_4_simd b) { int32_4_simd simd; simd.s = _mm_mul_epi32(a.s, b.s); @@ -227,8 +180,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_8_simd operator*(int32_8_simd a, int32_8_simd b) + inline int32_8_simd operator*(int32_8_simd a, int32_8_simd b) { int32_8_simd simd; simd.s = _mm256_mul_epi32(a.s, b.s); @@ -236,8 +188,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_16_simd operator*(int32_16_simd a, int32_16_simd b) + inline int32_16_simd operator*(int32_16_simd a, int32_16_simd b) { int32_16_simd simd; simd.s = _mm512_mul_epi32(a.s, b.s); @@ -245,8 +196,7 @@ namespace Stdlib::SIMD return simd; } - inline - Stdlib::SIMD::f32_4_simd operator/(int32_4_simd a, int32_4_simd b) + inline Stdlib::SIMD::f32_4_simd operator/(int32_4_simd a, int32_4_simd b) { Stdlib::SIMD::f32_4_simd simd; simd.s = _mm_div_ps(a.s, b.s); @@ -254,8 +204,7 @@ namespace Stdlib::SIMD return simd; } - inline - Stdlib::SIMD::f32_8_simd operator/(int32_8_simd a, int32_8_simd b) + inline Stdlib::SIMD::f32_8_simd operator/(int32_8_simd a, int32_8_simd b) { Stdlib::SIMD::f32_8_simd simd; simd.s = _mm256_div_ps(a.s, b.s); @@ -263,8 +212,7 @@ namespace Stdlib::SIMD return simd; } - inline - Stdlib::SIMD::f32_16_simd 
operator/(int32_16_simd a, int32_16_simd b) + inline Stdlib::SIMD::f32_16_simd operator/(int32_16_simd a, int32_16_simd b) { Stdlib::SIMD::f32_16_simd simd; simd.s = _mm512_div_ps(a.s, b.s); @@ -272,8 +220,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_4_simd operator^(int32_4_simd a, int32_4_simd b) + inline int32_4_simd operator^(int32_4_simd a, int32_4_simd b) { int32_4_simd simd; simd.s = _mm_xor_epi32(a.s, b.s); @@ -281,8 +228,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_8_simd operator^(int32_8_simd a, int32_8_simd b) + inline int32_8_simd operator^(int32_8_simd a, int32_8_simd b) { int32_8_simd simd; simd.s = _mm256_xor_epi32(a.s, b.s); @@ -290,8 +236,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_16_simd operator^(int32_16_simd a, int32_16_simd b) + inline int32_16_simd operator^(int32_16_simd a, int32_16_simd b) { int32_16_simd simd; simd.s = _mm512_xor_epi32(a.s, b.s); @@ -299,128 +244,112 @@ namespace Stdlib::SIMD return simd; } - inline - int32_4_simd& operator-=(int32_4_simd &a, int32_4_simd b) + inline int32_4_simd &operator-=(int32_4_simd &a, int32_4_simd b) { a = a - b; return a; } - inline - int32_8_simd& operator-=(int32_8_simd &a, int32_8_simd b) + inline int32_8_simd &operator-=(int32_8_simd &a, int32_8_simd b) { a = a - b; return a; } - inline - int32_16_simd& operator-=(int32_16_simd &a, int32_16_simd b) + inline int32_16_simd &operator-=(int32_16_simd &a, int32_16_simd b) { a = a - b; return a; } - inline - int32_4_simd& operator+=(int32_4_simd &a, int32_4_simd b) + inline int32_4_simd &operator+=(int32_4_simd &a, int32_4_simd b) { a = a + b; return a; } - inline - int32_8_simd& operator+=(int32_8_simd &a, int32_8_simd b) + inline int32_8_simd &operator+=(int32_8_simd &a, int32_8_simd b) { a = a + b; return a; } - inline - int32_16_simd& operator+=(int32_16_simd &a, int32_16_simd b) + inline int32_16_simd &operator+=(int32_16_simd &a, int32_16_simd b) { a = a + b; return a; } - inline - int32_4_simd& 
operator*=(int32_4_simd &a, int32_4_simd b) + inline int32_4_simd &operator*=(int32_4_simd &a, int32_4_simd b) { a = a * b; return a; } - inline - int32_8_simd& operator*=(int32_8_simd &a, int32_8_simd b) + inline int32_8_simd &operator*=(int32_8_simd &a, int32_8_simd b) { a = a * b; return a; } - inline - int32_16_simd& operator*=(int32_16_simd &a, int32_16_simd b) + inline int32_16_simd &operator*=(int32_16_simd &a, int32_16_simd b) { a = a * b; return a; } - inline - int32_4_simd& operator/=(int32_4_simd &a, int32_4_simd b) + inline int32_4_simd &operator/=(int32_4_simd &a, int32_4_simd b) { a.s = _mm_cvtps_epi32((a / b).s); return a; } - inline - int32_8_simd& operator/=(int32_8_simd &a, int32_8_simd b) + inline int32_8_simd &operator/=(int32_8_simd &a, int32_8_simd b) { a.s = _mm256_cvtps_epi32((a / b).s); return a; } - inline - int32_16_simd& operator/=(int32_16_simd &a, int32_16_simd b) + inline int32_16_simd &operator/=(int32_16_simd &a, int32_16_simd b) { a.s = _mm512_cvtps_epi32((a / b).s); return a; } - inline - int32_4_simd& operator^=(int32_4_simd &a, int32_4_simd b) + inline int32_4_simd &operator^=(int32_4_simd &a, int32_4_simd b) { a = a ^ b; return a; } - inline - int32_8_simd& operator^=(int32_8_simd &a, int32_8_simd b) + inline int32_8_simd &operator^=(int32_8_simd &a, int32_8_simd b) { a = a ^ b; return a; } - inline - int32_16_simd& operator^=(int32_16_simd &a, int32_16_simd b) + inline int32_16_simd &operator^=(int32_16_simd &a, int32_16_simd b) { a = a ^ b; return a; } - inline - int32_4_simd operator<(int32_4_simd a, int32_4_simd b) + inline int32_4_simd operator<(int32_4_simd a, int32_4_simd b) { int32_4_simd simd; simd.s = _mm_cmplt_epi32(a.s, b.s); @@ -428,70 +357,47 @@ namespace Stdlib::SIMD return simd; } - inline - int32_8_simd operator<(int32_8_simd a, int32_8_simd b) + inline int32_8_simd operator<(int32_8_simd a, int32_8_simd b) { int32_8_simd simd; - simd.s = _mm256_xor_si256( - _mm256_cmpgt_epi32(a.s, b.s), - _mm256_set1_epi32(-1) 
- ); + simd.s = _mm256_xor_si256(_mm256_cmpgt_epi32(a.s, b.s), _mm256_set1_epi32(-1)); return simd; } - inline - int32_16_simd operator<(int32_16_simd a, int32_16_simd b) + inline int32_16_simd operator<(int32_16_simd a, int32_16_simd b) { int32_16_simd simd; - simd.s = _mm512_mask_blend_epi32( - _mm512_cmplt_epi32_mask(a.s, b.s), - a.s, - b.s - ); + simd.s = _mm512_mask_blend_epi32(_mm512_cmplt_epi32_mask(a.s, b.s), a.s, b.s); return simd; } - inline - int32_4_simd operator<=(int32_4_simd a, int32_4_simd b) + inline int32_4_simd operator<=(int32_4_simd a, int32_4_simd b) { int32_4_simd simd; - simd.s = _mm_andnot_si128( - _mm_cmplt_epi32(b.s, a.s), - _mm_set1_epi32(-1) - ); + simd.s = _mm_andnot_si128(_mm_cmplt_epi32(b.s, a.s), _mm_set1_epi32(-1)); return simd; } - inline - int32_8_simd operator<=(int32_8_simd a, int32_8_simd b) + inline int32_8_simd operator<=(int32_8_simd a, int32_8_simd b) { int32_8_simd simd; - simd.s = _mm256_andnot_si256( - _mm256_cmpgt_epi32(a.s, b.s), - _mm256_set1_epi32(-1) - ); + simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi32(a.s, b.s), _mm256_set1_epi32(-1)); return simd; } - inline - int32_16_simd operator<=(int32_16_simd a, int32_16_simd b) + inline int32_16_simd operator<=(int32_16_simd a, int32_16_simd b) { int32_16_simd simd; - simd.s = _mm512_mask_blend_epi32( - _mm512_knot(_mm512_cmpgt_epi32_mask(b.s, a.s)), - b.s, - a.s - ); + simd.s = _mm512_mask_blend_epi32(_mm512_knot(_mm512_cmpgt_epi32_mask(b.s, a.s)), b.s, a.s); return simd; } - inline - int32_4_simd operator>(int32_4_simd a, int32_4_simd b) + inline int32_4_simd operator>(int32_4_simd a, int32_4_simd b) { int32_4_simd simd; simd.s = _mm_cmpgt_epi32(a.s, b.s); @@ -499,8 +405,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_8_simd operator>(int32_8_simd a, int32_8_simd b) + inline int32_8_simd operator>(int32_8_simd a, int32_8_simd b) { int32_8_simd simd; simd.s = _mm256_cmpgt_epi32(a.s, b.s); @@ -508,58 +413,39 @@ namespace Stdlib::SIMD return simd; } - inline - 
int32_16_simd operator>(int32_16_simd a, int32_16_simd b) + inline int32_16_simd operator>(int32_16_simd a, int32_16_simd b) { int32_16_simd simd; - simd.s = _mm512_mask_blend_ps( - _mm512_cmpgt_epi32_mask(a.s, b.s), - a.s, - b.s - ); + simd.s = _mm512_mask_blend_ps(_mm512_cmpgt_epi32_mask(a.s, b.s), a.s, b.s); return simd; } - inline - int32_4_simd operator>=(int32_4_simd a, int32_4_simd b) + inline int32_4_simd operator>=(int32_4_simd a, int32_4_simd b) { int32_4_simd simd; - simd.s = _mm_andnot_si128( - _mm_cmplt_epi32(a.s, b.s), - _mm_set1_epi32(-1) - ); + simd.s = _mm_andnot_si128(_mm_cmplt_epi32(a.s, b.s), _mm_set1_epi32(-1)); return simd; } - inline - int32_8_simd operator>=(int32_8_simd a, int32_8_simd b) + inline int32_8_simd operator>=(int32_8_simd a, int32_8_simd b) { int32_8_simd simd; - simd.s = _mm256_andnot_si256( - _mm256_cmpgt_epi32(b.s, a.s), - _mm256_set1_epi32(-1) - ); + simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi32(b.s, a.s), _mm256_set1_epi32(-1)); return simd; } - inline - int32_16_simd operator>=(int32_16_simd a, int32_16_simd b) + inline int32_16_simd operator>=(int32_16_simd a, int32_16_simd b) { int32_16_simd simd; - simd.s = _mm512_mask_blend_ps( - _mm512_cmpge_epi32_mask(a.s, b.s), - a.s, - b.s - ); + simd.s = _mm512_mask_blend_ps(_mm512_cmpge_epi32_mask(a.s, b.s), a.s, b.s); return simd; } - inline - int32_4_simd operator==(int32_4_simd a, int32_4_simd b) + inline int32_4_simd operator==(int32_4_simd a, int32_4_simd b) { int32_4_simd simd; simd.s = _mm_cmpeq_epi32(a.s, b.s); @@ -567,8 +453,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_8_simd operator==(int32_8_simd a, int32_8_simd b) + inline int32_8_simd operator==(int32_8_simd a, int32_8_simd b) { int32_8_simd simd; simd.s = _mm256_cmpeq_epi32(a.s, b.s); @@ -576,21 +461,15 @@ namespace Stdlib::SIMD return simd; } - inline - int32_16_simd operator==(int32_16_simd a, int32_16_simd b) + inline int32_16_simd operator==(int32_16_simd a, int32_16_simd b) { int32_16_simd 
simd; - simd.s = _mm512_mask_blend_ps( - _mm512_cmpeq_epi32_mask(a.s, b.s), - a.s, - b.s - ); + simd.s = _mm512_mask_blend_ps(_mm512_cmpeq_epi32_mask(a.s, b.s), a.s, b.s); return simd; } - inline - int32_4_simd operator!=(int32_4_simd a, int32_4_simd b) + inline int32_4_simd operator!=(int32_4_simd a, int32_4_simd b) { int32_4_simd simd; simd.s = _mm_cmpneq_epi32(a.s, b.s); @@ -598,8 +477,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_8_simd operator!=(int32_8_simd a, int32_8_simd b) + inline int32_8_simd operator!=(int32_8_simd a, int32_8_simd b) { int32_8_simd simd; simd.s = _mm256_cmp_epi32(a.s, b.s, _CMP_NEQ_OQ); @@ -607,21 +485,16 @@ namespace Stdlib::SIMD return simd; } - inline - int32_16_simd operator!=(int32_16_simd a, int32_16_simd b) + inline int32_16_simd operator!=(int32_16_simd a, int32_16_simd b) { int32_16_simd simd; - simd.s = _mm512_mask_mov_epi32( - _mm512_setzero_epi32(), - _mm512_cmp_ps_mask(a.s, b.s, _CMP_NEQ_OQ), - _mm512_set1_epi32(1.0f) - ); + simd.s = _mm512_mask_mov_epi32(_mm512_setzero_epi32(), _mm512_cmp_ps_mask(a.s, b.s, _CMP_NEQ_OQ), + _mm512_set1_epi32(1.0f)); return simd; } - inline - int32_4_simd operator&(int32_4_simd a, int32_4_simd b) + inline int32_4_simd operator&(int32_4_simd a, int32_4_simd b) { int32_4_simd simd; simd.s = _mm_and_epi32(a.s, b.s); @@ -629,8 +502,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_8_simd operator&(int32_8_simd a, int32_8_simd b) + inline int32_8_simd operator&(int32_8_simd a, int32_8_simd b) { int32_8_simd simd; simd.s = _mm256_and_epi32(a.s, b.s); @@ -638,8 +510,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_16_simd operator&(int32_16_simd a, int32_16_simd b) + inline int32_16_simd operator&(int32_16_simd a, int32_16_simd b) { int32_16_simd simd; simd.s = _mm512_and_epi32(a.s, b.s); @@ -647,8 +518,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_4_simd operator|(int32_4_simd a, int32_4_simd b) + inline int32_4_simd operator|(int32_4_simd a, 
int32_4_simd b) { int32_4_simd simd; simd.s = _mm_or_epi32(a.s, b.s); @@ -656,8 +526,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_8_simd operator|(int32_8_simd a, int32_8_simd b) + inline int32_8_simd operator|(int32_8_simd a, int32_8_simd b) { int32_8_simd simd; simd.s = _mm256_or_epi32(a.s, b.s); @@ -665,8 +534,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_16_simd operator|(int32_16_simd a, int32_16_simd b) + inline int32_16_simd operator|(int32_16_simd a, int32_16_simd b) { int32_16_simd simd; simd.s = _mm512_or_epi32(a.s, b.s); @@ -674,59 +542,52 @@ namespace Stdlib::SIMD return simd; } - inline - int32_4_simd& operator&=(int32_4_simd &a, int32_4_simd b) + inline int32_4_simd &operator&=(int32_4_simd &a, int32_4_simd b) { a = a & b; return a; } - inline - int32_8_simd& operator&=(int32_8_simd &a, int32_8_simd b) + inline int32_8_simd &operator&=(int32_8_simd &a, int32_8_simd b) { a = a & b; return a; } - inline - int32_16_simd& operator&=(int32_16_simd &a, int32_16_simd b) + inline int32_16_simd &operator&=(int32_16_simd &a, int32_16_simd b) { a = a & b; return a; } - inline - int32_4_simd& operator|=(int32_4_simd &a, int32_4_simd b) + inline int32_4_simd &operator|=(int32_4_simd &a, int32_4_simd b) { a = a | b; return a; } - inline - int32_8_simd& operator|=(int32_8_simd &a, int32_8_simd b) + inline int32_8_simd &operator|=(int32_8_simd &a, int32_8_simd b) { a = a | b; return a; } - inline - int32_16_simd& operator|=(int32_16_simd &a, int32_16_simd b) + inline int32_16_simd &operator|=(int32_16_simd &a, int32_16_simd b) { a = a | b; return a; } - inline - int32_4_simd abs(int32_4_simd a) + inline int32_4_simd abs(int32_4_simd a) { unsigned int unsigned_mask = (unsigned int) (1 << 31); - __m128 mask = _mm_set1_epi32(*(float *) &unsigned_mask); + __m128 mask = _mm_set1_epi32(*(float *) &unsigned_mask); int32_4_simd simd; simd.s = _mm_and_epi32(a.s, mask); @@ -734,11 +595,10 @@ namespace Stdlib::SIMD return simd; } - inline - 
int32_8_simd abs(int32_8_simd a) + inline int32_8_simd abs(int32_8_simd a) { unsigned int unsigned_mask = (unsigned int) (1 << 31); - __m256 mask = _mm256_set1_epi32(*(float *) &unsigned_mask); + __m256 mask = _mm256_set1_epi32(*(float *) &unsigned_mask); int32_8_simd simd; simd.s = _mm256_and_epi32(a.s, mask); @@ -746,11 +606,10 @@ namespace Stdlib::SIMD return simd; } - inline - int32_16_simd abs(int32_16_simd a) + inline int32_16_simd abs(int32_16_simd a) { unsigned int unsigned_mask = (unsigned int) (1 << 31); - __m512 mask = _mm512_set1_epi32(*(float *) &unsigned_mask); + __m512 mask = _mm512_set1_epi32(*(float *) &unsigned_mask); int32_16_simd simd; simd.s = _mm512_and_epi32(a.s, mask); @@ -758,8 +617,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_4_simd min(int32_4_simd a, int32_4_simd b) + inline int32_4_simd min(int32_4_simd a, int32_4_simd b) { int32_4_simd simd; simd.s = _mm_min_epi32(a.s, b.s); @@ -767,8 +625,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_8_simd min(int32_8_simd a, int32_8_simd b) + inline int32_8_simd min(int32_8_simd a, int32_8_simd b) { int32_8_simd simd; simd.s = _mm256_min_epi32(a.s, b.s); @@ -776,8 +633,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_16_simd min(int32_16_simd a, int32_16_simd b) + inline int32_16_simd min(int32_16_simd a, int32_16_simd b) { int32_16_simd simd; simd.s = _mm512_min_epi32(a.s, b.s); @@ -785,8 +641,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_4_simd max(int32_4_simd a, int32_4_simd b) + inline int32_4_simd max(int32_4_simd a, int32_4_simd b) { int32_4_simd simd; simd.s = _mm_max_epi32(a.s, b.s); @@ -794,8 +649,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_8_simd max(int32_8_simd a, int32_8_simd b) + inline int32_8_simd max(int32_8_simd a, int32_8_simd b) { int32_8_simd simd; simd.s = _mm256_max_epi32(a.s, b.s); @@ -803,8 +657,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_16_simd max(int32_16_simd a, int32_16_simd b) + 
inline int32_16_simd max(int32_16_simd a, int32_16_simd b) { int32_16_simd simd; simd.s = _mm512_max_epi32(a.s, b.s); @@ -812,11 +665,10 @@ namespace Stdlib::SIMD return simd; } - inline - int32_4_simd sign(int32_4_simd a) + inline int32_4_simd sign(int32_4_simd a) { unsigned int umask = (unsigned int) (1 << 31); - __m128 mask = _mm_set1_epi32(*(float *) &umask); + __m128 mask = _mm_set1_epi32(*(float *) &umask); int32_4_simd signBit; signBit.s = _mm_and_epi32(a.s, mask); @@ -829,11 +681,10 @@ namespace Stdlib::SIMD return simd; } - inline - int32_8_simd sign(int32_8_simd a) + inline int32_8_simd sign(int32_8_simd a) { unsigned int umask = (unsigned int) (1 << 31); - __m256 mask = _mm256_set1_epi32(*(float *) &umask); + __m256 mask = _mm256_set1_epi32(*(float *) &umask); int32_8_simd signBit; signBit.s = _mm256_and_epi32(a.s, mask); @@ -846,11 +697,10 @@ namespace Stdlib::SIMD return simd; } - inline - int32_16_simd sign(int32_16_simd a) + inline int32_16_simd sign(int32_16_simd a) { unsigned int umask = (unsigned int) (1 << 31); - __m512 mask = _mm512_set1_epi32(*(float *) &umask); + __m512 mask = _mm512_set1_epi32(*(float *) &umask); int32_16_simd signBit; signBit.s = _mm512_and_epi32(a.s, mask); @@ -863,8 +713,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_4_simd floor(int32_4_simd a) + inline int32_4_simd floor(int32_4_simd a) { int32_4_simd simd; simd.s = _mm_floor_epi32(a.s); @@ -872,8 +721,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_8_simd floor(int32_8_simd a) + inline int32_8_simd floor(int32_8_simd a) { int32_8_simd simd; simd.s = _mm256_floor_epi32(a.s); @@ -881,8 +729,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_16_simd floor(int32_16_simd a) + inline int32_16_simd floor(int32_16_simd a) { int32_16_simd simd; simd.s = _mm512_floor_epi32(a.s); @@ -890,8 +737,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_4_simd ceil(int32_4_simd a) + inline int32_4_simd ceil(int32_4_simd a) { int32_4_simd simd; 
simd.s = _mm_ceil_epi32(a.s); @@ -899,8 +745,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_8_simd ceil(int32_8_simd a) + inline int32_8_simd ceil(int32_8_simd a) { int32_8_simd simd; simd.s = _mm256_ceil_epi32(a.s); @@ -908,8 +753,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_16_simd ceil(int32_16_simd a) + inline int32_16_simd ceil(int32_16_simd a) { int32_16_simd simd; simd.s = _mm512_ceil_epi32(a.s); @@ -917,8 +761,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_4_simd sqrt(int32_4_simd a) + inline int32_4_simd sqrt(int32_4_simd a) { int32_4_simd simd; simd.s = _mm_sqrt_epi32(a.s); @@ -926,8 +769,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_8_simd sqrt(int32_8_simd a) + inline int32_8_simd sqrt(int32_8_simd a) { int32_8_simd simd; simd.s = _mm256_sqrt_epi32(a.s); @@ -935,8 +777,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_16_simd sqrt(int32_16_simd a) + inline int32_16_simd sqrt(int32_16_simd a) { int32_16_simd simd; simd.s = _mm512_sqrt_epi32(a.s); @@ -944,8 +785,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_4_simd sqrt_inv_approx(int32_4_simd a) + inline int32_4_simd sqrt_inv_approx(int32_4_simd a) { int32_4_simd simd; simd.s = _mm_rsqrt_epi32(a.s); @@ -953,8 +793,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_8_simd sqrt_inv_approx(int32_8_simd a) + inline int32_8_simd sqrt_inv_approx(int32_8_simd a) { int32_8_simd simd; simd.s = _mm256_rsqrt_epi32(a.s); @@ -962,8 +801,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_16_simd sqrt_inv_approx(int32_16_simd a) + inline int32_16_simd sqrt_inv_approx(int32_16_simd a) { int32_16_simd simd; simd.s = _mm512_rsqrt14_epi32(a.s); @@ -971,8 +809,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_4_simd one_over_approx(int32_4_simd a) + inline int32_4_simd one_over_approx(int32_4_simd a) { int32_4_simd simd; simd.s = _mm_rcp_epi32(a.s); @@ -980,8 +817,7 @@ namespace Stdlib::SIMD return simd; } - inline - 
int32_8_simd one_over_approx(int32_8_simd a) + inline int32_8_simd one_over_approx(int32_8_simd a) { int32_8_simd simd; simd.s = _mm256_rcp_epi32(a.s); @@ -989,8 +825,7 @@ namespace Stdlib::SIMD return simd; } - inline - int32_16_simd one_over_approx(int32_16_simd a) + inline int32_16_simd one_over_approx(int32_16_simd a) { int32_16_simd simd; simd.s = _mm512_rcp14_epi32(a.s); @@ -998,120 +833,105 @@ namespace Stdlib::SIMD return simd; } - inline - int32_4_simd clamp(int32_4_simd min_value, int32_4_simd a, int32_4_simd max_value) + inline int32_4_simd clamp(int32_4_simd min_value, int32_4_simd a, int32_4_simd max_value) { return min(max(a, min_value), max_value); } - inline - int32_8_simd clamp(int32_8_simd min_value, int32_8_simd a, int32_8_simd max_value) + inline int32_8_simd clamp(int32_8_simd min_value, int32_8_simd a, int32_8_simd max_value) { return min(max(a, min_value), max_value); } - inline - int32_16_simd clamp(int32_16_simd min_value, int32_16_simd a, int32_16_simd max_value) + inline int32_16_simd clamp(int32_16_simd min_value, int32_16_simd a, int32_16_simd max_value) { return min(max(a, min_value), max_value); } - inline - int32 which_true(int32_4_simd a) + inline int32 which_true(int32_4_simd a) { int32 which_true = _mm_movemask_epi32(a.s); return which_true; } - inline - int32 which_true(int32_8_simd a) + inline int32 which_true(int32_8_simd a) { int32 which_true = _mm256_movemask_epi32(a.s); return which_true; } - inline - int32 which_true(int32_16_simd a) + inline int32 which_true(int32_16_simd a) { int32 which_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)); return which_true; } - inline - bool any_true(int32_4_simd a) + inline bool any_true(int32_4_simd a) { bool is_any_true = _mm_movemask_epi32(a.s) > 0; return is_any_true; } - inline - bool any_true(int32_8_simd a) + inline bool any_true(int32_8_simd a) { bool is_any_true = _mm256_movemask_epi32(a.s) > 0; return is_any_true; } - inline - bool any_true(int32_16_simd a) + inline bool 
any_true(int32_16_simd a) { bool is_any_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) > 0; return is_any_true; } - inline - bool all_true(int32_4_simd a) + inline bool all_true(int32_4_simd a) { bool is_true = (_mm_movemask_epi32(a.s) == 15); return is_true; } - inline - bool all_true(int32_8_simd a) + inline bool all_true(int32_8_simd a) { bool is_true = (_mm256_movemask_epi32(a.s) == 255); return is_true; } - inline - bool all_true(int32_16_simd a) + inline bool all_true(int32_16_simd a) { bool is_true = (_mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 65535); return is_true; } - inline - bool all_false(int32_4_simd a) + inline bool all_false(int32_4_simd a) { bool is_false = (_mm_movemask_epi32(a.s) == 0); return is_false; } - inline - bool all_false(int32_8_simd a) + inline bool all_false(int32_8_simd a) { bool is_false = (_mm256_movemask_epi32(a.s) == 0); return is_false; } - inline - bool all_false(int32_16_simd a) + inline bool all_false(int32_16_simd a) { // @todo This can be optimized (requires also changes in the comparison functions return) bool is_false = (_mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 0); return is_false; } -} +} // namespace Stdlib::SIMD #endif diff --git a/Stdlib/Types.h b/Stdlib/Types.h index 2cd4679..b517bf5 100644 --- a/Stdlib/Types.h +++ b/Stdlib/Types.h @@ -10,8 +10,8 @@ #ifndef STDLIB_TYPES_H #define STDLIB_TYPES_H -#include #include +#include typedef int8_t int8; typedef int16_t int16; diff --git a/Utils/ApplicationUtils.h b/Utils/ApplicationUtils.h index 7398ba0..b01e86b 100755 --- a/Utils/ApplicationUtils.h +++ b/Utils/ApplicationUtils.h @@ -20,28 +20,26 @@ namespace Utils { namespace ApplicationUtils { - inline - char *cwd() + inline char *cwd() { char *cwd = (char *) malloc(4096 * sizeof(char)); if (cwd == NULL) { return NULL; } - getcwd(cwd, 4096 * sizeof(char)); + getcwd(cwd, 4096 * sizeof(char)); return cwd; } - inline - void chdir_application(const char *cwd, const char *arg) + inline void 
chdir_application(const char *cwd, const char *arg) { char *pos = strrchr((char *) arg, '/'); if (pos == NULL) { pos = strrchr((char *) arg, '\\'); } - char* dir = (char *) calloc((pos - arg + 1), sizeof(char)); + char *dir = (char *) calloc((pos - arg + 1), sizeof(char)); if (!dir) { return; } @@ -53,8 +51,7 @@ namespace Utils } } - inline - const char *compile_arg_line(int argc, const char **argv) + inline const char *compile_arg_line(int argc, const char **argv) { size_t max = 512; size_t length = 0; @@ -79,17 +76,17 @@ namespace Utils max += 128; } - #ifdef _WIN32 - strcat_s(arg, max * sizeof(char), argv[i]); - #else - strcat(arg, argv[i]); - #endif +#ifdef _WIN32 + strcat_s(arg, max * sizeof(char), argv[i]); +#else + strcat(arg, argv[i]); +#endif length += argv_length; } return arg; } - } -} + } // namespace ApplicationUtils +} // namespace Utils #endif \ No newline at end of file diff --git a/Utils/ArrayUtils.h b/Utils/ArrayUtils.h index b662b42..8cce79b 100755 --- a/Utils/ArrayUtils.h +++ b/Utils/ArrayUtils.h @@ -18,8 +18,7 @@ namespace Utils::ArrayUtils { - inline - const char* get_arg(const char *id, const char **argv, size_t size) + inline const char *get_arg(const char *id, const char **argv, size_t size) { if (Utils::StringUtils::is_number(id)) { return argv[atoi(id)]; @@ -34,8 +33,7 @@ namespace Utils::ArrayUtils return NULL; } - inline - bool has_arg(const char *id, const char **argv, size_t size) + inline bool has_arg(const char *id, const char **argv, size_t size) { for (size_t i = 0; i < size; ++i) { if (strcmp(id, argv[i]) == 0) { @@ -46,8 +44,7 @@ namespace Utils::ArrayUtils return false; } - inline - double array_sum_double(const double *array, size_t size) + inline double array_sum_double(const double *array, size_t size) { double sum = 0.0; for (size_t i = 0; i < size; ++i) { @@ -57,8 +54,7 @@ namespace Utils::ArrayUtils return sum; } - inline - float array_sum_float(const float *array, size_t size) + inline float array_sum_float(const float 
*array, size_t size) { float sum = 0.0; for (size_t i = 0; i < size; ++i) { @@ -68,8 +64,7 @@ namespace Utils::ArrayUtils return sum; } - inline - int64_t array_sum_int(const int64_t *array, size_t size) + inline int64_t array_sum_int(const int64_t *array, size_t size) { int64_t sum = 0; for (size_t i = 0; i < size; ++i) { @@ -79,8 +74,7 @@ namespace Utils::ArrayUtils return sum; } - inline - size_t find_in_array_string(const char *element, const char **array, size_t size) + inline size_t find_in_array_string(const char *element, const char **array, size_t size) { for (size_t i = 0; i < size; ++i) { if (strcmp(element, array[i]) == 0) { @@ -91,8 +85,7 @@ namespace Utils::ArrayUtils return -1; } - inline - size_t find_in_array_double(double element, const double *array, size_t size) + inline size_t find_in_array_double(double element, const double *array, size_t size) { for (size_t i = 0; i < size; ++i) { if (array[i] == element) { @@ -103,8 +96,7 @@ namespace Utils::ArrayUtils return -1; } - inline - size_t find_in_array_float(float element, const float *array, size_t size) + inline size_t find_in_array_float(float element, const float *array, size_t size) { for (size_t i = 0; i < size; ++i) { if (array[i] == element) { @@ -115,8 +107,7 @@ namespace Utils::ArrayUtils return -1; } - inline - size_t find_in_array_int(int64_t element, const int64_t *array, size_t size) + inline size_t find_in_array_int(int64_t element, const int64_t *array, size_t size) { for (size_t i = 0; i < size; ++i) { if (array[i] == element) { @@ -127,10 +118,9 @@ namespace Utils::ArrayUtils return -1; } - inline - double* merge_arrays_double(const double* array1, size_t size1, const double* array2, size_t size2) + inline double *merge_arrays_double(const double *array1, size_t size1, const double *array2, size_t size2) { - double* merged = (double*) malloc((size1 + size2) * sizeof(double)); + double *merged = (double *) malloc((size1 + size2) * sizeof(double)); if (merged == NULL) { return 
NULL; } @@ -146,10 +136,9 @@ namespace Utils::ArrayUtils return merged; } - inline - float* merge_arrays_float(const float* array1, size_t size1, const float* array2, size_t size2) + inline float *merge_arrays_float(const float *array1, size_t size1, const float *array2, size_t size2) { - float* merged = (float*) malloc((size1 + size2) * sizeof(float)); + float *merged = (float *) malloc((size1 + size2) * sizeof(float)); if (merged == NULL) { return NULL; } @@ -165,10 +154,9 @@ namespace Utils::ArrayUtils return merged; } - inline - int64_t* merge_arrays_int(const int64_t* array1, size_t size1, const int64_t* array2, size_t size2) + inline int64_t *merge_arrays_int(const int64_t *array1, size_t size1, const int64_t *array2, size_t size2) { - int64_t* merged = (int64_t*) malloc((size1 + size2) * sizeof(int64_t)); + int64_t *merged = (int64_t *) malloc((size1 + size2) * sizeof(int64_t)); if (merged == NULL) { return NULL; } @@ -184,26 +172,25 @@ namespace Utils::ArrayUtils return merged; } - inline - char** merge_arrays_char(const char** array1, size_t size1, const char** array2, size_t size2) + inline char **merge_arrays_char(const char **array1, size_t size1, const char **array2, size_t size2) { - char** merged = (char**) malloc((size1 + size2) * sizeof(char*)); + char **merged = (char **) malloc((size1 + size2) * sizeof(char *)); if (merged == NULL) { return NULL; } for (size_t i = 0; i < size1; ++i) { - merged[i] = (char*) malloc((strlen(array1[i]) + 1) * sizeof(char)); + merged[i] = (char *) malloc((strlen(array1[i]) + 1) * sizeof(char)); strcpy(merged[i], array1[i]); } for (size_t i = 0; i < size2; ++i) { - merged[i] = (char*) malloc((strlen(array2[i]) + 1) * sizeof(char)); + merged[i] = (char *) malloc((strlen(array2[i]) + 1) * sizeof(char)); strcpy(merged[i], array2[i]); } return merged; } -} +} // namespace Utils::ArrayUtils #endif \ No newline at end of file diff --git a/Utils/ColorUtils.h b/Utils/ColorUtils.h index c5a7b19..eae73fc 100644 --- 
a/Utils/ColorUtils.h +++ b/Utils/ColorUtils.h @@ -21,10 +21,9 @@ namespace Utils::ColorUtils unsigned char b = 0; } RGB; - inline - RGB* int_to_rgb(int rgb) + inline RGB *int_to_rgb(int rgb) { - RGB* result = (RGB*) malloc(1 * sizeof(RGB)); + RGB *result = (RGB *) malloc(1 * sizeof(RGB)); result->r = rgb & 255; result->g = (rgb >> 8) & 255; @@ -33,25 +32,23 @@ namespace Utils::ColorUtils return result; } - inline - int rgb_to_int(const RGB* rgb) + inline int rgb_to_int(const RGB *rgb) { - int i = (255 & rgb->r) << 16; - i += (255 & rgb->g) << 8; - i += (255 & rgb->b); + int i = (255 & rgb->r) << 16; + i += (255 & rgb->g) << 8; + i += (255 & rgb->b); return i; } - inline - int rgb_to_int(char r, char g, char b) + inline int rgb_to_int(char r, char g, char b) { - int i = (255 & r) << 16; - i += (255 & g) << 8; - i += (255 & b); + int i = (255 & r) << 16; + i += (255 & g) << 8; + i += (255 & b); return i; } -} +} // namespace Utils::ColorUtils #endif \ No newline at end of file diff --git a/Utils/FileUtils.h b/Utils/FileUtils.h index 4a0674f..c0a6249 100755 --- a/Utils/FileUtils.h +++ b/Utils/FileUtils.h @@ -15,8 +15,8 @@ #include #ifdef _WIN32 - #include #include + #include #else #include #endif @@ -25,65 +25,62 @@ namespace Utils::FileUtils { - inline - bool file_exists (const char *filename) + inline bool file_exists(const char *filename) { - #ifdef _WIN32 - return access(filename, 0) == 0; - #else - struct stat buffer; - return stat(filename, &buffer) == 0; - #endif +#ifdef _WIN32 + return access(filename, 0) == 0; +#else + struct stat buffer; + return stat(filename, &buffer) == 0; +#endif } - inline - time_t last_modification (const char *filename) + inline time_t last_modification(const char *filename) { - #ifdef _WIN32 - FILETIME modtime; - HANDLE h; +#ifdef _WIN32 + FILETIME modtime; + HANDLE h; - size_t nameLength = strlen(filename); + size_t nameLength = strlen(filename); - wchar_t *wtext = (wchar_t *) calloc(nameLength, sizeof(char)); - mbstowcs_s(NULL, 
wtext, nameLength, filename, nameLength); - LPWSTR pFilename = wtext; - - if (!pFilename) { - free(wtext); - - return 0; - } - - h = CreateFileW(pFilename, GENERIC_READ | FILE_WRITE_ATTRIBUTES, 0, NULL, OPEN_EXISTING, 0, NULL); + wchar_t *wtext = (wchar_t *) calloc(nameLength, sizeof(char)); + mbstowcs_s(NULL, wtext, nameLength, filename, nameLength); + LPWSTR pFilename = wtext; + if (!pFilename) { free(wtext); - free(pFilename); - if (h == INVALID_HANDLE_VALUE) { - return (time_t) 0; - } + return 0; + } - if (GetFileTime(h, NULL, NULL, &modtime) == 0) { - return (time_t) 0; - } + h = CreateFileW(pFilename, GENERIC_READ | FILE_WRITE_ATTRIBUTES, 0, NULL, OPEN_EXISTING, 0, NULL); - unsigned long long seconds = ((unsigned long long) (modtime.dwHighDateTime)) << 32; - seconds |= modtime.dwLowDateTime; + free(wtext); + free(pFilename); - return (seconds - 116444736000000000) / 10000000; - #else - struct stat buffer; - stat(filename, &buffer); + if (h == INVALID_HANDLE_VALUE) { + return (time_t) 0; + } - return (time_t) buffer.st_mtim.tv_sec; - #endif + if (GetFileTime(h, NULL, NULL, &modtime) == 0) { + return (time_t) 0; + } + + unsigned long long seconds = ((unsigned long long) (modtime.dwHighDateTime)) << 32; + seconds |= modtime.dwLowDateTime; + + return (seconds - 116444736000000000) / 10000000; +#else + struct stat buffer; + stat(filename, &buffer); + + return (time_t) buffer.st_mtim.tv_sec; +#endif return (time_t) 0; } - inline - const char* file_extension (const char *filename) + inline const char *file_extension(const char *filename) { char *dot = strrchr((char *) filename, '.'); @@ -99,7 +96,7 @@ namespace Utils::FileUtils int size = 0; // doesn't include null termination (same as strlen) } file_body; - file_body read_file (const char *filename) + file_body read_file(const char *filename) { file_body file = {0}; @@ -126,6 +123,6 @@ namespace Utils::FileUtils return file; } -} +} // namespace Utils::FileUtils #endif diff --git a/Utils/Intrinsics.h 
b/Utils/Intrinsics.h index bffa214..a3316b6 100644 --- a/Utils/Intrinsics.h +++ b/Utils/Intrinsics.h @@ -15,25 +15,37 @@ /* MMX -Introduce eight 64 bit registers (MM0-MM7) and instructions to work with eight signed/unsigned bytes, four signed/unsigned words, two signed/unsigned dwords. +Introduce eight 64 bit registers (MM0-MM7) and instructions to work with eight signed/unsigned bytes, four +signed/unsigned words, two signed/unsigned dwords. 3DNow! -Add support for single precision floating point operand to MMX. Few operation supported, for example addition, subtraction, multiplication. +Add support for single precision floating point operand to MMX. Few operation supported, for example addition, +subtraction, multiplication. SSE -Introduce eight/sixteen 128 bit registers (XMM0-XMM7/15) and instruction to work with four single precision floating point operands. Add integer operations on MMX registers too. (The MMX-integer part of SSE is sometimes called MMXEXT, and was implemented on a few non-Intel CPUs without xmm registers and the floating point part of SSE.) +Introduce eight/sixteen 128 bit registers (XMM0-XMM7/15) and instruction to work with four single precision floating +point operands. Add integer operations on MMX registers too. (The MMX-integer part of SSE is sometimes called MMXEXT, +and was implemented on a few non-Intel CPUs without xmm registers and the floating point part of SSE.) SSE2 -Introduces instruction to work with 2 double precision floating point operands, and with packed byte/word/dword/qword integers in 128-bit xmm registers. +Introduces instruction to work with 2 double precision floating point operands, and with packed byte/word/dword/qword +integers in 128-bit xmm registers. SSE3 -Add a few varied instructions (mostly floating point), including a special kind of unaligned load (lddqu) that was better on Pentium 4, synchronization instruction, horizontal add/sub. 
+Add a few varied instructions (mostly floating point), including a special kind of unaligned load (lddqu) that was +better on Pentium 4, synchronization instruction, horizontal add/sub. SSSE3 -Again a varied set of instructions, mostly integer. The first shuffle that takes its control operand from a register instead of hard-coded (pshufb). More horizontal processing, shuffle, packing/unpacking, mul+add on bytes, and some specialized integer add/mul stuff. +Again a varied set of instructions, mostly integer. The first shuffle that takes its control operand from a register +instead of hard-coded (pshufb). More horizontal processing, shuffle, packing/unpacking, mul+add on bytes, and some +specialized integer add/mul stuff. SSE4 (SSE4.1, SSE4.2) -Add a lot of instructions: Filling in a lot of the gaps by providing min and max and other operations for all integer data types (especially 32-bit integer had been lacking), where previously integer min was only available for unsigned bytes and signed 16-bit. Also scaling, FP rounding, blending, linear algebra operation, text processing, comparisons. Also a non temporal load for reading video memory, or copying it back to main memory. (Previously only NT stores were available.) +Add a lot of instructions: Filling in a lot of the gaps by providing min and max and other operations for all integer +data types (especially 32-bit integer had been lacking), where previously integer min was only available for unsigned +bytes and signed 16-bit. Also scaling, FP rounding, blending, linear algebra operation, text processing, comparisons. +Also a non temporal load for reading video memory, or copying it back to main memory. (Previously only NT stores were +available.) AESNI Add support for accelerating AES symmetric encryption/decryption. @@ -48,7 +60,9 @@ AVX2 Add support for integer data types. AVX512F -Add eight/thirty-two 512 bit registers (ZMM0-ZMM7/31) and eight 64-bit mask register (k0-k7). 
Promote most previous instruction to 512 bit wide. Optional parts of AVX512 add instruction for exponentials & reciprocals (AVX512ER), scatter/gather prefetching (AVX512PF), scatter conflict detection (AVX512CD), compress, expand. +Add eight/thirty-two 512 bit registers (ZMM0-ZMM7/31) and eight 64-bit mask register (k0-k7). Promote most previous +instruction to 512 bit wide. Optional parts of AVX512 add instruction for exponentials & reciprocals (AVX512ER), +scatter/gather prefetching (AVX512PF), scatter conflict detection (AVX512CD), compress, expand. IMCI (Intel Xeon Phi) Early development of AVX512 for the first-gen Intel Xeon Phi (Knight's Corner) coprocessor. diff --git a/Utils/Rng/StringUtils.h b/Utils/Rng/StringUtils.h index d29fe6e..e1f4730 100755 --- a/Utils/Rng/StringUtils.h +++ b/Utils/Rng/StringUtils.h @@ -16,14 +16,13 @@ namespace Utils::Rng::StringUtils { - inline - char* generate_string( - size_t min = 10, size_t max = 10, - const char *charset = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", int charsetLength = 62 - ) { + inline char *generate_string(size_t min = 10, size_t max = 10, + const char *charset = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", + int charsetLength = 62) + { srand(time(0)); - size_t length = (rand() % (max - min + 1)) + min; + size_t length = (rand() % (max - min + 1)) + min; char *randomString = (char *) malloc(length + 1); for (size_t i = 0; i < length; ++i) { @@ -34,6 +33,6 @@ namespace Utils::Rng::StringUtils return randomString; } -} +} // namespace Utils::Rng::StringUtils #endif \ No newline at end of file diff --git a/Utils/StringUtils.h b/Utils/StringUtils.h index a2bc894..435cb7f 100755 --- a/Utils/StringUtils.h +++ b/Utils/StringUtils.h @@ -10,12 +10,12 @@ #ifndef UTILS_STRING_UTILS_H #define UTILS_STRING_UTILS_H +#include #include #include -#include -#include "MathUtils.h" #include "ArraySort.h" +#include "MathUtils.h" namespace Utils::StringUtils { @@ -62,8 +62,7 @@ namespace 
Utils::StringUtils return result; } - inline - bool is_number(const char *s) + inline bool is_number(const char *s) { while (*s != '\0') { if (!isdigit(*s)) { @@ -76,8 +75,7 @@ namespace Utils::StringUtils return true; } - inline - size_t str_count(const char *str, const char *substr) + inline size_t str_count(const char *str, const char *substr) { size_t l1 = strlen(str); size_t l2 = strlen(substr); @@ -94,13 +92,12 @@ namespace Utils::StringUtils return count; } - inline - char *strsep(const char **sp, const char *sep) + inline char *strsep(const char **sp, const char *sep) { char *p, *s; if (sp == NULL || *sp == NULL || **sp == '\0') { - return(NULL); + return (NULL); } s = (char *) *sp; @@ -116,8 +113,7 @@ namespace Utils::StringUtils } // @todo Implement delim as const char* (also allow \0 length) - inline - int str_split(char **list, const char *str, const char delim) + inline int str_split(char **list, const char *str, const char delim) { size_t splits = str_count(str, (char *) &delim) + 1; list = (char **) malloc(splits * sizeof(char *)); @@ -136,8 +132,7 @@ namespace Utils::StringUtils } // @todo Implement delim as const char* (also allow \0 length) - inline - char* str_combine(const char **str, size_t size, const char delim) + inline char *str_combine(const char **str, size_t size, const char delim) { if (size < 1) { return NULL; @@ -199,8 +194,8 @@ namespace Utils::StringUtils for (i = 1; i <= fromSize; ++i) { for (j = 1; j <= toSize; ++j) { dm[i * fromSize + j] = strcmp(from[i - 1], to[j - 1]) == 0 - ? dm[(i - 1) * fromSize + (j - 1)] + 1 - : oms_max(dm[(i - 1) * fromSize + j], dm[i * fromSize + (j - 1)]); + ? 
dm[(i - 1) * fromSize + (j - 1)] + 1 + : oms_max(dm[(i - 1) * fromSize + j], dm[i * fromSize + (j - 1)]); } } @@ -218,11 +213,11 @@ namespace Utils::StringUtils continue; } - #ifdef _WIN32 - strcpy_s(diffValues[diffIndex], (strlen(to[j - 1]) + 1) * sizeof(char), to[j - 1]); - #else - strcpy(diffValues[diffIndex], to[j - 1]); - #endif +#ifdef _WIN32 + strcpy_s(diffValues[diffIndex], (strlen(to[j - 1]) + 1) * sizeof(char), to[j - 1]); +#else + strcpy(diffValues[diffIndex], to[j - 1]); +#endif diffMasks[diffIndex] = 1; @@ -240,11 +235,11 @@ namespace Utils::StringUtils continue; } - #ifdef _WIN32 - strcpy_s(diffValues[diffIndex], (strlen(from[i - 1]) + 1) * sizeof(char), from[i - 1]); - #else - strcpy(diffValues[diffIndex], from[i - 1]); - #endif +#ifdef _WIN32 + strcpy_s(diffValues[diffIndex], (strlen(from[i - 1]) + 1) * sizeof(char), from[i - 1]); +#else + strcpy(diffValues[diffIndex], from[i - 1]); +#endif diffMasks[diffIndex] = -1; @@ -261,11 +256,11 @@ namespace Utils::StringUtils continue; } - #ifdef _WIN32 - strcpy_s(diffValues[diffIndex], (strlen(from[i - 1]) + 1) * sizeof(char), from[i - 1]); - #else - strcpy(diffValues[diffIndex], from[i - 1]); - #endif +#ifdef _WIN32 + strcpy_s(diffValues[diffIndex], (strlen(from[i - 1]) + 1) * sizeof(char), from[i - 1]); +#else + strcpy(diffValues[diffIndex], from[i - 1]); +#endif /* Handled with calloc diffMasks[diffIndex] = 0; @@ -303,7 +298,8 @@ namespace Utils::StringUtils return text_diff{diffValues, diffMasks, diffIndex}; } - char *strtok(char *str, const char *delim, char **saveptr) { + char *strtok(char *str, const char *delim, char **saveptr) + { if (str == NULL) { str = *saveptr; } @@ -313,17 +309,17 @@ namespace Utils::StringUtils } char *token_start = str; - char *token_end = strpbrk(token_start, delim); + char *token_end = strpbrk(token_start, delim); if (token_end == NULL) { *saveptr = NULL; } else { *token_end = '\0'; - *saveptr = token_end + 1; + *saveptr = token_end + 1; } return token_start; } -} +} // 
namespace Utils::StringUtils #endif \ No newline at end of file diff --git a/Utils/TestUtils.h b/Utils/TestUtils.h index 870739e..44998ef 100755 --- a/Utils/TestUtils.h +++ b/Utils/TestUtils.h @@ -10,67 +10,97 @@ #ifndef UTILS_TEST_UTILS_H #define UTILS_TEST_UTILS_H -#include #include "MathUtils.h" +#include -#define ASSERT_EQUALS(a, b, t1, t2) ({\ - if ((a) == (b)) { \ - printf("."); \ - } else { \ - printf("\033[31m[F]\033[0m"); \ +#define ASSERT_EQUALS(a, b, t1, t2) \ + ({ \ + if ((a) == (b)) { \ + printf("."); \ + } else { \ + printf("\033[31m[F]\033[0m"); \ printf("\n\n%s - %i: ", __FILE__, __LINE__); \ - printf((t1), (a)); printf(" != "); printf((t2), (b)); printf("\n"); \ - return 0; } \ + printf((t1), (a)); \ + printf(" != "); \ + printf((t2), (b)); \ + printf("\n"); \ + return 0; \ + } \ }) -#define ASSERT_NOT_EQUALS(a, b, t1, t2) ({\ - if ((a) != (b)) { \ - printf("."); \ - } else { \ - printf("\033[31m[F]\033[0m"); \ +#define ASSERT_NOT_EQUALS(a, b, t1, t2) \ + ({ \ + if ((a) != (b)) { \ + printf("."); \ + } else { \ + printf("\033[31m[F]\033[0m"); \ printf("\n\n%s - %i: ", __FILE__, __LINE__); \ - printf((t1), (a)); printf(" == "); printf((t2), (b)); printf("\n"); \ - return 0; } \ + printf((t1), (a)); \ + printf(" == "); \ + printf((t2), (b)); \ + printf("\n"); \ + return 0; \ + } \ }) -#define ASSERT_EQUALS_WITH_DELTA(a, b, delta, t1, t2) ({\ - if (oms_abs((a) - (b)) <= (delta)) { \ - printf("."); \ - } else { \ - printf("\033[31m[F]\033[0m"); \ +#define ASSERT_EQUALS_WITH_DELTA(a, b, delta, t1, t2) \ + ({ \ + if (oms_abs((a) - (b)) <= (delta)) { \ + printf("."); \ + } else { \ + printf("\033[31m[F]\033[0m"); \ printf("\n\n%s - %i: ", __FILE__, __LINE__); \ - printf((t1), (a)); printf(" != "); printf((t2), (b)); printf("\n"); \ - return 0; } \ + printf((t1), (a)); \ + printf(" != "); \ + printf((t2), (b)); \ + printf("\n"); \ + return 0; \ + } \ }) -#define ASSERT_CONTAINS(a, b) ({\ - if (strstr((a), (b)) != NULL) { \ - printf("."); \ - } else { \ - 
printf("\033[31m[F]\033[0m"); \ +#define ASSERT_CONTAINS(a, b) \ + ({ \ + if (strstr((a), (b)) != NULL) { \ + printf("."); \ + } else { \ + printf("\033[31m[F]\033[0m"); \ printf("\n\n%s - %i: ", __FILE__, __LINE__); \ - printf("%s", (a)); printf(" !contains "); printf("%s", (b)); printf("\n"); \ - return 0; } \ + printf("%s", (a)); \ + printf(" !contains "); \ + printf("%s", (b)); \ + printf("\n"); \ + return 0; \ + } \ }) -#define ASSERT_TRUE(a) ({\ - if ((a) == true) { \ - printf("."); \ - } else { \ - printf("\033[31m[F]\033[0m"); \ +#define ASSERT_TRUE(a) \ + ({ \ + if ((a) == true) { \ + printf("."); \ + } else { \ + printf("\033[31m[F]\033[0m"); \ printf("\n\n%s - %i: ", __FILE__, __LINE__); \ - printf("%d", (a)); printf(" != "); printf("1"); printf("\n"); \ - return 0; } \ + printf("%d", (a)); \ + printf(" != "); \ + printf("1"); \ + printf("\n"); \ + return 0; \ + } \ }) -#define ASSERT_FALSE(a) ({\ - if ((a) == false) { \ - printf("."); \ - } else { \ - printf("\033[31m[F]\033[0m"); \ +#define ASSERT_FALSE(a) \ + ({ \ + if ((a) == false) { \ + printf("."); \ + } else { \ + printf("\033[31m[F]\033[0m"); \ printf("\n\n%s - %i: ", __FILE__, __LINE__); \ - printf("%d", (a)); printf(" != "); printf("1"); printf("\n"); \ - return 0; } \ + printf("%d", (a)); \ + printf(" != "); \ + printf("1"); \ + printf("\n"); \ + return 0; \ + } \ }) #endif diff --git a/tests/Image/ImageUtilsTest.cpp b/tests/Image/ImageUtilsTest.cpp index 9bdf5c0..25ad2ce 100755 --- a/tests/Image/ImageUtilsTest.cpp +++ b/tests/Image/ImageUtilsTest.cpp @@ -9,10 +9,10 @@ */ #include -#include "../../Utils/TestUtils.h" #include "../../Image/ImageUtils.h" +#include "../../Utils/TestUtils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { printf("ImageUtils:\n\n"); diff --git a/tests/Stdlib/IntrinsicsTest.cpp b/tests/Stdlib/IntrinsicsTest.cpp index 35b5d40..77e4538 100644 --- a/tests/Stdlib/IntrinsicsTest.cpp +++ b/tests/Stdlib/IntrinsicsTest.cpp @@ -7,22 +7,22 @@ * @version 
1.0.0 * @link https://jingga.app */ +#include #include #include -#include -#include "../../Utils/TestUtils.h" #include "../../Stdlib/Intrinsics.h" +#include "../../Utils/TestUtils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { printf("Intrinsics:\n\n"); ASSERT_EQUALS_WITH_DELTA(Stdlib::Intrinsics::sqrt(1.234f), sqrt(1.234f), 0.01, "%f", "%f"); ASSERT_EQUALS_WITH_DELTA((int) Stdlib::Intrinsics::round_to_int(1.234f), (int) roundf(1.234f), 0.01, "%d", "%d"); - //ASSERT_EQUALS_WITH_DELTA(Stdlib::Intrinsics::floor(1.234f), 1.0f, 0.01, "%f", "%f"); - //ASSERT_EQUALS_WITH_DELTA(Stdlib::Intrinsics::round(1.234f), roundf(1.234f), 0.01, "%f", "%f"); + // ASSERT_EQUALS_WITH_DELTA(Stdlib::Intrinsics::floor(1.234f), 1.0f, 0.01, "%f", "%f"); + // ASSERT_EQUALS_WITH_DELTA(Stdlib::Intrinsics::round(1.234f), roundf(1.234f), 0.01, "%f", "%f"); ASSERT_NOT_EQUALS(Stdlib::Intrinsics::hash(123456), Stdlib::Intrinsics::hash(654321), "%d", "%d"); diff --git a/tests/Stdlib/SIMD/SIMD_F32Test.cpp b/tests/Stdlib/SIMD/SIMD_F32Test.cpp index 15b4e6f..b00b213 100644 --- a/tests/Stdlib/SIMD/SIMD_F32Test.cpp +++ b/tests/Stdlib/SIMD/SIMD_F32Test.cpp @@ -10,30 +10,30 @@ #include #include -#include "../../../Utils/TestUtils.h" #include "../../../Stdlib/SIMD/SIMD_F32.h" #include "../../../Stdlib/SIMD/SIMD_Helper.h" +#include "../../../Utils/TestUtils.h" -float* a_array_4 = (float *) aligned_alloc(32, 4 * sizeof(float)); -float* b_array_4 = (float *) aligned_alloc(32, 4 * sizeof(float)); -float* expected_array_4 = (float *) aligned_alloc(32, 4 * sizeof(float)); -float* result_array_4 = (float *) aligned_alloc(32, 4 * sizeof(float)); +float *a_array_4 = (float *) aligned_alloc(32, 4 * sizeof(float)); +float *b_array_4 = (float *) aligned_alloc(32, 4 * sizeof(float)); +float *expected_array_4 = (float *) aligned_alloc(32, 4 * sizeof(float)); +float *result_array_4 = (float *) aligned_alloc(32, 4 * sizeof(float)); -float* a_array_8 = (float *) aligned_alloc(32, 8 * sizeof(float)); 
-float* b_array_8 = (float *) aligned_alloc(32, 8 * sizeof(float)); -float* expected_array_8 = (float *) aligned_alloc(32, 8 * sizeof(float)); -float* result_array_8 = (float *) aligned_alloc(32, 8 * sizeof(float)); +float *a_array_8 = (float *) aligned_alloc(32, 8 * sizeof(float)); +float *b_array_8 = (float *) aligned_alloc(32, 8 * sizeof(float)); +float *expected_array_8 = (float *) aligned_alloc(32, 8 * sizeof(float)); +float *result_array_8 = (float *) aligned_alloc(32, 8 * sizeof(float)); -float* a_array_16 = (float *) aligned_alloc(32, 16 * sizeof(float)); -float* b_array_16 = (float *) aligned_alloc(32, 16 * sizeof(float)); -float* expected_array_16 = (float *) aligned_alloc(32, 16 * sizeof(float)); -float* result_array_16 = (float *) aligned_alloc(32, 16 * sizeof(float)); +float *a_array_16 = (float *) aligned_alloc(32, 16 * sizeof(float)); +float *b_array_16 = (float *) aligned_alloc(32, 16 * sizeof(float)); +float *expected_array_16 = (float *) aligned_alloc(32, 16 * sizeof(float)); +float *result_array_16 = (float *) aligned_alloc(32, 16 * sizeof(float)); int test_operator_plus(); int test_operator_minus(); int test_operator_mul(); -int main(int argc, char** argv) +int main(int argc, char **argv) { printf("SIMD_F32:\n"); @@ -56,10 +56,19 @@ int test_operator_plus() return 0; } - a_array_4[0] = 0.0f; a_array_4[1] = 1.0f; a_array_4[2] = 2.0f; a_array_4[3] = 3.0f; - b_array_4[0] = 0.0f; b_array_4[1] = 1.0f; b_array_4[2] = 2.0f; b_array_4[3] = 3.0f; + a_array_4[0] = 0.0f; + a_array_4[1] = 1.0f; + a_array_4[2] = 2.0f; + a_array_4[3] = 3.0f; + b_array_4[0] = 0.0f; + b_array_4[1] = 1.0f; + b_array_4[2] = 2.0f; + b_array_4[3] = 3.0f; - expected_array_4[0] = 0.0f; expected_array_4[1] = 2.0f; expected_array_4[2] = 4.0f; expected_array_4[3] = 6.0f; + expected_array_4[0] = 0.0f; + expected_array_4[1] = 2.0f; + expected_array_4[2] = 4.0f; + expected_array_4[3] = 6.0f; Stdlib::SIMD::f32_4_simd expected_simd_4 = Stdlib::SIMD::load_f32_4_simd(expected_array_4); 
Stdlib::SIMD::f32_4_simd a_simd_4 = Stdlib::SIMD::load_f32_4_simd(a_array_4); @@ -82,14 +91,32 @@ int test_operator_plus() return 0; } - a_array_8[0] = 0.0f; a_array_8[1] = 1.0f; a_array_8[2] = 2.0f; a_array_8[3] = 3.0f; - a_array_8[4] = 0.0f; a_array_8[5] = 1.0f; a_array_8[6] = 2.0f; a_array_8[7] = 3.0f; + a_array_8[0] = 0.0f; + a_array_8[1] = 1.0f; + a_array_8[2] = 2.0f; + a_array_8[3] = 3.0f; + a_array_8[4] = 0.0f; + a_array_8[5] = 1.0f; + a_array_8[6] = 2.0f; + a_array_8[7] = 3.0f; - b_array_8[0] = 0.0f; b_array_8[1] = 1.0f; b_array_8[2] = 2.0f; b_array_8[3] = 3.0f; - b_array_8[4] = 0.0f; b_array_8[5] = 1.0f; b_array_8[6] = 2.0f; b_array_8[7] = 3.0f; + b_array_8[0] = 0.0f; + b_array_8[1] = 1.0f; + b_array_8[2] = 2.0f; + b_array_8[3] = 3.0f; + b_array_8[4] = 0.0f; + b_array_8[5] = 1.0f; + b_array_8[6] = 2.0f; + b_array_8[7] = 3.0f; - expected_array_8[0] = 0.0f; expected_array_8[1] = 2.0f; expected_array_8[2] = 4.0f; expected_array_8[3] = 6.0f; - expected_array_8[4] = 0.0f; expected_array_8[5] = 2.0f; expected_array_8[6] = 4.0f; expected_array_8[7] = 6.0f; + expected_array_8[0] = 0.0f; + expected_array_8[1] = 2.0f; + expected_array_8[2] = 4.0f; + expected_array_8[3] = 6.0f; + expected_array_8[4] = 0.0f; + expected_array_8[5] = 2.0f; + expected_array_8[6] = 4.0f; + expected_array_8[7] = 6.0f; Stdlib::SIMD::f32_8_simd expected_simd_8 = Stdlib::SIMD::load_f32_8_simd(expected_array_8); Stdlib::SIMD::f32_8_simd a_simd_8 = Stdlib::SIMD::load_f32_8_simd(a_array_8); @@ -116,20 +143,56 @@ int test_operator_plus() return 0; } - a_array_16[0] = 0.0f; a_array_16[1] = 1.0f; a_array_16[2] = 2.0f; a_array_16[3] = 3.0f; - a_array_16[4] = 0.0f; a_array_16[5] = 1.0f; a_array_16[6] = 2.0f; a_array_16[7] = 3.0f; - a_array_16[8] = 0.0f; a_array_16[9] = 1.0f; a_array_16[10] = 2.0f; a_array_16[11] = 3.0f; - a_array_16[12] = 0.0f; a_array_16[13] = 1.0f; a_array_16[14] = 2.0f; a_array_16[15] = 3.0f; + a_array_16[0] = 0.0f; + a_array_16[1] = 1.0f; + a_array_16[2] = 2.0f; + a_array_16[3] = 
3.0f; + a_array_16[4] = 0.0f; + a_array_16[5] = 1.0f; + a_array_16[6] = 2.0f; + a_array_16[7] = 3.0f; + a_array_16[8] = 0.0f; + a_array_16[9] = 1.0f; + a_array_16[10] = 2.0f; + a_array_16[11] = 3.0f; + a_array_16[12] = 0.0f; + a_array_16[13] = 1.0f; + a_array_16[14] = 2.0f; + a_array_16[15] = 3.0f; - b_array_16[0] = 0.0f; b_array_16[1] = 1.0f; b_array_16[2] = 2.0f; b_array_16[3] = 3.0f; - b_array_16[4] = 0.0f; b_array_16[5] = 1.0f; b_array_16[6] = 2.0f; b_array_16[7] = 3.0f; - b_array_16[8] = 0.0f; b_array_16[9] = 1.0f; b_array_16[10] = 2.0f; b_array_16[11] = 3.0f; - b_array_16[12] = 0.0f; b_array_16[13] = 1.0f; b_array_16[14] = 2.0f; b_array_16[15] = 3.0f; + b_array_16[0] = 0.0f; + b_array_16[1] = 1.0f; + b_array_16[2] = 2.0f; + b_array_16[3] = 3.0f; + b_array_16[4] = 0.0f; + b_array_16[5] = 1.0f; + b_array_16[6] = 2.0f; + b_array_16[7] = 3.0f; + b_array_16[8] = 0.0f; + b_array_16[9] = 1.0f; + b_array_16[10] = 2.0f; + b_array_16[11] = 3.0f; + b_array_16[12] = 0.0f; + b_array_16[13] = 1.0f; + b_array_16[14] = 2.0f; + b_array_16[15] = 3.0f; - expected_array_16[0] = 0.0f; expected_array_16[1] = 2.0f; expected_array_16[2] = 4.0f; expected_array_16[3] = 6.0f; - expected_array_16[4] = 0.0f; expected_array_16[5] = 2.0f; expected_array_16[6] = 4.0f; expected_array_16[7] = 6.0f; - expected_array_16[8] = 0.0f; expected_array_16[9] = 2.0f; expected_array_16[10] = 4.0f; expected_array_16[11] = 6.0f; - expected_array_16[12] = 0.0f; expected_array_16[13] = 2.0f; expected_array_16[14] = 4.0f; expected_array_16[15] = 6.0f; + expected_array_16[0] = 0.0f; + expected_array_16[1] = 2.0f; + expected_array_16[2] = 4.0f; + expected_array_16[3] = 6.0f; + expected_array_16[4] = 0.0f; + expected_array_16[5] = 2.0f; + expected_array_16[6] = 4.0f; + expected_array_16[7] = 6.0f; + expected_array_16[8] = 0.0f; + expected_array_16[9] = 2.0f; + expected_array_16[10] = 4.0f; + expected_array_16[11] = 6.0f; + expected_array_16[12] = 0.0f; + expected_array_16[13] = 2.0f; + expected_array_16[14] = 
4.0f; + expected_array_16[15] = 6.0f; Stdlib::SIMD::f32_16_simd expected_simd_16 = Stdlib::SIMD::load_f32_16_simd(expected_array_16); Stdlib::SIMD::f32_16_simd a_simd_16 = Stdlib::SIMD::load_f32_16_simd(a_array_16); @@ -169,10 +232,19 @@ int test_operator_minus() return 0; } - a_array_4[0] = 0.0f; a_array_4[1] = 1.0f; a_array_4[2] = 2.0f; a_array_4[3] = 3.0f; - b_array_4[0] = 1.0f; b_array_4[1] = 1.0f; b_array_4[2] = 1.0f; b_array_4[3] = 1.0f; + a_array_4[0] = 0.0f; + a_array_4[1] = 1.0f; + a_array_4[2] = 2.0f; + a_array_4[3] = 3.0f; + b_array_4[0] = 1.0f; + b_array_4[1] = 1.0f; + b_array_4[2] = 1.0f; + b_array_4[3] = 1.0f; - expected_array_4[0] = -1.0f; expected_array_4[1] = 0.0f; expected_array_4[2] = 1.0f; expected_array_4[3] = 2.0f; + expected_array_4[0] = -1.0f; + expected_array_4[1] = 0.0f; + expected_array_4[2] = 1.0f; + expected_array_4[3] = 2.0f; Stdlib::SIMD::f32_4_simd expected_simd_4 = Stdlib::SIMD::load_f32_4_simd(expected_array_4); Stdlib::SIMD::f32_4_simd a_simd_4 = Stdlib::SIMD::load_f32_4_simd(a_array_4); @@ -195,14 +267,32 @@ int test_operator_minus() return 0; } - a_array_8[0] = 0.0f; a_array_8[1] = 1.0f; a_array_8[2] = 2.0f; a_array_8[3] = 3.0f; - a_array_8[4] = 0.0f; a_array_8[5] = 1.0f; a_array_8[6] = 2.0f; a_array_8[7] = 3.0f; + a_array_8[0] = 0.0f; + a_array_8[1] = 1.0f; + a_array_8[2] = 2.0f; + a_array_8[3] = 3.0f; + a_array_8[4] = 0.0f; + a_array_8[5] = 1.0f; + a_array_8[6] = 2.0f; + a_array_8[7] = 3.0f; - b_array_8[0] = 1.0f; b_array_8[1] = 1.0f; b_array_8[2] = 1.0f; b_array_8[3] = 1.0f; - b_array_8[4] = 1.0f; b_array_8[5] = 1.0f; b_array_8[6] = 1.0f; b_array_8[7] = 1.0f; + b_array_8[0] = 1.0f; + b_array_8[1] = 1.0f; + b_array_8[2] = 1.0f; + b_array_8[3] = 1.0f; + b_array_8[4] = 1.0f; + b_array_8[5] = 1.0f; + b_array_8[6] = 1.0f; + b_array_8[7] = 1.0f; - expected_array_8[0] = -1.0f; expected_array_8[1] = 0.0f; expected_array_8[2] = 1.0f; expected_array_8[3] = 2.0f; - expected_array_8[4] = -1.0f; expected_array_8[5] = 0.0f; 
expected_array_8[6] = 1.0f; expected_array_8[7] = 2.0f; + expected_array_8[0] = -1.0f; + expected_array_8[1] = 0.0f; + expected_array_8[2] = 1.0f; + expected_array_8[3] = 2.0f; + expected_array_8[4] = -1.0f; + expected_array_8[5] = 0.0f; + expected_array_8[6] = 1.0f; + expected_array_8[7] = 2.0f; Stdlib::SIMD::f32_8_simd expected_simd_8 = Stdlib::SIMD::load_f32_8_simd(expected_array_8); Stdlib::SIMD::f32_8_simd a_simd_8 = Stdlib::SIMD::load_f32_8_simd(a_array_8); @@ -229,20 +319,56 @@ int test_operator_minus() return 0; } - a_array_16[0] = 0.0f; a_array_16[1] = 1.0f; a_array_16[2] = 2.0f; a_array_16[3] = 3.0f; - a_array_16[4] = 0.0f; a_array_16[5] = 1.0f; a_array_16[6] = 2.0f; a_array_16[7] = 3.0f; - a_array_16[8] = 0.0f; a_array_16[9] = 1.0f; a_array_16[10] = 2.0f; a_array_16[11] = 3.0f; - a_array_16[12] = 0.0f; a_array_16[13] = 1.0f; a_array_16[14] = 2.0f; a_array_16[15] = 3.0f; + a_array_16[0] = 0.0f; + a_array_16[1] = 1.0f; + a_array_16[2] = 2.0f; + a_array_16[3] = 3.0f; + a_array_16[4] = 0.0f; + a_array_16[5] = 1.0f; + a_array_16[6] = 2.0f; + a_array_16[7] = 3.0f; + a_array_16[8] = 0.0f; + a_array_16[9] = 1.0f; + a_array_16[10] = 2.0f; + a_array_16[11] = 3.0f; + a_array_16[12] = 0.0f; + a_array_16[13] = 1.0f; + a_array_16[14] = 2.0f; + a_array_16[15] = 3.0f; - b_array_16[0] = 1.0f; b_array_16[1] = 1.0f; b_array_16[2] = 1.0f; b_array_16[3] = 1.0f; - b_array_16[4] = 1.0f; b_array_16[5] = 1.0f; b_array_16[6] = 1.0f; b_array_16[7] = 1.0f; - b_array_16[8] = 1.0f; b_array_16[9] = 1.0f; b_array_16[10] = 1.0f; b_array_16[11] = 1.0f; - b_array_16[12] = 1.0f; b_array_16[13] = 1.0f; b_array_16[14] = 1.0f; b_array_16[15] = 1.0f; + b_array_16[0] = 1.0f; + b_array_16[1] = 1.0f; + b_array_16[2] = 1.0f; + b_array_16[3] = 1.0f; + b_array_16[4] = 1.0f; + b_array_16[5] = 1.0f; + b_array_16[6] = 1.0f; + b_array_16[7] = 1.0f; + b_array_16[8] = 1.0f; + b_array_16[9] = 1.0f; + b_array_16[10] = 1.0f; + b_array_16[11] = 1.0f; + b_array_16[12] = 1.0f; + b_array_16[13] = 1.0f; + 
b_array_16[14] = 1.0f; + b_array_16[15] = 1.0f; - expected_array_16[0] = -1.0f; expected_array_16[1] = 0.0f; expected_array_16[2] = 1.0f; expected_array_16[3] = 2.0f; - expected_array_16[4] = -1.0f; expected_array_16[5] = 0.0f; expected_array_16[6] = 1.0f; expected_array_16[7] = 2.0f; - expected_array_16[8] = -1.0f; expected_array_16[9] = 0.0f; expected_array_16[10] = 1.0f; expected_array_16[11] = 2.0f; - expected_array_16[12] = -1.0f; expected_array_16[13] = 0.0f; expected_array_16[14] = 1.0f; expected_array_16[15] = 2.0f; + expected_array_16[0] = -1.0f; + expected_array_16[1] = 0.0f; + expected_array_16[2] = 1.0f; + expected_array_16[3] = 2.0f; + expected_array_16[4] = -1.0f; + expected_array_16[5] = 0.0f; + expected_array_16[6] = 1.0f; + expected_array_16[7] = 2.0f; + expected_array_16[8] = -1.0f; + expected_array_16[9] = 0.0f; + expected_array_16[10] = 1.0f; + expected_array_16[11] = 2.0f; + expected_array_16[12] = -1.0f; + expected_array_16[13] = 0.0f; + expected_array_16[14] = 1.0f; + expected_array_16[15] = 2.0f; Stdlib::SIMD::f32_16_simd expected_simd_16 = Stdlib::SIMD::load_f32_16_simd(expected_array_16); Stdlib::SIMD::f32_16_simd a_simd_16 = Stdlib::SIMD::load_f32_16_simd(a_array_16); @@ -282,10 +408,19 @@ int test_operator_mul() return 0; } - a_array_4[0] = 0.0f; a_array_4[1] = 1.0f; a_array_4[2] = 2.0f; a_array_4[3] = 3.0f; - b_array_4[0] = 0.0f; b_array_4[1] = 1.0f; b_array_4[2] = 2.0f; b_array_4[3] = 3.0f; + a_array_4[0] = 0.0f; + a_array_4[1] = 1.0f; + a_array_4[2] = 2.0f; + a_array_4[3] = 3.0f; + b_array_4[0] = 0.0f; + b_array_4[1] = 1.0f; + b_array_4[2] = 2.0f; + b_array_4[3] = 3.0f; - expected_array_4[0] = 0.0f; expected_array_4[1] = 1.0f; expected_array_4[2] = 4.0f; expected_array_4[3] = 9.0f; + expected_array_4[0] = 0.0f; + expected_array_4[1] = 1.0f; + expected_array_4[2] = 4.0f; + expected_array_4[3] = 9.0f; Stdlib::SIMD::f32_4_simd expected_simd_4 = Stdlib::SIMD::load_f32_4_simd(expected_array_4); Stdlib::SIMD::f32_4_simd a_simd_4 = 
Stdlib::SIMD::load_f32_4_simd(a_array_4); @@ -308,14 +443,32 @@ int test_operator_mul() return 0; } - a_array_8[0] = 0.0f; a_array_8[1] = 1.0f; a_array_8[2] = 2.0f; a_array_8[3] = 3.0f; - a_array_8[4] = 0.0f; a_array_8[5] = 1.0f; a_array_8[6] = 2.0f; a_array_8[7] = 3.0f; + a_array_8[0] = 0.0f; + a_array_8[1] = 1.0f; + a_array_8[2] = 2.0f; + a_array_8[3] = 3.0f; + a_array_8[4] = 0.0f; + a_array_8[5] = 1.0f; + a_array_8[6] = 2.0f; + a_array_8[7] = 3.0f; - b_array_8[0] = 0.0f; b_array_8[1] = 1.0f; b_array_8[2] = 2.0f; b_array_8[3] = 3.0f; - b_array_8[4] = 0.0f; b_array_8[5] = 1.0f; b_array_8[6] = 2.0f; b_array_8[7] = 3.0f; + b_array_8[0] = 0.0f; + b_array_8[1] = 1.0f; + b_array_8[2] = 2.0f; + b_array_8[3] = 3.0f; + b_array_8[4] = 0.0f; + b_array_8[5] = 1.0f; + b_array_8[6] = 2.0f; + b_array_8[7] = 3.0f; - expected_array_8[0] = 0.0f; expected_array_8[1] = 1.0f; expected_array_8[2] = 4.0f; expected_array_8[3] = 9.0f; - expected_array_8[4] = 0.0f; expected_array_8[5] = 1.0f; expected_array_8[6] = 4.0f; expected_array_8[7] = 9.0f; + expected_array_8[0] = 0.0f; + expected_array_8[1] = 1.0f; + expected_array_8[2] = 4.0f; + expected_array_8[3] = 9.0f; + expected_array_8[4] = 0.0f; + expected_array_8[5] = 1.0f; + expected_array_8[6] = 4.0f; + expected_array_8[7] = 9.0f; Stdlib::SIMD::f32_8_simd expected_simd_8 = Stdlib::SIMD::load_f32_8_simd(expected_array_8); Stdlib::SIMD::f32_8_simd a_simd_8 = Stdlib::SIMD::load_f32_8_simd(a_array_8); @@ -342,20 +495,56 @@ int test_operator_mul() return 0; } - a_array_16[0] = 0.0f; a_array_16[1] = 1.0f; a_array_16[2] = 2.0f; a_array_16[3] = 3.0f; - a_array_16[4] = 0.0f; a_array_16[5] = 1.0f; a_array_16[6] = 2.0f; a_array_16[7] = 3.0f; - a_array_16[8] = 0.0f; a_array_16[9] = 1.0f; a_array_16[10] = 2.0f; a_array_16[11] = 3.0f; - a_array_16[12] = 0.0f; a_array_16[13] = 1.0f; a_array_16[14] = 2.0f; a_array_16[15] = 3.0f; + a_array_16[0] = 0.0f; + a_array_16[1] = 1.0f; + a_array_16[2] = 2.0f; + a_array_16[3] = 3.0f; + a_array_16[4] = 0.0f; + 
a_array_16[5] = 1.0f; + a_array_16[6] = 2.0f; + a_array_16[7] = 3.0f; + a_array_16[8] = 0.0f; + a_array_16[9] = 1.0f; + a_array_16[10] = 2.0f; + a_array_16[11] = 3.0f; + a_array_16[12] = 0.0f; + a_array_16[13] = 1.0f; + a_array_16[14] = 2.0f; + a_array_16[15] = 3.0f; - b_array_16[0] = 0.0f; b_array_16[1] = 1.0f; b_array_16[2] = 2.0f; b_array_16[3] = 3.0f; - b_array_16[4] = 0.0f; b_array_16[5] = 1.0f; b_array_16[6] = 2.0f; b_array_16[7] = 3.0f; - b_array_16[8] = 0.0f; b_array_16[9] = 1.0f; b_array_16[10] = 2.0f; b_array_16[11] = 3.0f; - b_array_16[12] = 0.0f; b_array_16[13] = 1.0f; b_array_16[14] = 2.0f; b_array_16[15] = 3.0f; + b_array_16[0] = 0.0f; + b_array_16[1] = 1.0f; + b_array_16[2] = 2.0f; + b_array_16[3] = 3.0f; + b_array_16[4] = 0.0f; + b_array_16[5] = 1.0f; + b_array_16[6] = 2.0f; + b_array_16[7] = 3.0f; + b_array_16[8] = 0.0f; + b_array_16[9] = 1.0f; + b_array_16[10] = 2.0f; + b_array_16[11] = 3.0f; + b_array_16[12] = 0.0f; + b_array_16[13] = 1.0f; + b_array_16[14] = 2.0f; + b_array_16[15] = 3.0f; - expected_array_16[0] = 0.0f; expected_array_16[1] = 1.0f; expected_array_16[2] = 4.0f; expected_array_16[3] = 9.0f; - expected_array_16[4] = 0.0f; expected_array_16[5] = 1.0f; expected_array_16[6] = 4.0f; expected_array_16[7] = 9.0f; - expected_array_16[8] = 0.0f; expected_array_16[9] = 1.0f; expected_array_16[10] = 4.0f; expected_array_16[11] = 9.0f; - expected_array_16[12] = 0.0f; expected_array_16[13] = 1.0f; expected_array_16[14] = 4.0f; expected_array_16[15] = 9.0f; + expected_array_16[0] = 0.0f; + expected_array_16[1] = 1.0f; + expected_array_16[2] = 4.0f; + expected_array_16[3] = 9.0f; + expected_array_16[4] = 0.0f; + expected_array_16[5] = 1.0f; + expected_array_16[6] = 4.0f; + expected_array_16[7] = 9.0f; + expected_array_16[8] = 0.0f; + expected_array_16[9] = 1.0f; + expected_array_16[10] = 4.0f; + expected_array_16[11] = 9.0f; + expected_array_16[12] = 0.0f; + expected_array_16[13] = 1.0f; + expected_array_16[14] = 4.0f; + expected_array_16[15] = 
9.0f; Stdlib::SIMD::f32_16_simd expected_simd_16 = Stdlib::SIMD::load_f32_16_simd(expected_array_16); Stdlib::SIMD::f32_16_simd a_simd_16 = Stdlib::SIMD::load_f32_16_simd(a_array_16); diff --git a/tests/Stdlib/SIMD/SIMD_HelperTest.cpp b/tests/Stdlib/SIMD/SIMD_HelperTest.cpp index dbb6347..28ab416 100644 --- a/tests/Stdlib/SIMD/SIMD_HelperTest.cpp +++ b/tests/Stdlib/SIMD/SIMD_HelperTest.cpp @@ -10,11 +10,10 @@ #include #include -#include "../../../Utils/TestUtils.h" #include "../../../Stdlib/SIMD/SIMD_Helper.h" +#include "../../../Utils/TestUtils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { printf("SIMD_Helper:\n"); diff --git a/tests/Threads/ThreadPoolTest.cpp b/tests/Threads/ThreadPoolTest.cpp index 592cb16..f2baa70 100755 --- a/tests/Threads/ThreadPoolTest.cpp +++ b/tests/Threads/ThreadPoolTest.cpp @@ -7,8 +7,8 @@ * @version 1.0.0 * @link https://jingga.app */ -#include #include +#include #include "../../Threads/Thread.h" #include "../../Utils/TestUtils.h" @@ -21,8 +21,8 @@ void worker(void *arg) { Threads::Job *job = (Threads::Job *) arg; - int *val = (int *) job->arg; - *val += 100; + int *val = (int *) job->arg; + *val += 100; if (*val % 2) { sleep(1); @@ -31,24 +31,24 @@ void worker(void *arg) job->state = 1; } -int main(int argc, char** argv) +int main(int argc, char **argv) { printf("Threads:\n\n"); printf("ThreadPool:\n"); int i; Threads::ThreadPool *pool = Threads::pool_create(num_threads); - int *vals = (int *) calloc(num_items, sizeof(int)); - Threads::Job **works = (Threads::Job **) calloc(num_items, sizeof(Threads::Job)); + int *vals = (int *) calloc(num_items, sizeof(int)); + Threads::Job **works = (Threads::Job **) calloc(num_items, sizeof(Threads::Job)); for (i = 0; i < num_items; ++i) { - vals[i] = i; + vals[i] = i; works[i] = Threads::pool_add_work(pool, worker, vals + i); } // @bug wait is not working as expected // I thought wait works similarly to what the do/while construct below does - //Threads::pool_wait(pool); 
+ // Threads::pool_wait(pool); bool finished = false; do { diff --git a/tests/Utils/WebUtilsTest.cpp b/tests/Utils/WebUtilsTest.cpp index 93507a4..c5c8c10 100755 --- a/tests/Utils/WebUtilsTest.cpp +++ b/tests/Utils/WebUtilsTest.cpp @@ -9,10 +9,10 @@ */ #include -#include "../../Utils/WebUtils.h" #include "../../Utils/TestUtils.h" +#include "../../Utils/WebUtils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { printf("Utils:\n\n"); printf("WebUtils:\n"); From b9c3c8c130af1d3ea7c49501960a774e64b78b14 Mon Sep 17 00:00:00 2001 From: Dennis Eichhorn Date: Wed, 24 Apr 2024 23:37:26 +0000 Subject: [PATCH 7/7] prepare master merge --- LICENSE.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/LICENSE.txt b/LICENSE.txt index 4ba0161..02b1ff5 100755 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -46,3 +46,7 @@ Unless required by applicable law or agreed to in writing, Licensor provides the 7. Limitation of Liability In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. + +8. Future Changes + +The Licensor may change the License for future versions as he sees fit.