Linux bug fixes

This commit is contained in:
Dennis Eichhorn 2025-03-22 01:10:19 +00:00
parent 4f1cbd98f9
commit 39fbcf4300
485 changed files with 5108 additions and 4628 deletions

0
.github/workflows/codeql.yml vendored Normal file → Executable file

0
.github/workflows/msvc.yml vendored Normal file → Executable file

0
.gitignore vendored Normal file → Executable file

0
Guidelines.md Normal file → Executable file

0
README.md Normal file → Executable file

0
animation/Animation.h Normal file → Executable file

0
animation/AnimationEaseType.h Normal file → Executable file

0
architecture/CpuInfo.cpp Normal file → Executable file

0
architecture/CpuInfo.h Normal file → Executable file

0
architecture/Intrinsics.h Normal file → Executable file

0
architecture/arm/CpuInfo.cpp Normal file → Executable file

4
architecture/arm/Intrinsics.h Normal file → Executable file

@ -11,6 +11,7 @@
#include <arm_sve.h>
#include <arm_acle.h>
#include <arm_neon.h>
#include "../../stdlib/Types.h"
#include "../../compiler/CompilerUtils.h"
@ -50,4 +51,7 @@
#define intrin_timestamp_counter() __builtin_readcyclecounter()
#endif
// a * b + c
#define intrin_fmadd(a, b, c) vgetq_lane_f32(vmlaq_f32(vdupq_n_f32(c), vdupq_n_f32(a), vdupq_n_f32(b)), 0)
#endif
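On both architectures the macro evaluates a scalar fused multiply-add by splatting the operands into vector registers and extracting lane 0. A minimal usage sketch (illustrative only, not part of the diff):

f32 r = intrin_fmadd(2.0f, 3.0f, 1.0f); // 2 * 3 + 1 == 7.0f on NEON and x86 alike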

0
architecture/arm/neon/utils/Utils.h Normal file → Executable file

0
architecture/arm/sve/utils/Utils.h Normal file → Executable file

0
architecture/x86/CpuInfo.cpp Normal file → Executable file

3
architecture/x86/Intrinsics.h Normal file → Executable file

@ -58,6 +58,9 @@
#define intrin_prefetch_l2(mem) _mm_prefetch((const char *) (mem), _MM_HINT_T1)
#define intrin_prefetch_l3(mem) _mm_prefetch((const char *) (mem), _MM_HINT_T2)
// a * b + c
#define intrin_fmadd(a, b, c) _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a), _mm_set_ss(b), _mm_set_ss(c)))
inline
uint64 intrin_timestamp_counter() noexcept {
_mm_mfence();

1373
architecture/x86/simd/SIMD_F32.h Normal file → Executable file

File diff suppressed because it is too large.


@ -0,0 +1,426 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_STDLIB_SIMD_F32_AVX2_H
#define COMS_STDLIB_SIMD_F32_AVX2_H
#include <immintrin.h>
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"
#include "SIMD_SVML_AVX2.h"
struct f32_8 {
union {
#if ARM
svfloat32_t s;
#else
__m256 s;
#endif
f32 v[8];
};
};
inline f32_8 load_f32_8(const f32* mem)
{
f32_8 simd;
simd.s = _mm256_load_ps(mem);
return simd;
}
inline f32_8 init_f32_8(const f32* mem)
{
f32_8 simd;
simd.s = _mm256_set_ps(mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7]);
return simd;
}
inline void unload_f32_8(f32_8 a, f32 *array) { _mm256_store_ps(array, a.s); }
inline f32_8 init_zero_f32_8()
{
f32_8 simd;
simd.s = _mm256_setzero_ps();
return simd;
}
inline f32_8 init_value_f32_8(f32 value)
{
f32_8 simd;
simd.s = _mm256_set1_ps(value);
return simd;
}
inline f32_8 init_values_f32_8(
f32 a, f32 b, f32 c, f32 d,
f32 e, f32 f, f32 g, f32 h
)
{
f32_8 simd;
simd.s = _mm256_set_ps(a, b, c, d, e, f, g, h);
return simd;
}
inline f32_8 operator+(f32_8 a, f32_8 b)
{
f32_8 simd;
simd.s = _mm256_add_ps(a.s, b.s);
return simd;
}
inline f32_8 operator-(f32_8 a, f32_8 b)
{
f32_8 simd;
simd.s = _mm256_sub_ps(a.s, b.s);
return simd;
}
inline f32_8 operator-(f32_8 a) { return init_zero_f32_8() - a; }
inline f32_8 operator*(f32_8 a, f32_8 b)
{
f32_8 simd;
simd.s = _mm256_mul_ps(a.s, b.s);
return simd;
}
inline f32_8 operator/(f32_8 a, f32_8 b)
{
f32_8 simd;
simd.s = _mm256_div_ps(a.s, b.s);
return simd;
}
inline f32_8 operator^(f32_8 a, f32_8 b)
{
f32_8 simd;
simd.s = _mm256_xor_ps(a.s, b.s);
return simd;
}
inline f32_8 &operator-=(f32_8 &a, f32_8 b)
{
a = a - b;
return a;
}
inline f32_8 &operator+=(f32_8 &a, f32_8 b)
{
a = a + b;
return a;
}
inline f32_8 &operator*=(f32_8 &a, f32_8 b)
{
a = a * b;
return a;
}
inline f32_8 &operator/=(f32_8 &a, f32_8 b)
{
a = a / b;
return a;
}
inline f32_8 &operator^=(f32_8 &a, f32_8 b)
{
a = a ^ b;
return a;
}
inline f32_8 operator<(f32_8 a, f32_8 b)
{
f32_8 simd;
simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_LT_OQ);
return simd;
}
inline f32_8 operator<=(f32_8 a, f32_8 b)
{
f32_8 simd;
simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_LE_OQ);
return simd;
}
inline f32_8 operator>(f32_8 a, f32_8 b)
{
f32_8 simd;
simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_GT_OQ);
return simd;
}
inline f32_8 operator>=(f32_8 a, f32_8 b)
{
f32_8 simd;
simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_GE_OQ);
return simd;
}
inline f32_8 operator==(f32_8 a, f32_8 b)
{
f32_8 simd;
simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_EQ_OQ);
return simd;
}
inline f32_8 operator!=(f32_8 a, f32_8 b)
{
f32_8 simd;
simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_NEQ_OQ);
return simd;
}
inline f32_8 operator&(f32_8 a, f32_8 b)
{
f32_8 simd;
simd.s = _mm256_and_ps(a.s, b.s);
return simd;
}
inline f32_8 operator|(f32_8 a, f32_8 b)
{
f32_8 simd;
simd.s = _mm256_or_ps(a.s, b.s);
return simd;
}
inline f32_8 &operator&=(f32_8 &a, f32_8 b)
{
a = a & b;
return a;
}
inline f32_8 &operator|=(f32_8 &a, f32_8 b)
{
a = a | b;
return a;
}
inline f32_8 abs(f32_8 a)
{
uint32 sign_mask = (uint32) (1U << 31);
__m256 mask = _mm256_set1_ps(*(f32 *) &sign_mask);
f32_8 simd;
// clear the sign bit: andnot computes (~mask) & a
simd.s = _mm256_andnot_ps(mask, a.s);
return simd;
}
inline f32_8 simd_min(f32_8 a, f32_8 b)
{
f32_8 simd;
simd.s = _mm256_min_ps(a.s, b.s);
return simd;
}
inline f32_8 simd_max(f32_8 a, f32_8 b)
{
f32_8 simd;
simd.s = _mm256_max_ps(a.s, b.s);
return simd;
}
inline f32_8 sign(f32_8 a)
{
uint32 umask = (uint32) (1U << 31);
__m256 mask = _mm256_set1_ps(*(f32 *) &umask);
f32_8 signBit;
signBit.s = _mm256_and_ps(a.s, mask);
f32_8 b;
b.s = _mm256_set1_ps(1.0f);
f32_8 simd = b | signBit;
return simd;
}
inline f32_8 floor(f32_8 a)
{
f32_8 simd;
simd.s = _mm256_floor_ps(a.s);
return simd;
}
inline f32_8 ceil(f32_8 a)
{
f32_8 simd;
simd.s = _mm256_ceil_ps(a.s);
return simd;
}
inline f32_8 sqrt(f32_8 a)
{
f32_8 simd;
simd.s = _mm256_sqrt_ps(a.s);
return simd;
}
inline f32_8 sqrt_inv_approx(f32_8 a)
{
f32_8 simd;
simd.s = _mm256_rsqrt_ps(a.s);
return simd;
}
inline f32_8 one_over_approx(f32_8 a)
{
f32_8 simd;
simd.s = _mm256_rcp_ps(a.s);
return simd;
}
inline f32_8 clamp(f32_8 min_value, f32_8 a, f32_8 max_value)
{
return simd_min(simd_max(a, min_value), max_value);
}
inline int32 which_true(f32_8 a)
{
int32 which_true = _mm256_movemask_ps(a.s);
return which_true;
}
inline bool any_true(f32_8 a)
{
bool is_any_true = _mm256_movemask_ps(a.s) > 0;
return is_any_true;
}
inline bool all_true(f32_8 a)
{
bool is_true = _mm256_movemask_ps(a.s) == 255;
return is_true;
}
inline bool all_false(f32_8 a)
{
bool is_false = _mm256_movemask_ps(a.s) == 0;
return is_false;
}
inline
void simd_cmp_le(const __m256* a, f32 b, bool* result, int32 size)
{
__m256 b_8 = _mm256_set1_ps(b);
for (int32 i = 0; i < size; ++i) {
int32 mask = _mm256_movemask_ps(_mm256_cmp_ps(a[i], b_8, _CMP_LE_OQ));
for (int32 j = 0; j < 8; ++j) {
result[i * 8 + j] = (mask & (1 << j)) != 0;
}
}
}
inline
f32_8 simd_sin(f32_8 a)
{
f32_8 simd;
simd.s = _mm256_sin_ps(a.s);
return simd;
}
inline
f32_8 simd_cos(f32_8 a)
{
f32_8 simd;
simd.s = _mm256_cos_ps(a.s);
return simd;
}
inline
f32_8 simd_asin(f32_8 a)
{
f32_8 simd;
simd.s = _mm256_asin_ps(a.s);
return simd;
}
inline
f32_8 simd_acos(f32_8 a)
{
f32_8 simd;
simd.s = _mm256_acos_ps(a.s);
return simd;
}
inline
void simd_div(const f32* a, f32 b, __m256* result, int32 size)
{
int32 i = 0;
int32 j = 0;
// @todo this is how all the functions should be implemented that take in basic types and output basic types
__m256 a_8;
__m256 b_8 = _mm256_set1_ps(b);
__m256 result_8;
for (; i <= size - 8; i += 8) {
a_8 = _mm256_load_ps(a);
result_8 = _mm256_div_ps(a_8, b_8);
result[j] = result_8;
a += 8;
++j;
}
int32 diff = size - i;
if (diff > 0) {
// a was already advanced inside the main loop, so index from 0
alignas(32) f32 temp[8] = {};
for (int32 k = 0; k < diff; k++) {
temp[k] = a[k] / b;
}
result[j] = _mm256_load_ps(temp);
}
}
#endif
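Taken together, the f32_8 wrapper exposes AVX2 through ordinary operators while the union keeps individual lanes readable through v[8]. A brief usage sketch, assuming 32-byte-aligned buffers as _mm256_load_ps/_mm256_store_ps require:

alignas(32) f32 in[8] = {1, 2, 3, 4, 5, 6, 7, 8};
alignas(32) f32 out[8];
f32_8 x = load_f32_8(in);
f32_8 half = init_value_f32_8(0.5f);
unload_f32_8(x * half + half, out); // per lane: in[i] * 0.5f + 0.5f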


@ -0,0 +1,385 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_STDLIB_SIMD_F32_AVX512_H
#define COMS_STDLIB_SIMD_F32_AVX512_H
#include <immintrin.h>
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"
#include "SIMD_SVML_AVX512.h"
struct f32_16 {
union {
#if ARM
svfloat32_t s;
#else
__m512 s;
#endif
f32 v[16];
};
};
inline f32_16 load_f32_16(const f32* mem)
{
f32_16 simd;
simd.s = _mm512_load_ps(mem);
return simd;
}
inline f32_16 init_f32_16(const f32* mem)
{
f32_16 simd;
simd.s = _mm512_set_ps(mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7], mem[8], mem[9], mem[10],
mem[11], mem[12], mem[13], mem[14], mem[15]);
return simd;
}
inline void unload_f32_16(f32_16 a, f32 *array) { _mm512_store_ps(array, a.s); }
inline f32_16 init_zero_f32_16()
{
f32_16 simd;
simd.s = _mm512_setzero_ps();
return simd;
}
inline f32_16 init_value_f32_16(f32 value)
{
f32_16 simd;
simd.s = _mm512_set1_ps(value);
return simd;
}
inline f32_16 init_values_f32_16(
f32 a, f32 b, f32 c, f32 d,
f32 e, f32 f, f32 g, f32 h,
f32 i, f32 j, f32 k, f32 l,
f32 m, f32 n, f32 o, f32 p
)
{
f32_16 simd;
simd.s = _mm512_set_ps(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
return simd;
}
inline f32_16 operator+(f32_16 a, f32_16 b)
{
f32_16 simd;
simd.s = _mm512_add_ps(a.s, b.s);
return simd;
}
inline f32_16 operator-(f32_16 a, f32_16 b)
{
f32_16 simd;
simd.s = _mm512_sub_ps(a.s, b.s);
return simd;
}
inline f32_16 operator-(f32_16 a) { return init_zero_f32_16() - a; }
inline f32_16 operator*(f32_16 a, f32_16 b)
{
f32_16 simd;
simd.s = _mm512_mul_ps(a.s, b.s);
return simd;
}
inline f32_16 operator/(f32_16 a, f32_16 b)
{
f32_16 simd;
simd.s = _mm512_div_ps(a.s, b.s);
return simd;
}
inline f32_16 operator^(f32_16 a, f32_16 b)
{
f32_16 simd;
simd.s = _mm512_xor_ps(a.s, b.s);
return simd;
}
inline f32_16 &operator-=(f32_16 &a, f32_16 b)
{
a = a - b;
return a;
}
inline f32_16 &operator+=(f32_16 &a, f32_16 b)
{
a = a + b;
return a;
}
inline f32_16 &operator*=(f32_16 &a, f32_16 b)
{
a = a * b;
return a;
}
inline f32_16 &operator/=(f32_16 &a, f32_16 b)
{
a = a / b;
return a;
}
inline f32_16 &operator^=(f32_16 &a, f32_16 b)
{
a = a ^ b;
return a;
}
inline f32_16 operator<(f32_16 a, f32_16 b)
{
f32_16 simd;
simd.s = _mm512_mask_blend_ps(_mm512_cmplt_ps_mask(a.s, b.s), a.s, b.s);
return simd;
}
inline f32_16 operator<=(f32_16 a, f32_16 b)
{
f32_16 simd;
simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_LE_OQ), a.s, b.s);
return simd;
}
inline f32_16 operator>(f32_16 a, f32_16 b)
{
f32_16 simd;
simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_GT_OQ), a.s, b.s);
return simd;
}
inline f32_16 operator>=(f32_16 a, f32_16 b)
{
f32_16 simd;
simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_GE_OQ), a.s, b.s);
return simd;
}
inline f32_16 operator==(f32_16 a, f32_16 b)
{
f32_16 simd;
simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_EQ_OQ), a.s, b.s);
return simd;
}
inline f32_16 operator!=(f32_16 a, f32_16 b)
{
f32_16 simd;
simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_NEQ_OQ), a.s, b.s);
return simd;
}
inline f32_16 operator&(f32_16 a, f32_16 b)
{
f32_16 simd;
simd.s = _mm512_and_ps(a.s, b.s);
return simd;
}
inline f32_16 operator|(f32_16 a, f32_16 b)
{
f32_16 simd;
simd.s = _mm512_or_ps(a.s, b.s);
return simd;
}
inline f32_16 &operator&=(f32_16 &a, f32_16 b)
{
a = a & b;
return a;
}
inline f32_16 &operator|=(f32_16 &a, f32_16 b)
{
a = a | b;
return a;
}
inline f32_16 abs(f32_16 a)
{
f32_16 simd;
simd.s = _mm512_abs_ps(a.s);
return simd;
}
inline f32_16 simd_min(f32_16 a, f32_16 b)
{
f32_16 simd;
simd.s = _mm512_min_ps(a.s, b.s);
return simd;
}
inline f32_16 simd_max(f32_16 a, f32_16 b)
{
f32_16 simd;
simd.s = _mm512_max_ps(a.s, b.s);
return simd;
}
inline f32_16 sign(f32_16 a)
{
uint32 umask = (uint32) (1U << 31);
__m512 mask = _mm512_set1_ps(*(f32 *) &umask);
f32_16 signBit;
signBit.s = _mm512_and_ps(a.s, mask);
f32_16 b;
b.s = _mm512_set1_ps(1.0f);
f32_16 simd = b | signBit;
return simd;
}
inline f32_16 floor(f32_16 a)
{
f32_16 simd;
simd.s = _mm512_floor_ps(a.s);
return simd;
}
inline f32_16 ceil(f32_16 a)
{
f32_16 simd;
simd.s = _mm512_ceil_ps(a.s);
return simd;
}
inline f32_16 sqrt(f32_16 a)
{
f32_16 simd;
simd.s = _mm512_sqrt_ps(a.s);
return simd;
}
inline f32_16 sqrt_inv_approx(f32_16 a)
{
f32_16 simd;
simd.s = _mm512_rsqrt14_ps(a.s);
return simd;
}
inline f32_16 one_over_approx(f32_16 a)
{
f32_16 simd;
simd.s = _mm512_rcp14_ps(a.s);
return simd;
}
inline f32_16 clamp(f32_16 min_value, f32_16 a, f32_16 max_value)
{
return simd_min(simd_max(a, min_value), max_value);
}
inline int32 which_true(f32_16 a)
{
int32 which_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s));
return which_true;
}
inline bool any_true(f32_16 a)
{
bool is_any_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) > 0;
return is_any_true;
}
inline bool all_true(f32_16 a)
{
bool is_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 65535;
return is_true;
}
inline bool all_false(f32_16 a)
{
// @todo This can be optimized (also requires changing what the comparison functions return)
bool is_false = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 0;
return is_false;
}
inline
f32_16 simd_sin(f32_16 a)
{
f32_16 simd;
simd.s = _mm512_sin_ps(a.s);
return simd;
}
inline
f32_16 simd_cos(f32_16 a)
{
f32_16 simd;
simd.s = _mm512_cos_ps(a.s);
return simd;
}
inline
f32_16 simd_asin(f32_16 a)
{
f32_16 simd;
simd.s = _mm512_asin_ps(a.s);
return simd;
}
inline
f32_16 simd_acos(f32_16 a)
{
f32_16 simd;
simd.s = _mm512_acos_ps(a.s);
return simd;
}
// @todo implement more trigonometry functions
#endif
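Note that AVX-512 comparisons natively yield __mmask16 values; the operators above blend the inputs back into a vector, and which_true/any_true/all_true then re-derive a bitmask from the lane sign bits, which is only reliable when those sign bits happen to encode the predicate. Where just the predicate is needed, reading the mask directly is simpler and exact; an illustrative sketch given two f32_16 values a and b (not part of this header):

__mmask16 m = _mm512_cmp_ps_mask(a.s, b.s, _CMP_LT_OQ);
bool any = (m != 0);
bool all = (m == 0xFFFF);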


@ -0,0 +1,381 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_STDLIB_SIMD_F32_SSE_H
#define COMS_STDLIB_SIMD_F32_SSE_H
#include <immintrin.h>
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"
#include "SIMD_SVML_SSE.h"
struct f32_4 {
union {
#if ARM
svfloat32_t s;
#else
__m128 s;
#endif
f32 v[4];
};
};
inline f32_4 load_f32_4(const f32* mem)
{
f32_4 simd;
simd.s = _mm_load_ps(mem);
return simd;
}
inline f32_4 init_f32_4(const f32* mem)
{
f32_4 simd;
simd.s = _mm_set_ps(mem[0], mem[1], mem[2], mem[3]);
return simd;
}
inline void unload_f32_4(f32_4 a, f32 *array) { _mm_store_ps(array, a.s); }
inline f32_4 init_zero_f32_4()
{
f32_4 simd;
simd.s = _mm_setzero_ps();
return simd;
}
inline f32_4 init_value_f32_4(f32 value)
{
f32_4 simd;
simd.s = _mm_set1_ps(value);
return simd;
}
inline f32_4 init_values_f32_4(f32 a, f32 b, f32 c, f32 d)
{
f32_4 simd;
simd.s = _mm_set_ps(a, b, c, d);
return simd;
}
inline f32_4 operator+(f32_4 a, f32_4 b)
{
f32_4 simd;
simd.s = _mm_add_ps(a.s, b.s);
return simd;
}
inline f32_4 operator-(f32_4 a, f32_4 b)
{
f32_4 simd;
simd.s = _mm_sub_ps(a.s, b.s);
return simd;
}
inline f32_4 operator-(f32_4 a) { return init_zero_f32_4() - a; }
inline f32_4 operator*(f32_4 a, f32_4 b)
{
f32_4 simd;
simd.s = _mm_mul_ps(a.s, b.s);
return simd;
}
inline f32_4 operator/(f32_4 a, f32_4 b)
{
f32_4 simd;
simd.s = _mm_div_ps(a.s, b.s);
return simd;
}
inline f32_4 operator^(f32_4 a, f32_4 b)
{
f32_4 simd;
simd.s = _mm_xor_ps(a.s, b.s);
return simd;
}
inline f32_4 &operator-=(f32_4 &a, f32_4 b)
{
a = a - b;
return a;
}
inline f32_4 &operator+=(f32_4 &a, f32_4 b)
{
a = a + b;
return a;
}
inline f32_4 &operator*=(f32_4 &a, f32_4 b)
{
a = a * b;
return a;
}
inline f32_4 &operator/=(f32_4 &a, f32_4 b)
{
a = a / b;
return a;
}
inline f32_4 &operator^=(f32_4 &a, f32_4 b)
{
a = a ^ b;
return a;
}
inline f32_4 operator<(f32_4 a, f32_4 b)
{
f32_4 simd;
simd.s = _mm_cmplt_ps(a.s, b.s);
return simd;
}
inline f32_4 operator<=(f32_4 a, f32_4 b)
{
f32_4 simd;
simd.s = _mm_cmple_ps(a.s, b.s);
return simd;
}
inline f32_4 operator>(f32_4 a, f32_4 b)
{
f32_4 simd;
simd.s = _mm_cmpgt_ps(a.s, b.s);
return simd;
}
inline f32_4 operator>=(f32_4 a, f32_4 b)
{
f32_4 simd;
simd.s = _mm_cmpge_ps(a.s, b.s);
return simd;
}
inline f32_4 operator==(f32_4 a, f32_4 b)
{
f32_4 simd;
simd.s = _mm_cmpeq_ps(a.s, b.s);
return simd;
}
inline f32_4 operator!=(f32_4 a, f32_4 b)
{
f32_4 simd;
simd.s = _mm_cmpneq_ps(a.s, b.s);
return simd;
}
inline f32_4 operator&(f32_4 a, f32_4 b)
{
f32_4 simd;
simd.s = _mm_and_ps(a.s, b.s);
return simd;
}
inline f32_4 operator|(f32_4 a, f32_4 b)
{
f32_4 simd;
simd.s = _mm_or_ps(a.s, b.s);
return simd;
}
inline f32_4 &operator&=(f32_4 &a, f32_4 b)
{
a = a & b;
return a;
}
inline f32_4 &operator|=(f32_4 &a, f32_4 b)
{
a = a | b;
return a;
}
inline f32_4 abs(f32_4 a)
{
uint32 sign_mask = (uint32) (1U << 31);
__m128 mask = _mm_set1_ps(*(f32 *) &sign_mask);
f32_4 simd;
// clear the sign bit: andnot computes (~mask) & a
simd.s = _mm_andnot_ps(mask, a.s);
return simd;
}
inline f32_4 simd_min(f32_4 a, f32_4 b)
{
f32_4 simd;
simd.s = _mm_min_ps(a.s, b.s);
return simd;
}
inline f32_4 simd_max(f32_4 a, f32_4 b)
{
f32_4 simd;
simd.s = _mm_max_ps(a.s, b.s);
return simd;
}
inline f32_4 sign(f32_4 a)
{
uint32 umask = (uint32) (1U << 31);
__m128 mask = _mm_set1_ps(*(f32 *) &umask);
f32_4 signBit;
signBit.s = _mm_and_ps(a.s, mask);
f32_4 b;
b.s = _mm_set1_ps(1.0f);
f32_4 simd = b | signBit;
return simd;
}
inline f32_4 floor(f32_4 a)
{
f32_4 simd;
simd.s = _mm_floor_ps(a.s);
return simd;
}
inline f32_4 ceil(f32_4 a)
{
f32_4 simd;
simd.s = _mm_ceil_ps(a.s);
return simd;
}
inline f32_4 sqrt(f32_4 a)
{
f32_4 simd;
simd.s = _mm_sqrt_ps(a.s);
return simd;
}
inline f32_4 sqrt_inv_approx(f32_4 a)
{
f32_4 simd;
simd.s = _mm_rsqrt_ps(a.s);
return simd;
}
inline f32_4 one_over_approx(f32_4 a)
{
f32_4 simd;
simd.s = _mm_rcp_ps(a.s);
return simd;
}
inline f32_4 clamp(f32_4 min_value, f32_4 a, f32_4 max_value)
{
return simd_min(simd_max(a, min_value), max_value);
}
inline int32 which_true(f32_4 a)
{
int32 which_true = _mm_movemask_ps(a.s);
return which_true;
}
inline bool any_true(f32_4 a)
{
bool is_any_true = _mm_movemask_ps(a.s) > 0;
return is_any_true;
}
inline bool all_true(f32_4 a)
{
bool is_true = _mm_movemask_ps(a.s) == 15;
return is_true;
}
inline bool all_false(f32_4 a)
{
bool is_false = _mm_movemask_ps(a.s) == 0;
return is_false;
}
inline
f32_4 simd_sin(f32_4 a)
{
f32_4 simd;
simd.s = _mm_sin_ps(a.s);
return simd;
}
inline
f32_4 simd_cos(f32_4 a)
{
f32_4 simd;
simd.s = _mm_cos_ps(a.s);
return simd;
}
inline
f32_4 simd_asin(f32_4 a)
{
f32_4 simd;
simd.s = _mm_asin_ps(a.s);
return simd;
}
inline
f32_4 simd_acos(f32_4 a)
{
f32_4 simd;
simd.s = _mm_acos_ps(a.s);
return simd;
}
// @todo implement more trigonometry functions
#endif
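_mm_rsqrt_ps behind sqrt_inv_approx is only accurate to roughly 12 bits of relative precision; callers needing more typically refine it with one Newton-Raphson step. An illustrative helper built on the operators above (an assumption, not part of this header):

inline f32_4 sqrt_inv_refined(f32_4 a)
{
f32_4 half = init_value_f32_4(0.5f);
f32_4 three = init_value_f32_4(3.0f);
f32_4 r = sqrt_inv_approx(a);
// one Newton-Raphson iteration: r' = 0.5 * r * (3 - a * r * r)
return half * r * (three - a * r * r);
}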

42
architecture/x86/simd/SIMD_F64.h Normal file → Executable file

@ -14,40 +14,16 @@
#include "../../../stdlib/Types.h"
struct f64_2 {
union {
#if ARM
svfloat64_t s;
#else
__m128 s;
#endif
#ifdef MACRO_CPU_FEATURE_SSE42
#include "SIMD_F64_SSE.h"
#endif
f64 v[2];
};
};
#ifdef MACRO_CPU_FEATURE_AVX2
#include "SIMD_F64_AVX2.h"
#endif
struct f64_4 {
union {
#if ARM
svfloat64_t s;
#else
__m256 s;
#endif
f64 v[4];
};
};
struct f64_8 {
union {
#if ARM
svfloat64_t s;
#else
__m512 s;
#endif
f64 v[8];
};
};
#ifdef MACRO_CPU_FEATURE_AVX512
#include "SIMD_F64_AVX512.h"
#endif
#endif


@ -0,0 +1,30 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_STDLIB_SIMD_F64_AVX2_H
#define COMS_STDLIB_SIMD_F64_AVX2_H
#include <immintrin.h>
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"
struct f64_4 {
union {
#if ARM
svfloat64_t s;
#else
__m256 s;
#endif
f64 v[4];
};
};
#endif


@ -0,0 +1,29 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_STDLIB_SIMD_F64_AVX512_H
#define COMS_STDLIB_SIMD_F64_AVX512_H
#include <immintrin.h>
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"
struct f64_8 {
union {
#if ARM
svfloat64_t s;
#else
__m512 s;
#endif
f64 v[8];
};
};
#endif


@ -0,0 +1,29 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_STDLIB_SIMD_F64_SSE_H
#define COMS_STDLIB_SIMD_F64_SSE_H
#include <immintrin.h>
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"
struct f64_2 {
union {
#if ARM
svfloat64_t s;
#else
__m128 s;
#endif
f64 v[2];
};
};
#endif
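These struct-only f64 headers plug into the dispatch scheme visible in the SIMD_F64.h hunk above: the umbrella header includes whichever per-ISA definitions the feature macros enable, so a translation unit only sees vector widths its target supports. Abridged, based on the macros shown in this commit:

// SIMD_F64.h (abridged)
#ifdef MACRO_CPU_FEATURE_SSE42
#include "SIMD_F64_SSE.h" // f64_2 / __m128
#endif
#ifdef MACRO_CPU_FEATURE_AVX2
#include "SIMD_F64_AVX2.h" // f64_4 / __m256
#endif
#ifdef MACRO_CPU_FEATURE_AVX512
#include "SIMD_F64_AVX512.h" // f64_8 / __m512
#endif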

860
architecture/x86/simd/SIMD_I16.h Normal file → Executable file

@ -14,742 +14,17 @@
#include "../../../stdlib/Types.h"
struct int16_8 {
union {
#if ARM
svint16_t s;
#else
__m128i s;
#endif
#ifdef MACRO_CPU_FEATURE_SSE42
#include "SIMD_I16_SSE.h"
#endif
int16 v[8];
};
};
#ifdef MACRO_CPU_FEATURE_AVX2
#include "SIMD_I16_AVX2.h"
#endif
struct int16_16 {
union {
#if ARM
svint16_t s;
#else
__m256i s;
#endif
int16 v[16];
};
};
struct int16_32 {
union {
#if ARM
svint16_t s;
#else
__m512i s;
#endif
int16 v[32];
};
};
inline int16_8 load_int16_8(const int16* mem)
{
int16_8 simd;
simd.s = _mm_load_si128((__m128i *) mem);
return simd;
}
inline int16_8 init_int16_8(const int16* mem)
{
int16_8 simd;
simd.s = _mm_set_epi16(
mem[0], mem[1], mem[2], mem[3],
mem[4], mem[5], mem[6], mem[7]
);
return simd;
}
inline void unload_int16_8(int16_8 a, int16 *array) { _mm_store_si128((__m128i *) array, a.s); }
inline int16_16 load_int16_16(const int16* mem)
{
int16_16 simd;
simd.s = _mm256_load_si256((__m256i *) mem);
return simd;
}
inline int16_16 init_int16_16(const int16* mem)
{
int16_16 simd;
simd.s = _mm256_set_epi16(
mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7],
mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15]
);
return simd;
}
inline void unload_int16_16(int16_16 a, int16 *array) { _mm256_store_si256((__m256i *) array, a.s); }
inline int16_32 load_int16_32(const int16* mem)
{
int16_32 simd;
simd.s = _mm512_load_si512((__m512i *) mem);
return simd;
}
inline int16_32 init_int16_32(const int16* mem)
{
int16_32 simd;
simd.s = _mm512_set_epi16(
mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7],
mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15],
mem[16], mem[17], mem[18], mem[19], mem[20], mem[21], mem[22], mem[23],
mem[24], mem[25], mem[26], mem[27], mem[28], mem[29], mem[30], mem[31]
);
return simd;
}
inline void unload_int16_32(int16_32 a, int16 *array) { _mm512_storeu_epi16(array, a.s); }
inline int16_8 init_zero_int16_8()
{
int16_8 simd;
simd.s = _mm_setzero_si128();
return simd;
}
inline int16_16 init_zero_int16_16()
{
int16_16 simd;
simd.s = _mm256_setzero_si256();
return simd;
}
inline int16_32 init_zero_int16_32()
{
int16_32 simd;
simd.s = _mm512_setzero_si512();
return simd;
}
inline int16_8 init_value_int16_8(int16 value)
{
int16_8 simd;
simd.s = _mm_set1_epi16(value);
return simd;
}
inline int16_16 init_value_int16_16(int16 value)
{
int16_16 simd;
simd.s = _mm256_set1_epi16(value);
return simd;
}
inline int16_32 init_value_int16_32(int16 value)
{
int16_32 simd;
simd.s = _mm512_set1_epi16(value);
return simd;
}
inline int16_8 operator+(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_add_epi16(a.s, b.s);
return simd;
}
inline int16_16 operator+(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_add_epi16(a.s, b.s);
return simd;
}
inline int16_32 operator+(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_add_epi16(a.s, b.s);
return simd;
}
inline int16_8 operator-(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_sub_epi16(a.s, b.s);
return simd;
}
inline int16_8 operator-(int16_8 a) { return init_zero_int16_8() - a; }
inline int16_16 operator-(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_sub_epi16(a.s, b.s);
return simd;
}
inline int16_16 operator-(int16_16 a) { return init_zero_int16_16() - a; }
inline int16_32 operator-(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_sub_epi16(a.s, b.s);
return simd;
}
inline int16_32 operator-(int16_32 a) { return init_zero_int16_32() - a; }
inline int16_8 operator*(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_mul_epi32(a.s, b.s);
return simd;
}
inline int16_16 operator*(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_mul_epi32(a.s, b.s);
return simd;
}
inline int16_32 operator*(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_mul_epi32(a.s, b.s);
return simd;
}
inline int16_8 operator^(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_xor_si128(a.s, b.s);
return simd;
}
inline int16_16 operator^(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_xor_si256(a.s, b.s);
return simd;
}
inline int16_32 operator^(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_xor_si512(a.s, b.s);
return simd;
}
inline int16_8 &operator-=(int16_8 &a, int16_8 b)
{
a = a - b;
return a;
}
inline int16_16 &operator-=(int16_16 &a, int16_16 b)
{
a = a - b;
return a;
}
inline int16_32 &operator-=(int16_32 &a, int16_32 b)
{
a = a - b;
return a;
}
inline int16_8 &operator+=(int16_8 &a, int16_8 b)
{
a = a + b;
return a;
}
inline int16_16 &operator+=(int16_16 &a, int16_16 b)
{
a = a + b;
return a;
}
inline int16_32 &operator+=(int16_32 &a, int16_32 b)
{
a = a + b;
return a;
}
inline int16_8 &operator*=(int16_8 &a, int16_8 b)
{
a = a * b;
return a;
}
inline int16_16 &operator*=(int16_16 &a, int16_16 b)
{
a = a * b;
return a;
}
inline int16_32 &operator*=(int16_32 &a, int16_32 b)
{
a = a * b;
return a;
}
inline int16_8 &operator^=(int16_8 &a, int16_8 b)
{
a = a ^ b;
return a;
}
inline int16_16 &operator^=(int16_16 &a, int16_16 b)
{
a = a ^ b;
return a;
}
inline int16_32 &operator^=(int16_32 &a, int16_32 b)
{
a = a ^ b;
return a;
}
inline int16_8 operator<(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_cmplt_epi16(a.s, b.s);
return simd;
}
inline int16_16 operator<(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_xor_si256(_mm256_cmpgt_epi16(a.s, b.s), _mm256_set1_epi16(-1));
return simd;
}
inline int16_32 operator<(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_mask_blend_epi16(_mm512_cmplt_epi16_mask(a.s, b.s), a.s, b.s);
return simd;
}
inline int16_8 operator<=(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_andnot_si128(_mm_cmplt_epi16(b.s, a.s), _mm_set1_epi16(-1));
return simd;
}
inline int16_16 operator<=(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi16(a.s, b.s), _mm256_set1_epi16(-1));
return simd;
}
inline int16_32 operator<=(int16_32 a, int16_32 b)
{
int16_32 simd;
__mmask32 mask = _mm512_cmp_epi16_mask(a.s, b.s, _MM_CMPINT_LE);
simd.s = _mm512_mask_blend_epi16(mask, b.s, a.s);
return simd;
}
inline int16_8 operator>(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_cmpgt_epi16(a.s, b.s);
return simd;
}
inline int16_16 operator>(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_cmpgt_epi16(a.s, b.s);
return simd;
}
inline int16_32 operator>(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_mask_blend_epi16(_mm512_cmpgt_epi16_mask(a.s, b.s), a.s, b.s);
return simd;
}
inline int16_8 operator>=(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_andnot_si128(_mm_cmplt_epi16(a.s, b.s), _mm_set1_epi16(-1));
return simd;
}
inline int16_16 operator>=(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi16(b.s, a.s), _mm256_set1_epi16(-1));
return simd;
}
inline int16_32 operator>=(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_mask_blend_epi16(_mm512_cmpge_epi16_mask(a.s, b.s), a.s, b.s);
return simd;
}
inline int16_8 operator==(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_cmpeq_epi16(a.s, b.s);
return simd;
}
inline int16_16 operator==(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_cmpeq_epi16(a.s, b.s);
return simd;
}
inline int16_32 operator==(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_mask_blend_epi16(_mm512_cmpeq_epi16_mask(a.s, b.s), a.s, b.s);
return simd;
}
inline int16_8 operator!=(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_andnot_si128(_mm_cmpeq_epi16(a.s, b.s), _mm_set1_epi16(-1));
return simd;
}
inline int16_16 operator!=(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_mask_blend_epi16(_mm256_cmp_epi16_mask(a.s, b.s, _MM_CMPINT_NE), a.s, b.s);
return simd;
}
inline int16_32 operator!=(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_mask_blend_epi16(_mm512_cmp_epi16_mask(a.s, b.s, _MM_CMPINT_NE), a.s, b.s);
return simd;
}
inline int16_8 operator&(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_and_si128(a.s, b.s);
return simd;
}
inline int16_16 operator&(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_and_si256(a.s, b.s);
return simd;
}
inline int16_32 operator&(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_and_si512(a.s, b.s);
return simd;
}
inline int16_8 operator|(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_or_si128(a.s, b.s);
return simd;
}
inline int16_16 operator|(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_or_si256(a.s, b.s);
return simd;
}
inline int16_32 operator|(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_or_si512(a.s, b.s);
return simd;
}
inline int16_8 &operator&=(int16_8 &a, int16_8 b)
{
a = a & b;
return a;
}
inline int16_16 &operator&=(int16_16 &a, int16_16 b)
{
a = a & b;
return a;
}
inline int16_32 &operator&=(int16_32 &a, int16_32 b)
{
a = a & b;
return a;
}
inline int16_8 &operator|=(int16_8 &a, int16_8 b)
{
a = a | b;
return a;
}
inline int16_16 &operator|=(int16_16 &a, int16_16 b)
{
a = a | b;
return a;
}
inline int16_32 &operator|=(int16_32 &a, int16_32 b)
{
a = a | b;
return a;
}
inline int16_8 abs(int16_8 a)
{
int16_8 simd;
simd.s = _mm_abs_epi16(a.s);
return simd;
}
inline int16_16 abs(int16_16 a)
{
int16_16 simd;
simd.s = _mm256_abs_epi16(a.s);
return simd;
}
inline int16_32 abs(int16_32 a)
{
int16_32 simd;
simd.s = _mm512_abs_epi16(a.s);
return simd;
}
inline int16_8 simd_min(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_min_epi16(a.s, b.s);
return simd;
}
inline int16_16 simd_min(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_min_epi16(a.s, b.s);
return simd;
}
inline int16_32 simd_min(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_min_epi16(a.s, b.s);
return simd;
}
inline int16_8 simd_max(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_max_epi16(a.s, b.s);
return simd;
}
inline int16_16 simd_max(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_max_epi16(a.s, b.s);
return simd;
}
inline int16_32 simd_max(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_max_epi16(a.s, b.s);
return simd;
}
inline int16_8 clamp(int16_8 min_value, int16_8 a, int16_8 max_value)
{
return simd_min(simd_max(a, min_value), max_value);
}
inline int16_16 clamp(int16_16 min_value, int16_16 a, int16_16 max_value)
{
return simd_min(simd_max(a, min_value), max_value);
}
inline int16_32 clamp(int16_32 min_value, int16_32 a, int16_32 max_value)
{
return simd_min(simd_max(a, min_value), max_value);
}
inline int32 which_true(int16_8 a)
{
return _mm_movemask_epi8(a.s);
}
inline int32 which_true(int16_16 a)
{
return _mm256_movemask_epi8(a.s);
}
inline int32 which_true(int16_32 a)
{
return _mm512_movepi16_mask(a.s);
}
inline bool any_true(int16_8 a)
{
bool is_any_true = _mm_movemask_epi8(a.s) > 0;
return is_any_true;
}
inline bool any_true(int16_16 a)
{
bool is_any_true = _mm256_movemask_epi8(a.s) > 0;
return is_any_true;
}
inline bool any_true(int16_32 a)
{
bool is_any_true = _mm512_movepi16_mask(a.s) > 0;
return is_any_true;
}
inline bool all_true(int16_8 a)
{
bool is_true = _mm_movemask_epi8(a.s) == 15;
return is_true;
}
inline bool all_true(int16_16 a)
{
bool is_true = _mm256_movemask_epi8(a.s) == 255;
return is_true;
}
inline bool all_true(int16_32 a)
{
bool is_true = _mm512_movepi16_mask(a.s) == 65535;
return is_true;
}
inline bool all_false(int16_8 a)
{
bool is_false = _mm_movemask_epi8(a.s) == 0;
return is_false;
}
inline bool all_false(int16_16 a)
{
bool is_false = _mm256_movemask_epi8(a.s) == 0;
return is_false;
}
inline bool all_false(int16_32 a)
{
// @todo This can be optimized (also requires changing what the comparison functions return)
bool is_false = _mm512_movepi16_mask(a.s) == 0;
return is_false;
}
#ifdef MACRO_CPU_FEATURE_AVX512
#include "SIMD_I16_AVX512.h"
#endif
// @todo from here on we can optimize some of the code by NOT using the wrappers;
// the code is self-contained and we could use the intrinsic functions directly
@ -758,74 +33,93 @@ inline
void simd_mult(const int16* a, f32 b, int16* result, int32 size, int32 steps)
{
int32 i = 0;
steps = intrin_validate_steps((const byte*) a, steps);
steps = intrin_validate_steps((const byte*) result, steps);
if (steps == 16) {
__m512i a_16;
__m512 af_lo, af_hi;
__m512 b_16 = _mm512_set1_ps(b);
__m512 result_lo, result_hi;
__m512i result_16;
#ifdef MACRO_CPU_FEATURE_AVX512
if (steps >= 16) {
steps = 16;
__m512i a_16;
__m512 af_lo, af_hi;
__m512 b_16 = _mm512_set1_ps(b);
__m512 result_lo, result_hi;
__m512i result_16;
for (; i <= size - steps; i += steps) {
a_16 = _mm512_loadu_si512((__m512i*) a);
for (; i <= size - steps; i += steps) {
a_16 = _mm512_load_si512((__m512i*) a);
af_lo = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(a_16, 0)));
af_hi = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(a_16, 1)));
af_lo = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(a_16, 0)));
af_hi = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(a_16, 1)));
result_lo = _mm512_mul_ps(af_lo, b_16);
result_hi = _mm512_mul_ps(af_hi, b_16);
result_lo = _mm512_mul_ps(af_lo, b_16);
result_hi = _mm512_mul_ps(af_hi, b_16);
result_16 = _mm512_packs_epi32(_mm512_cvtps_epi32(result_lo), _mm512_cvtps_epi32(result_hi));
_mm512_storeu_si512((__m512i*) result, result_16);
result_16 = _mm512_packs_epi32(_mm512_cvtps_epi32(result_lo), _mm512_cvtps_epi32(result_hi));
_mm512_store_si512((__m512i*) result, result_16);
a += steps;
result += steps;
a += steps;
result += steps;
}
steps = 1;
}
} else if (steps == 8) {
__m256i a_8;
__m256 af_lo, af_hi;
__m256 b_8 = _mm256_set1_ps(b);
__m256 result_lo, result_hi;
__m256i result_8;
#endif
for (; i <= size - steps; i += steps) {
a_8 = _mm256_loadu_si256((__m256i*) a);
#ifdef MACRO_CPU_FEATURE_AVX2
if (steps >= 8) {
steps = 8;
__m256i a_8;
__m256 af_lo, af_hi;
__m256 b_8 = _mm256_set1_ps(b);
__m256 result_lo, result_hi;
__m256i result_8;
af_lo = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(a_8, 0)));
af_hi = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(a_8, 1)));
for (; i <= size - steps; i += steps) {
a_8 = _mm256_load_si256((__m256i*) a);
result_lo = _mm256_mul_ps(af_lo, b_8);
result_hi = _mm256_mul_ps(af_hi, b_8);
af_lo = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(a_8, 0)));
af_hi = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(a_8, 1)));
result_8 = _mm256_packs_epi32(_mm256_cvtps_epi32(result_lo), _mm256_cvtps_epi32(result_hi));
_mm256_storeu_si256((__m256i*) result, result_8);
result_lo = _mm256_mul_ps(af_lo, b_8);
result_hi = _mm256_mul_ps(af_hi, b_8);
a += steps;
result += steps;
result_8 = _mm256_packs_epi32(_mm256_cvtps_epi32(result_lo), _mm256_cvtps_epi32(result_hi));
_mm256_store_si256((__m256i*) result, result_8);
a += steps;
result += steps;
}
steps = 1;
}
} else if (steps == 4) {
__m128i a_4;
__m128 af_lo, af_hi;
__m128 b_4 = _mm_set1_ps(b);
__m128 result_lo, result_hi;
__m128i result_4;
#endif
for (; i <= size - steps; i += steps) {
a_4 = _mm_loadu_si128((__m128i*) a);
#ifdef MACRO_CPU_FEATURE_SSE42
if (steps >= 4) {
steps = 4;
__m128i a_4;
__m128 af_lo, af_hi;
__m128 b_4 = _mm_set1_ps(b);
__m128 result_lo, result_hi;
__m128i result_4;
af_lo = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_4));
af_hi = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128(a_4, 8)));
for (; i <= size - steps; i += steps) {
a_4 = _mm_load_si128((__m128i*) a);
result_lo = _mm_mul_ps(af_lo, b_4);
result_hi = _mm_mul_ps(af_hi, b_4);
af_lo = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_4));
af_hi = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128(a_4, 8)));
result_4 = _mm_packs_epi32(_mm_cvtps_epi32(result_lo), _mm_cvtps_epi32(result_hi));
_mm_storeu_si128((__m128i*) result, result_4);
result_lo = _mm_mul_ps(af_lo, b_4);
result_hi = _mm_mul_ps(af_hi, b_4);
a += steps;
result += steps;
result_4 = _mm_packs_epi32(_mm_cvtps_epi32(result_lo), _mm_cvtps_epi32(result_hi));
_mm_store_si128((__m128i*) result, result_4);
a += steps;
result += steps;
}
}
}
#endif
// Handle any remaining elements
for (; i < size; ++i) {
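The reworked simd_mult clamps steps downward inside each feature-guarded block and switches to aligned loads and stores, so callers should pass buffers aligned to the widest enabled vector width. A hedged caller sketch (sizes and the scale factor are illustrative):

alignas(64) int16 src[100];
alignas(64) int16 dst[100];
// scale 100 elements by 1.5f; the trailing scalar loop covers the last size % steps elements
simd_mult(src, 1.5f, dst, 100, 16);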


@ -0,0 +1,262 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_TOS_STDLIB_SIMD_I16_AVX2_H
#define COMS_TOS_STDLIB_SIMD_I16_AVX2_H
#include <immintrin.h>
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"
struct int16_16 {
union {
#if ARM
svint16_t s;
#else
__m256i s;
#endif
int16 v[16];
};
};
inline int16_16 load_int16_16(const int16* mem)
{
int16_16 simd;
simd.s = _mm256_load_si256((__m256i *) mem);
return simd;
}
inline int16_16 init_int16_16(const int16* mem)
{
int16_16 simd;
simd.s = _mm256_set_epi16(
mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7],
mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15]
);
return simd;
}
inline void unload_int16_16(int16_16 a, int16 *array) { _mm256_store_si256((__m256i *) array, a.s); }
inline int16_16 init_zero_int16_16()
{
int16_16 simd;
simd.s = _mm256_setzero_si256();
return simd;
}
inline int16_16 init_value_int16_16(int16 value)
{
int16_16 simd;
simd.s = _mm256_set1_epi16(value);
return simd;
}
inline int16_16 operator+(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_add_epi16(a.s, b.s);
return simd;
}
inline int16_16 operator-(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_sub_epi16(a.s, b.s);
return simd;
}
inline int16_16 operator-(int16_16 a) { return init_zero_int16_16() - a; }
inline int16_16 operator*(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_mullo_epi16(a.s, b.s); // element-wise 16-bit multiply, keeping the low 16 bits
return simd;
}
inline int16_16 operator^(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_xor_si256(a.s, b.s);
return simd;
}
inline int16_16 &operator-=(int16_16 &a, int16_16 b)
{
a = a - b;
return a;
}
inline int16_16 &operator+=(int16_16 &a, int16_16 b)
{
a = a + b;
return a;
}
inline int16_16 &operator*=(int16_16 &a, int16_16 b)
{
a = a * b;
return a;
}
inline int16_16 &operator^=(int16_16 &a, int16_16 b)
{
a = a ^ b;
return a;
}
inline int16_16 operator<(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_xor_si256(_mm256_cmpgt_epi16(a.s, b.s), _mm256_set1_epi16(-1));
return simd;
}
inline int16_16 operator<=(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi16(a.s, b.s), _mm256_set1_epi16(-1));
return simd;
}
inline int16_16 operator>(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_cmpgt_epi16(a.s, b.s);
return simd;
}
inline int16_16 operator>=(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi16(b.s, a.s), _mm256_set1_epi16(-1));
return simd;
}
inline int16_16 operator==(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_cmpeq_epi16(a.s, b.s);
return simd;
}
inline int16_16 operator!=(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_xor_si256(_mm256_cmpeq_epi16(a.s, b.s), _mm256_set1_epi16(-1)); // NOT(a == b) without AVX-512VL
return simd;
}
inline int16_16 operator&(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_and_si256(a.s, b.s);
return simd;
}
inline int16_16 operator|(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_or_si256(a.s, b.s);
return simd;
}
inline int16_16 &operator&=(int16_16 &a, int16_16 b)
{
a = a & b;
return a;
}
inline int16_16 &operator|=(int16_16 &a, int16_16 b)
{
a = a | b;
return a;
}
inline int16_16 abs(int16_16 a)
{
int16_16 simd;
simd.s = _mm256_abs_epi16(a.s);
return simd;
}
inline int16_16 simd_min(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_min_epi16(a.s, b.s);
return simd;
}
inline int16_16 simd_max(int16_16 a, int16_16 b)
{
int16_16 simd;
simd.s = _mm256_max_epi16(a.s, b.s);
return simd;
}
inline int16_16 clamp(int16_16 min_value, int16_16 a, int16_16 max_value)
{
return simd_min(simd_max(a, min_value), max_value);
}
inline int32 which_true(int16_16 a)
{
return _mm256_movemask_epi8(a.s);
}
inline bool any_true(int16_16 a)
{
// movemask_epi8 can have bit 31 set, so test against 0 instead of > 0
bool is_any_true = _mm256_movemask_epi8(a.s) != 0;
return is_any_true;
}
inline bool all_true(int16_16 a)
{
// all 32 byte-level mask bits must be set
bool is_true = _mm256_movemask_epi8(a.s) == -1;
return is_true;
}
inline bool all_false(int16_16 a)
{
bool is_false = _mm256_movemask_epi8(a.s) == 0;
return is_false;
}
#endif
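With the element-wise 16-bit multiply, products keep only their low 16 bits and wrap exactly like scalar int16 arithmetic, which is worth keeping in mind when lanes can exceed ±32767. A small sketch:

int16_16 a = init_value_int16_16(300);
int16_16 b = init_value_int16_16(300);
int16_16 c = a * b; // each lane holds 90000 mod 65536 == 24464, not 90000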


@ -0,0 +1,265 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_TOS_STDLIB_SIMD_I16_AVX512_H
#define COMS_TOS_STDLIB_SIMD_I16_AVX512_H
#include <immintrin.h>
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"
struct int16_32 {
union {
#if ARM
svint16_t s;
#else
__m512i s;
#endif
int16 v[32];
};
};
inline int16_32 load_int16_32(const int16* mem)
{
int16_32 simd;
simd.s = _mm512_load_si512((__m512i *) mem);
return simd;
}
inline int16_32 init_int16_32(const int16* mem)
{
int16_32 simd;
simd.s = _mm512_set_epi16(
mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7],
mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15],
mem[16], mem[17], mem[18], mem[19], mem[20], mem[21], mem[22], mem[23],
mem[24], mem[25], mem[26], mem[27], mem[28], mem[29], mem[30], mem[31]
);
return simd;
}
inline void unload_int16_32(int16_32 a, int16 *array) { _mm512_storeu_epi16(array, a.s); }
inline int16_32 init_zero_int16_32()
{
int16_32 simd;
simd.s = _mm512_setzero_si512();
return simd;
}
inline int16_32 init_value_int16_32(int16 value)
{
int16_32 simd;
simd.s = _mm512_set1_epi16(value);
return simd;
}
inline int16_32 operator+(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_add_epi16(a.s, b.s);
return simd;
}
inline int16_32 operator-(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_sub_epi16(a.s, b.s);
return simd;
}
inline int16_32 operator-(int16_32 a) { return init_zero_int16_32() - a; }
inline int16_32 operator*(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_mullo_epi16(a.s, b.s); // element-wise 16-bit multiply, keeping the low 16 bits
return simd;
}
inline int16_32 operator^(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_xor_si512(a.s, b.s);
return simd;
}
inline int16_32 &operator-=(int16_32 &a, int16_32 b)
{
a = a - b;
return a;
}
inline int16_32 &operator+=(int16_32 &a, int16_32 b)
{
a = a + b;
return a;
}
inline int16_32 &operator*=(int16_32 &a, int16_32 b)
{
a = a * b;
return a;
}
inline int16_32 &operator^=(int16_32 &a, int16_32 b)
{
a = a ^ b;
return a;
}
inline int16_32 operator<(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_mask_blend_epi16(_mm512_cmplt_epi16_mask(a.s, b.s), a.s, b.s);
return simd;
}
inline int16_32 operator<=(int16_32 a, int16_32 b)
{
int16_32 simd;
__mmask32 mask = _mm512_cmp_epi16_mask(a.s, b.s, _MM_CMPINT_LE);
simd.s = _mm512_mask_blend_epi16(mask, b.s, a.s);
return simd;
}
inline int16_32 operator>(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_mask_blend_epi16(_mm512_cmpgt_epi16_mask(a.s, b.s), a.s, b.s);
return simd;
}
inline int16_32 operator>=(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_mask_blend_epi16(_mm512_cmpge_epi16_mask(a.s, b.s), a.s, b.s);
return simd;
}
inline int16_32 operator==(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_mask_blend_epi16(_mm512_cmpeq_epi16_mask(a.s, b.s), a.s, b.s);
return simd;
}
inline int16_32 operator!=(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_mask_blend_epi16(_mm512_cmp_epi16_mask(a.s, b.s, _MM_CMPINT_NE), a.s, b.s);
return simd;
}
inline int16_32 operator&(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_and_si512(a.s, b.s);
return simd;
}
inline int16_32 operator|(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_or_si512(a.s, b.s);
return simd;
}
inline int16_32 &operator&=(int16_32 &a, int16_32 b)
{
a = a & b;
return a;
}
inline int16_32 &operator|=(int16_32 &a, int16_32 b)
{
a = a | b;
return a;
}
inline int16_32 abs(int16_32 a)
{
int16_32 simd;
simd.s = _mm512_abs_epi16(a.s);
return simd;
}
inline int16_32 simd_min(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_min_epi16(a.s, b.s);
return simd;
}
inline int16_32 simd_max(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_max_epi16(a.s, b.s);
return simd;
}
inline int16_32 clamp(int16_32 min_value, int16_32 a, int16_32 max_value)
{
return simd_min(simd_max(a, min_value), max_value);
}
inline int32 which_true(int16_32 a)
{
return _mm512_movepi16_mask(a.s);
}
inline bool any_true(int16_32 a)
{
bool is_any_true = _mm512_movepi16_mask(a.s) > 0;
return is_any_true;
}
inline bool all_true(int16_32 a)
{
// 32 lanes -> all bits of the __mmask32 must be set
bool is_true = _mm512_movepi16_mask(a.s) == 0xFFFFFFFF;
return is_true;
}
inline bool all_false(int16_32 a)
{
// @todo This can be optimized (also requires changing what the comparison functions return)
bool is_false = _mm512_movepi16_mask(a.s) == 0;
return is_false;
}
#endif


@ -0,0 +1,261 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_TOS_STDLIB_SIMD_I16_SSE_H
#define COMS_TOS_STDLIB_SIMD_I16_SSE_H
#include <immintrin.h>
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"
struct int16_8 {
union {
#if ARM
svint16_t s;
#else
__m128i s;
#endif
int16 v[8];
};
};
inline int16_8 load_int16_8(const int16* mem)
{
int16_8 simd;
simd.s = _mm_load_si128((__m128i *) mem);
return simd;
}
inline int16_8 init_int16_8(const int16* mem)
{
int16_8 simd;
simd.s = _mm_set_epi16(
mem[0], mem[1], mem[2], mem[3],
mem[4], mem[5], mem[6], mem[7]
);
return simd;
}
inline void unload_int16_8(int16_8 a, int16 *array) { _mm_store_si128((__m128i *) array, a.s); }
inline int16_8 init_zero_int16_8()
{
int16_8 simd;
simd.s = _mm_setzero_si128();
return simd;
}
inline int16_8 init_value_int16_8(int16 value)
{
int16_8 simd;
simd.s = _mm_set1_epi16(value);
return simd;
}
inline int16_8 operator+(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_add_epi16(a.s, b.s);
return simd;
}
inline int16_8 operator-(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_sub_epi16(a.s, b.s);
return simd;
}
inline int16_8 operator-(int16_8 a) { return init_zero_int16_8() - a; }
inline int16_8 operator*(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_mullo_epi16(a.s, b.s); // element-wise 16-bit multiply, keeping the low 16 bits
return simd;
}
inline int16_8 operator^(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_xor_si128(a.s, b.s);
return simd;
}
inline int16_8 &operator-=(int16_8 &a, int16_8 b)
{
a = a - b;
return a;
}
inline int16_8 &operator+=(int16_8 &a, int16_8 b)
{
a = a + b;
return a;
}
inline int16_8 &operator*=(int16_8 &a, int16_8 b)
{
a = a * b;
return a;
}
inline int16_8 &operator^=(int16_8 &a, int16_8 b)
{
a = a ^ b;
return a;
}
inline int16_8 operator<(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_cmplt_epi16(a.s, b.s);
return simd;
}
inline int16_8 operator<=(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_andnot_si128(_mm_cmplt_epi16(b.s, a.s), _mm_set1_epi16(-1));
return simd;
}
inline int16_8 operator>(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_cmpgt_epi16(a.s, b.s);
return simd;
}
inline int16_8 operator>=(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_andnot_si128(_mm_cmplt_epi16(a.s, b.s), _mm_set1_epi16(-1));
return simd;
}
inline int16_8 operator==(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_cmpeq_epi16(a.s, b.s);
return simd;
}
inline int16_8 operator!=(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_andnot_si128(_mm_cmpeq_epi16(a.s, b.s), _mm_set1_epi16(-1));
return simd;
}
inline int16_8 operator&(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_and_si128(a.s, b.s);
return simd;
}
inline int16_8 operator|(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_or_si128(a.s, b.s);
return simd;
}
inline int16_8 &operator&=(int16_8 &a, int16_8 b)
{
a = a & b;
return a;
}
inline int16_8 &operator|=(int16_8 &a, int16_8 b)
{
a = a | b;
return a;
}
inline int16_8 abs(int16_8 a)
{
int16_8 simd;
simd.s = _mm_abs_epi16(a.s);
return simd;
}
inline int16_8 simd_min(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_min_epi16(a.s, b.s);
return simd;
}
inline int16_8 simd_max(int16_8 a, int16_8 b)
{
int16_8 simd;
simd.s = _mm_max_epi16(a.s, b.s);
return simd;
}
inline int16_8 clamp(int16_8 min_value, int16_8 a, int16_8 max_value)
{
return simd_min(simd_max(a, min_value), max_value);
}
inline int32 which_true(int16_8 a)
{
return _mm_movemask_epi8(a.s);
}
inline bool any_true(int16_8 a)
{
bool is_any_true = _mm_movemask_epi8(a.s) > 0;
return is_any_true;
}
inline bool all_true(int16_8 a)
{
// movemask_epi8 yields 16 byte-level bits
bool is_true = _mm_movemask_epi8(a.s) == 0xFFFF;
return is_true;
}
inline bool all_false(int16_8 a)
{
bool is_false = _mm_movemask_epi8(a.s) == 0;
return is_false;
}
#endif

2125
architecture/x86/simd/SIMD_I32.h Normal file → Executable file

File diff suppressed because it is too large.


@ -0,0 +1,288 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_TOS_STDLIB_SIMD_I32_AVX2_H
#define COMS_TOS_STDLIB_SIMD_I32_AVX2_H
#include <immintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include "../../../stdlib/Types.h"
// @todo a lot of SSE functions require a higher feature level (e.g. SSE4.1); this needs to be changed to be more general,
// or better, create alternative functions for the available SSE version.
// @question why are we passing structs by value?
struct int32_8 {
union {
#if ARM
svint32_t s;
#else
__m256i s;
#endif
int32 v[8];
};
};
inline int32_8 load_int32_8(const int32* mem)
{
int32_8 simd;
simd.s = _mm256_load_si256((__m256i *) mem);
return simd;
}
inline int32_8 init_int32_8(const int32* mem)
{
int32_8 simd;
simd.s = _mm256_set_epi32(mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7]);
return simd;
}
inline void unload_int32_8(int32_8 a, int32 *array) { _mm256_store_si256((__m256i *) array, a.s); }
inline int32_8 init_zero_int32_8()
{
int32_8 simd;
simd.s = _mm256_setzero_si256();
return simd;
}
inline int32_8 init_value_int32_8(int32 value)
{
int32_8 simd;
simd.s = _mm256_set1_epi32(value);
return simd;
}
inline int32_8 init_values_int32_8(
int32 a, int32 b, int32 c, int32 d,
int32 e, int32 f, int32 g, int32 h
)
{
int32_8 simd;
simd.s = _mm256_set_epi32(a, b, c, d, e, f, g, h);
return simd;
}
inline int32_8 operator+(int32_8 a, int32_8 b)
{
int32_8 simd;
simd.s = _mm256_add_epi32(a.s, b.s);
return simd;
}
inline int32_8 operator-(int32_8 a, int32_8 b)
{
int32_8 simd;
simd.s = _mm256_sub_epi32(a.s, b.s);
return simd;
}
inline int32_8 operator-(int32_8 a) { return init_zero_int32_8() - a; }
inline int32_8 operator*(int32_8 a, int32_8 b)
{
int32_8 simd;
simd.s = _mm256_mullo_epi32(a.s, b.s); // element-wise 32-bit multiply, keeping the low 32 bits
return simd;
}
inline int32_8 operator^(int32_8 a, int32_8 b)
{
int32_8 simd;
simd.s = _mm256_xor_si256(a.s, b.s); // _mm256_xor_epi32 needs AVX-512VL
return simd;
}
inline int32_8 &operator-=(int32_8 &a, int32_8 b)
{
a = a - b;
return a;
}
inline int32_8 &operator+=(int32_8 &a, int32_8 b)
{
a = a + b;
return a;
}
inline int32_8 &operator*=(int32_8 &a, int32_8 b)
{
a = a * b;
return a;
}
inline int32_8 &operator^=(int32_8 &a, int32_8 b)
{
a = a ^ b;
return a;
}
inline int32_8 operator<(int32_8 a, int32_8 b)
{
int32_8 simd;
simd.s = _mm256_xor_si256(_mm256_cmpgt_epi32(a.s, b.s), _mm256_set1_epi32(-1));
return simd;
}
inline int32_8 operator<=(int32_8 a, int32_8 b)
{
int32_8 simd;
simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi32(a.s, b.s), _mm256_set1_epi32(-1));
return simd;
}
inline int32_8 operator>(int32_8 a, int32_8 b)
{
int32_8 simd;
simd.s = _mm256_cmpgt_epi32(a.s, b.s);
return simd;
}
inline int32_8 operator>=(int32_8 a, int32_8 b)
{
int32_8 simd;
simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi32(b.s, a.s), _mm256_set1_epi32(-1));
return simd;
}
inline int32_8 operator==(int32_8 a, int32_8 b)
{
int32_8 simd;
simd.s = _mm256_cmpeq_epi32(a.s, b.s);
return simd;
}
inline int32_8 operator!=(int32_8 a, int32_8 b)
{
int32_8 simd;
simd.s = _mm256_xor_si256(_mm256_cmpeq_epi32(a.s, b.s), _mm256_set1_epi32(-1)); // NOT(a == b) without AVX-512VL
return simd;
}
inline int32_8 operator&(int32_8 a, int32_8 b)
{
int32_8 simd;
simd.s = _mm256_and_si256(a.s, b.s);
return simd;
}
inline int32_8 operator|(int32_8 a, int32_8 b)
{
int32_8 simd;
simd.s = _mm256_or_si256(a.s, b.s); // _mm256_or_epi32 needs AVX-512VL
return simd;
}
inline int32_8 &operator&=(int32_8 &a, int32_8 b)
{
a = a & b;
return a;
}
inline int32_8 &operator|=(int32_8 &a, int32_8 b)
{
a = a | b;
return a;
}
inline int32_8 abs(int32_8 a)
{
int32_8 simd;
simd.s = _mm256_abs_epi32(a.s);
return simd;
}
inline int32_8 simd_min(int32_8 a, int32_8 b)
{
int32_8 simd;
simd.s = _mm256_min_epi32(a.s, b.s);
return simd;
}
inline int32_8 simd_max(int32_8 a, int32_8 b)
{
int32_8 simd;
simd.s = _mm256_max_epi32(a.s, b.s);
return simd;
}
inline int32_8 sign(int32_8 a)
{
// the float trick of OR-ing the sign bit into 1 does not work for two's complement;
// an arithmetic shift gives 0 for non-negative lanes and -1 for negative ones
__m256i negative = _mm256_srai_epi32(a.s, 31);
int32_8 simd;
simd.s = _mm256_or_si256(_mm256_set1_epi32(1), negative);
return simd;
}
inline int32_8 clamp(int32_8 min_value, int32_8 a, int32_8 max_value)
{
return simd_min(simd_max(a, min_value), max_value);
}
inline int32 which_true(int32_8 a)
{
int32 which_true = _mm256_movemask_epi8(a.s);
return which_true;
}
inline bool any_true(int32_8 a)
{
// movemask_epi8 can have bit 31 set, so test against 0 instead of > 0
bool is_any_true = _mm256_movemask_epi8(a.s) != 0;
return is_any_true;
}
inline bool all_true(int32_8 a)
{
// all 32 byte-level mask bits must be set
bool is_true = _mm256_movemask_epi8(a.s) == -1;
return is_true;
}
inline bool all_false(int32_8 a)
{
bool is_false = _mm256_movemask_epi8(a.s) == 0;
return is_false;
}
#endif
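Because which_true here builds on _mm256_movemask_epi8, every true int32 lane contributes four consecutive set bits. A sketch of decoding the mask back into lane indices, given two int32_8 values a and b:

int32 mask = which_true(a == b);
for (int32 lane = 0; lane < 8; ++lane) {
bool lane_true = (mask >> (lane * 4)) & 0x1; // test one bit of the lane's 4-bit group
}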


@ -0,0 +1,309 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_TOS_STDLIB_SIMD_I32_AVX512_H
#define COMS_TOS_STDLIB_SIMD_I32_AVX512_H
#include <immintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include "../../../stdlib/Types.h"
#include "SIMD_SVML_AVX512.h"
// @todo a lot of SSE functions require a higher feature level (e.g. SSE4.1); this needs to be changed to be more general,
// or better, create alternative functions for the available SSE version.
// @question why are we passing structs by value?
struct int32_16 {
union {
#if ARM
svint32_t s;
#else
__m512i s;
#endif
int32 v[16];
};
};
inline int32_16 load_int32_16(const int32* mem)
{
int32_16 simd;
simd.s = _mm512_load_epi32(mem);
return simd;
}
inline int32_16 init_int32_16(const int32* mem)
{
int32_16 simd;
simd.s = _mm512_set_epi32(
mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7],
mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15]);
return simd;
}
inline void unload_int32_16(int32_16 a, int32 *array) { _mm512_store_epi32(array, a.s); }
inline int32_16 init_zero_int32_16()
{
int32_16 simd;
simd.s = _mm512_setzero_epi32();
return simd;
}
inline int32_16 init_value_int32_16(int32 value)
{
int32_16 simd;
simd.s = _mm512_set1_epi32(value);
return simd;
}
inline int32_16 init_values_int32_16(
int32 a, int32 b, int32 c, int32 d,
int32 e, int32 f, int32 g, int32 h,
int32 i, int32 j, int32 k, int32 l,
int32 m, int32 n, int32 o, int32 p
)
{
int32_16 simd;
simd.s = _mm512_set_epi32(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
return simd;
}
inline int32_16 operator+(int32_16 a, int32_16 b)
{
int32_16 simd;
simd.s = _mm512_add_epi32(a.s, b.s);
return simd;
}
inline int32_16 operator-(int32_16 a, int32_16 b)
{
int32_16 simd;
simd.s = _mm512_sub_epi32(a.s, b.s);
return simd;
}
inline int32_16 operator-(int32_16 a) { return init_zero_int32_16() - a; }
inline int32_16 operator*(int32_16 a, int32_16 b)
{
int32_16 simd;
simd.s = _mm512_mullo_epi32(a.s, b.s); // element-wise 32-bit multiply, keeping the low 32 bits
return simd;
}
inline int32_16 operator/(int32_16 a, int32_16 b)
{
int32_16 simd;
simd.s = _mm512_div_epi32(a.s, b.s);
return simd;
}
inline int32_16 operator^(int32_16 a, int32_16 b)
{
int32_16 simd;
simd.s = _mm512_xor_epi32(a.s, b.s);
return simd;
}
inline int32_16 &operator-=(int32_16 &a, int32_16 b)
{
a = a - b;
return a;
}
inline int32_16 &operator+=(int32_16 &a, int32_16 b)
{
a = a + b;
return a;
}
inline int32_16 &operator*=(int32_16 &a, int32_16 b)
{
a = a * b;
return a;
}
inline int32_16 &operator/=(int32_16 &a, int32_16 b)
{
a = a / b;
return a;
}
inline int32_16 &operator^=(int32_16 &a, int32_16 b)
{
a = a ^ b;
return a;
}
inline int32_16 operator<(int32_16 a, int32_16 b)
{
int32_16 simd;
simd.s = _mm512_mask_blend_epi32(_mm512_cmplt_epi32_mask(a.s, b.s), a.s, b.s);
return simd;
}
inline int32_16 operator<=(int32_16 a, int32_16 b)
{
int32_16 simd;
simd.s = _mm512_mask_blend_epi32(_mm512_knot(_mm512_cmpgt_epi32_mask(b.s, a.s)), b.s, a.s);
return simd;
}
inline int32_16 operator>(int32_16 a, int32_16 b)
{
int32_16 simd;
simd.s = _mm512_mask_blend_epi32(_mm512_cmpgt_epi32_mask(a.s, b.s), a.s, b.s);
return simd;
}
inline int32_16 operator>=(int32_16 a, int32_16 b)
{
int32_16 simd;
simd.s = _mm512_mask_blend_epi32(_mm512_cmpge_epi32_mask(a.s, b.s), a.s, b.s);
return simd;
}
inline int32_16 operator==(int32_16 a, int32_16 b)
{
int32_16 simd;
simd.s = _mm512_mask_blend_epi32(_mm512_cmpeq_epi32_mask(a.s, b.s), a.s, b.s);
return simd;
}
inline int32_16 operator!=(int32_16 a, int32_16 b)
{
int32_16 simd;
simd.s = _mm512_mask_blend_epi32(_mm512_cmp_epi32_mask(a.s, b.s, _MM_CMPINT_NE), a.s, b.s);
return simd;
}
inline int32_16 operator&(int32_16 a, int32_16 b)
{
int32_16 simd;
simd.s = _mm512_and_si512(a.s, b.s);
return simd;
}
inline int32_16 operator|(int32_16 a, int32_16 b)
{
int32_16 simd;
simd.s = _mm512_or_epi32(a.s, b.s);
return simd;
}
inline int32_16 &operator&=(int32_16 &a, int32_16 b)
{
a = a & b;
return a;
}
inline int32_16 &operator|=(int32_16 &a, int32_16 b)
{
a = a | b;
return a;
}
inline int32_16 abs(int32_16 a)
{
    int32_16 simd;
    simd.s = _mm512_abs_epi32(a.s);
    return simd;
}
inline int32_16 simd_min(int32_16 a, int32_16 b)
{
int32_16 simd;
simd.s = _mm512_min_epi32(a.s, b.s);
return simd;
}
inline int32_16 simd_max(int32_16 a, int32_16 b)
{
int32_16 simd;
simd.s = _mm512_max_epi32(a.s, b.s);
return simd;
}
inline int32_16 sign(int32_16 a)
{
__m512i mask = _mm512_set1_epi32(0x80000000);
__m512i signBit = _mm512_and_si512(a.s, mask);
__m512i b = _mm512_set1_epi32(1);
int32_16 simd;
simd.s = _mm512_or_si512(b, signBit);
return simd;
}
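// sign() keeps only the sign bit of each lane and ORs it with 1, so every lane
// becomes +1 for values >= 0 and -1 for negative values (zero maps to +1).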
inline int32_16 clamp(int32_16 min_value, int32_16 a, int32_16 max_value)
{
return simd_min(simd_max(a, min_value), max_value);
}
inline int32 which_true(int32_16 a)
{
int32 which_true = _mm512_movepi32_mask(a.s);
return which_true;
}
inline bool any_true(int32_16 a)
{
bool is_any_true = _mm512_movepi32_mask(a.s) > 0;
return is_any_true;
}
inline bool all_true(int32_16 a)
{
bool is_true = _mm512_movepi32_mask(a.s) == 65535;
return is_true;
}
inline bool all_false(int32_16 a)
{
// @todo This can be optimized (requires also changes in the comparison functions return)
bool is_false = _mm512_movepi32_mask(a.s) == 0;
return is_false;
}
#endif

View File

@ -0,0 +1,286 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_TOS_STDLIB_SIMD_I32_SSE_H
#define COMS_TOS_STDLIB_SIMD_I32_SSE_H
#include <immintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include "../../../stdlib/Types.h"
// @todo A lot of SSE functions require a higher feature level (e.g. SSE4.1); this needs to be changed to be more general,
// or better, create alternative functions for the available SSE version.
// @question why are we passing structs by value?
struct int32_4 {
union {
#if ARM
svint32_t s;
#else
__m128i s;
#endif
int32 v[4];
};
};
inline int32_4 load_int32_4(const int32* mem)
{
int32_4 simd;
simd.s = _mm_load_si128((__m128i *) mem);
return simd;
}
inline int32_4 init_int32_4(const int32* mem)
{
int32_4 simd;
simd.s = _mm_set_epi32(mem[0], mem[1], mem[2], mem[3]);
return simd;
}
inline void unload_int32_4(int32_4 a, int32 *array) { _mm_store_si128((__m128i *) array, a.s); }
inline int32_4 init_zero_int32_4()
{
int32_4 simd;
simd.s = _mm_setzero_si128();
return simd;
}
inline int32_4 init_value_int32_4(int32 value)
{
int32_4 simd;
simd.s = _mm_set1_epi32(value);
return simd;
}
inline int32_4 init_values_int32_4(int32 a, int32 b, int32 c, int32 d)
{
int32_4 simd;
simd.s = _mm_set_epi32(a, b, c, d);
return simd;
}
inline int32_4 operator+(int32_4 a, int32_4 b)
{
int32_4 simd;
simd.s = _mm_add_epi32(a.s, b.s);
return simd;
}
inline int32_4 operator-(int32_4 a, int32_4 b)
{
int32_4 simd;
simd.s = _mm_sub_epi32(a.s, b.s);
return simd;
}
inline int32_4 operator-(int32_4 a) { return init_zero_int32_4() - a; }
inline int32_4 operator*(int32_4 a, int32_4 b)
{
    int32_4 simd;
    // SSE4.1; _mm_mul_epi32 only multiplies the even lanes into 64-bit results
    simd.s = _mm_mullo_epi32(a.s, b.s);
    return simd;
}
inline int32_4 operator^(int32_4 a, int32_4 b)
{
    int32_4 simd;
    // _mm_xor_epi32 is AVX512VL; plain SSE2 xor works on the full register
    simd.s = _mm_xor_si128(a.s, b.s);
    return simd;
}
inline int32_4 &operator-=(int32_4 &a, int32_4 b)
{
a = a - b;
return a;
}
inline int32_4 &operator+=(int32_4 &a, int32_4 b)
{
a = a + b;
return a;
}
inline int32_4 &operator*=(int32_4 &a, int32_4 b)
{
a = a * b;
return a;
}
inline int32_4 &operator^=(int32_4 &a, int32_4 b)
{
a = a ^ b;
return a;
}
inline int32_4 operator<(int32_4 a, int32_4 b)
{
int32_4 simd;
simd.s = _mm_cmplt_epi32(a.s, b.s);
return simd;
}
inline int32_4 operator<=(int32_4 a, int32_4 b)
{
int32_4 simd;
simd.s = _mm_andnot_si128(_mm_cmplt_epi32(b.s, a.s), _mm_set1_epi32(-1));
return simd;
}
inline int32_4 operator>(int32_4 a, int32_4 b)
{
int32_4 simd;
simd.s = _mm_cmpgt_epi32(a.s, b.s);
return simd;
}
inline int32_4 operator>=(int32_4 a, int32_4 b)
{
int32_4 simd;
simd.s = _mm_andnot_si128(_mm_cmplt_epi32(a.s, b.s), _mm_set1_epi32(-1));
return simd;
}
inline int32_4 operator==(int32_4 a, int32_4 b)
{
int32_4 simd;
simd.s = _mm_cmpeq_epi32(a.s, b.s);
return simd;
}
inline int32_4 operator!=(int32_4 a, int32_4 b)
{
int32_4 simd;
simd.s = _mm_andnot_si128(_mm_cmpeq_epi32(a.s, b.s), _mm_set1_epi32(-1));
return simd;
}
inline int32_4 operator&(int32_4 a, int32_4 b)
{
int32_4 simd;
simd.s = _mm_and_si128(a.s, b.s);
return simd;
}
inline int32_4 operator|(int32_4 a, int32_4 b)
{
    int32_4 simd;
    // _mm_or_epi32 is AVX512VL; plain SSE2 or works on the full register
    simd.s = _mm_or_si128(a.s, b.s);
    return simd;
}
inline int32_4 &operator&=(int32_4 &a, int32_4 b)
{
a = a & b;
return a;
}
inline int32_4 &operator|=(int32_4 &a, int32_4 b)
{
a = a | b;
return a;
}
inline int32_4 abs(int32_4 a)
{
int32_4 simd;
simd.s = _mm_abs_epi32(a.s);
return simd;
}
inline int32_4 simd_min(int32_4 a, int32_4 b)
{
int32_4 simd;
simd.s = _mm_min_epi32(a.s, b.s);
return simd;
}
inline int32_4 simd_max(int32_4 a, int32_4 b)
{
int32_4 simd;
simd.s = _mm_max_epi32(a.s, b.s);
return simd;
}
inline int32_4 sign(int32_4 a)
{
__m128i mask = _mm_set1_epi32(0x80000000);
__m128i signBit = _mm_and_si128(a.s, mask);
__m128i b = _mm_set1_epi32(1);
int32_4 simd;
simd.s = _mm_or_si128(b, signBit);
return simd;
}
inline int32_4 clamp(int32_4 min_value, int32_4 a, int32_4 max_value)
{
return simd_min(simd_max(a, min_value), max_value);
}
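// Example: clamp(init_value_int32_4(0), x, init_value_int32_4(255)) limits every lane of x to [0, 255].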
inline int32 which_true(int32_4 a)
{
int32 which_true = _mm_movemask_epi8(a.s);
return which_true;
}
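// Note: _mm_movemask_epi8 yields one bit per byte, so each all-ones int32 lane
// contributes 4 set bits (0xF) to the returned mask rather than a single bit.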
inline bool any_true(int32_4 a)
{
bool is_any_true = _mm_movemask_epi8(a.s) > 0;
return is_any_true;
}
inline bool all_true(int32_4 a)
{
    // All 16 byte positions of the 128-bit movemask must be set
    bool is_true = _mm_movemask_epi8(a.s) == 0xFFFF;
    return is_true;
}
inline bool all_false(int32_4 a)
{
bool is_false = _mm_movemask_epi8(a.s) == 0;
return is_false;
}
#endif

43
architecture/x86/simd/SIMD_I64.h Normal file → Executable file
View File

@ -13,42 +13,17 @@
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"
#include "SIMD_F64.h"
#ifdef MACRO_CPU_FEATURE_SSE42
#include "SIMD_I64_SSE.h"
#endif

#ifdef MACRO_CPU_FEATURE_AVX2
#include "SIMD_I64_AVX2.h"
#endif

#ifdef MACRO_CPU_FEATURE_AVX512
#include "SIMD_I64_AVX512.h"
#endif
#endif

View File

@ -0,0 +1,29 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_TOS_STDLIB_SIMD_I64_AVX2_H
#define COMS_TOS_STDLIB_SIMD_I64_AVX2_H
#include <immintrin.h>
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"
struct int64_4 {
union {
#if ARM
svint64_t s;
#else
__m256i s;
#endif
int64 v[4];
};
};
#endif

View File

@ -0,0 +1,29 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_TOS_STDLIB_SIMD_I64_AVX512_H
#define COMS_TOS_STDLIB_SIMD_I64_AVX512_H
#include <immintrin.h>
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"
struct int64_8 {
union {
#if ARM
svint64_t s;
#else
__m512i s;
#endif
int64 v[8];
};
};
#endif

View File

@ -0,0 +1,29 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_TOS_STDLIB_SIMD_I64_SSE_H
#define COMS_TOS_STDLIB_SIMD_I64_SSE_H
#include <immintrin.h>
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"
struct int64_2 {
union {
#if ARM
svint64_t s;
#else
__m128i s;
#endif
int64 v[2];
};
};
#endif

946
architecture/x86/simd/SIMD_I8.h Normal file → Executable file
View File

@ -13,906 +13,108 @@
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"
#include "SIMD_F32.h"
#include "SIMD_I32.h"
#ifdef MACRO_CPU_FEATURE_SSE42
#include "SIMD_I8_SSE.h"
#endif

#ifdef MACRO_CPU_FEATURE_AVX2
#include "SIMD_I8_AVX2.h"
#endif

#ifdef MACRO_CPU_FEATURE_AVX512
#include "SIMD_I8_AVX512.h"
#endif
inline
f32_4 int8_16_to_f32_4(int8_16 a)
{
    f32_4 result;
    // Sign-extend the lowest 4 int8 lanes to int32 before the float conversion
    result.s = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(a.s));
    return result;
}
inline
f32_8 int8_16_to_f32_8(int8_16 a)
{
    f32_8 result;
    // int8 is signed, so use the sign-extending conversion, not _mm256_cvtepu8_epi32
    result.s = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(a.s));
    return result;
}
inline
f32_16 int8_16_to_f32_16(int8_16 a)
{
    f32_16 result;
    result.s = _mm512_cvtepi32_ps(_mm512_cvtepi8_epi32(a.s));
    return result;
}
// @todo From down here we can optimize some of the code by NOT using the wrappers;
// the code is self contained and we could use the intrinsic functions directly.
/*
inline
f32 simd_mult(const int8* a, f32 b, int32 size, int32 steps)
{
if (steps == 16) {
__m512i a_16 = _mm512_load_si512((__m512i *) a);
__m512 af_16 = _mm512_cvtepi32_ps(a_16);
__m512 b_16 = _mm512_set1_ps(b);
__m512 result = _mm512_mul_ps(af_16, b_16);
} else if (steps == 8) {
__m256i a_8 = _mm256_load_si256((__m256i *) a);
__m256 af_8 = _mm256_cvtepi32_ps(a_8);
__m256 b_8 = _mm256_set1_ps(b);
__m256 result = _mm256_mul_ps(af_8, b_8);
} else if (steps == 4) {
__m128i a_4 = _mm_load_si128((__m128i *) a);
__m128 af_4 = _mm_cvtepi32_ps(a_4);
__m128 b_4 = _mm_set1_ps(b);
__m128 result = _mm_mul_ps(af_4, b_4);
} else {
}
}
*/
bool simd_compare_64(const byte* a, const byte* b)
{
__m256i chunk1_a = _mm256_load_si256((__m256i*) a);
__m256i chunk1_b = _mm256_load_si256((__m256i*) b);
__m256i chunk2_a = _mm256_load_si256((__m256i*) (a + 32));
__m256i chunk2_b = _mm256_load_si256((__m256i*) (b + 32));
__m256i result1 = _mm256_cmpeq_epi8(chunk1_a, chunk1_b);
__m256i result2 = _mm256_cmpeq_epi8(chunk2_a, chunk2_b);
__m256i combined = _mm256_and_si256(result1, result2);
return _mm256_testc_si256(combined, _mm256_set1_epi8(-1)) != 1;
}
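// Note: simd_compare_64 compares exactly 64 bytes and requires both pointers to be
// 32-byte aligned (it uses aligned loads). As written it returns true when the two
// blocks differ, since _mm256_testc_si256 yields 1 only if every byte compared equal.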
int simd_equal(const byte* a, const byte* b, uint32 size, uint32 steps = 8) {
    uint32 i = 0;
    steps = intrin_validate_steps((const byte*) a, steps);
    steps = intrin_validate_steps((const byte*) b, steps);

    #ifdef MACRO_CPU_FEATURE_AVX512
    if (steps >= 16) {
        steps = 16;
        if (size >= 128) {
            __m512i a_16;
            __m512i b_16;
            __mmask64 result_mask;

            for (; i <= size - 64; i += 64) { // 64 bytes per iteration
                a_16 = _mm512_load_si512((__m512i*) a);
                b_16 = _mm512_load_si512((__m512i*) b);

                result_mask = _mm512_cmpeq_epi8_mask(a_16, b_16);

                if (result_mask != 0xFFFFFFFFFFFFFFFF) {
                    return false;
                }

                a += 64;
                b += 64;
            }
        }

        if (size - i >= 64) {
            return simd_equal(a, b, size - i, 8);
        } else if (size - i >= 32) {
            return simd_equal(a, b, size - i, 4);
        }
    }
    #endif

    #ifdef MACRO_CPU_FEATURE_AVX2
    if (steps >= 8) {
        steps = 8;
        if (size >= 64) {
            __m256i a_8;
            __m256i b_8;
            __m256i result_8;

            for (; i <= size - 32; i += 32) { // 32 bytes per iteration
                a_8 = _mm256_load_si256((__m256i*) a);
                b_8 = _mm256_load_si256((__m256i*) b);

                result_8 = _mm256_cmpeq_epi8(a_8, b_8);

                if (_mm256_testc_si256(result_8, _mm256_set1_epi8(-1)) != 1) {
                    return false;
                }

                a += 32;
                b += 32;
            }
        }

        if (size - i >= 32) {
            return simd_equal(a, b, size - i, 4);
        }
    }
    #endif

    #ifdef MACRO_CPU_FEATURE_SSE42
    if (steps >= 4) {
        steps = 4;
        if (size >= 16) {
            __m128i a_4;
            __m128i b_4;
            __m128i result_4;

            for (; i <= size - 16; i += 16) { // 16 bytes per iteration
                a_4 = _mm_load_si128((__m128i*) a);
                b_4 = _mm_load_si128((__m128i*) b);

                result_4 = _mm_cmpeq_epi8(a_4, b_4);

                if (_mm_movemask_epi8(result_4) != 0xFFFF) {
                    return false;
                }

                a += 16;
                b += 16;
            }
        }
    }
    #endif

    for (; i < size; ++i) {
        if (*a++ != *b++) {

View File

@ -0,0 +1,265 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_TOS_STDLIB_SIMD_I8_AVX2_H
#define COMS_TOS_STDLIB_SIMD_I8_AVX2_H
#include <immintrin.h>
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"
struct int8_32 {
union {
#if ARM
svint8_t s;
#else
__m256i s;
#endif
int8 v[32];
};
};
inline int8_32 load_int8_32(const int8* mem)
{
int8_32 simd;
simd.s = _mm256_load_si256((__m256i *) mem);
return simd;
}
inline int8_32 init_int8_32(const int8* mem)
{
int8_32 simd;
simd.s = _mm256_set_epi8(
mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7],
mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15],
mem[16], mem[17], mem[18], mem[19], mem[20], mem[21], mem[22], mem[23],
mem[24], mem[25], mem[26], mem[27], mem[28], mem[29], mem[30], mem[31]
);
return simd;
}
inline void unload_int8_32(int8_32 a, int8 *array) { _mm256_store_si256((__m256i *) array, a.s); }
inline int8_32 init_zero_int8_32()
{
int8_32 simd;
simd.s = _mm256_setzero_si256();
return simd;
}
inline int8_32 init_value_int8_32(int8 value)
{
int8_32 simd;
simd.s = _mm256_set1_epi8(value);
return simd;
}
inline int8_32 operator+(int8_32 a, int8_32 b)
{
int8_32 simd;
simd.s = _mm256_add_epi8(a.s, b.s);
return simd;
}
inline int8_32 operator-(int8_32 a, int8_32 b)
{
int8_32 simd;
simd.s = _mm256_sub_epi8(a.s, b.s);
return simd;
}
inline int8_32 operator-(int8_32 a) { return init_zero_int8_32() - a; }
inline int8_32 operator*(int8_32 a, int8_32 b)
{
    // No 8-bit SIMD multiply exists: widen to 16-bit lanes, multiply, then repack the low bytes
    __m256i even = _mm256_mullo_epi16(a.s, b.s);
    __m256i odd = _mm256_mullo_epi16(_mm256_srli_epi16(a.s, 8), _mm256_srli_epi16(b.s, 8));
    int8_32 simd;
    simd.s = _mm256_or_si256(_mm256_slli_epi16(odd, 8), _mm256_and_si256(even, _mm256_set1_epi16(0x00FF)));
    return simd;
}
inline int8_32 operator^(int8_32 a, int8_32 b)
{
int8_32 simd;
simd.s = _mm256_xor_si256(a.s, b.s);
return simd;
}
inline int8_32 &operator-=(int8_32 &a, int8_32 b)
{
a = a - b;
return a;
}
inline int8_32 &operator+=(int8_32 &a, int8_32 b)
{
a = a + b;
return a;
}
inline int8_32 &operator*=(int8_32 &a, int8_32 b)
{
a = a * b;
return a;
}
inline int8_32 &operator^=(int8_32 &a, int8_32 b)
{
a = a ^ b;
return a;
}
inline int8_32 operator<(int8_32 a, int8_32 b)
{
    int8_32 simd;
    simd.s = _mm256_cmpgt_epi8(b.s, a.s); // a < b is b > a
    return simd;
}
inline int8_32 operator<=(int8_32 a, int8_32 b)
{
int8_32 simd;
simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi8(a.s, b.s), _mm256_set1_epi8(-1));
return simd;
}
inline int8_32 operator>(int8_32 a, int8_32 b)
{
int8_32 simd;
simd.s = _mm256_cmpgt_epi8(a.s, b.s);
return simd;
}
inline int8_32 operator>=(int8_32 a, int8_32 b)
{
int8_32 simd;
simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi8(b.s, a.s), _mm256_set1_epi8(-1));
return simd;
}
inline int8_32 operator==(int8_32 a, int8_32 b)
{
int8_32 simd;
simd.s = _mm256_cmpeq_epi8(a.s, b.s);
return simd;
}
inline int8_32 operator!=(int8_32 a, int8_32 b)
{
    int8_32 simd;
    // _mm256_mask_blend_epi8 is AVX512VL+BW; on AVX2 invert the equality mask instead
    simd.s = _mm256_xor_si256(_mm256_cmpeq_epi8(a.s, b.s), _mm256_set1_epi8(-1));
    return simd;
}
inline int8_32 operator&(int8_32 a, int8_32 b)
{
int8_32 simd;
simd.s = _mm256_and_si256(a.s, b.s);
return simd;
}
inline int8_32 operator|(int8_32 a, int8_32 b)
{
int8_32 simd;
simd.s = _mm256_or_si256(a.s, b.s);
return simd;
}
inline int8_32 &operator&=(int8_32 &a, int8_32 b)
{
a = a & b;
return a;
}
inline int8_32 &operator|=(int8_32 &a, int8_32 b)
{
a = a | b;
return a;
}
inline int8_32 abs(int8_32 a)
{
    int8_32 simd;
    simd.s = _mm256_abs_epi8(a.s);
    return simd;
}
inline int8_32 simd_min(int8_32 a, int8_32 b)
{
int8_32 simd;
simd.s = _mm256_min_epi8(a.s, b.s);
return simd;
}
inline int8_32 simd_max(int8_32 a, int8_32 b)
{
int8_32 simd;
simd.s = _mm256_max_epi8(a.s, b.s);
return simd;
}
inline int8_32 clamp(int8_32 min_value, int8_32 a, int8_32 max_value)
{
return simd_min(simd_max(a, min_value), max_value);
}
inline int32 which_true(int8_32 a)
{
int32 which_true = _mm256_movemask_epi8(a.s);
return which_true;
}
inline bool any_true(int8_32 a)
{
    // movemask can set bit 31 and make the int negative, so test against zero, not "> 0"
    bool is_any_true = _mm256_movemask_epi8(a.s) != 0;
    return is_any_true;
}
inline bool all_true(int8_32 a)
{
    // All 32 byte positions of the 256-bit movemask must be set
    bool is_true = (uint32) _mm256_movemask_epi8(a.s) == 0xFFFFFFFF;
    return is_true;
}
inline bool all_false(int8_32 a)
{
bool is_false = _mm256_movemask_epi8(a.s) == 0;
return is_false;
}
#endif

View File

@ -0,0 +1,270 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_TOS_STDLIB_SIMD_I8_AVX512_H
#define COMS_TOS_STDLIB_SIMD_I8_AVX512_H
#include <immintrin.h>
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"
struct int8_64 {
union {
#if ARM
svint8_t s;
#else
__m512i s;
#endif
int8 v[64];
};
};
inline int8_64 load_int8_64(const int8* mem)
{
int8_64 simd;
simd.s = _mm512_load_si512((__m512i *) mem);
return simd;
}
inline int8_64 init_int8_64(const int8* mem)
{
int8_64 simd;
simd.s = _mm512_set_epi8(
mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7],
mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15],
mem[16], mem[17], mem[18], mem[19], mem[20], mem[21], mem[22], mem[23],
mem[24], mem[25], mem[26], mem[27], mem[28], mem[29], mem[30], mem[31],
mem[32], mem[33], mem[34], mem[35], mem[36], mem[37], mem[38], mem[39],
mem[40], mem[41], mem[42], mem[43], mem[44], mem[45], mem[46], mem[47],
mem[48], mem[49], mem[50], mem[51], mem[52], mem[53], mem[54], mem[55],
mem[56], mem[57], mem[58], mem[59], mem[60], mem[61], mem[62], mem[63]
);
return simd;
}
inline void unload_int8_64(int8_64 a, int8 *array) { _mm512_storeu_epi8(array, a.s); }
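// Note: load_int8_64 uses an aligned load while unload_int8_64 uses an unaligned
// store (_mm512_storeu_epi8); the asymmetry only makes sense if the output
// buffer's alignment is not guaranteed.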
inline int8_64 init_zero_int8_64()
{
int8_64 simd;
simd.s = _mm512_setzero_si512();
return simd;
}
inline int8_64 init_value_int8_64(int8 value)
{
int8_64 simd;
simd.s = _mm512_set1_epi8(value);
return simd;
}
inline int8_64 operator+(int8_64 a, int8_64 b)
{
int8_64 simd;
simd.s = _mm512_add_epi8(a.s, b.s);
return simd;
}
inline int8_64 operator-(int8_64 a, int8_64 b)
{
int8_64 simd;
simd.s = _mm512_sub_epi8(a.s, b.s);
return simd;
}
inline int8_64 operator-(int8_64 a) { return init_zero_int8_64() - a; }
inline int8_64 operator*(int8_64 a, int8_64 b)
{
    // No 8-bit SIMD multiply exists: widen to 16-bit lanes, multiply, then repack the low bytes (AVX512BW)
    __m512i even = _mm512_mullo_epi16(a.s, b.s);
    __m512i odd = _mm512_mullo_epi16(_mm512_srli_epi16(a.s, 8), _mm512_srli_epi16(b.s, 8));
    int8_64 simd;
    simd.s = _mm512_or_si512(_mm512_slli_epi16(odd, 8), _mm512_and_si512(even, _mm512_set1_epi16(0x00FF)));
    return simd;
}
inline int8_64 operator^(int8_64 a, int8_64 b)
{
int8_64 simd;
simd.s = _mm512_xor_si512(a.s, b.s);
return simd;
}
inline int8_64 &operator-=(int8_64 &a, int8_64 b)
{
a = a - b;
return a;
}
inline int8_64 &operator+=(int8_64 &a, int8_64 b)
{
a = a + b;
return a;
}
inline int8_64 &operator*=(int8_64 &a, int8_64 b)
{
a = a * b;
return a;
}
inline int8_64 &operator^=(int8_64 &a, int8_64 b)
{
a = a ^ b;
return a;
}
inline int8_64 operator<(int8_64 a, int8_64 b)
{
int8_64 simd;
simd.s = _mm512_mask_blend_epi8(_mm512_cmplt_epi8_mask(a.s, b.s), a.s, b.s);
return simd;
}
inline int8_64 operator<=(int8_64 a, int8_64 b)
{
int8_64 simd;
simd.s = _mm512_mask_blend_epi8(_mm512_cmple_epi8_mask(a.s, b.s), b.s, a.s);
return simd;
}
inline int8_64 operator>(int8_64 a, int8_64 b)
{
int8_64 simd;
simd.s = _mm512_mask_blend_epi8(_mm512_cmpgt_epi8_mask(a.s, b.s), a.s, b.s);
return simd;
}
inline int8_64 operator>=(int8_64 a, int8_64 b)
{
int8_64 simd;
simd.s = _mm512_mask_blend_epi8(_mm512_cmpge_epi8_mask(a.s, b.s), a.s, b.s);
return simd;
}
inline int8_64 operator==(int8_64 a, int8_64 b)
{
int8_64 simd;
simd.s = _mm512_mask_blend_epi8(_mm512_cmpeq_epi8_mask(a.s, b.s), a.s, b.s);
return simd;
}
inline int8_64 operator!=(int8_64 a, int8_64 b)
{
int8_64 simd;
simd.s = _mm512_mask_blend_epi8(_mm512_cmp_epi8_mask(a.s, b.s, _MM_CMPINT_NE), a.s, b.s);
return simd;
}
inline int8_64 operator&(int8_64 a, int8_64 b)
{
int8_64 simd;
simd.s = _mm512_and_si512(a.s, b.s);
return simd;
}
inline int8_64 operator|(int8_64 a, int8_64 b)
{
int8_64 simd;
simd.s = _mm512_or_si512(a.s, b.s);
return simd;
}
inline int8_64 &operator&=(int8_64 &a, int8_64 b)
{
a = a & b;
return a;
}
inline int8_64 &operator|=(int8_64 &a, int8_64 b)
{
a = a | b;
return a;
}
inline int8_64 abs(int8_64 a)
{
    int8_64 simd;
    simd.s = _mm512_abs_epi8(a.s);
    return simd;
}
inline int8_64 simd_min(int8_64 a, int8_64 b)
{
int8_64 simd;
simd.s = _mm512_min_epi8(a.s, b.s);
return simd;
}
inline int8_64 simd_max(int8_64 a, int8_64 b)
{
int8_64 simd;
simd.s = _mm512_max_epi8(a.s, b.s);
return simd;
}
inline int8_64 clamp(int8_64 min_value, int8_64 a, int8_64 max_value)
{
return simd_min(simd_max(a, min_value), max_value);
}
inline int64 which_true(int8_64 a)
{
int64 which_true = _mm512_movepi8_mask(a.s);
return which_true;
}
inline bool any_true(int8_64 a)
{
bool is_any_true = _mm512_movepi8_mask(a.s) > 0;
return is_any_true;
}
inline bool all_true(int8_64 a)
{
    // 64 lanes means a full 64-bit mask, not 16 bits
    bool is_true = _mm512_movepi8_mask(a.s) == 0xFFFFFFFFFFFFFFFF;
    return is_true;
}
inline bool all_false(int8_64 a)
{
// @todo This can be optimized (requires also changes in the comparison functions return)
bool is_false = _mm512_movepi8_mask(a.s) == 0;
return is_false;
}
#endif

View File

@ -0,0 +1,265 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_TOS_STDLIB_SIMD_I8_SSE_H
#define COMS_TOS_STDLIB_SIMD_I8_SSE_H
#include <immintrin.h>
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"
struct int8_16 {
union {
#if ARM
svint8_t s;
#else
__m128i s;
#endif
int8 v[16];
};
};
inline int8_16 load_int8_16(const int8* mem)
{
int8_16 simd;
simd.s = _mm_load_si128((__m128i *) mem);
return simd;
}
inline int8_16 init_int8_16(const int8* mem)
{
int8_16 simd;
simd.s = _mm_set_epi8(
mem[0], mem[1], mem[2], mem[3],
mem[4], mem[5], mem[6], mem[7],
mem[8], mem[9], mem[10], mem[11],
mem[12], mem[13], mem[14], mem[15]
);
return simd;
}
inline void unload_int8_16(int8_16 a, int8 *array) { _mm_store_si128((__m128i *) array, a.s); }
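// Both load_int8_16 and unload_int8_16 use aligned SSE loads/stores (movdqa), so
// the pointers passed in must be 16-byte aligned.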
inline int8_16 init_zero_int8_16()
{
int8_16 simd;
simd.s = _mm_setzero_si128();
return simd;
}
inline int8_16 init_value_int8_16(int8 value)
{
int8_16 simd;
simd.s = _mm_set1_epi8(value);
return simd;
}
inline int8_16 operator+(int8_16 a, int8_16 b)
{
int8_16 simd;
simd.s = _mm_add_epi8(a.s, b.s);
return simd;
}
inline int8_16 operator-(int8_16 a, int8_16 b)
{
int8_16 simd;
simd.s = _mm_sub_epi8(a.s, b.s);
return simd;
}
inline int8_16 operator-(int8_16 a) { return init_zero_int8_16() - a; }
inline int8_16 operator*(int8_16 a, int8_16 b)
{
    // No 8-bit SIMD multiply exists: widen to 16-bit lanes, multiply, then repack the low bytes
    __m128i even = _mm_mullo_epi16(a.s, b.s);
    __m128i odd = _mm_mullo_epi16(_mm_srli_epi16(a.s, 8), _mm_srli_epi16(b.s, 8));
    int8_16 simd;
    simd.s = _mm_or_si128(_mm_slli_epi16(odd, 8), _mm_and_si128(even, _mm_set1_epi16(0x00FF)));
    return simd;
}
inline int8_16 operator^(int8_16 a, int8_16 b)
{
int8_16 simd;
simd.s = _mm_xor_si128(a.s, b.s);
return simd;
}
inline int8_16 &operator-=(int8_16 &a, int8_16 b)
{
a = a - b;
return a;
}
inline int8_16 &operator+=(int8_16 &a, int8_16 b)
{
a = a + b;
return a;
}
inline int8_16 &operator*=(int8_16 &a, int8_16 b)
{
a = a * b;
return a;
}
inline int8_16 &operator^=(int8_16 &a, int8_16 b)
{
a = a ^ b;
return a;
}
inline int8_16 operator<(int8_16 a, int8_16 b)
{
int8_16 simd;
simd.s = _mm_cmplt_epi8(a.s, b.s);
return simd;
}
inline int8_16 operator<=(int8_16 a, int8_16 b)
{
int8_16 simd;
simd.s = _mm_andnot_si128(_mm_cmplt_epi8(b.s, a.s), _mm_set1_epi8(-1));
return simd;
}
inline int8_16 operator>(int8_16 a, int8_16 b)
{
int8_16 simd;
simd.s = _mm_cmpgt_epi8(a.s, b.s);
return simd;
}
inline int8_16 operator>=(int8_16 a, int8_16 b)
{
int8_16 simd;
simd.s = _mm_andnot_si128(_mm_cmplt_epi8(a.s, b.s), _mm_set1_epi8(-1));
return simd;
}
inline int8_16 operator==(int8_16 a, int8_16 b)
{
int8_16 simd;
simd.s = _mm_cmpeq_epi8(a.s, b.s);
return simd;
}
inline int8_16 operator!=(int8_16 a, int8_16 b)
{
int8_16 simd;
simd.s = _mm_andnot_si128(_mm_cmpeq_epi8(a.s, b.s), _mm_set1_epi8(-1));
return simd;
}
inline int8_16 operator&(int8_16 a, int8_16 b)
{
int8_16 simd;
simd.s = _mm_and_si128(a.s, b.s);
return simd;
}
inline int8_16 operator|(int8_16 a, int8_16 b)
{
int8_16 simd;
simd.s = _mm_or_si128(a.s, b.s);
return simd;
}
inline int8_16 &operator&=(int8_16 &a, int8_16 b)
{
a = a & b;
return a;
}
inline int8_16 &operator|=(int8_16 &a, int8_16 b)
{
a = a | b;
return a;
}
inline int8_16 abs(int8_16 a)
{
int8_16 simd;
simd.s = _mm_abs_epi8(a.s);
return simd;
}
inline int8_16 simd_min(int8_16 a, int8_16 b)
{
int8_16 simd;
simd.s = _mm_min_epi8(a.s, b.s);
return simd;
}
inline int8_16 simd_max(int8_16 a, int8_16 b)
{
int8_16 simd;
simd.s = _mm_max_epi8(a.s, b.s);
return simd;
}
inline int8_16 clamp(int8_16 min_value, int8_16 a, int8_16 max_value)
{
return simd_min(simd_max(a, min_value), max_value);
}
inline int32 which_true(int8_16 a)
{
int32 which_true = _mm_movemask_epi8(a.s);
return which_true;
}
inline bool any_true(int8_16 a)
{
bool is_any_true = _mm_movemask_epi8(a.s) > 0;
return is_any_true;
}
inline bool all_true(int8_16 a)
{
    // All 16 byte positions of the 128-bit movemask must be set
    bool is_true = _mm_movemask_epi8(a.s) == 0xFFFF;
    return is_true;
}
inline bool all_false(int8_16 a)
{
bool is_false = _mm_movemask_epi8(a.s) == 0;
return is_false;
}
#endif

View File

@ -9,160 +9,16 @@
#ifndef COMS_STDLIB_SIMD_SVML_H
#define COMS_STDLIB_SIMD_SVML_H
#include <immintrin.h>
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"

#ifdef MACRO_CPU_FEATURE_SSE42
#include "SIMD_SVML_SSE.h"
#endif

#ifdef MACRO_CPU_FEATURE_AVX2
#include "SIMD_SVML_AVX2.h"
#endif

#ifdef MACRO_CPU_FEATURE_AVX512
#include "SIMD_SVML_AVX512.h"
#endif
#endif

View File

@ -0,0 +1,69 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_STDLIB_SIMD_SVML_AVX2_H
#define COMS_STDLIB_SIMD_SVML_AVX2_H
#include <immintrin.h>
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"
#if __linux__
#include <math.h>
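// GCC and Clang on Linux do not ship Intel's SVML library functions, so these
// scalar fallbacks emulate the missing intrinsics one lane at a time.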
inline __m256i _mm256_div_epi32(__m256i a, __m256i b) {
alignas(32) int32 a_array[8], b_array[8], result[8];
_mm256_store_si256((__m256i*) a_array, a);
_mm256_store_si256((__m256i*) b_array, b);
for (int32 i = 0; i < 8; ++i) {
result[i] = a_array[i] / b_array[i];
}
return _mm256_load_si256((__m256i*) result);
}
inline __m256 _mm256_sin_ps(__m256 a) {
alignas(32) f32 a_array[8], result[8];
_mm256_store_ps(a_array, a);
for (int32 i = 0; i < 8; ++i) {
result[i] = sinf(a_array[i]);
}
return _mm256_load_ps(result);
}
inline __m256 _mm256_cos_ps(__m256 a) {
alignas(32) f32 a_array[8], result[8];
_mm256_store_ps(a_array, a);
for (int32 i = 0; i < 8; ++i) {
result[i] = cosf(a_array[i]);
}
return _mm256_load_ps(result);
}
inline __m256 _mm256_asin_ps(__m256 a) {
alignas(32) f32 a_array[8], result[8];
_mm256_store_ps(a_array, a);
for (int32 i = 0; i < 8; ++i) {
result[i] = asinf(a_array[i]);
}
return _mm256_load_ps(result);
}
inline __m256 _mm256_acos_ps(__m256 a) {
    alignas(32) f32 a_array[8], result[8];
    _mm256_store_ps(a_array, a);
    for (int32 i = 0; i < 8; ++i) { // 8 lanes, not 16
        result[i] = acosf(a_array[i]);
    }
    return _mm256_load_ps(result);
}
#endif
#endif

View File

@ -0,0 +1,70 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_STDLIB_SIMD_SVML_AVX512_H
#define COMS_STDLIB_SIMD_SVML_AVX512_H
#include <immintrin.h>
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"
#if __linux__
#include <math.h>
inline __m512i _mm512_div_epi32(__m512i a, __m512i b) {
alignas(64) int32 a_array[16], b_array[16], result[16];
_mm512_store_si512((__m512i*) a_array, a);
_mm512_store_si512((__m512i*) b_array, b);
for (int32 i = 0; i < 16; ++i) {
result[i] = a_array[i] / b_array[i];
}
return _mm512_load_si512((__m512i*) result);
}
inline __m512 _mm512_sin_ps(__m512 a) {
    alignas(64) f32 a_array[16], result[16]; // 16 lanes; [8] would overflow
    _mm512_store_ps(a_array, a);
    for (int32 i = 0; i < 16; ++i) {
        result[i] = sinf(a_array[i]);
    }
    return _mm512_load_ps(result);
}
inline __m512 _mm512_cos_ps(__m512 a) {
    alignas(64) f32 a_array[16], result[16];
    _mm512_store_ps(a_array, a);
    for (int32 i = 0; i < 16; ++i) {
        result[i] = cosf(a_array[i]);
    }
    return _mm512_load_ps(result);
}
inline __m512 _mm512_asin_ps(__m512 a) {
    alignas(64) f32 a_array[16], result[16];
    _mm512_store_ps(a_array, a);
    for (int32 i = 0; i < 16; ++i) {
        result[i] = asinf(a_array[i]);
    }
    return _mm512_load_ps(result);
}
inline __m512 _mm512_acos_ps(__m512 a) {
alignas(64) f32 a_array[16], result[16];
_mm512_store_ps(a_array, a);
for (int32 i = 0; i < 16; ++i) {
result[i] = acosf(a_array[i]);
}
return _mm512_load_ps(result);
}
#endif
#endif

View File

@ -0,0 +1,70 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_STDLIB_SIMD_SVML_SSE_H
#define COMS_STDLIB_SIMD_SVML_SSE_H
#include <immintrin.h>
#include <xmmintrin.h>
#include "../../../stdlib/Types.h"
#if __linux__
#include <math.h>
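// As in the AVX2 fallback header, these emulate SVML routines that GCC/Clang do
// not provide. Illustrative usage (hypothetical values):
//     __m128 angles = _mm_set1_ps(0.5f);
//     __m128 sines  = _mm_sin_ps(angles); // every lane becomes sinf(0.5f)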
inline __m128i _mm_div_epi32(__m128i a, __m128i b) {
alignas(16) int32 a_array[4], b_array[4], result[4];
_mm_store_si128((__m128i*) a_array, a);
_mm_store_si128((__m128i*) b_array, b);
for (int32 i = 0; i < 4; ++i) {
result[i] = a_array[i] / b_array[i];
}
return _mm_load_si128((__m128i*) result);
}
inline __m128 _mm_sin_ps(__m128 a) {
alignas(16) f32 a_array[4], result[4];
_mm_store_ps(a_array, a);
for (int32 i = 0; i < 4; ++i) {
result[i] = sinf(a_array[i]);
}
return _mm_load_ps(result);
}
inline __m128 _mm_cos_ps(__m128 a) {
alignas(16) f32 a_array[4], result[4];
_mm_store_ps(a_array, a);
for (int32 i = 0; i < 4; ++i) {
result[i] = cosf(a_array[i]);
}
return _mm_load_ps(result);
}
inline __m128 _mm_asin_ps(__m128 a) {
alignas(16) f32 a_array[4], result[4];
_mm_store_ps(a_array, a);
for (int32 i = 0; i < 4; ++i) {
result[i] = asinf(a_array[i]);
}
return _mm_load_ps(result);
}
inline __m128 _mm_acos_ps(__m128 a) {
alignas(16) f32 a_array[4], result[4];
_mm_store_ps(a_array, a);
for (int32 i = 0; i < 4; ++i) {
result[i] = acosf(a_array[i]);
}
return _mm_load_ps(result);
}
#endif
#endif

63
architecture/x86/simd/utils/Utils.h Normal file → Executable file
View File

@ -17,6 +17,7 @@
// Only allowed for data >= 64 bits
bool is_empty(const byte* region, uint64 size, int32 steps = 8)
{
// Quick check of first 8 bytes
if (*((uint64 *) region) != 0) {
return false;
}
@ -25,40 +26,52 @@ bool is_empty(const byte* region, uint64 size, int32 steps = 8)
steps = intrin_validate_steps(region, steps);
switch (steps) {
        #ifdef MACRO_CPU_FEATURE_AVX512
        case 16: {
            while (region + 64 <= end) {
                __m512i chunk = _mm512_load_si512((const __m512i *) region);
                __mmask64 mask = _mm512_cmpeq_epi8_mask(chunk, _mm512_setzero_si512());
                if (mask != 0xFFFFFFFFFFFFFFFF) {
                    return false;
                }
                region += 64;
            }
        };
        [[fallthrough]];
        #else
        case 16: [[fallthrough]];
        #endif
        #ifdef MACRO_CPU_FEATURE_AVX2
        case 8: {
            while (region + 32 <= end) {
                __m256i chunk = _mm256_load_si256((const __m256i *) region);
                if (!_mm256_testz_si256(chunk, chunk)) {
                    return false;
                }
                region += 32;
            }
        };
        [[fallthrough]];
        #else
        case 8: [[fallthrough]];
        #endif
        #ifdef MACRO_CPU_FEATURE_SSE42
        case 4: {
            while (region + 16 <= end) {
                __m128i chunk = _mm_load_si128((const __m128i *) region);
                if (!_mm_testz_si128(chunk, chunk)) {
                    return false;
                }
                region += 16;
            }
        }
        [[fallthrough]];
        #else
        case 4: [[fallthrough]];
        #endif
case 1: {
while (region + 4 <= end) {
if (*((const uint32_t *) region) != 0) {

0
asset/Asset.h Normal file → Executable file
View File

0
asset/AssetArchive.h Normal file → Executable file
View File

0
asset/AssetManagementSystem.h Normal file → Executable file
View File

0
asset/AssetType.h Normal file → Executable file
View File

0
audio/Audio.cpp Normal file → Executable file
View File

0
audio/Audio.h Normal file → Executable file
View File

0
audio/AudioMixer.h Normal file → Executable file
View File

0
audio/AudioSetting.h Normal file → Executable file
View File

0
audio/Qoa.h Normal file → Executable file
View File

0
audio/QoaSimd.h Normal file → Executable file
View File

0
audio/Wav.h Normal file → Executable file
View File

0
auth/Auth.h Normal file → Executable file
View File

0
camera/Camera.h Normal file → Executable file
View File

0
camera/CameraMovement.h Normal file → Executable file
View File

0
color/ColorVisionDeficiency.h Normal file → Executable file
View File

0
command/AppCmdBuffer.cpp Normal file → Executable file
View File

0
command/AppCmdBuffer.h Normal file → Executable file
View File

0
command/Command.h Normal file → Executable file
View File

0
compiler/CompilerUtils.h Normal file → Executable file
View File

0
compiler/gcc/Atomic.h Normal file → Executable file
View File

0
compiler/gcc/CompilerUtils.h Normal file → Executable file
View File

0
compiler/msvc/CompilerUtils.h Normal file → Executable file
View File

0
compression/Huffman.h Normal file → Executable file
View File

0
compression/LZP.h Normal file → Executable file
View File

0
compression/RLE.h Normal file → Executable file
View File

0
database/Database.h Normal file → Executable file
View File

0
database/DatabaseConnection.h Normal file → Executable file
View File

0
database/DatabaseType.h Normal file → Executable file
View File

0
encryption/CeasarEncryption.h Normal file → Executable file
View File

0
encryption/XorEncryption.h Normal file → Executable file
View File

0
entity/AnimationEntityComponent.h Normal file → Executable file
View File

0
entity/CursorEntity.h Normal file → Executable file
View File

0
entity/Entity.h Normal file → Executable file
View File

0
entity/EntityComponentSystem.h Normal file → Executable file
View File

0
entity/EntitySize.h Normal file → Executable file
View File

0
environment/Globe.h Normal file → Executable file
View File

0
environment/Universe.h Normal file → Executable file
View File

0
error/HammingCodes.h Normal file → Executable file
View File

0
font/Font.h Normal file → Executable file
View File

0
font/font_characters.txt Normal file → Executable file
View File

0
gpuapi/AntiAliasing.h Normal file → Executable file
View File

0
gpuapi/GpuApiType.h Normal file → Executable file
View File

0
gpuapi/GpuAttributeType.h Normal file → Executable file
View File

0
gpuapi/RenderUtils.h Normal file → Executable file
View File

0
gpuapi/ShaderType.h Normal file → Executable file
View File

0
gpuapi/direct3d/AppCmdBuffer.h Normal file → Executable file
View File

0
gpuapi/direct3d/DirectXUtils.h Normal file → Executable file
View File

0
gpuapi/direct3d/FramesInFlightContainer.h Normal file → Executable file
View File

0
gpuapi/direct3d/Shader.h Normal file → Executable file
View File

0
gpuapi/direct3d/ShaderUtils.h Normal file → Executable file
View File

0
gpuapi/opengl/AppCmdBuffer.h Normal file → Executable file
View File

0
gpuapi/opengl/FramesInFlightContainer.h Normal file → Executable file
View File

0
gpuapi/opengl/Opengl.h Normal file → Executable file
View File

0
gpuapi/opengl/OpenglDefines.h Normal file → Executable file
View File

0
gpuapi/opengl/OpenglDescriptorSetLayoutBinding.h Normal file → Executable file
View File

Some files were not shown because too many files have changed in this diff