diff --git a/font/Font.h b/font/Font.h index e20802f..251fca0 100644 --- a/font/Font.h +++ b/font/Font.h @@ -100,6 +100,8 @@ void font_from_file_txt( while (*pos != '\n') { *texture_pos++ = *pos++; } + + *texture_pos++ = '\0'; } else if (strcmp(block_name, "font_size") == 0) { font->size = strtof(pos, &pos); } else if (strcmp(block_name, "line_height") == 0) { diff --git a/gpuapi/RenderUtils.h b/gpuapi/RenderUtils.h index 52398ad..6d4bf55 100644 --- a/gpuapi/RenderUtils.h +++ b/gpuapi/RenderUtils.h @@ -693,15 +693,15 @@ void entity_clip_space_mat_sse(f32* result_mat, const f32* model_mat, const f32* __m128 a[4]; __m128 b[4]; - a[0] = _mm_loadu_ps(projection_mat); - a[1] = _mm_loadu_ps(&projection_mat[4]); - a[2] = _mm_loadu_ps(&projection_mat[8]); - a[3] = _mm_loadu_ps(&projection_mat[12]); + a[0] = _mm_load_ps(projection_mat); + a[1] = _mm_load_ps(&projection_mat[4]); + a[2] = _mm_load_ps(&projection_mat[8]); + a[3] = _mm_load_ps(&projection_mat[12]); - b[0] = _mm_loadu_ps(view_mat); - b[1] = _mm_loadu_ps(&view_mat[4]); - b[2] = _mm_loadu_ps(&view_mat[8]); - b[3] = _mm_loadu_ps(&view_mat[12]); + b[0] = _mm_load_ps(view_mat); + b[1] = _mm_load_ps(&view_mat[4]); + b[2] = _mm_load_ps(&view_mat[8]); + b[3] = _mm_load_ps(&view_mat[12]); _MM_TRANSPOSE4_PS(b[0], b[1], b[2], b[3]); mat4mat4_mult_sse(a, b, temp); @@ -711,10 +711,10 @@ void entity_clip_space_mat_sse(f32* result_mat, const f32* model_mat, const f32* a[2] = temp[2]; a[3] = temp[3]; - b[0] = _mm_loadu_ps(model_mat); - b[1] = _mm_loadu_ps(&model_mat[4]); - b[2] = _mm_loadu_ps(&model_mat[8]); - b[3] = _mm_loadu_ps(&model_mat[12]); + b[0] = _mm_load_ps(model_mat); + b[1] = _mm_load_ps(&model_mat[4]); + b[2] = _mm_load_ps(&model_mat[8]); + b[3] = _mm_load_ps(&model_mat[12]); _MM_TRANSPOSE4_PS(b[0], b[1], b[2], b[3]); mat4mat4_mult_sse(a, b, temp); diff --git a/gpuapi/UIUtils.h b/gpuapi/UIUtils.h new file mode 100644 index 0000000..a2420e1 --- /dev/null +++ b/gpuapi/UIUtils.h @@ -0,0 +1,33 @@ +/** + * Jingga + * + * @copyright Jingga + * @license OMS License 2.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef TOS_GPUAPI_UI_UTILS_H +#define TOS_GPUAPI_UI_UTILS_H + +#include +#include + +void ui_input_create(Vertex3DTextureColorIndex* __restrict vertices, uint32* __restrict index, f32 zindex, + f32 x, f32 y, f32 width, f32 height, int32 align_h, int32 align_v, + uint32 color_index = 0, f32 tex_x1 = 0.0f, f32 tex_y1 = 0.0f, f32 tex_x2 = 0.0f, f32 tex_y2 = 0.0f +) +{ + vertex_rect_border_create( + vertices, index, zindex, + x, y, width, height, 1, UI_ALIGN_H_LEFT, UI_ALIGN_V_BOTTOM, + 12, 0.0f, 0.0f + ); + + vertex_rect_create( + vertices, index, zindex, + x + 1, y + 1, width - 2, height - 2, UI_ALIGN_H_LEFT, UI_ALIGN_V_BOTTOM, + 14, 0.0f, 0.0f + ); +} + +#endif \ No newline at end of file diff --git a/log/Debug.cpp b/log/Debug.cpp index 347d390..efabe34 100644 --- a/log/Debug.cpp +++ b/log/Debug.cpp @@ -1,6 +1,7 @@ #ifndef TOS_LOG_DEBUG_MEMORY_C #define TOS_LOG_DEBUG_MEMORY_C +#include "../stdlib/Types.h" #include "Debug.h" #include "DebugMemory.h" #include "Log.h" diff --git a/math/matrix/MatrixFloat32.h b/math/matrix/MatrixFloat32.h index 846f562..6d69a3a 100644 --- a/math/matrix/MatrixFloat32.h +++ b/math/matrix/MatrixFloat32.h @@ -401,11 +401,11 @@ void mat3vec3_mult(const f32* __restrict matrix, const f32* __restrict vector, f // @question could simple mul add sse be faster? void mat3vec3_mult_sse(const f32* __restrict matrix, const f32* __restrict vector, f32* __restrict result) { - __m128 vec = _mm_loadu_ps(vector); + __m128 vec = _mm_load_ps(vector); vec = _mm_insert_ps(vec, _mm_setzero_ps(), 0x30); // vec[3] = 0 for (int32 i = 0; i < 3; ++i) { - __m128 row = _mm_loadu_ps(&matrix[i * 3]); + __m128 row = _mm_load_ps(&matrix[i * 3]); row = _mm_insert_ps(row, _mm_setzero_ps(), 0x30); // row[3] = 0 __m128 dot = _mm_dp_ps(row, vec, 0xF1); @@ -444,10 +444,10 @@ void mat4vec4_mult(const f32* __restrict matrix, const f32* __restrict vector, f // @question could simple mul add sse be faster? void mat4vec4_mult_sse(const f32* __restrict matrix, const f32* __restrict vector, f32* __restrict result) { - __m128 vec = _mm_loadu_ps(vector); + __m128 vec = _mm_load_ps(vector); for (int32 i = 0; i < 4; ++i) { - __m128 row = _mm_loadu_ps(&matrix[i * 4]); + __m128 row = _mm_load_ps(&matrix[i * 4]); __m128 dot = _mm_dp_ps(row, vec, 0xF1); result[i] = _mm_cvtss_f32(dot); @@ -502,16 +502,16 @@ void mat4mat4_mult(const f32* __restrict a, const f32* __restrict b, f32* __rest // @todo check http://fhtr.blogspot.com/2010/02/4x4-f32-matrix-multiplication-using.html // @question could simple mul add sse be faster? // Load rows of matrix a - __m128 a_1 = _mm_loadu_ps(a); - __m128 a_2 = _mm_loadu_ps(&a[4]); - __m128 a_3 = _mm_loadu_ps(&a[8]); - __m128 a_4 = _mm_loadu_ps(&a[12]); + __m128 a_1 = _mm_load_ps(a); + __m128 a_2 = _mm_load_ps(&a[4]); + __m128 a_3 = _mm_load_ps(&a[8]); + __m128 a_4 = _mm_load_ps(&a[12]); // Load columns of matrix b - __m128 b_1 = _mm_loadu_ps(b); - __m128 b_2 = _mm_loadu_ps(&b[4]); - __m128 b_3 = _mm_loadu_ps(&b[8]); - __m128 b_4 = _mm_loadu_ps(&b[12]); + __m128 b_1 = _mm_load_ps(b); + __m128 b_2 = _mm_load_ps(&b[4]); + __m128 b_3 = _mm_load_ps(&b[8]); + __m128 b_4 = _mm_load_ps(&b[12]); _mm_storeu_ps(&result[0], _mm_add_ps( diff --git a/memory/ChunkMemory.h b/memory/ChunkMemory.h index a8f6291..0eb74a7 100644 --- a/memory/ChunkMemory.h +++ b/memory/ChunkMemory.h @@ -69,12 +69,14 @@ void chunk_init(ChunkMemory* buf, byte* data, uint64 count, uint64 chunk_size, i buf->memory = data; buf->count = count; - buf->size = chunk_size + sizeof(buf->free) * CEIL_DIV(count, 64); + buf->size = chunk_size * count + sizeof(buf->free) * CEIL_DIV(count, 64); buf->chunk_size = chunk_size; buf->last_pos = -1; buf->alignment = alignment; // @question Could it be beneficial to have this before the element data? + // On the other hand the way we do it right now we never have to move past the free array since it is at the end + // On another hand we could by accident overwrite the values in free if we are not careful buf->free = (uint64 *) (buf->memory + count * chunk_size); DEBUG_MEMORY_INIT((uint64) buf->memory, buf->size); @@ -124,7 +126,7 @@ int64 chunk_reserve(ChunkMemory* buf, uint64 elements = 1, bool zeroed = false) int32 bit_index; int64 free_element = -1; - byte mask; + int64 mask; int32 i = 0; int64 max_bytes = (buf->count + 7) / 64; @@ -157,7 +159,7 @@ int64 chunk_reserve(ChunkMemory* buf, uint64 elements = 1, bool zeroed = false) uint64 current_free_index = free_index + (bit_index + j) / 64; int32 current_bit_index = (bit_index + j) % 64; - mask = 1 << current_bit_index; + mask = 1LL << current_bit_index; if ((buf->free[current_free_index] & mask) == 0) { ++consecutive_free_bits; } else { @@ -201,23 +203,23 @@ int64 chunk_reserve(ChunkMemory* buf, uint64 elements = 1, bool zeroed = false) byte* chunk_find_free(ChunkMemory* buf) { - int64 byte_index = (buf->last_pos + 1) / 64; + int64 free_index = (buf->last_pos + 1) / 64; int32 bit_index; int64 free_element = -1; - byte mask; + int64 mask; int32 i = 0; int64 max_bytes = (buf->count + 7) / 64; while (free_element < 0 && i < buf->count) { - if (byte_index >= max_bytes) { - byte_index = 0; + if (free_index >= max_bytes) { + free_index = 0; } - if (buf->free[byte_index] == 0xFF) { + if (buf->free[free_index] == 0xFF) { ++i; - ++byte_index; + ++free_index; continue; } @@ -226,10 +228,10 @@ byte* chunk_find_free(ChunkMemory* buf) // @performance on the first iteration through the buffer we could optimize this by starting at a different bit_index // because we know that the bit_index is based on last_pos for (bit_index = 0; bit_index < 64; ++bit_index) { - mask = 1 << bit_index; - if ((buf->free[byte_index] & mask) == 0) { - free_element = byte_index * 64 + bit_index; - buf->free[byte_index] |= (1LL << bit_index); + mask = 1LL << bit_index; + if ((buf->free[free_index] & mask) == 0) { + free_element = free_index * 64 + bit_index; + buf->free[free_index] |= (1LL << bit_index); break; } @@ -248,10 +250,10 @@ void chunk_free_element(ChunkMemory* buf, uint64 element) { DEBUG_MEMORY_DELETE((uint64) (buf->memory + element * buf->chunk_size), buf->chunk_size); - int64 byte_index = element / 64; + int64 free_index = element / 64; int32 bit_index = element % 64; - buf->free[byte_index] &= ~(1 << bit_index); + buf->free[free_index] &= ~(1LL << bit_index); } inline diff --git a/memory/RingMemory.h b/memory/RingMemory.h index 57a3d7a..804d07f 100644 --- a/memory/RingMemory.h +++ b/memory/RingMemory.h @@ -62,7 +62,7 @@ void ring_alloc(RingMemory* ring, uint64 size, int32 alignment = 64) inline void ring_init(RingMemory* ring, BufferMemory* buf, uint64 size, int32 alignment = 64) { - ring->memory = buffer_get_memory(buf, size, alignment); + ring->memory = buffer_get_memory(buf, size, alignment, true); ring->size = size; ring->pos = 0; @@ -71,8 +71,6 @@ void ring_init(RingMemory* ring, BufferMemory* buf, uint64 size, int32 alignment ring->start = 0; ring->end = 0; - memset(ring->memory, 0, buf->size); - DEBUG_MEMORY_INIT((uint64) ring->memory, ring->size); } diff --git a/stdlib/HashMap.h b/stdlib/HashMap.h index 1f6a99c..eede30d 100644 --- a/stdlib/HashMap.h +++ b/stdlib/HashMap.h @@ -122,6 +122,11 @@ int64 hashmap_size(const HashMap* hm) } void hashmap_insert(HashMap* hm, const char* key, int32 value) { + // @performance Do we really want to do this check every time? + if (hm->buf.count == 0) { + return; + } + uint64 index = hash_djb2(key) % hm->buf.count; int64 element = chunk_reserve(&hm->buf, 1); @@ -147,6 +152,11 @@ void hashmap_insert(HashMap* hm, const char* key, int32 value) { } void hashmap_insert(HashMap* hm, const char* key, int64 value) { + // @performance Do we really want to do this check every time? + if (hm->buf.count == 0) { + return; + } + uint64 index = hash_djb2(key) % hm->buf.count; int64 element = chunk_reserve(&hm->buf, 1); @@ -172,6 +182,11 @@ void hashmap_insert(HashMap* hm, const char* key, int64 value) { } void hashmap_insert(HashMap* hm, const char* key, uintptr_t value) { + // @performance Do we really want to do this check every time? + if (hm->buf.count == 0) { + return; + } + uint64 index = hash_djb2(key) % hm->buf.count; int64 element = chunk_reserve(&hm->buf, 1); @@ -197,6 +212,11 @@ void hashmap_insert(HashMap* hm, const char* key, uintptr_t value) { } void hashmap_insert(HashMap* hm, const char* key, void* value) { + // @performance Do we really want to do this check every time? + if (hm->buf.count == 0) { + return; + } + uint64 index = hash_djb2(key) % hm->buf.count; int64 element = chunk_reserve(&hm->buf, 1); @@ -222,6 +242,11 @@ void hashmap_insert(HashMap* hm, const char* key, void* value) { } void hashmap_insert(HashMap* hm, const char* key, f32 value) { + // @performance Do we really want to do this check every time? + if (hm->buf.count == 0) { + return; + } + uint64 index = hash_djb2(key) % hm->buf.count; int64 element = chunk_reserve(&hm->buf, 1); @@ -247,6 +272,11 @@ void hashmap_insert(HashMap* hm, const char* key, f32 value) { } void hashmap_insert(HashMap* hm, const char* key, const char* value) { + // @performance Do we really want to do this check every time? + if (hm->buf.count == 0) { + return; + } + uint64 index = hash_djb2(key) % hm->buf.count; int64 element = chunk_reserve(&hm->buf, 1); @@ -274,6 +304,11 @@ void hashmap_insert(HashMap* hm, const char* key, const char* value) { } void hashmap_insert(HashMap* hm, const char* key, byte* value) { + // @performance Do we really want to do this check every time? + if (hm->buf.count == 0) { + return; + } + uint64 index = hash_djb2(key) % hm->buf.count; int64 element = chunk_reserve(&hm->buf, 1); @@ -302,6 +337,11 @@ void hashmap_insert(HashMap* hm, const char* key, byte* value) { } HashEntry* hashmap_get_entry(HashMap* hm, const char* key) { + // @performance Do we really want to do this check every time? + if (hm->buf.count == 0) { + return NULL; + } + uint64 index = hash_djb2(key) % hm->buf.count; HashEntry* entry = (HashEntry *) hm->table[index]; @@ -356,90 +396,131 @@ void hashmap_delete_entry(HashMap* hm, const char* key) { } } -// @bug We cannot know if the data needs endian swap (it coult be int/float, but also some other 4/8 byte value) -// -> if we save this to a file and load it on a different system we will have "corrupt" data inline int64 hashmap_dump(const HashMap* hm, byte* data) { *((uint64 *) data) = SWAP_ENDIAN_LITTLE(hm->buf.count); data += sizeof(uint64); - uint64 next_count_total = 0; - // Dump the table content where the elements are relative indeces/pointers for (int32 i = 0; i < hm->buf.count; ++i) { - *((uint64 *) data) = SWAP_ENDIAN_LITTLE((uintptr_t) hm->table[i] - (uintptr_t) hm->buf.memory); - data += sizeof(uint64); + *((uint64 *) data) = hm->table[i] + ? SWAP_ENDIAN_LITTLE((uintptr_t) hm->table[i] - (uintptr_t) hm->buf.memory) + : 0ULL; + } + data += sizeof(uint64) * hm->buf.count; - // Also dump the next pointer - // Count how many next elements we have - HashEntry* entry = ((HashEntry *) hm->table[i])->next; - int32 next_count = 0; - while (entry) { - ++next_count; - entry = entry->next; + int64 value_size = hm->buf.chunk_size - sizeof(uint64) - sizeof(char) * MAX_KEY_LENGTH - sizeof(uint64); + + // Dumb hash map content = buffer memory + int32 free_index = 0; + int32 bit_index = 0; + for (int32 i = 0; i < hm->buf.count; ++i) { + if ((hm->buf.free[free_index] & (1ULL << bit_index)) > 0) { + HashEntry* entry = (HashEntry *) chunk_get_element((ChunkMemory *) &hm->buf, i); + + // element_id + *((uint64 *) data) = SWAP_ENDIAN_LITTLE(entry->element_id); + data += sizeof(entry->element_id); + + // key + memcpy(data, entry->key, sizeof(entry->key)); + data += sizeof(entry->key); + + // next pointer + if (entry->next) { + *((uint64 *) data) = SWAP_ENDIAN_LITTLE((uintptr_t) entry->next - (uintptr_t) hm->buf.memory); + } else { + memset(data, 0, sizeof(uint64)); + } + data += sizeof(uint64); + + // We just assume that 4 or 8 bytes = int -> endian handling + if (value_size == 4) { + *((int32 *) data) = SWAP_ENDIAN_LITTLE(((HashEntryInt32 *) entry)->value); + } else if (value_size == 8) { + *((int64 *) data) = SWAP_ENDIAN_LITTLE(((HashEntryInt64 *) entry)->value); + } else { + memcpy(data, entry->value, value_size); + } + data += value_size; + } else { + // No entry defined -> NULL + memset(data, 0, hm->buf.chunk_size); + data += hm->buf.chunk_size; } - next_count_total += next_count; - - *((int32 *) data) = SWAP_ENDIAN_LITTLE(next_count); - data += sizeof(next_count); - - if (next_count > 0) { - entry = ((HashEntry *) hm->table[i])->next; - while (entry) { - *((uint64 *) data) = SWAP_ENDIAN_LITTLE((uintptr_t) entry - (uintptr_t) hm->buf.memory); - data += sizeof(uint64); - - entry = entry->next; - } + ++bit_index; + if (bit_index > 63) { + bit_index = 0; + ++free_index; } } - // @performance chunk_dump() below contains some data we already output above - // (next pointer but it is useless, since we need relative positions) - // Maybe we should manually re-create the chunk_dump here and omit the already dumped data for the next pointer? + // dump free array + memcpy(data, hm->buf.free, sizeof(uint64) * CEIL_DIV(hm->buf.count, 64)); - // How many bytes were written (+ dump the chunk memory) - return sizeof(hm->buf.count) + return sizeof(hm->buf.count) // hash map count = buffer count + hm->buf.count * sizeof(uint64) // table content - + hm->buf.count * sizeof(int32) // counter for the next pointer (one for every element) - + next_count_total * sizeof(uint64) // next pointer offset - + chunk_dump(&hm->buf, data); + + hm->buf.size; // hash map content + free array } +// WARNING: Requires hashmap_create first inline int64 hashmap_load(HashMap* hm, const byte* data) { uint64 count = SWAP_ENDIAN_LITTLE(*((uint64 *) data)); data += sizeof(uint64); - uint64 next_count_total = 0; - - // Load the table content, we also need to convert from relative indeces to pointers + // Load the table content for (int i = 0; i < count; ++i) { - hm->table[i] = hm->buf.memory + SWAP_ENDIAN_LITTLE(*((uint64 *) data)); - data += sizeof(uint64); + uint64 offset = SWAP_ENDIAN_LITTLE(*((uint64 *) data)); + data += sizeof(offset); - // Also load the next pointer - // Count how many next elements we have - int32 next_count = SWAP_ENDIAN_LITTLE(*((int32 *) data)); - data += sizeof(next_count); + // the first element has no offset! + hm->table[i] = offset || i == 0 ? hm->buf.memory + offset : NULL; + } - HashEntry* entry = ((HashEntry *) hm->table[i]); - for (int32 j = 0; j < next_count; ++j) { - entry->next = (HashEntry *) (hm->buf.memory + SWAP_ENDIAN_LITTLE(*((uint64 *) data))); - data += sizeof(uint64); - entry = entry->next; + // This loop here is why it is important to already have an initialized hashmap + // @question Do we maybe want to change this and not require an initalized hashmap? + memcpy(hm->buf.memory, data, hm->buf.size); + data += hm->buf.chunk_size * hm->buf.count; + + // @question don't we have to possibly endian swap check the free array as well? + memcpy(hm->buf.free, data, sizeof(uint64) * CEIL_DIV(hm->buf.count, 64)); + + int64 value_size = hm->buf.chunk_size - sizeof(uint64) - sizeof(char) * MAX_KEY_LENGTH - sizeof(uint64); + + // Switch endian AND turn offsets to pointers + int32 free_index = 0; + int32 bit_index = 0; + for (int32 i = 0; i < hm->buf.count; ++i) { + if ((hm->buf.free[free_index] & (1ULL << bit_index)) > 0) { + HashEntry* entry = (HashEntry *) chunk_get_element((ChunkMemory *) &hm->buf, i); + + // element id + entry->element_id = SWAP_ENDIAN_LITTLE(entry->element_id); + + // key is already loaded with the memcpy + // @question Do we even want to use memcpy? We are re-checking all the values here anyways + + // next pointer + if (entry->next) { + entry->next = (HashEntry *) (hm->buf.memory + SWAP_ENDIAN_LITTLE((uint64) entry->next)); + } + + if (value_size == 4) { + ((HashEntryInt32 *) entry)->value = SWAP_ENDIAN_LITTLE(((HashEntryInt32 *) entry)->value); + } else if (value_size == 8) { + ((HashEntryInt64 *) entry)->value = SWAP_ENDIAN_LITTLE(((HashEntryInt64 *) entry)->value); + } } } // How many bytes was read from data - return sizeof(count) + return sizeof(hm->buf.count) // hash map count = buffer count + hm->buf.count * sizeof(uint64) // table content - + hm->buf.count * sizeof(int32) // counter for the next pointer (one for every element) - + next_count_total * sizeof(uint64) // next pointer offset - + chunk_load(&hm->buf, data); + + hm->buf.size; } #endif \ No newline at end of file diff --git a/stdlib/simd/SIMD_F32.h b/stdlib/simd/SIMD_F32.h index bb4b3d0..24a5124 100644 --- a/stdlib/simd/SIMD_F32.h +++ b/stdlib/simd/SIMD_F32.h @@ -39,7 +39,7 @@ struct f32_16 { inline f32_4 load_f32_4(const f32* mem) { f32_4 simd; - simd.s = _mm_loadu_ps(mem); + simd.s = _mm_load_ps(mem); return simd; } @@ -57,7 +57,7 @@ inline void unload_f32_4(f32_4 a, f32 *array) { _mm_store_ps(array, a.s); } inline f32_8 load_f32_8(const f32* mem) { f32_8 simd; - simd.s = _mm256_loadu_ps(mem); + simd.s = _mm256_load_ps(mem); return simd; } @@ -75,7 +75,7 @@ inline void unload_f32_8(f32_8 a, f32 *array) { _mm256_store_ps(array, a.s); } inline f32_16 load_f32_16(const f32* mem) { f32_16 simd; - simd.s = _mm512_loadu_ps(mem); + simd.s = _mm512_load_ps(mem); return simd; } @@ -996,8 +996,8 @@ void simd_mult(const f32* a, const f32* b, f32* result, int32 size, int32 steps) __m512 result_16; for (; i <= size - steps; i += steps) { - a_16 = _mm512_loadu_ps(a); - b_16 = _mm512_loadu_ps(b); + a_16 = _mm512_load_ps(a); + b_16 = _mm512_load_ps(b); result_16 = _mm512_mul_ps(a_16, b_16); _mm512_store_ps(result, result_16); @@ -1011,8 +1011,8 @@ void simd_mult(const f32* a, const f32* b, f32* result, int32 size, int32 steps) __m256 result_8; for (; i <= size - steps; i += steps) { - a_8 = _mm256_loadu_ps(a); - b_8 = _mm256_loadu_ps(b); + a_8 = _mm256_load_ps(a); + b_8 = _mm256_load_ps(b); result_8 = _mm256_mul_ps(a_8, b_8); _mm256_store_ps(result, result_8); @@ -1026,8 +1026,8 @@ void simd_mult(const f32* a, const f32* b, f32* result, int32 size, int32 steps) __m128 result_4; for (; i <= size - steps; i += steps) { - a_4 = _mm_loadu_ps(a); - b_4 = _mm_loadu_ps(b); + a_4 = _mm_load_ps(a); + b_4 = _mm_load_ps(b); result_4 = _mm_mul_ps(a_4, b_4); _mm_store_ps(result, result_4); @@ -1057,7 +1057,7 @@ void simd_mult(const f32* a, f32 b, f32* result, int32 size, int32 steps) __m512 result_16; for (; i <= size - steps; i += steps) { - a_16 = _mm512_loadu_ps(a); + a_16 = _mm512_load_ps(a); result_16 = _mm512_mul_ps(a_16, b_16); _mm512_store_ps(result, result_16); @@ -1070,7 +1070,7 @@ void simd_mult(const f32* a, f32 b, f32* result, int32 size, int32 steps) __m256 result_8; for (; i <= size - steps; i += steps) { - a_8 = _mm256_loadu_ps(a); + a_8 = _mm256_load_ps(a); result_8 = _mm256_mul_ps(a_8, b_8); _mm256_store_ps(result, result_8); @@ -1083,7 +1083,7 @@ void simd_mult(const f32* a, f32 b, f32* result, int32 size, int32 steps) __m128 result_4; for (; i <= size - steps; i += steps) { - a_4 = _mm_loadu_ps(a); + a_4 = _mm_load_ps(a); result_4 = _mm_mul_ps(a_4, b_4); _mm_store_ps(result, result_4); @@ -1111,7 +1111,7 @@ void simd_div(const f32* a, f32 b, f32* result, int32 size, int32 steps) __m512 result_16; for (; i <= size - steps; i += steps) { - a_16 = _mm512_loadu_ps(a); + a_16 = _mm512_load_ps(a); result_16 = _mm512_div_ps(a_16, b_16); _mm512_store_ps(result, result_16); @@ -1124,7 +1124,7 @@ void simd_div(const f32* a, f32 b, f32* result, int32 size, int32 steps) __m256 result_8; for (; i <= size - steps; i += steps) { - a_8 = _mm256_loadu_ps(a); + a_8 = _mm256_load_ps(a); result_8 = _mm256_div_ps(a_8, b_8); _mm256_store_ps(result, result_8); @@ -1137,7 +1137,7 @@ void simd_div(const f32* a, f32 b, f32* result, int32 size, int32 steps) __m128 result_4; for (; i <= size - steps; i += steps) { - a_4 = _mm_loadu_ps(a); + a_4 = _mm_load_ps(a); result_4 = _mm_div_ps(a_4, b_4); _mm_store_ps(result, result_4); @@ -1166,7 +1166,7 @@ void simd_div(const f32* a, f32 b, __m256* result, int32 size) __m256 result_8; for (; i <= size - 8; i += 8) { - a_8 = _mm256_loadu_ps(a); + a_8 = _mm256_load_ps(a); result_8 = _mm256_div_ps(a_8, b_8); result[j] = result_8; @@ -1181,7 +1181,7 @@ void simd_div(const f32* a, f32 b, __m256* result, int32 size) temp[k] = a[i + k] / b; } - result[j] = _mm256_loadu_ps(temp); + result[j] = _mm256_load_ps(temp); } inline diff --git a/stdlib/simd/SIMD_I16.h b/stdlib/simd/SIMD_I16.h index 9ffb372..7b3a8a4 100644 --- a/stdlib/simd/SIMD_I16.h +++ b/stdlib/simd/SIMD_I16.h @@ -39,7 +39,7 @@ struct int16_32 { inline int16_8 load_int16_8(const int16* mem) { int16_8 simd; - simd.s = _mm_loadu_epi16(mem); + simd.s = _mm_load_si128((__m128i *) mem); return simd; } @@ -60,7 +60,7 @@ inline void unload_int16_8(int16_8 a, int16 *array) { _mm_store_si128((__m128i * inline int16_16 load_int16_16(const int16* mem) { int16_16 simd; - simd.s = _mm256_loadu_epi16(mem); + simd.s = _mm256_load_si256((__m256i *) mem); return simd; } @@ -81,7 +81,7 @@ inline void unload_int16_16(int16_16 a, int16 *array) { _mm256_store_si256((__m2 inline int16_32 load_int16_32(const int16* mem) { int16_32 simd; - simd.s = _mm512_loadu_epi16(mem); + simd.s = _mm512_load_si512((__m512i *) mem); return simd; } diff --git a/stdlib/simd/SIMD_I32.h b/stdlib/simd/SIMD_I32.h index 88edfd9..38c55f1 100644 --- a/stdlib/simd/SIMD_I32.h +++ b/stdlib/simd/SIMD_I32.h @@ -11,6 +11,7 @@ #include #include +#include #include "../Types.h" #include "../../utils/BitUtils.h" @@ -45,7 +46,7 @@ struct int32_16 { inline int32_4 load_int32_4(const int32* mem) { int32_4 simd; - simd.s = _mm_loadu_epi32(mem); + simd.s = _mm_load_si128((__m128i *) mem); return simd; } @@ -63,7 +64,7 @@ inline void unload_int32_4(int32_4 a, int32 *array) { _mm_store_si128((__m128i * inline int32_8 load_int32_8(const int32* mem) { int32_8 simd; - simd.s = _mm256_loadu_epi32(mem); + simd.s = _mm256_load_si256((__m256i *) mem); return simd; } @@ -81,7 +82,7 @@ inline void unload_int32_8(int32_8 a, int32 *array) { _mm256_store_si256((__m256 inline int32_16 load_int32_16(const int32* mem) { int32_16 simd; - simd.s = _mm512_loadu_epi32(mem); + simd.s = _mm512_load_epi32(mem); return simd; } @@ -1039,8 +1040,8 @@ void simd_mult(const int32* a, const int32* b, int32* result, int32 size, int32 __m512i result_16; for (; i <= size - steps; i += steps) { - a_16 = _mm512_loadu_epi32(a); - b_16 = _mm512_loadu_epi32(b); + a_16 = _mm512_load_epi32(a); + b_16 = _mm512_load_epi32(b); result_16 = _mm512_mul_epi32(a_16, b_16); _mm512_store_epi32(result, result_16); @@ -1054,8 +1055,8 @@ void simd_mult(const int32* a, const int32* b, int32* result, int32 size, int32 __m256i result_8; for (; i <= size - steps; i += steps) { - a_8 = _mm256_loadu_epi32(a); - b_8 = _mm256_loadu_epi32(b); + a_8 = _mm256_load_si256((__m256i *) a); + b_8 = _mm256_load_si256((__m256i *) b); result_8 = _mm256_mul_epi32(a_8, b_8); _mm256_store_si256((__m256i *) result, result_8); @@ -1069,8 +1070,8 @@ void simd_mult(const int32* a, const int32* b, int32* result, int32 size, int32 __m128i result_4; for (; i <= size - steps; i += steps) { - a_4 = _mm_loadu_epi32(a); - b_4 = _mm_loadu_epi32(b); + a_4 = _mm_load_si128((__m128i *) a); + b_4 = _mm_load_si128((__m128i *) b); result_4 = _mm_mul_epi32(a_4, b_4); _mm_store_si128((__m128i *) result, result_4); @@ -1101,9 +1102,9 @@ void simd_mult(const int32* a, const f32* b, f32* result, int32 size, int32 step __m512 result_16; for (; i <= size - steps; i += steps) { - a_16 = _mm512_loadu_epi32(a); + a_16 = _mm512_load_epi32(a); af_16 = _mm512_cvtepi32_ps(a_16); - b_16 = _mm512_loadu_ps(b); + b_16 = _mm512_load_ps(b); result_16 = _mm512_mul_ps(af_16, b_16); _mm512_store_ps(result, result_16); @@ -1118,9 +1119,9 @@ void simd_mult(const int32* a, const f32* b, f32* result, int32 size, int32 step __m256 result_8; for (; i <= size - steps; i += steps) { - a_8 = _mm256_loadu_epi32(a); + a_8 = _mm256_load_si256((__m256i *) a); af_8 = _mm256_cvtepi32_ps(a_8); - b_8 = _mm256_loadu_ps(b); + b_8 = _mm256_load_ps(b); result_8 = _mm256_mul_ps(af_8, b_8); _mm256_store_ps(result, result_8); @@ -1135,9 +1136,9 @@ void simd_mult(const int32* a, const f32* b, f32* result, int32 size, int32 step __m128 result_4; for (; i <= size - steps; i += steps) { - a_4 = _mm_loadu_epi32(a); + a_4 = _mm_load_si128((__m128i *) a); af_4 = _mm_cvtepi32_ps(a_4); - b_4 = _mm_loadu_ps(b); + b_4 = _mm_load_ps(b); result_4 = _mm_mul_ps(af_4, b_4); _mm_store_ps(result, result_4); @@ -1169,9 +1170,9 @@ void simd_mult(const int32* a, const f32* b, int32* result, int32 size, int32 st __m512i resulti_16; for (; i <= size - steps; i += steps) { - a_16 = _mm512_loadu_epi32(a); + a_16 = _mm512_load_epi32(a); af_16 = _mm512_cvtepi32_ps(a_16); - b_16 = _mm512_loadu_ps(b); + b_16 = _mm512_load_ps(b); result_16 = _mm512_mul_ps(af_16, b_16); resulti_16 = _mm512_cvtps_epi32(result_16); _mm512_store_epi32(result, resulti_16); @@ -1188,9 +1189,9 @@ void simd_mult(const int32* a, const f32* b, int32* result, int32 size, int32 st __m256i resulti_8; for (; i <= size - steps; i += steps) { - a_8 = _mm256_loadu_epi32(a); + a_8 = _mm256_load_si256((__m256i *) a); af_8 = _mm256_cvtepi32_ps(a_8); - b_8 = _mm256_loadu_ps(b); + b_8 = _mm256_load_ps(b); result_8 = _mm256_mul_ps(af_8, b_8); resulti_8 = _mm256_cvtps_epi32(result_8); _mm256_store_si256((__m256i *) result, resulti_8); @@ -1207,9 +1208,9 @@ void simd_mult(const int32* a, const f32* b, int32* result, int32 size, int32 st __m128i resulti_4; for (; i <= size - steps; i += steps) { - a_4 = _mm_loadu_epi32(a); + a_4 = _mm_load_si128((__m128i *) a); af_4 = _mm_cvtepi32_ps(a_4); - b_4 = _mm_loadu_ps(b); + b_4 = _mm_load_ps(b); result_4 = _mm_mul_ps(af_4, b_4); resulti_4 = _mm_cvtps_epi32(result_4); _mm_store_si128((__m128i *) result, resulti_4); @@ -1242,7 +1243,7 @@ void simd_mult(const int32* a, f32 b, int32* result, int32 size, int32 steps) __m512i resulti_16; for (; i <= size - steps; i += steps) { - a_16 = _mm512_loadu_epi32(a); + a_16 = _mm512_load_epi32(a); af_16 = _mm512_cvtepi32_ps(a_16); result_16 = _mm512_mul_ps(af_16, b_16); resulti_16 = _mm512_cvtps_epi32(result_16); @@ -1259,7 +1260,7 @@ void simd_mult(const int32* a, f32 b, int32* result, int32 size, int32 steps) __m256i resulti_8; for (; i <= size - steps; i += steps) { - a_8 = _mm256_loadu_epi32(a); + a_8 = _mm256_load_si256((__m256i *) a); af_8 = _mm256_cvtepi32_ps(a_8); result_8 = _mm256_mul_ps(af_8, b_8); resulti_8 = _mm256_cvtps_epi32(result_8); @@ -1276,7 +1277,7 @@ void simd_mult(const int32* a, f32 b, int32* result, int32 size, int32 steps) __m128i resulti_4; for (; i <= size - steps; i += steps) { - a_4 = _mm_loadu_epi32(a); + a_4 = _mm_load_si128((__m128i *) a); af_4 = _mm_cvtepi32_ps(a_4); result_4 = _mm_mul_ps(af_4, b_4); resulti_4 = _mm_cvtps_epi32(result_4); @@ -1307,7 +1308,7 @@ void simd_div(const int32* a, f32 b, f32* result, int32 size, int32 steps) __m512 result_16; for (; i <= size - steps; i += steps) { - a_16 = _mm512_loadu_epi32(a); + a_16 = _mm512_load_epi32(a); af_16 = _mm512_cvtepi32_ps(a_16); result_16 = _mm512_div_ps(af_16, b_16); _mm512_store_ps(result, result_16); @@ -1323,7 +1324,7 @@ void simd_div(const int32* a, f32 b, f32* result, int32 size, int32 steps) __m256 result_8; for (; i <= size - steps; i += steps) { - a_8 = _mm256_loadu_epi32(a); + a_8 = _mm256_load_si256((__m256i *) a); af_8 = _mm256_cvtepi32_ps(a_8); result_8 = _mm256_div_ps(af_8, b_8); _mm256_store_ps(result, result_8); @@ -1338,7 +1339,7 @@ void simd_div(const int32* a, f32 b, f32* result, int32 size, int32 steps) __m128 result_4; for (; i <= size - steps; i += steps) { - a_4 = _mm_loadu_epi32(a); + a_4 = _mm_load_si128((__m128i *) a); af_4 = _mm_cvtepi32_ps(a_4); result_4 = _mm_div_ps(af_4, b_4); _mm_store_ps(result, result_4); @@ -1367,8 +1368,8 @@ void simd_add(const int32* a, const int32* b, int32* result, int32 size, int32 s __m512i result_16; for (; i <= size - steps; i += steps) { - a_16 = _mm512_loadu_epi32(a); - b_16 = _mm512_loadu_epi32(b); + a_16 = _mm512_load_epi32(a); + b_16 = _mm512_load_epi32(b); result_16 = _mm512_add_epi32(a_16, b_16); _mm512_store_epi32(result, result_16); @@ -1382,8 +1383,8 @@ void simd_add(const int32* a, const int32* b, int32* result, int32 size, int32 s __m256i result_8; for (; i <= size - steps; i += steps) { - a_8 = _mm256_loadu_epi32(a); - b_8 = _mm256_loadu_epi32(b); + a_8 = _mm256_load_si256((__m256i *) a); + b_8 = _mm256_load_si256((__m256i *) b); result_8 = _mm256_add_epi32(a_8, b_8); _mm256_store_si256((__m256i *) result, result_8); @@ -1397,8 +1398,8 @@ void simd_add(const int32* a, const int32* b, int32* result, int32 size, int32 s __m128i result_4; for (; i <= size - steps; i += steps) { - a_4 = _mm_loadu_epi32(a); - b_4 = _mm_loadu_epi32(b); + a_4 = _mm_load_si128((__m128i *) a); + b_4 = _mm_load_si128((__m128i *) b); result_4 = _mm_add_epi32(a_4, b_4); _mm_store_si128((__m128i *) result, result_4); @@ -1429,9 +1430,9 @@ void simd_add(const int32* a, const f32* b, f32* result, int32 size, int32 steps __m512 result_16; for (; i <= size - steps; i += steps) { - a_16 = _mm512_loadu_epi32(a); + a_16 = _mm512_load_epi32(a); af_16 = _mm512_cvtepi32_ps(a_16); - b_16 = _mm512_loadu_ps(b); + b_16 = _mm512_load_ps(b); result_16 = _mm512_add_ps(af_16, b_16); _mm512_store_ps(result, result_16); @@ -1446,9 +1447,9 @@ void simd_add(const int32* a, const f32* b, f32* result, int32 size, int32 steps __m256 result_8; for (; i <= size - steps; i += steps) { - a_8 = _mm256_loadu_epi32(a); + a_8 = _mm256_load_si256((__m256i *) a); af_8 = _mm256_cvtepi32_ps(a_8); - b_8 = _mm256_loadu_ps(b); + b_8 = _mm256_load_ps(b); result_8 = _mm256_add_ps(af_8, b_8); _mm256_store_ps(result, result_8); @@ -1463,9 +1464,9 @@ void simd_add(const int32* a, const f32* b, f32* result, int32 size, int32 steps __m128 result_4; for (; i <= size - steps; i += steps) { - a_4 = _mm_loadu_epi32(a); + a_4 = _mm_load_si128((__m128i *) a); af_4 = _mm_cvtepi32_ps(a_4); - b_4 = _mm_loadu_ps(b); + b_4 = _mm_load_ps(b); result_4 = _mm_add_ps(af_4, b_4); _mm_store_ps(result, result_4); @@ -1497,9 +1498,9 @@ void simd_add(const int32* a, const f32* b, int32* result, int32 size, int32 ste __m512i resulti_16; for (; i <= size - steps; i += steps) { - a_16 = _mm512_loadu_epi32(a); + a_16 = _mm512_load_epi32(a); af_16 = _mm512_cvtepi32_ps(a_16); - b_16 = _mm512_loadu_ps(b); + b_16 = _mm512_load_ps(b); result_16 = _mm512_add_ps(af_16, b_16); resulti_16 = _mm512_cvtps_epi32(result_16); _mm512_store_epi32(result, resulti_16); @@ -1516,9 +1517,9 @@ void simd_add(const int32* a, const f32* b, int32* result, int32 size, int32 ste __m256i resulti_8; for (; i <= size - steps; i += steps) { - a_8 = _mm256_loadu_epi32(a); + a_8 = _mm256_load_si256((__m256i *) a); af_8 = _mm256_cvtepi32_ps(a_8); - b_8 = _mm256_loadu_ps(b); + b_8 = _mm256_load_ps(b); result_8 = _mm256_add_ps(af_8, b_8); resulti_8 = _mm256_cvtps_epi32(result_8); _mm256_store_si256((__m256i *) result, resulti_8); @@ -1535,9 +1536,9 @@ void simd_add(const int32* a, const f32* b, int32* result, int32 size, int32 ste __m128i resulti_4; for (; i <= size - steps; i += steps) { - a_4 = _mm_loadu_epi32(a); + a_4 = _mm_load_si128((__m128i *) a); af_4 = _mm_cvtepi32_ps(a_4); - b_4 = _mm_loadu_ps(b); + b_4 = _mm_load_ps(b); result_4 = _mm_add_ps(af_4, b_4); resulti_4 = _mm_cvtps_epi32(result_4); _mm_store_si128((__m128i *) result, resulti_4); @@ -1560,8 +1561,8 @@ void simd_add(const int32* a, const f32* b, int32* result, int32 size, int32 ste // WARNING: only works with SSE4.2 // WARNING: incl. \0 both strings must be <= 16 length bool str_compare_avx512(const char* str1, const char* str2) { - __m128i s1 = _mm_loadu_si128((const __m128i *) str1); - __m128i s2 = _mm_loadu_si128((const __m128i *) str2); + __m128i s1 = _mm_load_si128((__m128i *) (const __m128i *) str1); + __m128i s2 = _mm_load_si128((__m128i *) (const __m128i *) str2); return _mm_cmpistrc(s1, s2, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH) == 0; } @@ -1580,7 +1581,7 @@ endian_swap(const int* val, int* result, int32 size, int32 steps) ); for (i = 0; i <= size - steps; i += steps) { - __m512i vec = _mm512_loadu_si512((const __m512i *) (val + i)); + __m512i vec = _mm512_load_si512((const __m512i *) (val + i)); vec = _mm512_shuffle_epi8(vec, mask_512); _mm512_storeu_si512((__m512i *) (result + i), vec); @@ -1594,7 +1595,7 @@ endian_swap(const int* val, int* result, int32 size, int32 steps) ); for (i = 0; i <= size - steps; i += steps) { - __m256i vec = _mm256_loadu_si256((const __m256i *) (val + i)); + __m256i vec = _mm256_load_si256((const __m256i *) (val + i)); vec = _mm256_shuffle_epi8(vec, mask_256); _mm256_storeu_si256((__m256i *) (result + i), vec); @@ -1608,7 +1609,7 @@ endian_swap(const int* val, int* result, int32 size, int32 steps) ); for (i = 0; i <= size - steps; i += steps) { - __m128i vec = _mm_loadu_si128((const __m128i *) (val + i)); + __m128i vec = _mm_load_si128((__m128i *) (const __m128i *) (val + i)); vec = _mm_shuffle_epi8(vec, mask_128); _mm_storeu_si128((__m128i *) (result + i), vec); diff --git a/stdlib/simd/SIMD_I8.h b/stdlib/simd/SIMD_I8.h index 9c8ddd4..7809a40 100644 --- a/stdlib/simd/SIMD_I8.h +++ b/stdlib/simd/SIMD_I8.h @@ -40,7 +40,7 @@ struct int8_64 { inline int8_16 load_int8_16(const int8* mem) { int8_16 simd; - simd.s = _mm_loadu_epi8(mem); + simd.s = _mm_load_si128((__m128i *) mem); return simd; } @@ -63,7 +63,7 @@ inline void unload_int8_16(int8_16 a, int8 *array) { _mm_store_si128((__m128i *) inline int8_32 load_int8_32(const int8* mem) { int8_32 simd; - simd.s = _mm256_loadu_epi8(mem); + simd.s = _mm256_load_si256((__m256i *) mem); return simd; } @@ -86,7 +86,7 @@ inline void unload_int8_32(int8_32 a, int8 *array) { _mm256_store_si256((__m256i inline int8_64 load_int8_64(const int8* mem) { int8_64 simd; - simd.s = _mm512_loadu_epi8(mem); + simd.s = _mm512_load_si512((__m512i *) mem); return simd; } @@ -830,19 +830,19 @@ inline f32 simd_mult(const int8* a, f32 b, int32 size, int32 steps) { if (steps == 16) { - __m512i a_16 = _mm512_loadu_epi8(a); + __m512i a_16 = _mm512_load_si512((__m512i *) a); __m512 af_16 = _mm512_cvtepi32_ps(a_16); __m512 b_16 = _mm512_set1_ps(b); __m512 result = _mm512_mul_ps(af_16, b_16); } else if (steps == 8) { - __m256i a_8 = _mm256_loadu_epi8(a); + __m256i a_8 = _mm256_load_si256((__m256i *) a); __m256 af_8 = _mm256_cvtepi32_ps(a_8); __m256 b_8 = _mm256_set1_ps(b); __m256 result = _mm256_mul_ps(af_8, b_8); } else if (steps == 4) { - __m128i a_4 = _mm_loadu_epi8(a); + __m128i a_4 = _mm_load_si128((__m128i *) a); __m128 af_4 = _mm_cvtepi32_ps(a_4); __m128 b_4 = _mm_set1_ps(b); @@ -855,11 +855,11 @@ f32 simd_mult(const int8* a, f32 b, int32 size, int32 steps) bool simd_compare_64(const byte* a, const byte* b) { - __m256i chunk1_a = _mm256_loadu_si256((__m256i*) a); - __m256i chunk1_b = _mm256_loadu_si256((__m256i*) b); + __m256i chunk1_a = _mm256_load_si256((__m256i*) a); + __m256i chunk1_b = _mm256_load_si256((__m256i*) b); - __m256i chunk2_a = _mm256_loadu_si256((__m256i*) (a + 32)); - __m256i chunk2_b = _mm256_loadu_si256((__m256i*) (b + 32)); + __m256i chunk2_a = _mm256_load_si256((__m256i*) (a + 32)); + __m256i chunk2_b = _mm256_load_si256((__m256i*) (b + 32)); __m256i result1 = _mm256_cmpeq_epi8(chunk1_a, chunk1_b); __m256i result2 = _mm256_cmpeq_epi8(chunk2_a, chunk2_b); @@ -879,8 +879,8 @@ int simd_compare(const byte* a, const byte* b, uint32 size, uint32 steps = 8) { __mmask64 result_mask; for (; i <= size - 64; i += 64) { // 64 bytes per iteration - a_16 = _mm512_loadu_si512((__m512i*) a); - b_16 = _mm512_loadu_si512((__m512i*) b); + a_16 = _mm512_load_si512((__m512i*) a); + b_16 = _mm512_load_si512((__m512i*) b); result_mask = _mm512_cmpeq_epi8_mask(a_16, b_16); @@ -905,8 +905,8 @@ int simd_compare(const byte* a, const byte* b, uint32 size, uint32 steps = 8) { __m256i result_8; for (; i <= size - steps; i += steps) { - a_8 = _mm256_loadu_si256((__m256i*) a); - b_8 = _mm256_loadu_si256((__m256i*) b); + a_8 = _mm256_load_si256((__m256i*) a); + b_8 = _mm256_load_si256((__m256i*) b); result_8 = _mm256_cmpeq_epi8(a_8, b_8); @@ -929,8 +929,8 @@ int simd_compare(const byte* a, const byte* b, uint32 size, uint32 steps = 8) { __m128i result_4; for (; i <= size - steps; i += steps) { - a_4 = _mm_loadu_si128((__m128i*) a); - b_4 = _mm_loadu_si128((__m128i*) b); + a_4 = _mm_load_si128((__m128i*) a); + b_4 = _mm_load_si128((__m128i*) b); result_4 = _mm_cmpeq_epi8(a_4, b_4); diff --git a/stdlib/simd/SIMD_SVML.h b/stdlib/simd/SIMD_SVML.h index f61fae6..e863957 100644 --- a/stdlib/simd/SIMD_SVML.h +++ b/stdlib/simd/SIMD_SVML.h @@ -25,7 +25,7 @@ result[i] = a_array[i] / b_array[i]; } - return _mm_loadu_si128((__m128i*)result); + return _mm_load_si128((__m128i*)result); } inline __m256i _mm256_div_epi32(__m256i a, __m256i b) { @@ -38,7 +38,7 @@ result[i] = a_array[i] / b_array[i]; } - return _mm256_loadu_si256((__m256i*)result); + return _mm256_load_si256((__m256i*)result); } inline __m512i _mm512_div_epi32(__m512i a, __m512i b) { @@ -51,7 +51,7 @@ result[i] = a_array[i] / b_array[i]; } - return _mm512_loadu_si512((__m512i*)result); + return _mm512_load_si512((__m512i*)result); } inline __m128 _mm_sin_ps(__m128 a) { @@ -60,7 +60,7 @@ for (int i = 0; i < 4; ++i) { result[i] = sinf(a_array[i]); } - return _mm_loadu_ps(result); + return _mm_load_ps(result); } inline __m128 _mm_cos_ps(__m128 a) { @@ -69,7 +69,7 @@ for (int i = 0; i < 4; ++i) { result[i] = cosf(a_array[i]); } - return _mm_loadu_ps(result); + return _mm_load_ps(result); } inline __m128 _mm_asin_ps(__m128 a) { @@ -78,7 +78,7 @@ for (int i = 0; i < 4; ++i) { result[i] = asinf(a_array[i]); } - return _mm_loadu_ps(result); + return _mm_load_ps(result); } inline __m128 _mm_acos_ps(__m128 a) { @@ -87,7 +87,7 @@ for (int i = 0; i < 4; ++i) { result[i] = acosf(a_array[i]); } - return _mm_loadu_ps(result); + return _mm_load_ps(result); } inline __m256 _mm256_sin_ps(__m256 a) { @@ -96,7 +96,7 @@ for (int i = 0; i < 8; ++i) { result[i] = sinf(a_array[i]); } - return _mm256_loadu_ps(result); + return _mm256_load_ps(result); } inline __m256 _mm256_cos_ps(__m256 a) { @@ -105,7 +105,7 @@ for (int i = 0; i < 8; ++i) { result[i] = cosf(a_array[i]); } - return _mm256_loadu_ps(result); + return _mm256_load_ps(result); } inline __m256 _mm256_asin_ps(__m256 a) { @@ -114,7 +114,7 @@ for (int i = 0; i < 8; ++i) { result[i] = asinf(a_array[i]); } - return _mm256_loadu_ps(result); + return _mm256_load_ps(result); } inline __m256 _mm256_acos_ps(__m256 a) { @@ -123,7 +123,7 @@ for (int i = 0; i < 16; ++i) { result[i] = acosf(a_array[i]); } - return _mm256_loadu_ps(result); + return _mm256_load_ps(result); } inline __m512 _mm512_sin_ps(__m512 a) { @@ -132,7 +132,7 @@ for (int i = 0; i < 16; ++i) { result[i] = sinf(a_array[i]); } - return _mm512_loadu_ps(result); + return _mm512_load_ps(result); } inline __m512 _mm512_cos_ps(__m512 a) { @@ -141,7 +141,7 @@ for (int i = 0; i < 16; ++i) { result[i] = cosf(a_array[i]); } - return _mm512_loadu_ps(result); + return _mm512_load_ps(result); } inline __m512 _mm512_asin_ps(__m512 a) { @@ -150,7 +150,7 @@ for (int i = 0; i < 16; ++i) { result[i] = asinf(a_array[i]); } - return _mm512_loadu_ps(result); + return _mm512_load_ps(result); } inline __m512 _mm512_acos_ps(__m512 a) { @@ -159,7 +159,7 @@ for (int i = 0; i < 16; ++i) { result[i] = acosf(a_array[i]); } - return _mm512_loadu_ps(result); + return _mm512_load_ps(result); } #endif diff --git a/ui/UIAttribute.h b/ui/UIAttribute.h index f99257d..e241e24 100644 --- a/ui/UIAttribute.h +++ b/ui/UIAttribute.h @@ -96,9 +96,23 @@ enum UIAttributeType { UIAttribute* ui_attribute_from_group(UIAttributeGroup* group, UIAttributeType type) { - for (int i = 0; i < UI_ATTRIBUTE_TYPE_SIZE && i <= type; ++i) { - if (group->attributes[i].attribute_id == type) { - return &group->attributes[i]; + if (!group->attributes) { + return NULL; + } + + int32 left = 0; + int32 right = type; + + // Binary search since attributes are sorted by attribute_id + while (left <= right) { + int32 mid = left + (right - left) / 2; + + if (group->attributes[mid].attribute_id == type) { + return &group->attributes[mid]; + } else if (group->attributes[mid].attribute_id < type) { + left = mid + 1; + } else { + right = mid - 1; } } @@ -199,98 +213,4 @@ constexpr const char* ui_attribute_type_to_string_const(UIAttributeType e) return NULL; } -const char* ui_attribute_type_to_string(UIAttributeType e) -{ - switch (e) { - case UI_ATTRIBUTE_TYPE_TYPE: - return "type"; - case UI_ATTRIBUTE_TYPE_STYLE: - return "style"; - case UI_ATTRIBUTE_TYPE_DIMENSION_X: - return "x"; - case UI_ATTRIBUTE_TYPE_DIMENSION_Y: - return "y"; - case UI_ATTRIBUTE_TYPE_DIMENSION_WIDTH: - return "width"; - case UI_ATTRIBUTE_TYPE_DIMENSION_HEIGHT: - return "height"; - case UI_ATTRIBUTE_TYPE_FONT_NAME: - return "font_name"; - case UI_ATTRIBUTE_TYPE_FONT_COLOR: - return "font_color"; - case UI_ATTRIBUTE_TYPE_FONT_SIZE: - return "font_size"; - case UI_ATTRIBUTE_TYPE_FONT_WEIGHT: - return "font_weight"; - case UI_ATTRIBUTE_TYPE_FONT_LINE_HEIGHT: - return "font_line_height"; - case UI_ATTRIBUTE_TYPE_ALIGN_H: - return "align_h"; - case UI_ATTRIBUTE_TYPE_ALIGN_V: - return "align_v"; - case UI_ATTRIBUTE_TYPE_ZINDEX: - return "zindex"; - case UI_ATTRIBUTE_TYPE_BACKGROUND_COLOR: - return "background_color"; - case UI_ATTRIBUTE_TYPE_BACKGROUND_IMG: - return "background_img"; - case UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_OPACITY: - return "background_img_opacity"; - case UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_V: - return "background_img_position_v"; - case UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_H: - return "background_img_position_h"; - case UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_STYLE: - return "background_img_style"; - case UI_ATTRIBUTE_TYPE_BORDER_COLOR: - return "border_color"; - case UI_ATTRIBUTE_TYPE_BORDER_WIDTH: - return "border_width"; - case UI_ATTRIBUTE_TYPE_BORDER_TOP_COLOR: - return "border_top_color"; - case UI_ATTRIBUTE_TYPE_BORDER_TOP_WIDTH: - return "border_top_width"; - case UI_ATTRIBUTE_TYPE_BORDER_RIGHT_COLOR: - return "border_right_color"; - case UI_ATTRIBUTE_TYPE_BORDER_RIGHT_WIDTH: - return "border_right_width"; - case UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_COLOR: - return "border_bottom_color"; - case UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_WIDTH: - return "border_bottom_width"; - case UI_ATTRIBUTE_TYPE_BORDER_LEFT_COLOR: - return "border_left_color"; - case UI_ATTRIBUTE_TYPE_BORDER_LEFT_WIDTH: - return "border_left_width"; - case UI_ATTRIBUTE_TYPE_PADDING: - return "padding"; - case UI_ATTRIBUTE_TYPE_PADDING_TOP: - return "padding_top"; - case UI_ATTRIBUTE_TYPE_PADDING_RIGHT: - return "padding_right"; - case UI_ATTRIBUTE_TYPE_PADDING_BOTTOM: - return "padding_bottom"; - case UI_ATTRIBUTE_TYPE_PADDING_LEFT: - return "padding_left"; - case UI_ATTRIBUTE_TYPE_SHADOW_INNER_COLOR: - return "shadow_inner_color"; - case UI_ATTRIBUTE_TYPE_SHADOW_INNER_ANGLE: - return "shadow_inner_angle"; - case UI_ATTRIBUTE_TYPE_SHADOW_INNER_DISTANCE: - return "shadow_inner_distance"; - case UI_ATTRIBUTE_TYPE_SHADOW_OUTER_COLOR: - return "shadow_outer_color"; - case UI_ATTRIBUTE_TYPE_SHADOW_OUTER_ANGLE: - return "shadow_outer_angle"; - case UI_ATTRIBUTE_TYPE_SHADOW_OUTER_DISTANCE: - return "shadow_outer_distance"; - case UI_ATTRIBUTE_TYPE_TRANSITION_ANIMATION: - return "transition_animation"; - case UI_ATTRIBUTE_TYPE_TRANSITION_DURATION: - return "transition_duration"; - } - - return NULL; -} - #endif \ No newline at end of file diff --git a/ui/UIElement.h b/ui/UIElement.h index 643c42f..4a1a7c5 100644 --- a/ui/UIElement.h +++ b/ui/UIElement.h @@ -5,6 +5,9 @@ #include "UIElementType.h" #include "../object/Vertex.h" +#include +#include + struct UIElementDimension { int16 x1; int16 y1; @@ -22,6 +25,7 @@ struct UIElement { const char* name; int32 id; UIElementType type; + bool is_dynamic; int16 window_id; int16 panel_id; diff --git a/ui/UIElementType.h b/ui/UIElementType.h index 93eac71..df81443 100644 --- a/ui/UIElementType.h +++ b/ui/UIElementType.h @@ -53,36 +53,4 @@ constexpr const char* ui_element_type_to_string_const(UIElementType e) return NULL; } -const char* ui_element_type_to_string(UIElementType e) -{ - switch (e) { - case UI_ELEMENT_TYPE_BUTTON: - return "button"; - case UI_ELEMENT_TYPE_SELECT: - return "select"; - case UI_ELEMENT_TYPE_DROPDOWN: - return "dropdown"; - case UI_ELEMENT_TYPE_TEXTFIELD: - return "textfield"; - case UI_ELEMENT_TYPE_TEXTAREA: - return "textarea"; - case UI_ELEMENT_TYPE_IMAGE: - return "image"; - case UI_ELEMENT_TYPE_TEXT: - return "text"; - case UI_ELEMENT_TYPE_LINK: - return "link"; - case UI_ELEMENT_TYPE_TABLE: - return "table"; - case UI_ELEMENT_TYPE_VIEW_WINDOW: - return "view_window"; - case UI_ELEMENT_TYPE_VIEW_PANEL: - return "view_panel"; - case UI_ELEMENT_TYPE_VIEW_TAB: - return "view_tab"; - } - - return NULL; -} - #endif \ No newline at end of file diff --git a/ui/UILayout.h b/ui/UILayout.h index 00657b6..878de95 100644 --- a/ui/UILayout.h +++ b/ui/UILayout.h @@ -7,30 +7,136 @@ // Modified for every scene struct UILayout { - int32 ui_deadzone_size = 5; - UIElementDimension ui_deadzone[5]; + // This array has the size of the game window and represents in color codes where interactible ui elements are + // Size is based on screen size (we don't need full screen size since we assume an interactible element is at least 4 pixels width and height) + // width = 25% of screen size + // height = 25% of screen size + uint16 width; + uint16 height; - int32 element_hoverable_size; - int32 element_hoverable_pos; - UIElementDimension* elements_hoverable; + // Contains all UI elements also dynamic ones (e.g. movable windows) + uint32* ui_chroma_codes; - int32 element_interactible_size; - int32 element_interactible_pos; - UIElementDimension* elements_interactible; + // Contains constant UI elements that usually don't change (e.g. HUD) + uint32* ui_chroma_codes_static; - // @question Since we use a hashmap below, do we even need the size? - // Isn't the size exactly the same as the hash_map buf size - int32 element_size; - int32 element_pos; - HashMap hash_map; // Used to directly find element by name - - // @question Do we even need this or should the hashmap values be the elements directly? - // In other places (e.g. theme) we simply define a byte* data variable which actually holds the info. - UIElement* elements; + // Used to directly find element by name + // The values are the UIElements + HashMap hash_map; int32 vertex_size; int32 vertex_pos; Vertex3DTextureColorIndex* vertices; }; +inline +uint32 layout_element_from_location(UILayout* layout, uint16 x, uint16 y) +{ + return layout->ui_chroma_codes[layout->width * y / 4 + x / 4]; +} + +// This function should only get called if the location of a UI Element changes +// @performance How to handle moving elements (= dragging a window). We don't want to update this while dragging! +void layout_chroma_codes_update(UILayout* layout) +{ + // Reset all + memcpy(layout->ui_chroma_codes, layout->ui_chroma_codes_static, layout->width * layout->height * sizeof(uint32)); + + // @question Are the dimension values below even absolute? They may be in relation to the parent?! + for (int32 i = 0; i < layout->hash_map.buf.count; ++i) { + if (!layout->hash_map.table[i]) { + continue; + } + + HashEntry* entry = (HashEntry *) layout->hash_map.table[i]; + UIElement* element = (UIElement *) entry->value; + + if (element->is_dynamic) { + continue; + } + + int32 y_start = element->dimension.y1 / 4; + int32 y_end = element->dimension.y2 / 4; + int32 x_start = element->dimension.x1 / 4; + int32 x_end = element->dimension.x2 / 4; + + for (int32 y = y_start; y < y_end; ++y) { + int32 y_offset = layout->width * y; + for (int32 x = x_start; x < x_end; ++x) { + layout->ui_chroma_codes[y_offset + x] = (uint32) element->id; + } + } + + // Now handle all next elements + while (entry->next) { + entry = entry->next; + + element = (UIElement *) entry->value; + + y_start = element->dimension.y1 / 4; + y_end = element->dimension.y2 / 4; + x_start = element->dimension.x1 / 4; + x_end = element->dimension.x2 / 4; + + for (int32 y = y_start; y < y_end; ++y) { + int32 y_offset = layout->width * y; + for (int32 x = x_start; x < x_end; ++x) { + layout->ui_chroma_codes[y_offset + x] = (uint32) element->id; + } + } + } + } +} + +void layout_chroma_codes_update_static(UILayout* layout) +{ + // Reset all + memset(layout->ui_chroma_codes_static, 0, layout->width * layout->height * sizeof(uint32)); + + // @question Are the dimension values below even absolute? They may be in relation to the parent?! + for (int32 i = 0; i < layout->hash_map.buf.count; ++i) { + if (!layout->hash_map.table[i]) { + continue; + } + + HashEntry* entry = (HashEntry *) layout->hash_map.table[i]; + UIElement* element = (UIElement *) entry->value; + + if (!element->is_dynamic) { + continue; + } + + int32 y_start = element->dimension.y1 / 4; + int32 y_end = element->dimension.y2 / 4; + int32 x_start = element->dimension.x1 / 4; + int32 x_end = element->dimension.x2 / 4; + + for (int32 y = y_start; y < y_end; ++y) { + int32 y_offset = layout->width * y; + for (int32 x = x_start; x < x_end; ++x) { + layout->ui_chroma_codes_static[y_offset + x] = (uint32) element->id; + } + } + + // Now handle all next elements + while (entry->next) { + entry = entry->next; + + element = (UIElement *) entry->value; + + y_start = element->dimension.y1 / 4; + y_end = element->dimension.y2 / 4; + x_start = element->dimension.x1 / 4; + x_end = element->dimension.x2 / 4; + + for (int32 y = y_start; y < y_end; ++y) { + int32 y_offset = layout->width * y; + for (int32 x = x_start; x < x_end; ++x) { + layout->ui_chroma_codes_static[y_offset + x] = (uint32) element->id; + } + } + } + } +} + #endif \ No newline at end of file diff --git a/ui/UITheme.h b/ui/UITheme.h index 6237de7..0627b00 100644 --- a/ui/UITheme.h +++ b/ui/UITheme.h @@ -66,6 +66,11 @@ inline UIAttributeGroup* theme_style_group(UIThemeStyle* theme, const char* group_name) { HashEntryInt64* entry = (HashEntryInt64 *) hashmap_get_entry(&theme->hash_map, group_name); + if (!entry) { + ASSERT_SIMPLE(false); + return NULL; + } + return (UIAttributeGroup *) (theme->data + entry->value); } @@ -73,6 +78,11 @@ inline UIAttributeGroup* theme_style_group(UIThemeStyle* theme, const char* group_name, int32 group_id) { HashEntryInt64* entry = (HashEntryInt64 *) hashmap_get_entry(&theme->hash_map, group_name, group_id); + if (!entry) { + ASSERT_SIMPLE(false); + return NULL; + } + return (UIAttributeGroup *) (theme->data + entry->value); } @@ -229,7 +239,7 @@ void theme_from_file_txt( *temp = '\0'; for (int32 j = 0; j < UI_ELEMENT_TYPE_SIZE; ++j) { - if (strcmp(str, ui_element_type_to_string((UIElementType) j)) == 0) { + if (strcmp(str, ui_element_type_to_string_const((UIElementType) j)) == 0) { attribute.value_int = j; break; @@ -246,13 +256,11 @@ void theme_from_file_txt( } *temp = '\0'; - ++pos; } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_FONT_COLOR), attribute_name) == 0) { ++pos; // Skip '#' attribute.attribute_id = UI_ATTRIBUTE_TYPE_FONT_COLOR; uint32 value = (uint32) strtoul(pos, &pos, 16); - pos += 4; attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f; attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f; @@ -260,28 +268,27 @@ void theme_from_file_txt( attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f; } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_FONT_SIZE), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_FONT_SIZE; - attribute.value_float = strtof(pos, &pos); ++pos; + attribute.value_float = strtof(pos, &pos); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_FONT_WEIGHT), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_FONT_WEIGHT; - attribute.value_int = strtoul(pos, &pos, 10); ++pos; + attribute.value_int = strtoul(pos, &pos, 10); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_FONT_LINE_HEIGHT), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_FONT_LINE_HEIGHT; - attribute.value_float = strtof(pos, &pos); ++pos; + attribute.value_float = strtof(pos, &pos); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_ALIGN_H), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_ALIGN_H; - attribute.value_int = strtoul(pos, &pos, 10); ++pos; + attribute.value_int = strtoul(pos, &pos, 10); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_ALIGN_V), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_ALIGN_V; - attribute.value_int = strtoul(pos, &pos, 10); ++pos; + attribute.value_int = strtoul(pos, &pos, 10); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_ZINDEX), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_ZINDEX; - attribute.value_float = SWAP_ENDIAN_LITTLE(strtof(pos, &pos)); ++pos; + attribute.value_float = SWAP_ENDIAN_LITTLE(strtof(pos, &pos)); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_COLOR), attribute_name) == 0) { ++pos; // Skip '#' attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_COLOR; uint32 value = (uint32) strtoul(pos, &pos, 16); - pos += 4; attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f; attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f; @@ -298,22 +305,21 @@ void theme_from_file_txt( attribute.value_str[i] = '\0'; } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_OPACITY), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_OPACITY; - attribute.value_float = SWAP_ENDIAN_LITTLE(strtof(pos, &pos)); ++pos; + attribute.value_float = SWAP_ENDIAN_LITTLE(strtof(pos, &pos)); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_V), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_V; - attribute.value_int = strtoul(pos, &pos, 10); ++pos; + attribute.value_int = strtoul(pos, &pos, 10); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_H), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_H; - attribute.value_int = strtoul(pos, &pos, 10); ++pos; + attribute.value_int = strtoul(pos, &pos, 10); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_STYLE), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_STYLE; - attribute.value_int = strtoul(pos, &pos, 10); ++pos; + attribute.value_int = strtoul(pos, &pos, 10); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_COLOR), attribute_name) == 0) { ++pos; // Skip '#' attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_COLOR; uint32 value = (uint32) strtoul(pos, &pos, 16); - pos += 4; attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f; attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f; @@ -321,13 +327,12 @@ void theme_from_file_txt( attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f; } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_WIDTH), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_WIDTH; - attribute.value_int = strtoul(pos, &pos, 10); ++pos; + attribute.value_int = strtoul(pos, &pos, 10); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_TOP_COLOR), attribute_name) == 0) { ++pos; // Skip '#' attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_TOP_COLOR; uint32 value = (uint32) strtoul(pos, &pos, 16); - pos += 4; attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f; attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f; @@ -335,13 +340,12 @@ void theme_from_file_txt( attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f; } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_TOP_WIDTH), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_TOP_WIDTH; - attribute.value_int = strtoul(pos, &pos, 10); ++pos; + attribute.value_int = strtoul(pos, &pos, 10); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_RIGHT_COLOR), attribute_name) == 0) { ++pos; // Skip '#' attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_RIGHT_COLOR; uint32 value = (uint32) strtoul(pos, &pos, 16); - pos += 4; attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f; attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f; @@ -349,13 +353,12 @@ void theme_from_file_txt( attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f; } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_RIGHT_WIDTH), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_RIGHT_WIDTH; - attribute.value_int = strtoul(pos, &pos, 10); ++pos; + attribute.value_int = strtoul(pos, &pos, 10); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_COLOR), attribute_name) == 0) { ++pos; // Skip '#' attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_COLOR; uint32 value = (uint32) strtoul(pos, &pos, 16); - pos += 4; attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f; attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f; @@ -363,13 +366,12 @@ void theme_from_file_txt( attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f; } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_WIDTH), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_WIDTH; - attribute.value_int = strtoul(pos, &pos, 10); ++pos; + attribute.value_int = strtoul(pos, &pos, 10); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_LEFT_COLOR), attribute_name) == 0) { ++pos; // Skip '#' attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_LEFT_COLOR; uint32 value = (uint32) strtoul(pos, &pos, 16); - pos += 4; attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f; attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f; @@ -377,28 +379,27 @@ void theme_from_file_txt( attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f; } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_LEFT_WIDTH), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_LEFT_WIDTH; - attribute.value_int = strtoul(pos, &pos, 10); ++pos; + attribute.value_int = strtoul(pos, &pos, 10); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_PADDING), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_PADDING; - attribute.value_int = strtoul(pos, &pos, 10); ++pos; + attribute.value_int = strtoul(pos, &pos, 10); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_PADDING_TOP), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_PADDING_TOP; - attribute.value_int = strtoul(pos, &pos, 10); ++pos; + attribute.value_int = strtoul(pos, &pos, 10); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_PADDING_RIGHT), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_PADDING_RIGHT; - attribute.value_int = strtoul(pos, &pos, 10); ++pos; + attribute.value_int = strtoul(pos, &pos, 10); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_PADDING_BOTTOM), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_PADDING_BOTTOM; - attribute.value_int = strtoul(pos, &pos, 10); ++pos; + attribute.value_int = strtoul(pos, &pos, 10); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_PADDING_LEFT), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_PADDING_LEFT; - attribute.value_int = strtoul(pos, &pos, 10); ++pos; + attribute.value_int = strtoul(pos, &pos, 10); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_INNER_COLOR), attribute_name) == 0) { ++pos; // Skip '#' attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_INNER_COLOR; uint32 value = (uint32) strtoul(pos, &pos, 16); - pos += 4; attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f; attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f; @@ -406,16 +407,15 @@ void theme_from_file_txt( attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f; } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_INNER_ANGLE), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_INNER_ANGLE; - attribute.value_float = strtof(pos, &pos); ++pos; + attribute.value_float = strtof(pos, &pos); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_INNER_DISTANCE), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_INNER_DISTANCE; - attribute.value_int = strtoul(pos, &pos, 10); ++pos; + attribute.value_int = strtoul(pos, &pos, 10); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_OUTER_COLOR), attribute_name) == 0) { ++pos; // Skip '#' attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_OUTER_COLOR; uint32 value = (uint32) strtoul(pos, &pos, 16); - pos += 4; attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f; attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f; @@ -423,16 +423,16 @@ void theme_from_file_txt( attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f; } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_OUTER_ANGLE), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_OUTER_ANGLE; - attribute.value_float = strtof(pos, &pos); ++pos; + attribute.value_float = strtof(pos, &pos); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_OUTER_DISTANCE), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_OUTER_DISTANCE; - attribute.value_int = strtoul(pos, &pos, 10); ++pos; + attribute.value_int = strtoul(pos, &pos, 10); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_TRANSITION_ANIMATION), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_TRANSITION_ANIMATION; - attribute.value_int = strtoul(pos, &pos, 10); ++pos; + attribute.value_int = strtoul(pos, &pos, 10); } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_TRANSITION_DURATION), attribute_name) == 0) { attribute.attribute_id = UI_ATTRIBUTE_TYPE_TRANSITION_DURATION; - attribute.value_float = strtof(pos, &pos); ++pos; + attribute.value_float = strtof(pos, &pos); } else { while (*pos != '\n' && *pos != '\0') { ++pos; @@ -487,12 +487,20 @@ void theme_from_file( // Of course we still need to populate the data using hashmap_load() // The value is a int64 (because this is the value of the chunk buffer size but the hashmap only allows int32) hashmap_create(&theme->hash_map, (int32) SWAP_ENDIAN_LITTLE(*((uint64 *) pos)), sizeof(HashEntryInt64), theme->data); + + const byte* start = theme->hash_map.buf.memory; pos += hashmap_load(&theme->hash_map, pos); // theme data // Layout: first load the size of the group, then load the individual attributes for (int32 i = 0; i < theme->hash_map.buf.count; ++i) { + if (!theme->hash_map.table[i]) { + continue; + } + HashEntryInt64* entry = (HashEntryInt64 *) theme->hash_map.table[i]; + + pos = start + entry->value; UIAttributeGroup* group = (UIAttributeGroup *) (theme->data + entry->value); group->attribute_size = SWAP_ENDIAN_LITTLE(*((int32 *) pos)); @@ -501,6 +509,21 @@ void theme_from_file( // @performance The UIAttribute contains a char array which makes this WAY larger than it needs to be in 99% of the cases memcpy(group->attributes, pos, group->attribute_size * sizeof(UIAttribute)); pos += group->attribute_size * sizeof(UIAttribute); + + // load all the next elements + while (entry->next) { + pos = start + entry->value; + group = (UIAttributeGroup *) (theme->data + entry->value); + + group->attribute_size = SWAP_ENDIAN_LITTLE(*((int32 *) pos)); + pos += sizeof(group->attribute_size); + + // @performance The UIAttribute contains a char array which makes this WAY larger than it needs to be in 99% of the cases + memcpy(group->attributes, pos, group->attribute_size * sizeof(UIAttribute)); + pos += group->attribute_size * sizeof(UIAttribute); + + entry = entry->next; + } } } @@ -541,7 +564,7 @@ void theme_to_file( // I also don't want to add a size variable to the theme as it is useless in all other cases file.size = theme_size(theme); - file.content = ring_get_memory(ring, file.size, 64); + file.content = ring_get_memory(ring, file.size, 64, true); byte* pos = file.content; // version @@ -549,12 +572,19 @@ void theme_to_file( pos += sizeof(theme->version); // hashmap + byte* start = pos; pos += hashmap_dump(&theme->hash_map, pos); // theme data // Layout: first save the size of the group, then save the individual attributes for (int32 i = 0; i < theme->hash_map.buf.count; ++i) { + if (!theme->hash_map.table[i]) { + continue; + } + HashEntryInt64* entry = (HashEntryInt64 *) theme->hash_map.table[i]; + + pos = start + entry->value; UIAttributeGroup* group = (UIAttributeGroup *) (theme->data + entry->value); *((int32 *) pos) = SWAP_ENDIAN_LITTLE(group->attribute_size); @@ -562,7 +592,22 @@ void theme_to_file( // @performance The UIAttribute contains a char array which makes this WAY larger than it needs to be in 99% of the cases memcpy(pos, group->attributes, group->attribute_size * sizeof(UIAttribute)); - pos += group->attribute_size * sizeof(UIAttribute); + pos += sizeof(UIAttribute); + + // save all the next elements + while (entry->next) { + pos = start + entry->value; + group = (UIAttributeGroup *) (theme->data + entry->value); + + *((int32 *) pos) = SWAP_ENDIAN_LITTLE(group->attribute_size); + pos += sizeof(group->attribute_size); + + // @performance The UIAttribute contains a char array which makes this WAY larger than it needs to be in 99% of the cases + memcpy(pos, group->attributes, group->attribute_size * sizeof(UIAttribute)); + pos += sizeof(UIAttribute); + + entry = entry->next; + } } file.size = pos - file.content; diff --git a/utils/StringUtils.h b/utils/StringUtils.h index cb71f4f..1c5d667 100644 --- a/utils/StringUtils.h +++ b/utils/StringUtils.h @@ -176,7 +176,7 @@ void wchar_to_char(const wchar_t* __restrict src, char* __restrict dest, int32 l *dest = '\0'; } -inline +inline constexpr int32 str_to_int(const char *str) { int32 result = 0; @@ -197,7 +197,47 @@ int32 str_to_int(const char *str) return result * sign; } -inline size_t str_count(const char* __restrict str, const char* __restrict substr) +inline constexpr +int32 int_to_str(int64 number, char *str, const char thousands = ',') { + int32 i = 0; + int64 sign = number; + int32 digit_count = 0; + + if (number == 0) { + str[i++] = '0'; + } else if (number < 0) { + number = -number; + } + + while (number > 0) { + if (thousands + && (digit_count == 3 || digit_count == 6 || digit_count == 9 || digit_count == 12 || digit_count == 15) + ) { + str[i++] = thousands; + } + + str[i++] = number % 10 + '0'; + number /= 10; + ++digit_count; + } + + if (sign < 0) { + str[i++] = '-'; + } + + str[i] = '\0'; + + for (int32 j = 0, k = i - 1; j < k; ++j, --k) { + char temp = str[j]; + str[j] = str[k]; + str[k] = temp; + } + + return i - 1; +} + +inline +size_t str_count(const char* __restrict str, const char* __restrict substr) { size_t l1 = strlen(str); size_t l2 = strlen(substr); @@ -296,45 +336,7 @@ char* strtok(char* str, const char* __restrict delim, char* *key) { return result; } -inline -int32 int_to_str(int64 number, char *str, const char thousands = ',') { - int32 i = 0; - int64 sign = number; - int32 digit_count = 0; - - if (number == 0) { - str[i++] = '0'; - } else if (number < 0) { - number = -number; - } - - while (number > 0) { - if (thousands - && (digit_count == 3 || digit_count == 6 || digit_count == 9 || digit_count == 12 || digit_count == 15) - ) { - str[i++] = thousands; - } - - str[i++] = number % 10 + '0'; - number /= 10; - ++digit_count; - } - - if (sign < 0) { - str[i++] = '-'; - } - - str[i] = '\0'; - - for (int32 j = 0, k = i - 1; j < k; ++j, --k) { - char temp = str[j]; - str[j] = str[k]; - str[k] = temp; - } - - return i - 1; -} - +inline constexpr char toupper_ascii(char c) { return c >= 'a' && c <= 'z' @@ -342,6 +344,7 @@ char toupper_ascii(char c) : c; } +inline constexpr char tolower_ascii(char c) { return c >= 'A' && c <= 'Z' @@ -349,6 +352,7 @@ char tolower_ascii(char c) : c; } +inline constexpr void create_const_name(const unsigned char* name, char* modified_name) { // Print block @@ -365,6 +369,7 @@ void create_const_name(const unsigned char* name, char* modified_name) } } +inline constexpr bool str_ends_with(const char* str, const char* suffix) { if (!str || !suffix) { return false; @@ -431,4 +436,16 @@ void print_bytes(const void* ptr, size_t size) } } +inline constexpr +int32 is_eol(const char* str) +{ + if (*str == '\n') { + return 1; + } else if (*str == '\r' && str[1] == '\n') { + return 2; + } + + return 0; +} + #endif \ No newline at end of file