Program starts again. Input is somehow broken; FPS and theme need to be further implemented.

2026-01-11 11:18:40 +00:00 · 2024-10-28 02:43:21 +01:00 · 2024-10-28 02:43:21 +01:00 · c7db2069c0
commit c7db2069c0
parent ba244f8155
19 changed files with 599 additions and 421 deletions
--- a/font/Font.h
+++ b/font/Font.h
@ -100,6 +100,8 @@ void font_from_file_txt(
                while (*pos != '\n') {
                    *texture_pos++ = *pos++;
                }
+
+                *texture_pos++ = '\0';
            } else if (strcmp(block_name, "font_size") == 0) {
                font->size = strtof(pos, &pos);
            } else if (strcmp(block_name, "line_height") == 0) {
--- a/gpuapi/RenderUtils.h
+++ b/gpuapi/RenderUtils.h
@ -693,15 +693,15 @@ void entity_clip_space_mat_sse(f32* result_mat, const f32* model_mat, const f32*
    __m128 a[4];
    __m128 b[4];

-    a[0] = _mm_loadu_ps(projection_mat);
-    a[1] = _mm_loadu_ps(&projection_mat[4]);
-    a[2] = _mm_loadu_ps(&projection_mat[8]);
-    a[3] = _mm_loadu_ps(&projection_mat[12]);
+    a[0] = _mm_load_ps(projection_mat);
+    a[1] = _mm_load_ps(&projection_mat[4]);
+    a[2] = _mm_load_ps(&projection_mat[8]);
+    a[3] = _mm_load_ps(&projection_mat[12]);

-    b[0] = _mm_loadu_ps(view_mat);
-    b[1] = _mm_loadu_ps(&view_mat[4]);
-    b[2] = _mm_loadu_ps(&view_mat[8]);
-    b[3] = _mm_loadu_ps(&view_mat[12]);
+    b[0] = _mm_load_ps(view_mat);
+    b[1] = _mm_load_ps(&view_mat[4]);
+    b[2] = _mm_load_ps(&view_mat[8]);
+    b[3] = _mm_load_ps(&view_mat[12]);
    _MM_TRANSPOSE4_PS(b[0], b[1], b[2], b[3]);

    mat4mat4_mult_sse(a, b, temp);
@ -711,10 +711,10 @@ void entity_clip_space_mat_sse(f32* result_mat, const f32* model_mat, const f32*
    a[2] = temp[2];
    a[3] = temp[3];

-    b[0] = _mm_loadu_ps(model_mat);
-    b[1] = _mm_loadu_ps(&model_mat[4]);
-    b[2] = _mm_loadu_ps(&model_mat[8]);
-    b[3] = _mm_loadu_ps(&model_mat[12]);
+    b[0] = _mm_load_ps(model_mat);
+    b[1] = _mm_load_ps(&model_mat[4]);
+    b[2] = _mm_load_ps(&model_mat[8]);
+    b[3] = _mm_load_ps(&model_mat[12]);
    _MM_TRANSPOSE4_PS(b[0], b[1], b[2], b[3]);

    mat4mat4_mult_sse(a, b, temp);
--- a/gpuapi/UIUtils.h
+++ b/gpuapi/UIUtils.h
@ -0,0 +1,33 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef TOS_GPUAPI_UI_UTILS_H
+#define TOS_GPUAPI_UI_UTILS_H
+
+#include <stdio.h>
+#include <string.h>
+
+void ui_input_create(Vertex3DTextureColorIndex* __restrict vertices, uint32* __restrict index, f32 zindex,
+    f32 x, f32 y, f32 width, f32 height, int32 align_h, int32 align_v, // NOTE(review): align_h/align_v are not read in the body; both calls below hard-code UI_ALIGN_H_LEFT / UI_ALIGN_V_BOTTOM
+    uint32 color_index = 0, f32 tex_x1 = 0.0f, f32 tex_y1 = 0.0f, f32 tex_x2 = 0.0f, f32 tex_y2 = 0.0f // NOTE(review): none of these defaulted parameters is read in the body
+)
+{ // Builds an input element from two helper calls: a rectangle border and a rectangle inset by 1 inside it (going by the helper names - TODO confirm)
+    vertex_rect_border_create( // first call: uses the full x / y / width / height
+        vertices, index, zindex,
+        x, y, width, height, 1, UI_ALIGN_H_LEFT, UI_ALIGN_V_BOTTOM, // the literal 1 is presumably the border thickness, matching the 1-unit inset below - TODO confirm
+        12, 0.0f, 0.0f // 12 is presumably a hard-coded color index - TODO confirm
+    );
+
+    vertex_rect_create( // second call: inset by 1 on every side (x + 1, y + 1, width - 2, height - 2)
+        vertices, index, zindex,
+        x + 1, y + 1, width - 2, height - 2, UI_ALIGN_H_LEFT, UI_ALIGN_V_BOTTOM,
+        14, 0.0f, 0.0f // 14 is presumably a hard-coded color index - TODO confirm
+    );
+}
+
+#endif
--- a/log/Debug.cpp
+++ b/log/Debug.cpp
@ -1,6 +1,7 @@
 #ifndef TOS_LOG_DEBUG_MEMORY_C
 #define TOS_LOG_DEBUG_MEMORY_C

+#include "../stdlib/Types.h"
 #include "Debug.h"
 #include "DebugMemory.h"
 #include "Log.h"
--- a/math/matrix/MatrixFloat32.h
+++ b/math/matrix/MatrixFloat32.h
@ -401,11 +401,11 @@ void mat3vec3_mult(const f32* __restrict matrix, const f32* __restrict vector, f
 // @question could simple mul add sse be faster?
 void mat3vec3_mult_sse(const f32* __restrict matrix, const f32* __restrict vector, f32* __restrict result)
 {
-    __m128 vec = _mm_loadu_ps(vector);
+    __m128 vec = _mm_load_ps(vector);
    vec = _mm_insert_ps(vec, _mm_setzero_ps(), 0x30); // vec[3] = 0

    for (int32 i = 0; i < 3; ++i) {
-        __m128 row = _mm_loadu_ps(&matrix[i * 3]);
+        __m128 row = _mm_loadu_ps(&matrix[i * 3]); // must stay unaligned: rows are 3 floats (12 bytes) apart, so at most one row can be 16-byte aligned
        row = _mm_insert_ps(row, _mm_setzero_ps(), 0x30);  // row[3] = 0

        __m128 dot = _mm_dp_ps(row, vec, 0xF1);
@ -444,10 +444,10 @@ void mat4vec4_mult(const f32* __restrict matrix, const f32* __restrict vector, f
 // @question could simple mul add sse be faster?
 void mat4vec4_mult_sse(const f32* __restrict matrix, const f32* __restrict vector, f32* __restrict result)
 {
-    __m128 vec = _mm_loadu_ps(vector);
+    __m128 vec = _mm_load_ps(vector);

    for (int32 i = 0; i < 4; ++i) {
-        __m128 row = _mm_loadu_ps(&matrix[i * 4]);
+        __m128 row = _mm_load_ps(&matrix[i * 4]);
        __m128 dot = _mm_dp_ps(row, vec, 0xF1);

        result[i] = _mm_cvtss_f32(dot);
@ -502,16 +502,16 @@ void mat4mat4_mult(const f32* __restrict a, const f32* __restrict b, f32* __rest
        // @todo check http://fhtr.blogspot.com/2010/02/4x4-f32-matrix-multiplication-using.html
        // @question could simple mul add sse be faster?
        // Load rows of matrix a
-        __m128 a_1 = _mm_loadu_ps(a);
-        __m128 a_2 = _mm_loadu_ps(&a[4]);
-        __m128 a_3 = _mm_loadu_ps(&a[8]);
-        __m128 a_4 = _mm_loadu_ps(&a[12]);
+        __m128 a_1 = _mm_load_ps(a);
+        __m128 a_2 = _mm_load_ps(&a[4]);
+        __m128 a_3 = _mm_load_ps(&a[8]);
+        __m128 a_4 = _mm_load_ps(&a[12]);

        // Load columns of matrix b
-        __m128 b_1 = _mm_loadu_ps(b);
-        __m128 b_2 = _mm_loadu_ps(&b[4]);
-        __m128 b_3 = _mm_loadu_ps(&b[8]);
-        __m128 b_4 = _mm_loadu_ps(&b[12]);
+        __m128 b_1 = _mm_load_ps(b);
+        __m128 b_2 = _mm_load_ps(&b[4]);
+        __m128 b_3 = _mm_load_ps(&b[8]);
+        __m128 b_4 = _mm_load_ps(&b[12]);

        _mm_storeu_ps(&result[0],
            _mm_add_ps(
--- a/memory/ChunkMemory.h
+++ b/memory/ChunkMemory.h
@ -69,12 +69,14 @@ void chunk_init(ChunkMemory* buf, byte* data, uint64 count, uint64 chunk_size, i
    buf->memory = data;

    buf->count = count;
-    buf->size = chunk_size + sizeof(buf->free) * CEIL_DIV(count, 64);
+    buf->size = chunk_size * count + sizeof(buf->free) * CEIL_DIV(count, 64);
    buf->chunk_size = chunk_size;
    buf->last_pos = -1;
    buf->alignment = alignment;

    // @question Could it be beneficial to have this before the element data?
+    //  On the other hand the way we do it right now we never have to move past the free array since it is at the end
+    //  On another hand we could by accident overwrite the values in free if we are not careful
    buf->free = (uint64 *) (buf->memory + count * chunk_size);

    DEBUG_MEMORY_INIT((uint64) buf->memory, buf->size);
@ -124,7 +126,7 @@ int64 chunk_reserve(ChunkMemory* buf, uint64 elements = 1, bool zeroed = false)
    int32 bit_index;

    int64 free_element = -1;
-    byte mask;
+    int64 mask;

    int32 i = 0;
    int64 max_bytes = (buf->count + 7) / 64;
@ -157,7 +159,7 @@ int64 chunk_reserve(ChunkMemory* buf, uint64 elements = 1, bool zeroed = false)
                uint64 current_free_index = free_index + (bit_index + j) / 64;
                int32 current_bit_index = (bit_index + j) % 64;

-                mask = 1 << current_bit_index;
+                mask = 1LL << current_bit_index;
                if ((buf->free[current_free_index] & mask) == 0) {
                    ++consecutive_free_bits;
                } else {
@ -201,23 +203,23 @@ int64 chunk_reserve(ChunkMemory* buf, uint64 elements = 1, bool zeroed = false)

 byte* chunk_find_free(ChunkMemory* buf)
 {
-    int64 byte_index = (buf->last_pos + 1) / 64;
+    int64 free_index = (buf->last_pos + 1) / 64;
    int32 bit_index;

    int64 free_element = -1;
-    byte mask;
+    int64 mask;

    int32 i = 0;
    int64 max_bytes = (buf->count + 7) / 64;

    while (free_element < 0 && i < buf->count) {
-        if (byte_index >= max_bytes) {
-            byte_index = 0;
+        if (free_index >= max_bytes) {
+            free_index = 0;
        }

-        if (buf->free[byte_index] == 0xFF) {
+        if (buf->free[free_index] == 0xFFFFFFFFFFFFFFFFULL) { // skip only when all 64 bits of this uint64 word are set
            ++i;
-            ++byte_index;
+            ++free_index;

            continue;
        }
@ -226,10 +228,10 @@ byte* chunk_find_free(ChunkMemory* buf)
        // @performance on the first iteration through the buffer we could optimize this by starting at a different bit_index
        // because we know that the bit_index is based on last_pos
        for (bit_index = 0; bit_index < 64; ++bit_index) {
-            mask = 1 << bit_index;
-            if ((buf->free[byte_index] & mask) == 0) {
-                free_element = byte_index * 64 + bit_index;
-                buf->free[byte_index] |= (1LL << bit_index);
+            mask = 1LL << bit_index;
+            if ((buf->free[free_index] & mask) == 0) {
+                free_element = free_index * 64 + bit_index;
+                buf->free[free_index] |= (1LL << bit_index);

                break;
            }
@ -248,10 +250,10 @@ void chunk_free_element(ChunkMemory* buf, uint64 element)
 {
    DEBUG_MEMORY_DELETE((uint64) (buf->memory + element * buf->chunk_size), buf->chunk_size);

-    int64 byte_index = element / 64;
+    int64 free_index = element / 64;
    int32 bit_index = element % 64;

-    buf->free[byte_index] &= ~(1 << bit_index);
+    buf->free[free_index] &= ~(1LL << bit_index);
 }

 inline
--- a/memory/RingMemory.h
+++ b/memory/RingMemory.h
@ -62,7 +62,7 @@ void ring_alloc(RingMemory* ring, uint64 size, int32 alignment = 64)
 inline
 void ring_init(RingMemory* ring, BufferMemory* buf, uint64 size, int32 alignment = 64)
 {
-    ring->memory = buffer_get_memory(buf, size, alignment);
+    ring->memory = buffer_get_memory(buf, size, alignment, true);

    ring->size = size;
    ring->pos = 0;
@ -71,8 +71,6 @@ void ring_init(RingMemory* ring, BufferMemory* buf, uint64 size, int32 alignment
    ring->start = 0;
    ring->end = 0;

-    memset(ring->memory, 0, buf->size);
-
    DEBUG_MEMORY_INIT((uint64) ring->memory, ring->size);
 }

--- a/stdlib/HashMap.h
+++ b/stdlib/HashMap.h
@ -122,6 +122,11 @@ int64 hashmap_size(const HashMap* hm)
 }

 void hashmap_insert(HashMap* hm, const char* key, int32 value) {
+    // @performance Do we really want to do this check every time?
+    if (hm->buf.count == 0) {
+        return;
+    }
+
    uint64 index = hash_djb2(key) % hm->buf.count;

    int64 element = chunk_reserve(&hm->buf, 1);
@ -147,6 +152,11 @@ void hashmap_insert(HashMap* hm, const char* key, int32 value) {
 }

 void hashmap_insert(HashMap* hm, const char* key, int64 value) {
+    // @performance Do we really want to do this check every time?
+    if (hm->buf.count == 0) {
+        return;
+    }
+
    uint64 index = hash_djb2(key) % hm->buf.count;

    int64 element = chunk_reserve(&hm->buf, 1);
@ -172,6 +182,11 @@ void hashmap_insert(HashMap* hm, const char* key, int64 value) {
 }

 void hashmap_insert(HashMap* hm, const char* key, uintptr_t value) {
+    // @performance Do we really want to do this check every time?
+    if (hm->buf.count == 0) {
+        return;
+    }
+
    uint64 index = hash_djb2(key) % hm->buf.count;

    int64 element = chunk_reserve(&hm->buf, 1);
@ -197,6 +212,11 @@ void hashmap_insert(HashMap* hm, const char* key, uintptr_t value) {
 }

 void hashmap_insert(HashMap* hm, const char* key, void* value) {
+    // @performance Do we really want to do this check every time?
+    if (hm->buf.count == 0) {
+        return;
+    }
+
    uint64 index = hash_djb2(key) % hm->buf.count;

    int64 element = chunk_reserve(&hm->buf, 1);
@ -222,6 +242,11 @@ void hashmap_insert(HashMap* hm, const char* key, void* value) {
 }

 void hashmap_insert(HashMap* hm, const char* key, f32 value) {
+    // @performance Do we really want to do this check every time?
+    if (hm->buf.count == 0) {
+        return;
+    }
+
    uint64 index = hash_djb2(key) % hm->buf.count;

    int64 element = chunk_reserve(&hm->buf, 1);
@ -247,6 +272,11 @@ void hashmap_insert(HashMap* hm, const char* key, f32 value) {
 }

 void hashmap_insert(HashMap* hm, const char* key, const char* value) {
+    // @performance Do we really want to do this check every time?
+    if (hm->buf.count == 0) {
+        return;
+    }
+
    uint64 index = hash_djb2(key) % hm->buf.count;

    int64 element = chunk_reserve(&hm->buf, 1);
@ -274,6 +304,11 @@ void hashmap_insert(HashMap* hm, const char* key, const char* value) {
 }

 void hashmap_insert(HashMap* hm, const char* key, byte* value) {
+    // @performance Do we really want to do this check every time?
+    if (hm->buf.count == 0) {
+        return;
+    }
+
    uint64 index = hash_djb2(key) % hm->buf.count;

    int64 element = chunk_reserve(&hm->buf, 1);
@ -302,6 +337,11 @@ void hashmap_insert(HashMap* hm, const char* key, byte* value) {
 }

 HashEntry* hashmap_get_entry(HashMap* hm, const char* key) {
+    // @performance Do we really want to do this check every time?
+    if (hm->buf.count == 0) {
+        return NULL;
+    }
+
    uint64 index = hash_djb2(key) % hm->buf.count;
    HashEntry* entry = (HashEntry *) hm->table[index];

@ -356,90 +396,131 @@ void hashmap_delete_entry(HashMap* hm, const char* key) {
    }
 }

-// @bug We cannot know if the data needs endian swap (it coult be int/float, but also some other 4/8 byte value)
-//  -> if we save this to a file and load it on a different system we will have "corrupt" data
 inline
 int64 hashmap_dump(const HashMap* hm, byte* data)
 {
    *((uint64 *) data) = SWAP_ENDIAN_LITTLE(hm->buf.count);
    data += sizeof(uint64);

-    uint64 next_count_total = 0;
-
    // Dump the table content where the elements are relative indeces/pointers
    for (int32 i = 0; i < hm->buf.count; ++i) {
-        *((uint64 *) data) = SWAP_ENDIAN_LITTLE((uintptr_t) hm->table[i] - (uintptr_t) hm->buf.memory);
+        ((uint64 *) data)[i] = hm->table[i] // write slot i: data itself is only advanced after this loop
+            ? SWAP_ENDIAN_LITTLE((uintptr_t) hm->table[i] - (uintptr_t) hm->buf.memory)
+            : 0ULL;
+    }
+    data += sizeof(uint64) * hm->buf.count;
+
+    int64 value_size = hm->buf.chunk_size - sizeof(uint64) - sizeof(char) * MAX_KEY_LENGTH - sizeof(uint64);
+
+    // Dump hash map content = buffer memory
+    int32 free_index = 0;
+    int32 bit_index = 0;
+    for (int32 i = 0; i < hm->buf.count; ++i) {
+        if ((hm->buf.free[free_index] & (1ULL << bit_index)) > 0) {
+            HashEntry* entry = (HashEntry *) chunk_get_element((ChunkMemory *) &hm->buf, i);
+
+            // element_id
+            *((uint64 *) data) = SWAP_ENDIAN_LITTLE(entry->element_id);
+            data += sizeof(entry->element_id);
+
+            // key
+            memcpy(data, entry->key, sizeof(entry->key));
+            data += sizeof(entry->key);
+
+            // next pointer
+            if (entry->next) {
+                *((uint64 *) data) = SWAP_ENDIAN_LITTLE((uintptr_t) entry->next - (uintptr_t) hm->buf.memory);
+            } else {
+                memset(data, 0, sizeof(uint64));
+            }
            data += sizeof(uint64);

-        // Also dump the next pointer
-        // Count how many next elements we have
-        HashEntry* entry = ((HashEntry *) hm->table[i])->next;
-        int32 next_count = 0;
-        while (entry) {
-            ++next_count;
-            entry = entry->next;
+            // We just assume that 4 or 8 bytes = int -> endian handling
+            if (value_size == 4) {
+                *((int32 *) data) = SWAP_ENDIAN_LITTLE(((HashEntryInt32 *) entry)->value);
+            } else if (value_size == 8) {
+                *((int64 *) data) = SWAP_ENDIAN_LITTLE(((HashEntryInt64 *) entry)->value);
+            } else {
+                memcpy(data, entry->value, value_size);
+            }
+            data += value_size;
+        } else {
+            // No entry defined -> NULL
+            memset(data, 0, hm->buf.chunk_size);
+            data += hm->buf.chunk_size;
        }

-        next_count_total += next_count;
-
-        *((int32 *) data) = SWAP_ENDIAN_LITTLE(next_count);
-        data += sizeof(next_count);
-
-        if (next_count > 0) {
-            entry = ((HashEntry *) hm->table[i])->next;
-            while (entry) {
-                *((uint64 *) data) = SWAP_ENDIAN_LITTLE((uintptr_t) entry - (uintptr_t) hm->buf.memory);
-                data += sizeof(uint64);
-
-                entry = entry->next;
-            }
+        ++bit_index;
+        if (bit_index > 63) {
+            bit_index = 0;
+            ++free_index;
        }
    }

-    // @performance chunk_dump() below contains some data we already output above
-    // (next pointer but it is useless, since we need relative positions)
-    // Maybe we should manually re-create the chunk_dump here and omit the already dumped data for the next pointer?
+    // dump free array
+    memcpy(data, hm->buf.free, sizeof(uint64) * CEIL_DIV(hm->buf.count, 64));

-    // How many bytes were written (+ dump the chunk memory)
-    return sizeof(hm->buf.count)
+    return sizeof(hm->buf.count) // hash map count = buffer count
        + hm->buf.count * sizeof(uint64) // table content
-        + hm->buf.count * sizeof(int32) // counter for the next pointer (one for every element)
-        + next_count_total * sizeof(uint64) // next pointer offset
-        + chunk_dump(&hm->buf, data);
+        + hm->buf.size; // hash map content + free array
 }

+// WARNING: Requires hashmap_create first
 inline
 int64 hashmap_load(HashMap* hm, const byte* data)
 {
    uint64 count = SWAP_ENDIAN_LITTLE(*((uint64 *) data));
    data += sizeof(uint64);

-    uint64 next_count_total = 0;
-
-    // Load the table content, we also need to convert from relative indeces to pointers
+    // Load the table content
    for (int i = 0; i < count; ++i) {
-        hm->table[i] = hm->buf.memory + SWAP_ENDIAN_LITTLE(*((uint64 *) data));
-        data += sizeof(uint64);
+        uint64 offset =  SWAP_ENDIAN_LITTLE(*((uint64 *) data));
+        data += sizeof(offset);

-        // Also load the next pointer
-        // Count how many next elements we have
-        int32 next_count = SWAP_ENDIAN_LITTLE(*((int32 *) data));
-        data += sizeof(next_count);
+        // the first element has no offset!
+        hm->table[i] = offset || i == 0 ? hm->buf.memory + offset : NULL;
+    }

-        HashEntry* entry = ((HashEntry *) hm->table[i]);
-        for (int32 j = 0; j < next_count; ++j) {
-            entry->next = (HashEntry *) (hm->buf.memory + SWAP_ENDIAN_LITTLE(*((uint64 *) data)));
-            data += sizeof(uint64);
-            entry = entry->next;
+    // This loop here is why it is important to already have an initialized hashmap
+    // @question Do we maybe want to change this and not require an initialized hashmap?
+    memcpy(hm->buf.memory, data, hm->buf.size);
+    data += hm->buf.chunk_size * hm->buf.count;
+
+    // @question don't we have to possibly endian swap check the free array as well?
+    memcpy(hm->buf.free, data, sizeof(uint64) * CEIL_DIV(hm->buf.count, 64));
+
+    int64 value_size = hm->buf.chunk_size - sizeof(uint64) - sizeof(char) * MAX_KEY_LENGTH - sizeof(uint64);
+
+    // Switch endian AND turn offsets to pointers
+    // The bitmap word (i / 64) and bit (i % 64) are derived from i directly,
+    // so every chunk is tested against its own used-bit in the free array.
+    for (int32 i = 0; i < hm->buf.count; ++i) {
+        if ((hm->buf.free[i / 64] & (1ULL << (i % 64))) > 0) {
+            HashEntry* entry = (HashEntry *) chunk_get_element((ChunkMemory *) &hm->buf, i);
+
+            // element id
+            entry->element_id = SWAP_ENDIAN_LITTLE(entry->element_id);
+
+            // key is already loaded with the memcpy
+            // @question Do we even want to use memcpy? We are re-checking all the values here anyways
+
+            // next pointer
+            if (entry->next) {
+                entry->next = (HashEntry *) (hm->buf.memory + SWAP_ENDIAN_LITTLE((uint64) entry->next));
+            }
+
+            if (value_size == 4) {
+                ((HashEntryInt32 *) entry)->value = SWAP_ENDIAN_LITTLE(((HashEntryInt32 *) entry)->value);
+            } else if (value_size == 8) {
+                ((HashEntryInt64 *) entry)->value = SWAP_ENDIAN_LITTLE(((HashEntryInt64 *) entry)->value);
+            }
        }
    }

    // How many bytes was read from data
-    return sizeof(count)
+    return sizeof(hm->buf.count) // hash map count = buffer count
        + hm->buf.count * sizeof(uint64) // table content
-        + hm->buf.count * sizeof(int32) // counter for the next pointer (one for every element)
-        + next_count_total * sizeof(uint64) // next pointer offset
-        + chunk_load(&hm->buf, data);
+        + hm->buf.size;
 }

 #endif
--- a/stdlib/simd/SIMD_F32.h
+++ b/stdlib/simd/SIMD_F32.h
@ -39,7 +39,7 @@ struct f32_16 {
 inline f32_4 load_f32_4(const f32* mem)
 {
    f32_4 simd;
-    simd.s = _mm_loadu_ps(mem);
+    simd.s = _mm_load_ps(mem);

    return simd;
 }
@ -57,7 +57,7 @@ inline void unload_f32_4(f32_4 a, f32 *array) { _mm_store_ps(array, a.s); }
 inline f32_8 load_f32_8(const f32* mem)
 {
    f32_8 simd;
-    simd.s = _mm256_loadu_ps(mem);
+    simd.s = _mm256_load_ps(mem);

    return simd;
 }
@ -75,7 +75,7 @@ inline void unload_f32_8(f32_8 a, f32 *array) { _mm256_store_ps(array, a.s); }
 inline f32_16 load_f32_16(const f32* mem)
 {
    f32_16 simd;
-    simd.s = _mm512_loadu_ps(mem);
+    simd.s = _mm512_load_ps(mem);

    return simd;
 }
@ -996,8 +996,8 @@ void simd_mult(const f32* a, const f32* b, f32* result, int32 size, int32 steps)
        __m512 result_16;

        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_ps(a);
-            b_16 = _mm512_loadu_ps(b);
+            a_16 = _mm512_load_ps(a);
+            b_16 = _mm512_load_ps(b);
            result_16 = _mm512_mul_ps(a_16, b_16);
            _mm512_store_ps(result, result_16);

@ -1011,8 +1011,8 @@ void simd_mult(const f32* a, const f32* b, f32* result, int32 size, int32 steps)
        __m256 result_8;

        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_ps(a);
-            b_8 = _mm256_loadu_ps(b);
+            a_8 = _mm256_load_ps(a);
+            b_8 = _mm256_load_ps(b);
            result_8 = _mm256_mul_ps(a_8, b_8);
            _mm256_store_ps(result, result_8);

@ -1026,8 +1026,8 @@ void simd_mult(const f32* a, const f32* b, f32* result, int32 size, int32 steps)
        __m128 result_4;

        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_ps(a);
-            b_4 = _mm_loadu_ps(b);
+            a_4 = _mm_load_ps(a);
+            b_4 = _mm_load_ps(b);
            result_4 = _mm_mul_ps(a_4, b_4);
            _mm_store_ps(result, result_4);

@ -1057,7 +1057,7 @@ void simd_mult(const f32* a, f32 b, f32* result, int32 size, int32 steps)
        __m512 result_16;

        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_ps(a);
+            a_16 = _mm512_load_ps(a);
            result_16 = _mm512_mul_ps(a_16, b_16);
            _mm512_store_ps(result, result_16);

@ -1070,7 +1070,7 @@ void simd_mult(const f32* a, f32 b, f32* result, int32 size, int32 steps)
        __m256 result_8;

        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_ps(a);
+            a_8 = _mm256_load_ps(a);
            result_8 = _mm256_mul_ps(a_8, b_8);
            _mm256_store_ps(result, result_8);

@ -1083,7 +1083,7 @@ void simd_mult(const f32* a, f32 b, f32* result, int32 size, int32 steps)
        __m128 result_4;

        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_ps(a);
+            a_4 = _mm_load_ps(a);
            result_4 = _mm_mul_ps(a_4, b_4);
            _mm_store_ps(result, result_4);

@ -1111,7 +1111,7 @@ void simd_div(const f32* a, f32 b, f32* result, int32 size, int32 steps)
        __m512 result_16;

        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_ps(a);
+            a_16 = _mm512_load_ps(a);
            result_16 = _mm512_div_ps(a_16, b_16);
            _mm512_store_ps(result, result_16);

@ -1124,7 +1124,7 @@ void simd_div(const f32* a, f32 b, f32* result, int32 size, int32 steps)
        __m256 result_8;

        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_ps(a);
+            a_8 = _mm256_load_ps(a);
            result_8 = _mm256_div_ps(a_8, b_8);
            _mm256_store_ps(result, result_8);

@ -1137,7 +1137,7 @@ void simd_div(const f32* a, f32 b, f32* result, int32 size, int32 steps)
        __m128 result_4;

        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_ps(a);
+            a_4 = _mm_load_ps(a);
            result_4 = _mm_div_ps(a_4, b_4);
            _mm_store_ps(result, result_4);

@ -1166,7 +1166,7 @@ void simd_div(const f32* a, f32 b, __m256* result, int32 size)
    __m256 result_8;

    for (; i <= size - 8; i += 8) {
-        a_8 = _mm256_loadu_ps(a);
+        a_8 = _mm256_load_ps(a);
        result_8 = _mm256_div_ps(a_8, b_8);
        result[j] = result_8;

@ -1181,7 +1181,7 @@ void simd_div(const f32* a, f32 b, __m256* result, int32 size)
        temp[k] = a[i + k] / b;
    }

-    result[j] = _mm256_loadu_ps(temp);
+    result[j] = _mm256_load_ps(temp);
 }

 inline
--- a/stdlib/simd/SIMD_I16.h
+++ b/stdlib/simd/SIMD_I16.h
@ -39,7 +39,7 @@ struct int16_32 {
 inline int16_8 load_int16_8(const int16* mem)
 {
    int16_8 simd;
-    simd.s = _mm_loadu_epi16(mem);
+    simd.s = _mm_load_si128((__m128i *) mem);

    return simd;
 }
@ -60,7 +60,7 @@ inline void unload_int16_8(int16_8 a, int16 *array) { _mm_store_si128((__m128i *
 inline int16_16 load_int16_16(const int16* mem)
 {
    int16_16 simd;
-    simd.s = _mm256_loadu_epi16(mem);
+    simd.s = _mm256_load_si256((__m256i *) mem);

    return simd;
 }
@ -81,7 +81,7 @@ inline void unload_int16_16(int16_16 a, int16 *array) { _mm256_store_si256((__m2
 inline int16_32 load_int16_32(const int16* mem)
 {
    int16_32 simd;
-    simd.s = _mm512_loadu_epi16(mem);
+    simd.s = _mm512_load_si512((__m512i *) mem);

    return simd;
 }
--- a/stdlib/simd/SIMD_I32.h
+++ b/stdlib/simd/SIMD_I32.h
@ -11,6 +11,7 @@

 #include <immintrin.h>
 #include <xmmintrin.h>
+#include <emmintrin.h>

 #include "../Types.h"
 #include "../../utils/BitUtils.h"
@ -45,7 +46,7 @@ struct int32_16 {
 inline int32_4 load_int32_4(const int32* mem)
 {
    int32_4 simd;
-    simd.s = _mm_loadu_epi32(mem);
+    simd.s = _mm_load_si128((__m128i *) mem);

    return simd;
 }
@ -63,7 +64,7 @@ inline void unload_int32_4(int32_4 a, int32 *array) { _mm_store_si128((__m128i *
 inline int32_8 load_int32_8(const int32* mem)
 {
    int32_8 simd;
-    simd.s = _mm256_loadu_epi32(mem);
+    simd.s = _mm256_load_si256((__m256i *) mem);

    return simd;
 }
@ -81,7 +82,7 @@ inline void unload_int32_8(int32_8 a, int32 *array) { _mm256_store_si256((__m256
 inline int32_16 load_int32_16(const int32* mem)
 {
    int32_16 simd;
-    simd.s = _mm512_loadu_epi32(mem);
+    simd.s = _mm512_load_epi32(mem);

    return simd;
 }
@ -1039,8 +1040,8 @@ void simd_mult(const int32* a, const int32* b, int32* result, int32 size, int32
        __m512i result_16;

        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_epi32(a);
-            b_16 = _mm512_loadu_epi32(b);
+            a_16 = _mm512_load_epi32(a);
+            b_16 = _mm512_load_epi32(b);
            result_16 = _mm512_mul_epi32(a_16, b_16);
            _mm512_store_epi32(result, result_16);

@ -1054,8 +1055,8 @@ void simd_mult(const int32* a, const int32* b, int32* result, int32 size, int32
        __m256i result_8;

        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_epi32(a);
-            b_8 = _mm256_loadu_epi32(b);
+            a_8 = _mm256_load_si256((__m256i *) a);
+            b_8 = _mm256_load_si256((__m256i *) b);
            result_8 = _mm256_mul_epi32(a_8, b_8);
            _mm256_store_si256((__m256i *) result, result_8);

@ -1069,8 +1070,8 @@ void simd_mult(const int32* a, const int32* b, int32* result, int32 size, int32
        __m128i result_4;

        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_epi32(a);
-            b_4 = _mm_loadu_epi32(b);
+            a_4 = _mm_load_si128((__m128i *) a);
+            b_4 = _mm_load_si128((__m128i *) b);
            result_4 = _mm_mul_epi32(a_4, b_4);
            _mm_store_si128((__m128i *) result, result_4);

@ -1101,9 +1102,9 @@ void simd_mult(const int32* a, const f32* b, f32* result, int32 size, int32 step
        __m512 result_16;

        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_epi32(a);
+            a_16 = _mm512_load_epi32(a);
            af_16 = _mm512_cvtepi32_ps(a_16);
-            b_16 = _mm512_loadu_ps(b);
+            b_16 = _mm512_load_ps(b);
            result_16 = _mm512_mul_ps(af_16, b_16);
            _mm512_store_ps(result, result_16);

@ -1118,9 +1119,9 @@ void simd_mult(const int32* a, const f32* b, f32* result, int32 size, int32 step
        __m256 result_8;

        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_epi32(a);
+            a_8 = _mm256_load_si256((__m256i *) a);
            af_8 = _mm256_cvtepi32_ps(a_8);
-            b_8 = _mm256_loadu_ps(b);
+            b_8 = _mm256_load_ps(b);
            result_8 = _mm256_mul_ps(af_8, b_8);
            _mm256_store_ps(result, result_8);

@ -1135,9 +1136,9 @@ void simd_mult(const int32* a, const f32* b, f32* result, int32 size, int32 step
        __m128 result_4;

        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_epi32(a);
+            a_4 = _mm_load_si128((__m128i *) a);
            af_4 = _mm_cvtepi32_ps(a_4);
-            b_4 = _mm_loadu_ps(b);
+            b_4 = _mm_load_ps(b);
            result_4 = _mm_mul_ps(af_4, b_4);
            _mm_store_ps(result, result_4);

@ -1169,9 +1170,9 @@ void simd_mult(const int32* a, const f32* b, int32* result, int32 size, int32 st
        __m512i resulti_16;

        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_epi32(a);
+            a_16 = _mm512_load_epi32(a);
            af_16 = _mm512_cvtepi32_ps(a_16);
-            b_16 = _mm512_loadu_ps(b);
+            b_16 = _mm512_load_ps(b);
            result_16 = _mm512_mul_ps(af_16, b_16);
            resulti_16 = _mm512_cvtps_epi32(result_16);
            _mm512_store_epi32(result, resulti_16);
@ -1188,9 +1189,9 @@ void simd_mult(const int32* a, const f32* b, int32* result, int32 size, int32 st
        __m256i resulti_8;

        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_epi32(a);
+            a_8 = _mm256_load_si256((__m256i *) a);
            af_8 = _mm256_cvtepi32_ps(a_8);
-            b_8 = _mm256_loadu_ps(b);
+            b_8 = _mm256_load_ps(b);
            result_8 = _mm256_mul_ps(af_8, b_8);
            resulti_8 = _mm256_cvtps_epi32(result_8);
            _mm256_store_si256((__m256i *) result, resulti_8);
@ -1207,9 +1208,9 @@ void simd_mult(const int32* a, const f32* b, int32* result, int32 size, int32 st
        __m128i resulti_4;

        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_epi32(a);
+            a_4 = _mm_load_si128((__m128i *) a);
            af_4 = _mm_cvtepi32_ps(a_4);
-            b_4 = _mm_loadu_ps(b);
+            b_4 = _mm_load_ps(b);
            result_4 = _mm_mul_ps(af_4, b_4);
            resulti_4 = _mm_cvtps_epi32(result_4);
            _mm_store_si128((__m128i *) result, resulti_4);
@ -1242,7 +1243,7 @@ void simd_mult(const int32* a, f32 b, int32* result, int32 size, int32 steps)
        __m512i resulti_16;

        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_epi32(a);
+            a_16 = _mm512_load_epi32(a);
            af_16 = _mm512_cvtepi32_ps(a_16);
            result_16 = _mm512_mul_ps(af_16, b_16);
            resulti_16 = _mm512_cvtps_epi32(result_16);
@ -1259,7 +1260,7 @@ void simd_mult(const int32* a, f32 b, int32* result, int32 size, int32 steps)
        __m256i resulti_8;

        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_epi32(a);
+            a_8 = _mm256_load_si256((__m256i *) a);
            af_8 = _mm256_cvtepi32_ps(a_8);
            result_8 = _mm256_mul_ps(af_8, b_8);
            resulti_8 = _mm256_cvtps_epi32(result_8);
@ -1276,7 +1277,7 @@ void simd_mult(const int32* a, f32 b, int32* result, int32 size, int32 steps)
        __m128i resulti_4;

        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_epi32(a);
+            a_4 = _mm_load_si128((__m128i *) a);
            af_4 = _mm_cvtepi32_ps(a_4);
            result_4 = _mm_mul_ps(af_4, b_4);
            resulti_4 = _mm_cvtps_epi32(result_4);
@ -1307,7 +1308,7 @@ void simd_div(const int32* a, f32 b, f32* result, int32 size, int32 steps)
        __m512 result_16;

        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_epi32(a);
+            a_16 = _mm512_load_epi32(a);
            af_16 = _mm512_cvtepi32_ps(a_16);
            result_16 = _mm512_div_ps(af_16, b_16);
            _mm512_store_ps(result, result_16);
@ -1323,7 +1324,7 @@ void simd_div(const int32* a, f32 b, f32* result, int32 size, int32 steps)
        __m256 result_8;

        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_epi32(a);
+            a_8 = _mm256_load_si256((__m256i *) a);
            af_8 = _mm256_cvtepi32_ps(a_8);
            result_8 = _mm256_div_ps(af_8, b_8);
            _mm256_store_ps(result, result_8);
@ -1338,7 +1339,7 @@ void simd_div(const int32* a, f32 b, f32* result, int32 size, int32 steps)
        __m128 result_4;

        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_epi32(a);
+            a_4 = _mm_load_si128((__m128i *) a);
            af_4 = _mm_cvtepi32_ps(a_4);
            result_4 = _mm_div_ps(af_4, b_4);
            _mm_store_ps(result, result_4);
@ -1367,8 +1368,8 @@ void simd_add(const int32* a, const int32* b, int32* result, int32 size, int32 s
        __m512i result_16;

        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_epi32(a);
-            b_16 = _mm512_loadu_epi32(b);
+            a_16 = _mm512_load_epi32(a);
+            b_16 = _mm512_load_epi32(b);
            result_16 = _mm512_add_epi32(a_16, b_16);
            _mm512_store_epi32(result, result_16);

@ -1382,8 +1383,8 @@ void simd_add(const int32* a, const int32* b, int32* result, int32 size, int32 s
        __m256i result_8;

        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_epi32(a);
-            b_8 = _mm256_loadu_epi32(b);
+            a_8 = _mm256_load_si256((__m256i *) a);
+            b_8 = _mm256_load_si256((__m256i *) b);
            result_8 = _mm256_add_epi32(a_8, b_8);
            _mm256_store_si256((__m256i *) result, result_8);

@ -1397,8 +1398,8 @@ void simd_add(const int32* a, const int32* b, int32* result, int32 size, int32 s
        __m128i result_4;

        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_epi32(a);
-            b_4 = _mm_loadu_epi32(b);
+            a_4 = _mm_load_si128((__m128i *) a);
+            b_4 = _mm_load_si128((__m128i *) b);
            result_4 = _mm_add_epi32(a_4, b_4);
            _mm_store_si128((__m128i *) result, result_4);

@ -1429,9 +1430,9 @@ void simd_add(const int32* a, const f32* b, f32* result, int32 size, int32 steps
        __m512 result_16;

        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_epi32(a);
+            a_16 = _mm512_load_epi32(a);
            af_16 = _mm512_cvtepi32_ps(a_16);
-            b_16 = _mm512_loadu_ps(b);
+            b_16 = _mm512_load_ps(b);
            result_16 = _mm512_add_ps(af_16, b_16);
            _mm512_store_ps(result, result_16);

@ -1446,9 +1447,9 @@ void simd_add(const int32* a, const f32* b, f32* result, int32 size, int32 steps
        __m256 result_8;

        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_epi32(a);
+            a_8 = _mm256_load_si256((__m256i *) a);
            af_8 = _mm256_cvtepi32_ps(a_8);
-            b_8 = _mm256_loadu_ps(b);
+            b_8 = _mm256_load_ps(b);
            result_8 = _mm256_add_ps(af_8, b_8);
            _mm256_store_ps(result, result_8);

@ -1463,9 +1464,9 @@ void simd_add(const int32* a, const f32* b, f32* result, int32 size, int32 steps
        __m128 result_4;

        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_epi32(a);
+            a_4 = _mm_load_si128((__m128i *) a);
            af_4 = _mm_cvtepi32_ps(a_4);
-            b_4 = _mm_loadu_ps(b);
+            b_4 = _mm_load_ps(b);
            result_4 = _mm_add_ps(af_4, b_4);
            _mm_store_ps(result, result_4);

@ -1497,9 +1498,9 @@ void simd_add(const int32* a, const f32* b, int32* result, int32 size, int32 ste
        __m512i resulti_16;

        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_epi32(a);
+            a_16 = _mm512_load_epi32(a);
            af_16 = _mm512_cvtepi32_ps(a_16);
-            b_16 = _mm512_loadu_ps(b);
+            b_16 = _mm512_load_ps(b);
            result_16 = _mm512_add_ps(af_16, b_16);
            resulti_16 = _mm512_cvtps_epi32(result_16);
            _mm512_store_epi32(result, resulti_16);
@ -1516,9 +1517,9 @@ void simd_add(const int32* a, const f32* b, int32* result, int32 size, int32 ste
        __m256i resulti_8;

        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_epi32(a);
+            a_8 = _mm256_load_si256((__m256i *) a);
            af_8 = _mm256_cvtepi32_ps(a_8);
-            b_8 = _mm256_loadu_ps(b);
+            b_8 = _mm256_load_ps(b);
            result_8 = _mm256_add_ps(af_8, b_8);
            resulti_8 = _mm256_cvtps_epi32(result_8);
            _mm256_store_si256((__m256i *) result, resulti_8);
@ -1535,9 +1536,9 @@ void simd_add(const int32* a, const f32* b, int32* result, int32 size, int32 ste
        __m128i resulti_4;

        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_epi32(a);
+            a_4 = _mm_load_si128((__m128i *) a);
            af_4 = _mm_cvtepi32_ps(a_4);
-            b_4 = _mm_loadu_ps(b);
+            b_4 = _mm_load_ps(b);
            result_4 = _mm_add_ps(af_4, b_4);
            resulti_4 = _mm_cvtps_epi32(result_4);
            _mm_store_si128((__m128i *) result, resulti_4);
@ -1560,8 +1561,8 @@ void simd_add(const int32* a, const f32* b, int32* result, int32 size, int32 ste
 // WARNING: only works with SSE4.2
 // WARNING: incl. \0 both strings must be <= 16 length
+// Returns true if both strings are identical, false otherwise.
+// Both pointers must reference at least 16 readable bytes since a full 128-bit vector is loaded from each.
 bool str_compare_avx512(const char* str1, const char* str2) {
-    __m128i s1 = _mm_loadu_si128((const __m128i *)  str1);
-    __m128i s2 = _mm_loadu_si128((const __m128i *)  str2);
+    // Strings are generally not 16-byte aligned, an aligned load (_mm_load_si128) would fault on them
+    __m128i s1 = _mm_loadu_si128((const __m128i *) str1);
+    __m128i s2 = _mm_loadu_si128((const __m128i *) str2);

-    return _mm_cmpistrc(s1, s2, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH) == 0;
+    // With negative polarity the carry flag is set if at least one byte position differs,
+    // positions behind both string terminators always count as equal.
+    return _mm_cmpistrc(s1, s2, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY) == 0;
 }
@ -1580,7 +1581,7 @@ endian_swap(const int* val, int* result, int32 size, int32 steps)
        );

        for (i = 0; i <= size - steps; i += steps) {
-            __m512i vec = _mm512_loadu_si512((const __m512i *) (val + i));
+            __m512i vec = _mm512_load_si512((const __m512i *) (val + i));
            vec = _mm512_shuffle_epi8(vec, mask_512);

            _mm512_storeu_si512((__m512i *) (result + i), vec);
@ -1594,7 +1595,7 @@ endian_swap(const int* val, int* result, int32 size, int32 steps)
        );

        for (i = 0; i <= size - steps; i += steps) {
-            __m256i vec = _mm256_loadu_si256((const __m256i *) (val + i));
+            __m256i vec = _mm256_load_si256((const __m256i *) (val + i));
            vec = _mm256_shuffle_epi8(vec, mask_256);

            _mm256_storeu_si256((__m256i *) (result + i), vec);
@ -1608,7 +1609,7 @@ endian_swap(const int* val, int* result, int32 size, int32 steps)
        );

        for (i = 0; i <= size - steps; i += steps) {
-             __m128i vec = _mm_loadu_si128((const __m128i *) (val + i));
+             __m128i vec = _mm_load_si128((__m128i *) (const __m128i *) (val + i));
            vec = _mm_shuffle_epi8(vec, mask_128);

            _mm_storeu_si128((__m128i *) (result + i), vec);
--- a/stdlib/simd/SIMD_I8.h
+++ b/stdlib/simd/SIMD_I8.h
@ -40,7 +40,7 @@ struct int8_64 {
+// Loads 128 bits (16 bytes) starting at mem into an int8_16.
+// NOTE(review): _mm_load_si128 is the aligned load, mem has to be 16-byte aligned - confirm all callers guarantee this
 inline int8_16 load_int8_16(const int8* mem)
 {
    int8_16 simd;
-    simd.s = _mm_loadu_epi8(mem);
+    simd.s = _mm_load_si128((__m128i *) mem);

    return simd;
 }
@ -63,7 +63,7 @@ inline void unload_int8_16(int8_16 a, int8 *array) { _mm_store_si128((__m128i *)
+// Loads 256 bits (32 bytes) starting at mem into an int8_32.
+// NOTE(review): _mm256_load_si256 is the aligned load, mem has to be 32-byte aligned - confirm all callers guarantee this
 inline int8_32 load_int8_32(const int8* mem)
 {
    int8_32 simd;
-    simd.s = _mm256_loadu_epi8(mem);
+    simd.s = _mm256_load_si256((__m256i *) mem);

    return simd;
 }
@ -86,7 +86,7 @@ inline void unload_int8_32(int8_32 a, int8 *array) { _mm256_store_si256((__m256i
+// Loads 512 bits (64 bytes) starting at mem into an int8_64.
+// NOTE(review): _mm512_load_si512 is the aligned load, mem has to be 64-byte aligned - confirm all callers guarantee this
 inline int8_64 load_int8_64(const int8* mem)
 {
    int8_64 simd;
-    simd.s = _mm512_loadu_epi8(mem);
+    simd.s = _mm512_load_si512((__m512i *) mem);

    return simd;
 }
@ -830,19 +830,19 @@ inline
 f32 simd_mult(const int8* a, f32 b, int32 size, int32 steps)
 {
    if (steps == 16) {
-        __m512i a_16 = _mm512_loadu_epi8(a);
+        __m512i a_16 = _mm512_load_si512((__m512i *) a);
        __m512 af_16 = _mm512_cvtepi32_ps(a_16);
        __m512 b_16 = _mm512_set1_ps(b);

        __m512 result = _mm512_mul_ps(af_16, b_16);
    } else if (steps == 8) {
-        __m256i a_8 = _mm256_loadu_epi8(a);
+        __m256i a_8 = _mm256_load_si256((__m256i *) a);
        __m256 af_8 = _mm256_cvtepi32_ps(a_8);
        __m256 b_8 = _mm256_set1_ps(b);

        __m256 result = _mm256_mul_ps(af_8, b_8);
    } else if (steps == 4) {
-        __m128i a_4 = _mm_loadu_epi8(a);
+        __m128i a_4 = _mm_load_si128((__m128i *) a);
        __m128 af_4 = _mm_cvtepi32_ps(a_4);
        __m128 b_4 = _mm_set1_ps(b);

@ -855,11 +855,11 @@ f32 simd_mult(const int8* a, f32 b, int32 size, int32 steps)

 bool simd_compare_64(const byte* a, const byte* b)
 {
-    __m256i chunk1_a = _mm256_loadu_si256((__m256i*) a);
-    __m256i chunk1_b = _mm256_loadu_si256((__m256i*) b);
+    __m256i chunk1_a = _mm256_load_si256((__m256i*) a);
+    __m256i chunk1_b = _mm256_load_si256((__m256i*) b);

-    __m256i chunk2_a = _mm256_loadu_si256((__m256i*) (a + 32));
-    __m256i chunk2_b = _mm256_loadu_si256((__m256i*) (b + 32));
+    __m256i chunk2_a = _mm256_load_si256((__m256i*) (a + 32));
+    __m256i chunk2_b = _mm256_load_si256((__m256i*) (b + 32));

    __m256i result1 = _mm256_cmpeq_epi8(chunk1_a, chunk1_b);
    __m256i result2 = _mm256_cmpeq_epi8(chunk2_a, chunk2_b);
@ -879,8 +879,8 @@ int simd_compare(const byte* a, const byte* b, uint32 size, uint32 steps = 8) {
            __mmask64 result_mask;

            for (; i <= size - 64; i += 64) {  // 64 bytes per iteration
-                a_16 = _mm512_loadu_si512((__m512i*) a);
-                b_16 = _mm512_loadu_si512((__m512i*) b);
+                a_16 = _mm512_load_si512((__m512i*) a);
+                b_16 = _mm512_load_si512((__m512i*) b);

                result_mask = _mm512_cmpeq_epi8_mask(a_16, b_16);

@ -905,8 +905,8 @@ int simd_compare(const byte* a, const byte* b, uint32 size, uint32 steps = 8) {
            __m256i result_8;

            for (; i <= size - steps; i += steps) {
-                a_8 = _mm256_loadu_si256((__m256i*) a);
-                b_8 = _mm256_loadu_si256((__m256i*) b);
+                a_8 = _mm256_load_si256((__m256i*) a);
+                b_8 = _mm256_load_si256((__m256i*) b);

                result_8 = _mm256_cmpeq_epi8(a_8, b_8);

@ -929,8 +929,8 @@ int simd_compare(const byte* a, const byte* b, uint32 size, uint32 steps = 8) {
            __m128i result_4;

            for (; i <= size - steps; i += steps) {
-                a_4 = _mm_loadu_si128((__m128i*) a);
-                b_4 = _mm_loadu_si128((__m128i*) b);
+                a_4 = _mm_load_si128((__m128i*) a);
+                b_4 = _mm_load_si128((__m128i*) b);

                result_4 = _mm_cmpeq_epi8(a_4, b_4);

--- a/stdlib/simd/SIMD_SVML.h
+++ b/stdlib/simd/SIMD_SVML.h
@ -25,7 +25,7 @@
            result[i] = a_array[i] / b_array[i];
        }

-        return _mm_loadu_si128((__m128i*)result);
+        return _mm_load_si128((__m128i*)result);
    }

    inline __m256i _mm256_div_epi32(__m256i a, __m256i b) {
@ -38,7 +38,7 @@
            result[i] = a_array[i] / b_array[i];
        }

-        return _mm256_loadu_si256((__m256i*)result);
+        return _mm256_load_si256((__m256i*)result);
    }

    inline __m512i _mm512_div_epi32(__m512i a, __m512i b) {
@ -51,7 +51,7 @@
            result[i] = a_array[i] / b_array[i];
        }

-        return _mm512_loadu_si512((__m512i*)result);
+        return _mm512_load_si512((__m512i*)result);
    }

    inline __m128 _mm_sin_ps(__m128 a) {
@ -60,7 +60,7 @@
        for (int i = 0; i < 4; ++i) {
            result[i] = sinf(a_array[i]);
        }
-        return _mm_loadu_ps(result);
+        return _mm_load_ps(result);
    }

    inline __m128 _mm_cos_ps(__m128 a) {
@ -69,7 +69,7 @@
        for (int i = 0; i < 4; ++i) {
            result[i] = cosf(a_array[i]);
        }
-        return _mm_loadu_ps(result);
+        return _mm_load_ps(result);
    }

    inline __m128 _mm_asin_ps(__m128 a) {
@ -78,7 +78,7 @@
        for (int i = 0; i < 4; ++i) {
            result[i] = asinf(a_array[i]);
        }
-        return _mm_loadu_ps(result);
+        return _mm_load_ps(result);
    }

    inline __m128 _mm_acos_ps(__m128 a) {
@ -87,7 +87,7 @@
        for (int i = 0; i < 4; ++i) {
            result[i] = acosf(a_array[i]);
        }
-        return _mm_loadu_ps(result);
+        return _mm_load_ps(result);
    }

    inline __m256 _mm256_sin_ps(__m256 a) {
@ -96,7 +96,7 @@
        for (int i = 0; i < 8; ++i) {
            result[i] = sinf(a_array[i]);
        }
-        return _mm256_loadu_ps(result);
+        return _mm256_load_ps(result);
    }

    inline __m256 _mm256_cos_ps(__m256 a) {
@ -105,7 +105,7 @@
        for (int i = 0; i < 8; ++i) {
            result[i] = cosf(a_array[i]);
        }
-        return _mm256_loadu_ps(result);
+        return _mm256_load_ps(result);
    }

    inline __m256 _mm256_asin_ps(__m256 a) {
@ -114,7 +114,7 @@
        for (int i = 0; i < 8; ++i) {
            result[i] = asinf(a_array[i]);
        }
-        return _mm256_loadu_ps(result);
+        return _mm256_load_ps(result);
    }

    inline __m256 _mm256_acos_ps(__m256 a) {
@ -123,7 +123,7 @@
        for (int i = 0; i < 16; ++i) {
            result[i] = acosf(a_array[i]);
        }
-        return _mm256_loadu_ps(result);
+        return _mm256_load_ps(result);
    }

    inline __m512 _mm512_sin_ps(__m512 a) {
@ -132,7 +132,7 @@
        for (int i = 0; i < 16; ++i) {
            result[i] = sinf(a_array[i]);
        }
-        return _mm512_loadu_ps(result);
+        return _mm512_load_ps(result);
    }

    inline __m512 _mm512_cos_ps(__m512 a) {
@ -141,7 +141,7 @@
        for (int i = 0; i < 16; ++i) {
            result[i] = cosf(a_array[i]);
        }
-        return _mm512_loadu_ps(result);
+        return _mm512_load_ps(result);
    }

    inline __m512 _mm512_asin_ps(__m512 a) {
@ -150,7 +150,7 @@
        for (int i = 0; i < 16; ++i) {
            result[i] = asinf(a_array[i]);
        }
-        return _mm512_loadu_ps(result);
+        return _mm512_load_ps(result);
    }

    inline __m512 _mm512_acos_ps(__m512 a) {
@ -159,7 +159,7 @@
        for (int i = 0; i < 16; ++i) {
            result[i] = acosf(a_array[i]);
        }
-        return _mm512_loadu_ps(result);
+        return _mm512_load_ps(result);
    }
 #endif

--- a/ui/UIAttribute.h
+++ b/ui/UIAttribute.h
@ -96,9 +96,23 @@ enum UIAttributeType {

 UIAttribute* ui_attribute_from_group(UIAttributeGroup* group, UIAttributeType type)
 {
-    for (int i = 0; i < UI_ATTRIBUTE_TYPE_SIZE && i <= type; ++i) {
-        if (group->attributes[i].attribute_id == type) {
-            return &group->attributes[i];
+    if (!group->attributes) {
+        return NULL;
+    }
+
+    int32 left = 0;
+    int32 right = type;
+
+    // Binary search since attributes are sorted by attribute_id
+    while (left <= right) {
+        int32 mid = left + (right - left) / 2;
+
+        if (group->attributes[mid].attribute_id == type) {
+            return &group->attributes[mid];
+        }  else if (group->attributes[mid].attribute_id < type) {
+            left = mid + 1;
+        }  else {
+            right = mid - 1;
        }
    }

@ -199,98 +213,4 @@ constexpr const char* ui_attribute_type_to_string_const(UIAttributeType e)
    return NULL;
 }

-const char* ui_attribute_type_to_string(UIAttributeType e)
-{
-    switch (e) {
-        case UI_ATTRIBUTE_TYPE_TYPE:
-            return "type";
-        case UI_ATTRIBUTE_TYPE_STYLE:
-            return "style";
-        case UI_ATTRIBUTE_TYPE_DIMENSION_X:
-            return "x";
-        case UI_ATTRIBUTE_TYPE_DIMENSION_Y:
-            return "y";
-        case UI_ATTRIBUTE_TYPE_DIMENSION_WIDTH:
-            return "width";
-        case UI_ATTRIBUTE_TYPE_DIMENSION_HEIGHT:
-            return "height";
-        case UI_ATTRIBUTE_TYPE_FONT_NAME:
-            return "font_name";
-        case UI_ATTRIBUTE_TYPE_FONT_COLOR:
-            return "font_color";
-        case UI_ATTRIBUTE_TYPE_FONT_SIZE:
-            return "font_size";
-        case UI_ATTRIBUTE_TYPE_FONT_WEIGHT:
-            return "font_weight";
-        case UI_ATTRIBUTE_TYPE_FONT_LINE_HEIGHT:
-            return "font_line_height";
-        case UI_ATTRIBUTE_TYPE_ALIGN_H:
-            return "align_h";
-        case UI_ATTRIBUTE_TYPE_ALIGN_V:
-            return "align_v";
-        case UI_ATTRIBUTE_TYPE_ZINDEX:
-            return "zindex";
-        case UI_ATTRIBUTE_TYPE_BACKGROUND_COLOR:
-            return "background_color";
-        case UI_ATTRIBUTE_TYPE_BACKGROUND_IMG:
-            return "background_img";
-        case UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_OPACITY:
-            return "background_img_opacity";
-        case UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_V:
-            return "background_img_position_v";
-        case UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_H:
-            return "background_img_position_h";
-        case UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_STYLE:
-            return "background_img_style";
-        case UI_ATTRIBUTE_TYPE_BORDER_COLOR:
-            return "border_color";
-        case UI_ATTRIBUTE_TYPE_BORDER_WIDTH:
-            return "border_width";
-        case UI_ATTRIBUTE_TYPE_BORDER_TOP_COLOR:
-            return "border_top_color";
-        case UI_ATTRIBUTE_TYPE_BORDER_TOP_WIDTH:
-            return "border_top_width";
-        case UI_ATTRIBUTE_TYPE_BORDER_RIGHT_COLOR:
-            return "border_right_color";
-        case UI_ATTRIBUTE_TYPE_BORDER_RIGHT_WIDTH:
-            return "border_right_width";
-        case UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_COLOR:
-            return "border_bottom_color";
-        case UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_WIDTH:
-            return "border_bottom_width";
-        case UI_ATTRIBUTE_TYPE_BORDER_LEFT_COLOR:
-            return "border_left_color";
-        case UI_ATTRIBUTE_TYPE_BORDER_LEFT_WIDTH:
-            return "border_left_width";
-        case UI_ATTRIBUTE_TYPE_PADDING:
-            return "padding";
-        case UI_ATTRIBUTE_TYPE_PADDING_TOP:
-            return "padding_top";
-        case UI_ATTRIBUTE_TYPE_PADDING_RIGHT:
-            return "padding_right";
-        case UI_ATTRIBUTE_TYPE_PADDING_BOTTOM:
-            return "padding_bottom";
-        case UI_ATTRIBUTE_TYPE_PADDING_LEFT:
-            return "padding_left";
-        case UI_ATTRIBUTE_TYPE_SHADOW_INNER_COLOR:
-            return "shadow_inner_color";
-        case UI_ATTRIBUTE_TYPE_SHADOW_INNER_ANGLE:
-            return "shadow_inner_angle";
-        case UI_ATTRIBUTE_TYPE_SHADOW_INNER_DISTANCE:
-            return "shadow_inner_distance";
-        case UI_ATTRIBUTE_TYPE_SHADOW_OUTER_COLOR:
-            return "shadow_outer_color";
-        case UI_ATTRIBUTE_TYPE_SHADOW_OUTER_ANGLE:
-            return "shadow_outer_angle";
-        case UI_ATTRIBUTE_TYPE_SHADOW_OUTER_DISTANCE:
-            return "shadow_outer_distance";
-        case UI_ATTRIBUTE_TYPE_TRANSITION_ANIMATION:
-            return "transition_animation";
-        case UI_ATTRIBUTE_TYPE_TRANSITION_DURATION:
-            return "transition_duration";
-    }
-
-    return NULL;
-}
-
 #endif
--- a/ui/UIElement.h
+++ b/ui/UIElement.h
@ -5,6 +5,9 @@
 #include "UIElementType.h"
 #include "../object/Vertex.h"

+#include <immintrin.h>
+#include <xmmintrin.h>
+
 struct UIElementDimension {
 	int16 x1;
 	int16 y1;
@ -22,6 +25,7 @@ struct UIElement {
    const char* name;
    int32 id;
    UIElementType type;
+    bool is_dynamic;

    int16 window_id;
    int16 panel_id;
--- a/ui/UIElementType.h
+++ b/ui/UIElementType.h
@ -53,36 +53,4 @@ constexpr const char* ui_element_type_to_string_const(UIElementType e)
    return NULL;
 }

-const char* ui_element_type_to_string(UIElementType e)
-{
-    switch (e) {
-        case UI_ELEMENT_TYPE_BUTTON:
-            return "button";
-        case UI_ELEMENT_TYPE_SELECT:
-            return "select";
-        case UI_ELEMENT_TYPE_DROPDOWN:
-            return "dropdown";
-        case UI_ELEMENT_TYPE_TEXTFIELD:
-            return "textfield";
-        case UI_ELEMENT_TYPE_TEXTAREA:
-            return "textarea";
-        case UI_ELEMENT_TYPE_IMAGE:
-            return "image";
-        case UI_ELEMENT_TYPE_TEXT:
-            return "text";
-        case UI_ELEMENT_TYPE_LINK:
-            return "link";
-        case UI_ELEMENT_TYPE_TABLE:
-            return "table";
-        case UI_ELEMENT_TYPE_VIEW_WINDOW:
-            return "view_window";
-        case UI_ELEMENT_TYPE_VIEW_PANEL:
-            return "view_panel";
-        case UI_ELEMENT_TYPE_VIEW_TAB:
-            return "view_tab";
-    }
-
-    return NULL;
-}
-
 #endif
--- a/ui/UILayout.h
+++ b/ui/UILayout.h
@ -7,30 +7,136 @@

 // Modified for every scene
 struct UILayout {
-    int32 ui_deadzone_size = 5;
-    UIElementDimension ui_deadzone[5];
+    // Dimensions of the chroma code maps below (each map holds width * height entries).
+    // The maps represent in color codes where interactable ui elements are located in the game window.
+    // Size is based on screen size (we don't need full screen size since we assume an interactable element is at least 4 pixels width and height)
+    //      width = 25% of screen size
+    //      height = 25% of screen size
+    uint16 width;
+    uint16 height;

-    int32 element_hoverable_size;
-    int32 element_hoverable_pos;
-	UIElementDimension* elements_hoverable;
+    // Chroma code map that contains all UI elements, including dynamic ones (e.g. movable windows)
+    uint32* ui_chroma_codes;

-    int32 element_interactible_size;
-    int32 element_interactible_pos;
-	UIElementDimension* elements_interactible;
+    // Chroma code map that contains only constant UI elements that usually don't change (e.g. HUD)
+    uint32* ui_chroma_codes_static;

-    // @question Since we use a hashmap below, do we even need the size?
-    //      Isn't the size exactly the same as the hash_map buf size
-    int32 element_size;
-    int32 element_pos;
-    HashMap hash_map; // Used to directly find element by name
-
-    // @question Do we even need this or should the hashmap values be the elements directly?
-    //  In other places (e.g. theme) we simply define a byte* data variable which actually holds the info.
-    UIElement* elements;
+    // Used to directly find an element by name
+    // The values of the hash map entries are the UIElements
+    HashMap hash_map;

    int32 vertex_size;
    int32 vertex_pos;
    Vertex3DTextureColorIndex* vertices;
 };

+// Returns the chroma code stored for the screen position (x, y).
+// The chroma code map has 1/4 of the screen resolution in each direction, therefore both coordinates are divided by 4.
+// Returns 0 (the value an empty map is initialized with) if the position is outside of the map.
+inline
+uint32 layout_element_from_location(UILayout* layout, uint16 x, uint16 y)
+{
+    int32 map_x = x / 4;
+    int32 map_y = y / 4;
+
+    if (map_x >= layout->width || map_y >= layout->height) {
+        return 0;
+    }
+
+    // The row must be calculated before multiplying with the row width,
+    // width * y / 4 would end up in the middle of a row whenever y is not a multiple of 4
+    return layout->ui_chroma_codes[layout->width * map_y + map_x];
+}
+
+// Draws the ids of all elements whose is_dynamic flag equals is_dynamic into chroma_codes.
+// chroma_codes must have layout->width * layout->height entries.
+// @question Are the dimension values below even absolute? They may be in relation to the parent?!
+inline
+void layout_chroma_codes_draw(UILayout* layout, uint32* chroma_codes, bool is_dynamic)
+{
+    for (int32 i = 0; i < layout->hash_map.buf.count; ++i) {
+        // Walk the complete chain of this bucket so that chained entries are filtered exactly like the first entry
+        for (HashEntry* entry = (HashEntry *) layout->hash_map.table[i]; entry; entry = entry->next) {
+            UIElement* element = (UIElement *) entry->value;
+
+            if (element->is_dynamic != is_dynamic) {
+                continue;
+            }
+
+            // The chroma code map has 1/4 of the screen resolution in each direction
+            int32 y_start = element->dimension.y1 / 4;
+            int32 y_end = element->dimension.y2 / 4;
+            int32 x_start = element->dimension.x1 / 4;
+            int32 x_end = element->dimension.x2 / 4;
+
+            // Clamp to the map so that elements which are partially outside cannot write out of bounds
+            if (y_start < 0) {
+                y_start = 0;
+            }
+
+            if (x_start < 0) {
+                x_start = 0;
+            }
+
+            if (y_end > layout->height) {
+                y_end = layout->height;
+            }
+
+            if (x_end > layout->width) {
+                x_end = layout->width;
+            }
+
+            for (int32 y = y_start; y < y_end; ++y) {
+                int32 y_offset = layout->width * y;
+                for (int32 x = x_start; x < x_end; ++x) {
+                    chroma_codes[y_offset + x] = (uint32) element->id;
+                }
+            }
+        }
+    }
+}
+
+// Rebuilds ui_chroma_codes: copies the static map and draws all dynamic elements on top of it.
+// This function should only get called if the location of a UI Element changes
+// @performance How to handle moving elements (= dragging a window). We don't want to update this while dragging!
+void layout_chroma_codes_update(UILayout* layout)
+{
+    // Reset all (the static elements are restored from the static map)
+    memcpy(layout->ui_chroma_codes, layout->ui_chroma_codes_static, layout->width * layout->height * sizeof(uint32));
+
+    // Only the dynamic elements are missing after the copy
+    layout_chroma_codes_draw(layout, layout->ui_chroma_codes, true);
+}
+
+// Rebuilds ui_chroma_codes_static from all elements that are not dynamic.
+void layout_chroma_codes_update_static(UILayout* layout)
+{
+    // Reset all
+    memset(layout->ui_chroma_codes_static, 0, layout->width * layout->height * sizeof(uint32));
+
+    // The static map must only contain the elements which are not dynamic
+    layout_chroma_codes_draw(layout, layout->ui_chroma_codes_static, false);
+}
+
 #endif
--- a/ui/UITheme.h
+++ b/ui/UITheme.h
@ -66,6 +66,11 @@ inline
 UIAttributeGroup* theme_style_group(UIThemeStyle* theme, const char* group_name)
 {
+    // Look up the hash map entry for group_name, its value is added to theme->data below to locate the group
    HashEntryInt64* entry = (HashEntryInt64 *) hashmap_get_entry(&theme->hash_map, group_name);
+    if (!entry) {
+        // hashmap_get_entry returned no entry (presumably an unknown group name):
+        // trip the assert and return NULL instead of dereferencing a null pointer
+        ASSERT_SIMPLE(false);
+        return NULL;
+    }
+
    return (UIAttributeGroup *) (theme->data + entry->value);
 }

@ -73,6 +78,11 @@ inline
 UIAttributeGroup* theme_style_group(UIThemeStyle* theme, const char* group_name, int32 group_id)
 {
+    // Same lookup as the overload without group_id, but group_id is additionally passed to hashmap_get_entry
    HashEntryInt64* entry = (HashEntryInt64 *) hashmap_get_entry(&theme->hash_map, group_name, group_id);
+    if (!entry) {
+        // hashmap_get_entry returned no entry (presumably an unknown group):
+        // trip the assert and return NULL instead of dereferencing a null pointer
+        ASSERT_SIMPLE(false);
+        return NULL;
+    }
+
    return (UIAttributeGroup *) (theme->data + entry->value);
 }

@ -229,7 +239,7 @@ void theme_from_file_txt(

            *temp = '\0';
            for (int32 j = 0; j < UI_ELEMENT_TYPE_SIZE; ++j) {
-                if (strcmp(str, ui_element_type_to_string((UIElementType) j)) == 0) {
+                if (strcmp(str, ui_element_type_to_string_const((UIElementType) j)) == 0) {

                    attribute.value_int = j;
                    break;
@ -246,13 +256,11 @@ void theme_from_file_txt(
            }

            *temp = '\0';
-            ++pos;
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_FONT_COLOR), attribute_name) == 0) {
            ++pos; // Skip '#'

            attribute.attribute_id = UI_ATTRIBUTE_TYPE_FONT_COLOR;
            uint32 value = (uint32) strtoul(pos, &pos, 16);
-            pos += 4;

            attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f;
            attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f;
@ -260,28 +268,27 @@ void theme_from_file_txt(
            attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f;
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_FONT_SIZE), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_FONT_SIZE;
-            attribute.value_float = strtof(pos, &pos); ++pos;
+            attribute.value_float = strtof(pos, &pos);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_FONT_WEIGHT), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_FONT_WEIGHT;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_FONT_LINE_HEIGHT), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_FONT_LINE_HEIGHT;
-            attribute.value_float = strtof(pos, &pos); ++pos;
+            attribute.value_float = strtof(pos, &pos);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_ALIGN_H), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_ALIGN_H;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_ALIGN_V), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_ALIGN_V;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_ZINDEX), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_ZINDEX;
-            attribute.value_float = SWAP_ENDIAN_LITTLE(strtof(pos, &pos)); ++pos;
+            attribute.value_float = SWAP_ENDIAN_LITTLE(strtof(pos, &pos));
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_COLOR), attribute_name) == 0) {
            ++pos; // Skip '#'

            attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_COLOR;
            uint32 value = (uint32) strtoul(pos, &pos, 16);
-            pos += 4;

            attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f;
            attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f;
@ -298,22 +305,21 @@ void theme_from_file_txt(
            attribute.value_str[i] = '\0';
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_OPACITY), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_OPACITY;
-            attribute.value_float = SWAP_ENDIAN_LITTLE(strtof(pos, &pos)); ++pos;
+            attribute.value_float = SWAP_ENDIAN_LITTLE(strtof(pos, &pos));
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_V), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_V;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_H), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_H;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_STYLE), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_STYLE;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_COLOR), attribute_name) == 0) {
            ++pos; // Skip '#'

            attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_COLOR;
            uint32 value = (uint32) strtoul(pos, &pos, 16);
-            pos += 4;

            attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f;
            attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f;
@ -321,13 +327,12 @@ void theme_from_file_txt(
            attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f;
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_WIDTH), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_WIDTH;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_TOP_COLOR), attribute_name) == 0) {
            ++pos; // Skip '#'

            attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_TOP_COLOR;
            uint32 value = (uint32) strtoul(pos, &pos, 16);
-            pos += 4;

            attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f;
            attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f;
@ -335,13 +340,12 @@ void theme_from_file_txt(
            attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f;
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_TOP_WIDTH), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_TOP_WIDTH;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_RIGHT_COLOR), attribute_name) == 0) {
            ++pos; // Skip '#'

            attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_RIGHT_COLOR;
            uint32 value = (uint32) strtoul(pos, &pos, 16);
-            pos += 4;

            attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f;
            attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f;
@ -349,13 +353,12 @@ void theme_from_file_txt(
            attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f;
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_RIGHT_WIDTH), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_RIGHT_WIDTH;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_COLOR), attribute_name) == 0) {
            ++pos; // Skip '#'

            attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_COLOR;
            uint32 value = (uint32) strtoul(pos, &pos, 16);
-            pos += 4;

            attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f;
            attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f;
@ -363,13 +366,12 @@ void theme_from_file_txt(
            attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f;
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_WIDTH), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_WIDTH;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_LEFT_COLOR), attribute_name) == 0) {
            ++pos; // Skip '#'

            attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_LEFT_COLOR;
            uint32 value = (uint32) strtoul(pos, &pos, 16);
-            pos += 4;

            attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f;
            attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f;
@ -377,28 +379,27 @@ void theme_from_file_txt(
            attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f;
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_LEFT_WIDTH), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_LEFT_WIDTH;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_PADDING), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_PADDING;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_PADDING_TOP), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_PADDING_TOP;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_PADDING_RIGHT), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_PADDING_RIGHT;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_PADDING_BOTTOM), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_PADDING_BOTTOM;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_PADDING_LEFT), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_PADDING_LEFT;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_INNER_COLOR), attribute_name) == 0) {
            ++pos; // Skip '#'

            attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_INNER_COLOR;
            uint32 value = (uint32) strtoul(pos, &pos, 16);
-            pos += 4;

            attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f;
            attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f;
@ -406,16 +407,15 @@ void theme_from_file_txt(
            attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f;
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_INNER_ANGLE), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_INNER_ANGLE;
-            attribute.value_float = strtof(pos, &pos); ++pos;
+            attribute.value_float = strtof(pos, &pos);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_INNER_DISTANCE), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_INNER_DISTANCE;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_OUTER_COLOR), attribute_name) == 0) {
            ++pos; // Skip '#'

            attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_OUTER_COLOR;
            uint32 value = (uint32) strtoul(pos, &pos, 16);
-            pos += 4;

            attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f;
            attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f;
@ -423,16 +423,16 @@ void theme_from_file_txt(
            attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f;
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_OUTER_ANGLE), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_OUTER_ANGLE;
-            attribute.value_float = strtof(pos, &pos); ++pos;
+            attribute.value_float = strtof(pos, &pos);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_OUTER_DISTANCE), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_OUTER_DISTANCE;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_TRANSITION_ANIMATION), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_TRANSITION_ANIMATION;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
        } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_TRANSITION_DURATION), attribute_name) == 0) {
            attribute.attribute_id = UI_ATTRIBUTE_TYPE_TRANSITION_DURATION;
-            attribute.value_float = strtof(pos, &pos); ++pos;
+            attribute.value_float = strtof(pos, &pos);
        } else {
            while (*pos != '\n' && *pos != '\0') {
                ++pos;
@ -487,12 +487,20 @@ void theme_from_file(
    // Of course we still need to populate the data using hashmap_load()
    // The value is a int64 (because this is the value of the chunk buffer size but the hashmap only allows int32)
    hashmap_create(&theme->hash_map, (int32) SWAP_ENDIAN_LITTLE(*((uint64 *) pos)), sizeof(HashEntryInt64), theme->data);
+
+    const byte* start = theme->hash_map.buf.memory;
    pos += hashmap_load(&theme->hash_map, pos);

    // theme data
    // Layout: first load the size of the group, then load the individual attributes
    for (int32 i = 0; i < theme->hash_map.buf.count; ++i) {
+        if (!theme->hash_map.table[i]) {
+            continue;
+        }
+
        HashEntryInt64* entry = (HashEntryInt64 *) theme->hash_map.table[i];
+
+        pos = start + entry->value;
        UIAttributeGroup* group = (UIAttributeGroup *) (theme->data + entry->value);

        group->attribute_size = SWAP_ENDIAN_LITTLE(*((int32 *) pos));
@ -501,6 +509,21 @@ void theme_from_file(
        // @performance The UIAttribute contains a char array which makes this WAY larger than it needs to be in 99% of the cases
        memcpy(group->attributes, pos, group->attribute_size * sizeof(UIAttribute));
        pos += group->attribute_size * sizeof(UIAttribute);
+
+        // load all the next elements
+        while (entry->next) {
+            pos = start + entry->value;
+            group = (UIAttributeGroup *) (theme->data + entry->value);
+
+            group->attribute_size = SWAP_ENDIAN_LITTLE(*((int32 *) pos));
+            pos += sizeof(group->attribute_size);
+
+            // @performance The UIAttribute contains a char array which makes this WAY larger than it needs to be in 99% of the cases
+            memcpy(group->attributes, pos, group->attribute_size * sizeof(UIAttribute));
+            pos += group->attribute_size * sizeof(UIAttribute);
+
+            entry = entry->next;
+        }
    }
 }

@ -541,7 +564,7 @@ void theme_to_file(
    //      I also don't want to add a size variable to the theme as it is useless in all other cases
    file.size = theme_size(theme);

-    file.content = ring_get_memory(ring, file.size, 64);
+    file.content = ring_get_memory(ring, file.size, 64, true);
    byte* pos = file.content;

    // version
@ -549,12 +572,19 @@ void theme_to_file(
    pos += sizeof(theme->version);

    // hashmap
+    byte* start = pos;
    pos += hashmap_dump(&theme->hash_map, pos);

    // theme data
    // Layout: first save the size of the group, then save the individual attributes
    for (int32 i = 0; i < theme->hash_map.buf.count; ++i) {
+        if (!theme->hash_map.table[i]) {
+            continue;
+        }
+
        HashEntryInt64* entry = (HashEntryInt64 *) theme->hash_map.table[i];
+
+        pos = start + entry->value;
        UIAttributeGroup* group = (UIAttributeGroup *) (theme->data + entry->value);

        *((int32 *) pos) = SWAP_ENDIAN_LITTLE(group->attribute_size);
@ -562,7 +592,22 @@ void theme_to_file(

        // @performance The UIAttribute contains a char array which makes this WAY larger than it needs to be in 99% of the cases
        memcpy(pos, group->attributes, group->attribute_size * sizeof(UIAttribute));
-        pos += group->attribute_size * sizeof(UIAttribute);
+        pos += sizeof(UIAttribute);
+
+        // save all the next elements
+        while (entry->next) {
+            pos = start + entry->value;
+            group = (UIAttributeGroup *) (theme->data + entry->value);
+
+            *((int32 *) pos) = SWAP_ENDIAN_LITTLE(group->attribute_size);
+            pos += sizeof(group->attribute_size);
+
+            // @performance The UIAttribute contains a char array which makes this WAY larger than it needs to be in 99% of the cases
+            memcpy(pos, group->attributes, group->attribute_size * sizeof(UIAttribute));
+            pos += sizeof(UIAttribute);
+
+            entry = entry->next;
+        }
    }

    file.size = pos - file.content;
--- a/utils/StringUtils.h
+++ b/utils/StringUtils.h
@ -176,7 +176,7 @@ void wchar_to_char(const wchar_t* __restrict src, char* __restrict dest, int32 l
    *dest = '\0';
 }

-inline
+inline constexpr
 int32 str_to_int(const char *str)
 {
    int32 result = 0;
@ -197,7 +197,47 @@ int32 str_to_int(const char *str)
    return result * sign;
 }

-inline size_t str_count(const char* __restrict str, const char* __restrict substr)
+/**
+ * Writes the decimal representation of number into str.
+ *
+ * The digits are generated least significant first and the buffer is reversed at the end.
+ * If thousands is not '\0' it is inserted between every group of three digits.
+ *
+ * @param number    Value to convert (the full int64 range is supported, including the most negative value)
+ * @param str       Output buffer; must hold the sign, all digits, all separators and the terminating '\0'
+ *                  (at most 27 bytes: 19 digits + 6 separators + sign + '\0')
+ * @param thousands Group separator, pass '\0' to disable grouping
+ *
+ * @return Index of the last written character, i.e. the string length minus one
+ */
+inline constexpr
+int32 int_to_str(int64 number, char *str, const char thousands = ',') {
+    // Work on the unsigned magnitude: negating the most negative int64 as a signed value would overflow
+    unsigned long long magnitude = number < 0
+        ? 0ULL - (unsigned long long) number
+        : (unsigned long long) number;
+
+    int32 i = 0;
+    int32 digit_count = 0;
+
+    if (magnitude == 0) {
+        str[i++] = '0';
+    }
+
+    while (magnitude > 0) {
+        // A separator belongs in front of every complete group of three digits, for any digit count
+        if (thousands && digit_count > 0 && digit_count % 3 == 0) {
+            str[i++] = thousands;
+        }
+
+        str[i++] = (char) ('0' + magnitude % 10);
+        magnitude /= 10;
+        ++digit_count;
+    }
+
+    // The sign is appended last because the buffer is still in reverse order
+    if (number < 0) {
+        str[i++] = '-';
+    }
+
+    str[i] = '\0';
+
+    // Reverse in place to get the most significant digit first
+    for (int32 j = 0, k = i - 1; j < k; ++j, --k) {
+        char temp = str[j];
+        str[j] = str[k];
+        str[k] = temp;
+    }
+
+    return i - 1;
+}
+
+inline
+size_t str_count(const char* __restrict str, const char* __restrict substr)
 {
    size_t l1 = strlen(str);
    size_t l2 = strlen(substr);
@ -296,45 +336,7 @@ char* strtok(char* str, const char* __restrict delim, char* *key) {
    return result;
 }

-inline
-int32 int_to_str(int64 number, char *str, const char thousands = ',') {
-    int32 i = 0;
-    int64 sign = number;
-    int32 digit_count = 0;
-
-    if (number == 0) {
-        str[i++] = '0';
-    } else if (number < 0) {
-        number = -number;
-    }
-
-    while (number > 0) {
-        if (thousands
-            && (digit_count == 3 || digit_count == 6 || digit_count == 9 || digit_count == 12 || digit_count == 15)
-        ) {
-            str[i++] = thousands;
-        }
-
-        str[i++] = number % 10 + '0';
-        number /= 10;
-        ++digit_count;
-    }
-
-    if (sign < 0) {
-        str[i++] = '-';
-    }
-
-    str[i] = '\0';
-
-    for (int32 j = 0, k = i - 1; j < k; ++j, --k) {
-        char temp = str[j];
-        str[j] = str[k];
-        str[k] = temp;
-    }
-
-    return i - 1;
-}
-
+inline constexpr
 char toupper_ascii(char c)
 {
    return c >= 'a' && c <= 'z'
@ -342,6 +344,7 @@ char toupper_ascii(char c)
        : c;
 }

+inline constexpr
 char tolower_ascii(char c)
 {
    return c >= 'A' && c <= 'Z'
@ -349,6 +352,7 @@ char tolower_ascii(char c)
        : c;
 }

+inline constexpr
 void create_const_name(const unsigned char* name, char* modified_name)
 {
    // Print block
@ -365,6 +369,7 @@ void create_const_name(const unsigned char* name, char* modified_name)
    }
 }

+inline constexpr
 bool str_ends_with(const char* str, const char* suffix) {
    if (!str || !suffix) {
        return false;
@ -431,4 +436,16 @@ void print_bytes(const void* ptr, size_t size)
    }
 }

+/**
+ * Checks whether str starts with an end-of-line sequence.
+ *
+ * @param str Position in the text to inspect; str[1] is only read when str[0] is '\r'
+ *
+ * @return 1 for "\n", 2 for "\r\n", 0 if str does not start with a line ending
+ */
+inline constexpr
+int32 is_eol(const char* str)
+{
+    if (str[0] == '\r') {
+        // A lone carriage return is not treated as a line ending
+        return str[1] == '\n' ? 2 : 0;
+    }
+
+    return str[0] == '\n' ? 1 : 0;
+}
+
 #endif