diff --git a/font/Font.h b/font/Font.h
index e20802f..251fca0 100644
--- a/font/Font.h
+++ b/font/Font.h
@@ -100,6 +100,8 @@ void font_from_file_txt(
                 while (*pos != '\n') {
                     *texture_pos++ = *pos++;
                 }
+
+                *texture_pos++ = '\0';
             } else if (strcmp(block_name, "font_size") == 0) {
                 font->size = strtof(pos, &pos);
             } else if (strcmp(block_name, "line_height") == 0) {
diff --git a/gpuapi/RenderUtils.h b/gpuapi/RenderUtils.h
index 52398ad..6d4bf55 100644
--- a/gpuapi/RenderUtils.h
+++ b/gpuapi/RenderUtils.h
@@ -693,15 +693,15 @@ void entity_clip_space_mat_sse(f32* result_mat, const f32* model_mat, const f32*
     __m128 a[4];
     __m128 b[4];
 
-    a[0] = _mm_loadu_ps(projection_mat);
-    a[1] = _mm_loadu_ps(&projection_mat[4]);
-    a[2] = _mm_loadu_ps(&projection_mat[8]);
-    a[3] = _mm_loadu_ps(&projection_mat[12]);
+    a[0] = _mm_load_ps(projection_mat);
+    a[1] = _mm_load_ps(&projection_mat[4]);
+    a[2] = _mm_load_ps(&projection_mat[8]);
+    a[3] = _mm_load_ps(&projection_mat[12]);
 
-    b[0] = _mm_loadu_ps(view_mat);
-    b[1] = _mm_loadu_ps(&view_mat[4]);
-    b[2] = _mm_loadu_ps(&view_mat[8]);
-    b[3] = _mm_loadu_ps(&view_mat[12]);
+    b[0] = _mm_load_ps(view_mat);
+    b[1] = _mm_load_ps(&view_mat[4]);
+    b[2] = _mm_load_ps(&view_mat[8]);
+    b[3] = _mm_load_ps(&view_mat[12]);
     _MM_TRANSPOSE4_PS(b[0], b[1], b[2], b[3]);
 
     mat4mat4_mult_sse(a, b, temp);
@@ -711,10 +711,10 @@ void entity_clip_space_mat_sse(f32* result_mat, const f32* model_mat, const f32*
     a[2] = temp[2];
     a[3] = temp[3];
 
-    b[0] = _mm_loadu_ps(model_mat);
-    b[1] = _mm_loadu_ps(&model_mat[4]);
-    b[2] = _mm_loadu_ps(&model_mat[8]);
-    b[3] = _mm_loadu_ps(&model_mat[12]);
+    b[0] = _mm_load_ps(model_mat);
+    b[1] = _mm_load_ps(&model_mat[4]);
+    b[2] = _mm_load_ps(&model_mat[8]);
+    b[3] = _mm_load_ps(&model_mat[12]);
     _MM_TRANSPOSE4_PS(b[0], b[1], b[2], b[3]);
 
     mat4mat4_mult_sse(a, b, temp);
diff --git a/gpuapi/UIUtils.h b/gpuapi/UIUtils.h
new file mode 100644
index 0000000..a2420e1
--- /dev/null
+++ b/gpuapi/UIUtils.h
@@ -0,0 +1,33 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef TOS_GPUAPI_UI_UTILS_H
+#define TOS_GPUAPI_UI_UTILS_H
+
+#include <stdio.h>
+#include <string.h>
+// Emits an input box as two primitives: vertex_rect_border_create for the frame, then vertex_rect_create for the area inside it
+void ui_input_create(Vertex3DTextureColorIndex* __restrict vertices, uint32* __restrict index, f32 zindex,
+    f32 x, f32 y, f32 width, f32 height, int32 align_h, int32 align_v,
+    uint32 color_index = 0, f32 tex_x1 = 0.0f, f32 tex_y1 = 0.0f, f32 tex_x2 = 0.0f, f32 tex_y2 = 0.0f
+)
+{
+    vertex_rect_border_create(
+        vertices, index, zindex,
+        x, y, width, height, 1, UI_ALIGN_H_LEFT, UI_ALIGN_V_BOTTOM, // NOTE(review): align_h/align_v are ignored, alignment is hard-coded
+        12, 0.0f, 0.0f // NOTE(review): color_index and tex_* are ignored; presumably 12 is a color index - TODO confirm
+    );
+    // Second rect is shifted by 1 and shrunk by 2 in both dimensions relative to the region passed to the border call
+    vertex_rect_create(
+        vertices, index, zindex,
+        x + 1, y + 1, width - 2, height - 2, UI_ALIGN_H_LEFT, UI_ALIGN_V_BOTTOM,
+        14, 0.0f, 0.0f // NOTE(review): hard-coded as well; presumably 14 is a color index - TODO confirm
+    );
+}
+
+#endif
\ No newline at end of file
diff --git a/log/Debug.cpp b/log/Debug.cpp
index 347d390..efabe34 100644
--- a/log/Debug.cpp
+++ b/log/Debug.cpp
@@ -1,6 +1,7 @@
 #ifndef TOS_LOG_DEBUG_MEMORY_C
 #define TOS_LOG_DEBUG_MEMORY_C
 
+#include "../stdlib/Types.h"
 #include "Debug.h"
 #include "DebugMemory.h"
 #include "Log.h"
diff --git a/math/matrix/MatrixFloat32.h b/math/matrix/MatrixFloat32.h
index 846f562..6d69a3a 100644
--- a/math/matrix/MatrixFloat32.h
+++ b/math/matrix/MatrixFloat32.h
@@ -401,11 +401,11 @@ void mat3vec3_mult(const f32* __restrict matrix, const f32* __restrict vector, f
 // @question could simple mul add sse be faster?
 void mat3vec3_mult_sse(const f32* __restrict matrix, const f32* __restrict vector, f32* __restrict result)
 {
-    __m128 vec = _mm_loadu_ps(vector);
+    __m128 vec = _mm_load_ps(vector);
     vec = _mm_insert_ps(vec, _mm_setzero_ps(), 0x30); // vec[3] = 0
 
     for (int32 i = 0; i < 3; ++i) {
-        __m128 row = _mm_loadu_ps(&matrix[i * 3]);
+        __m128 row = _mm_load_ps(&matrix[i * 3]);
         row = _mm_insert_ps(row, _mm_setzero_ps(), 0x30);  // row[3] = 0
 
         __m128 dot = _mm_dp_ps(row, vec, 0xF1);
@@ -444,10 +444,10 @@ void mat4vec4_mult(const f32* __restrict matrix, const f32* __restrict vector, f
 // @question could simple mul add sse be faster?
 void mat4vec4_mult_sse(const f32* __restrict matrix, const f32* __restrict vector, f32* __restrict result)
 {
-    __m128 vec = _mm_loadu_ps(vector);
+    __m128 vec = _mm_load_ps(vector);
 
     for (int32 i = 0; i < 4; ++i) {
-        __m128 row = _mm_loadu_ps(&matrix[i * 4]);
+        __m128 row = _mm_load_ps(&matrix[i * 4]);
         __m128 dot = _mm_dp_ps(row, vec, 0xF1);
 
         result[i] = _mm_cvtss_f32(dot);
@@ -502,16 +502,16 @@ void mat4mat4_mult(const f32* __restrict a, const f32* __restrict b, f32* __rest
         // @todo check http://fhtr.blogspot.com/2010/02/4x4-f32-matrix-multiplication-using.html
         // @question could simple mul add sse be faster?
         // Load rows of matrix a
-        __m128 a_1 = _mm_loadu_ps(a);
-        __m128 a_2 = _mm_loadu_ps(&a[4]);
-        __m128 a_3 = _mm_loadu_ps(&a[8]);
-        __m128 a_4 = _mm_loadu_ps(&a[12]);
+        __m128 a_1 = _mm_load_ps(a);
+        __m128 a_2 = _mm_load_ps(&a[4]);
+        __m128 a_3 = _mm_load_ps(&a[8]);
+        __m128 a_4 = _mm_load_ps(&a[12]);
 
         // Load columns of matrix b
-        __m128 b_1 = _mm_loadu_ps(b);
-        __m128 b_2 = _mm_loadu_ps(&b[4]);
-        __m128 b_3 = _mm_loadu_ps(&b[8]);
-        __m128 b_4 = _mm_loadu_ps(&b[12]);
+        __m128 b_1 = _mm_load_ps(b);
+        __m128 b_2 = _mm_load_ps(&b[4]);
+        __m128 b_3 = _mm_load_ps(&b[8]);
+        __m128 b_4 = _mm_load_ps(&b[12]);
 
         _mm_storeu_ps(&result[0],
             _mm_add_ps(
diff --git a/memory/ChunkMemory.h b/memory/ChunkMemory.h
index a8f6291..0eb74a7 100644
--- a/memory/ChunkMemory.h
+++ b/memory/ChunkMemory.h
@@ -69,12 +69,14 @@ void chunk_init(ChunkMemory* buf, byte* data, uint64 count, uint64 chunk_size, i
     buf->memory = data;
 
     buf->count = count;
-    buf->size = chunk_size + sizeof(buf->free) * CEIL_DIV(count, 64);
+    buf->size = chunk_size * count + sizeof(buf->free) * CEIL_DIV(count, 64);
     buf->chunk_size = chunk_size;
     buf->last_pos = -1;
     buf->alignment = alignment;
 
     // @question Could it be beneficial to have this before the element data?
+    //  On the other hand the way we do it right now we never have to move past the free array since it is at the end
+    //  On another hand we could by accident overwrite the values in free if we are not careful
     buf->free = (uint64 *) (buf->memory + count * chunk_size);
 
     DEBUG_MEMORY_INIT((uint64) buf->memory, buf->size);
@@ -124,7 +126,7 @@ int64 chunk_reserve(ChunkMemory* buf, uint64 elements = 1, bool zeroed = false)
     int32 bit_index;
 
     int64 free_element = -1;
-    byte mask;
+    int64 mask;
 
     int32 i = 0;
     int64 max_bytes = (buf->count + 7) / 64;
@@ -157,7 +159,7 @@ int64 chunk_reserve(ChunkMemory* buf, uint64 elements = 1, bool zeroed = false)
                 uint64 current_free_index = free_index + (bit_index + j) / 64;
                 int32 current_bit_index = (bit_index + j) % 64;
 
-                mask = 1 << current_bit_index;
+                mask = 1LL << current_bit_index;
                 if ((buf->free[current_free_index] & mask) == 0) {
                     ++consecutive_free_bits;
                 } else {
@@ -201,23 +203,23 @@ int64 chunk_reserve(ChunkMemory* buf, uint64 elements = 1, bool zeroed = false)
 
 byte* chunk_find_free(ChunkMemory* buf)
 {
-    int64 byte_index = (buf->last_pos + 1) / 64;
+    int64 free_index = (buf->last_pos + 1) / 64;
     int32 bit_index;
 
     int64 free_element = -1;
-    byte mask;
+    int64 mask;
 
     int32 i = 0;
     int64 max_bytes = (buf->count + 7) / 64;
 
     while (free_element < 0 && i < buf->count) {
-        if (byte_index >= max_bytes) {
-            byte_index = 0;
+        if (free_index >= max_bytes) {
+            free_index = 0;
         }
 
-        if (buf->free[byte_index] == 0xFF) {
+        if (buf->free[free_index] == 0xFF) {
             ++i;
-            ++byte_index;
+            ++free_index;
 
             continue;
         }
@@ -226,10 +228,10 @@ byte* chunk_find_free(ChunkMemory* buf)
         // @performance on the first iteration through the buffer we could optimize this by starting at a different bit_index
         // because we know that the bit_index is based on last_pos
         for (bit_index = 0; bit_index < 64; ++bit_index) {
-            mask = 1 << bit_index;
-            if ((buf->free[byte_index] & mask) == 0) {
-                free_element = byte_index * 64 + bit_index;
-                buf->free[byte_index] |= (1LL << bit_index);
+            mask = 1LL << bit_index;
+            if ((buf->free[free_index] & mask) == 0) {
+                free_element = free_index * 64 + bit_index;
+                buf->free[free_index] |= (1LL << bit_index);
 
                 break;
             }
@@ -248,10 +250,10 @@ void chunk_free_element(ChunkMemory* buf, uint64 element)
 {
     DEBUG_MEMORY_DELETE((uint64) (buf->memory + element * buf->chunk_size), buf->chunk_size);
 
-    int64 byte_index = element / 64;
+    int64 free_index = element / 64;
     int32 bit_index = element % 64;
 
-    buf->free[byte_index] &= ~(1 << bit_index);
+    buf->free[free_index] &= ~(1LL << bit_index);
 }
 
 inline
diff --git a/memory/RingMemory.h b/memory/RingMemory.h
index 57a3d7a..804d07f 100644
--- a/memory/RingMemory.h
+++ b/memory/RingMemory.h
@@ -62,7 +62,7 @@ void ring_alloc(RingMemory* ring, uint64 size, int32 alignment = 64)
 inline
 void ring_init(RingMemory* ring, BufferMemory* buf, uint64 size, int32 alignment = 64)
 {
-    ring->memory = buffer_get_memory(buf, size, alignment);
+    ring->memory = buffer_get_memory(buf, size, alignment, true);
 
     ring->size = size;
     ring->pos = 0;
@@ -71,8 +71,6 @@ void ring_init(RingMemory* ring, BufferMemory* buf, uint64 size, int32 alignment
     ring->start = 0;
     ring->end = 0;
 
-    memset(ring->memory, 0, buf->size);
-
     DEBUG_MEMORY_INIT((uint64) ring->memory, ring->size);
 }
 
diff --git a/stdlib/HashMap.h b/stdlib/HashMap.h
index 1f6a99c..eede30d 100644
--- a/stdlib/HashMap.h
+++ b/stdlib/HashMap.h
@@ -122,6 +122,11 @@ int64 hashmap_size(const HashMap* hm)
 }
 
 void hashmap_insert(HashMap* hm, const char* key, int32 value) {
+    // @performance Do we really want to do this check every time?
+    if (hm->buf.count == 0) {
+        return;
+    }
+
     uint64 index = hash_djb2(key) % hm->buf.count;
 
     int64 element = chunk_reserve(&hm->buf, 1);
@@ -147,6 +152,11 @@ void hashmap_insert(HashMap* hm, const char* key, int32 value) {
 }
 
 void hashmap_insert(HashMap* hm, const char* key, int64 value) {
+    // @performance Do we really want to do this check every time?
+    if (hm->buf.count == 0) {
+        return;
+    }
+
     uint64 index = hash_djb2(key) % hm->buf.count;
 
     int64 element = chunk_reserve(&hm->buf, 1);
@@ -172,6 +182,11 @@ void hashmap_insert(HashMap* hm, const char* key, int64 value) {
 }
 
 void hashmap_insert(HashMap* hm, const char* key, uintptr_t value) {
+    // @performance Do we really want to do this check every time?
+    if (hm->buf.count == 0) {
+        return;
+    }
+
     uint64 index = hash_djb2(key) % hm->buf.count;
 
     int64 element = chunk_reserve(&hm->buf, 1);
@@ -197,6 +212,11 @@ void hashmap_insert(HashMap* hm, const char* key, uintptr_t value) {
 }
 
 void hashmap_insert(HashMap* hm, const char* key, void* value) {
+    // @performance Do we really want to do this check every time?
+    if (hm->buf.count == 0) {
+        return;
+    }
+
     uint64 index = hash_djb2(key) % hm->buf.count;
 
     int64 element = chunk_reserve(&hm->buf, 1);
@@ -222,6 +242,11 @@ void hashmap_insert(HashMap* hm, const char* key, void* value) {
 }
 
 void hashmap_insert(HashMap* hm, const char* key, f32 value) {
+    // @performance Do we really want to do this check every time?
+    if (hm->buf.count == 0) {
+        return;
+    }
+
     uint64 index = hash_djb2(key) % hm->buf.count;
 
     int64 element = chunk_reserve(&hm->buf, 1);
@@ -247,6 +272,11 @@ void hashmap_insert(HashMap* hm, const char* key, f32 value) {
 }
 
 void hashmap_insert(HashMap* hm, const char* key, const char* value) {
+    // @performance Do we really want to do this check every time?
+    if (hm->buf.count == 0) {
+        return;
+    }
+
     uint64 index = hash_djb2(key) % hm->buf.count;
 
     int64 element = chunk_reserve(&hm->buf, 1);
@@ -274,6 +304,11 @@ void hashmap_insert(HashMap* hm, const char* key, const char* value) {
 }
 
 void hashmap_insert(HashMap* hm, const char* key, byte* value) {
+    // @performance Do we really want to do this check every time?
+    if (hm->buf.count == 0) {
+        return;
+    }
+
     uint64 index = hash_djb2(key) % hm->buf.count;
 
     int64 element = chunk_reserve(&hm->buf, 1);
@@ -302,6 +337,11 @@ void hashmap_insert(HashMap* hm, const char* key, byte* value) {
 }
 
 HashEntry* hashmap_get_entry(HashMap* hm, const char* key) {
+    // @performance Do we really want to do this check every time?
+    if (hm->buf.count == 0) {
+        return NULL;
+    }
+
     uint64 index = hash_djb2(key) % hm->buf.count;
     HashEntry* entry = (HashEntry *) hm->table[index];
 
@@ -356,90 +396,131 @@ void hashmap_delete_entry(HashMap* hm, const char* key) {
     }
 }
 
-// @bug We cannot know if the data needs endian swap (it coult be int/float, but also some other 4/8 byte value)
-//  -> if we save this to a file and load it on a different system we will have "corrupt" data
 inline
 int64 hashmap_dump(const HashMap* hm, byte* data)
 {
     *((uint64 *) data) = SWAP_ENDIAN_LITTLE(hm->buf.count);
     data += sizeof(uint64);
 
-    uint64 next_count_total = 0;
-
     // Dump the table content where the elements are relative indeces/pointers
     for (int32 i = 0; i < hm->buf.count; ++i) {
-        *((uint64 *) data) = SWAP_ENDIAN_LITTLE((uintptr_t) hm->table[i] - (uintptr_t) hm->buf.memory);
-        data += sizeof(uint64);
+        *((uint64 *) data) = hm->table[i]
+            ? SWAP_ENDIAN_LITTLE((uintptr_t) hm->table[i] - (uintptr_t) hm->buf.memory)
+            : 0ULL;
+    }
+    data += sizeof(uint64) * hm->buf.count;
 
-        // Also dump the next pointer
-        // Count how many next elements we have
-        HashEntry* entry = ((HashEntry *) hm->table[i])->next;
-        int32 next_count = 0;
-        while (entry) {
-            ++next_count;
-            entry = entry->next;
+    int64 value_size = hm->buf.chunk_size - sizeof(uint64) - sizeof(char) * MAX_KEY_LENGTH - sizeof(uint64);
+
+    // Dumb hash map content = buffer memory
+    int32 free_index = 0;
+    int32 bit_index = 0;
+    for (int32 i = 0; i < hm->buf.count; ++i) {
+        if ((hm->buf.free[free_index] & (1ULL << bit_index)) > 0) {
+            HashEntry* entry = (HashEntry *) chunk_get_element((ChunkMemory *) &hm->buf, i);
+
+            // element_id
+            *((uint64 *) data) = SWAP_ENDIAN_LITTLE(entry->element_id);
+            data += sizeof(entry->element_id);
+
+            // key
+            memcpy(data, entry->key, sizeof(entry->key));
+            data += sizeof(entry->key);
+
+            // next pointer
+            if (entry->next) {
+                *((uint64 *) data) = SWAP_ENDIAN_LITTLE((uintptr_t) entry->next - (uintptr_t) hm->buf.memory);
+            } else {
+                memset(data, 0, sizeof(uint64));
+            }
+            data += sizeof(uint64);
+
+            // We just assume that 4 or 8 bytes = int -> endian handling
+            if (value_size == 4) {
+                *((int32 *) data) = SWAP_ENDIAN_LITTLE(((HashEntryInt32 *) entry)->value);
+            } else if (value_size == 8) {
+                *((int64 *) data) = SWAP_ENDIAN_LITTLE(((HashEntryInt64 *) entry)->value);
+            } else {
+                memcpy(data, entry->value, value_size);
+            }
+            data += value_size;
+        } else {
+            // No entry defined -> NULL
+            memset(data, 0, hm->buf.chunk_size);
+            data += hm->buf.chunk_size;
         }
 
-        next_count_total += next_count;
-
-        *((int32 *) data) = SWAP_ENDIAN_LITTLE(next_count);
-        data += sizeof(next_count);
-
-        if (next_count > 0) {
-            entry = ((HashEntry *) hm->table[i])->next;
-            while (entry) {
-                *((uint64 *) data) = SWAP_ENDIAN_LITTLE((uintptr_t) entry - (uintptr_t) hm->buf.memory);
-                data += sizeof(uint64);
-
-                entry = entry->next;
-            }
+        ++bit_index;
+        if (bit_index > 63) {
+            bit_index = 0;
+            ++free_index;
         }
     }
 
-    // @performance chunk_dump() below contains some data we already output above
-    // (next pointer but it is useless, since we need relative positions)
-    // Maybe we should manually re-create the chunk_dump here and omit the already dumped data for the next pointer?
+    // dump free array
+    memcpy(data, hm->buf.free, sizeof(uint64) * CEIL_DIV(hm->buf.count, 64));
 
-    // How many bytes were written (+ dump the chunk memory)
-    return sizeof(hm->buf.count)
+    return sizeof(hm->buf.count) // hash map count = buffer count
         + hm->buf.count * sizeof(uint64) // table content
-        + hm->buf.count * sizeof(int32) // counter for the next pointer (one for every element)
-        + next_count_total * sizeof(uint64) // next pointer offset
-        + chunk_dump(&hm->buf, data);
+        + hm->buf.size; // hash map content + free array
 }
 
+// Restores a hash map from a buffer written by hashmap_dump and returns how many bytes of data were consumed
+// WARNING: Requires hashmap_create first
 inline
 int64 hashmap_load(HashMap* hm, const byte* data)
 {
     uint64 count = SWAP_ENDIAN_LITTLE(*((uint64 *) data));
     data += sizeof(uint64);
 
-    uint64 next_count_total = 0;
-
-    // Load the table content, we also need to convert from relative indeces to pointers
+    // Load the table content (offsets relative to the buffer memory are turned back into pointers)
     for (int i = 0; i < count; ++i) {
-        hm->table[i] = hm->buf.memory + SWAP_ENDIAN_LITTLE(*((uint64 *) data));
-        data += sizeof(uint64);
+        uint64 offset = SWAP_ENDIAN_LITTLE(*((uint64 *) data));
+        data += sizeof(offset);
 
-        // Also load the next pointer
-        // Count how many next elements we have
-        int32 next_count = SWAP_ENDIAN_LITTLE(*((int32 *) data));
-        data += sizeof(next_count);
+        // Offset 0 is read as "empty bucket" except for bucket 0 (NOTE(review): ambiguous, hashmap_dump writes 0 for both cases)
+        hm->table[i] = offset || i == 0 ? hm->buf.memory + offset : NULL;
+    }
 
-        HashEntry* entry = ((HashEntry *) hm->table[i]);
-        for (int32 j = 0; j < next_count; ++j) {
-            entry->next = (HashEntry *) (hm->buf.memory + SWAP_ENDIAN_LITTLE(*((uint64 *) data)));
-            data += sizeof(uint64);
-            entry = entry->next;
+    // This copy here is why it is important to already have an initialized hashmap (it relies on hm->buf.memory and hm->buf.size)
+    // @question Do we maybe want to change this and not require an initialized hashmap?
+    memcpy(hm->buf.memory, data, hm->buf.size);
+    data += hm->buf.chunk_size * hm->buf.count;
+
+    // @question don't we have to possibly endian swap check the free array as well?
+    memcpy(hm->buf.free, data, sizeof(uint64) * CEIL_DIV(hm->buf.count, 64));
+
+    int64 value_size = hm->buf.chunk_size - sizeof(uint64) - sizeof(char) * MAX_KEY_LENGTH - sizeof(uint64);
+
+    // Switch endian AND turn offsets to pointers, but only for chunks that are marked as used in the free array
+    for (int32 i = 0; i < hm->buf.count; ++i) {
+        int32 free_index = i / 64; // which 64 bit word of the free array tracks chunk i
+        int32 bit_index = i % 64; // which bit inside that word
+        if ((hm->buf.free[free_index] & (1ULL << bit_index)) > 0) {
+            HashEntry* entry = (HashEntry *) chunk_get_element((ChunkMemory *) &hm->buf, i);
+
+            // element id
+            entry->element_id = SWAP_ENDIAN_LITTLE(entry->element_id);
+
+            // key is already loaded with the memcpy
+            // @question Do we even want to use memcpy? We are re-checking all the values here anyways
+            // next pointer (0 = no next entry, stays NULL)
+            if (entry->next) {
+                entry->next = (HashEntry *) (hm->buf.memory + SWAP_ENDIAN_LITTLE((uint64) entry->next));
+            }
+
+            if (value_size == 4) {
+                ((HashEntryInt32 *) entry)->value = SWAP_ENDIAN_LITTLE(((HashEntryInt32 *) entry)->value);
+            } else if (value_size == 8) {
+                ((HashEntryInt64 *) entry)->value = SWAP_ENDIAN_LITTLE(((HashEntryInt64 *) entry)->value);
+            }
         }
     }
 
     // How many bytes was read from data
-    return sizeof(count)
+    return sizeof(hm->buf.count) // hash map count = buffer count
         + hm->buf.count * sizeof(uint64) // table content
-        + hm->buf.count * sizeof(int32) // counter for the next pointer (one for every element)
-        + next_count_total * sizeof(uint64) // next pointer offset
-        + chunk_load(&hm->buf, data);
+        + hm->buf.size; // hash map content + free array
 }
 
 #endif
\ No newline at end of file
diff --git a/stdlib/simd/SIMD_F32.h b/stdlib/simd/SIMD_F32.h
index bb4b3d0..24a5124 100644
--- a/stdlib/simd/SIMD_F32.h
+++ b/stdlib/simd/SIMD_F32.h
@@ -39,7 +39,7 @@ struct f32_16 {
 inline f32_4 load_f32_4(const f32* mem)
 {
     f32_4 simd;
-    simd.s = _mm_loadu_ps(mem);
+    simd.s = _mm_load_ps(mem); // aligned load: mem must be 16-byte aligned (the unaligned loadu variant had no such requirement)
 
     return simd;
 }
@@ -57,7 +57,7 @@ inline void unload_f32_4(f32_4 a, f32 *array) { _mm_store_ps(array, a.s); }
 inline f32_8 load_f32_8(const f32* mem)
 {
     f32_8 simd;
-    simd.s = _mm256_loadu_ps(mem);
+    simd.s = _mm256_load_ps(mem); // aligned load: mem must be 32-byte aligned (the unaligned loadu variant had no such requirement)
 
     return simd;
 }
@@ -75,7 +75,7 @@ inline void unload_f32_8(f32_8 a, f32 *array) { _mm256_store_ps(array, a.s); }
 inline f32_16 load_f32_16(const f32* mem)
 {
     f32_16 simd;
-    simd.s = _mm512_loadu_ps(mem);
+    simd.s = _mm512_load_ps(mem); // aligned load: mem must be 64-byte aligned (the unaligned loadu variant had no such requirement)
 
     return simd;
 }
@@ -996,8 +996,8 @@ void simd_mult(const f32* a, const f32* b, f32* result, int32 size, int32 steps)
         __m512 result_16;
 
         for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_ps(a);
-            b_16 = _mm512_loadu_ps(b);
+            a_16 = _mm512_load_ps(a);
+            b_16 = _mm512_load_ps(b);
             result_16 = _mm512_mul_ps(a_16, b_16);
             _mm512_store_ps(result, result_16);
 
@@ -1011,8 +1011,8 @@ void simd_mult(const f32* a, const f32* b, f32* result, int32 size, int32 steps)
         __m256 result_8;
 
         for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_ps(a);
-            b_8 = _mm256_loadu_ps(b);
+            a_8 = _mm256_load_ps(a);
+            b_8 = _mm256_load_ps(b);
             result_8 = _mm256_mul_ps(a_8, b_8);
             _mm256_store_ps(result, result_8);
 
@@ -1026,8 +1026,8 @@ void simd_mult(const f32* a, const f32* b, f32* result, int32 size, int32 steps)
         __m128 result_4;
 
         for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_ps(a);
-            b_4 = _mm_loadu_ps(b);
+            a_4 = _mm_load_ps(a);
+            b_4 = _mm_load_ps(b);
             result_4 = _mm_mul_ps(a_4, b_4);
             _mm_store_ps(result, result_4);
 
@@ -1057,7 +1057,7 @@ void simd_mult(const f32* a, f32 b, f32* result, int32 size, int32 steps)
         __m512 result_16;
 
         for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_ps(a);
+            a_16 = _mm512_load_ps(a);
             result_16 = _mm512_mul_ps(a_16, b_16);
             _mm512_store_ps(result, result_16);
 
@@ -1070,7 +1070,7 @@ void simd_mult(const f32* a, f32 b, f32* result, int32 size, int32 steps)
         __m256 result_8;
 
         for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_ps(a);
+            a_8 = _mm256_load_ps(a);
             result_8 = _mm256_mul_ps(a_8, b_8);
             _mm256_store_ps(result, result_8);
 
@@ -1083,7 +1083,7 @@ void simd_mult(const f32* a, f32 b, f32* result, int32 size, int32 steps)
         __m128 result_4;
 
         for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_ps(a);
+            a_4 = _mm_load_ps(a);
             result_4 = _mm_mul_ps(a_4, b_4);
             _mm_store_ps(result, result_4);
 
@@ -1111,7 +1111,7 @@ void simd_div(const f32* a, f32 b, f32* result, int32 size, int32 steps)
         __m512 result_16;
 
         for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_ps(a);
+            a_16 = _mm512_load_ps(a);
             result_16 = _mm512_div_ps(a_16, b_16);
             _mm512_store_ps(result, result_16);
 
@@ -1124,7 +1124,7 @@ void simd_div(const f32* a, f32 b, f32* result, int32 size, int32 steps)
         __m256 result_8;
 
         for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_ps(a);
+            a_8 = _mm256_load_ps(a);
             result_8 = _mm256_div_ps(a_8, b_8);
             _mm256_store_ps(result, result_8);
 
@@ -1137,7 +1137,7 @@ void simd_div(const f32* a, f32 b, f32* result, int32 size, int32 steps)
         __m128 result_4;
 
         for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_ps(a);
+            a_4 = _mm_load_ps(a);
             result_4 = _mm_div_ps(a_4, b_4);
             _mm_store_ps(result, result_4);
 
@@ -1166,7 +1166,7 @@ void simd_div(const f32* a, f32 b, __m256* result, int32 size)
     __m256 result_8;
 
     for (; i <= size - 8; i += 8) {
-        a_8 = _mm256_loadu_ps(a);
+        a_8 = _mm256_load_ps(a);
         result_8 = _mm256_div_ps(a_8, b_8);
         result[j] = result_8;
 
@@ -1181,7 +1181,7 @@ void simd_div(const f32* a, f32 b, __m256* result, int32 size)
         temp[k] = a[i + k] / b;
     }
 
-    result[j] = _mm256_loadu_ps(temp);
+    result[j] = _mm256_load_ps(temp);
 }
 
 inline
diff --git a/stdlib/simd/SIMD_I16.h b/stdlib/simd/SIMD_I16.h
index 9ffb372..7b3a8a4 100644
--- a/stdlib/simd/SIMD_I16.h
+++ b/stdlib/simd/SIMD_I16.h
@@ -39,7 +39,7 @@ struct int16_32 {
 inline int16_8 load_int16_8(const int16* mem)
 {
     int16_8 simd;
-    simd.s = _mm_loadu_epi16(mem);
+    simd.s = _mm_load_si128((__m128i *) mem); // aligned load: mem must be 16-byte aligned
 
     return simd;
 }
@@ -60,7 +60,7 @@ inline void unload_int16_8(int16_8 a, int16 *array) { _mm_store_si128((__m128i *
 inline int16_16 load_int16_16(const int16* mem)
 {
     int16_16 simd;
-    simd.s = _mm256_loadu_epi16(mem);
+    simd.s = _mm256_load_si256((__m256i *) mem); // aligned load: mem must be 32-byte aligned
 
     return simd;
 }
@@ -81,7 +81,7 @@ inline void unload_int16_16(int16_16 a, int16 *array) { _mm256_store_si256((__m2
 inline int16_32 load_int16_32(const int16* mem)
 {
     int16_32 simd;
-    simd.s = _mm512_loadu_epi16(mem);
+    simd.s = _mm512_load_si512((__m512i *) mem); // aligned load: mem must be 64-byte aligned
 
     return simd;
 }
diff --git a/stdlib/simd/SIMD_I32.h b/stdlib/simd/SIMD_I32.h
index 88edfd9..38c55f1 100644
--- a/stdlib/simd/SIMD_I32.h
+++ b/stdlib/simd/SIMD_I32.h
@@ -11,6 +11,7 @@
 
 #include <immintrin.h>
 #include <xmmintrin.h>
+#include <emmintrin.h>
 
 #include "../Types.h"
 #include "../../utils/BitUtils.h"
@@ -45,7 +46,7 @@ struct int32_16 {
 inline int32_4 load_int32_4(const int32* mem)
 {
     int32_4 simd;
-    simd.s = _mm_loadu_epi32(mem);
+    simd.s = _mm_load_si128((__m128i *) mem); // aligned load: mem must be 16-byte aligned
 
     return simd;
 }
@@ -63,7 +64,7 @@ inline void unload_int32_4(int32_4 a, int32 *array) { _mm_store_si128((__m128i *
 inline int32_8 load_int32_8(const int32* mem)
 {
     int32_8 simd;
-    simd.s = _mm256_loadu_epi32(mem);
+    simd.s = _mm256_load_si256((__m256i *) mem); // aligned load: mem must be 32-byte aligned
 
     return simd;
 }
@@ -81,7 +82,7 @@ inline void unload_int32_8(int32_8 a, int32 *array) { _mm256_store_si256((__m256
 inline int32_16 load_int32_16(const int32* mem)
 {
     int32_16 simd;
-    simd.s = _mm512_loadu_epi32(mem);
+    simd.s = _mm512_load_epi32(mem); // aligned load: mem must be 64-byte aligned
 
     return simd;
 }
@@ -1039,8 +1040,8 @@ void simd_mult(const int32* a, const int32* b, int32* result, int32 size, int32
         __m512i result_16;
 
         for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_epi32(a);
-            b_16 = _mm512_loadu_epi32(b);
+            a_16 = _mm512_load_epi32(a);
+            b_16 = _mm512_load_epi32(b);
             result_16 = _mm512_mul_epi32(a_16, b_16);
             _mm512_store_epi32(result, result_16);
 
@@ -1054,8 +1055,8 @@ void simd_mult(const int32* a, const int32* b, int32* result, int32 size, int32
         __m256i result_8;
 
         for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_epi32(a);
-            b_8 = _mm256_loadu_epi32(b);
+            a_8 = _mm256_load_si256((__m256i *) a);
+            b_8 = _mm256_load_si256((__m256i *) b);
             result_8 = _mm256_mul_epi32(a_8, b_8);
             _mm256_store_si256((__m256i *) result, result_8);
 
@@ -1069,8 +1070,8 @@ void simd_mult(const int32* a, const int32* b, int32* result, int32 size, int32
         __m128i result_4;
 
         for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_epi32(a);
-            b_4 = _mm_loadu_epi32(b);
+            a_4 = _mm_load_si128((__m128i *) a);
+            b_4 = _mm_load_si128((__m128i *) b);
             result_4 = _mm_mul_epi32(a_4, b_4);
             _mm_store_si128((__m128i *) result, result_4);
 
@@ -1101,9 +1102,9 @@ void simd_mult(const int32* a, const f32* b, f32* result, int32 size, int32 step
         __m512 result_16;
 
         for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_epi32(a);
+            a_16 = _mm512_load_epi32(a);
             af_16 = _mm512_cvtepi32_ps(a_16);
-            b_16 = _mm512_loadu_ps(b);
+            b_16 = _mm512_load_ps(b);
             result_16 = _mm512_mul_ps(af_16, b_16);
             _mm512_store_ps(result, result_16);
 
@@ -1118,9 +1119,9 @@ void simd_mult(const int32* a, const f32* b, f32* result, int32 size, int32 step
         __m256 result_8;
 
         for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_epi32(a);
+            a_8 = _mm256_load_si256((__m256i *) a);
             af_8 = _mm256_cvtepi32_ps(a_8);
-            b_8 = _mm256_loadu_ps(b);
+            b_8 = _mm256_load_ps(b);
             result_8 = _mm256_mul_ps(af_8, b_8);
             _mm256_store_ps(result, result_8);
 
@@ -1135,9 +1136,9 @@ void simd_mult(const int32* a, const f32* b, f32* result, int32 size, int32 step
         __m128 result_4;
 
         for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_epi32(a);
+            a_4 = _mm_load_si128((__m128i *) a);
             af_4 = _mm_cvtepi32_ps(a_4);
-            b_4 = _mm_loadu_ps(b);
+            b_4 = _mm_load_ps(b);
             result_4 = _mm_mul_ps(af_4, b_4);
             _mm_store_ps(result, result_4);
 
@@ -1169,9 +1170,9 @@ void simd_mult(const int32* a, const f32* b, int32* result, int32 size, int32 st
         __m512i resulti_16;
 
         for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_epi32(a);
+            a_16 = _mm512_load_epi32(a);
             af_16 = _mm512_cvtepi32_ps(a_16);
-            b_16 = _mm512_loadu_ps(b);
+            b_16 = _mm512_load_ps(b);
             result_16 = _mm512_mul_ps(af_16, b_16);
             resulti_16 = _mm512_cvtps_epi32(result_16);
             _mm512_store_epi32(result, resulti_16);
@@ -1188,9 +1189,9 @@ void simd_mult(const int32* a, const f32* b, int32* result, int32 size, int32 st
         __m256i resulti_8;
 
         for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_epi32(a);
+            a_8 = _mm256_load_si256((__m256i *) a);
             af_8 = _mm256_cvtepi32_ps(a_8);
-            b_8 = _mm256_loadu_ps(b);
+            b_8 = _mm256_load_ps(b);
             result_8 = _mm256_mul_ps(af_8, b_8);
             resulti_8 = _mm256_cvtps_epi32(result_8);
             _mm256_store_si256((__m256i *) result, resulti_8);
@@ -1207,9 +1208,9 @@ void simd_mult(const int32* a, const f32* b, int32* result, int32 size, int32 st
         __m128i resulti_4;
 
         for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_epi32(a);
+            a_4 = _mm_load_si128((__m128i *) a);
             af_4 = _mm_cvtepi32_ps(a_4);
-            b_4 = _mm_loadu_ps(b);
+            b_4 = _mm_load_ps(b);
             result_4 = _mm_mul_ps(af_4, b_4);
             resulti_4 = _mm_cvtps_epi32(result_4);
             _mm_store_si128((__m128i *) result, resulti_4);
@@ -1242,7 +1243,7 @@ void simd_mult(const int32* a, f32 b, int32* result, int32 size, int32 steps)
         __m512i resulti_16;
 
         for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_epi32(a);
+            a_16 = _mm512_load_epi32(a);
             af_16 = _mm512_cvtepi32_ps(a_16);
             result_16 = _mm512_mul_ps(af_16, b_16);
             resulti_16 = _mm512_cvtps_epi32(result_16);
@@ -1259,7 +1260,7 @@ void simd_mult(const int32* a, f32 b, int32* result, int32 size, int32 steps)
         __m256i resulti_8;
 
         for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_epi32(a);
+            a_8 = _mm256_load_si256((__m256i *) a);
             af_8 = _mm256_cvtepi32_ps(a_8);
             result_8 = _mm256_mul_ps(af_8, b_8);
             resulti_8 = _mm256_cvtps_epi32(result_8);
@@ -1276,7 +1277,7 @@ void simd_mult(const int32* a, f32 b, int32* result, int32 size, int32 steps)
         __m128i resulti_4;
 
         for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_epi32(a);
+            a_4 = _mm_load_si128((__m128i *) a);
             af_4 = _mm_cvtepi32_ps(a_4);
             result_4 = _mm_mul_ps(af_4, b_4);
             resulti_4 = _mm_cvtps_epi32(result_4);
@@ -1307,7 +1308,7 @@ void simd_div(const int32* a, f32 b, f32* result, int32 size, int32 steps)
         __m512 result_16;
 
         for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_epi32(a);
+            a_16 = _mm512_load_epi32(a);
             af_16 = _mm512_cvtepi32_ps(a_16);
             result_16 = _mm512_div_ps(af_16, b_16);
             _mm512_store_ps(result, result_16);
@@ -1323,7 +1324,7 @@ void simd_div(const int32* a, f32 b, f32* result, int32 size, int32 steps)
         __m256 result_8;
 
         for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_epi32(a);
+            a_8 = _mm256_load_si256((__m256i *) a);
             af_8 = _mm256_cvtepi32_ps(a_8);
             result_8 = _mm256_div_ps(af_8, b_8);
             _mm256_store_ps(result, result_8);
@@ -1338,7 +1339,7 @@ void simd_div(const int32* a, f32 b, f32* result, int32 size, int32 steps)
         __m128 result_4;
 
         for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_epi32(a);
+            a_4 = _mm_load_si128((__m128i *) a);
             af_4 = _mm_cvtepi32_ps(a_4);
             result_4 = _mm_div_ps(af_4, b_4);
             _mm_store_ps(result, result_4);
@@ -1367,8 +1368,8 @@ void simd_add(const int32* a, const int32* b, int32* result, int32 size, int32 s
         __m512i result_16;
 
         for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_epi32(a);
-            b_16 = _mm512_loadu_epi32(b);
+            a_16 = _mm512_load_epi32(a);
+            b_16 = _mm512_load_epi32(b);
             result_16 = _mm512_add_epi32(a_16, b_16);
             _mm512_store_epi32(result, result_16);
 
@@ -1382,8 +1383,8 @@ void simd_add(const int32* a, const int32* b, int32* result, int32 size, int32 s
         __m256i result_8;
 
         for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_epi32(a);
-            b_8 = _mm256_loadu_epi32(b);
+            a_8 = _mm256_load_si256((__m256i *) a);
+            b_8 = _mm256_load_si256((__m256i *) b);
             result_8 = _mm256_add_epi32(a_8, b_8);
             _mm256_store_si256((__m256i *) result, result_8);
 
@@ -1397,8 +1398,8 @@ void simd_add(const int32* a, const int32* b, int32* result, int32 size, int32 s
         __m128i result_4;
 
         for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_epi32(a);
-            b_4 = _mm_loadu_epi32(b);
+            a_4 = _mm_load_si128((__m128i *) a);
+            b_4 = _mm_load_si128((__m128i *) b);
             result_4 = _mm_add_epi32(a_4, b_4);
             _mm_store_si128((__m128i *) result, result_4);
 
@@ -1429,9 +1430,9 @@ void simd_add(const int32* a, const f32* b, f32* result, int32 size, int32 steps
         __m512 result_16;
 
         for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_epi32(a);
+            a_16 = _mm512_load_epi32(a);
             af_16 = _mm512_cvtepi32_ps(a_16);
-            b_16 = _mm512_loadu_ps(b);
+            b_16 = _mm512_load_ps(b);
             result_16 = _mm512_add_ps(af_16, b_16);
             _mm512_store_ps(result, result_16);
 
@@ -1446,9 +1447,9 @@ void simd_add(const int32* a, const f32* b, f32* result, int32 size, int32 steps
         __m256 result_8;
 
         for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_epi32(a);
+            a_8 = _mm256_load_si256((__m256i *) a);
             af_8 = _mm256_cvtepi32_ps(a_8);
-            b_8 = _mm256_loadu_ps(b);
+            b_8 = _mm256_load_ps(b);
             result_8 = _mm256_add_ps(af_8, b_8);
             _mm256_store_ps(result, result_8);
 
@@ -1463,9 +1464,9 @@ void simd_add(const int32* a, const f32* b, f32* result, int32 size, int32 steps
         __m128 result_4;
 
         for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_epi32(a);
+            a_4 = _mm_load_si128((__m128i *) a);
             af_4 = _mm_cvtepi32_ps(a_4);
-            b_4 = _mm_loadu_ps(b);
+            b_4 = _mm_load_ps(b);
             result_4 = _mm_add_ps(af_4, b_4);
             _mm_store_ps(result, result_4);
 
@@ -1497,9 +1498,9 @@ void simd_add(const int32* a, const f32* b, int32* result, int32 size, int32 ste
         __m512i resulti_16;
 
         for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_epi32(a);
+            a_16 = _mm512_load_epi32(a);
             af_16 = _mm512_cvtepi32_ps(a_16);
-            b_16 = _mm512_loadu_ps(b);
+            b_16 = _mm512_load_ps(b);
             result_16 = _mm512_add_ps(af_16, b_16);
             resulti_16 = _mm512_cvtps_epi32(result_16);
             _mm512_store_epi32(result, resulti_16);
@@ -1516,9 +1517,9 @@ void simd_add(const int32* a, const f32* b, int32* result, int32 size, int32 ste
         __m256i resulti_8;
 
         for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_epi32(a);
+            a_8 = _mm256_load_si256((__m256i *) a);
             af_8 = _mm256_cvtepi32_ps(a_8);
-            b_8 = _mm256_loadu_ps(b);
+            b_8 = _mm256_load_ps(b);
             result_8 = _mm256_add_ps(af_8, b_8);
             resulti_8 = _mm256_cvtps_epi32(result_8);
             _mm256_store_si256((__m256i *) result, resulti_8);
@@ -1535,9 +1536,9 @@ void simd_add(const int32* a, const f32* b, int32* result, int32 size, int32 ste
         __m128i resulti_4;
 
         for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_epi32(a);
+            a_4 = _mm_load_si128((__m128i *) a);
             af_4 = _mm_cvtepi32_ps(a_4);
-            b_4 = _mm_loadu_ps(b);
+            b_4 = _mm_load_ps(b);
             result_4 = _mm_add_ps(af_4, b_4);
             resulti_4 = _mm_cvtps_epi32(result_4);
             _mm_store_si128((__m128i *) result, resulti_4);
@@ -1560,8 +1561,8 @@ void simd_add(const int32* a, const f32* b, int32* result, int32 size, int32 ste
 // WARNING: only works with SSE4.2
 // WARNING: incl. \0 both strings must be <= 16 length
 bool str_compare_avx512(const char* str1, const char* str2) {
-    __m128i s1 = _mm_loadu_si128((const __m128i *)  str1);
-    __m128i s2 = _mm_loadu_si128((const __m128i *)  str2);
+    __m128i s1 = _mm_load_si128((__m128i *) (const __m128i *)  str1);
+    __m128i s2 = _mm_load_si128((__m128i *) (const __m128i *)  str2);
 
     return _mm_cmpistrc(s1, s2, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH) == 0;
 }
@@ -1580,7 +1581,7 @@ endian_swap(const int* val, int* result, int32 size, int32 steps)
         );
 
         for (i = 0; i <= size - steps; i += steps) {
-            __m512i vec = _mm512_loadu_si512((const __m512i *) (val + i));
+            __m512i vec = _mm512_load_si512((const __m512i *) (val + i));
             vec = _mm512_shuffle_epi8(vec, mask_512);
 
             _mm512_storeu_si512((__m512i *) (result + i), vec);
@@ -1594,7 +1595,7 @@ endian_swap(const int* val, int* result, int32 size, int32 steps)
         );
 
         for (i = 0; i <= size - steps; i += steps) {
-            __m256i vec = _mm256_loadu_si256((const __m256i *) (val + i));
+            __m256i vec = _mm256_load_si256((const __m256i *) (val + i));
             vec = _mm256_shuffle_epi8(vec, mask_256);
 
             _mm256_storeu_si256((__m256i *) (result + i), vec);
@@ -1608,7 +1609,7 @@ endian_swap(const int* val, int* result, int32 size, int32 steps)
         );
 
         for (i = 0; i <= size - steps; i += steps) {
-             __m128i vec = _mm_loadu_si128((const __m128i *) (val + i));
+             __m128i vec = _mm_load_si128((__m128i *) (const __m128i *) (val + i));
             vec = _mm_shuffle_epi8(vec, mask_128);
 
             _mm_storeu_si128((__m128i *) (result + i), vec);
diff --git a/stdlib/simd/SIMD_I8.h b/stdlib/simd/SIMD_I8.h
index 9c8ddd4..7809a40 100644
--- a/stdlib/simd/SIMD_I8.h
+++ b/stdlib/simd/SIMD_I8.h
@@ -40,7 +40,7 @@ struct int8_64 {
 inline int8_16 load_int8_16(const int8* mem)
 {
     int8_16 simd;
-    simd.s = _mm_loadu_epi8(mem);
+    simd.s = _mm_load_si128((__m128i *) mem);
 
     return simd;
 }
@@ -63,7 +63,7 @@ inline void unload_int8_16(int8_16 a, int8 *array) { _mm_store_si128((__m128i *)
 inline int8_32 load_int8_32(const int8* mem)
 {
     int8_32 simd;
-    simd.s = _mm256_loadu_epi8(mem);
+    simd.s = _mm256_load_si256((__m256i *) mem);
 
     return simd;
 }
@@ -86,7 +86,7 @@ inline void unload_int8_32(int8_32 a, int8 *array) { _mm256_store_si256((__m256i
 inline int8_64 load_int8_64(const int8* mem)
 {
     int8_64 simd;
-    simd.s = _mm512_loadu_epi8(mem);
+    simd.s = _mm512_load_si512((__m512i *) mem);
 
     return simd;
 }
@@ -830,19 +830,19 @@ inline
 f32 simd_mult(const int8* a, f32 b, int32 size, int32 steps)
 {
     if (steps == 16) {
-        __m512i a_16 = _mm512_loadu_epi8(a);
+        __m512i a_16 = _mm512_load_si512((__m512i *) a);
         __m512 af_16 = _mm512_cvtepi32_ps(a_16);
         __m512 b_16 = _mm512_set1_ps(b);
 
         __m512 result = _mm512_mul_ps(af_16, b_16);
     } else if (steps == 8) {
-        __m256i a_8 = _mm256_loadu_epi8(a);
+        __m256i a_8 = _mm256_load_si256((__m256i *) a);
         __m256 af_8 = _mm256_cvtepi32_ps(a_8);
         __m256 b_8 = _mm256_set1_ps(b);
 
         __m256 result = _mm256_mul_ps(af_8, b_8);
     } else if (steps == 4) {
-        __m128i a_4 = _mm_loadu_epi8(a);
+        __m128i a_4 = _mm_load_si128((__m128i *) a);
         __m128 af_4 = _mm_cvtepi32_ps(a_4);
         __m128 b_4 = _mm_set1_ps(b);
 
@@ -855,11 +855,11 @@ f32 simd_mult(const int8* a, f32 b, int32 size, int32 steps)
 
 bool simd_compare_64(const byte* a, const byte* b)
 {
-    __m256i chunk1_a = _mm256_loadu_si256((__m256i*) a);
-    __m256i chunk1_b = _mm256_loadu_si256((__m256i*) b);
+    __m256i chunk1_a = _mm256_load_si256((__m256i*) a);
+    __m256i chunk1_b = _mm256_load_si256((__m256i*) b);
 
-    __m256i chunk2_a = _mm256_loadu_si256((__m256i*) (a + 32));
-    __m256i chunk2_b = _mm256_loadu_si256((__m256i*) (b + 32));
+    __m256i chunk2_a = _mm256_load_si256((__m256i*) (a + 32));
+    __m256i chunk2_b = _mm256_load_si256((__m256i*) (b + 32));
 
     __m256i result1 = _mm256_cmpeq_epi8(chunk1_a, chunk1_b);
     __m256i result2 = _mm256_cmpeq_epi8(chunk2_a, chunk2_b);
@@ -879,8 +879,8 @@ int simd_compare(const byte* a, const byte* b, uint32 size, uint32 steps = 8) {
             __mmask64 result_mask;
 
             for (; i <= size - 64; i += 64) {  // 64 bytes per iteration
-                a_16 = _mm512_loadu_si512((__m512i*) a);
-                b_16 = _mm512_loadu_si512((__m512i*) b);
+                a_16 = _mm512_load_si512((__m512i*) a);
+                b_16 = _mm512_load_si512((__m512i*) b);
 
                 result_mask = _mm512_cmpeq_epi8_mask(a_16, b_16);
 
@@ -905,8 +905,8 @@ int simd_compare(const byte* a, const byte* b, uint32 size, uint32 steps = 8) {
             __m256i result_8;
 
             for (; i <= size - steps; i += steps) {
-                a_8 = _mm256_loadu_si256((__m256i*) a);
-                b_8 = _mm256_loadu_si256((__m256i*) b);
+                a_8 = _mm256_load_si256((__m256i*) a);
+                b_8 = _mm256_load_si256((__m256i*) b);
 
                 result_8 = _mm256_cmpeq_epi8(a_8, b_8);
 
@@ -929,8 +929,8 @@ int simd_compare(const byte* a, const byte* b, uint32 size, uint32 steps = 8) {
             __m128i result_4;
 
             for (; i <= size - steps; i += steps) {
-                a_4 = _mm_loadu_si128((__m128i*) a);
-                b_4 = _mm_loadu_si128((__m128i*) b);
+                a_4 = _mm_load_si128((__m128i*) a);
+                b_4 = _mm_load_si128((__m128i*) b);
 
                 result_4 = _mm_cmpeq_epi8(a_4, b_4);
 
diff --git a/stdlib/simd/SIMD_SVML.h b/stdlib/simd/SIMD_SVML.h
index f61fae6..e863957 100644
--- a/stdlib/simd/SIMD_SVML.h
+++ b/stdlib/simd/SIMD_SVML.h
@@ -25,7 +25,7 @@
             result[i] = a_array[i] / b_array[i];
         }
 
-        return _mm_loadu_si128((__m128i*)result);
+        return _mm_load_si128((__m128i*)result);
     }
 
     inline __m256i _mm256_div_epi32(__m256i a, __m256i b) {
@@ -38,7 +38,7 @@
             result[i] = a_array[i] / b_array[i];
         }
 
-        return _mm256_loadu_si256((__m256i*)result);
+        return _mm256_load_si256((__m256i*)result);
     }
 
     inline __m512i _mm512_div_epi32(__m512i a, __m512i b) {
@@ -51,7 +51,7 @@
             result[i] = a_array[i] / b_array[i];
         }
 
-        return _mm512_loadu_si512((__m512i*)result);
+        return _mm512_load_si512((__m512i*)result);
     }
 
     inline __m128 _mm_sin_ps(__m128 a) {
@@ -60,7 +60,7 @@
         for (int i = 0; i < 4; ++i) {
             result[i] = sinf(a_array[i]);
         }
-        return _mm_loadu_ps(result);
+        return _mm_load_ps(result);
     }
 
     inline __m128 _mm_cos_ps(__m128 a) {
@@ -69,7 +69,7 @@
         for (int i = 0; i < 4; ++i) {
             result[i] = cosf(a_array[i]);
         }
-        return _mm_loadu_ps(result);
+        return _mm_load_ps(result);
     }
 
     inline __m128 _mm_asin_ps(__m128 a) {
@@ -78,7 +78,7 @@
         for (int i = 0; i < 4; ++i) {
             result[i] = asinf(a_array[i]);
         }
-        return _mm_loadu_ps(result);
+        return _mm_load_ps(result);
     }
 
     inline __m128 _mm_acos_ps(__m128 a) {
@@ -87,7 +87,7 @@
         for (int i = 0; i < 4; ++i) {
             result[i] = acosf(a_array[i]);
         }
-        return _mm_loadu_ps(result);
+        return _mm_load_ps(result);
     }
 
     inline __m256 _mm256_sin_ps(__m256 a) {
@@ -96,7 +96,7 @@
         for (int i = 0; i < 8; ++i) {
             result[i] = sinf(a_array[i]);
         }
-        return _mm256_loadu_ps(result);
+        return _mm256_load_ps(result);
     }
 
     inline __m256 _mm256_cos_ps(__m256 a) {
@@ -105,7 +105,7 @@
         for (int i = 0; i < 8; ++i) {
             result[i] = cosf(a_array[i]);
         }
-        return _mm256_loadu_ps(result);
+        return _mm256_load_ps(result);
     }
 
     inline __m256 _mm256_asin_ps(__m256 a) {
@@ -114,7 +114,7 @@
         for (int i = 0; i < 8; ++i) {
             result[i] = asinf(a_array[i]);
         }
-        return _mm256_loadu_ps(result);
+        return _mm256_load_ps(result);
     }
 
     inline __m256 _mm256_acos_ps(__m256 a) {
@@ -123,7 +123,7 @@
         for (int i = 0; i < 16; ++i) {
             result[i] = acosf(a_array[i]);
         }
-        return _mm256_loadu_ps(result);
+        return _mm256_load_ps(result);
     }
 
     inline __m512 _mm512_sin_ps(__m512 a) {
@@ -132,7 +132,7 @@
         for (int i = 0; i < 16; ++i) {
             result[i] = sinf(a_array[i]);
         }
-        return _mm512_loadu_ps(result);
+        return _mm512_load_ps(result);
     }
 
     inline __m512 _mm512_cos_ps(__m512 a) {
@@ -141,7 +141,7 @@
         for (int i = 0; i < 16; ++i) {
             result[i] = cosf(a_array[i]);
         }
-        return _mm512_loadu_ps(result);
+        return _mm512_load_ps(result);
     }
 
     inline __m512 _mm512_asin_ps(__m512 a) {
@@ -150,7 +150,7 @@
         for (int i = 0; i < 16; ++i) {
             result[i] = asinf(a_array[i]);
         }
-        return _mm512_loadu_ps(result);
+        return _mm512_load_ps(result);
     }
 
     inline __m512 _mm512_acos_ps(__m512 a) {
@@ -159,7 +159,7 @@
         for (int i = 0; i < 16; ++i) {
             result[i] = acosf(a_array[i]);
         }
-        return _mm512_loadu_ps(result);
+        return _mm512_load_ps(result);
     }
 #endif
 
diff --git a/ui/UIAttribute.h b/ui/UIAttribute.h
index f99257d..e241e24 100644
--- a/ui/UIAttribute.h
+++ b/ui/UIAttribute.h
@@ -96,9 +96,23 @@ enum UIAttributeType {
 
 UIAttribute* ui_attribute_from_group(UIAttributeGroup* group, UIAttributeType type)
 {
-    for (int i = 0; i < UI_ATTRIBUTE_TYPE_SIZE && i <= type; ++i) {
-        if (group->attributes[i].attribute_id == type) {
-            return &group->attributes[i];
+    if (!group->attributes) {
+        return NULL;
+    }
+
+    int32 left = 0;
+    int32 right = type;
+
+    // Binary search since attributes are sorted by attribute_id
+    while (left <= right) {
+        int32 mid = left + (right - left) / 2;
+
+        if (group->attributes[mid].attribute_id == type) {
+            return &group->attributes[mid];
+        }  else if (group->attributes[mid].attribute_id < type) {
+            left = mid + 1;
+        }  else {
+            right = mid - 1;
         }
     }
 
@@ -199,98 +213,4 @@ constexpr const char* ui_attribute_type_to_string_const(UIAttributeType e)
     return NULL;
 }
 
-const char* ui_attribute_type_to_string(UIAttributeType e)
-{
-    switch (e) {
-        case UI_ATTRIBUTE_TYPE_TYPE:
-            return "type";
-        case UI_ATTRIBUTE_TYPE_STYLE:
-            return "style";
-        case UI_ATTRIBUTE_TYPE_DIMENSION_X:
-            return "x";
-        case UI_ATTRIBUTE_TYPE_DIMENSION_Y:
-            return "y";
-        case UI_ATTRIBUTE_TYPE_DIMENSION_WIDTH:
-            return "width";
-        case UI_ATTRIBUTE_TYPE_DIMENSION_HEIGHT:
-            return "height";
-        case UI_ATTRIBUTE_TYPE_FONT_NAME:
-            return "font_name";
-        case UI_ATTRIBUTE_TYPE_FONT_COLOR:
-            return "font_color";
-        case UI_ATTRIBUTE_TYPE_FONT_SIZE:
-            return "font_size";
-        case UI_ATTRIBUTE_TYPE_FONT_WEIGHT:
-            return "font_weight";
-        case UI_ATTRIBUTE_TYPE_FONT_LINE_HEIGHT:
-            return "font_line_height";
-        case UI_ATTRIBUTE_TYPE_ALIGN_H:
-            return "align_h";
-        case UI_ATTRIBUTE_TYPE_ALIGN_V:
-            return "align_v";
-        case UI_ATTRIBUTE_TYPE_ZINDEX:
-            return "zindex";
-        case UI_ATTRIBUTE_TYPE_BACKGROUND_COLOR:
-            return "background_color";
-        case UI_ATTRIBUTE_TYPE_BACKGROUND_IMG:
-            return "background_img";
-        case UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_OPACITY:
-            return "background_img_opacity";
-        case UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_V:
-            return "background_img_position_v";
-        case UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_H:
-            return "background_img_position_h";
-        case UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_STYLE:
-            return "background_img_style";
-        case UI_ATTRIBUTE_TYPE_BORDER_COLOR:
-            return "border_color";
-        case UI_ATTRIBUTE_TYPE_BORDER_WIDTH:
-            return "border_width";
-        case UI_ATTRIBUTE_TYPE_BORDER_TOP_COLOR:
-            return "border_top_color";
-        case UI_ATTRIBUTE_TYPE_BORDER_TOP_WIDTH:
-            return "border_top_width";
-        case UI_ATTRIBUTE_TYPE_BORDER_RIGHT_COLOR:
-            return "border_right_color";
-        case UI_ATTRIBUTE_TYPE_BORDER_RIGHT_WIDTH:
-            return "border_right_width";
-        case UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_COLOR:
-            return "border_bottom_color";
-        case UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_WIDTH:
-            return "border_bottom_width";
-        case UI_ATTRIBUTE_TYPE_BORDER_LEFT_COLOR:
-            return "border_left_color";
-        case UI_ATTRIBUTE_TYPE_BORDER_LEFT_WIDTH:
-            return "border_left_width";
-        case UI_ATTRIBUTE_TYPE_PADDING:
-            return "padding";
-        case UI_ATTRIBUTE_TYPE_PADDING_TOP:
-            return "padding_top";
-        case UI_ATTRIBUTE_TYPE_PADDING_RIGHT:
-            return "padding_right";
-        case UI_ATTRIBUTE_TYPE_PADDING_BOTTOM:
-            return "padding_bottom";
-        case UI_ATTRIBUTE_TYPE_PADDING_LEFT:
-            return "padding_left";
-        case UI_ATTRIBUTE_TYPE_SHADOW_INNER_COLOR:
-            return "shadow_inner_color";
-        case UI_ATTRIBUTE_TYPE_SHADOW_INNER_ANGLE:
-            return "shadow_inner_angle";
-        case UI_ATTRIBUTE_TYPE_SHADOW_INNER_DISTANCE:
-            return "shadow_inner_distance";
-        case UI_ATTRIBUTE_TYPE_SHADOW_OUTER_COLOR:
-            return "shadow_outer_color";
-        case UI_ATTRIBUTE_TYPE_SHADOW_OUTER_ANGLE:
-            return "shadow_outer_angle";
-        case UI_ATTRIBUTE_TYPE_SHADOW_OUTER_DISTANCE:
-            return "shadow_outer_distance";
-        case UI_ATTRIBUTE_TYPE_TRANSITION_ANIMATION:
-            return "transition_animation";
-        case UI_ATTRIBUTE_TYPE_TRANSITION_DURATION:
-            return "transition_duration";
-    }
-
-    return NULL;
-}
-
 #endif
\ No newline at end of file
diff --git a/ui/UIElement.h b/ui/UIElement.h
index 643c42f..4a1a7c5 100644
--- a/ui/UIElement.h
+++ b/ui/UIElement.h
@@ -5,6 +5,9 @@
 #include "UIElementType.h"
 #include "../object/Vertex.h"
 
+#include <immintrin.h>
+#include <xmmintrin.h>
+
 struct UIElementDimension {
 	int16 x1;
 	int16 y1;
@@ -22,6 +25,7 @@ struct UIElement {
     const char* name;
     int32 id;
     UIElementType type;
+    bool is_dynamic;
 
     int16 window_id;
     int16 panel_id;
diff --git a/ui/UIElementType.h b/ui/UIElementType.h
index 93eac71..df81443 100644
--- a/ui/UIElementType.h
+++ b/ui/UIElementType.h
@@ -53,36 +53,4 @@ constexpr const char* ui_element_type_to_string_const(UIElementType e)
     return NULL;
 }
 
-const char* ui_element_type_to_string(UIElementType e)
-{
-    switch (e) {
-        case UI_ELEMENT_TYPE_BUTTON:
-            return "button";
-        case UI_ELEMENT_TYPE_SELECT:
-            return "select";
-        case UI_ELEMENT_TYPE_DROPDOWN:
-            return "dropdown";
-        case UI_ELEMENT_TYPE_TEXTFIELD:
-            return "textfield";
-        case UI_ELEMENT_TYPE_TEXTAREA:
-            return "textarea";
-        case UI_ELEMENT_TYPE_IMAGE:
-            return "image";
-        case UI_ELEMENT_TYPE_TEXT:
-            return "text";
-        case UI_ELEMENT_TYPE_LINK:
-            return "link";
-        case UI_ELEMENT_TYPE_TABLE:
-            return "table";
-        case UI_ELEMENT_TYPE_VIEW_WINDOW:
-            return "view_window";
-        case UI_ELEMENT_TYPE_VIEW_PANEL:
-            return "view_panel";
-        case UI_ELEMENT_TYPE_VIEW_TAB:
-            return "view_tab";
-    }
-
-    return NULL;
-}
-
 #endif
\ No newline at end of file
diff --git a/ui/UILayout.h b/ui/UILayout.h
index 00657b6..878de95 100644
--- a/ui/UILayout.h
+++ b/ui/UILayout.h
@@ -7,30 +7,88 @@
 
 // Modified for every scene
 struct UILayout {
-    int32 ui_deadzone_size = 5;
-    UIElementDimension ui_deadzone[5];
+    // This array has the size of the game window and represents in color codes where interactible ui elements are
+    // Size is based on screen size (we don't need full screen size since we assume an interactible element is at least 4 pixels width and height)
+    //      width = 25% of screen size
+    //      height = 25% of screen size
+    uint16 width;
+    uint16 height;
 
-    int32 element_hoverable_size;
-    int32 element_hoverable_pos;
-	UIElementDimension* elements_hoverable;
+    // Contains all UI elements also dynamic ones (e.g. movable windows)
+    uint32* ui_chroma_codes;
 
-    int32 element_interactible_size;
-    int32 element_interactible_pos;
-	UIElementDimension* elements_interactible;
+    // Contains constant UI elements that usually don't change (e.g. HUD)
+    uint32* ui_chroma_codes_static;
 
-    // @question Since we use a hashmap below, do we even need the size?
-    //      Isn't the size exactly the same as the hash_map buf size
-    int32 element_size;
-    int32 element_pos;
-    HashMap hash_map; // Used to directly find element by name
-
-    // @question Do we even need this or should the hashmap values be the elements directly?
-    //  In other places (e.g. theme) we simply define a byte* data variable which actually holds the info.
-    UIElement* elements;
+    // Used to directly find element by name
+    // The values are the UIElements
+    HashMap hash_map;
 
     int32 vertex_size;
     int32 vertex_pos;
     Vertex3DTextureColorIndex* vertices;
 };
 
+// Returns the chroma code stored for the screen position (x, y)
+// The chroma grid holds one cell per 4x4 screen pixels, so both coordinates are scaled down before indexing
+// NOTE(review): no bounds check - x / 4 and y / 4 are assumed to be inside width and height
+inline
+uint32 layout_element_from_location(UILayout* layout, uint16 x, uint16 y)
+{
+    // The row index must be scaled before it is multiplied with the row length,
+    // width * y / 4 would land inside the wrong row whenever y is not a multiple of 4
+    return layout->ui_chroma_codes[layout->width * (y / 4) + x / 4];
+}
+
+// Writes the id of every element whose is_dynamic flag equals is_dynamic into the chroma grid codes
+// The complete bucket chain is checked, a filtered bucket head doesn't hide the entries chained behind it
+// NOTE(review): the element dimensions are not clamped to the grid size - confirm they can't leave the window
+// @question Are the dimension values below even absolute? They may be in relation to the parent?!
+inline
+void layout_chroma_codes_fill(UILayout* layout, uint32* codes, bool is_dynamic)
+{
+    for (int32 i = 0; i < layout->hash_map.buf.count; ++i) {
+        for (HashEntry* entry = (HashEntry *) layout->hash_map.table[i]; entry; entry = entry->next) {
+            UIElement* element = (UIElement *) entry->value;
+            if (element->is_dynamic != is_dynamic) {
+                continue;
+            }
+
+            // Screen coordinates -> grid cells
+            int32 y_start = element->dimension.y1 / 4;
+            int32 y_end = element->dimension.y2 / 4;
+            int32 x_start = element->dimension.x1 / 4;
+            int32 x_end = element->dimension.x2 / 4;
+
+            for (int32 y = y_start; y < y_end; ++y) {
+                int32 y_offset = layout->width * y;
+                for (int32 x = x_start; x < x_end; ++x) {
+                    codes[y_offset + x] = (uint32) element->id;
+                }
+            }
+        }
+    }
+}
+
+// Rebuilds ui_chroma_codes from the static snapshot plus all dynamic elements (e.g. movable windows)
+// This function should only get called if the location of a UI Element changes
+// @performance How to handle moving elements (= dragging a window). We don't want to update this while dragging!
+void layout_chroma_codes_update(UILayout* layout)
+{
+    // Reset all, the static elements are already part of the snapshot
+    memcpy(layout->ui_chroma_codes, layout->ui_chroma_codes_static, layout->width * layout->height * sizeof(uint32));
+
+    layout_chroma_codes_fill(layout, layout->ui_chroma_codes, true);
+}
+
+// Rebuilds ui_chroma_codes_static, the snapshot of all non-dynamic elements (e.g. HUD)
+// ui_chroma_codes is not touched here, it only picks up the new snapshot on the next layout_chroma_codes_update()
+void layout_chroma_codes_update_static(UILayout* layout)
+{
+    // Reset all
+    memset(layout->ui_chroma_codes_static, 0, layout->width * layout->height * sizeof(uint32));
+
+    layout_chroma_codes_fill(layout, layout->ui_chroma_codes_static, false);
+}
+
 #endif
\ No newline at end of file
diff --git a/ui/UITheme.h b/ui/UITheme.h
index 6237de7..0627b00 100644
--- a/ui/UITheme.h
+++ b/ui/UITheme.h
@@ -66,6 +66,11 @@ inline
 UIAttributeGroup* theme_style_group(UIThemeStyle* theme, const char* group_name)
 {
     HashEntryInt64* entry = (HashEntryInt64 *) hashmap_get_entry(&theme->hash_map, group_name);
+    if (!entry) {
+        ASSERT_SIMPLE(false);
+        return NULL;
+    }
+
     return (UIAttributeGroup *) (theme->data + entry->value);
 }
 
@@ -73,6 +78,11 @@ inline
 UIAttributeGroup* theme_style_group(UIThemeStyle* theme, const char* group_name, int32 group_id)
 {
     HashEntryInt64* entry = (HashEntryInt64 *) hashmap_get_entry(&theme->hash_map, group_name, group_id);
+    if (!entry) {
+        ASSERT_SIMPLE(false);
+        return NULL;
+    }
+
     return (UIAttributeGroup *) (theme->data + entry->value);
 }
 
@@ -229,7 +239,7 @@ void theme_from_file_txt(
 
             *temp = '\0';
             for (int32 j = 0; j < UI_ELEMENT_TYPE_SIZE; ++j) {
-                if (strcmp(str, ui_element_type_to_string((UIElementType) j)) == 0) {
+                if (strcmp(str, ui_element_type_to_string_const((UIElementType) j)) == 0) {
 
                     attribute.value_int = j;
                     break;
@@ -246,13 +256,11 @@ void theme_from_file_txt(
             }
 
             *temp = '\0';
-            ++pos;
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_FONT_COLOR), attribute_name) == 0) {
             ++pos; // Skip '#'
 
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_FONT_COLOR;
             uint32 value = (uint32) strtoul(pos, &pos, 16);
-            pos += 4;
 
             attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f;
             attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f;
@@ -260,28 +268,27 @@ void theme_from_file_txt(
             attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f;
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_FONT_SIZE), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_FONT_SIZE;
-            attribute.value_float = strtof(pos, &pos); ++pos;
+            attribute.value_float = strtof(pos, &pos);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_FONT_WEIGHT), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_FONT_WEIGHT;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_FONT_LINE_HEIGHT), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_FONT_LINE_HEIGHT;
-            attribute.value_float = strtof(pos, &pos); ++pos;
+            attribute.value_float = strtof(pos, &pos);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_ALIGN_H), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_ALIGN_H;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_ALIGN_V), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_ALIGN_V;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_ZINDEX), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_ZINDEX;
-            attribute.value_float = SWAP_ENDIAN_LITTLE(strtof(pos, &pos)); ++pos;
+            attribute.value_float = SWAP_ENDIAN_LITTLE(strtof(pos, &pos));
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_COLOR), attribute_name) == 0) {
             ++pos; // Skip '#'
 
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_COLOR;
             uint32 value = (uint32) strtoul(pos, &pos, 16);
-            pos += 4;
 
             attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f;
             attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f;
@@ -298,22 +305,21 @@ void theme_from_file_txt(
             attribute.value_str[i] = '\0';
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_OPACITY), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_OPACITY;
-            attribute.value_float = SWAP_ENDIAN_LITTLE(strtof(pos, &pos)); ++pos;
+            attribute.value_float = SWAP_ENDIAN_LITTLE(strtof(pos, &pos));
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_V), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_V;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_H), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_H;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_STYLE), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_STYLE;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_COLOR), attribute_name) == 0) {
             ++pos; // Skip '#'
 
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_COLOR;
             uint32 value = (uint32) strtoul(pos, &pos, 16);
-            pos += 4;
 
             attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f;
             attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f;
@@ -321,13 +327,12 @@ void theme_from_file_txt(
             attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f;
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_WIDTH), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_WIDTH;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_TOP_COLOR), attribute_name) == 0) {
             ++pos; // Skip '#'
 
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_TOP_COLOR;
             uint32 value = (uint32) strtoul(pos, &pos, 16);
-            pos += 4;
 
             attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f;
             attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f;
@@ -335,13 +340,12 @@ void theme_from_file_txt(
             attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f;
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_TOP_WIDTH), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_TOP_WIDTH;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_RIGHT_COLOR), attribute_name) == 0) {
             ++pos; // Skip '#'
 
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_RIGHT_COLOR;
             uint32 value = (uint32) strtoul(pos, &pos, 16);
-            pos += 4;
 
             attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f;
             attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f;
@@ -349,13 +353,12 @@ void theme_from_file_txt(
             attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f;
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_RIGHT_WIDTH), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_RIGHT_WIDTH;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_COLOR), attribute_name) == 0) {
             ++pos; // Skip '#'
 
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_COLOR;
             uint32 value = (uint32) strtoul(pos, &pos, 16);
-            pos += 4;
 
             attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f;
             attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f;
@@ -363,13 +366,12 @@ void theme_from_file_txt(
             attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f;
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_WIDTH), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_WIDTH;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_LEFT_COLOR), attribute_name) == 0) {
             ++pos; // Skip '#'
 
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_LEFT_COLOR;
             uint32 value = (uint32) strtoul(pos, &pos, 16);
-            pos += 4;
 
             attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f;
             attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f;
@@ -377,28 +379,27 @@ void theme_from_file_txt(
             attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f;
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_LEFT_WIDTH), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_LEFT_WIDTH;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_PADDING), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_PADDING;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_PADDING_TOP), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_PADDING_TOP;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_PADDING_RIGHT), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_PADDING_RIGHT;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_PADDING_BOTTOM), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_PADDING_BOTTOM;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_PADDING_LEFT), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_PADDING_LEFT;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_INNER_COLOR), attribute_name) == 0) {
             ++pos; // Skip '#'
 
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_INNER_COLOR;
             uint32 value = (uint32) strtoul(pos, &pos, 16);
-            pos += 4;
 
             attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f;
             attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f;
@@ -406,16 +407,15 @@ void theme_from_file_txt(
             attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f;
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_INNER_ANGLE), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_INNER_ANGLE;
-            attribute.value_float = strtof(pos, &pos); ++pos;
+            attribute.value_float = strtof(pos, &pos);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_INNER_DISTANCE), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_INNER_DISTANCE;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_OUTER_COLOR), attribute_name) == 0) {
             ++pos; // Skip '#'
 
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_OUTER_COLOR;
             uint32 value = (uint32) strtoul(pos, &pos, 16);
-            pos += 4;
 
             attribute.value_v4_f32.r = (f32) ((value >> 24) & 0xFF) / 255.0f;
             attribute.value_v4_f32.g = (f32) ((value >> 16) & 0xFF) / 255.0f;
@@ -423,16 +423,16 @@ void theme_from_file_txt(
             attribute.value_v4_f32.a = (f32) (value & 0xFF) / 255.0f;
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_OUTER_ANGLE), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_OUTER_ANGLE;
-            attribute.value_float = strtof(pos, &pos); ++pos;
+            attribute.value_float = strtof(pos, &pos);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_OUTER_DISTANCE), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_OUTER_DISTANCE;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_TRANSITION_ANIMATION), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_TRANSITION_ANIMATION;
-            attribute.value_int = strtoul(pos, &pos, 10); ++pos;
+            attribute.value_int = strtoul(pos, &pos, 10);
         } else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_TRANSITION_DURATION), attribute_name) == 0) {
             attribute.attribute_id = UI_ATTRIBUTE_TYPE_TRANSITION_DURATION;
-            attribute.value_float = strtof(pos, &pos); ++pos;
+            attribute.value_float = strtof(pos, &pos);
         } else {
             while (*pos != '\n' && *pos != '\0') {
                 ++pos;
@@ -487,12 +487,20 @@ void theme_from_file(
     // Of course we still need to populate the data using hashmap_load()
     // The value is a int64 (because this is the value of the chunk buffer size but the hashmap only allows int32)
     hashmap_create(&theme->hash_map, (int32) SWAP_ENDIAN_LITTLE(*((uint64 *) pos)), sizeof(HashEntryInt64), theme->data);
+
+    const byte* start = theme->hash_map.buf.memory;
     pos += hashmap_load(&theme->hash_map, pos);
 
     // theme data
     // Layout: first load the size of the group, then load the individual attributes
     for (int32 i = 0; i < theme->hash_map.buf.count; ++i) {
+        if (!theme->hash_map.table[i]) {
+            continue;
+        }
+
         HashEntryInt64* entry = (HashEntryInt64 *) theme->hash_map.table[i];
+
+        pos = start + entry->value;
         UIAttributeGroup* group = (UIAttributeGroup *) (theme->data + entry->value);
 
         group->attribute_size = SWAP_ENDIAN_LITTLE(*((int32 *) pos));
@@ -501,6 +509,21 @@ void theme_from_file(
         // @performance The UIAttribute contains a char array which makes this WAY larger than it needs to be in 99% of the cases
         memcpy(group->attributes, pos, group->attribute_size * sizeof(UIAttribute));
         pos += group->attribute_size * sizeof(UIAttribute);
+
+        // load the remaining entries linked to this slot via next
+        while (entry->next) {
+            // advance first: the head entry was already loaded above
+            entry = entry->next;
+            pos = start + entry->value;
+            group = (UIAttributeGroup *) (theme->data + entry->value);
+
+            group->attribute_size = SWAP_ENDIAN_LITTLE(*((int32 *) pos));
+            pos += sizeof(group->attribute_size);
+
+            // @performance The UIAttribute contains a char array which makes this WAY larger than it needs to be in 99% of the cases
+            memcpy(group->attributes, pos, group->attribute_size * sizeof(UIAttribute));
+            pos += group->attribute_size * sizeof(UIAttribute);
+        }
     }
 }
 
@@ -541,7 +564,7 @@ void theme_to_file(
     //      I also don't want to add a size variable to the theme as it is useless in all other cases
     file.size = theme_size(theme);
 
-    file.content = ring_get_memory(ring, file.size, 64);
+    file.content = ring_get_memory(ring, file.size, 64, true);
     byte* pos = file.content;
 
     // version
@@ -549,12 +572,19 @@ void theme_to_file(
     pos += sizeof(theme->version);
 
     // hashmap
+    byte* start = pos;
     pos += hashmap_dump(&theme->hash_map, pos);
 
     // theme data
     // Layout: first save the size of the group, then save the individual attributes
     for (int32 i = 0; i < theme->hash_map.buf.count; ++i) {
+        if (!theme->hash_map.table[i]) {
+            continue;
+        }
+
         HashEntryInt64* entry = (HashEntryInt64 *) theme->hash_map.table[i];
+
+        pos = start + entry->value;
         UIAttributeGroup* group = (UIAttributeGroup *) (theme->data + entry->value);
 
         *((int32 *) pos) = SWAP_ENDIAN_LITTLE(group->attribute_size);
@@ -562,7 +592,22 @@ void theme_to_file(
 
         // @performance The UIAttribute contains a char array which makes this WAY larger than it needs to be in 99% of the cases
         memcpy(pos, group->attributes, group->attribute_size * sizeof(UIAttribute));
-        pos += group->attribute_size * sizeof(UIAttribute);
+        pos += sizeof(UIAttribute);
+
+        // save all the next elements
+        while (entry->next) {
+            pos = start + entry->value;
+            group = (UIAttributeGroup *) (theme->data + entry->value);
+
+            *((int32 *) pos) = SWAP_ENDIAN_LITTLE(group->attribute_size);
+            pos += sizeof(group->attribute_size);
+
+            // @performance The UIAttribute contains a char array which makes this WAY larger than it needs to be in 99% of the cases
+            memcpy(pos, group->attributes, group->attribute_size * sizeof(UIAttribute));
+            pos += sizeof(UIAttribute);
+
+            entry = entry->next;
+        }
     }
 
     file.size = pos - file.content;
diff --git a/utils/StringUtils.h b/utils/StringUtils.h
index cb71f4f..1c5d667 100644
--- a/utils/StringUtils.h
+++ b/utils/StringUtils.h
@@ -176,7 +176,7 @@ void wchar_to_char(const wchar_t* __restrict src, char* __restrict dest, int32 l
     *dest = '\0';
 }
 
-inline
+inline constexpr
 int32 str_to_int(const char *str)
 {
     int32 result = 0;
@@ -197,7 +197,47 @@ int32 str_to_int(const char *str)
     return result * sign;
 }
 
-inline size_t str_count(const char* __restrict str, const char* __restrict substr)
+// Writes number as decimal text (NUL terminated) into str, using thousands as
+// the 3-digit group separator unless it is '\0'. Returns the last char's index.
+inline constexpr
+int32 int_to_str(int64 number, char *str, const char thousands = ',') {
+    int32 i = 0;
+    int32 digit_count = 0;
+
+    // Negate in unsigned space so the most negative int64 does not overflow
+    uint64 magnitude = number < 0 ? 0 - (uint64) number : (uint64) number;
+    if (magnitude == 0) {
+        str[i++] = '0';
+    }
+
+    // Digits are written least significant first and reversed below
+    while (magnitude > 0) {
+        if (thousands && digit_count > 0 && digit_count % 3 == 0) {
+            str[i++] = thousands;
+        }
+
+        str[i++] = (char) (magnitude % 10 + '0');
+        magnitude /= 10;
+        ++digit_count;
+    }
+
+    if (number < 0) {
+        str[i++] = '-';
+    }
+
+    str[i] = '\0';
+
+    for (int32 j = 0, k = i - 1; j < k; ++j, --k) {
+        char temp = str[j];
+        str[j] = str[k];
+        str[k] = temp;
+    }
+
+    return i - 1;
+}
+
+inline
+size_t str_count(const char* __restrict str, const char* __restrict substr)
 {
     size_t l1 = strlen(str);
     size_t l2 = strlen(substr);
@@ -296,45 +336,7 @@ char* strtok(char* str, const char* __restrict delim, char* *key) {
     return result;
 }
 
-inline
-int32 int_to_str(int64 number, char *str, const char thousands = ',') {
-    int32 i = 0;
-    int64 sign = number;
-    int32 digit_count = 0;
-
-    if (number == 0) {
-        str[i++] = '0';
-    } else if (number < 0) {
-        number = -number;
-    }
-
-    while (number > 0) {
-        if (thousands
-            && (digit_count == 3 || digit_count == 6 || digit_count == 9 || digit_count == 12 || digit_count == 15)
-        ) {
-            str[i++] = thousands;
-        }
-
-        str[i++] = number % 10 + '0';
-        number /= 10;
-        ++digit_count;
-    }
-
-    if (sign < 0) {
-        str[i++] = '-';
-    }
-
-    str[i] = '\0';
-
-    for (int32 j = 0, k = i - 1; j < k; ++j, --k) {
-        char temp = str[j];
-        str[j] = str[k];
-        str[k] = temp;
-    }
-
-    return i - 1;
-}
-
+inline constexpr
 char toupper_ascii(char c)
 {
     return c >= 'a' && c <= 'z'
@@ -342,6 +344,7 @@ char toupper_ascii(char c)
         : c;
 }
 
+inline constexpr
 char tolower_ascii(char c)
 {
     return c >= 'A' && c <= 'Z'
@@ -349,6 +352,7 @@ char tolower_ascii(char c)
         : c;
 }
 
+inline constexpr
 void create_const_name(const unsigned char* name, char* modified_name)
 {
     // Print block
@@ -365,6 +369,7 @@ void create_const_name(const unsigned char* name, char* modified_name)
     }
 }
 
+inline constexpr
 bool str_ends_with(const char* str, const char* suffix) {
     if (!str || !suffix) {
         return false;
@@ -431,4 +436,16 @@ void print_bytes(const void* ptr, size_t size)
     }
 }
 
+// Returns the byte length of the line break starting at str:
+// 1 for "\n", 2 for "\r\n", 0 when str does not start with a line break.
+inline constexpr
+int32 is_eol(const char* str)
+{
+    if (str[0] == '\n') {
+        return 1;
+    }
+
+    return (str[0] == '\r' && str[1] == '\n') ? 2 : 0;
+}
+
 #endif
\ No newline at end of file