Huge rewrite of the AMS; implementation of the command buffer, ...

This commit is contained in:
Dennis Eichhorn 2025-01-07 20:47:16 +01:00
parent 2521f5f2e4
commit 2ecb47117b
94 changed files with 3089 additions and 1472 deletions

View File

@ -24,6 +24,234 @@ f32 lerp(f32 a, f32 b, f32 t)
f32 smoothstep(f32 t) {
return t * t * (3 - 2 * t);
}
inline
f32 anim_discrete(f32 t) {
    // Step easing: stay at the start value until the animation completes.
    if (t >= 1.0f) {
        return 1.0f;
    }
    return 0.0f;
}
inline
f32 anim_ease_linear(f32 t) {
    // Identity easing: progress maps 1:1 to output.
    return t;
}
inline
f32 anim_ease_in_sine(f32 t) {
    // Sine ease-in: slow start, accelerating (quarter cosine wave).
    const f32 angle = (t * OMS_PI) / 2.0f;
    return 1.0f - cosf(angle);
}
inline
f32 anim_ease_out_sine(f32 t) {
    // Sine ease-out: fast start, decelerating (quarter sine wave).
    const f32 angle = (t * OMS_PI) / 2.0f;
    return sinf(angle);
}
inline
f32 anim_ease_in_out_sine(f32 t) {
    // Sine ease-in-out: half cosine wave remapped onto [0, 1].
    const f32 wave = cosf(OMS_PI * t) - 1;
    return -wave / 2.0f;
}
inline
f32 anim_ease_in_quad(f32 t) {
    // Quadratic ease-in: output grows with the square of progress.
    const f32 squared = t * t;
    return squared;
}
inline
f32 anim_ease_out_quad(f32 t) {
    // Quadratic ease-out: ease-in mirrored around the midpoint.
    const f32 remaining = 1.0f - t;
    return 1.0f - remaining * remaining;
}
inline
f32 anim_ease_in_out_quad(f32 t) {
    // Quadratic ease-in-out: accelerate in the first half, decelerate in the second.
    if (t < 0.5f) {
        return 2 * t * t;
    }
    return 1.0f - powf(-2 * t + 2, 2) / 2.0f;
}
inline
f32 anim_ease_in_cubic(f32 t) {
    // Cubic ease-in: output grows with the cube of progress.
    const f32 cubed = t * t * t;
    return cubed;
}
inline
f32 anim_ease_out_cubic(f32 t) {
    // Cubic ease-out: decelerating cubic curve.
    const f32 remaining = 1.0f - t;
    return 1.0f - powf(remaining, 3);
}
inline
f32 anim_ease_in_out_cubic(f32 t) {
    // Cubic ease-in-out: accelerate in the first half, decelerate in the second.
    if (t < 0.5f) {
        return 4 * t * t * t;
    }
    return 1.0f - powf(-2 * t + 2, 3) / 2.0f;
}
inline
f32 anim_ease_in_quart(f32 t) {
    // Quartic ease-in: output grows with the fourth power of progress.
    const f32 quart = t * t * t * t;
    return quart;
}
inline
f32 anim_ease_out_quart(f32 t) {
    // Quartic ease-out: decelerating fourth-power curve.
    const f32 remaining = 1.0f - t;
    return 1.0f - powf(remaining, 4);
}
inline
f32 anim_ease_in_perlin(f32 t) {
    // Perlin "smootherstep" fade curve: 6t^5 - 15t^4 + 10t^3 (Horner form).
    // NOTE(review): despite the _in_ name this curve eases both in AND out — confirm naming intent.
    const f32 poly = t * (t * 6 - 15) + 10;
    return t * t * t * poly;
}
inline
f32 anim_ease_in_out_quart(f32 t) {
    // Quartic ease-in-out: accelerate in the first half, decelerate in the second.
    if (t < 0.5f) {
        return 8 * t * t * t * t;
    }
    return 1.0f - powf(-2 * t + 2, 4) / 2.0f;
}
inline
f32 anim_ease_in_quint(f32 t) {
    // Quintic ease-in: output grows with the fifth power of progress.
    const f32 quint = t * t * t * t * t;
    return quint;
}
inline
f32 anim_ease_out_quint(f32 t) {
    // Quintic ease-out: decelerating fifth-power curve.
    const f32 remaining = 1.0f - t;
    return 1.0f - powf(remaining, 5);
}
inline
f32 anim_ease_in_out_quint(f32 t) {
    // Quintic ease-in-out: accelerate in the first half, decelerate in the second.
    if (t < 0.5f) {
        return 16 * t * t * t * t * t;
    }
    return 1.0f - powf(-2 * t + 2, 5) / 2.0f;
}
inline
f32 anim_ease_in_expo(f32 t) {
    // Exponential ease-in. t == 0 is special-cased so the curve starts
    // exactly at 0 (2^-10 would otherwise leak through).
    if (t == 0.0f) {
        return 0.0f;
    }
    return powf(2, 10 * t - 10);
}
inline
f32 anim_ease_out_expo(f32 t) {
    // Exponential ease-out. t == 1 is special-cased so the curve ends
    // exactly at 1 (1 - 2^-10 would otherwise leak through).
    if (t == 1.0f) {
        return 1.0f;
    }
    return 1.0f - powf(2, -10 * t);
}
inline
f32 anim_ease_in_out_expo(f32 t) {
    // Exponential ease-in-out; both endpoints are returned exactly.
    if (t == 0.0f || t == 1.0f) {
        return t;
    }
    if (t < 0.5f) {
        return powf(2, 20 * t - 10) / 2.0f;
    }
    return (2 - powf(2, -20 * t + 10)) / 2.0f;
}
inline
f32 anim_ease_in_circ(f32 t) {
    // Circular ease-in: quarter arc of the unit circle.
    // Squaring directly instead of powf(t, 2) avoids a needless libm call.
    return 1.0f - sqrtf(1.0f - t * t);
}
inline
f32 anim_ease_out_circ(f32 t) {
    // Circular ease-out: quarter arc of the unit circle.
    // Squaring directly instead of powf(x, 2) avoids a needless libm call.
    const f32 d = t - 1;
    return sqrtf(1.0f - d * d);
}
inline
f32 anim_ease_in_out_circ(f32 t) {
    // Circular ease-in-out.
    // Squaring directly instead of powf(x, 2) avoids needless libm calls.
    if (t < 0.5f) {
        const f32 a = 2 * t;
        return (1.0f - sqrtf(1.0f - a * a)) / 2.0f;
    }
    const f32 b = -2 * t + 2;
    return (sqrtf(1.0f - b * b) + 1) / 2.0f;
}
inline
f32 anim_ease_in_back(f32 t) {
    // Back ease-in: dips slightly below zero before accelerating.
    const f32 c1 = 1.70158f; // classic overshoot constant
    const f32 c3 = c1 + 1.0f;
    return c3 * t * t * t - c1 * t * t;
}
inline
f32 anim_ease_out_back(f32 t) {
    // Back ease-out: overshoots slightly past one before settling.
    const f32 c1 = 1.70158f; // classic overshoot constant
    const f32 c3 = c1 + 1.0f;
    return 1 + c3 * powf(t - 1, 3) + c1 * powf(t - 1, 2);
}
inline
f32 anim_ease_in_out_back(f32 t) {
    // Back ease-in-out: overshoots on both ends of the curve.
    const f32 c1 = 1.70158f;  // classic overshoot constant
    const f32 c2 = c1 * 1.525f; // amplified for the in-out variant
    if (t < 0.5f) {
        return (powf(2 * t, 2) * ((c2 + 1) * 2 * t - c2)) / 2.0f;
    }
    return (powf(2 * t - 2, 2) * ((c2 + 1) * (t * 2 - 2) + c2) + 2) / 2.0f;
}
inline
f32 anim_ease_in_elastic(f32 t) {
    // Elastic ease-in: exponentially growing sine oscillation.
    // Period of one third of the animation.
    const f32 c4 = OMS_TWO_PI / 3;
    // Endpoints are returned exactly to avoid residual oscillation.
    if (t == 0.0f || t == 1.0f) {
        return t;
    }
    const f32 amplitude = -powf(2, 10 * t - 10);
    return amplitude * sinf((t * 10 - 10.75f) * c4);
}
inline
f32 anim_ease_out_elastic(f32 t) {
    // Elastic ease-out: exponentially decaying sine oscillation.
    // Period of one third of the animation.
    const f32 c4 = OMS_TWO_PI / 3;
    // Endpoints are returned exactly to avoid residual oscillation.
    if (t == 0.0f || t == 1.0f) {
        return t;
    }
    const f32 amplitude = powf(2, -10 * t);
    return amplitude * sinf((t * 10 - 0.75f) * c4) + 1;
}
inline
f32 anim_ease_in_out_elastic(f32 t) {
    // Elastic ease-in-out: oscillation growing into the midpoint, decaying out of it.
    const f32 c5 = OMS_TWO_PI / 4.5f;
    // Endpoints are returned exactly to avoid residual oscillation.
    if (t == 0.0f || t == 1.0f) {
        return t;
    }
    if (t < 0.5f) {
        return -(powf(2, 20 * t - 10) * sinf((20 * t - 11.125f) * c5)) / 2.0f;
    }
    return (powf(2, -20 * t + 10) * sinf((20 * t - 11.125f) * c5)) / 2.0f + 1.0f;
}
inline
f32 anim_ease_out_bounce(f32 t) {
    // Piecewise parabolas emulating a ball bouncing to rest (4 segments).
    const f32 n1 = 7.5625f; // parabola steepness
    const f32 d1 = 2.75f;   // total duration in bounce units
    if (t < 1.0f / d1) {
        return n1 * t * t;
    }
    if (t < 2.0f / d1) {
        t -= 1.5f / d1;
        return n1 * t * t + 0.75f;
    }
    if (t < 2.5f / d1) {
        t -= 2.25f / d1;
        return n1 * t * t + 0.9375f;
    }
    t -= 2.625f / d1;
    return n1 * t * t + 0.984375f;
}
inline
f32 anim_ease_in_bounce(f32 t) {
    // Bounce ease-in: time-reversed mirror of the bounce-out curve.
    const f32 mirrored = anim_ease_out_bounce(1.0f - t);
    return 1.0f - mirrored;
}
inline
f32 anim_ease_in_out_bounce(f32 t) {
    // Bounce ease-in-out: bounce-in for the first half, bounce-out for the second,
    // each compressed into half the time and half the range.
    if (t < 0.5f) {
        return (1.0f - anim_ease_out_bounce(1.0f - 2.0f * t)) / 2.0f;
    }
    return (1.0f + anim_ease_out_bounce(2.0f * t - 1.0f)) / 2.0f;
}
f32 anim_ease(f32 t, AnimationEaseType type) {
switch(type) {
@ -125,233 +353,4 @@ f32 anim_ease(f32 t, AnimationEaseType type) {
}
}
inline
f32 anim_discrete(f32 t) {
return t >= 1.0f ? 1.0f : 0.0f;
}
inline
f32 anim_ease_linear(f32 t) {
return t;
}
inline
f32 anim_ease_in_sine(f32 t) {
return 1 - cosf((t * OMS_PI) / 2);
}
inline
f32 anim_ease_out_sine(f32 t) {
return sinf((t * OMS_PI) / 2);
}
inline
f32 anim_ease_in_out_sine(f32 t) {
return -(cosf(OMS_PI * t) - 1) / 2;
}
inline
f32 anim_ease_in_quad(f32 t) {
return t * t;
}
inline
f32 anim_ease_out_quad(f32 t) {
return 1 - (1 - t) * (1 - t);
}
inline
f32 anim_ease_in_out_quad(f32 t) {
return t < 0.5
? 2 * t * t
: 1 - pow(-2 * t + 2, 2) / 2;
}
inline
f32 anim_ease_in_cubic(f32 t) {
return t * t * t;
}
inline
f32 anim_ease_out_cubic(f32 t) {
return 1 - pow(1 - t, 3);
}
inline
f32 anim_ease_in_out_cubic(f32 t) {
return t < 0.5
? 4 * t * t * t
: 1 - pow(-2 * t + 2, 3) / 2;
}
inline
f32 anim_ease_in_quart(f32 t) {
return t * t * t * t;
}
inline
f32 anim_ease_out_quart(f32 t) {
return 1 - pow(1 - t, 4);
}
inline
f32 anim_ease_in_perlin(f32 t) {
return t * t * t * (t * (t * 6 - 15) + 10);
}
inline
f32 anim_ease_in_out_quart(f32 t) {
return t < 0.5
? 8 * t * t * t * t
: 1 - pow(-2 * t + 2, 4) / 2;
}
inline
f32 anim_ease_in_quint(f32 t) {
return t * t * t * t * t;
}
inline
f32 anim_ease_out_quint(f32 t) {
return 1 - pow(1 - t, 5);
}
inline
f32 anim_ease_in_out_quint(f32 t) {
return t < 0.5
? 16 * t * t * t * t * t
: 1 - pow(-2 * t + 2, 5) / 2;
}
inline
f32 anim_ease_in_expo(f32 t) {
return t == 0
? 0
: pow(2, 10 * t - 10);
}
inline
f32 anim_ease_out_expo(f32 t) {
return t == 1
? 1
: 1 - pow(2, -10 * t);
}
inline
f32 anim_ease_in_out_expo(f32 t) {
if (t == 0 || t == 1) {
return t;
}
return t < 0.5
? pow(2, 20 * t - 10) / 2
: (2 - pow(2, -20 * t + 10)) / 2;
}
inline
f32 anim_ease_in_circ(f32 t) {
return 1 - sqrtf(1 - pow(t, 2));
}
inline
f32 anim_ease_out_circ(f32 t) {
return sqrtf(1 - pow(t - 1, 2));
}
inline
f32 anim_ease_in_out_circ(f32 t) {
return t < 0.5
? (1 - sqrtf(1 - pow(2 * t, 2))) / 2
: (sqrtf(1 - pow(-2 * t + 2, 2)) + 1) / 2;
}
inline
f32 anim_ease_in_back(f32 t) {
const f32 c1 = 1.70158;
const f32 c3 = c1 + 1;
return c3 * t * t * t - c1 * t * t;
}
inline
f32 anim_ease_out_back(f32 t) {
const f32 c1 = 1.70158;
const f32 c3 = c1 + 1;
return 1 + c3 * pow(t - 1, 3) + c1 * pow(t - 1, 2);
}
inline
f32 anim_ease_in_out_back(f32 t) {
const f32 c1 = 1.70158;
const f32 c2 = c1 * 1.525;
return t < 0.5
? (pow(2 * t, 2) * ((c2 + 1) * 2 * t - c2)) / 2
: (pow(2 * t - 2, 2) * ((c2 + 1) * (t * 2 - 2) + c2) + 2) / 2;
}
inline
f32 anim_ease_in_elastic(f32 t) {
const f32 c4 = (2 * OMS_PI) / 3;
if (t == 0 || t == 1) {
return t;
}
return -pow(2, 10 * t - 10) * sinf((t * 10 - 10.75) * c4);
}
inline
f32 anim_ease_out_elastic(f32 t) {
const f32 c4 = (2 * OMS_PI) / 3;
if (t == 0.0 || t == 1.0) {
return t;
}
return pow(2, -10 * t) * sinf((t * 10 - 0.75) * c4) + 1;
}
inline
f32 anim_ease_in_out_elastic(f32 t) {
const f32 c5 = (2 * OMS_PI) / 4.5;
if (t == 0.0 || t == 1.0) {
return t;
} else if (t < 0.5) {
return -(pow(2, 20 * t - 10) * sinf((20 * t - 11.125) * c5)) / 2;
}
return (pow(2, -20 * t + 10) * sinf((20 * t - 11.125) * c5)) / 2 + 1;
}
// Forward declaration: in this file order anim_ease_out_bounce is defined
// BELOW its first caller, which is an implicit-declaration error in C99+.
f32 anim_ease_out_bounce(f32 t);

inline
f32 anim_ease_in_bounce(f32 t) {
    // Bounce ease-in: time-reversed mirror of the bounce-out curve.
    return 1 - anim_ease_out_bounce(1 - t);
}
inline
f32 anim_ease_out_bounce(f32 t) {
const f32 n1 = 7.5625;
const f32 d1 = 2.75;
if (t < 1 / d1) {
return n1 * t * t;
} else if (t < 2 / d1) {
return n1 * (t -= 1.5 / d1) * t + 0.75;
} else if (t < 2.5 / d1) {
return n1 * (t -= 2.25 / d1) * t + 0.9375;
}
return n1 * (t -= 2.625 / d1) * t + 0.984375;
}
inline
f32 anim_ease_in_out_bounce(f32 t) {
return t < 0.5
? (1 - anim_ease_out_bounce(1 - 2 * t)) / 2
: (1 + anim_ease_out_bounce(2 * t - 1)) / 2;
}
#endif

View File

@ -12,61 +12,49 @@
#include "../stdlib/Types.h"
#include "AssetType.h"
#define MAX_ASSET_NAME_LENGTH 32
enum AssetState : byte {
ASSET_STATE_IN_RAM = 1 << 0,
ASSET_STATE_IN_VRAM = 1 << 1,
ASSET_STATE_RAM_GC = 1 << 2,
ASSET_STATE_VRAM_GC = 1 << 3,
};
struct Asset {
// The id is the same as its location in memory/in the ams array
// This is is only an internal id and NOT the same as a db id (e.g. player id)
uint64 internal_id;
// Could be 0 if there is no official id
uint64 official_id;
uint32 official_id;
// @performance This is bad, this uses the same name as the hashmap
// We effectively store the asset name twice which shouldn't be the case
char name[MAX_ASSET_NAME_LENGTH];
AssetType type;
// Counts the references to this asset
// e.g. textures
int32 reference_count;
// @performance We would like to use a bool but windows only supports 32bit atomic values as smallest value
// Maybe if we would set the IS_LOADED_STATE in the enum as the highest bit we could use the state variable and check it with >=
int32 is_loaded;
// Describes how much ram/vram the asset uses
// E.g. vram_size = 0 but ram_size > 0 means that it never uses any gpu memory
uint32 ram_size;
uint32 vram_size;
uint64 last_access;
uint32 last_access;
// Usually 1 but in some cases an ams may hold entities of variable chunk length
// For textures for example a 128x128 is of size 1 but 256x256 is of size 4
uint32 size;
uint16 size;
// Variable used for thread safety
bool is_loaded;
// Which asset component is used
byte component_id;
// Describes if the memory is currently available in ram/vram
// E.g. an asset might be uploaded to the gpu and no longer held in ram (or the other way around)
bool is_ram;
bool is_vram;
// Describes if the asset can be removed/garbage collected IF necessary
// This however only happens if space is needed
bool can_garbage_collect_ram;
bool can_garbage_collect_vram;
Asset* next;
Asset* prev;
// An asset can reference up to N other entities
// This allows us to quickly update the other entities
// Example: A player pulls N mobs
// @bug This means there are hard limits on how many mobs can be pulled by a player
Asset* references[50];
uint64 free_references; // bits show which is free
byte state;
// Actual memory address and specific asset data
byte* self;
// Counts the references to this asset
// e.g. textures or entity schemas (NOT entities themselves)
uint16 reference_count;
// An asset can reference up to N other assets
// This allows us to quickly update the other assets
// Uses official_id
// @performance This could potentially be bad because many assets will have 0 or only 1-4 references
uint32 references[12];
};
#endif

View File

@ -25,19 +25,8 @@
#include "../localization/Language.h"
#include "../ui/UITheme.h"
#include "AssetManagementSystem.h"
#if __aarch64__
#include "../stdlib/sve/SVE_I32.h"
#else
#include "../stdlib/simd/SIMD_I32.h"
#endif
#if _WIN32
#include <windows.h>
#include "../platform/win32/FileUtils.cpp"
#elif __linux__
#include "../platform/win32/FileUtils.cpp"
#endif
#include "../system/FileUtils.cpp"
#include "../stdlib/Simd.h"
#define ASSET_ARCHIVE_VERSION 1
@ -78,7 +67,7 @@ struct AssetArchive {
// This is used to tell the asset archive in which AssetManagementSystem (AMS) which asset type is located.
// Remember, many AMS only contain one asset type (e.g. image, audio, ...)
int32 asset_type_map[ASSET_TYPE_SIZE];
byte asset_type_map[ASSET_TYPE_SIZE];
};
// Calculates how large the header memory has to be to hold all its information
@ -183,37 +172,47 @@ void asset_archive_load(AssetArchive* archive, const char* path, BufferMemory* b
// Maybe we could just accept a int value which we set atomically as a flag that the asset is complete?
// this way we can check much faster if we can work with this data from the caller?!
// The only problem is that we need to pass the pointer to this int in the thrd_queue since we queue the files to load there
Asset* asset_archive_asset_load(const AssetArchive* archive, int32 id, AssetManagementSystem* ams_array, RingMemory* ring)
Asset* asset_archive_asset_load(const AssetArchive* archive, int32 id, AssetManagementSystem* ams, RingMemory* ring)
{
// @todo add calculation from element->type to ams index
// @todo add calculation from element->type to ams index. Probably requires an app specific conversion function
// We have to mask 0x00FFFFFF since the highest bits define the archive id, not the element id
AssetArchiveElement* element = &archive->header.asset_element[id & 0x00FFFFFF];
AssetManagementSystem* ams = &ams_array[archive->asset_type_map[element->type]];
// @todo This is a little bit stupid, reconsider
char id_str[32];
_itoa(id, id_str, 16);
byte component_id = archive->asset_type_map[element->type];
AssetComponent* ac = &ams->asset_components[component_id];
Asset* asset;
// Create a string representation from the asset id
// We can't just use the asset id, since an int can have a \0 between high byte and low byte
// @question We maybe can switch the AMS to work with ints as keys.
// We would then have to also create an application specific enum for general assets,
// that are not stored in the asset archive (e.g. color palette, which is generated at runtime).
char id_str[9];
int_to_hex(id, id_str);
Asset* asset = thrd_ams_get_asset_wait(ams, id_str);
// @performance I think we could optimize the ams_reserver_asset in a way so we don't have to lock it the entire time
pthread_mutex_lock(&ams->mutex);
asset = ams_get_asset(ams, id_str);
if (asset) {
// Asset already loaded
pthread_mutex_unlock(&ams->mutex);
// Prevent garbage collection
asset->state &= ~ASSET_STATE_RAM_GC;
asset->state &= ~ASSET_STATE_VRAM_GC;
return asset;
}
// @bug Couldn't the asset become available from thrd_ams_get_asset_wait to here?
// This would mean we are overwriting it
// A solution could be a function called thrd_ams_get_reserve_wait() that reserves, if not available
// However, that function would have to lock the ams during that entire time
if (element->type == 0) {
asset = ams_reserve_asset(ams, id_str, ams_calculate_chunks(ams, element->uncompressed));
asset = thrd_ams_reserve_asset(ams, (byte) component_id, id_str, element->uncompressed);
asset->official_id = id;
FileBody file = {};
file.content = asset->self;
// @performance Consider to implement gzip here
// @performance Consider to implement general purpose fast compression algorithm
// We are directly reading into the correct destination
file_read(archive->fd, &file, element->start, element->length);
@ -230,8 +229,10 @@ Asset* asset_archive_asset_load(const AssetArchive* archive, int32 id, AssetMana
// This happens while the file system loads the data
// The important part is to reserve the uncompressed file size, not the compressed one
asset = ams_reserve_asset(ams, id_str, ams_calculate_chunks(ams, element->uncompressed));
asset->is_ram = true;
asset = thrd_ams_reserve_asset(ams, (byte) component_id, id_str, element->uncompressed);
asset->official_id = id;
asset->state |= ASSET_STATE_IN_RAM;
file_async_wait(archive->fd_async, &file.ov, true);
switch (element->type) {
@ -288,10 +289,13 @@ Asset* asset_archive_asset_load(const AssetArchive* archive, int32 id, AssetMana
}
}
}
pthread_mutex_unlock(&ams->mutex);
// Even though dependencies are still being loaded
// the main program should still be able to do some work if possible
thrd_ams_set_loaded(asset);
// @performance maybe do in worker threads? This just feels very slow
// @question dependencies might be stored in different archives?
// @bug dependencies might be stored in different archives?
for (uint32 i = 0; i < element->dependency_count; ++i) {
asset_archive_asset_load(archive, id, ams, ring);
}

View File

@ -14,191 +14,100 @@
#include "Asset.h"
#include "../memory/ChunkMemory.h"
#include "../utils/TestUtils.h"
#include "../utils/BitUtils.h"
#include "../stdlib/HashMap.h"
#include "../log/DebugMemory.h"
#include "../thread/Atomic.h"
// The major asset types should have their own asset component system
// All other entities are grouped together in one asset component system
// @question Asset component systems could be created per region -> easy to simulate a specific region
// @bug This means players might not be able to transition from one area to another?!
// @performance There is a huge performance flaw. We CANNOT have an asset only in vram because it always also allocates the ram (asset_data_memory)
struct AssetManagementSystem {
// @question is this even necessary or could we integrate this directly into the system here?
HashMap hash_map;
struct AssetComponent {
ChunkMemory asset_memory;
uint64 ram_size;
uint64 vram_size;
uint64 asset_count;
int32 overhead;
bool has_changed;
// The indices of asset_memory and asset_data_memory are always linked
// @question Wouldn't it make much more sense to have a general AMS for this data
// In that case we would only need one AMS which holds the Asset information. All others would only need the data_memory
// We could probably dramatically simplify the AMS that holds the actual data. We might only need the ChunkMemory?
// @question Even further, why would we want to split stats and DATA at all? we are talking about assets which most likely don't fit into a single L1 cache line
// BUT they may fit in L2 or L3 and therefore require less pointer chasing
// Sure collecting data is faster with split memory (ram/vram usage)
// General asset memory
// Fixed chunk size of sizeof(Asset)
ChunkMemory asset_memory;
// Actual asset data
// Chunk size defined during initialization
ChunkMemory asset_data_memory;
// @performance Do we really need the linked list, the ChunkMemory should allow us to do some smart stuff
Asset* first;
Asset* last;
// @question do we want to create an extra threaded version? Or a combined one, like we have right now.
// @question Do we want to add a mutex to assets. This way we don't have to lock the entire ams.
pthread_mutex_t mutex;
};
void ams_create(AssetManagementSystem* ams, BufferMemory* buf, int32 chunk_size, int32 count, int32 overhead = 0)
struct AssetManagementSystem {
HashMap hash_map;
int32 asset_component_count;
AssetComponent* asset_components;
};
inline
void ams_create(AssetManagementSystem* ams, BufferMemory* buf, int32 asset_component_count, int32 count)
{
// setup hash_map
hashmap_create(&ams->hash_map, count, sizeof(HashEntryInt64), buf);
ams->overhead = overhead;
// setup asset_memory
chunk_init(&ams->asset_memory, buf, count, sizeof(Asset), 64);
// setup asset_data_memory
chunk_init(&ams->asset_data_memory, buf, count, chunk_size, 64);
ams->first = NULL;
ams->last = NULL;
pthread_mutex_init(&ams->mutex, NULL);
hashmap_create(&ams->hash_map, count, sizeof(HashEntry) + sizeof(Asset), buf);
ams->asset_component_count = asset_component_count;
ams->asset_components = (AssetComponent *) buffer_get_memory(buf, asset_component_count * sizeof(AssetComponent), 64, true);
}
// WARNING: buf size see ams_get_buffer_size
void ams_create(AssetManagementSystem* ams, byte* buf, int32 chunk_size, int32 count, int32 overhead = 0)
inline
void ams_component_create(AssetComponent* ac, BufferMemory* buf, int32 chunk_size, int32 count)
{
ASSERT_SIMPLE(chunk_size);
// setup hash_map
hashmap_create(&ams->hash_map, count, sizeof(HashEntryInt64), buf);
ams->overhead = overhead;
// setup asset_memory
ams->asset_memory.count = count;
ams->asset_memory.chunk_size = sizeof(Asset);
ams->asset_memory.last_pos = 0;
ams->asset_memory.alignment = 64;
ams->asset_memory.memory = buf;
ams->asset_memory.free = (uint64 *) (ams->asset_memory.memory + ams->asset_memory.chunk_size * count);
// setup asset_data_memory
ams->asset_data_memory.count = count;
ams->asset_data_memory.chunk_size = chunk_size;
ams->asset_data_memory.last_pos = 0;
ams->asset_data_memory.alignment = 64;
ams->asset_data_memory.memory = (byte *) (ams->asset_memory.free + CEIL_DIV(count, 64));
ams->asset_data_memory.free = (uint64 *) (ams->asset_data_memory.memory + ams->asset_data_memory.chunk_size * count);
ams->first = NULL;
ams->last = NULL;
pthread_mutex_init(&ams->mutex, NULL);
chunk_init(&ac->asset_memory, buf, count, chunk_size, 64);
pthread_mutex_init(&ac->mutex, NULL);
}
inline
void ams_component_create(AssetComponent* ac, byte* buf, int32 chunk_size, int32 count)
{
ASSERT_SIMPLE(chunk_size);
ac->asset_memory.count = count;
ac->asset_memory.chunk_size = chunk_size;
ac->asset_memory.last_pos = 0;
ac->asset_memory.alignment = 64;
ac->asset_memory.memory = buf;
ac->asset_memory.free = (uint64 *) (ac->asset_memory.memory + ac->asset_memory.chunk_size * count);
pthread_mutex_init(&ac->mutex, NULL);
}
inline
void ams_component_free(AssetComponent* ac)
{
pthread_mutex_destroy(&ac->mutex);
}
inline
void ams_free(AssetManagementSystem* ams)
{
pthread_mutex_destroy(&ams->mutex);
}
inline
int32 ams_calculate_chunks(const AssetManagementSystem* ams, int32 byte_size)
{
return (int32) CEIL_DIV(byte_size + ams->overhead, ams->asset_data_memory.chunk_size);
}
inline
int64 ams_get_buffer_size(int32 count, int32 chunk_size)
{
return hashmap_size(count, sizeof(HashEntryInt64)) // hash map
+ sizeof(Asset) * count + CEIL_DIV(count, 64) * sizeof(uint64) // asset_memory
+ chunk_size * count + CEIL_DIV(count, 64) * sizeof(uint64); // asset_data_memory
}
inline
void ams_update_stats(AssetManagementSystem* ams)
{
ams->vram_size = 0;
ams->ram_size = 0;
ams->asset_count = 0;
Asset* temp_asset = ams->first;
while (temp_asset) {
ams->vram_size += temp_asset->vram_size;
ams->ram_size += temp_asset->ram_size;
++ams->asset_count;
temp_asset = temp_asset->next;
for (int32 i = 0; i < ams->asset_component_count; ++i) {
ams_component_free(&ams->asset_components[i]);
}
ams->has_changed = false;
}
inline
uint64 ams_get_asset_count(AssetManagementSystem* ams)
uint16 ams_calculate_chunks(const AssetComponent* ac, int32 byte_size, int32 overhead)
{
if (ams->has_changed) {
ams_update_stats(ams);
}
return ams->asset_count;
return (uint16) CEIL_DIV(byte_size + overhead, ac->asset_memory.chunk_size);
}
inline
uint64 ams_get_vram_usage(AssetManagementSystem* ams)
void thrd_ams_set_loaded(Asset* asset)
{
if (ams->has_changed) {
ams_update_stats(ams);
}
return ams->vram_size;
atomic_set_release(&asset->is_loaded, 1);
}
inline
uint64 ams_get_ram_usage(AssetManagementSystem* ams)
bool thrd_ams_is_loaded(Asset* asset)
{
if (ams->has_changed) {
ams_update_stats(ams);
}
return ams->ram_size;
}
void ams_free_asset(AssetManagementSystem* ams, Asset* asset)
{
asset->prev->next = asset->next;
asset->next->prev = asset->prev;
hashmap_delete_entry(&ams->hash_map, asset->name);
for (uint32 i = 0; i < asset->size; ++i) {
chunk_free_element(&ams->asset_memory, asset->internal_id + i);
chunk_free_element(&ams->asset_data_memory, asset->internal_id + i);
}
ams->has_changed = true;
return asset && atomic_get_acquire(&asset->is_loaded) > 0;
}
inline
Asset* ams_get_asset(AssetManagementSystem* ams, uint64 element)
bool thrd_ams_is_in_vram(Asset* asset)
{
return (Asset *) chunk_get_element(&ams->asset_memory, element, false);
return asset && atomic_get_acquire(&asset->is_loaded)
&& (asset->state & ASSET_STATE_IN_VRAM);
}
inline
@ -206,29 +115,6 @@ Asset* ams_get_asset(AssetManagementSystem* ams, const char* key)
{
HashEntry* entry = hashmap_get_entry(&ams->hash_map, key);
DEBUG_MEMORY_READ(
(uint64) (entry ? (Asset *) entry->value : 0),
entry ? sizeof(Asset) : 0
);
DEBUG_MEMORY_READ(
(uint64) (entry ? ((Asset *) entry->value)->self : 0),
entry ? ((Asset *) entry->value)->ram_size : 0
);
return entry ? (Asset *) entry->value : NULL;
}
inline
Asset* ams_get_asset(AssetManagementSystem* ams, const char* key, uint64 hash)
{
HashEntry* entry = hashmap_get_entry(&ams->hash_map, key, hash);
DEBUG_MEMORY_READ(
(uint64) (entry ? (Asset *) entry->value : 0),
entry ? sizeof(Asset) : 0
);
DEBUG_MEMORY_READ(
(uint64) (entry ? ((Asset *) entry->value)->self : 0),
entry ? ((Asset *) entry->value)->ram_size : 0
@ -238,159 +124,417 @@ Asset* ams_get_asset(AssetManagementSystem* ams, const char* key, uint64 hash)
}
// @performance We could probably avoid locking by adding a atomic flag to indicate if the value is valid
Asset* thrd_ams_get_asset(AssetManagementSystem* ams, uint64 element) {
pthread_mutex_lock(&ams->mutex);
Asset* asset = ams_get_asset(ams, element);
pthread_mutex_unlock(&ams->mutex);
return asset;
}
inline
Asset* thrd_ams_get_asset(AssetManagementSystem* ams, const char* key) {
pthread_mutex_lock(&ams->mutex);
Asset* asset = ams_get_asset(ams, key);
pthread_mutex_unlock(&ams->mutex);
HashEntry* entry = hashmap_get_entry(&ams->hash_map, key);
if (!entry || atomic_get_acquire(&((Asset *) entry->value)->is_loaded) <= 0) {
return NULL;
}
DEBUG_MEMORY_READ(
(uint64) (entry ? ((Asset *) entry->value)->self : 0),
entry ? ((Asset *) entry->value)->ram_size : 0
);
return (Asset *) entry->value;
}
inline
Asset* thrd_ams_get_asset_wait(AssetManagementSystem* ams, const char* key) {
HashEntry* entry = hashmap_get_entry(&ams->hash_map, key);
if (!entry) {
return NULL;
}
int32 state = 0;
while (!(state = atomic_get_acquire(&((Asset *) entry->value)->is_loaded))) {}
if (state < 0) {
// Marked for removal
return NULL;
}
DEBUG_MEMORY_READ(
(uint64) (entry ? ((Asset *) entry->value)->self : 0),
entry ? ((Asset *) entry->value)->ram_size : 0
);
return (Asset *) entry->value;
}
inline
Asset* thrd_ams_get_asset_wait(AssetManagementSystem* ams, const char* key, uint64 hash) {
HashEntry* entry = hashmap_get_entry(&ams->hash_map, key, hash);
if (!entry) {
return NULL;
}
int32 state = 0;
while (!(state = atomic_get_acquire(&((Asset *) entry->value)->is_loaded))) {}
if (state < 0) {
// Marked for removal
return NULL;
}
DEBUG_MEMORY_READ(
(uint64) (entry ? ((Asset *) entry->value)->self : 0),
entry ? ((Asset *) entry->value)->ram_size : 0
);
return (Asset *) entry->value;
}
inline
Asset* thrd_ams_get_reserve_asset_wait(AssetManagementSystem* ams, byte type, const char* name, uint32 size, uint32 overhead = 0)
{
// @bug Isn't hashmap_get_reserve unsafe for threading?
HashEntry* entry = hashmap_get_reserve(&ams->hash_map, name);
Asset* asset = (Asset *) entry->value;
if (asset->self) {
int32 state = 0;
while (!(state = atomic_get_acquire(&((Asset *) entry->value)->is_loaded))) {}
if (state > 0) {
return asset;
}
}
AssetComponent* ac = &ams->asset_components[type];
uint16 elements = ams_calculate_chunks(ac, size, overhead);
int32 free_data = chunk_reserve(&ac->asset_memory, elements);
byte* data = chunk_get_element(&ac->asset_memory, free_data, true);
asset->component_id = type;
asset->self = data;
asset->size = elements; // Crucial for freeing
asset->ram_size = ac->asset_memory.chunk_size * elements;
ac->vram_size += asset->vram_size;
ac->ram_size += asset->ram_size;
++ac->asset_count;
DEBUG_MEMORY_RESERVE((uint64) asset, asset->ram_size, 180);
return asset;
}
Asset* thrd_ams_get_asset(AssetManagementSystem* ams, const char* key, uint64 hash) {
pthread_mutex_lock(&ams->mutex);
Asset* asset = ams_get_asset(ams, key, hash);
pthread_mutex_unlock(&ams->mutex);
inline
void ams_remove_asset(AssetManagementSystem* ams, AssetComponent* ac, Asset* asset, const char* name)
{
// @todo remove from vram
return asset;
asset->is_loaded = 0;
ac->vram_size -= asset->vram_size;
ac->ram_size -= asset->ram_size;
--ac->asset_count;
hashmap_remove(&ams->hash_map, name);
chunk_free_elements(
&ac->asset_memory,
chunk_id_from_memory(&ac->asset_memory, asset->self),
asset->size
);
}
inline
void ams_remove_asset_ram(AssetManagementSystem* ams, AssetComponent* ac, Asset* asset)
{
ac->ram_size -= asset->ram_size;
chunk_free_elements(
&ac->asset_memory,
chunk_id_from_memory(&ac->asset_memory, asset->self),
asset->size
);
}
// @todo It would be nice if we could remove the asset by passing it as a parameter instead of the name
// The problem is there is no correlation between asset data (e.g. internal_id) and global hashmap (e.g. element_id)
// This means we would have to iterate all hashmap entries and remove it this way, which is very slow
inline
void ams_remove_asset(AssetManagementSystem* ams, const char* name)
{
// @todo remove from vram
Asset* asset = ams_get_asset(ams, name);
AssetComponent* ac = &ams->asset_components[asset->component_id];
asset->is_loaded = 0;
ac->vram_size -= asset->vram_size;
ac->ram_size -= asset->ram_size;
--ac->asset_count;
hashmap_remove(&ams->hash_map, name);
chunk_free_elements(
&ac->asset_memory,
chunk_id_from_memory(&ac->asset_memory, asset->self),
asset->size
);
}
inline
void ams_remove_asset(AssetManagementSystem* ams, Asset* asset, const char* name)
{
// @todo remove from vram
AssetComponent* ac = &ams->asset_components[asset->component_id];
asset->is_loaded = 0;
ac->vram_size -= asset->vram_size;
ac->ram_size -= asset->ram_size;
--ac->asset_count;
hashmap_remove(&ams->hash_map, name);
chunk_free_elements(
&ac->asset_memory,
chunk_id_from_memory(&ac->asset_memory, asset->self),
asset->size
);
}
inline
void ams_remove_asset_ram(AssetManagementSystem* ams, Asset* asset)
{
AssetComponent* ac = &ams->asset_components[asset->component_id];
ac->ram_size -= asset->ram_size;
chunk_free_elements(
&ac->asset_memory,
chunk_id_from_memory(&ac->asset_memory, asset->self),
asset->size
);
}
inline
void thrd_ams_remove_asset(AssetManagementSystem* ams, AssetComponent* ac, Asset* asset, const char* name)
{
// @todo remove from vram
asset->is_loaded = 0;
ac->vram_size -= asset->vram_size;
ac->ram_size -= asset->ram_size;
--ac->asset_count;
atomic_set_release(&asset->is_loaded, 0);
hashmap_remove(&ams->hash_map, name);
chunk_free_elements(
&ac->asset_memory,
chunk_id_from_memory(&ac->asset_memory, asset->self),
asset->size
);
}
void thrd_ams_remove_asset(AssetManagementSystem* ams, const char* name)
{
HashEntry* entry = hashmap_get_entry(&ams->hash_map, name);
Asset* asset = (Asset *) entry->value;
atomic_set_release(&asset->is_loaded, -1);
hashmap_remove(&ams->hash_map, name);
AssetComponent* ac = &ams->asset_components[asset->component_id];
chunk_free_elements(
&ac->asset_memory,
chunk_id_from_memory(&ac->asset_memory, asset->self),
asset->size
);
ac->vram_size -= asset->vram_size;
ac->ram_size -= asset->ram_size;
--ac->asset_count;
}
void thrd_ams_remove_asset(AssetManagementSystem* ams, const char* name, Asset* asset)
{
atomic_set_release(&asset->is_loaded, -1);
hashmap_remove(&ams->hash_map, name);
AssetComponent* ac = &ams->asset_components[asset->component_id];
chunk_free_elements(
&ac->asset_memory,
chunk_id_from_memory(&ac->asset_memory, asset->self),
asset->size
);
ac->vram_size -= asset->vram_size;
ac->ram_size -= asset->ram_size;
--ac->asset_count;
}
// @todo implement defragment command to optimize memory layout since the memory layout will become fragmented over time
// @performance This function is VERY important, check if we can optimize it
// We could probably optimize the threaded version by adding a atomic_set_release(asset->is_loaded, true);
Asset* ams_reserve_asset(AssetManagementSystem* ams, const char* name, uint32 elements = 1)
Asset* ams_reserve_asset(AssetManagementSystem* ams, byte type, const char* name, uint32 size, uint32 overhead = 0)
{
int64 free_asset = chunk_reserve(&ams->asset_memory, elements, true);
if (free_asset < 0) {
ASSERT_SIMPLE(free_asset >= 0);
ASSERT_SIMPLE(strlen(name) < HASH_MAP_MAX_KEY_LENGTH - 1);
AssetComponent* ac = &ams->asset_components[type];
uint16 elements = ams_calculate_chunks(ac, size, overhead);
int32 free_data = chunk_reserve(&ac->asset_memory, elements);
if (free_data < 0) {
ASSERT_SIMPLE(free_data >= 0);
return NULL;
}
size_t name_length = strlen(name);
ASSERT_SIMPLE(name_length < MAX_ASSET_NAME_LENGTH - 1);
byte* asset_data = chunk_get_element(&ac->asset_memory, free_data, true);
Asset* asset = (Asset *) hashmap_reserve(&ams->hash_map, name)->value;
Asset* asset = (Asset *) chunk_get_element(&ams->asset_memory, free_asset);
asset->internal_id = free_asset;
strncpy(asset->name, name, name_length);
asset->name[name_length] = '\0';
hashmap_insert(&ams->hash_map, name, (uintptr_t) asset);
chunk_reserve_index(&ams->asset_data_memory, free_asset, elements, true);
asset->self = chunk_get_element(&ams->asset_data_memory, free_asset);
asset->component_id = type;
asset->self = asset_data;
asset->size = elements; // Crucial for freeing
asset->ram_size = (ams->asset_memory.chunk_size + ams->asset_data_memory.chunk_size) * elements;
asset->ram_size = ac->asset_memory.chunk_size * elements;
DEBUG_MEMORY_RESERVE((uint64) asset->self, elements * ams->asset_data_memory.chunk_size, 180);
ac->vram_size += asset->vram_size;
ac->ram_size += asset->ram_size;
++ac->asset_count;
// @performance Do we really want a double linked list. Are we really using this feature or is the free_index enough?
if (free_asset > 0 && free_asset < ams->asset_memory.count - 1) {
Asset* next = ams->first;
while (next->next != NULL
&& next->next->internal_id < asset->internal_id
&& next->internal_id < ams->asset_memory.count
) {
next = next->next;
}
asset->prev = next;
asset->next = asset->prev->next;
if (asset->next) {
asset->next->prev = asset;
} else {
ams->last = asset;
}
asset->prev->next = asset;
} else if (free_asset == 0) {
asset->next = ams->first;
if (ams->first) {
ams->first->prev = asset;
}
ams->first = asset;
} else if (free_asset == ams->asset_memory.count - 1) {
asset->prev = ams->last;
// WARNING: no if here because we assume there is no ECS with just a size of 1
ams->last->next = asset;
ams->last = asset;
}
ams->has_changed = true;
DEBUG_MEMORY_RESERVE((uint64) asset, asset->ram_size, 180);
return asset;
}
void ams_garbage_collect(AssetManagementSystem* ams, uint64 time, uint64 dt)
{
Asset* asset = ams->first;
inline
Asset* thrd_ams_reserve_asset(AssetManagementSystem* ams, byte type, const char* name, uint32 size, uint32 overhead = 0) {
AssetComponent* ac = &ams->asset_components[type];
uint16 elements = ams_calculate_chunks(ac, size, overhead);
while (asset) {
// @performance We cannot just remove ram and keep vram. This is a huge flaw
if (asset->can_garbage_collect_ram && asset->can_garbage_collect_vram && time - asset->last_access <= dt) {
ams_free_asset(ams, asset);
pthread_mutex_lock(&ams->asset_components[type].mutex);
int32 free_data = chunk_reserve(&ac->asset_memory, elements);
if (free_data < 0) {
pthread_mutex_unlock(&ams->asset_components[type].mutex);
ASSERT_SIMPLE(free_data >= 0);
return NULL;
}
pthread_mutex_unlock(&ams->asset_components[type].mutex);
byte* asset_data = chunk_get_element(&ac->asset_memory, free_data, true);
Asset asset = {};
asset.component_id = type;
asset.self = asset_data;
asset.size = elements; // Crucial for freeing
asset.ram_size = ac->asset_memory.chunk_size * elements;
ac->vram_size += asset.vram_size;
ac->ram_size += asset.ram_size;
++ac->asset_count;
DEBUG_MEMORY_RESERVE((uint64) asset_data, asset.ram_size, 180);
ASSERT_SIMPLE(strlen(name) < HASH_MAP_MAX_KEY_LENGTH - 1);
return (Asset *) hashmap_insert(&ams->hash_map, name, (byte *) &asset)->value;
}
// @todo Find a way to handle manual ram/vram changes
// Either implement a ams_update(AssetManagementSystem* ams, Asset* asset) function
// Or set .has_changed = true (even if garbage collection gets set) and call this func somewhere (maybe thread?)
// Perform general ams update (stats and garbage collection)
// We perform multiple things in one iteration to reduce the iteration costs
// @todo don't use uint64 for time, use uint32 and use relative time to start of program
void thrd_ams_update(AssetManagementSystem* ams, uint64 time, uint64 dt)
{
for (int32 i = 0; i < ams->asset_component_count; ++i) {
ams->asset_components[i].vram_size = 0;
ams->asset_components[i].ram_size = 0;
ams->asset_components[i].asset_count = 0;
}
// Iterate the hash map to find all assets
int32 chunk_id = 0;
chunk_iterate_start(&ams->hash_map.buf, chunk_id)
HashEntry* entry = (HashEntry *) chunk_get_element(&ams->hash_map.buf, chunk_id);
Asset* asset = (Asset *) entry->value;
if (!thrd_ams_is_loaded(asset)) {
continue;
}
asset = asset->next;
}
}
ams->asset_components[asset->component_id].vram_size += asset->vram_size;
ams->asset_components[asset->component_id].ram_size += asset->ram_size;
++ams->asset_components[asset->component_id].asset_count;
void ams_garbage_collect(AssetManagementSystem* ams)
{
Asset* asset = ams->first;
while (asset) {
// @performance We cannot just remove ram and keep vram. This is a huge flaw
if (asset->can_garbage_collect_ram && asset->can_garbage_collect_vram) {
ams_free_asset(ams, asset);
if ((asset->state & ASSET_STATE_RAM_GC) || (asset->state & ASSET_STATE_VRAM_GC)) {
if ((asset->state & ASSET_STATE_RAM_GC)
&& (asset->state & ASSET_STATE_VRAM_GC)
&& time - asset->last_access <= dt
) {
// @performance Ideally we would like to pass the entry to delete
// The problem is the hashmap_delete function can't work with entries directly since it is not a doubly linked list
thrd_ams_remove_asset(ams, &ams->asset_components[asset->component_id], asset, entry->key);
} else if ((asset->state & ASSET_STATE_RAM_GC)
&& time - asset->last_access <= dt
) {
ams_remove_asset_ram(ams, &ams->asset_components[asset->component_id], asset);
} else if ((asset->state & ASSET_STATE_VRAM_GC)
&& time - asset->last_access <= dt
) {
ams->asset_components[asset->component_id].vram_size -= asset->vram_size;
}
}
chunk_iterate_end;
}
asset = asset->next;
Asset* ams_insert_asset(AssetManagementSystem* ams, Asset* asset_temp, const char* name)
{
AssetComponent* ac = &ams->asset_components[asset_temp->component_id];
int32 free_data = chunk_reserve(&ac->asset_memory, asset_temp->size);
if (free_data < 0) {
ASSERT_SIMPLE(free_data >= 0);
return NULL;
}
}
void thrd_ams_garbage_collect(AssetManagementSystem* ams, uint64 time, uint64 dt)
{
pthread_mutex_lock(&ams->mutex);
ams_garbage_collect(ams, time, dt);
pthread_mutex_unlock(&ams->mutex);
}
byte* asset_data = chunk_get_element(&ac->asset_memory, free_data);
void thrd_ams_garbage_collect(AssetManagementSystem* ams)
{
pthread_mutex_lock(&ams->mutex);
ams_garbage_collect(ams);
pthread_mutex_unlock(&ams->mutex);
}
asset_temp->self = asset_data;
asset_temp->size = asset_temp->size; // Crucial for freeing
asset_temp->ram_size = ac->asset_memory.chunk_size * asset_temp->size;
Asset* thrd_ams_reserve_asset(AssetManagementSystem* ams, const char* name, uint32 elements = 1) {
pthread_mutex_lock(&ams->mutex);
Asset* asset = ams_reserve_asset(ams, name, elements);
pthread_mutex_unlock(&ams->mutex);
ac->vram_size += asset_temp->vram_size;
ac->ram_size += asset_temp->ram_size;
++ac->asset_count;
Asset* asset = (Asset *) hashmap_insert(&ams->hash_map, name, (byte *) asset_temp)->value;
DEBUG_MEMORY_RESERVE((uint64) asset->self, asset->ram_size, 180);
return asset;
}
Asset* thrd_ams_reserve_asset_start(AssetManagementSystem* ams, const char* name, uint32 elements = 1) {
pthread_mutex_lock(&ams->mutex);
inline
Asset* thrd_ams_insert_asset(AssetManagementSystem* ams, Asset* asset_temp, const char* name)
{
AssetComponent* ac = &ams->asset_components[asset_temp->component_id];
return ams_reserve_asset(ams, name, elements);
}
pthread_mutex_lock(&ams->asset_components[asset_temp->component_id].mutex);
int32 free_data = chunk_reserve(&ac->asset_memory, asset_temp->size);
if (free_data < 0) {
pthread_mutex_unlock(&ams->asset_components[asset_temp->component_id].mutex);
ASSERT_SIMPLE(free_data >= 0);
void thrd_ams_reserve_asset_end(AssetManagementSystem* ams) {
pthread_mutex_unlock(&ams->mutex);
return NULL;
}
pthread_mutex_unlock(&ams->asset_components[asset_temp->component_id].mutex);
byte* asset_data = chunk_get_element(&ac->asset_memory, free_data);
memcpy(asset_data, asset_temp->self, sizeof(Asset));
asset_temp->self = asset_data;
asset_temp->ram_size = ac->asset_memory.chunk_size * asset_temp->size;
ac->vram_size += asset_temp->vram_size;
ac->ram_size += asset_temp->ram_size;
++ac->asset_count;
Asset* asset = (Asset *) hashmap_insert(&ams->hash_map, name, (byte *) asset_temp)->value;
DEBUG_MEMORY_RESERVE((uint64) asset->self, asset->ram_size, 180);
atomic_set_release(&asset->is_loaded, 1);
return asset;
}
#endif

View File

@ -9,7 +9,7 @@
#ifndef TOS_ASSET_TYPE_H
#define TOS_ASSET_TYPE_H
enum AssetType {
enum AssetType : byte {
ASSET_TYPE_GENERAL,
ASSET_TYPE_OBJ,
ASSET_TYPE_AUDIO,

View File

@ -11,12 +11,7 @@
#include "../utils/StringUtils.h"
#include "../memory/RingMemory.h"
#if _WIN32
#include "../platform/win32/FileUtils.cpp"
#else
#include "../platform/linux/FileUtils.cpp"
#endif
#include "../system/FileUtils.cpp"
#include "Audio.h"
#include "AudioSetting.h"

View File

@ -16,12 +16,7 @@
#include "../utils/MathUtils.h"
#include "../memory/ChunkMemory.h"
#include "../math/matrix/MatrixFloat32.h"
#if _WIN32
#include "../platform/win32/threading/Atomic.h"
#elif __linux__
#include "../platform/linux/threading/Atomic.h"
#endif
#include "../thread/Atomic.h"
#if DIRECT_SOUND
#include "../platform/win32/audio/DirectSound.h"
@ -50,10 +45,11 @@ enum AudioEffect {
AUDIO_EFFECT_EASE_IN = 1 << 14,
AUDIO_EFFECT_EASE_OUT = 1 << 15,
AUDIO_EFFECT_SPEED = 1 << 16,
AUDIO_EFFECT_REPEAT = 1 << 17,
};
struct AudioInstance {
int64 id;
int32 id;
AudioLocationSetting origin;
uint32 audio_size;
@ -62,7 +58,6 @@ struct AudioInstance {
uint64 effect;
uint32 sample_index;
byte channels;
bool repeat;
// @todo How to implement audio that is only supposed to be played after a certain other sound file is finished
// e.g. queueing soundtracks/ambient noise
@ -130,28 +125,37 @@ bool audio_mixer_is_active(AudioMixer* mixer) {
return (mixer->state_old = mixer_state) == AUDIO_MIXER_STATE_ACTIVE;
}
// @todo expand AudioLocationSetting so that it also includes audio effects, repeat etc.
void audio_mixer_add(AudioMixer* mixer, int64 id, Audio* audio, AudioLocationSetting* origin)
void audio_mixer_play(AudioMixer* mixer, int32 id, Audio* audio, AudioInstance* settings = NULL)
{
int64 index = chunk_reserve(&mixer->audio_instances, 1);
int32 index = chunk_reserve(&mixer->audio_instances, 1);
if (index < 0) {
return;
}
// @question Do I really want to use audio instance? wouldn't Audio* be sufficient?
// Well AudioInstance is a little bit smaller but is this really worth it, probably yes?!
AudioInstance* instance = (AudioInstance *) chunk_get_element(&mixer->audio_instances, index);
instance->id = id;
instance->audio_size = audio->size;
instance->audio_data = audio->data;
instance->channels = audio->channels;
if (origin) {
memcpy(&instance->origin, origin, sizeof(AudioLocationSetting));
if (settings) {
memcpy(&instance->origin, &settings->origin, sizeof(AudioLocationSetting));
instance->effect = settings->effect;
}
}
void audio_mixer_add_unique(AudioMixer* mixer, int64 id, Audio* audio, AudioLocationSetting* origin)
void audio_mixer_play(AudioMixer* mixer, AudioInstance* settings)
{
int32 index = chunk_reserve(&mixer->audio_instances, 1);
if (index < 0) {
return;
}
AudioInstance* instance = (AudioInstance *) chunk_get_element(&mixer->audio_instances, index);
memcpy(instance, settings, sizeof(AudioInstance));
}
void audio_mixer_play_unique(AudioMixer* mixer, int32 id, Audio* audio, AudioInstance* settings = NULL)
{
for (uint32 i = 0; i < mixer->audio_instances.count; ++i) {
// @performance We are not really utilizing chunk memory.
@ -163,16 +167,31 @@ void audio_mixer_add_unique(AudioMixer* mixer, int64 id, Audio* audio, AudioLoca
}
}
audio_mixer_add(mixer, id, audio, origin);
audio_mixer_play(mixer, id, audio, settings);
}
void audio_mixer_remove(AudioMixer* mixer, int64 id)
void audio_mixer_play_unique(AudioMixer* mixer, AudioInstance* settings)
{
for (uint32 i = 0; i < mixer->audio_instances.count; ++i) {
// @performance We are not really utilizing chunk memory.
// Maybe a simple array would be better
// Or we need to use more chunk functions / maybe even create a chunk_iterate() function?
AudioInstance* instance = (AudioInstance *) chunk_get_element(&mixer->audio_instances, i);
if (instance->id == settings->id) {
return;
}
}
audio_mixer_play(mixer, settings);
}
void audio_mixer_remove(AudioMixer* mixer, int32 id)
{
for (uint32 i = 0; i < mixer->audio_instances.count; ++i) {
AudioInstance* instance = (AudioInstance *) chunk_get_element(&mixer->audio_instances, i);
if (instance->id == id) {
instance->id = 0;
chunk_free_element(&mixer->audio_instances, i);
chunk_free_elements(&mixer->audio_instances, i);
// No return, since we want to remove all instances
}
@ -475,7 +494,7 @@ void audio_mixer_mix(AudioMixer* mixer, uint32 size) {
// We make it stereo
for (int32 j = 0; j < limit; ++j) {
if (sound_sample_index >= sound_sample_count) {
if (!sound->repeat) {
if (!(sound->effect & AUDIO_EFFECT_REPEAT)) {
limit = j;
break;
}
@ -494,7 +513,7 @@ void audio_mixer_mix(AudioMixer* mixer, uint32 size) {
}
// Apply effects based on sound's effect type
if (sound->effect) {
if (sound->effect && sound->effect != AUDIO_EFFECT_REPEAT) {
int32 sample_adjustment = mixer_effects_mono(mixer, sound->effect, sound_sample_index);
sound_sample_index += sample_adjustment;
limit += sample_adjustment;
@ -502,7 +521,7 @@ void audio_mixer_mix(AudioMixer* mixer, uint32 size) {
} else {
for (int32 j = 0; j < limit; ++j) {
if (sound_sample_index >= sound_sample_count) {
if (!sound->repeat) {
if (!(sound->effect & AUDIO_EFFECT_REPEAT)) {
limit = j;
break;
}
@ -520,7 +539,7 @@ void audio_mixer_mix(AudioMixer* mixer, uint32 size) {
}
// Apply effects based on sound's effect type
if (sound->effect) {
if (sound->effect && sound->effect != AUDIO_EFFECT_REPEAT) {
int32 sample_adjustment = mixer_effects_stereo() / 2;;
sound_sample_index += sample_adjustment;
limit += sample_adjustment;

View File

@ -13,7 +13,7 @@
#include "../stdlib/Types.h"
#include "../utils/EndianUtils.h"
#include "../audio/Audio.cpp"
#include "../stdlib/simd/SIMD_I32.h"
#include "../stdlib/Simd.h"
#define QOA_SLICE_LEN 20
#define QOA_SLICES_PER_FRAME 256

491
command/AppCmdBuffer.cpp Normal file
View File

@ -0,0 +1,491 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_APP_COMMAND_BUFFER_C
#define TOS_APP_COMMAND_BUFFER_C
/**
* The AppCmdBuffer by itself doesn't do much, it simply takes in commands and executes them.
* The actual execution depends on the implementation of the underlying systems like:
* ECS, AMS, AudioMixer, ...
* The AppCmdBuffer simplifies the interaction with those systems since the caller has to care less
* about the information flow, function structure etc.
* On the other hand the caller loses some control:
* No control over the execution order, unless additional overhead like priority gets introduced
* No control over what type of command are executed, unless additional overhead like command type checks get introduced
* ...
* In many cases you don't need this type of control, but when you need it you should probably look at how
* this AppCmdBuffer interacts with the individual systems and manually call those
*/
#include "AppCmdBuffer.h"
// Initializes the command buffer: reserves chunk storage for commands_count
// Command slots (64-byte aligned) out of buf and prepares the mutex used by
// the thrd_* entry points.
// NOTE(review): no matching destroy is visible here - presumably the mutex
// lives for the whole application run; confirm.
inline
void cmd_buffer_create(AppCmdBuffer* cb, BufferMemory* buf, int32 commands_count)
{
    chunk_init(&cb->commands, buf, commands_count, sizeof(Command), 64);
    pthread_mutex_init(&cb->mutex, NULL);
}
// This doesn't load the asset directly but tells (most likely) a worker thread to load an asset
// Hands the asset identifier stored in the command payload to the
// assets_to_load queue so a worker thread performs the actual load.
// Blocks if the queue is full (wait variant of the atomic enqueue).
static inline
void cmd_asset_load_enqueue(AppCmdBuffer* cb, Command* cmd)
{
    queue_enqueue_wait_atomic(cb->assets_to_load, (byte *) cmd->data);
}
// Executes the user callback stored at the start of the command payload.
// The command itself is forwarded so the callback can read its own data.
static inline
void* cmd_func_run(AppCmdBuffer* cb, Command* cmd)
{
    (void) cb;
    CommandFunc callback = *((CommandFunc *) cmd->data);
    return callback(cmd);
}
// Synchronously loads the asset whose decimal id string is in the payload.
// The high byte of the id selects which archive the asset lives in.
static inline
Asset* cmd_asset_load(AppCmdBuffer* cb, Command* cmd)
{
    const int32 id = (int32) str_to_int((char *) cmd->data);
    AssetArchive* archive = &cb->asset_archives[(id >> 24) & 0xFF];
    return asset_archive_asset_load(archive, id, cb->ams, cb->thrd_mem_vol);
}
// Pushes an already-loaded audio asset into the mixer.
// Returns NULL if the asset is not resident (caller keeps the command queued).
static inline
Asset* cmd_audio_play_enqueue(AppCmdBuffer* cb, Command* cmd)
{
    Asset* asset = thrd_ams_get_asset_wait(cb->ams, (char *) cmd->data);
    if (!asset) {
        return asset;
    }
    // @todo How to handle settings = AudioInstance
    // NOTE(review): `(cmd->data + 32)` is array-decay pointer arithmetic and is
    // always non-NULL, so the conditional always takes the first branch. Worse,
    // Command::data is only 28 bytes, so reading an int32 at offset 32 is
    // out of bounds. This mixer-index selection needs to be reworked - confirm
    // the intended payload layout.
    audio_mixer_play(
        &cb->mixer[(cmd->data + 32) ? *((int32 *) (cmd->data + 32)) : 0], // @bug how to handle multiple mixers
        asset->official_id + 1, // @bug + 1 necessary since it starts at 0, I think. we are still in the design phase :)
        (Audio *) asset->self
    );
    return asset;
}
// Plays an audio asset if it is resident; otherwise queues it for loading.
// Returns the asset, or NULL when the load had to be deferred.
static inline
Asset* cmd_audio_play_async(AppCmdBuffer* cb, Command* cmd)
{
    Asset* cached = thrd_ams_get_asset_wait(cb->ams, (char *) cmd->data);
    if (cached) {
        // Already resident -> hand over to the mixer enqueue path
        cmd_audio_play_enqueue(cb, cmd);
        return cached;
    }
    // Not loaded yet -> a worker thread picks it up from the queue
    cmd_asset_load_enqueue(cb, cmd);
    return NULL;
}
// Performs the gpu-api specific fixups for an already-loaded texture asset.
// Returns NULL if the asset is not resident.
static inline
Asset* cmd_texture_create(AppCmdBuffer* cb, Command* cmd)
{
    Asset* asset = thrd_ams_get_asset_wait(cb->ams, (char *) cmd->data);
    if (!asset) {
        return NULL;
    }
    // OpenGL expects bottom-to-top image data; flip unless already stored so
    Texture* tex = (Texture *) asset->self;
    bool needs_flip = cb->gpu_api == GPU_API_TYPE_OPENGL
        && !(tex->image.image_settings & IMAGE_SETTING_BOTTOM_TO_TOP);
    if (needs_flip) {
        image_flip_vertical(cb->thrd_mem_vol, &tex->image);
    }
    return asset;
}
// Finalizes a texture if it is resident; otherwise queues the asset load.
// Returns the asset, or NULL when the load had to be deferred.
static inline
Asset* cmd_texture_load_async(AppCmdBuffer* cb, Command* cmd)
{
    Asset* loaded = thrd_ams_get_asset_wait(cb->ams, (char *) cmd->data);
    if (loaded) {
        cmd_texture_create(cb, cmd);
        return loaded;
    }
    cmd_asset_load_enqueue(cb, cmd);
    return NULL;
}
// Performs the gpu-api specific fixups for an already-loaded font asset.
// Returns NULL if the asset is not resident.
static inline
Asset* cmd_font_create(AppCmdBuffer* cb, Command* cmd)
{
    Asset* asset = thrd_ams_get_asset_wait(cb->ams, (char *) cmd->data);
    if (!asset) {
        return NULL;
    }
    // OpenGL uses an inverted y-axis compared to the stored font coordinates
    if (cb->gpu_api == GPU_API_TYPE_OPENGL) {
        font_invert_coordinates((Font *) asset->self);
    }
    return asset;
}
// Finalizes a font if it is resident; otherwise queues the asset load.
// Returns the asset, or NULL when the load had to be deferred.
static inline
Asset* cmd_font_load_async(AppCmdBuffer* cb, Command* cmd)
{
    Asset* loaded = thrd_ams_get_asset_wait(cb->ams, (char *) cmd->data);
    if (loaded) {
        cmd_font_create(cb, cmd);
        return loaded;
    }
    cmd_asset_load_enqueue(cb, cmd);
    return NULL;
}
// Thread-safe insertion of a command (copied by value) into the buffer.
// Holds cb->mutex for the reserve + copy so cmd_iterate never sees a
// half-written command.
inline
void thrd_cmd_insert(AppCmdBuffer* cb, Command* cmd_temp)
{
    pthread_mutex_lock(&cb->mutex);
    int32 index = chunk_reserve(&cb->commands, 1);
    if (index < 0) {
        // Buffer full - the command is dropped (asserted in debug builds)
        pthread_mutex_unlock(&cb->mutex);
        ASSERT_SIMPLE(false);
        return;
    }
    // Track the highest occupied slot so iteration can stop early
    if (index > cb->last_element) {
        cb->last_element = index;
    }
    Command* cmd = (Command *) chunk_get_element(&cb->commands, index);
    memcpy(cmd, cmd_temp, sizeof(Command));
    pthread_mutex_unlock(&cb->mutex);
}
// Convenience overload: queues a command whose payload is a raw int32.
inline
void thrd_cmd_insert(AppCmdBuffer* cb, CommandType type, int32 data)
{
    Command cmd = {};
    cmd.type = type;
    memcpy(cmd.data, &data, sizeof(data));
    thrd_cmd_insert(cb, &cmd);
}
// Convenience overload: queues a command whose payload is a short string
// (e.g. an asset name / hex id).
inline
void thrd_cmd_insert(AppCmdBuffer* cb, CommandType type, const char* data)
{
    Command cmd = {};
    cmd.type = type;
    str_copy_short((char *) cmd.data, data);
    thrd_cmd_insert(cb, &cmd);
}
// Queues a callback command under the caller-supplied type.
// Bug fix: the `type` parameter was previously ignored and the command was
// always tagged CMD_FUNC_RUN; honor the argument instead (callers that pass
// CMD_FUNC_RUN are unaffected).
inline void thrd_cmd_func_insert(AppCmdBuffer* cb, CommandType type, CommandFunc* func) {
    Command cmd;
    cmd.type = type;
    *((CommandFunc *) cmd.data) = *func;
    thrd_cmd_insert(cb, &cmd);
}
// Queues an audio-play command; payload = asset id as raw int32.
inline void thrd_cmd_audio_play(AppCmdBuffer* cb, int32 data) {
    Command cmd = {};
    cmd.type = CMD_AUDIO_PLAY;
    memcpy(cmd.data, &data, sizeof(data));
    thrd_cmd_insert(cb, &cmd);
}
// Queues an audio-play command; payload = asset name (short string).
inline void thrd_cmd_audio_play(AppCmdBuffer* cb, const char* data) {
    Command cmd = {};
    cmd.type = CMD_AUDIO_PLAY;
    str_copy_short((char *) cmd.data, data);
    thrd_cmd_insert(cb, &cmd);
}
// Queues a user callback for deferred execution by cmd_iterate().
inline void thrd_cmd_func_run(AppCmdBuffer* cb, CommandFunc* func) {
    Command cmd = {};
    cmd.type = CMD_FUNC_RUN;
    memcpy(cmd.data, func, sizeof(CommandFunc));
    thrd_cmd_insert(cb, &cmd);
}
// Queues a texture-load command; payload = asset id as raw int32.
inline void thrd_cmd_texture_load(AppCmdBuffer* cb, int32 data) {
    Command cmd = {};
    cmd.type = CMD_TEXTURE_LOAD;
    memcpy(cmd.data, &data, sizeof(data));
    thrd_cmd_insert(cb, &cmd);
}
// Queues a texture-load command; payload = asset name (short string).
inline void thrd_cmd_texture_load(AppCmdBuffer* cb, const char* data) {
    Command cmd = {};
    cmd.type = CMD_TEXTURE_LOAD;
    str_copy_short((char *) cmd.data, data);
    thrd_cmd_insert(cb, &cmd);
}
// Queues a font-load command; payload = asset id as raw int32.
inline void thrd_cmd_font_load(AppCmdBuffer* cb, int32 data) {
    Command cmd = {};
    cmd.type = CMD_FONT_LOAD;
    memcpy(cmd.data, &data, sizeof(data));
    thrd_cmd_insert(cb, &cmd);
}
// Queues a font-load command; payload = asset name (short string).
inline void thrd_cmd_font_load(AppCmdBuffer* cb, const char* data) {
    Command cmd = {};
    cmd.type = CMD_FONT_LOAD;
    str_copy_short((char *) cmd.data, data);
    thrd_cmd_insert(cb, &cmd);
}
// Synchronously loads an asset by id in the main-loop context.
// The high byte of the id selects the archive.
inline Asset* cmd_asset_load(AppCmdBuffer* cb, int32 asset_id)
{
    AssetArchive* archive = &cb->asset_archives[(asset_id >> 24) & 0xFF];
    return asset_archive_asset_load(archive, asset_id, cb->ams, cb->mem_vol);
}
// Synchronously loads an asset given its id as a decimal string.
inline Asset* cmd_asset_load(AppCmdBuffer* cb, const char* asset_id_str)
{
    const int32 id = (int32) str_to_int(asset_id_str);
    AssetArchive* archive = &cb->asset_archives[(id >> 24) & 0xFF];
    return asset_archive_asset_load(archive, id, cb->ams, cb->mem_vol);
}
// Plays an audio asset by id, loading it synchronously from its archive if it
// is not resident yet. Returns the asset, or NULL if it could not be loaded.
inline Asset* cmd_audio_play(AppCmdBuffer* cb, int32 asset_id)
{
    // Check if asset already loaded (assets are keyed by their hex id string)
    char id_str[9];
    int_to_hex(asset_id, id_str);
    Asset* asset = thrd_ams_get_asset_wait(cb->ams, id_str);
    // Load asset if not loaded
    if (!asset) {
        int32 archive_id = (asset_id >> 24) & 0xFF;
        asset = asset_archive_asset_load(&cb->asset_archives[archive_id], asset_id, cb->ams, cb->mem_vol);
        if (!asset) {
            // Bug fix: a failed load previously fell through into a NULL dereference below
            ASSERT_SIMPLE(asset);
            return NULL;
        }
    }
    // @todo How to handle settings = AudioInstance
    audio_mixer_play(
        &cb->mixer[0], // @bug how to handle multiple mixers
        asset->official_id + 1, // @bug + 1 necessary since it starts at 0, I think. we are still in the design phase :)
        (Audio *) asset->self
    );
    return asset;
}
// Plays an audio asset by name (hex id string), loading it synchronously if
// needed. Returns the asset, or NULL if it could not be loaded.
inline Asset* cmd_audio_play(AppCmdBuffer* cb, const char* name) {
    // Check if asset already loaded
    Asset* asset = thrd_ams_get_asset_wait(cb->ams, name);
    // Load asset if not loaded
    if (!asset) {
        int32 asset_id = (int32) hex_to_int(name);
        int32 archive_id = (asset_id >> 24) & 0xFF;
        asset = asset_archive_asset_load(&cb->asset_archives[archive_id], asset_id, cb->ams, cb->mem_vol);
        if (!asset) {
            // Bug fix: a failed load previously fell through into a NULL dereference below
            ASSERT_SIMPLE(asset);
            return NULL;
        }
    }
    // @todo How to handle settings = AudioInstance
    audio_mixer_play(
        &cb->mixer[0], // @bug how to handle multiple mixers
        asset->official_id + 1, // @bug + 1 necessary since it starts at 0, I think. we are still in the design phase :)
        (Audio *) asset->self
    );
    return asset;
}
// Immediately runs a user callback with no command context.
inline void* cmd_func_run(AppCmdBuffer* cb, CommandFunc func) {
    (void) cb;
    return func(NULL);
}
// Loads a texture asset by id (synchronously if not resident) and applies the
// gpu-api specific image orientation fixup.
// Returns the asset, or NULL if it could not be loaded.
inline Asset* cmd_texture_load(AppCmdBuffer* cb, int32 asset_id) {
    // Check if asset already loaded (assets are keyed by their hex id string)
    char id_str[9];
    int_to_hex(asset_id, id_str);
    Asset* asset = thrd_ams_get_asset_wait(cb->ams, id_str);
    // Load asset if not loaded
    if (!asset) {
        int32 archive_id = (asset_id >> 24) & 0xFF;
        asset = asset_archive_asset_load(&cb->asset_archives[archive_id], asset_id, cb->ams, cb->mem_vol);
        if (!asset) {
            // Bug fix: guard against a failed archive load (was a NULL deref below)
            ASSERT_SIMPLE(asset);
            return NULL;
        }
    }
    // Setup basic texture: OpenGL samples images bottom-to-top
    Texture* texture = (Texture *) asset->self;
    if (cb->gpu_api == GPU_API_TYPE_OPENGL
        && !(texture->image.image_settings & IMAGE_SETTING_BOTTOM_TO_TOP)
    ) {
        image_flip_vertical(cb->mem_vol, &texture->image);
    }
    // @question What about texture upload?
    return asset;
}
// Loads a texture asset by name (hex id string) and applies the gpu-api
// specific image orientation fixup.
// Returns the asset, or NULL if it could not be loaded.
inline Asset* cmd_texture_load(AppCmdBuffer* cb, const char* name) {
    // Check if asset already loaded
    Asset* asset = thrd_ams_get_asset_wait(cb->ams, name);
    // Load asset if not loaded
    if (!asset) {
        int32 asset_id = (int32) hex_to_int(name);
        int32 archive_id = (asset_id >> 24) & 0xFF;
        asset = asset_archive_asset_load(&cb->asset_archives[archive_id], asset_id, cb->ams, cb->mem_vol);
        if (!asset) {
            // Bug fix: guard against a failed archive load (was a NULL deref below)
            ASSERT_SIMPLE(asset);
            return NULL;
        }
    }
    // Setup basic texture: OpenGL samples images bottom-to-top
    Texture* texture = (Texture *) asset->self;
    if (cb->gpu_api == GPU_API_TYPE_OPENGL
        && !(texture->image.image_settings & IMAGE_SETTING_BOTTOM_TO_TOP)
    ) {
        image_flip_vertical(cb->mem_vol, &texture->image);
    }
    // @question What about texture upload?
    return asset;
}
// Loads a font asset by id (synchronously if not resident) and applies the
// gpu-api specific coordinate fixup.
// Returns the asset, or NULL if it could not be loaded.
inline Asset* cmd_font_load(AppCmdBuffer* cb, int32 asset_id) {
    // Check if asset already loaded (assets are keyed by their hex id string)
    char id_str[9];
    int_to_hex(asset_id, id_str);
    Asset* asset = thrd_ams_get_asset_wait(cb->ams, id_str);
    // Load asset if not loaded
    if (!asset) {
        int32 archive_id = (asset_id >> 24) & 0xFF;
        asset = asset_archive_asset_load(&cb->asset_archives[archive_id], asset_id, cb->ams, cb->mem_vol);
        if (!asset) {
            // Bug fix: guard against a failed archive load (was a NULL deref below)
            ASSERT_SIMPLE(asset);
            return NULL;
        }
    }
    // Setup font: OpenGL's y-axis is inverted compared to the stored coordinates
    Font* font = (Font *) asset->self;
    if (cb->gpu_api == GPU_API_TYPE_OPENGL) {
        font_invert_coordinates(font);
    }
    // @question What about also loading the font atlas
    return asset;
}
// Loads a font asset by name (hex id string) and applies the gpu-api specific
// coordinate fixup. Returns the asset, or NULL if it could not be loaded.
inline Asset* cmd_font_load(AppCmdBuffer* cb, const char* name) {
    // Check if asset already loaded
    Asset* asset = thrd_ams_get_asset_wait(cb->ams, name);
    // Load asset if not loaded
    if (!asset) {
        int32 asset_id = (int32) hex_to_int(name);
        int32 archive_id = (asset_id >> 24) & 0xFF;
        asset = asset_archive_asset_load(&cb->asset_archives[archive_id], asset_id, cb->ams, cb->mem_vol);
        if (!asset) {
            // Bug fix: guard against a failed archive load (was a NULL deref below)
            ASSERT_SIMPLE(asset);
            return NULL;
        }
    }
    // Setup font: OpenGL's y-axis is inverted compared to the stored coordinates
    Font* font = (Font *) asset->self;
    if (cb->gpu_api == GPU_API_TYPE_OPENGL) {
        font_invert_coordinates(font);
    }
    // @question What about also loading the font atlas
    return asset;
}
// @question In some cases we don't remove an element if it couldn't get completed
// Would it make more sense to remove it and add a new follow up command automatically in such cases?
// e.g. couldn't play audio since it isn't loaded -> queue for asset load -> queue for internal play
// I gues this only makes sense if we would switch to a queue
// Drains the command buffer once: executes every pending command and frees the
// slots of commands that completed. Commands that could not complete yet
// (e.g. their asset is still loading) stay in the buffer for the next pass.
void cmd_iterate(AppCmdBuffer* cb)
{
    // Highest slot that still holds an unfinished command after this pass
    int32 last_element = 0;
    int32 chunk_id = 0;
    // NOTE(review): chunk_iterate_start/chunk_iterate_end are project macros
    // that appear to drive chunk_id over all occupied slots
    chunk_iterate_start(&cb->commands, chunk_id)
    Command* cmd = (Command *) chunk_get_element(&cb->commands, chunk_id);
    // Assume completion; handlers below clear this when the work must be retried
    bool remove = true;
    switch (cmd->type) {
        case CMD_FUNC_RUN: {
            cmd_func_run(cb, cmd);
        } break;
        case CMD_ASSET_ENQUEUE: {
            cmd_asset_load_enqueue(cb, cmd);
        } break;
        case CMD_ASSET_LOAD: {
            cmd_asset_load(cb, cmd);
        } break;
        case CMD_FILE_LOAD: {} break;
        case CMD_TEXTURE_LOAD: {
            remove = cmd_texture_load_async(cb, cmd) != NULL;
        } break;
        case CMD_TEXTURE_CREATE: {
            // Internal only
            cmd_texture_create(cb, cmd);
        } break;
        case CMD_FONT_LOAD: {
            remove = cmd_font_load_async(cb, cmd) != NULL;
        } break;
        case CMD_FONT_CREATE: {
            // Internal only
            cmd_font_create(cb, cmd);
        } break;
        case CMD_AUDIO_PLAY: {
            cmd_audio_play_async(cb, cmd);
        } break;
        case CMD_AUDIO_ENQUEUE: {
            // Internal only
            remove = cmd_audio_play_enqueue(cb, cmd) != NULL;
        } break;
        case CMD_SHADER_LOAD: {
            remove = cmd_shader_load(cb, cmd) != NULL;
        } break;
        default: {
            UNREACHABLE();
        }
    }
    if (!remove) {
        last_element = chunk_id;
        continue;
    }
    // NOTE(review): free_index/bit_index are not declared in this function -
    // presumably they are introduced by the chunk_iterate_start macro; if not,
    // this should use chunk_id. Confirm against the ChunkMemory implementation.
    chunk_free_element(&cb->commands, free_index, bit_index);
    // @performance This adds some unnecessary overhead.
    // It would be much better, if we could define cb->last_element as the limit in the for loop
    if (chunk_id == cb->last_element) {
        break;
    }
    chunk_iterate_end;
    cb->last_element = last_element;
}
// @performance Locking the entire thing during the iteration is horribly slow, fix.
// Solution 1: Use Queue
// Solution 2: create a mask for the chunk->free which will be set (and only then locked) after everything is done
// This has the risk that if it takes a long time we may run out of free indices for insert
// This shouldn't happen since the command buffer shouldn't fill up in just 1-3 frames
// Thread-safe wrapper: drains the command buffer while holding the buffer
// mutex for the whole pass (coarse-grained; see the surrounding @performance
// notes on possible finer-grained alternatives).
void thrd_cmd_iterate(AppCmdBuffer* cb)
{
    pthread_mutex_lock(&cb->mutex);
    cmd_iterate(cb);
    pthread_mutex_unlock(&cb->mutex);
}
#endif

60
command/AppCmdBuffer.h Normal file
View File

@ -0,0 +1,60 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_APP_COMMAND_BUFFER_H
#define TOS_APP_COMMAND_BUFFER_H
#include "../stdlib/Types.h"
#include "../memory/ChunkMemory.h"
#include "../memory/RingMemory.h"
#include "../audio/AudioMixer.h"
#include "../audio/Audio.h"
#include "../asset/AssetArchive.h"
#include "../gpuapi/GpuApiType.h"
#include "../asset/Asset.h"
#include "../asset/AssetManagementSystem.h"
#include "../object/Texture.h"
#include "../memory/Queue.h"
#include "Command.h"
struct AppCmdBuffer {
    // @performance A queue would be much faster than ChunkMemory.
    // We only use Chunk memory since we might want to run only certain commands instead of all of them
    ChunkMemory commands;
    // Highest occupied command slot; lets cmd_iterate() stop early
    int32 last_element;
    // Guards commands/last_element for the thrd_* entry points
    pthread_mutex_t mutex;
    // Application data for cmd access
    // The list below depends on what kind of systems our command buffer needs access to
    // Memory for when a buffer function (e.g. load_asset) is run in a thread context
    RingMemory* thrd_mem_vol;
    // Memory for when a buffer function (e.g. load_asset) is run in the main loop
    RingMemory* mem_vol;
    AssetManagementSystem* ams;
    // Indexed by archive id ((asset_id >> 24) & 0xFF)
    AssetArchive* asset_archives;
    // Queue consumed by worker threads that perform the actual asset loads
    Queue* assets_to_load;
    AudioMixer* mixer;
    GpuApiType gpu_api;
};
#if OPENGL
#include "../gpuapi/opengl/AppCmdBuffer.h"
#elif VULKAN
inline void* cmd_shader_load(AppCmdBuffer* cb, Command* cmd) { return NULL; }
inline void* cmd_shader_load(AppCmdBuffer* cb, void* shader, int32* shader_ids) { return NULL; }
#elif DIRECTX
inline void* cmd_shader_load(AppCmdBuffer* cb, Command* cmd) { return NULL; }
inline void* cmd_shader_load(AppCmdBuffer* cb, void* shader, int32* shader_ids) { return NULL; }
#else
inline void* cmd_shader_load(AppCmdBuffer* cb, Command* cmd) { return NULL; }
inline void* cmd_shader_load(AppCmdBuffer* cb, void* shader, int32* shader_ids) { return NULL; }
#endif
#endif

35
command/Command.h Normal file
View File

@ -0,0 +1,35 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_COMMAND_H
#define TOS_COMMAND_H
#include "../stdlib/Types.h"
// Identifies what a buffered Command should do when cmd_iterate() runs it
enum CommandType {
    CMD_FUNC_RUN,
    CMD_ASSET_ENQUEUE,
    CMD_ASSET_LOAD,
    CMD_FILE_LOAD,
    CMD_FONT_LOAD,
    CMD_FONT_CREATE,
    CMD_TEXTURE_LOAD,
    CMD_TEXTURE_CREATE, // Only for internal use
    CMD_AUDIO_PLAY,
    CMD_AUDIO_ENQUEUE, // Only for internal use
    CMD_SHADER_LOAD,
};
// A single buffered command. The payload layout depends on `type`:
// a short name/hex-id string, a raw int32, or a CommandFunc pointer.
struct Command {
    CommandType type;
    byte data[28]; // @todo to be adjusted
};
// User callback executed for CMD_FUNC_RUN; receives the command (or NULL)
typedef void* (*CommandFunc)(Command*);
#endif

View File

@ -1,78 +0,0 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_ANIMATION_ENTITY_H
#define TOS_ANIMATION_ENTITY_H
#include "../stdlib/Types.h"
#include "../animation/AnimationEaseType.h"
#include "../animation/Animation.h"
#include "../utils/BitUtils.h"
#include "EntityComponentSystem.h"
struct AnimationEntity {
AnimationEaseType type;
uint32 start_time;
uint32 last_time;
f32 interval;
f32 progress;
byte state_last;
byte state;
// @question Do we want another flag that indicates if the entity got handled by the main loop?
// this way we could do the animation process in a thread and only overwrite the state_last whenever the flag is true
// However, we would have to implement locking or atomics which might be really bad depending on how we use this data
};
void update_animation_entity(AnimationEntity* anim, uint32 time, uint32 delay)
{
anim->state_last = anim->state;
switch (anim->type) {
case ANIMATION_EASE_DISCRETE: {
anim->progress = anim_discrete((f32) (time - anim->start_time + delay) / (f32) anim->interval);
anim->state = (int32) ((f32) anim->state - anim->progress);
} break;
default: {}
}
}
void update_animation_entities(EntityComponentSystem* ecs, uint32 time, uint32 delay)
{
int32 chunk_bytes = (ecs->entity_data_memory.size + 63) / 64;
// @performance It might make sense to iterate by int16 or even int32 instead of byte. Needs profiling
for (int32 i = 0; i < chunk_bytes; ++i) {
// @question Do we want this to be the first case. It probably depends on how often a byte is realistically empty
if (!ecs->entity_data_memory.free[i]) {
continue;
} else if (ecs->entity_data_memory.free[i] == 256) {
// @performance If we go larger than 8bit in the outer loop we also have to adjust it here
// AND maybe we would want to do sub checks then for 8bit again
for (int32 j = 0; j < 8; ++j) {
AnimationEntity* anim = (AnimationEntity *) chunk_get_element(&ecs->entity_data_memory, i * 8 + j);
update_animation_entity(anim, time, delay);
}
continue;
}
// @performance If we go larger than 8bit in the outer loop we also have to adjust it here
// AND maybe we would want to do sub checks then for 8bit again
for (int32 j = 0; j < 8; ++j) {
if (!IS_BIT_SET_L2R(ecs->entity_data_memory.free[i], j, 1)) {
continue;
}
AnimationEntity* anim = (AnimationEntity *) chunk_get_element(&ecs->entity_data_memory, i * 8 + j);
update_animation_entity(anim, time, delay);
}
}
}
#endif

View File

@ -0,0 +1,69 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_ANIMATION_ENTITY_H
#define TOS_ANIMATION_ENTITY_H
#include "../stdlib/Types.h"
#include "../animation/AnimationEaseType.h"
#include "../animation/Animation.h"
#include "../utils/BitUtils.h"
#include "EntityComponentSystem.h"
#include "Entity.h"
// Bit flags stored in AnimationEntityComponent::setting
enum AnimationSetting {
    ANIMATION_SETTING_PAUSE = 1 << 0,  // Skipped by update_animation_entities()
    ANIMATION_SETTING_REPEAT = 1 << 1, // Presumably loops the animation - not yet consumed in visible code
};
// Per-entity animation state advanced by update_animation_entity()
struct AnimationEntityComponent {
    Entity* entity;         // Entity this animation belongs to
    AnimationEaseType type; // Which easing curve drives the state transition
    uint32 start_time;      // Timestamp the animation started at
    uint32 last_time;
    f32 interval;           // Duration used to normalize elapsed time to [0..1]
    f32 progress;           // Last computed ease value
    byte state_last;        // State before the most recent update
    byte state;
    // Contains repeat, pause etc
    byte setting;
    // @question Do we want another flag that indicates if the entity got handled by the main loop?
    // this way we could do the animation process in a thread and only overwrite the state_last whenever the flag is true
    // However, we would have to implement locking or atomics which might be really bad depending on how we use this data
};
// Advances a single animation component to the given timestamp.
// `time` is the current time and `delay` an extra offset, both in the same unit
// as `start_time`/`interval` (unit not visible from here — TODO confirm).
// Only the discrete ease is implemented so far; other types fall through unchanged.
static inline
void update_animation_entity(AnimationEntityComponent* anim, uint32 time, uint32 delay)
{
// Remember the previous state so consumers can detect a state transition
anim->state_last = anim->state;
switch (anim->type) {
case ANIMATION_EASE_DISCRETE: {
// anim_discrete() yields 0.0 until the interval elapses, then 1.0
anim->progress = anim_discrete((f32) (time - anim->start_time + delay) / (f32) anim->interval);
// NOTE(review): the state is DECREMENTED by progress (0 or 1 here), i.e. a
// finished discrete animation steps the state down by one. FLOAT_CAST_EPS
// presumably guards the float->byte truncation — confirm both are intended.
anim->state = (byte) ((f32) anim->state - anim->progress + FLOAT_CAST_EPS);
} break;
default: {}
}
}
// Updates every active animation component stored in the given chunk pool.
// Components flagged ANIMATION_SETTING_PAUSE are skipped.
// NOTE(review): assumes chunk_iterate_start/chunk_iterate_end expand to a loop
// over used slots so that `continue` advances to the next element — confirm macro.
void update_animation_entities(ChunkMemory* anim_ec, uint32 time, uint32 delay)
{
int32 chunk_id = 0;
chunk_iterate_start(anim_ec, chunk_id)
AnimationEntityComponent* anim = (AnimationEntityComponent *) chunk_get_element(anim_ec, chunk_id);
if (anim->setting & ANIMATION_SETTING_PAUSE) {
continue;
}
update_animation_entity(anim, time, delay);
chunk_iterate_end;
}
#endif

20
entity/CursorEntity.h Normal file
View File

@ -0,0 +1,20 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_ENTITY_CURSOR_H
#define TOS_ENTITY_CURSOR_H
#include "Entity.h"
#include "AnimationEntityComponent.h"
// Bundles the component pointers that make up a cursor entity.
struct EntityCursor {
// General/base entity data
Entity* general;
// Animation state for the cursor (exact animation use not visible here — TODO confirm)
AnimationEntityComponent* anim;
};
#endif

View File

@ -11,41 +11,23 @@
#include "../stdlib/Types.h"
#include "../stdlib/HashMap.h"
#include "EntityType.h"
#define MAX_ENTITY_NAME_LENGTH 32
struct Entity {
// The id is the same as its location in memory/in the ecs array
// This is is only an internal id and NOT the same as a db id (e.g. player id)
uint64 internal_id;
uint32 internal_id;
EntityType type;
uint32 last_access;
uint64 last_access;
// Which entity is used
byte type;
// Variable used for thread safety
bool is_loaded;
byte state;
// Describes if the asset can be removed/garbage collected IF necessary
// This however only happens if space is needed
bool can_garbage_collect_ram;
bool can_garbage_collect_vram;
// Counts the references to this entity
// e.g. textures
int16 reference_count;
// A entity can reference up to N other entities
// This allows us to quickly update the other entities
// Example: A player pulls N mobs
// @bug This means there are hard limits on how many mobs can be pulled by a player
// @question should this be an entity id?
Entity* references[50];
uint64 free_references; // bits show which is free
// @question should this be an entity id?
Entity* schema; // This entity represents the schema for this entity (most likely stored in a separate ecs)
// This entity represents the schema for this entity (most likely stored in a separate ecs)
uint32 schema;
// Actual memory address and specific entity data
byte* self;
@ -59,7 +41,7 @@ struct EntitySchema {
// Could be 0 if there is no official id
uint64 official_id;
EntityType type;
byte type;
// Counts the references to this entity
// e.g. textures

View File

@ -13,55 +13,109 @@
#include "../stdlib/Types.h"
#include "../memory/ChunkMemory.h"
#include "../utils/TestUtils.h"
#include "../utils/BitUtils.h"
#include "../stdlib/HashMap.h"
#include "../log/DebugMemory.h"
#include "Entity.h"
// Entities can be directly accessed by their id
// highest byte = entity type, lower bytes = id in respective ecs
struct EntityComponentSystem {
// @question is this even necessary or could we integrate this directly into the system here?
HashMap hash_map;
int32 entity_type_count;
int32 component_type_count;
ChunkMemory* entities;
ChunkMemory* components;
uint64 ram_size;
uint64 vram_size;
uint64 entity_count;
int32 overhead;
uint64 component_count;
// @question Do we want this, I would assume this should be almost always true in the final game
bool has_changed;
// The indices of entity_memory and entity_data_memory are always linked
// @question Consider to reset entity_memory->last_pos to 0 before adding a new element
// This allows us to make the chunk memory more continuous which is better for iteration later on
// However, adding elements would now be slower. Needs profiling
// General entity memory
ChunkMemory entity_memory;
// Actual entity data
ChunkMemory entity_data_memory;
// @question Do we want to add a mutex to assets. This way we don't have to lock the entire ams.
pthread_mutex_t* entity_mutex;
pthread_mutex_t* component_mutex;
};
struct EntitySchemaSystem {
// @question is this even necessary or could we integrate this directly into the system here?
HashMap hash_map;
inline
void ecs_create(EntityComponentSystem* ecs, BufferMemory* buf, int32 entity_count, int32 component_count)
{
ecs->entity_type_count = entity_count;
ecs->entities = (ChunkMemory *) buffer_get_memory(buf, sizeof(ChunkMemory) * entity_count, 64);
uint64 ram_size;
uint64 vram_size;
uint64 entity_count;
int32 overhead;
bool has_changed;
ecs->component_type_count = component_count;
ecs->components = (ChunkMemory *) buffer_get_memory(buf, sizeof(ChunkMemory) * component_count, 64);
}
// The indices of entity_memory and entity_data_memory are always linked
inline
void ecs_entity_type_create(ChunkMemory* ec, BufferMemory* buf, int32 chunk_size, int32 count)
{
ASSERT_SIMPLE(chunk_size);
// General entity memory
ChunkMemory entity_memory;
chunk_init(ec, buf, count, chunk_size, 64);
//pthread_mutex_init(&ec->mutex, NULL);
}
// Actual entity data
ChunkMemory entity_data_memory;
inline
void ecs_component_type_create(ChunkMemory* ec, BufferMemory* buf, int32 chunk_size, int32 count)
{
ASSERT_SIMPLE(chunk_size);
EntitySchema* first;
EntitySchema* last;
};
chunk_init(ec, buf, count, chunk_size, 64);
//pthread_mutex_init(&ec->mutex, NULL);
}
// Resolves an encoded entity id to its Entity, or NULL when the slot is unused.
// Ids pack the entity/ecs type into the highest byte and the slot index into
// the lower 24 bits (see the comment on EntityComponentSystem).
Entity* ecs_get_entity(EntityComponentSystem* ecs, int32 entity_id)
{
    int32 type_idx = (entity_id >> 24) & 0xFF;
    int32 slot = entity_id & 0x00FFFFFF;

    ChunkMemory* mem = &ecs->entities[type_idx];

    // Only hand out the element if its slot is marked as reserved in the
    // chunk's occupancy mask (a set bit means "in use", see chunk_reserve).
    if (!IS_BIT_SET_64_R2L(mem->free[slot / 64], slot & 63)) {
        return NULL;
    }

    return (Entity *) chunk_get_element(mem, slot);
}
// Reserves one slot in the entity pool for the given type and returns the
// uninitialized Entity, or NULL (after asserting) when the pool is exhausted.
Entity* ecs_reserve_entity(EntityComponentSystem* ecs, uint32 entity_type)
{
    ChunkMemory* chunk = &ecs->entities[entity_type];

    int32 slot = chunk_reserve(chunk, 1);
    if (slot >= 0) {
        // @todo log entity stats (count, ram, vram)
        return (Entity *) chunk_get_element(chunk, slot);
    }

    // Pool exhausted — surface it loudly in debug builds
    ASSERT_SIMPLE(slot >= 0);
    return NULL;
}
// Copies `entity_temp` into a freshly reserved slot of the given type's pool.
// Returns the pooled Entity, or NULL (after asserting) when no slot is free.
// The full chunk_size bytes are copied, not just sizeof(Entity).
Entity* ecs_insert_entity(EntityComponentSystem* ecs, Entity* entity_temp, int32 entity_type)
{
    ChunkMemory* chunk = &ecs->entities[entity_type];

    int32 slot = chunk_reserve(chunk, 1);
    if (slot < 0) {
        // Pool exhausted — surface it loudly in debug builds
        ASSERT_SIMPLE(slot >= 0);
        return NULL;
    }

    Entity* dst = (Entity *) chunk_get_element(chunk, slot);
    memcpy(dst, entity_temp, chunk->chunk_size);

    // @todo log entity stats (count, ram, vram)
    //DEBUG_MEMORY_RESERVE((uint64) dst, dst->ram_size, 180);

    return dst;
}
// @todo Not implemented yet — component counterpart to ecs_insert_entity.
void ecs_insert_component()
{
}
#endif

28
entity/EntitySize.h Normal file
View File

@ -0,0 +1,28 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_ENTITY_SIZE_H
#define TOS_ENTITY_SIZE_H
// Size buckets for entities; the names denote powers of two from 32 to 65536
// (presumably bytes — TODO confirm). Note the enum VALUES are sequential
// indices (0, 1, 2, ...), not the sizes themselves.
enum EntitySize {
ENTITY_SIZE_32,
ENTITY_SIZE_64,
ENTITY_SIZE_128,
ENTITY_SIZE_256,
ENTITY_SIZE_512,
ENTITY_SIZE_1024,
ENTITY_SIZE_2048,
ENTITY_SIZE_4096,
ENTITY_SIZE_8192,
ENTITY_SIZE_16384,
ENTITY_SIZE_32768,
ENTITY_SIZE_65536,
// Number of buckets, not a valid bucket itself
ENTITY_SIZE_SIZE
};
#endif

View File

@ -1,21 +0,0 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_ENTITY_TYPE_H
#define TOS_ENTITY_TYPE_H
// Categories of entities managed by the ECS.
enum EntityType {
ENTITY_TYPE_MONSTER,
ENTITY_TYPE_NPC,
ENTITY_TYPE_PLAYER,
ENTITY_TYPE_ITEM,
ENTITY_TYPE_OBJ,
// Number of entity types, not a valid type itself
ENTITY_TYPE_SIZE
};
#endif

View File

@ -5,18 +5,8 @@
#include "../memory/BufferMemory.h"
#include "../utils/EndianUtils.h"
#include "../utils/Utils.h"
#if __aarch64__
#include "../stdlib/sve/SVE_I32.h"
#else
#include "../stdlib/simd/SIMD_I32.h"
#endif
#if _WIN32
#include "../platform/win32/FileUtils.cpp"
#else
#include "../platform/linux/FileUtils.cpp"
#endif
#include "../stdlib/Simd.h"
#include "../system/FileUtils.cpp"
struct GlyphMetrics {
f32 width; // Width of the glyph
@ -212,15 +202,6 @@ int32 font_from_data(
memcpy(font->glyphs, pos, font->glyph_count * sizeof(Glyph));
#if OPENGL
// @todo Implement y-offset correction
for (uint32 i = 0; i < font->glyph_count; ++i) {
float temp = font->glyphs[i].coords.y1;
font->glyphs[i].coords.y1 = 1.0f - font->glyphs[i].coords.y2;
font->glyphs[i].coords.y2 = 1.0f - temp;
}
#endif
SWAP_ENDIAN_LITTLE_SIMD(
(int32 *) font->glyphs,
(int32 *) font->glyphs,

19
gpuapi/GpuApiType.h Normal file
View File

@ -0,0 +1,19 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_GPUAPI_TYPE_H
#define TOS_GPUAPI_TYPE_H
// Identifies which graphics backend the engine is running on.
enum GpuApiType {
GPU_API_TYPE_NONE,
GPU_API_TYPE_OPENGL,
GPU_API_TYPE_VULKAN,
GPU_API_TYPE_DIRECTX
};
#endif

25
gpuapi/ShaderType.h Normal file
View File

@ -0,0 +1,25 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_GPUAPI_SHADER_TYPE_H
#define TOS_GPUAPI_SHADER_TYPE_H
// Backend-agnostic shader stage identifiers, translated to API-specific
// enums by the gpuapi backends (e.g. shader_type_index for OpenGL).
enum ShaderType {
SHADER_TYPE_NONE,
SHADER_TYPE_VERTEX,
SHADER_TYPE_FRAGMENT,
SHADER_TYPE_GEOMETRY,
SHADER_TYPE_TESSELATION,
SHADER_TYPE_PIXEL,
SHADER_TYPE_MESH,
SHADER_TYPE_RAYTRACING,
SHADER_TYPE_TENSOR,
// Number of shader types, not a valid type itself
SHADER_TYPE_SIZE
};
#endif

View File

@ -6,8 +6,8 @@
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_GPUAPI_DIRECTX_GPU_API_CONTAINER
#define TOS_GPUAPI_DIRECTX_GPU_API_CONTAINER
#ifndef TOS_GPUAPI_DIRECTX_GPU_API_CONTAINER_H
#define TOS_GPUAPI_DIRECTX_GPU_API_CONTAINER_H
#include <windows.h>
#include <d3d12.h>

20
gpuapi/direct3d/Shader.h Normal file
View File

@ -0,0 +1,20 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_GPUAPI_DIRECT3D_SHADER_H
#define TOS_GPUAPI_DIRECT3D_SHADER_H
#include "../../stdlib/Types.h"
// Direct3D shader/program handle. Mirrors the OpenGL and Vulkan Shader structs
// (same fields) so gpu-api code can share a layout — TODO confirm intent.
struct Shader {
// Program/shader handle id
uint32 id;
// Cached uniform/resource locations (meaning of the 7 slots not visible here — TODO confirm)
uint32 locations[7];
// Backend-specific scratch data
byte data[16];
};
#endif

View File

@ -0,0 +1,66 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_GPUAPI_OPENGL_APP_CMD_BUFFER_H
#define TOS_GPUAPI_OPENGL_APP_CMD_BUFFER_H
#include "../../stdlib/Types.h"
#include "OpenglUtils.h"
#include "Shader.h"
#include "ShaderUtils.h"
#include "../ShaderType.h"
#include "../../asset/Asset.h"
// @todo Stub: command-buffer entry point for shader loading; not implemented yet.
void* cmd_shader_load(AppCmdBuffer* cb, Command* cmd) {
return NULL;
}
// Loads the sub-shader assets referenced by `shader_ids` (one per ShaderType,
// 0 = skip, negative = stop scanning), compiles each one via shader_make and
// links the first three stages (vertex/fragment/geometry) into a program whose
// id is stored in shader->id. Always returns NULL.
void* cmd_shader_load(AppCmdBuffer* cb, Shader* shader, int32* shader_ids) {
char asset_id[9];
int32 shader_assets[SHADER_TYPE_SIZE];
// -1 marks "stage not present" for program_make below
for (int32 i = 0; i < SHADER_TYPE_SIZE; ++i) {
shader_assets[i] = -1;
}
for (int32 i = 0; i < SHADER_TYPE_SIZE; ++i) {
if (!shader_ids[i]) {
continue;
} else if (shader_ids[i] < 0) {
break;
}
// Load sub asset
// Asset key is the hex representation of the shader id
int_to_hex(shader_ids[i], asset_id);
Asset* shader_asset = thrd_ams_get_asset_wait(cb->ams, asset_id);
if (!shader_asset) {
// Not cached in the ams yet: archive index lives in the id's high byte
int32 archive_id = (shader_ids[i] >> 24) & 0xFF;
shader_asset = asset_archive_asset_load(&cb->asset_archives[archive_id], shader_ids[i], cb->ams, cb->mem_vol);
// NOTE(review): shader_asset is dereferenced below without a NULL check —
// confirm asset_archive_asset_load cannot fail here.
}
// Make sub shader
// i+1 maps the array index back onto ShaderType (0 is SHADER_TYPE_NONE)
shader_assets[i] = shader_make(
shader_type_index((ShaderType) (i + 1)),
(char *) shader_asset->self,
cb->mem_vol
);
// Source text is no longer needed once compiled; allow garbage collection
shader_asset->state |= ASSET_STATE_RAM_GC;
shader_asset->state |= ASSET_STATE_VRAM_GC;
}
// Make shader/program
// Only the first three stages are linked; the remaining entries are unused here
shader->id = program_make(
shader_assets[0], shader_assets[1], shader_assets[2],
cb->mem_vol
);
return NULL;
}
#endif

View File

@ -6,8 +6,8 @@
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_GPUAPI_OPENGL_GPU_API_CONTAINER
#define TOS_GPUAPI_OPENGL_GPU_API_CONTAINER
#ifndef TOS_GPUAPI_OPENGL_GPU_API_CONTAINER_H
#define TOS_GPUAPI_OPENGL_GPU_API_CONTAINER_H
#include "../../stdlib/Types.h"
#include "OpenglUtils.h"

View File

@ -16,16 +16,13 @@
#include "../../image/Image.cpp"
#include "../../utils/StringUtils.h"
#include "../../log/Log.h"
#include "../../system/FileUtils.cpp"
#include "../RenderUtils.h"
#include "Opengl.h"
#if _WIN32
#include <windows.h>
#include "../../platform/win32/FileUtils.cpp"
#include "../../platform/win32/Window.h"
#elif __linux__
#include "../../platform/linux/FileUtils.cpp"
#include "../../platform/linux/Window.h"
#endif
@ -85,10 +82,10 @@ void opengl_info(OpenglInfo* info)
for (char *at = version; *at; ++at) {
if (*at == '.') {
info->major = str_to_int(version);
info->major = (int32) str_to_int(version);
++at;
info->minor = str_to_int(at);
info->minor = (int32) str_to_int(at);
break;
}
}

View File

@ -12,9 +12,9 @@
#include "../../stdlib/Types.h"
struct Shader {
uint32 shader_id;
uint32 shader_locations[7];
byte shader_data[16];
uint32 id;
uint32 locations[7];
byte data[16];
};
#endif

View File

@ -13,6 +13,19 @@
#include "../../memory/RingMemory.h"
#include "../../log/Log.h"
#include "Opengl.h"
#include "../ShaderType.h"
// Maps an engine ShaderType to the corresponding OpenGL shader object enum.
// Returns 0 for types without an OpenGL equivalent; callers must treat 0 as
// "no shader" (cmd_shader_load feeds the result straight into shader_make).
int32 shader_type_index(ShaderType type)
{
    switch (type) {
        case SHADER_TYPE_VERTEX:
            return GL_VERTEX_SHADER;
        case SHADER_TYPE_FRAGMENT:
            return GL_FRAGMENT_SHADER;
        case SHADER_TYPE_GEOMETRY:
            // Bug fix: geometry was missing even though cmd_shader_load loads
            // three stages (vertex/fragment/geometry) and passes them to
            // program_make — a geometry shader would have been created with
            // an invalid type 0.
            return GL_GEOMETRY_SHADER;
        default:
            // Tesselation maps to two GL types (control/eval) and the
            // remaining types have no GL equivalent — unsupported for now.
            return 0;
    }
}
// Set value based on shader uniform name
inline
@ -333,6 +346,7 @@ GLuint program_make(
return program;
}
// @question Depending on how the different gpu apis work we may want to pass Shader* to have a uniform structure
inline
void pipeline_use(uint32 id)
{

View File

@ -6,8 +6,8 @@
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_GPUAPI_VULKAN_GPU_API_CONTAINER
#define TOS_GPUAPI_VULKAN_GPU_API_CONTAINER
#ifndef TOS_GPUAPI_VULKAN_GPU_API_CONTAINER_H
#define TOS_GPUAPI_VULKAN_GPU_API_CONTAINER_H
#include "../../stdlib/Types.h"
#include <vulkan/vulkan.h>

20
gpuapi/vulkan/Shader.h Normal file
View File

@ -0,0 +1,20 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_GPUAPI_VULKAN_SHADER_H
#define TOS_GPUAPI_VULKAN_SHADER_H
#include "../../stdlib/Types.h"
// Vulkan shader/program handle. Mirrors the OpenGL and Direct3D Shader structs
// (same fields) so gpu-api code can share a layout — TODO confirm intent.
struct Shader {
// Program/shader handle id
uint32 id;
// Cached uniform/resource locations (meaning of the 7 slots not visible here — TODO confirm)
uint32 locations[7];
// Backend-specific scratch data
byte data[16];
};
#endif

View File

@ -42,7 +42,7 @@ inline void shader_set_value(VkDevice device, VkCommandBuffer commandBuffer, VkD
descriptorWrite.descriptorCount = 1;
descriptorWrite.pBufferInfo = &bufferInfo;
vkUpdateDescriptorSets(device, 1, &descriptorWrite, 0, nullptr);
vkUpdateDescriptorSets(device, 1, &descriptorWrite, 0, NULL);
}
VkShaderModule shader_make(VkDevice device, const char* source, int32 source_size)

View File

@ -743,7 +743,7 @@ void vulkan_command_pool_create(
void vulkan_command_buffer_create(VkDevice device, VkCommandBuffer* command_buffer, VkCommandPool command_pool)
{
VkCommandBufferAllocateInfo allocInfo{};
VkCommandBufferAllocateInfo allocInfo = {};
allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
allocInfo.commandPool = command_pool;
allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;

View File

@ -11,12 +11,7 @@
#include "../utils/StringUtils.h"
#include "../memory/RingMemory.h"
#if _WIN32
#include "../platform/win32/FileUtils.cpp"
#else
#include "../platform/linux/FileUtils.cpp"
#endif
#include "../system/FileUtils.cpp"
#include "Image.h"
#include "Tga.h"
@ -50,17 +45,6 @@ void image_flip_vertical(RingMemory* ring, Image* image)
memcpy(image->pixels + y * stride, end - y * stride, stride);
}
/* Flipping with small temp row
byte* temp_row = ring_get_memory(ring, stride);
for (int y = 0; y < image->height / 2; ++y) {
memcpy(temp_row, image->pixels + y * stride, stride);
memcpy(image->pixels + y * stride, image->pixels - y * stride, stride);
memcpy(image->pixels - y * stride, temp_row, stride);
}
*/
image->image_settings ^= IMAGE_SETTING_BOTTOM_TO_TOP;
}

View File

@ -3,12 +3,7 @@
#include "../stdlib/Types.h"
#include "../memory/RingMemory.h"
#if _WIN32
#include "../platform/win32/FileUtils.cpp"
#else
#include "../platform/linux/FileUtils.cpp"
#endif
#include "../system/FileUtils.cpp"
#define LANGUAGE_VERSION 1

View File

@ -9,13 +9,23 @@
#include "../utils/StringUtils.h"
#include "../utils/TestUtils.h"
#include "../utils/MathUtils.h"
#include "../thread/Atomic.h"
// Required for rdtsc();
#if _WIN32
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
global_persist DebugContainer* debug_container = NULL;
// WARNING: Spinlock uses TimeUtils which uses performance counter, which is part of DebugContainer
// @todo The explanation above is insane. We did this so we only have to set the performance counter once but it is biting us now
#include "../thread/Spinlock.cpp"
#if _WIN32
#include <windows.h>
#include "../platform/win32/threading/Atomic.h"
#include "../platform/win32/threading/Spinlock.cpp"
void setup_performance_count() {
if (!debug_container) {
return;
@ -26,8 +36,6 @@ global_persist DebugContainer* debug_container = NULL;
debug_container->performance_count_frequency = perf_counter.QuadPart;
}
#elif __linux__
#include "../platform/linux/threading/Atomic.h"
#include "../platform/linux/threading/Spinlock.cpp"
void setup_performance_count() {
if (!debug_container) {
return;
@ -102,8 +110,8 @@ void update_timing_stat(uint32 stat, const char* function)
spinlock_start(&debug_container->perf_stats_spinlock);
timing_stat->function = function;
timing_stat->delta_tick = new_tick_count - timing_stat->old_tick_count;
timing_stat->delta_time = (double) timing_stat->delta_tick / (double) debug_container->performance_count_frequency;
timing_stat->delta_tick = (uint32) (new_tick_count - timing_stat->old_tick_count);
timing_stat->delta_time = (f64) timing_stat->delta_tick / (f64) debug_container->performance_count_frequency;
timing_stat->old_tick_count = new_tick_count;
spinlock_end(&debug_container->perf_stats_spinlock);
}
@ -125,8 +133,8 @@ void update_timing_stat_end(uint32 stat, const char* function)
spinlock_start(&debug_container->perf_stats_spinlock);
timing_stat->function = function;
timing_stat->delta_tick = new_tick_count - timing_stat->old_tick_count;
timing_stat->delta_time = (double) timing_stat->delta_tick / (double) debug_container->performance_count_frequency;
timing_stat->delta_tick = (uint32) (new_tick_count - timing_stat->old_tick_count);
timing_stat->delta_time = (f64) timing_stat->delta_tick / (f64) debug_container->performance_count_frequency;
timing_stat->old_tick_count = new_tick_count;
spinlock_end(&debug_container->perf_stats_spinlock);
}
@ -140,8 +148,8 @@ void update_timing_stat_end_continued(uint32 stat, const char* function)
spinlock_start(&debug_container->perf_stats_spinlock);
timing_stat->function = function;
timing_stat->delta_tick = timing_stat->delta_tick + new_tick_count - timing_stat->old_tick_count;
timing_stat->delta_time = timing_stat->delta_time + (double) timing_stat->delta_tick / (double) debug_container->performance_count_frequency;
timing_stat->delta_tick = (uint32) ((uint32) (new_tick_count - timing_stat->old_tick_count) + timing_stat->delta_tick);
timing_stat->delta_time = timing_stat->delta_time + (f64) timing_stat->delta_tick / (f64) debug_container->performance_count_frequency;
timing_stat->old_tick_count = new_tick_count;
spinlock_end(&debug_container->perf_stats_spinlock);
}
@ -269,7 +277,7 @@ void debug_memory_reserve(uint64 start, uint64 size, int32 type, const char* fun
uint64 idx = atomic_fetch_add_relaxed(&mem->reserve_action_idx, 1);
if (idx >= ARRAY_COUNT(mem->reserve_action)) {
atomic_set_acquire(&mem->reserve_action_idx, 1);
idx %= ARRAY_COUNT(mem->last_action);
idx %= ARRAY_COUNT(mem->reserve_action);
}
DebugMemoryRange* dmr = &mem->reserve_action[idx];
@ -281,6 +289,27 @@ void debug_memory_reserve(uint64 start, uint64 size, int32 type, const char* fun
dmr->function_name = function;
}
// undo reserve
// Undoes a debug_memory_reserve record: finds the reserve_action entry whose
// start offset matches `start` and zeroes its size (see DEBUG_MEMORY_FREE).
// NOTE(review): `size` is currently unused — entries are matched by start only.
void debug_memory_free(uint64 start, uint64 size)
{
// Nothing to do when the pointer is null or debug tracking is inactive
if (!start || !debug_container) {
return;
}
DebugMemory* mem = debug_memory_find(start);
if (!mem) {
return;
}
// Linear scan of the reserve ring; first entry with a matching offset wins
for (int32 i = 0; i < ARRAY_COUNT(mem->reserve_action); ++i) {
DebugMemoryRange* dmr = &mem->reserve_action[i];
if (dmr->start == start - mem->start) {
dmr->size = 0;
return;
}
}
}
// @bug This probably requires thread safety
inline
void debug_memory_reset()
@ -302,7 +331,7 @@ void debug_memory_reset()
}
// @bug This probably requires thread safety
byte* log_get_memory(uint64 size, byte aligned = 1, bool zeroed = false)
byte* log_get_memory(uint64 size, byte aligned = 4, bool zeroed = false)
{
if (!debug_container) {
return 0;
@ -347,8 +376,8 @@ void log(const char* str, bool should_log, bool save, const char* file, const ch
size_t file_len = strlen(file);
size_t function_len = strlen(function);
char line_str[10];
int_to_str(line, line_str, '\0');
char line_str[14];
uint_to_str(line, line_str);
size_t line_len = strlen(line_str);

View File

@ -12,12 +12,10 @@
#include "../stdlib/Types.h"
#include "DebugMemory.h"
#include "TimingStat.h"
#include "../thread/Spinlock.h"
#if _WIN32
#include <windows.h>
#include "../platform/win32/threading/Spinlock.h"
#elif __linux__
#include "../platform/linux/threading/Spinlock.h"
#endif
struct LogMemory {
@ -26,7 +24,7 @@ struct LogMemory {
uint32 id;
uint64 size;
uint64 pos;
int32 alignment;
uint32 alignment;
uint64 start;
uint64 end;
};

View File

@ -9,18 +9,8 @@
#ifndef TOS_LOG_DEBUG_MEMORY_H
#define TOS_LOG_DEBUG_MEMORY_H
#include <string.h>
#include <malloc.h>
#include "../stdlib/Types.h"
// required for __rdtsc
#if _WIN32
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#define DEBUG_MEMORY_RANGE_MAX 500
#define DEBUG_MEMORY_RANGE_RES_MAX 100
@ -55,6 +45,7 @@ struct DebugMemoryContainer {
void debug_memory_init(uint64, uint64);
void debug_memory_log(uint64, uint64, int32, const char*);
void debug_memory_reserve(uint64, uint64, int32, const char*);
void debug_memory_free(uint64, uint64);
void debug_memory_reset();
#define DEBUG_MEMORY_INIT(start, size) debug_memory_init((start), (size))
@ -62,6 +53,7 @@ struct DebugMemoryContainer {
#define DEBUG_MEMORY_WRITE(start, size) debug_memory_log((start), (size), 1, __func__)
#define DEBUG_MEMORY_DELETE(start, size) debug_memory_log((start), (size), -1, __func__)
#define DEBUG_MEMORY_RESERVE(start, size, type) debug_memory_reserve((start), (size), (type), __func__)
#define DEBUG_MEMORY_FREE(start, size) debug_memory_free((start), (size))
#define DEBUG_MEMORY_RESET() debug_memory_reset()
#else
#define DEBUG_MEMORY_INIT(start, size) ((void) 0)
@ -69,6 +61,7 @@ struct DebugMemoryContainer {
#define DEBUG_MEMORY_WRITE(start, size) ((void) 0)
#define DEBUG_MEMORY_DELETE(start, size) ((void) 0)
#define DEBUG_MEMORY_RESERVE(start, size, type) ((void) 0)
#define DEBUG_MEMORY_FREE(start, size) ((void) 0)
#define DEBUG_MEMORY_RESET() ((void) 0)
#endif

View File

@ -45,7 +45,7 @@ void log_counter(int32, int64);
printf("%ld\n", __rdtsc() - (time_start)); \
})
#if (!DEBUG && !INTERNAL)
#if (!DEBUG && !INTERNAL) || RELEASE
// Don't perform any logging at log level 0
#define LOG(str, should_log, save) log((str), (should_log), (save), __FILE__, __func__, __LINE__)
#define LOG_FORMAT(format, data_type, data, should_log, save) log((format), (data_type), (data), (should_log), (save), __FILE__, __func__, __LINE__)

View File

@ -22,8 +22,8 @@
struct TimingStat {
const char* function;
uint64 old_tick_count;
uint64 delta_tick;
double delta_time;
f64 delta_time;
uint32 delta_tick;
};
// Sometimes we want to only do logging in debug mode.

View File

@ -10,12 +10,7 @@
#define TOS_MATH_MATRIX_VECTOR_FLOAT32_H
#include "../../utils/MathUtils.h"
#if __aarch64__
#include "../../../GameEngine/stdlib/sve/SVE_F32.h"
#else
#include "../../../GameEngine/stdlib/simd/SIMD_F32.h"
#endif
#include "../../stdlib/Simd.h"
struct v3_f32_4 {
union {

View File

@ -10,11 +10,6 @@
#define TOS_MATH_MATRIX_VECTOR_FLOAT64_H
#include "../../utils/MathUtils.h"
#if __aarch64__
#include "../../../GameEngine/stdlib/sve/SVE_F64.h"
#else
#include "../../../GameEngine/stdlib/simd/SIMD_F64.h"
#endif
#include "../../stdlib/Simd.h"
#endif

View File

@ -13,12 +13,7 @@
#include <xmmintrin.h>
#include "../../utils/MathUtils.h"
#if __aarch64__
#include "../../../GameEngine/stdlib/sve/SVE_I32.h"
#else
#include "../../../GameEngine/stdlib/simd/SIMD_I32.h"
#endif
#include "../../stdlib/Simd.h"
struct v3_int32_4 {
union {

View File

@ -13,12 +13,7 @@
#include <xmmintrin.h>
#include "../../utils/MathUtils.h"
#if __aarch64__
#include "../../../GameEngine/stdlib/sve/SVE_I64.h"
#else
#include "../../../GameEngine/stdlib/simd/SIMD_I64.h"
#endif
#include "../../stdlib/Simd.h"
struct v3_int64_2 {
union {

View File

@ -15,12 +15,7 @@
#include "../utils/EndianUtils.h"
#include "../utils/TestUtils.h"
#include "../log/DebugMemory.h"
#if _WIN32
#include "../platform/win32/Allocator.h"
#elif __linux__
#include "../platform/linux/Allocator.h"
#endif
#include "../system/Allocator.h"
// @question Consider to use element_alignment to automatically align/pad elements

View File

@ -14,27 +14,19 @@
#include "../utils/MathUtils.h"
#include "../utils/TestUtils.h"
#include "../utils/EndianUtils.h"
#include "../utils/BitUtils.h"
#include "../log/DebugMemory.h"
#include "BufferMemory.h"
#if _WIN32
#include "../platform/win32/Allocator.h"
#elif __linux__
#include "../platform/linux/Allocator.h"
#endif
#if _WIN32
#include "../platform/win32/threading/Thread.h"
#elif __linux__
#include "../platform/linux/threading/Thread.h"
#endif
#include "../system/Allocator.h"
#include "../thread/Thread.h"
struct ChunkMemory {
byte* memory;
uint64 count;
// @question Why are we making the count 64 bit? is this really realistically possible?
uint64 size;
uint64 last_pos;
int32 last_pos;
uint32 count;
uint32 chunk_size;
uint32 alignment;
@ -44,7 +36,7 @@ struct ChunkMemory {
};
inline
void chunk_alloc(ChunkMemory* buf, uint64 count, uint32 chunk_size, int32 alignment = 64)
void chunk_alloc(ChunkMemory* buf, uint32 count, uint32 chunk_size, int32 alignment = 64)
{
ASSERT_SIMPLE(chunk_size);
ASSERT_SIMPLE(count);
@ -58,7 +50,7 @@ void chunk_alloc(ChunkMemory* buf, uint64 count, uint32 chunk_size, int32 alignm
buf->count = count;
buf->size = count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64);
buf->chunk_size = chunk_size;
buf->last_pos = 0;
buf->last_pos = -1;
buf->alignment = alignment;
// @question Could it be beneficial to have this before the element data?
@ -70,7 +62,7 @@ void chunk_alloc(ChunkMemory* buf, uint64 count, uint32 chunk_size, int32 alignm
}
inline
void chunk_init(ChunkMemory* buf, BufferMemory* data, uint64 count, uint32 chunk_size, int32 alignment = 64)
void chunk_init(ChunkMemory* buf, BufferMemory* data, uint32 count, uint32 chunk_size, int32 alignment = 64)
{
ASSERT_SIMPLE(chunk_size);
ASSERT_SIMPLE(count);
@ -82,7 +74,7 @@ void chunk_init(ChunkMemory* buf, BufferMemory* data, uint64 count, uint32 chunk
buf->count = count;
buf->size = count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64);
buf->chunk_size = chunk_size;
buf->last_pos = 0;
buf->last_pos = -1;
buf->alignment = alignment;
// @question Could it be beneficial to have this before the element data?
@ -95,7 +87,7 @@ void chunk_init(ChunkMemory* buf, BufferMemory* data, uint64 count, uint32 chunk
}
inline
void chunk_init(ChunkMemory* buf, byte* data, uint64 count, uint32 chunk_size, int32 alignment = 64)
void chunk_init(ChunkMemory* buf, byte* data, uint32 count, uint32 chunk_size, int32 alignment = 64)
{
ASSERT_SIMPLE(chunk_size);
ASSERT_SIMPLE(count);
@ -108,7 +100,7 @@ void chunk_init(ChunkMemory* buf, byte* data, uint64 count, uint32 chunk_size, i
buf->count = count;
buf->size = count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64);
buf->chunk_size = chunk_size;
buf->last_pos = 0;
buf->last_pos = -1;
buf->alignment = alignment;
// @question Could it be beneficial to have this before the element data?
@ -131,6 +123,11 @@ void chunk_free(ChunkMemory* buf)
}
}
inline
uint32 chunk_id_from_memory(ChunkMemory* buf, byte* pos) {
    // Recover the element index from a raw element pointer by measuring its
    // byte offset from the start of the chunk memory.
    uintptr_t byte_offset = (uintptr_t) pos - (uintptr_t) buf->memory;

    return (uint32) byte_offset / buf->chunk_size;
}
inline
byte* chunk_get_element(ChunkMemory* buf, uint64 element, bool zeroed = false)
{
@ -146,93 +143,102 @@ byte* chunk_get_element(ChunkMemory* buf, uint64 element, bool zeroed = false)
return offset;
}
/**
* In some cases we know exactly which index is free
*/
void chunk_reserve_index(ChunkMemory* buf, int64 index, int64 elements = 1, bool zeroed = false)
// @performance This is a very important function, revisit in the future for optimization (e.g. ABM)
int32 chunk_reserve(ChunkMemory* buf, uint32 elements = 1)
{
int64 byte_index = index / 64;
int32 bit_index = index % 64;
int32 free_index = (buf->last_pos + 1) / 64;
int32 bit_index = (buf->last_pos + 1) & 63;
int32 free_element = -1;
// Mark the bits as reserved
for (int32 j = 0; j < elements; ++j) {
int64 current_byte_index = byte_index + (bit_index + j) / 64;
int32 current_bit_index = (bit_index + j) % 64;
buf->free[current_byte_index] |= (1LL << current_bit_index);
}
int32 i = -1;
int32 consecutive_free_bits = 0;
if (zeroed) {
memset(buf->memory + index * buf->chunk_size, 0, elements * buf->chunk_size);
}
DEBUG_MEMORY_WRITE((uint64) (buf->memory + index * buf->chunk_size), elements * buf->chunk_size);
buf->last_pos = index;
}
int64 chunk_reserve(ChunkMemory* buf, uint64 elements = 1, bool zeroed = false)
{
int64 free_index = (buf->last_pos + 1) / 64;
int32 bit_index = buf->last_pos - free_index * 64;
int64 free_element = -1;
int32 i = 0;
int64 max_bytes = (buf->count + 7) / 64;
while (free_element < 0 && i < buf->count) {
++i;
if (free_index >= max_bytes) {
while (free_element < 0 && ++i < buf->count) {
// Skip fully filled ranges
if (free_index * 64 + bit_index + elements - consecutive_free_bits >= buf->count) {
free_index = 0;
}
if (buf->free[free_index] == 0xFF) {
bit_index = 0;
i += buf->count - (free_index * 64 + bit_index);
consecutive_free_bits = 0;
} else if (buf->free[free_index] == 0xFFFFFFFFFFFFFFFF) {
++free_index;
bit_index = 0;
i += 63;
consecutive_free_bits = 0;
continue;
}
// @performance There is some redundancy happening down below, we should ++free_index in certain conditions?
for (; bit_index < 64; ++bit_index) {
int32 consecutive_free_bits = 0;
// Find first free element
while (IS_BIT_SET_64_R2L(buf->free[free_index], bit_index)) {
consecutive_free_bits = 0;
++bit_index;
++i;
// Check if there are 'elements' consecutive free bits
for (int32 j = 0; j < elements; ++j) {
// Check if there is enough space until the end of the buffer.
// Remember, the last free index may only allow only 1 bit if the size is 65
if (free_index * 64 + (bit_index + j) >= buf->count) {
break;
}
uint64 current_free_index = free_index + (bit_index + j) / 64;
int32 current_bit_index = (bit_index + j) % 64;
int64 mask = 1LL << current_bit_index;
if ((buf->free[current_free_index] & mask) == 0) {
++consecutive_free_bits;
} else {
break;
}
}
if (consecutive_free_bits == elements) {
free_element = free_index * 64 + bit_index;
// Mark the bits as reserved
for (int32 j = 0; j < elements; ++j) {
int64 current_free_index = free_index + (bit_index + j) / 64;
int32 current_bit_index = (bit_index + j) % 64;
buf->free[current_free_index] |= (1LL << current_bit_index);
}
// We still need to check for overflow since our initial bit_index is based on buf->last_pos
if (bit_index > 63) {
bit_index = 0;
++free_index;
break;
}
}
bit_index = 0;
// The previous while may exit with an "overflow", that's why this check is required
if (IS_BIT_SET_64_R2L(buf->free[free_index], bit_index)) {
consecutive_free_bits = 0;
++i;
++free_index;
continue;
}
// We found our first free element, let's check if we have enough free space
while (!IS_BIT_SET_64_R2L(buf->free[free_index], bit_index)
&& consecutive_free_bits != elements
&& free_index * 64 + bit_index + elements - consecutive_free_bits < buf->count
) {
++i;
++consecutive_free_bits;
++bit_index;
if (bit_index > 63) {
bit_index = 0;
++free_index;
break;
}
}
// Do we have enough free bits?
if (consecutive_free_bits == elements) {
free_element = free_index * 64 + bit_index - elements;
int32 possible_free_index = free_element / 64;
int32 possible_bit_index = free_element & 63;
// Mark as used
if (elements == 1) {
buf->free[possible_free_index] |= (1LL << possible_bit_index);
} else {
uint32 elements_temp = elements;
int64 current_free_index = possible_free_index;
int32 current_bit_index = possible_bit_index;
while (elements > 0) {
// Calculate the number of bits we can set in the current 64-bit block
int32 bits_in_current_block = OMS_MIN(64 - current_bit_index, elements);
// Create a mask to set the bits
uint64 mask = ((1ULL << bits_in_current_block) - 1) << current_bit_index;
buf->free[current_free_index] |= mask;
// Update the counters and indices
elements -= bits_in_current_block;
++current_free_index;
current_bit_index = 0;
}
}
break;
}
}
if (free_element < 0) {
@ -240,70 +246,46 @@ int64 chunk_reserve(ChunkMemory* buf, uint64 elements = 1, bool zeroed = false)
return -1;
}
if (zeroed) {
memset(buf->memory + free_element * buf->chunk_size, 0, elements * buf->chunk_size);
}
DEBUG_MEMORY_WRITE((uint64) (buf->memory + free_element * buf->chunk_size), elements * buf->chunk_size);
buf->last_pos = free_element;
return free_element;
}
byte* chunk_find_free(ChunkMemory* buf)
{
int64 free_index = (buf->last_pos + 1) / 64;
int32 bit_index;
int64 free_element = -1;
int64 mask;
int32 i = 0;
int64 max_bytes = (buf->count + 7) / 64;
while (free_element < 0 && i < buf->count) {
if (free_index >= max_bytes) {
free_index = 0;
}
if (buf->free[free_index] == 0xFF) {
++i;
++free_index;
continue;
}
// This always breaks!
// @performance on the first iteration through the buffer we could optimize this by starting at a different bit_index
// because we know that the bit_index is based on last_pos
for (bit_index = 0; bit_index < 64; ++bit_index) {
mask = 1LL << bit_index;
if ((buf->free[free_index] & mask) == 0) {
free_element = free_index * 64 + bit_index;
buf->free[free_index] |= (1LL << bit_index);
break;
}
}
}
if (free_element < 0) {
return NULL;
}
return buf->memory + free_element * buf->chunk_size;
return (int32) free_element;
}
inline
void chunk_free_element(ChunkMemory* buf, uint64 element)
void chunk_free_element(ChunkMemory* buf, uint64 free_index, int32 bit_index)
{
DEBUG_MEMORY_DELETE((uint64) (buf->memory + (free_index * 64 + bit_index) * buf->chunk_size), buf->chunk_size);
buf->free[free_index] &= ~(1LL << bit_index);
}
inline
void chunk_free_elements(ChunkMemory* buf, uint64 element, uint32 element_count = 1)
{
DEBUG_MEMORY_DELETE((uint64) (buf->memory + element * buf->chunk_size), buf->chunk_size);
int64 free_index = element / 64;
int32 bit_index = element % 64;
int32 bit_index = element & 63;
buf->free[free_index] &= ~(1LL << bit_index);
if (element == 1) {
chunk_free_element(buf, free_index, bit_index);
return;
}
while (element_count > 0) {
// Calculate the number of bits we can clear in the current 64-bit block
uint32 bits_in_current_block = OMS_MIN(64 - bit_index, element_count);
// Create a mask to clear the bits
uint64 mask = ((1ULL << bits_in_current_block) - 1) << bit_index;
buf->free[free_index] &= ~mask;
// Update the counters and indices
element_count -= bits_in_current_block;
++free_index;
bit_index = 0;
}
}
inline
@ -312,7 +294,7 @@ int64 chunk_dump(const ChunkMemory* buf, byte* data)
byte* start = data;
// Count
*((uint64 *) data) = SWAP_ENDIAN_LITTLE(buf->count);
*((uint32 *) data) = SWAP_ENDIAN_LITTLE(buf->count);
data += sizeof(buf->count);
// Size
@ -324,7 +306,7 @@ int64 chunk_dump(const ChunkMemory* buf, byte* data)
data += sizeof(buf->chunk_size);
// Last pos
*((uint64 *) data) = SWAP_ENDIAN_LITTLE(buf->last_pos);
*((int32 *) data) = SWAP_ENDIAN_LITTLE(buf->last_pos);
data += sizeof(buf->last_pos);
// Alignment
@ -343,7 +325,7 @@ inline
int64 chunk_load(ChunkMemory* buf, const byte* data)
{
// Count
buf->count = SWAP_ENDIAN_LITTLE(*((uint64 *) data));
buf->count = SWAP_ENDIAN_LITTLE(*((uint32 *) data));
data += sizeof(buf->count);
// Size
@ -355,7 +337,7 @@ int64 chunk_load(ChunkMemory* buf, const byte* data)
data += sizeof(buf->chunk_size);
// Last pos
buf->last_pos = SWAP_ENDIAN_LITTLE(*((uint64 *) data));
buf->last_pos = SWAP_ENDIAN_LITTLE(*((int32 *) data));
data += sizeof(buf->last_pos);
// Alignment
@ -370,4 +352,28 @@ int64 chunk_load(ChunkMemory* buf, const byte* data)
return buf->size;
}
// Iteration helpers over a ChunkMemory occupancy bitmap.
// chunk_iterate_start opens a loop that visits every *used* chunk id in `buf`
// (bit set in buf->free = chunk in use); the caller's code runs between
// chunk_iterate_start and chunk_iterate_end. `chunk_id` must be a counter
// declared by the caller, starting at the first id to visit.
// NOTE(review): when a whole 64-bit bitmap word is 0 the skip advances
// bit_index by a full word while chunk_id only advances by 1, so chunk_id and
// (free_index, bit_index) appear to go out of sync - confirm intended semantics.
#define chunk_iterate_start(buf, chunk_id) \
    int32 free_index = 0; \
    int32 bit_index = 0; \
    \
    /* Iterate the chunk memory */ \
    for (; chunk_id < (buf)->count; ++chunk_id) { \
        /* Check if asset is defined */ \
        if (!(buf)->free[free_index]) { \
            /* Skip various elements */ \
            /* @performance Consider to only check 1 byte instead of 8 */ \
            /* There are probably even better ways by using compiler intrinsics if available */ \
            bit_index += 63; /* +64 - 1 since the loop also increases by 1 */ \
        } else if ((buf)->free[free_index] & (1ULL << bit_index)) {

// Closes the loop opened by chunk_iterate_start and advances the bitmap cursor
// to the next bit (wrapping into the next 64-bit word when needed).
#define chunk_iterate_end \
    } \
    \
    ++bit_index; \
    if (bit_index > 63) { \
        bit_index = 0; \
        ++free_index; \
    } \
}
#endif

View File

@ -14,12 +14,7 @@
#include "../stdlib/Types.h"
#include "../log/DebugMemory.h"
#include "BufferMemory.h"
#if _WIN32
#include "../platform/win32/Allocator.h"
#elif __linux__
#include "../platform/linux/Allocator.h"
#endif
#include "../system/Allocator.h"
struct Heap {
byte* elements;

View File

@ -19,18 +19,10 @@
#include "BufferMemory.h"
#include "../log/DebugMemory.h"
#if _WIN32
#include "../platform/win32/Allocator.h"
#include "../platform/win32/threading/ThreadDefines.h"
#include "../platform/win32/threading/Semaphore.h"
#include "../platform/win32/threading/Atomic.h"
#elif __linux__
#include "../platform/linux/Allocator.h"
#include "../platform/linux/threading/ThreadDefines.h"
#include "../platform/linux/threading/Semaphore.h"
#include "../platform/linux/threading/Atomic.h"
#endif
#include "../thread/Atomic.h"
#include "../thread/Semaphore.h"
#include "../thread/ThreadDefines.h"
#include "../system/Allocator.h"
// WARNING: Changing this structure has effects on other data structures (e.g. Queue)
// When changing it, make sure you understand what you are doing

View File

@ -11,19 +11,14 @@
#include <string.h>
#include "../stdlib/Types.h"
#if _WIN32
#include "../platform/win32/threading/Thread.h"
#elif __linux__
#include "../platform/linux/threading/Thread.h"
#endif
#include "../thread/Thread.h"
struct ThreadedChunkMemory {
byte* memory;
uint64 count;
uint64 size;
int64 last_pos;
uint32 last_pos;
uint32 count;
uint32 chunk_size;
int32 alignment;

View File

@ -14,14 +14,8 @@
#include "../stdlib/Types.h"
#include "../utils/Utils.h"
#include "RingMemory.h"
#if _WIN32
#include "../platform/win32/threading/Thread.h"
#include "../platform/win32/threading/Semaphore.h"
#elif __linux__
#include "../platform/linux/threading/Thread.h"
#include "../platform/linux/threading/Semaphore.h"
#endif
#include "../thread/Thread.h"
#include "../thread/Semaphore.h"
struct ThreadedQueue {
byte* memory;

View File

@ -10,12 +10,7 @@
#define TOS_MEMORY_THREADED_RING_MEMORY_H
#include "RingMemory.h"
#if _WIN32
#include "../platform/win32/threading/Thread.h"
#elif __linux__
#include "../platform/linux/threading/Thread.h"
#endif
#include "../thread/Thread.h"
// @todo This is a horrible implementation. Please implement a lock free solution

View File

@ -10,7 +10,7 @@
#define TOS_MODELS_MOB_STATS_C
#include "MobStats.h"
#include "../../stdlib/simd/SIMD_I32.h"
#include "../../stdlib/Simd.h"
// Calculate whenever character points or items change
// 1. combine primary Item points with character points

View File

@ -9,7 +9,7 @@
#ifndef TOS_MODELS_MOB_PRIMARY_STATS_POINTS_C
#define TOS_MODELS_MOB_PRIMARY_STATS_POINTS_C
#include "../../stdlib/simd/SIMD_I8.h"
#include "../../stdlib/Simd.h"
#include "PrimaryStatsPoints.h"
void calculate_primary_values(const PrimaryStatsPoints* points, PrimaryStatsValues* values, int step = 8)

View File

@ -9,7 +9,7 @@
#ifndef TOS_MODELS_MOB_SECONDARY_STATS_POINTS_C
#define TOS_MODELS_MOB_SECONDARY_STATS_POINTS_C
#include "../../stdlib/simd/SIMD_I8.h"
#include "../../stdlib/Simd.h"
#include "SecondaryStatsPoints.h"
void calculate_primary_values(const SecondaryStatsPoints* points, SecondaryStatsValues* values, int step = 8)

View File

@ -2,10 +2,7 @@
#define TOS_MODULE_H
#include "../stdlib/Types.h"
#ifdef _WIN32
#include "../../GameEngine/platform/win32/Library.h"
#endif
#include "../../GameEngine/system/Library.h"
enum ModuleType {
MODULE_TYPE_HUD,

View File

@ -3,12 +3,11 @@
#include "Module.h"
#include "../memory/RingMemory.h"
#include "../system/FileUtils.cpp"
#if _WIN32
#include "../platform/win32/FileUtils.cpp"
#include "../platform/win32/UtilsWin32.h"
#elif __linux__
#include "../platform/linux/FileUtils.cpp"
#endif
struct ModuleManager {

18
network/Socket.h Normal file
View File

@ -0,0 +1,18 @@
/**
 * Jingga
 *
 * @copyright Jingga
 * @license OMS License 2.0
 * @version 1.0.0
 * @link https://jingga.app
 */
#ifndef TOS_NETWORK_SOCKET_H
#define TOS_NETWORK_SOCKET_H

// Platform dispatch header: includes the socket implementation matching the
// current target (win32 or linux) so callers only include this one file.
#if _WIN32
    #include "../platform/win32/network/Socket.h"
#elif __linux__
    #include "../platform/linux/network/Socket.h"
#endif

#endif

View File

@ -11,12 +11,7 @@
#include "../stdlib/Types.h"
#include "../memory/RingMemory.h"
#if _WIN32
#include "../platform/win32/FileUtils.cpp"
#else
#include "../platform/linux/FileUtils.cpp"
#endif
#include "../system/FileUtils.cpp"
struct Skeleton {

View File

@ -11,12 +11,7 @@
#include "../stdlib/Types.h"
#include "../memory/RingMemory.h"
#if _WIN32
#include "../platform/win32/FileUtils.cpp"
#else
#include "../platform/linux/FileUtils.cpp"
#endif
#include "../system/FileUtils.cpp"
struct Hitbox {

View File

@ -11,12 +11,7 @@
#include "../stdlib/Types.h"
#include "../memory/RingMemory.h"
#if _WIN32
#include "../platform/win32/FileUtils.cpp"
#else
#include "../platform/linux/FileUtils.cpp"
#endif
#include "../system/FileUtils.cpp"
struct Material {

View File

@ -11,22 +11,11 @@
#include "Vertex.h"
#include "../stdlib/Types.h"
#if _WIN32
#include "../platform/win32/FileUtils.cpp"
#else
#include "../platform/linux/FileUtils.cpp"
#endif
#include "../system/FileUtils.cpp"
#include "../memory/RingMemory.h"
#include "../utils/EndianUtils.h"
#include "../utils/StringUtils.h"
#if __aarch64__
#include "../stdlib/sve/SVE_I32.h"
#else
#include "../stdlib/simd/SIMD_I32.h"
#endif
#include "../stdlib/Simd.h"
#define MESH_VERSION 1
@ -36,8 +25,6 @@
struct Mesh {
byte* data; // memory owner that subdivides into the pointers below
// @todo Implement the version into the file, currently not implemented
int32 version;
uint32 object;
uint32 group_count;
@ -90,7 +77,8 @@ void mesh_from_file_txt(
// move past the version string
pos += 8;
mesh->version = strtol(pos, &pos, 10); ++pos;
// @todo us version for different handling
int32 version = strtol(pos, &pos, 10); ++pos;
int32 object_index = 0;
int32 group_index = 0;
@ -480,9 +468,9 @@ int32 mesh_from_data(
{
const byte* pos = data;
// Read version
mesh->version = *((int32 *) pos);
pos += sizeof(mesh->version);
// Read version, use to handle different versions differently
int32 version = *((int32 *) pos);
pos += sizeof(version);
// Read base data
mesh->vertex_type = *((int32 *) pos);
@ -549,7 +537,7 @@ int32 mesh_from_data(
// We would have to check the vertex format to calculate the actual size
int32 mesh_data_size(const Mesh* mesh)
{
return sizeof(mesh->version)
return sizeof(int32)
+ sizeof(mesh->vertex_type)
+ sizeof(mesh->vertex_count)
+ 12 * sizeof(f32) * mesh->vertex_count; // 12 is the maximum value
@ -565,8 +553,8 @@ int32 mesh_to_data(
byte* pos = data;
// version
memcpy(pos, &mesh->version, sizeof(mesh->version));
pos += sizeof(mesh->version);
*((int32 *) pos) = MESH_VERSION;
pos += sizeof(int32);
// vertices
if (vertex_save_format == VERTEX_TYPE_ALL) {

View File

@ -17,7 +17,6 @@
#include "../../utils/TestUtils.h"
// @todo Currently alignment only effects the starting position, but it should also effect the ending/size
// @todo Consider to rename file to Allocator.h
// @question Since we store at least the size of the memory in the beginning,
// does this have a negative impact on caching?

View File

@ -102,7 +102,7 @@ void relative_to_absolute(const char* rel, char* path)
++self_path_length;
memcpy(path, self_path, self_path_length);
strcpy(path + self_path_length, temp);
str_copy_short(path + self_path_length, temp);
}
// @todo implement relative path support, similar to UtilsWin32

View File

@ -6,8 +6,8 @@
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_PLATFORM_LINUX_LIBRARY_H
#define TOS_PLATFORM_LINUX_LIBRARY_H
#ifndef TOS_PLATFORM_LINUX_LIBRARY_C
#define TOS_PLATFORM_LINUX_LIBRARY_C
#include <stdio.h>
#include <dlfcn.h>
@ -18,30 +18,22 @@
#include "../../stdlib/Types.h"
#include "../../utils/StringUtils.h"
#include "UtilsLinux.h"
#include "../Library.h"
#include "../../system/Library.h"
// @todo Rename file to Library.cpp
inline
bool library_load(Library* lib)
{
size_t path_length = strlen(lib->dir);
char dst[PATH_MAX];
str_concat(
lib->dir, path_length,
lib->dst, strlen(lib->dst),
dst
);
str_concat_new(dst, lib->dir, lib->dst);
#if DEBUG
char src[PATH_MAX];
size_t dst_len = strlen(dst);
memcpy(src, dst, dst_len + 1);
memcpy(dst + dst_len - (sizeof(".so") - 1), "_temp", sizeof("_temp") - 1);
memcpy(dst + dst_len - (sizeof(".so") - 1) + (sizeof("_temp") - 1), ".so", sizeof(".so"));
str_insert(dst, dst_len - (sizeof(".so") - 1), "_temp");
lib->last_load = file_last_modified(src);
file_copy(src, dst);

View File

@ -18,7 +18,7 @@
#include <cpuid.h>
#if __aarch64__
#include "../../stdlib/simd/SIMD_Helper.h"
#include "../../stdlib/SIMD_Helper.h"
#else
#include "../../stdlib/sve/SVE_Helper.h"
#endif
@ -358,7 +358,7 @@ uint32 display_info_get(DisplayInfo* info) {
mode.dmSize = sizeof(mode);
if (EnumDisplaySettingsA(device.DeviceName, ENUM_CURRENT_SETTINGS, &mode)) {
strcpy(info[i].name, device.DeviceName);
str_copy_short(info[i].name, device.DeviceName);
info[i].width = mode.dmPelsWidth;
info[i].height = mode.dmPelsHeight;
info[i].hz = mode.dmDisplayFrequency;

View File

@ -15,7 +15,6 @@
#include "../../utils/TestUtils.h"
// @todo Currently alignment only effects the starting position, but it should also effect the ending/size
// @todo Consider to rename file to Allocator.h
inline
void* platform_alloc(size_t size)
@ -23,6 +22,10 @@ void* platform_alloc(size_t size)
return VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
}
// @question Since we store at least the size of the memory in the beginning,
// does this have a negative impact on caching?
// Our Memory doesn't start at the cache line beginning but at least offset by sizeof(size_t)
inline
void* platform_alloc_aligned(size_t size, int32 alignment)
{

View File

@ -82,7 +82,7 @@ void relative_to_absolute(const char* rel, char* path)
++self_path_length;
memcpy(path, self_path, self_path_length);
strcpy(path + self_path_length, temp);
str_copy_short(path + self_path_length, temp);
}
inline uint64

View File

@ -6,8 +6,8 @@
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_PLATFORM_WIN32_LIBRARY_H
#define TOS_PLATFORM_WIN32_LIBRARY_H
#ifndef TOS_PLATFORM_WIN32_LIBRARY_C
#define TOS_PLATFORM_WIN32_LIBRARY_C
#include <stdio.h>
#include <windows.h>
@ -23,23 +23,15 @@
inline
bool library_load(Library* lib)
{
size_t path_length = strlen(lib->dir);
char dst[MAX_PATH];
str_concat(
lib->dir, path_length,
lib->dst, strlen(lib->dst),
dst
);
str_concat_new(dst, lib->dir, lib->dst);
#if DEBUG
char src[MAX_PATH];
size_t dst_len = strlen(dst);
memcpy(src, dst, dst_len + 1);
memcpy(dst + dst_len - (sizeof(".dll") - 1), "_temp", sizeof(".temp") - 1);
memcpy(dst + dst_len - (sizeof(".dll") - 1) + (sizeof(".temp") - 1), ".dll", sizeof(".dll"));
str_insert(dst, dst_len - (sizeof(".dll") - 1), "_temp");
lib->last_load = file_last_modified(src);
file_copy(src, dst);

View File

@ -30,7 +30,7 @@
#if __aarch64__
#include "../../stdlib/sve/SVE_Helper.h"
#else
#include "../../stdlib/simd/SIMD_Helper.h"
#include "../../stdlib/SIMD_Helper.h"
#endif
// @performance Do we really need all these libs, can't we simplify that?!
@ -451,7 +451,7 @@ void display_info_get_primary(DisplayInfo* info) {
mode.dmSize = sizeof(mode);
if (EnumDisplaySettingsA(device.DeviceName, ENUM_CURRENT_SETTINGS, &mode)) {
strcpy(info->name, device.DeviceName);
str_copy_short(info->name, device.DeviceName);
info->width = mode.dmPelsWidth;
info->height = mode.dmPelsHeight;
info->hz = mode.dmDisplayFrequency;
@ -473,7 +473,7 @@ uint32 display_info_get(DisplayInfo* info) {
mode.dmSize = sizeof(mode);
if (EnumDisplaySettingsA(device.DeviceName, ENUM_CURRENT_SETTINGS, &mode)) {
strcpy(info[i].name, device.DeviceName);
str_copy_short(info[i].name, device.DeviceName);
info[i].width = mode.dmPelsWidth;
info[i].height = mode.dmPelsHeight;
info[i].hz = mode.dmDisplayFrequency;

View File

@ -10,6 +10,7 @@
#define TOS_PLATFORM_WIN32_THREADING_SPINLOCK_C
#include <windows.h>
#include "../../../stdlib/Types.h"
#include "../TimeUtils.h"
#include "Spinlock.h"

View File

@ -16,57 +16,113 @@
#include "../memory/ChunkMemory.h"
#include "../utils/StringUtils.h"
#define HASH_MAP_MAX_KEY_LENGTH 32
// WARNING Length of 28 used to ensure perfect padding with element_id and key
#define HASH_MAP_MAX_KEY_LENGTH 28
/////////////////////////////
// string key
/////////////////////////////
struct HashEntryInt32 {
int64 element_id;
uint32 element_id;
char key[HASH_MAP_MAX_KEY_LENGTH];
HashEntryInt32* next;
int32 value;
};
struct HashEntryInt64 {
int64 element_id;
uint32 element_id;
char key[HASH_MAP_MAX_KEY_LENGTH];
HashEntryInt64* next;
int64 value;
};
struct HashEntryUIntPtr {
int64 element_id;
uint32 element_id;
char key[HASH_MAP_MAX_KEY_LENGTH];
HashEntryUIntPtr* next;
uintptr_t value;
};
struct HashEntryVoidP {
int64 element_id;
uint32 element_id;
char key[HASH_MAP_MAX_KEY_LENGTH];
HashEntryVoidP* next;
void* value;
};
struct HashEntryFloat {
int64 element_id;
uint32 element_id;
char key[HASH_MAP_MAX_KEY_LENGTH];
HashEntryFloat* next;
f32 value;
};
struct HashEntryStr {
int64 element_id;
uint32 element_id;
char key[HASH_MAP_MAX_KEY_LENGTH];
HashEntryStr* next;
char value[HASH_MAP_MAX_KEY_LENGTH];
};
struct HashEntry {
int64 element_id;
uint32 element_id;
char key[HASH_MAP_MAX_KEY_LENGTH];
HashEntry* next;
byte* value;
};
/////////////////////////////
// int key
/////////////////////////////
// Hash map entry variants keyed by int32 (one per stored value type).
// int32 key -> int32 value
struct HashEntryInt32KeyInt32 {
    uint32 element_id; // chunk id from chunk_reserve; used to free this entry
    int32 key;
    HashEntryInt32KeyInt32* next; // collision chain (NULL = end of chain)
    int32 value;
};

// int32 key -> int64 value
struct HashEntryInt64KeyInt32 {
    uint32 element_id; // chunk id from chunk_reserve; used to free this entry
    int32 key;
    HashEntryInt64KeyInt32* next; // collision chain (NULL = end of chain)
    int64 value;
};

// int32 key -> uintptr_t value
struct HashEntryUIntPtrKeyInt32 {
    uint32 element_id; // chunk id from chunk_reserve; used to free this entry
    int32 key;
    HashEntryUIntPtrKeyInt32* next; // collision chain (NULL = end of chain)
    uintptr_t value;
};

// int32 key -> void* value
struct HashEntryVoidPKeyInt32 {
    uint32 element_id; // chunk id from chunk_reserve; used to free this entry
    int32 key;
    HashEntryVoidPKeyInt32* next; // collision chain (NULL = end of chain)
    void* value;
};

// int32 key -> f32 value
struct HashEntryFloatKeyInt32 {
    uint32 element_id; // chunk id from chunk_reserve; used to free this entry
    int32 key;
    HashEntryFloatKeyInt32* next; // collision chain (NULL = end of chain)
    f32 value;
};

// int32 key -> inline string value (capped at HASH_MAP_MAX_KEY_LENGTH incl. '\0')
struct HashEntryStrKeyInt32 {
    uint32 element_id; // chunk id from chunk_reserve; used to free this entry
    int32 key;
    HashEntryStrKeyInt32* next; // collision chain (NULL = end of chain)
    char value[HASH_MAP_MAX_KEY_LENGTH];
};

// int32 key -> opaque byte payload; value points just past this header into
// the same chunk (see hashmap_insert(HashMap*, int32, byte*))
struct HashEntryKeyInt32 {
    uint32 element_id; // chunk id from chunk_reserve; used to free this entry
    int32 key;
    HashEntryKeyInt32* next; // collision chain (NULL = end of chain)
    byte* value;
};
struct HashMap {
void** table;
ChunkMemory buf;
@ -83,7 +139,7 @@ void hashmap_create(HashMap* hm, int32 count, int32 element_size, RingMemory* ri
);
hm->table = (void **) data;
chunk_init(&hm->buf, data + sizeof(void *) * count, count, element_size, 1);
chunk_init(&hm->buf, data + sizeof(void *) * count, count, element_size, 8);
}
// WARNING: element_size = element size + remaining HashEntry data size
@ -96,14 +152,14 @@ void hashmap_create(HashMap* hm, int32 count, int32 element_size, BufferMemory*
);
hm->table = (void **) data;
chunk_init(&hm->buf, data + sizeof(void *) * count, count, element_size, 1);
chunk_init(&hm->buf, data + sizeof(void *) * count, count, element_size, 8);
}
// WARNING: element_size = element size + remaining HashEntry data size
void hashmap_create(HashMap* hm, int32 count, int32 element_size, byte* buf)
{
hm->table = (void **) buf;
chunk_init(&hm->buf, buf + sizeof(void *) * count, count, element_size, 1);
chunk_init(&hm->buf, buf + sizeof(void *) * count, count, element_size, 8);
}
// Calculates how large a hashmap will be
@ -121,10 +177,13 @@ int64 hashmap_size(const HashMap* hm)
return hm->buf.count * sizeof(hm->table) + hm->buf.size;
}
/////////////////////////////
// string key
/////////////////////////////
void hashmap_insert(HashMap* hm, const char* key, int32 value) {
uint64 index = hash_djb2(key) % hm->buf.count;
int64 element = chunk_reserve(&hm->buf, 1);
int32 element = chunk_reserve(&hm->buf, 1);
HashEntryInt32* entry = (HashEntryInt32 *) chunk_get_element(&hm->buf, element, true);
entry->element_id = element;
@ -150,7 +209,7 @@ void hashmap_insert(HashMap* hm, const char* key, int32 value) {
void hashmap_insert(HashMap* hm, const char* key, int64 value) {
uint64 index = hash_djb2(key) % hm->buf.count;
int64 element = chunk_reserve(&hm->buf, 1);
int32 element = chunk_reserve(&hm->buf, 1);
HashEntryInt64* entry = (HashEntryInt64 *) chunk_get_element(&hm->buf, element, true);
entry->element_id = element;
@ -175,7 +234,7 @@ void hashmap_insert(HashMap* hm, const char* key, int64 value) {
void hashmap_insert(HashMap* hm, const char* key, uintptr_t value) {
uint64 index = hash_djb2(key) % hm->buf.count;
int64 element = chunk_reserve(&hm->buf, 1);
int32 element = chunk_reserve(&hm->buf, 1);
HashEntryUIntPtr* entry = (HashEntryUIntPtr *) chunk_get_element(&hm->buf, element, true);
entry->element_id = element;
@ -200,7 +259,7 @@ void hashmap_insert(HashMap* hm, const char* key, uintptr_t value) {
void hashmap_insert(HashMap* hm, const char* key, void* value) {
uint64 index = hash_djb2(key) % hm->buf.count;
int64 element = chunk_reserve(&hm->buf, 1);
int32 element = chunk_reserve(&hm->buf, 1);
HashEntryVoidP* entry = (HashEntryVoidP *) chunk_get_element(&hm->buf, element, true);
entry->element_id = element;
@ -225,7 +284,7 @@ void hashmap_insert(HashMap* hm, const char* key, void* value) {
void hashmap_insert(HashMap* hm, const char* key, f32 value) {
uint64 index = hash_djb2(key) % hm->buf.count;
int64 element = chunk_reserve(&hm->buf, 1);
int32 element = chunk_reserve(&hm->buf, 1);
HashEntryFloat* entry = (HashEntryFloat *) chunk_get_element(&hm->buf, element, true);
entry->element_id = element;
@ -250,7 +309,7 @@ void hashmap_insert(HashMap* hm, const char* key, f32 value) {
void hashmap_insert(HashMap* hm, const char* key, const char* value) {
uint64 index = hash_djb2(key) % hm->buf.count;
int64 element = chunk_reserve(&hm->buf, 1);
int32 element = chunk_reserve(&hm->buf, 1);
HashEntryStr* entry = (HashEntryStr *) chunk_get_element(&hm->buf, element, true);
entry->element_id = element;
@ -274,10 +333,10 @@ void hashmap_insert(HashMap* hm, const char* key, const char* value) {
}
}
void hashmap_insert(HashMap* hm, const char* key, byte* value) {
HashEntry* hashmap_insert(HashMap* hm, const char* key, byte* value) {
uint64 index = hash_djb2(key) % hm->buf.count;
int64 element = chunk_reserve(&hm->buf, 1);
int32 element = chunk_reserve(&hm->buf, 1);
HashEntry* entry = (HashEntry *) chunk_get_element(&hm->buf, element, true);
entry->element_id = element;
@ -300,6 +359,73 @@ void hashmap_insert(HashMap* hm, const char* key, byte* value) {
} else {
hm->table[index] = entry;
}
return entry;
}
// Reserves an entry for `key` without copying a value; the caller fills
// entry->value, which points just past the entry header inside the chunk.
// Returns NULL when the underlying chunk memory is exhausted.
HashEntry* hashmap_reserve(HashMap* hm, const char* key) {
    uint64 index = hash_djb2(key) % hm->buf.count;

    int32 element = chunk_reserve(&hm->buf, 1);
    if (element < 0) {
        // chunk_reserve signals "no free chunk" with -1; don't index with it
        return NULL;
    }

    HashEntry* entry = (HashEntry *) chunk_get_element(&hm->buf, element, true);
    entry->element_id = element;
    entry->value = (byte *) entry + sizeof(HashEntry);

    strncpy(entry->key, key, HASH_MAP_MAX_KEY_LENGTH);
    entry->key[HASH_MAP_MAX_KEY_LENGTH - 1] = '\0';
    entry->next = NULL;

    // Append to the end of the bucket's collision chain (or start a new one)
    if (hm->table[index]) {
        HashEntry* tmp = (HashEntry *) hm->table[index];
        while (tmp->next) {
            tmp = tmp->next;
        }

        tmp->next = entry;
    } else {
        hm->table[index] = entry;
    }

    return entry;
}
// Returns the existing entry for `key`, or reserves and returns a fresh entry
// to be filled by the caller. Returns NULL only when the key is absent AND the
// chunk memory is exhausted.
HashEntry* hashmap_get_reserve(HashMap* hm, const char* key)
{
    uint64 index = hash_djb2(key) % hm->buf.count;
    HashEntry* entry = (HashEntry *) hm->table[index];

    // Walk the collision chain; `entry` ends at the chain tail (or NULL)
    while (entry != NULL) {
        if (str_compare(entry->key, key, HASH_MAP_MAX_KEY_LENGTH) == 0) {
            DEBUG_MEMORY_READ((uint64) entry, sizeof(HashEntry));
            return entry;
        }

        if (entry->next == NULL) {
            break;
        }

        entry = entry->next;
    }

    int32 element = chunk_reserve(&hm->buf, 1);
    if (element < 0) {
        // chunk_reserve signals "no free chunk" with -1; don't index with it
        return NULL;
    }

    HashEntry* entry_new = (HashEntry *) chunk_get_element(&hm->buf, element, true);
    entry_new->element_id = element;
    entry_new->value = (byte *) entry_new + sizeof(HashEntry);

    strncpy(entry_new->key, key, HASH_MAP_MAX_KEY_LENGTH);
    entry_new->key[HASH_MAP_MAX_KEY_LENGTH - 1] = '\0';

    // chunk_get_element(..., true) zeroes the chunk, but be explicit
    entry_new->next = NULL;

    if (entry) {
        entry->next = entry_new;
    } else {
        hm->table[index] = entry_new;
    }

    return entry_new;
}
HashEntry* hashmap_get_entry(const HashMap* hm, const char* key) {
@ -307,7 +433,8 @@ HashEntry* hashmap_get_entry(const HashMap* hm, const char* key) {
HashEntry* entry = (HashEntry *) hm->table[index];
while (entry != NULL) {
if (strncmp(entry->key, key, HASH_MAP_MAX_KEY_LENGTH) == 0) {
if (str_compare(entry->key, key, HASH_MAP_MAX_KEY_LENGTH) == 0) {
DEBUG_MEMORY_READ((uint64) entry, sizeof(HashEntry));
return entry;
}
@ -324,7 +451,8 @@ HashEntry* hashmap_get_entry(const HashMap* hm, const char* key, uint64 hash) {
HashEntry* entry = (HashEntry *) hm->table[hash];
while (entry != NULL) {
if (strncmp(entry->key, key, HASH_MAP_MAX_KEY_LENGTH) == 0) {
if (str_compare(entry->key, key, HASH_MAP_MAX_KEY_LENGTH) == 0) {
DEBUG_MEMORY_READ((uint64) entry, sizeof(HashEntry));
return entry;
}
@ -334,20 +462,253 @@ HashEntry* hashmap_get_entry(const HashMap* hm, const char* key, uint64 hash) {
return NULL;
}
void hashmap_delete_entry(HashMap* hm, const char* key) {
// @performance If we had a doubly linked list we could delete keys much easier
// However that would make insertion slower
// Maybe we create another hashmap that is doubly linked
void hashmap_remove(HashMap* hm, const char* key) {
uint64 index = hash_djb2(key) % hm->buf.count;
HashEntry* entry = (HashEntry *) hm->table[index];
HashEntry* prev = NULL;
while (entry != NULL) {
if (strncmp(entry->key, key, HASH_MAP_MAX_KEY_LENGTH) == 0) {
if (str_compare(entry->key, key, HASH_MAP_MAX_KEY_LENGTH) == 0) {
if (prev == NULL) {
hm->table[index] = entry->next;
} else {
prev->next = entry->next;
}
chunk_free_element(&hm->buf, entry->element_id);
chunk_free_elements(&hm->buf, entry->element_id);
return;
}
prev = entry;
entry = entry->next;
}
}
/////////////////////////////
// int key
/////////////////////////////
// Inserts `value` under the integer key `key`; collisions are appended to the
// bucket's singly linked chain. Silently drops the insert if the pool is full.
// NOTE(review): `key % hm->buf.count` - assumes non-negative keys or that the
// count's unsignedness makes the modulo well-defined; TODO confirm key range.
void hashmap_insert(HashMap* hm, int32 key, int32 value) {
    uint64 index = key % hm->buf.count;

    int32 element = chunk_reserve(&hm->buf, 1);
    if (element < 0) {
        // chunk_reserve signals "no free chunk" with -1; don't index with it
        return;
    }

    HashEntryInt32KeyInt32* entry = (HashEntryInt32KeyInt32 *) chunk_get_element(&hm->buf, element, true);

    entry->element_id = element;
    entry->key = key;
    entry->value = value;
    entry->next = NULL;

    if (hm->table[index]) {
        // Append at the tail of the collision chain
        HashEntryInt32KeyInt32* tmp = (HashEntryInt32KeyInt32 *) hm->table[index];
        while (tmp->next) {
            tmp = tmp->next;
        }

        tmp->next = entry;
    } else {
        hm->table[index] = entry;
    }
}
// Inserts an int64 `value` under the integer key `key` (chained on collision).
// Silently drops the insert if the chunk pool is exhausted.
void hashmap_insert(HashMap* hm, int32 key, int64 value) {
    uint64 index = key % hm->buf.count;

    int32 element = chunk_reserve(&hm->buf, 1);
    if (element < 0) {
        // chunk_reserve signals "no free chunk" with -1; don't index with it
        return;
    }

    HashEntryInt64KeyInt32* entry = (HashEntryInt64KeyInt32 *) chunk_get_element(&hm->buf, element, true);

    entry->element_id = element;
    entry->key = key;
    entry->value = value;
    entry->next = NULL;

    if (hm->table[index]) {
        // Append at the tail of the collision chain
        HashEntryInt64KeyInt32* tmp = (HashEntryInt64KeyInt32 *) hm->table[index];
        while (tmp->next) {
            tmp = tmp->next;
        }

        tmp->next = entry;
    } else {
        hm->table[index] = entry;
    }
}
// Inserts a uintptr_t `value` under the integer key `key` (chained on collision).
// Silently drops the insert if the chunk pool is exhausted.
void hashmap_insert(HashMap* hm, int32 key, uintptr_t value) {
    uint64 index = key % hm->buf.count;

    int32 element = chunk_reserve(&hm->buf, 1);
    if (element < 0) {
        // chunk_reserve signals "no free chunk" with -1; don't index with it
        return;
    }

    HashEntryUIntPtrKeyInt32* entry = (HashEntryUIntPtrKeyInt32 *) chunk_get_element(&hm->buf, element, true);

    entry->element_id = element;
    entry->key = key;
    entry->value = value;
    entry->next = NULL;

    if (hm->table[index]) {
        // Append at the tail of the collision chain
        HashEntryUIntPtrKeyInt32* tmp = (HashEntryUIntPtrKeyInt32 *) hm->table[index];
        while (tmp->next) {
            tmp = tmp->next;
        }

        tmp->next = entry;
    } else {
        hm->table[index] = entry;
    }
}
// Inserts a void* `value` under the integer key `key` (chained on collision).
// Stores the pointer itself; no ownership is taken of the pointee.
// Silently drops the insert if the chunk pool is exhausted.
void hashmap_insert(HashMap* hm, int32 key, void* value) {
    uint64 index = key % hm->buf.count;

    int32 element = chunk_reserve(&hm->buf, 1);
    if (element < 0) {
        // chunk_reserve signals "no free chunk" with -1; don't index with it
        return;
    }

    HashEntryVoidPKeyInt32* entry = (HashEntryVoidPKeyInt32 *) chunk_get_element(&hm->buf, element, true);

    entry->element_id = element;
    entry->key = key;
    entry->value = value;
    entry->next = NULL;

    if (hm->table[index]) {
        // Append at the tail of the collision chain
        HashEntryVoidPKeyInt32* tmp = (HashEntryVoidPKeyInt32 *) hm->table[index];
        while (tmp->next) {
            tmp = tmp->next;
        }

        tmp->next = entry;
    } else {
        hm->table[index] = entry;
    }
}
// Inserts an f32 `value` under the integer key `key` (chained on collision).
// Silently drops the insert if the chunk pool is exhausted.
void hashmap_insert(HashMap* hm, int32 key, f32 value) {
    uint64 index = key % hm->buf.count;

    int32 element = chunk_reserve(&hm->buf, 1);
    if (element < 0) {
        // chunk_reserve signals "no free chunk" with -1; don't index with it
        return;
    }

    HashEntryFloatKeyInt32* entry = (HashEntryFloatKeyInt32 *) chunk_get_element(&hm->buf, element, true);

    entry->element_id = element;
    entry->key = key;
    entry->value = value;
    entry->next = NULL;

    if (hm->table[index]) {
        // Append at the tail of the collision chain
        HashEntryFloatKeyInt32* tmp = (HashEntryFloatKeyInt32 *) hm->table[index];
        while (tmp->next) {
            tmp = tmp->next;
        }

        tmp->next = entry;
    } else {
        hm->table[index] = entry;
    }
}
// Inserts a string `value` (copied, truncated to HASH_MAP_MAX_KEY_LENGTH - 1
// chars + '\0') under the integer key `key` (chained on collision).
// Silently drops the insert if the chunk pool is exhausted.
void hashmap_insert(HashMap* hm, int32 key, const char* value) {
    uint64 index = key % hm->buf.count;

    int32 element = chunk_reserve(&hm->buf, 1);
    if (element < 0) {
        // chunk_reserve signals "no free chunk" with -1; don't index with it
        return;
    }

    HashEntryStrKeyInt32* entry = (HashEntryStrKeyInt32 *) chunk_get_element(&hm->buf, element, true);

    entry->element_id = element;
    entry->key = key;

    // strncpy alone doesn't guarantee termination; force it
    strncpy(entry->value, value, HASH_MAP_MAX_KEY_LENGTH);
    entry->value[HASH_MAP_MAX_KEY_LENGTH - 1] = '\0';
    entry->next = NULL;

    if (hm->table[index]) {
        // Append at the tail of the collision chain
        HashEntryStrKeyInt32* tmp = (HashEntryStrKeyInt32 *) hm->table[index];
        while (tmp->next) {
            tmp = tmp->next;
        }

        tmp->next = entry;
    } else {
        hm->table[index] = entry;
    }
}
// Inserts an opaque byte payload under the integer key `key`. The payload is
// copied into the entry's chunk right after the header; its size is the chunk
// size minus the header size.
// NOTE(review): assumes `value` points at least (chunk_size - sizeof(header))
// readable bytes - TODO confirm against callers.
// Silently drops the insert if the chunk pool is exhausted.
void hashmap_insert(HashMap* hm, int32 key, byte* value) {
    uint64 index = key % hm->buf.count;

    int32 element = chunk_reserve(&hm->buf, 1);
    if (element < 0) {
        // chunk_reserve signals "no free chunk" with -1; don't index with it
        return;
    }

    HashEntryKeyInt32* entry = (HashEntryKeyInt32 *) chunk_get_element(&hm->buf, element, true);

    entry->element_id = element;
    entry->key = key;
    entry->value = (byte *) entry + sizeof(HashEntryKeyInt32);
    memcpy(entry->value, value, hm->buf.chunk_size - sizeof(HashEntryKeyInt32));
    entry->next = NULL;

    if (hm->table[index]) {
        // Append at the tail of the collision chain
        HashEntryKeyInt32* tmp = (HashEntryKeyInt32 *) hm->table[index];
        while (tmp->next) {
            tmp = tmp->next;
        }

        tmp->next = entry;
    } else {
        hm->table[index] = entry;
    }
}
// Looks up the entry stored under the integer key `key`.
// Returns NULL when no entry with that key exists.
HashEntryKeyInt32* hashmap_get_entry(const HashMap* hm, int32 key) {
    uint64 index = key % hm->buf.count;

    // Scan the bucket's collision chain for a matching key
    for (HashEntryKeyInt32* it = (HashEntryKeyInt32 *) hm->table[index];
        it != NULL;
        it = (HashEntryKeyInt32 *) it->next
    ) {
        if (it->key == key) {
            DEBUG_MEMORY_READ((uint64) it, sizeof(HashEntryKeyInt32));

            return it;
        }
    }

    return NULL;
}
// Variant of hashmap_get_entry that takes a precomputed hash.
// This only saves the hash-function call; useful when the hash is known at
// compile time. Returns NULL when no entry with that key exists.
HashEntryKeyInt32* hashmap_get_entry(const HashMap* hm, int32 key, uint64 hash) {
    hash %= hm->buf.count;

    // Scan the bucket's collision chain for a matching key
    for (HashEntryKeyInt32* it = (HashEntryKeyInt32 *) hm->table[hash];
        it != NULL;
        it = (HashEntryKeyInt32 *) it->next
    ) {
        if (it->key == key) {
            DEBUG_MEMORY_READ((uint64) it, sizeof(HashEntryKeyInt32));

            return it;
        }
    }

    return NULL;
}
// @performance If we had a doubly linked list we could delete keys much easier
// However that would make insertion slower
// Maybe we create another hashmap that is doubly linked
void hashmap_remove(HashMap* hm, int32 key) {
uint64 index = key % hm->buf.count;
HashEntryKeyInt32* entry = (HashEntryKeyInt32 *) hm->table[index];
HashEntryKeyInt32* prev = NULL;
while (entry != NULL) {
if (entry->key == key) {
if (prev == NULL) {
hm->table[index] = entry->next;
} else {
prev->next = entry->next;
}
chunk_free_elements(&hm->buf, entry->element_id);
return;
}
@ -357,10 +718,22 @@ void hashmap_delete_entry(HashMap* hm, const char* key) {
}
}
// Size in bytes of the value payload inside one hash entry chunk: the chunk
// size minus the entry header (element id + key + next pointer).
// NOTE(review): this assumes the string-key layout (HASH_MAP_MAX_KEY_LENGTH
// char key); for int32-key entries the key is only sizeof(int32) - the source
// itself flags this ("@bug what if Int32 key?"), TODO confirm.
inline
int32 hashmap_value_size(const HashMap* hm)
{
    // Cast to int32 to match the declared return type (was uint32)
    return (int32) (
        hm->buf.chunk_size
        - sizeof(uint32) // element id
        - sizeof(char) * HASH_MAP_MAX_KEY_LENGTH // key
        - sizeof(uintptr_t) // next pointer
    );
}
// @question Shouldn't we also store the hashmap count, chunk size etc? Currently not done and expected to be correctly initialized.
inline
int64 hashmap_dump(const HashMap* hm, byte* data)
{
*((uint64 *) data) = SWAP_ENDIAN_LITTLE(hm->buf.count);
*((uint32 *) data) = SWAP_ENDIAN_LITTLE(hm->buf.count);
data += sizeof(hm->buf.count);
// Dump the table content where the elements are relative indices/pointers
@ -371,17 +744,19 @@ int64 hashmap_dump(const HashMap* hm, byte* data)
}
data += sizeof(uint64) * hm->buf.count;
int64 value_size = hm->buf.chunk_size - sizeof(uint64) - sizeof(char) * HASH_MAP_MAX_KEY_LENGTH - sizeof(uint64);
// @bug what if Int32 key?
int32 value_size = hashmap_value_size(hm);
// Dumb hash map content = buffer memory
// Since we are using ChunkMemory we can be smart about it and iterate the chunk memory instead of performing pointer chasing
int32 free_index = 0;
int32 bit_index = 0;
for (uint32 i = 0; i < hm->buf.count; ++i) {
if ((hm->buf.free[free_index] & (1ULL << bit_index)) > 0) {
if (hm->buf.free[free_index] & (1ULL << bit_index)) {
HashEntry* entry = (HashEntry *) chunk_get_element((ChunkMemory *) &hm->buf, i);
// element_id
*((uint64 *) data) = SWAP_ENDIAN_LITTLE(entry->element_id);
*((uint32 *) data) = SWAP_ENDIAN_LITTLE(entry->element_id);
data += sizeof(entry->element_id);
// key
@ -430,8 +805,8 @@ int64 hashmap_dump(const HashMap* hm, byte* data)
inline
int64 hashmap_load(HashMap* hm, const byte* data)
{
uint64 count = SWAP_ENDIAN_LITTLE(*((uint64 *) data));
data += sizeof(uint64);
uint64 count = SWAP_ENDIAN_LITTLE(*((uint32 *) data));
data += sizeof(uint32);
// Load the table content
for (uint32 i = 0; i < count; ++i) {
@ -450,33 +825,31 @@ int64 hashmap_load(HashMap* hm, const byte* data)
// @question don't we have to possibly endian swap check the free array as well?
memcpy(hm->buf.free, data, sizeof(uint64) * CEIL_DIV(hm->buf.count, 64));
int64 value_size = hm->buf.chunk_size - sizeof(uint64) - sizeof(char) * HASH_MAP_MAX_KEY_LENGTH - sizeof(uint64);
// @bug what if Int32 key?
int32 value_size = hashmap_value_size(hm);
// Switch endian AND turn offsets to pointers
int32 free_index = 0;
int32 bit_index = 0;
for (uint32 i = 0; i < hm->buf.count; ++i) {
if ((hm->buf.free[free_index] & (1ULL << bit_index)) > 0) {
HashEntry* entry = (HashEntry *) chunk_get_element((ChunkMemory *) &hm->buf, i);
int32 chunk_id = 0;
chunk_iterate_start(&hm->buf, chunk_id)
HashEntry* entry = (HashEntry *) chunk_get_element((ChunkMemory *) &hm->buf, chunk_id);
// element id
entry->element_id = SWAP_ENDIAN_LITTLE(entry->element_id);
// element id
entry->element_id = SWAP_ENDIAN_LITTLE(entry->element_id);
// key is already loaded with the memcpy
// @question Do we even want to use memcpy? We are re-checking all the values here anyways
// key is already loaded with the memcpy
// @question Do we even want to use memcpy? We are re-checking all the values here anyways
// next pointer
if (entry->next) {
entry->next = (HashEntry *) (hm->buf.memory + SWAP_ENDIAN_LITTLE((uint64) entry->next));
}
if (value_size == 4) {
((HashEntryInt32 *) entry)->value = SWAP_ENDIAN_LITTLE(((HashEntryInt32 *) entry)->value);
} else if (value_size == 8) {
((HashEntryInt64 *) entry)->value = SWAP_ENDIAN_LITTLE(((HashEntryInt64 *) entry)->value);
}
// next pointer
if (entry->next) {
entry->next = (HashEntry *) (hm->buf.memory + SWAP_ENDIAN_LITTLE((uint64) entry->next));
}
}
if (value_size == 4) {
((HashEntryInt32 *) entry)->value = SWAP_ENDIAN_LITTLE(((HashEntryInt32 *) entry)->value);
} else if (value_size == 8) {
((HashEntryInt64 *) entry)->value = SWAP_ENDIAN_LITTLE(((HashEntryInt64 *) entry)->value);
}
chunk_iterate_end;
// How many bytes was read from data
return sizeof(hm->buf.count) // hash map count = buffer count

View File

@ -198,7 +198,7 @@ void perfect_hashmap_insert(PerfectHashMap* hm, const char* key, int32 value) {
int32 index = hm->hash_function(key, hm->hash_seed) % hm->map_size;
PerfectHashEntryInt32* entry = (PerfectHashEntryInt32 *) (hm->hash_entries + hm->entry_size * index);
entry->element_id = index;
strcpy(entry->key, key);
str_copy_short(entry->key, key);
entry->value = value;
}
@ -207,7 +207,7 @@ void perfect_hashmap_insert(PerfectHashMap* hm, const char* key, int64 value) {
int32 index = hm->hash_function(key, hm->hash_seed) % hm->map_size;
PerfectHashEntryInt64* entry = (PerfectHashEntryInt64 *) (hm->hash_entries + hm->entry_size * index);
entry->element_id = index;
strcpy(entry->key, key);
str_copy_short(entry->key, key);
entry->value = value;
}
@ -216,7 +216,7 @@ void perfect_hashmap_insert(PerfectHashMap* hm, const char* key, uintptr_t value
int32 index = hm->hash_function(key, hm->hash_seed) % hm->map_size;
PerfectHashEntryUIntPtr* entry = (PerfectHashEntryUIntPtr *) (hm->hash_entries + hm->entry_size * index);
entry->element_id = index;
strcpy(entry->key, key);
str_copy_short(entry->key, key);
entry->value = value;
}
@ -225,7 +225,7 @@ void perfect_hashmap_insert(PerfectHashMap* hm, const char* key, void* value) {
int32 index = hm->hash_function(key, hm->hash_seed) % hm->map_size;
PerfectHashEntryVoidP* entry = (PerfectHashEntryVoidP *) (hm->hash_entries + hm->entry_size * index);
entry->element_id = index;
strcpy(entry->key, key);
str_copy_short(entry->key, key);
entry->value = value;
}
@ -234,7 +234,7 @@ void perfect_hashmap_insert(PerfectHashMap* hm, const char* key, f32 value) {
int32 index = hm->hash_function(key, hm->hash_seed) % hm->map_size;
PerfectHashEntryFloat* entry = (PerfectHashEntryFloat *) (hm->hash_entries + hm->entry_size * index);
entry->element_id = index;
strcpy(entry->key, key);
str_copy_short(entry->key, key);
entry->value = value;
}
@ -243,7 +243,7 @@ void perfect_hashmap_insert(PerfectHashMap* hm, const char* key, const char* val
int32 index = hm->hash_function(key, hm->hash_seed) % hm->map_size;
PerfectHashEntryStr* entry = (PerfectHashEntryStr *) (hm->hash_entries + hm->entry_size * index);
entry->element_id = index;
strcpy(entry->key, key);
str_copy_short(entry->key, key);
memcpy(entry->value, value, PERFECT_HASH_MAP_MAX_KEY_LENGTH);
}
@ -252,7 +252,7 @@ void perfect_hashmap_insert(PerfectHashMap* hm, const char* key, byte* value) {
int32 index = hm->hash_function(key, hm->hash_seed) % hm->map_size;
PerfectHashEntryStr* entry = (PerfectHashEntryStr *) (hm->hash_entries + hm->entry_size * index);
entry->element_id = index;
strcpy(entry->key, key);
str_copy_short(entry->key, key);
memcpy(entry->value, value, hm->entry_size - sizeof(PerfectHashEntry));
}

View File

@ -12,14 +12,12 @@
#include <stdint.h>
#include <immintrin.h>
#include <xmmintrin.h>
#include "../Types.h"
#include "Types.h"
// @todo split into platform code for windows and linux
#if _WIN32
#include <windows.h>
#include <stdio.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif

24
stdlib/Simd.h Normal file
View File

@ -0,0 +1,24 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_STDLIB_SIMD_H
#define TOS_STDLIB_SIMD_H
#if __aarch64__
#else
#include "simd/SIMD_F32.h"
#include "simd/SIMD_F64.h"
#include "simd/SIMD_I8.h"
#include "simd/SIMD_I16.h"
#include "simd/SIMD_I32.h"
#include "simd/SIMD_I64.h"
#include "simd/SIMD_SVML.h"
#endif
#endif

View File

@ -12,15 +12,9 @@
#include "../stdlib/Types.h"
#include "HashMap.h"
#if _WIN32
#include "../platform/win32/threading/Thread.h"
#include "../platform/win32/threading/Semaphore.h"
#include "../platform/win32/threading/Atomic.h"
#elif __linux__
#include "../platform/linux/threading/Thread.h"
#include "../platform/linux/threading/Semaphore.h"
#include "../platform/linux/threading/Atomic.h"
#endif
#include "../thread/Atomic.h"
#include "../thread/Semaphore.h"
#include "../thread/Thread.h"
struct ThreadedHashMap {
void** table;
@ -125,9 +119,9 @@ void thrd_hashmap_get_entry(ThreadedHashMap* hm, HashEntry* entry, const char* k
}
inline
void thrd_hashmap_delete_entry(ThreadedHashMap* hm, const char* key) {
void thrd_hashmap_remove(ThreadedHashMap* hm, const char* key) {
pthread_mutex_lock(&hm->mutex);
hashmap_delete_entry((HashMap *) hm, key);
hashmap_remove((HashMap *) hm, key);
pthread_mutex_unlock(&hm->mutex);
}

View File

@ -73,9 +73,11 @@ typedef intptr_t smm;
#define MIN_INT32 0x80000000
#define MIN_INT64 0x8000000000000000
#define MIN_MILLI 60000
#define SEC_MILLI 1000
#define MILLI_MICRO 1000
#define MIN_MICRO 60000000
#define SEC_MICRO 1000000
#define MILLI_MICRO 1000
#define MHZ 1000000
#define GHZ 1000000000

18
system/Allocator.h Normal file
View File

@ -0,0 +1,18 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_SYSTEM_ALLOCATOR_H
#define TOS_SYSTEM_ALLOCATOR_H
#if _WIN32
#include "../platform/win32/Allocator.h"
#elif __linux__
#include "../platform/linux/Allocator.h"
#endif
#endif

18
system/FileUtils.cpp Normal file
View File

@ -0,0 +1,18 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_SYSTEM_FILE_UTILS_C
#define TOS_SYSTEM_FILE_UTILS_C
#if _WIN32
#include "../platform/win32/FileUtils.cpp"
#elif __linux__
#include "../platform/linux/FileUtils.cpp"
#endif
#endif

18
system/Library.cpp Normal file
View File

@ -0,0 +1,18 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_SYSTEM_LIBRARY_C
#define TOS_SYSTEM_LIBRARY_C
#if _WIN32
#include "../platform/win32/Library.cpp"
#elif __linux__
#include "../platform/linux/Library.cpp"
#endif
#endif

18
system/SystemInfo.cpp Normal file
View File

@ -0,0 +1,18 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_SYSTEM_INFO_C
#define TOS_SYSTEM_INFO_C
#if _WIN32
#include "../platform/win32/SystemInfo.cpp"
#elif __linux__
#include "../platform/linux/SystemInfo.cpp"
#endif
#endif

18
thread/Atomic.h Normal file
View File

@ -0,0 +1,18 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_THREADS_ATOMIC_H
#define TOS_THREADS_ATOMIC_H
#if _WIN32
#include "../platform/win32/threading/Atomic.h"
#elif __linux__
#include "../platform/linux/threading/Atomic.h"
#endif
#endif

18
thread/Semaphore.h Normal file
View File

@ -0,0 +1,18 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_THREADS_SEMAPHORE_H
#define TOS_THREADS_SEMAPHORE_H
#if _WIN32
#include "../platform/win32/threading/Semaphore.h"
#elif __linux__
#include "../platform/linux/threading/Semaphore.h"
#endif
#endif

18
thread/Spinlock.cpp Normal file
View File

@ -0,0 +1,18 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_THREADS_SPINLOCK_C
#define TOS_THREADS_SPINLOCK_C
#if _WIN32
#include "../platform/win32/threading/Spinlock.cpp"
#elif __linux__
#include "../platform/linux/threading/Spinlock.cpp"
#endif
#endif

18
thread/Spinlock.h Normal file
View File

@ -0,0 +1,18 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_THREADS_SPINLOCK_H
#define TOS_THREADS_SPINLOCK_H
#if _WIN32
#include "../platform/win32/threading/Spinlock.h"
#elif __linux__
#include "../platform/linux/threading/Spinlock.h"
#endif
#endif

View File

@ -13,13 +13,12 @@
#include <stdlib.h>
#include "../stdlib/Types.h"
#include "Atomic.h"
#if _WIN32
#include "../platform/win32/threading/Thread.h"
#include "../platform/win32/threading/Atomic.h"
#elif __linux__
#include "../platform/linux/threading/Thread.h"
#include "../platform/linux/threading/Atomic.h"
#endif
#include "ThreadJob.h"

18
thread/ThreadDefines.h Normal file
View File

@ -0,0 +1,18 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_THREADS_THREAD_DEFINES_H
#define TOS_THREADS_THREAD_DEFINES_H
#if _WIN32
#include "../platform/win32/threading/ThreadDefines.h"
#elif __linux__
#include "../platform/linux/threading/ThreadDefines.h"
#endif
#endif

View File

@ -14,12 +14,7 @@
#include "../stdlib/Types.h"
#include "../memory/ThreadedRingMemory.h"
#if _WIN32
#include "../platform/win32/threading/ThreadDefines.h"
#elif __linux__
#include "../platform/linux/threading/ThreadDefines.h"
#endif
#include "../thread/ThreadDefines.h"
typedef void (*ThreadPoolJobFunc)(void*);

View File

@ -15,15 +15,9 @@
#include "../stdlib/Types.h"
#include "../memory/Queue.h"
#include "../memory/BufferMemory.h"
#ifdef _WIN32
#include "../platform/win32/threading/Thread.h"
#include "../platform/win32/threading/Atomic.h"
#elif __linux__
#include "../platform/linux/threading/Thread.h"
#include "../platform/linux/threading/Atomic.h"
#endif
#include "../log/DebugMemory.h"
#include "Thread.h"
#include "Atomic.h"
#include "ThreadJob.h"
struct ThreadPool {
@ -70,6 +64,8 @@ static THREAD_RETURN thread_pool_worker(void* arg)
atomic_increment_relaxed(&pool->working_cnt);
atomic_set_release(&work->state, 2);
work->func(work);
// At the end of a thread the ring memory automatically is considered freed
DEBUG_MEMORY_FREE((uint64) work->ring.memory, work->ring.size);
atomic_set_release(&work->state, 1);
// Job gets marked after completion -> can be overwritten now

View File

@ -119,7 +119,7 @@ UIAttribute* ui_attribute_from_group(UIAttributeGroup* group, UIAttributeType ty
return NULL;
}
constexpr const char* ui_attribute_type_to_string_const(UIAttributeType e)
constexpr const char* ui_attribute_type_to_string(UIAttributeType e)
{
switch (e) {
case UI_ATTRIBUTE_TYPE_TYPE:

View File

@ -21,7 +21,7 @@ enum UIElementType {
UI_ELEMENT_TYPE_SIZE,
};
constexpr const char* ui_element_type_to_string_const(UIElementType e)
constexpr const char* ui_element_type_to_string(UIElementType e)
{
switch (e) {
case UI_ELEMENT_TYPE_BUTTON:

View File

@ -28,6 +28,8 @@ struct UILayout {
int32 vertex_size;
Asset* ui_asset;
// @question Should we maybe also hold the font atlas asset here AND the color palette?
// Defines the length of the static vertex array
int32 vertex_size_static;
};

View File

@ -7,16 +7,11 @@
#include "../utils/StringUtils.h"
#include "../stdlib/HashMap.h"
#include "../font/Font.h"
#include "../system/FileUtils.cpp"
#include "UIAttribute.h"
#include "UIElementType.h"
#if _WIN32
#include "../platform/win32/FileUtils.cpp"
#else
#include "../platform/linux/FileUtils.cpp"
#endif
#define UI_THEME_VERSION 1
// @question Currently there is some data duplication in here and in the UIElement.
@ -27,8 +22,6 @@
struct UIThemeStyle {
byte* data;
int32 version;
// A theme may have N named styles
// The hashmap contains the offset where the respective style can be found
// @performance Switch to perfect hash map
@ -121,7 +114,8 @@ void theme_from_file_txt(
// move past the version string
pos += 8;
theme->version = strtol(pos, &pos, 10); ++pos;
// Use version for different handling
int32 version = strtol(pos, &pos, 10); ++pos;
bool block_open = false;
char block_name[32];
@ -157,7 +151,9 @@ void theme_from_file_txt(
UIAttributeGroup* temp_group = NULL;
pos = (char *) file.content;
pos += 8; // move past version
// move past version string
str_move_past(&pos, '\n');
while (*pos != '\0') {
str_skip_whitespace(&pos);
@ -213,14 +209,14 @@ void theme_from_file_txt(
// Handle different attribute types
UIAttribute attribute = {};
if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_TYPE), attribute_name) == 0) {
if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_TYPE), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_TYPE;
char str[32];
str_copy_move_until(&pos, str, '\n');
for (int32 j = 0; j < UI_ELEMENT_TYPE_SIZE; ++j) {
if (strcmp(str, ui_element_type_to_string_const((UIElementType) j)) == 0) {
if (strcmp(str, ui_element_type_to_string((UIElementType) j)) == 0) {
attribute.value_int = j;
break;
@ -228,135 +224,135 @@ void theme_from_file_txt(
}
++pos;
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_STYLE), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_STYLE), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_STYLE;
str_copy_move_until(&pos, attribute.value_str, '\n');
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_FONT_COLOR), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_FONT_COLOR), attribute_name) == 0) {
++pos; // Skip '#'
attribute.attribute_id = UI_ATTRIBUTE_TYPE_FONT_COLOR;
hexstr_to_rgba(&attribute.value_v4_f32, pos);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_FONT_SIZE), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_FONT_SIZE), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_FONT_SIZE;
attribute.value_float = strtof(pos, &pos);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_FONT_WEIGHT), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_FONT_WEIGHT), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_FONT_WEIGHT;
attribute.value_int = strtoul(pos, &pos, 10);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_FONT_LINE_HEIGHT), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_FONT_LINE_HEIGHT), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_FONT_LINE_HEIGHT;
attribute.value_float = strtof(pos, &pos);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_ALIGN_H), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_ALIGN_H), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_ALIGN_H;
attribute.value_int = strtoul(pos, &pos, 10);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_ALIGN_V), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_ALIGN_V), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_ALIGN_V;
attribute.value_int = strtoul(pos, &pos, 10);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_ZINDEX), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_ZINDEX), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_ZINDEX;
attribute.value_float = SWAP_ENDIAN_LITTLE(strtof(pos, &pos));
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_COLOR), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_BACKGROUND_COLOR), attribute_name) == 0) {
++pos; // Skip '#'
attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_COLOR;
hexstr_to_rgba(&attribute.value_v4_f32, pos);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_IMG;
str_copy_move_until(&pos, attribute.value_str, '\n');
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_OPACITY), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_OPACITY), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_OPACITY;
attribute.value_float = SWAP_ENDIAN_LITTLE(strtof(pos, &pos));
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_V), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_V), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_V;
attribute.value_int = strtoul(pos, &pos, 10);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_H), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_H), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_POSITION_H;
attribute.value_int = strtoul(pos, &pos, 10);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_STYLE), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_STYLE), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_BACKGROUND_IMG_STYLE;
attribute.value_int = strtoul(pos, &pos, 10);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_COLOR), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_BORDER_COLOR), attribute_name) == 0) {
++pos; // Skip '#'
attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_COLOR;
hexstr_to_rgba(&attribute.value_v4_f32, pos);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_WIDTH), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_BORDER_WIDTH), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_WIDTH;
attribute.value_int = strtoul(pos, &pos, 10);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_TOP_COLOR), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_BORDER_TOP_COLOR), attribute_name) == 0) {
++pos; // Skip '#'
attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_TOP_COLOR;
hexstr_to_rgba(&attribute.value_v4_f32, pos);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_TOP_WIDTH), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_BORDER_TOP_WIDTH), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_TOP_WIDTH;
attribute.value_int = strtoul(pos, &pos, 10);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_RIGHT_COLOR), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_BORDER_RIGHT_COLOR), attribute_name) == 0) {
++pos; // Skip '#'
attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_RIGHT_COLOR;
hexstr_to_rgba(&attribute.value_v4_f32, pos);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_RIGHT_WIDTH), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_BORDER_RIGHT_WIDTH), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_RIGHT_WIDTH;
attribute.value_int = strtoul(pos, &pos, 10);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_COLOR), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_COLOR), attribute_name) == 0) {
++pos; // Skip '#'
attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_COLOR;
hexstr_to_rgba(&attribute.value_v4_f32, pos);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_WIDTH), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_WIDTH), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_BOTTOM_WIDTH;
attribute.value_int = strtoul(pos, &pos, 10);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_LEFT_COLOR), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_BORDER_LEFT_COLOR), attribute_name) == 0) {
++pos; // Skip '#'
attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_LEFT_COLOR;
hexstr_to_rgba(&attribute.value_v4_f32, pos);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_BORDER_LEFT_WIDTH), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_BORDER_LEFT_WIDTH), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_BORDER_LEFT_WIDTH;
attribute.value_int = strtoul(pos, &pos, 10);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_PADDING), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_PADDING), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_PADDING;
attribute.value_int = strtoul(pos, &pos, 10);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_PADDING_TOP), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_PADDING_TOP), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_PADDING_TOP;
attribute.value_int = strtoul(pos, &pos, 10);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_PADDING_RIGHT), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_PADDING_RIGHT), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_PADDING_RIGHT;
attribute.value_int = strtoul(pos, &pos, 10);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_PADDING_BOTTOM), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_PADDING_BOTTOM), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_PADDING_BOTTOM;
attribute.value_int = strtoul(pos, &pos, 10);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_PADDING_LEFT), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_PADDING_LEFT), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_PADDING_LEFT;
attribute.value_int = strtoul(pos, &pos, 10);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_INNER_COLOR), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_SHADOW_INNER_COLOR), attribute_name) == 0) {
++pos; // Skip '#'
attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_INNER_COLOR;
hexstr_to_rgba(&attribute.value_v4_f32, pos);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_INNER_ANGLE), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_SHADOW_INNER_ANGLE), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_INNER_ANGLE;
attribute.value_float = strtof(pos, &pos);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_INNER_DISTANCE), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_SHADOW_INNER_DISTANCE), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_INNER_DISTANCE;
attribute.value_int = strtoul(pos, &pos, 10);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_OUTER_COLOR), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_SHADOW_OUTER_COLOR), attribute_name) == 0) {
++pos; // Skip '#'
attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_OUTER_COLOR;
hexstr_to_rgba(&attribute.value_v4_f32, pos);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_OUTER_ANGLE), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_SHADOW_OUTER_ANGLE), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_OUTER_ANGLE;
attribute.value_float = strtof(pos, &pos);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_SHADOW_OUTER_DISTANCE), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_SHADOW_OUTER_DISTANCE), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_SHADOW_OUTER_DISTANCE;
attribute.value_int = strtoul(pos, &pos, 10);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_TRANSITION_ANIMATION), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_TRANSITION_ANIMATION), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_TRANSITION_ANIMATION;
attribute.value_int = strtoul(pos, &pos, 10);
} else if (strcmp(ui_attribute_type_to_string_const(UI_ATTRIBUTE_TYPE_TRANSITION_DURATION), attribute_name) == 0) {
} else if (strcmp(ui_attribute_type_to_string(UI_ATTRIBUTE_TYPE_TRANSITION_DURATION), attribute_name) == 0) {
attribute.attribute_id = UI_ATTRIBUTE_TYPE_TRANSITION_DURATION;
attribute.value_float = strtof(pos, &pos);
} else {
@ -406,8 +402,8 @@ int32 theme_from_data(
) {
const byte* pos = data;
theme->version = *((int32 *) pos);
pos += sizeof(theme->version);
int32 version = *((int32 *) pos);
pos += sizeof(version);
// Prepare hashmap (incl. reserve memory) by initializing it the same way we originally did
// Of course we still need to populate the data using hashmap_load()
@ -487,8 +483,8 @@ int32 theme_to_data(
byte* pos = data;
// version
*((int32 *) pos) = SWAP_ENDIAN_LITTLE(theme->version);
pos += sizeof(theme->version);
*((int32 *) pos) = SWAP_ENDIAN_LITTLE(UI_THEME_VERSION);
pos += sizeof(int32);
// hashmap
byte* start = pos;

View File

@ -35,6 +35,7 @@
// Right to left (little endian)
#define IS_BIT_SET_R2L(num, pos) ((bool) ((num) & (1 << (pos))))
#define IS_BIT_SET_64_R2L(num, pos) ((bool) ((num) & (1LL << (pos))))
#define BIT_SET_R2L(num, pos) ((num) | ((uint32) 1 << (pos)))
#define BIT_UNSET_R2L(num, pos) ((num) & ~((uint32) 1 << (pos)))
#define BIT_FLIP_R2L(num, pos) ((num) ^ ((uint32) 1 << (pos)))

View File

@ -28,6 +28,7 @@
#define OMS_CEIL(x) ((x) == (int)(x) ? (int)(x) : ((x) > 0 ? (int)(x) + 1 : (int)(x)))
#define OMS_ROUND(x) (((x) >= 0) ? ((int)((x) + 0.5f)) : ((int)((x) - 0.5f)))
#define OMS_ROUND_POSITIVE(x) ((int)((x) + 0.5f))
#define FLOAT_CAST_EPS 0.001953125
// Modulo function when b is a power of 2
#define MODULO_2(a, b) ((a) & (b - 1))

View File

@ -167,13 +167,14 @@ void wchar_to_char(const char* __restrict str, char* __restrict dest)
}
inline constexpr
int32 str_to_int(const char* str)
int64 str_to_int(const char* str)
{
int32 result = 0;
int64 result = 0;
int32 sign = 1;
if (*str++ == '-') {
int64 sign = 1;
if (*str == '-') {
sign = -1;
++str;
}
while (*str >= '0' && *str <= '9') {
@ -186,15 +187,21 @@ int32 str_to_int(const char* str)
return result * sign;
}
inline constexpr
int32 int_to_str(int64 number, char *str, const char thousands = ',') {
inline
int32 int_to_str(int64 number, char str[15], const char thousands)
{
if (number == 0) {
*str++ = '0';
*str = '\0';
return 1;
}
int32 i = 0;
int32 digit_count = 0;
int64 sign = number;
if (number == 0) {
str[i++] = '0';
} else if (number < 0) {
if (number < 0) {
number = -number;
}
@ -212,8 +219,84 @@ int32 int_to_str(int64 number, char *str, const char thousands = ',') {
str[i++] = '-';
}
for (int32 j = 0, k = i - 1; j < k; ++j, --k) {
char temp = str[j];
str[j] = str[k];
str[k] = temp;
}
str[i] = '\0';
return i;
}
inline constexpr
int32 int_to_str(int64 number, char str[12]) {
    // Writes the decimal representation of number into str (NUL-terminated)
    // and returns the string length (excluding the terminator).
    // NOTE(review): the declared buffer size [12] only covers int32 range;
    // the full int64 range needs up to 21 bytes ("-9223372036854775808").
    bool negative = number < 0;

    // Negate in unsigned space: -INT64_MIN overflows int64 (UB), but the
    // two's-complement identity ~x + 1 is well defined for uint64.
    uint64 n = negative ? ~((uint64) number) + 1 : (uint64) number;

    int32 i = -1;
    do {
        str[++i] = (char) (n % 10 + '0');
        n /= 10;
    } while (n > 0);

    if (negative) {
        str[++i] = '-';
    }

    // Digits were emitted least-significant first; reverse in place
    for (int32 j = 0, k = i; j < k; ++j, --k) {
        char temp = str[j];
        str[j] = str[k];
        str[k] = temp;
    }

    str[++i] = '\0';

    return i;
}
inline constexpr
int32 uint_to_str(uint64 number, char str[12]) {
    // Writes the decimal representation of number into str (NUL-terminated)
    // and returns the string length (excluding the terminator).
    int32 len = 0;

    // Emit digits least-significant first
    do {
        str[len++] = (char) ('0' + number % 10);
        number /= 10;
    } while (number > 0);

    // Reverse in place to most-significant-first order
    for (int32 lo = 0, hi = len - 1; lo < hi; ++lo, --hi) {
        char tmp = str[lo];
        str[lo] = str[hi];
        str[hi] = tmp;
    }

    str[len] = '\0';

    return len;
}
// Lookup table mapping a nibble value (0-15) to its uppercase hex character
static const char HEX_TABLE[] = {
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    'A', 'B', 'C', 'D', 'E', 'F'
};
inline constexpr
int32 int_to_hex(int64 number, char str[9]) {
int32 i = -1;
uint64 n = (uint64) number;
do {
byte digit = n % 16;
str[++i] = HEX_TABLE[digit];
n /= 16;
} while (n > 0);
str[++i] = '\0';
for (int32 j = 0, k = i - 1; j < k; ++j, --k) {
char temp = str[j];
str[j] = str[k];
@ -223,6 +306,29 @@ int32 int_to_str(int64 number, char *str, const char thousands = ',') {
return i;
}
inline constexpr
int64 hex_to_int(const char* hex)
{
    // Parses hexadecimal digits (upper or lower case, no "0x" prefix) from
    // the start of hex, stopping at the first non-hex character.
    int64 result = 0;

    for (;; ++hex) {
        char c = *hex;
        byte digit;

        if (c >= '0' && c <= '9') {
            digit = (byte) (c - '0');
        } else if (c >= 'A' && c <= 'F') {
            digit = (byte) (c - 'A' + 10);
        } else if (c >= 'a' && c <= 'f') {
            digit = (byte) (c - 'a' + 10);
        } else {
            break;
        }

        result = (result << 4) | digit;
    }

    return result;
}
inline
size_t str_count(const char* __restrict str, const char* __restrict substr)
{
@ -241,6 +347,142 @@ size_t str_count(const char* __restrict str, const char* __restrict substr)
return count;
}
inline constexpr
// Returns the byte length of the line break at str:
// 2 for "\r\n", 1 for "\n", 0 if str is not at a line break.
int32 is_eol(const char* str)
{
    if (str[0] == '\r' && str[1] == '\n') {
        return 2;
    }

    return str[0] == '\n' ? 1 : 0;
}
inline
// Copies bytes from src into dest up to (not including) the first delim
// or the terminator, whichever comes first. dest is always terminated.
void str_copy_until(const char* __restrict src, char* __restrict dest, char delim)
{
    for (char c = *src; c != delim && c != '\0'; c = *++src) {
        *dest++ = c;
    }

    *dest = '\0';
}
inline
void str_copy_until(const char* __restrict src, char* __restrict dest, const char* __restrict delim, int32 len)
{
while (*src != '\0') {
for (int32 i = 0; i < len; ++i) {
if (*src == delim[i]) {
*dest = '\0';
return;
}
}
*dest++ = *src++;
}
*dest = '\0';
}
inline
// Copies bytes from src into dest up to the first delim or terminator.
// Returns the number of bytes copied (terminator excluded).
int32 str_copy_until(char* __restrict dest, const char* __restrict src, char delim)
{
    const char* begin = src;
    while (*src != delim && *src != '\0') {
        *dest++ = *src++;
    }

    *dest = '\0';

    return (int32) (src - begin);
}
inline
// Simple byte-wise copy of src into dest up to (not including) delim.
// Fix: the previous version only tested *src != delim, so a missing
// delimiter read past the end of src (buffer overread). It now also
// stops at the terminator; behavior for well-formed input is unchanged.
void str_copy_short(char* __restrict dest, const char* __restrict src, char delim = '\0')
{
    while (*src != delim && *src != '\0') {
        *dest++ = *src++;
    }

    *dest = '\0';
}
inline
// Word-at-a-time copy of src into dest up to (not including) delim or the
// terminator. Fixes vs. the previous version:
//   - the word loop tested (*aligned_src != 0), which only stops when an
//     ENTIRE word is zero, so it copied past the terminator; it now stops
//     as soon as any byte in the word is '\0' (or equals delim)
//   - delim was silently ignored in every loop; it is honored now
//   - the wide path is only taken when the source is word-aligned too
// NOTE(review): the wide reads still type-pun through uintptr_t* as the
// original did — strict-aliasing exempt in practice on the supported
// compilers, but confirm build flags (-fno-strict-aliasing or memcpy).
void str_copy_long(char* __restrict dest, const char* __restrict src, char delim = '\0')
{
    char* d = dest;
    const char *s = src;

    // Align destination to its natural alignment
    while (((uintptr_t) d & (sizeof(uintptr_t) - 1)) != 0 && *s != delim && *s != '\0') {
        *d++ = *s++;
    }

    // Copy using larger chunks only when the source is aligned as well
    if (((uintptr_t) s & (sizeof(uintptr_t) - 1)) == 0) {
        const uintptr_t ones = ~(uintptr_t) 0 / 0xFF;                    // 0x0101...01
        const uintptr_t highs = ones << 7;                               // 0x8080...80
        const uintptr_t delim_mask = ones * (unsigned char) delim;       // delim in every byte

        uintptr_t* aligned_dest = (uintptr_t *) d;
        const uintptr_t* aligned_src = (const uintptr_t *) s;
        for (;;) {
            uintptr_t word = *aligned_src;

            // Bit trick: (v - 0x01..) & ~v & 0x80.. is nonzero iff v has a zero byte
            uintptr_t has_nul = (word - ones) & ~word & highs;
            uintptr_t xored = word ^ delim_mask;
            uintptr_t has_delim = (xored - ones) & ~xored & highs;
            if ((has_nul | has_delim) != 0) {
                // Stop byte is inside this word; finish byte-wise below
                break;
            }

            *aligned_dest++ = word;
            ++aligned_src;
        }

        d = (char *) aligned_dest;
        s = (const char *) aligned_src;
    }

    // Copy remaining bytes
    while (*s != delim && *s != '\0') {
        *d++ = *s++;
    }

    *d = '\0';
}
inline
// Copies *src into dest up to delim/terminator and advances the caller's
// cursor, leaving *src pointing at the character that stopped the copy.
void str_copy_move_until(char** __restrict src, char* __restrict dest, char delim)
{
    char* cursor = *src;
    while (*cursor != delim && *cursor != '\0') {
        *dest++ = *cursor++;
    }

    *src = cursor;
    *dest = '\0';
}
inline
void str_copy_move_until(char** __restrict src, char* __restrict dest, const char* __restrict delim, int32 len)
{
while (**src != '\0') {
for (int32 i = 0; i < len; ++i) {
if (**src == delim[i]) {
*dest = '\0';
return;
}
}
*dest++ = **src;
++(*src);
}
*dest = '\0';
}
inline
// Copies one line from src into dst, stopping at '\n', "\r\n" or the
// terminator. Returns the number of bytes copied (line break excluded).
int32 strcpy_to_eol(const char* src, char* dst)
{
    int32 copied = 0;
    while (*src != '\0' && !is_eol(src)) {
        dst[copied++] = *src++;
    }

    dst[copied] = '\0';

    return copied;
}
inline
char* strsep(const char** sp, const char* sep)
{
@ -262,69 +504,58 @@ char* strsep(const char** sp, const char* sep)
return s;
}
inline int64
str_concat(
inline void
str_concat_new(
char* dst,
const char* src1,
const char* src2,
char* dst
const char* src2
) {
int64 len = strlen(src1);
int64 len_total = len;
memcpy(dst, src1, len);
dst += len;
len = strlen(src2);
memcpy(dst, src2, len);
dst += len;
while (*src1) { *dst++ = *src1++; }
while (*src2) { *dst++ = *src2++; }
*dst = '\0';
return len_total + len;
}
// @question Why is this called str_add instead of str_concat like the other functions?
inline void
str_add(char* base, const char* src)
{
while (*base) {
++base;
}
strcpy(base, src);
}
inline void
str_add(char* base, const char* src, size_t src_length)
str_concat_append(char* dst, const char* src)
{
while (*base) {
++base;
while (*dst) {
++dst;
}
memcpy(base, src, src_length);
base[src_length] = '\0';
str_copy_short(dst, src);
}
inline void
// Builds dst from three source strings back to back and terminates it.
// dst must be large enough for all three lengths plus the terminator.
str_concat_new(char* dst, const char* src1, const char* src2, const char* src3)
{
    const char* parts[3] = { src1, src2, src3 };
    for (int p = 0; p < 3; ++p) {
        const char* s = parts[p];
        while (*s != '\0') {
            *dst++ = *s++;
        }
    }

    *dst = '\0';
}
inline int64
str_add(char* base, size_t base_length, const char* src, size_t src_length)
str_concat_append(char* dst, size_t dst_length, const char* src, size_t src_length)
{
memcpy(&base[base_length], src, src_length);
base[base_length + src_length] = '\0';
memcpy(&dst[dst_length], src, src_length);
dst[dst_length + src_length] = '\0';
return base_length + src_length;
return dst_length + src_length;
}
inline void
str_add(char* base, size_t base_length, const char* src)
str_concat_append(char* dst, size_t dst_length, const char* src)
{
strcpy(&base[base_length], src);
str_copy_short(&dst[dst_length], src);
}
inline int64
str_concat(
str_concat_new(
char* dst,
const char* src1, size_t src1_length,
const char* src2, size_t src2_length,
char* dst
const char* src2, size_t src2_length
) {
memcpy(dst, src1, src1_length);
dst += src1_length;
@ -338,10 +569,10 @@ str_concat(
}
inline
void str_concat(
void str_concat_new(
char* dst,
const char* src, size_t src_length,
int64 data,
char* dst
int64 data
) {
memcpy(dst, src, src_length);
int32 len = int_to_str(data, dst + src_length);
@ -349,6 +580,32 @@ void str_concat(
dst[src_length + len] = '\0';
}
inline
// Appends the decimal representation of data to the existing string in
// dst. dst must have room for the digits plus the new terminator.
void str_concat_append(
    char* dst,
    int64 data
) {
    char* end = dst + strlen(dst);
    int_to_str(data, end);
}
inline void
// Builds dst as src followed by the decimal representation of data.
// int_to_str writes the terminator.
str_concat_new(char* dst, const char* src, int64 data)
{
    const char* s = src;
    while (*s != '\0') {
        *dst++ = *s++;
    }

    int_to_str(data, dst);
}
inline
// Inserts src into dst at insert_pos, shifting the tail of dst (including
// its terminator) to the right. dst must have room for
// strlen(dst) + strlen(src) + 1 bytes.
// Fix: the tail shift moves data within the SAME buffer, so the regions
// overlap — memcpy on overlapping regions is UB; memmove is required.
void str_insert(char* __restrict dst, size_t insert_pos, const char* __restrict src) {
    size_t src_length = strlen(src);
    size_t dst_length = strlen(dst);

    // Shift tail (incl. '\0') right by src_length; overlapping -> memmove
    memmove(dst + insert_pos + src_length, dst + insert_pos, dst_length - insert_pos + 1);
    memcpy(dst + insert_pos, src, src_length);
}
inline
char* strtok(char* str, const char* __restrict delim, char* *key) {
char* result;
@ -426,6 +683,77 @@ void create_const_name(unsigned char* name)
*name = '\0';
}
// strcmp equivalent: returns the difference of the first mismatching
// bytes (compared as unsigned chars), 0 when the strings are equal.
int32 str_compare(const char* str1, const char* str2)
{
    const byte* a = (const byte *) str1;
    const byte* b = (const byte *) str2;

    // Advance while both strings match and str1 hasn't ended
    while (*a != '\0' && *a == *b) {
        ++a;
        ++b;
    }

    return *a - *b;
}
// strncmp equivalent: compares at most n bytes as unsigned chars and
// returns the difference of the first mismatching pair (0 on equality or
// n == 0). The main loop is deliberately unrolled 4x; the tail loop
// handles the remaining n % 4 bytes.
int32 str_compare(const char* str1, const char* str2, size_t n)
{
    byte c1 = '\0';
    byte c2 = '\0';
    // Compare 4 bytes per iteration while at least 4 bytes remain
    if (n >= 4) {
        size_t n4 = n >> 2;
        do {
            c1 = (byte) *str1++;
            c2 = (byte) *str2++;
            if (c1 == '\0' || c1 != c2) {
                return c1 - c2;
            }
            c1 = (byte) *str1++;
            c2 = (byte) *str2++;
            if (c1 == '\0' || c1 != c2) {
                return c1 - c2;
            }
            c1 = (byte) *str1++;
            c2 = (byte) *str2++;
            if (c1 == '\0' || c1 != c2) {
                return c1 - c2;
            }
            c1 = (byte) *str1++;
            c2 = (byte) *str2++;
            if (c1 == '\0' || c1 != c2) {
                return c1 - c2;
            }
        } while (--n4 > 0);
        // Leftover byte count after the unrolled groups
        n &= 3;
    }
    // Tail: compare the remaining 0-3 bytes one at a time
    while (n > 0) {
        c1 = (byte) *str1++;
        c2 = (byte) *str2++;
        if (c1 == '\0' || c1 != c2) {
            return c1 - c2;
        }
        --n;
    }
    // All n bytes matched (or n == 0): c1 == c2 here, so this is 0
    return c1 - c2;
}
inline constexpr
bool str_ends_with(const char* str, const char* suffix) {
if (!str || !suffix) {
@ -439,7 +767,7 @@ bool str_ends_with(const char* str, const char* suffix) {
return false;
}
return strncmp(str + str_len - suffix_len, suffix, suffix_len) == 0;
return str_compare(str + str_len - suffix_len, suffix, suffix_len) == 0;
}
// WARNING: result needs to have the correct length
@ -452,7 +780,7 @@ void str_replace(const char* str, const char* __restrict search, const char* __r
size_t replace_len = strlen(replace);
if (search_len == 0) {
strcpy(result, str);
str_copy_short(result, str);
return;
}
@ -471,7 +799,7 @@ void str_replace(const char* str, const char* __restrict search, const char* __r
str = current;
}
strcpy(result_ptr, str);
str_copy_short(result_ptr, str);
}
void print_bytes(const void* ptr, size_t size)
@ -493,18 +821,6 @@ void print_bytes(const void* ptr, size_t size)
}
}
inline constexpr
int32 is_eol(const char* str)
{
if (*str == '\n') {
return 1;
} else if (*str == '\r' && str[1] == '\n') {
return 2;
}
return 0;
}
inline constexpr
bool is_whitespace(char str)
{
@ -639,104 +955,6 @@ void str_skip_until_list(char** __restrict str, const char* __restrict delim)
}
}
inline
void str_copy_until(const char* __restrict src, char* __restrict dest, char delim)
{
while (*src != delim && *src != '\0') {
*dest++ = *src++;
}
*dest = '\0';
}
inline
void str_copy_until(const char* __restrict src, char* __restrict dest, const char* __restrict delim, int32 len)
{
while (*src != '\0') {
for (int32 i = 0; i < len; ++i) {
if (*src == delim[i]) {
*dest = '\0';
return;
}
}
*dest++ = *src++;
}
*dest = '\0';
}
inline
int32 str_copy_until(char* __restrict dest, const char* __restrict src, char delim)
{
int32 len = 0;
while (*src != delim && *src != '\0') {
*dest++ = *src++;
++len;
}
*dest = '\0';
return len;
}
inline
// Copies bytes from src into dest up to (not including) delim and
// returns the copied length.
// NOTE(review): there is no '\0' check — this assumes delim is present
// in src; a missing delimiter reads past the end of the string. Confirm
// callers guarantee the delimiter (see str_copy_until for the safe form).
int32 str_copy(char* __restrict dest, const char* __restrict src, char delim)
{
    const char* begin = src;
    while (*src != delim) {
        *dest++ = *src++;
    }

    *dest = '\0';

    return (int32) (src - begin);
}
inline
void str_copy_move_until(char** __restrict src, char* __restrict dest, char delim)
{
while (**src != delim && **src != '\0') {
*dest++ = **src;
++(*src);
}
*dest = '\0';
}
inline
void str_copy_move_until(char** __restrict src, char* __restrict dest, const char* __restrict delim, int32 len)
{
while (**src != '\0') {
for (int32 i = 0; i < len; ++i) {
if (**src == delim[i]) {
*dest = '\0';
return;
}
}
*dest++ = **src;
++(*src);
}
*dest = '\0';
}
inline
int32 strcpy_to_eol(const char* src, char* dst)
{
int32 offset = 0;
while (!is_eol(src) && *src != '\0') {
*dst++ = *src++;
++offset;
}
*dst = '\0';
return offset;
}
inline
void hexstr_to_rgba(v4_f32* rgba, const char* hex)
{