Implemented asset archives and more threading. Partially working.

This commit is contained in:
Dennis Eichhorn 2024-12-11 21:03:47 +01:00
parent c132b425d2
commit fe054ebb13
60 changed files with 2052 additions and 730 deletions

View File

@ -16,6 +16,13 @@
#include "../stdlib/simd/SIMD_I32.h"
#include "../memory/RingMemory.h"
#include "../memory/BufferMemory.h"
#include "../image/Image.cpp"
#include "../object/Mesh.h"
#include "../object/Texture.h"
#include "../audio/Audio.cpp"
#include "../font/Font.h"
#include "../localization/Language.h"
#include "../ui/UITheme.h"
#include "AssetManagementSystem.h"
#if _WIN32
@ -25,16 +32,21 @@
#include "../platform/win32/FileUtils.cpp"
#endif
#define ASSET_ARCHIVE_VERSION 1
struct AssetArchiveElement {
int32 type;
uint32 type;
int32 start;
int32 length;
uint32 start;
uint32 length;
int32 dependency_start; // actual index for asset_dependencies
int32 dependency_count;
uint32 dependency_start; // actual index for asset_dependencies
uint32 dependency_count;
};
// It is important to understand that for performance reasons the assets addresses are stored in an array
// This makes it very fast to access because there is only one indirection.
// On the other hand we can only find assets by their ID/location and not by name.
struct AssetArchiveHeader {
int32 version;
@ -49,7 +61,14 @@ struct AssetArchive {
AssetArchiveHeader header;
byte* data; // owner of the data
FileHandler fd;
FileHandle fd;
FileHandle fd_async;
// @performance We still need to implement the loading with this and then profile it to see if it is faster.
// If not remove
MMFHandle mmf;
int32 asset_type_map[ASSET_TYPE_SIZE];
};
// Calculates how large the header memory has to be to hold all its information
@ -91,7 +110,9 @@ void asset_archive_header_load(AssetArchiveHeader* header, byte* data, int32 ste
steps
);
header->asset_dependencies = (int32 *) ((byte *) header->asset_element + header->asset_count * sizeof(AssetArchiveElement));
if (header->asset_dependency_count) {
header->asset_dependencies = (int32 *) ((byte *) header->asset_element + header->asset_count * sizeof(AssetArchiveElement));
}
memcpy(header->asset_dependencies, data, header->asset_dependency_count * sizeof(int32));
SWAP_ENDIAN_LITTLE_SIMD(
@ -110,17 +131,22 @@ AssetArchiveElement* asset_archive_element_find(const AssetArchive* archive, int
void asset_archive_load(AssetArchive* archive, const char* path, BufferMemory* buf, RingMemory* ring, int32 steps = 8)
{
// Get file handle
archive->fd = file_read_async_handle(path);
archive->fd = file_read_handle(path);
if (!archive->fd) {
return;
}
archive->fd_async = file_read_async_handle(path);
if (!archive->fd_async) {
return;
}
archive->mmf = file_mmf_handle(archive->fd_async);
FileBody file;
file.size = 64;
// Find header size
file.content = ring_get_memory(ring, file.size);
file.content = ring_get_memory(ring, file.size, 4);
file_read(archive->fd, &file, 0, file.size);
file.size = asset_archive_header_size(archive, file.content);
@ -134,33 +160,50 @@ void asset_archive_load(AssetArchive* archive, const char* path, BufferMemory* b
4
);
archive->header.asset_element = (AssetArchiveElement *) archive->data;
// Read entire header
file.content = ring_get_memory(ring, file.size);
file_read(archive->fd, &file, 0, file.size);
asset_archive_header_load(&archive->header, file.content, steps);
}
// @performance This can probably be done much faster by handling the loading of dependencies faster
void asset_archive_asset_load(const AssetArchive* archive, int32 id, AssetManagementSystem* ams_array, RingMemory* ring)
// @question Do we want to allow a callback function?
// Very often we want to do something with the data (e.g. upload it to the gpu)
// Maybe we could just accept a int value which we set atomically as a flag that the asset is complete?
// this way we can check much faster if we can work with this data from the caller?!
// The only problem is that we need to pass the pointer to this int in the thrd_queue since we queue the files to load there
Asset* asset_archive_asset_load(const AssetArchive* archive, int32 id, AssetManagementSystem* ams_array, RingMemory* ring)
{
AssetArchiveElement* element = &archive->header.asset_element[id];
AssetManagementSystem* ams = element->type > 0
? &ams_array[element->type]
: &ams_array[0];
// @todo add calculation from element->type to ams index
uint64 hash = hash_djb2((const char *) &id);
AssetArchiveElement* element = &archive->header.asset_element[id];
AssetManagementSystem* ams = &ams_array[archive->asset_type_map[element->type]];
// @todo This is a little bit stupid, reconsider
char id_str[5];
id_str[4] = '\0';
*((int32 *) id_str) = id;
uint64 hash = hash_djb2(id_str);
Asset* asset;
// @performance I think we could optimize the ams_reserver_asset in a way so we don't have to lock it the entire time
pthread_mutex_lock(&ams->mutex);
// @bug this is not how this function works
if (hashmap_get_entry(&ams->hash_map, (const char *) &id, hash)) {
// @bug If we have multiple archive files the ids also repeat, which is not possible for the hash map
// Possible solution: also store a string name for every asset. This would add HASH_MAP_MAX_KEY_LENGTH bytes of data to every asset though (see hash map key size = 32)
asset = ams_get_asset(ams, id_str, hash);
if (asset) {
// Asset already loaded
pthread_mutex_unlock(&ams->mutex);
return asset;
}
if (element->type == 0) {
// @bug We can't just do this, this won't work. Check if we might want to change the asset management directly to hash indices or at least int values
Asset* asset = ams_reserve_asset(ams, (const char *) &id, ams_calculate_chunks(ams, element->length));
asset->self = (byte *) (asset + 1);
asset = ams_reserve_asset(ams, id_str, ams_calculate_chunks(ams, element->length));
FileBody file = {};
file.content = asset->self;
@ -168,34 +211,83 @@ void asset_archive_asset_load(const AssetArchive* archive, int32 id, AssetManage
// We are directly reading into the correct destination
file_read(archive->fd, &file, element->start, element->length);
} else {
// @performance In this case we may want to check if memory mapped regions are better.
// 1. I don't think they work together with async loading
// 2. Profile which one is faster
// 3. The big benefit of mmf would be that we can avoid one memcpy and directly load the data into the object
// 4. Of course the disadvantage would be to no longer have async loading
// We are reading into temp memory since we have to perform transformations on the data
FileBodyAsync file = {};
file_read_async(archive->fd, &file, element->start, element->length, ring);
file_read_async(archive->fd_async, &file, element->start, element->length, ring);
// This happens while the file system loads the data
Asset* asset = ams_reserve_asset(ams, (const char *) &id, ams_calculate_chunks(ams, element->length));
asset->self = (byte *) (asset + 1);
asset = ams_reserve_asset(ams, id_str, ams_calculate_chunks(ams, element->length));
asset->is_ram = true;
byte* data = ring_get_memory(ring, element->length, 64);
size_t data_size = 0;
// @todo create platform wrapper
GetOverlappedResult(archive->fd, &file.ov, NULL, true);
file_async_wait(archive->fd_async, &file.ov, true);
switch (element->type) {
case 1: {
case ASSET_TYPE_IMAGE: {
// @todo Do we really want to store textures in the asset management system or only images?
// If it is only images then we need to somehow also manage textures
Texture* texture = (Texture *) asset->self;
texture->image.pixels = (byte *) (texture + 1);
image_from_data(file.content, &texture->image);
asset->vram_size = texture->image.pixel_count * image_pixel_size_from_type(texture->image.pixel_type);
asset->ram_size = asset->vram_size + sizeof(Texture);
#if OPENGL
// @bug I think order_rows has the wrong value
if (texture->image.order_rows == IMAGE_ROW_ORDER_TOP_TO_BOTTOM) {
image_flip_vertical(ring, &texture->image);
texture->image.order_rows = IMAGE_ROW_ORDER_BOTTOM_TO_TOP;
}
#endif
} break;
case ASSET_TYPE_AUDIO: {
Audio* audio = (Audio *) asset->self;
audio->data = (byte *) (audio + 1);
audio_from_data(file.content, audio);
} break;
case ASSET_TYPE_OBJ: {
Mesh* mesh = (Mesh *) asset->self;
mesh->data = (byte *) (mesh + 1);
mesh_from_data(file.content, mesh);
} break;
case ASSET_TYPE_LANGUAGE: {
Language* language = (Language *) asset->self;
language->data = (byte *) (language + 1);
language_from_data(file.content, language);
} break;
case ASSET_TYPE_FONT: {
Font* font = (Font *) asset->self;
font->glyphs = (Glyph *) (font + 1);
font_from_data(file.content, font);
} break;
case ASSET_TYPE_THEME: {
UIThemeStyle* theme = (UIThemeStyle *) asset->self;
theme->data = (byte *) (theme + 1);
theme_from_data(file.content, theme);
} break;
default: {
}
}
memcpy(asset->self, data, data_size);
}
pthread_mutex_unlock(&ams->mutex);
// @performance maybe do in worker threads?
for (int32 i = 0; i < element->dependency_count; ++i) {
// @performance maybe do in worker threads? This just feels very slow
for (uint32 i = 0; i < element->dependency_count; ++i) {
asset_archive_asset_load(archive, id, ams, ring);
}
return asset;
}
#endif

View File

@ -34,9 +34,11 @@ struct AssetManagementSystem {
// The indices of asset_memory and asset_data_memory are always linked
// General asset memory
// Fixed chunk size of sizeof(Asset)
ChunkMemory asset_memory;
// Actual asset data
// Chunk size defined during initialization
ChunkMemory asset_data_memory;
// @performance Do we really need the linked list, the ChunkMemory should allow us to do some smart stuff
@ -44,7 +46,11 @@ struct AssetManagementSystem {
Asset* last;
// @question do we want to create an extra threaded version? Or a combined one, like we have right now.
// @question Do we want to add a mutex to assets. This way we don't have to lock the entire ams.
pthread_mutex_t mutex;
// @bug We probably also need a overhead value.
// In some cases we need more data than our normal data (see texture, it contains image + texture)
};
void ams_create(AssetManagementSystem* ams, BufferMemory* buf, int32 chunk_size, int32 count)
@ -201,9 +207,9 @@ Asset* ams_get_asset(AssetManagementSystem* ams, const char* key)
}
inline
Asset* ams_get_asset(AssetManagementSystem* ams, const char* key, uint64 index)
Asset* ams_get_asset(AssetManagementSystem* ams, const char* key, uint64 hash)
{
HashEntry* entry = hashmap_get_entry(&ams->hash_map, key, index);
HashEntry* entry = hashmap_get_entry(&ams->hash_map, key, hash);
// @bug entry->value seems to be an address outside of any known buffer, how?
DEBUG_MEMORY_READ(
@ -215,7 +221,7 @@ Asset* ams_get_asset(AssetManagementSystem* ams, const char* key, uint64 index)
}
// @performance We could probably avoid locking by adding a atomic flag to indicate if the value is valid
Asset* threaded_ams_get_asset(AssetManagementSystem* ams, uint64 element) {
Asset* thrd_ams_get_asset(AssetManagementSystem* ams, uint64 element) {
pthread_mutex_lock(&ams->mutex);
Asset* asset = ams_get_asset(ams, element);
pthread_mutex_unlock(&ams->mutex);
@ -223,7 +229,7 @@ Asset* threaded_ams_get_asset(AssetManagementSystem* ams, uint64 element) {
return asset;
}
Asset* threaded_ams_get_asset(AssetManagementSystem* ams, const char* key) {
Asset* thrd_ams_get_asset(AssetManagementSystem* ams, const char* key) {
pthread_mutex_lock(&ams->mutex);
Asset* asset = ams_get_asset(ams, key);
pthread_mutex_unlock(&ams->mutex);
@ -231,9 +237,9 @@ Asset* threaded_ams_get_asset(AssetManagementSystem* ams, const char* key) {
return asset;
}
Asset* threaded_ams_get_asset(AssetManagementSystem* ams, const char* key, uint64 index) {
Asset* thrd_ams_get_asset(AssetManagementSystem* ams, const char* key, uint64 hash) {
pthread_mutex_lock(&ams->mutex);
Asset* asset = ams_get_asset(ams, key, index);
Asset* asset = ams_get_asset(ams, key, hash);
pthread_mutex_unlock(&ams->mutex);
return asset;
@ -309,4 +315,22 @@ Asset* ams_reserve_asset(AssetManagementSystem* ams, const char* name, uint32 el
return asset;
}
// Thread-safe wrapper around ams_reserve_asset().
// Holds the AMS mutex for the duration of the reservation only; the returned
// asset itself is NOT protected after this call returns — callers that fill
// the asset concurrently should use the _start/_end variant below instead.
Asset* thrd_ams_reserve_asset(AssetManagementSystem* ams, const char* name, uint32 elements = 1) {
pthread_mutex_lock(&ams->mutex);
Asset* asset = ams_reserve_asset(ams, name, elements);
pthread_mutex_unlock(&ams->mutex);
return asset;
}
// Reserves an asset and RETURNS WITH THE AMS MUTEX STILL HELD, so the caller
// can populate the asset without racing other threads.
// The caller MUST pair every call with thrd_ams_reserve_asset_end(ams),
// including on error paths, or the AMS deadlocks.
Asset* thrd_ams_reserve_asset_start(AssetManagementSystem* ams, const char* name, uint32 elements = 1) {
pthread_mutex_lock(&ams->mutex);
return ams_reserve_asset(ams, name, elements);
}
// Releases the AMS mutex acquired by thrd_ams_reserve_asset_start().
// Only valid after a matching _start call on the same thread.
void thrd_ams_reserve_asset_end(AssetManagementSystem* ams) {
pthread_mutex_unlock(&ams->mutex);
}
#endif

View File

@ -12,9 +12,11 @@
enum AssetType {
ASSET_TYPE_GENERAL,
ASSET_TYPE_OBJ,
ASSET_TYPE_TEXTURE,
ASSET_TYPE_AUDIO,
ASSET_TYPE_ANIM,
ASSET_TYPE_LANGUAGE,
ASSET_TYPE_FONT,
ASSET_TYPE_THEME,
ASSET_TYPE_IMAGE,
ASSET_TYPE_SIZE
};

View File

@ -22,14 +22,80 @@
#include "AudioSetting.h"
#include "Wav.h"
void audio_from_file(RingMemory* ring, const char* path, Audio* audio)
void audio_from_file(Audio* audio, const char* path, RingMemory* ring)
{
FileBody file;
file_read(path, &file, ring);
ASSERT_SIMPLE(file.size);
if (str_ends_with(path, ".wav")) {
wav_audio_generate(&file, audio);
wav_from_data(file.content, (uint32) file.size, audio, ring);
}
}
// Size in bytes of the serialized form of an Audio asset: the fixed-size
// header fields (sample_rate, sample_size, channels, bloc_size, byte_per_sec,
// size) followed by `size` bytes of raw sample data.
int32 audio_data_size(const Audio* audio)
{
    size_t header_size = sizeof(audio->sample_rate)
        + sizeof(audio->sample_size)
        + sizeof(audio->channels)
        + sizeof(audio->bloc_size)
        + sizeof(audio->byte_per_sec)
        + sizeof(audio->size);

    return (int32) (header_size + audio->size);
}
// Deserializes an Audio asset from a tightly packed byte stream.
// Layout: sample_rate (u16 LE), sample_size (u8), channels (u8),
// bloc_size (u8), byte_per_sec (u32 LE), size (u32 LE), then `size` bytes
// of raw sample data.
//
// NOTE(review): assumes audio->data already points at a buffer large enough
// for `size` bytes — the caller owns that allocation; confirm at call sites.
//
// Returns the number of bytes consumed (== audio_data_size(audio)).
int32 audio_from_data(const byte* data, Audio* audio)
{
    // memcpy into aligned locals instead of dereferencing casted pointers:
    // `data` is an arbitrary offset into a packed stream, so
    // *((uint16 *) data) is an unaligned access (undefined behavior on some
    // targets) and a strict-aliasing violation. memcpy of a fixed small size
    // compiles down to a single load on mainstream compilers.
    uint16 sample_rate;
    memcpy(&sample_rate, data, sizeof(sample_rate));
    audio->sample_rate = SWAP_ENDIAN_LITTLE(sample_rate);
    data += sizeof(audio->sample_rate);

    audio->sample_size = *data;
    data += sizeof(audio->sample_size);

    audio->channels = *data;
    data += sizeof(audio->channels);

    audio->bloc_size = *data;
    data += sizeof(audio->bloc_size);

    uint32 byte_per_sec;
    memcpy(&byte_per_sec, data, sizeof(byte_per_sec));
    audio->byte_per_sec = SWAP_ENDIAN_LITTLE(byte_per_sec);
    data += sizeof(audio->byte_per_sec);

    uint32 size;
    memcpy(&size, data, sizeof(size));
    audio->size = SWAP_ENDIAN_LITTLE(size);
    data += sizeof(audio->size);

    // Raw sample payload follows the header.
    memcpy(audio->data, data, audio->size);

    return audio_data_size(audio);
}
// Serializes an Audio asset into a tightly packed byte stream
// (inverse of audio_from_data; same layout).
//
// `data` must have room for audio_data_size(audio) bytes.
// Returns the number of bytes written (== audio_data_size(audio)).
int32 audio_to_data(const Audio* audio, byte* data)
{
    // memcpy from locals instead of *((uint16 *) data) = ...: the destination
    // is an arbitrary offset into a packed stream, so a casted store is an
    // unaligned write (undefined behavior on some targets) and a
    // strict-aliasing violation.
    uint16 sample_rate = SWAP_ENDIAN_LITTLE(audio->sample_rate);
    memcpy(data, &sample_rate, sizeof(sample_rate));
    data += sizeof(audio->sample_rate);

    *data = audio->sample_size;
    data += sizeof(audio->sample_size);

    *data = audio->channels;
    data += sizeof(audio->channels);

    *data = audio->bloc_size;
    data += sizeof(audio->bloc_size);

    uint32 byte_per_sec = SWAP_ENDIAN_LITTLE(audio->byte_per_sec);
    memcpy(data, &byte_per_sec, sizeof(byte_per_sec));
    data += sizeof(audio->byte_per_sec);

    uint32 size = SWAP_ENDIAN_LITTLE(audio->size);
    memcpy(data, &size, sizeof(size));
    data += sizeof(audio->size);

    // Raw sample payload follows the header.
    memcpy(data, audio->data, audio->size);

    return audio_data_size(audio);
}
#endif

View File

@ -15,19 +15,19 @@
struct Audio {
// bits per sample
// usually 48000 or 44100
uint32 sample_rate;
uint16 sample_rate;
// bytes per bloc
// channel count * bit
// usually 2 * 16 = 4
uint32 sample_size;
byte sample_size;
// audio channels
// usually 2
uint32 channels;
byte channels;
// usually 16 = 2
uint32 bloc_size;
byte bloc_size;
// sample_rate * sample_size
uint32 byte_per_sec;

View File

@ -48,6 +48,8 @@ struct AudioInstance {
uint32 audio_size;
byte* audio_data;
uint32 sample_index;
};
struct AudioMixer {
@ -71,6 +73,7 @@ struct AudioMixer {
// do we need a condition or semaphore?
};
// @todo expand AudioLocationSetting so that it also includes audio effects, repeat etc.
void audio_mixer_add(AudioMixer* mixer, int64 id, Audio* audio, AudioLocationSetting* origin)
{
int64 index = chunk_reserve(&mixer->audio_instances, 1);
@ -90,7 +93,7 @@ void audio_mixer_add(AudioMixer* mixer, int64 id, Audio* audio, AudioLocationSet
void audio_mixer_add_unique(AudioMixer* mixer, int64 id, Audio* audio, AudioLocationSetting* origin)
{
for (int32 i = 0; i < mixer->audio_instances.count; ++i) {
for (uint32 i = 0; i < mixer->audio_instances.count; ++i) {
// @performance We are not really utilizing chunk memory.
// Maybe a simple array would be better
// Or we need to use more chunk functions / maybe even create a chunk_iterate() function?
@ -105,7 +108,7 @@ void audio_mixer_add_unique(AudioMixer* mixer, int64 id, Audio* audio, AudioLoca
void audio_mixer_remove(AudioMixer* mixer, int64 id)
{
for (int32 i = 0; i < mixer->audio_instances.count; ++i) {
for (uint32 i = 0; i < mixer->audio_instances.count; ++i) {
AudioInstance* instance = (AudioInstance *) chunk_get_element(&mixer->audio_instances, i);
if (instance->id == id) {
instance->id = 0;
@ -116,38 +119,38 @@ void audio_mixer_remove(AudioMixer* mixer, int64 id)
}
}
void apply_echo(int16* buffer, uint16 buffer_size, f32 delay, f32 feedback, int32 sample_rate) {
void apply_echo(int16* buffer, uint32 buffer_size, f32 delay, f32 feedback, int32 sample_rate) {
int32 delay_samples = (int32) (delay * sample_rate);
for (int32 i = delay_samples; i < buffer_size; ++i) {
for (uint32 i = delay_samples; i < buffer_size; ++i) {
buffer[i] += (int16) (buffer[i - delay_samples] * feedback);
}
}
void apply_reverb(int16* buffer, uint16 buffer_size, f32 intensity) {
void apply_reverb(int16* buffer, uint32 buffer_size, f32 intensity) {
intensity *= 0.5f;
for (int32 i = 1; i < buffer_size; ++i) {
for (uint32 i = 1; i < buffer_size; ++i) {
buffer[i] += (int16) (buffer[i - 1] * intensity); // Simple reverb with decay
}
}
void apply_cave(int16* buffer, uint16 buffer_size, int32 sample_rate) {
void apply_cave(int16* buffer, uint32 buffer_size, int32 sample_rate) {
f32 echo_delay = 0.1f; // Echo delay in seconds
f32 feedback = 0.3f; // Echo feedback level
apply_echo(buffer, buffer_size, echo_delay, feedback, sample_rate);
apply_reverb(buffer, buffer_size, 0.4f); // Add mild reverb
}
void apply_underwater(int16* buffer, uint16 buffer_size) {
for (int32 i = 0; i < buffer_size; ++i) {
void apply_underwater(int16* buffer, uint32 buffer_size) {
for (uint32 i = 0; i < buffer_size; ++i) {
buffer[i] = (int16) sinf(buffer[i] * 0.5f); // Dampen + distortion
}
}
void apply_flanger(int16* buffer, uint16 buffer_size, f32 rate, f32 depth, int32 sample_rate) {
int32 delay_samples = (int32) (depth * sample_rate);
void apply_flanger(int16* buffer, uint32 buffer_size, f32 rate, f32 depth, int32 sample_rate) {
f32 delay_samples = depth * sample_rate;
f32 temp = OMS_TWO_PI * rate / sample_rate;
for (int32 i = 0; i < buffer_size; ++i) {
for (uint32 i = 0; i < buffer_size; ++i) {
int32 delay = (int32) (delay_samples * (0.5f + 0.5f * sinf(i * temp)));
if (i >= delay) {
buffer[i] += (int16) (buffer[i - delay] * 0.5f);
@ -155,27 +158,27 @@ void apply_flanger(int16* buffer, uint16 buffer_size, f32 rate, f32 depth, int32
}
}
void apply_tremolo(int16* buffer, uint16 buffer_size, f32 rate, f32 depth, int32 sample_rate) {
void apply_tremolo(int16* buffer, uint32 buffer_size, f32 rate, f32 depth, int32 sample_rate) {
f32 temp = OMS_TWO_PI * rate / sample_rate;
f32 temp2 = (1.0f - depth) + depth;
for (int32 i = 0; i < buffer_size; ++i) {
for (uint32 i = 0; i < buffer_size; ++i) {
f32 mod = temp2 * (0.5f + 0.5f * sinf(i * temp));
buffer[i] = (int16) (buffer[i] * mod);
}
}
void apply_distortion(int16* buffer, uint16 buffer_size, f32 gain) {
for (int32 i = 0; i < buffer_size; ++i) {
void apply_distortion(int16* buffer, uint32 buffer_size, f32 gain) {
for (uint32 i = 0; i < buffer_size; ++i) {
buffer[i] = (int16) tanh(buffer[i] * gain);
}
}
void apply_chorus(int16* buffer, uint16 buffer_size, f32 rate, f32 depth, int32 sample_rate) {
void apply_chorus(int16* buffer, uint32 buffer_size, f32 rate, f32 depth, int32 sample_rate) {
f32 temp = OMS_TWO_PI * rate / sample_rate;
int32 max_delay = (int32) (depth * sample_rate);
for (int32 i = 0; i < buffer_size; ++i) {
for (uint32 i = 0; i < buffer_size; ++i) {
int32 delay = (int32) (max_delay * (0.5f + 0.5f * sinf(i * temp)));
if (i >= delay) {
buffer[i] += (int16) (buffer[i - delay] * 0.5f);
@ -183,26 +186,26 @@ void apply_chorus(int16* buffer, uint16 buffer_size, f32 rate, f32 depth, int32
}
}
void apply_pitch_shift(int16* buffer, uint16 buffer_size, f32 pitch_factor) {
for (int32 i = 0; i < buffer_size; ++i) {
void apply_pitch_shift(int16* buffer, uint32 buffer_size, f32 pitch_factor) {
for (uint32 i = 0; i < buffer_size; ++i) {
buffer[i] = (int16) (buffer[i] * pitch_factor);
}
}
void apply_granular_delay(int16* buffer, uint16 buffer_size, f32 delay, f32 granularity, int32 sample_rate) {
void apply_granular_delay(int16* buffer, uint32 buffer_size, f32 delay, f32 granularity, int32 sample_rate) {
int32 delay_samples = (int32) (delay * sample_rate);
int32 limit = (int32) (granularity * sample_rate);
for (int32 i = 0; i < buffer_size; ++i) {
for (uint32 i = 0; i < buffer_size; ++i) {
if (i % limit == 0 && i >= delay_samples) {
buffer[i] += (int16) (buffer[i - delay_samples] * 0.6f);
}
}
}
void apply_frequency_modulation(int16* buffer, uint16 buffer_size, f32 mod_freq, f32 mod_depth, int32 sample_rate) {
void apply_frequency_modulation(int16* buffer, uint32 buffer_size, f32 mod_freq, f32 mod_depth, int32 sample_rate) {
f32 temp = OMS_TWO_PI * mod_freq / sample_rate;
for (int32 i = 0; i < buffer_size; ++i) {
for (uint32 i = 0; i < buffer_size; ++i) {
buffer[i] = (int16) (buffer[i] * sinf(i * temp) * mod_depth);
}
}
@ -211,20 +214,20 @@ void apply_stereo_panning(int16* buffer, int32 buffer_size, f32 pan) {
f32 left_gain = 1.0f - pan;
f32 right_gain = pan;
for (int32 i = 0; i < buffer_size; ++i) {
for (uint32 i = 0; i < buffer_size; ++i) {
buffer[i] = (int16) (buffer[i] * left_gain);
buffer[i + 1] = (int16) (buffer[i + 1] * right_gain);
}
}
void apply_highpass(int16* buffer, uint16 buffer_size, f32 cutoff, int32 sample_rate) {
void apply_highpass(int16* buffer, uint32 buffer_size, f32 cutoff, int32 sample_rate) {
f32 rc = 1.0f / (OMS_TWO_PI * cutoff);
f32 dt = 1.0f / sample_rate;
f32 alpha = rc / (rc + dt);
f32 previous = buffer[0];
f32 previous_output = buffer[0];
for (int32 i = 1; i < buffer_size; ++i) {
for (uint32 i = 1; i < buffer_size; ++i) {
f32 current = buffer[i];
buffer[i] = (int16) (alpha * (previous_output + current - previous));
previous = current;
@ -232,53 +235,89 @@ void apply_highpass(int16* buffer, uint16 buffer_size, f32 cutoff, int32 sample_
}
}
void apply_lowpass(int16* buffer, uint16 buffer_size, f32 cutoff, int32 sample_rate) {
void apply_lowpass(int16* buffer, uint32 buffer_size, f32 cutoff, int32 sample_rate) {
f32 rc = 1.0f / (OMS_TWO_PI * cutoff);
f32 dt = 1.0f / sample_rate;
f32 alpha = dt / (rc + dt);
f32 previous = buffer[0];
for (int32 i = 1; i < buffer_size; ++i) {
for (uint32 i = 1; i < buffer_size; ++i) {
buffer[i] = (int16) (previous + alpha * (buffer[i] - previous));
previous = buffer[i];
}
}
void audio_mixer_mix(AudioMixer *mixer) {
uint16 limit = (uint16) (mixer->settings.sample_buffer_size / mixer->settings.sample_size);
void audio_mixer_mix(AudioMixer* mixer) {
uint32 limit = OMS_MIN(
mixer->settings.sample_buffer_size / mixer->settings.sample_size,
mixer->settings.buffer_size / mixer->settings.sample_size
);
for (int32 i = 0; i < mixer->audio_instances.count; ++i) {
bool has_location = !is_empty((byte *) &mixer->camera.audio_location, sizeof(mixer->camera.audio_location));
f32 volume_scale = mixer->settings.master_volume * mixer->settings.master_volume;
for (uint32 i = 0; i < mixer->audio_instances.count; ++i) {
AudioInstance* sound = (AudioInstance *) chunk_get_element(&mixer->audio_instances, i);
if (sound->id == 0) {
continue;
}
// Compute the vector from the player to the sound's origin
v3_f32 to_sound;
vec3_sub(&to_sound, &sound->origin.audio_location, &mixer->camera.audio_location);
f32 distance = vec3_length(&to_sound);
f32 distance_attenuation = OMS_MAX(0.0f, 1.0f - (distance / 50.0f));
vec3_normalize(&to_sound);
f32 alignment = vec3_dot(&mixer->camera.audio_lookat, &to_sound);
f32 directional_attenuation = OMS_MAX(0.0f, alignment);
f32 total_attenuation = distance_attenuation * directional_attenuation;
v3_f32 to_sound = {};
f32 total_attenuation = 1.0f;
bool has_origin = !is_empty((byte *) &sound->origin.audio_location, sizeof(sound->origin.audio_location));
if (has_location && has_origin) {
vec3_sub(&to_sound, &sound->origin.audio_location, &mixer->camera.audio_location);
f32 distance = vec3_length(&to_sound);
if (distance) {
f32 distance_attenuation = OMS_MAX(0.0f, 1.0f - (distance / 50.0f));
vec3_normalize(&to_sound);
f32 alignment = vec3_dot(&mixer->camera.audio_lookat, &to_sound);
f32 directional_attenuation = OMS_MAX(0.0f, alignment);
total_attenuation = distance_attenuation * directional_attenuation;
}
}
uint32 sound_sample_count = sound->audio_size / mixer->settings.sample_size;
uint32 sound_sample_index = sound->sample_index;
int16* audio_data = (int16 *) sound->audio_data;
// Temporary buffer for effects processing
// @performance If there are situations where only one file exists in the mixer that should be played we could directly write to
// the output buffer improving the performance. Some of those mixers are: music, cinematic, ui
// Careful, NOT voice since we will probably manually layer them according to their position?
for (int32 j = 0; j < limit; ++j) {
// @todo if repeat handle here
if (sound_sample_index >= sound_sample_count) {
// @todo if repeat we need to handle part of it here, else quit
mixer->buffer_temp[j] = (int16) (sound->audio_data[j * 2] * mixer->settings.master_volume * total_attenuation);
mixer->buffer_temp[j + 1] = (int16) (sound->audio_data[j * 2 + 2] * mixer->settings.master_volume * total_attenuation);
sound_sample_index = 0;
// @question why are we doing this?
mixer->settings.sample_index = 0;
}
mixer->buffer_temp[j * 2] = (int16) (audio_data[sound_sample_index * 2] * volume_scale * total_attenuation);
mixer->buffer_temp[j * 2 + 1] = (int16) (audio_data[sound_sample_index * 2 + 1] * volume_scale * total_attenuation);
++sound_sample_index;
// @performance Some adjustments could be made right here the question is if this is faster.
// Probably depends on how likely the adjustment is to happen.
// @todo if end of file and no repeat -> remove from list
}
// @question We also have to set setting->sample_index = sound_sample_index.
// But that currently happens in the sound api. Do we want to keep it there or move it here
// Apply effects based on sound's effect type
// @performance Depending on how we implement effects we could even pull them out of this loop
// What I mean is effects could either be sound file dependent (current location correct) or mixer dependent
if (mixer->effect) {
if (mixer->effect & AUDIO_EFFECT_ECHO) {
apply_echo(mixer->buffer_temp, limit, 0.2f, 0.4f, mixer->settings.sample_rate);
@ -337,8 +376,11 @@ void audio_mixer_mix(AudioMixer *mixer) {
}
}
// @bug the actual output "limit" could be smaller if sound files end earlier and no repeat is defined
// In that case we would also have to adjust mixer->settings.sample_buffer_size
// Add the processed sound to the output buffer
for (int32 j = 0; j < limit; j++) {
for (uint32 j = 0; j < limit; j++) {
mixer->settings.buffer[j] += mixer->buffer_temp[j];
}
}

View File

@ -20,23 +20,19 @@ struct AudioSetting {
// WARNING: not the byte position, but the index based on the sample size
uint32 sample_index;
// @todo add more settings e.g. repeat etc
uint32 latency;
f32 master_volume;
// bits per sample
// usually 48000 or 44100
uint32 sample_rate;
uint16 sample_rate;
// bytes per bloc
// channel count * bit
// usually 2 * 16 = 4
uint32 sample_size;
byte sample_size;
// how often has the audio_play been called (required for xaudio)
uint32 sample_output;
byte sample_output;
// max buffer content/size
uint32 buffer_size;
@ -47,6 +43,9 @@ struct AudioSetting {
int16* buffer;
byte type = SOUND_API_DIRECT_SOUND;
byte latency;
// @todo add more settings e.g. repeat etc
};
struct AudioLocationSetting {

View File

@ -46,29 +46,23 @@ struct WavHeader {
struct Wav {
WavHeader header;
byte* sample_data; // WARNING: This is not the owner of the data. The owner is the FileBody
byte* sample_data; // WARNING: This is not the owner of the data.
uint32 size;
byte* data; // WARNING: This is not the owner of the data. The owner is the FileBody
byte* data; // Data owner
};
void generate_default_wav_references(const FileBody* file, Wav* wav)
void generate_default_wav_references(const byte* data, uint32 size, Wav* wav)
{
wav->size = (uint32) file->size;
wav->data = file->content;
if (wav->size < WAV_HEADER_SIZE) {
// This shouldn't happen
return;
}
wav->size = size;
ASSERT_SIMPLE(size >= WAV_HEADER_SIZE);
// Check if we can copy memory directly
// The struct layout and header size should match on x86, but we still check it
if constexpr (sizeof(WavHeader) == WAV_HEADER_SIZE) {
memcpy(&wav->header, file->content, WAV_HEADER_SIZE);
memcpy(&wav->header, data, WAV_HEADER_SIZE);
// swap endian if we are on big endian system
// @question Maybe this needs to be a runtime check?
#if !_WIN32 && !__LITTLE_ENDIAN
wav->header.size = SWAP_ENDIAN_LITTLE(wav->header.size);
wav->header.bloc_size = SWAP_ENDIAN_LITTLE(wav->header.bloc_size);
@ -121,33 +115,32 @@ void generate_default_wav_references(const FileBody* file, Wav* wav)
wav->header.bits_per_sample = SWAP_ENDIAN_LITTLE(*((uint16 *) (wav->data + 34)));
// Sample data header
wav->header.data_bloc_id[0] = *(wav->data + 36);
wav->header.data_bloc_id[1] = *(wav->data + 37);
wav->header.data_bloc_id[2] = *(wav->data + 38);
wav->header.data_bloc_id[3] = *(wav->data + 39);
memcpy(wav->header.data_bloc_id, wav->data + 36, 4);
wav->header.data_size = SWAP_ENDIAN_LITTLE(*((uint32 *) *(wav->data + 40)));
wav->header.data_size = SWAP_ENDIAN_LITTLE(*((uint32 *) *(wav->data + WAV_HEADER_SIZE - sizeof(wav->header.data_bloc_id))));
}
wav->sample_data = wav->data + WAV_HEADER_SIZE;
memcpy(wav->sample_data, data + WAV_HEADER_SIZE, wav->header.data_size);
}
void wav_audio_generate(const FileBody* src_data, Audio* audio)
void wav_from_data(const byte* data, uint32 size, Audio* audio, RingMemory* ring)
{
// @performance We are generating the struct and then filling the data.
// There is some asignment/copy overhead
// There is some assignment/copy overhead
Wav src = {};
generate_default_wav_references(src_data, &src);
src.data = ring_get_memory(ring, size, 4);
generate_default_wav_references(data, size, &src);
if (!src.size) {
return;
}
audio->sample_rate = src.header.frequency;
audio->sample_size = (src.header.bits_per_sample / 8) * src.header.nbr_channels;
audio->channels = src.header.nbr_channels;
audio->byte_per_sec = src.header.byte_per_sec;
audio->bloc_size = src.header.bloc_size;
audio->sample_rate = (uint16) src.header.frequency;
audio->sample_size = (byte) ((src.header.bits_per_sample / 8) * src.header.nbr_channels);
audio->channels = (byte) src.header.nbr_channels;
audio->byte_per_sec = (uint32) src.header.byte_per_sec;
audio->bloc_size = (byte) src.header.bloc_size;
audio->size = src.header.data_size;
memcpy((void *) audio->data, src.sample_data, audio->size);

View File

@ -19,8 +19,14 @@
// @todo Please check out if we can switch to quaternions. We tried but failed.
// Bit flags recording which camera-derived state is dirty and needs a
// recompute. Combined with |= (see camera_rotate / camera_movement),
// so each value must be a distinct power of two.
enum CameraStateChanges : byte {
CAMERA_STATE_CHANGE_NONE = 0,
// Camera moved/rotated -> view matrix is stale (set by rotate/movement).
CAMERA_STATE_CHANGE_NORMAL = 1,
// Presumably viewport/window resize -> projection/ortho stale — the set
// site is not visible in this chunk; confirm before relying on it.
CAMERA_STATE_CHANGE_WINDOW = 2,
};
struct Camera {
bool is_changed;
byte state_changes;
v3_f32 location;
v4_f32 orientation;
@ -43,6 +49,8 @@ struct Camera {
f32 aspect;
f32 view[16];
f32 projection[16];
f32 orth[16];
};
void
@ -64,7 +72,7 @@ camera_update_vectors(Camera* camera)
void camera_rotate(Camera* camera, int32 dx, int32 dy, f32 dt)
{
camera->is_changed = true;
camera->state_changes |= CAMERA_STATE_CHANGE_NORMAL;
camera->orientation.x += dy * camera->sensitivity;
camera->orientation.y -= dx * camera->sensitivity;
@ -88,7 +96,7 @@ void camera_rotate(Camera* camera, int32 dx, int32 dy, f32 dt)
// you can have up to 4 camera movement inputs at the same time
void camera_movement(Camera* camera, CameraMovement* movement, f32 dt, bool relative_to_world = true)
{
camera->is_changed = true;
camera->state_changes |= CAMERA_STATE_CHANGE_NORMAL;
f32 velocity = camera->speed * dt;
if (relative_to_world) {
@ -214,11 +222,11 @@ void camera_movement(Camera* camera, CameraMovement* movement, f32 dt, bool rela
}
inline
void camera_orth_matrix_lh(const Camera* __restrict camera, f32* __restrict orth)
void camera_orth_matrix_lh(Camera* __restrict camera)
{
mat4_identity_sparse(orth);
mat4_identity(camera->orth);
mat4_ortho_sparse_lh(
orth,
camera->orth,
0, camera->viewport_width,
0, camera->viewport_height,
camera->znear,
@ -227,11 +235,11 @@ void camera_orth_matrix_lh(const Camera* __restrict camera, f32* __restrict orth
}
inline
void camera_orth_matrix_rh(const Camera* __restrict camera, f32* __restrict orth)
void camera_orth_matrix_rh(Camera* __restrict camera)
{
mat4_identity_sparse(orth);
mat4_identity(camera->orth);
mat4_ortho_sparse_rh(
orth,
camera->orth,
0, camera->viewport_width,
0, camera->viewport_height,
camera->znear,
@ -240,11 +248,11 @@ void camera_orth_matrix_rh(const Camera* __restrict camera, f32* __restrict orth
}
inline
void camera_projection_matrix_lh(const Camera* __restrict camera, f32* __restrict projection)
void camera_projection_matrix_lh(Camera* __restrict camera)
{
mat4_identity_sparse(projection);
mat4_identity(camera->projection);
mat4_perspective_sparse_lh(
projection,
camera->projection,
camera->fov,
camera->aspect,
camera->znear,
@ -253,11 +261,11 @@ void camera_projection_matrix_lh(const Camera* __restrict camera, f32* __restric
}
inline
void camera_projection_matrix_rh(const Camera* __restrict camera, f32* __restrict projection)
void camera_projection_matrix_rh(Camera* __restrict camera)
{
mat4_identity_sparse(projection);
mat4_identity(camera->projection);
mat4_perspective_sparse_rh(
projection,
camera->projection,
camera->fov,
camera->aspect,
camera->znear,

View File

@ -14,6 +14,7 @@
#include "../stdlib/Types.h"
#include "../utils/BitUtils.h"
#include "../utils/MathUtils.h"
#include "../utils/EndianUtils.h"
struct HuffmanNode {
@ -34,31 +35,37 @@ struct Huffman {
char* code[256]; // Contains a pointer per ASCII character to the huffman code sequence
};
// We could combine this function with the one below but this would introduce a if != 0 check for the frequency
// I would assume the current version is faster since we avoid a branch
inline
HuffmanNode* huffman_node_create(Huffman* hf, int32 frequency, byte character, HuffmanNode* left, HuffmanNode* right)
{
HuffmanNode* node = hf->pool + hf->node_count++;
if (frequency) {
node->character = character;
node->frequency = frequency;
} else {
node->left = left;
node->right = right;
node->frequency = left->frequency + right->frequency;
}
node->character = character;
node->frequency = frequency;
return node;
}
// Same as other function but frequency = 0
inline
HuffmanNode* huffman_node_create(Huffman* hf, byte character, HuffmanNode* left, HuffmanNode* right)
{
HuffmanNode* node = hf->pool + hf->node_count++;
node->left = left;
node->right = right;
node->frequency = left->frequency + right->frequency;
return node;
}
inline
void huffman_node_insert(Huffman* hf, HuffmanNode* node)
{
int32 child_id;
int32 parent_id = hf->pq_end++;
while ((child_id = parent_id / 2)) {
if (hf->pq[child_id]->frequency <= node->frequency) {
break;
}
while ((child_id = parent_id / 2) && hf->pq[child_id]->frequency <= node->frequency) {
hf->pq[parent_id] = hf->pq[child_id];
parent_id = child_id;
}
@ -111,13 +118,15 @@ int64 huffman_code_build(Huffman* hf, HuffmanNode* root, char* code, int32 lengt
void huffman_init(Huffman* hf, const byte* in)
{
int32 frequency[256] = {0};
char temp_code[16];
int32 buffer_position = 0;
char temp_code[16];
// We artificially force the root element (usually the 0 element) to have the index 1.
hf->pq = (HuffmanNode **) (hf->priority_queue - 1);
while (*in) frequency[(byte) *in++]++;
while (*in) {
++frequency[(byte) *in++];
}
for (int32 i = 0; i < 256; ++i) {
if (frequency[i]) {
@ -126,21 +135,20 @@ void huffman_init(Huffman* hf, const byte* in)
}
while (hf->pq_end > 2) {
huffman_node_insert(hf, huffman_node_create(hf, 0, 0, huffman_node_remove(hf), huffman_node_remove(hf)));
huffman_node_insert(hf, huffman_node_create(hf, 0, huffman_node_remove(hf), huffman_node_remove(hf)));
}
huffman_code_build(hf, hf->pq[1], temp_code, 0, hf->buffer, &buffer_position);
}
inline
void huffman_dump(const Huffman* hf, byte* out)
{
// dump the char -> code relations as relative indeces
// dump the char -> code relations as relative indices
for (int32 i = 0; i < ARRAY_COUNT(hf->code); ++i) {
if (hf->code[i]) {
*((int64 *) out) = SWAP_ENDIAN_LITTLE(hf->code[i] - hf->buffer);
} else {
*((int64 *) out) = SWAP_ENDIAN_LITTLE(-1);
}
*((int64 *) out) = hf->code[i]
? SWAP_ENDIAN_LITTLE(hf->code[i] - hf->buffer)
: SWAP_ENDIAN_LITTLE(-1);
out += sizeof(int64);
}
@ -149,6 +157,7 @@ void huffman_dump(const Huffman* hf, byte* out)
memcpy(out, hf->buffer, sizeof(char) * ARRAY_COUNT(hf->buffer));
}
inline
void huffman_load(Huffman* hf, const byte* in)
{
// load the char -> code relations and convert relative indices to pointers
@ -165,6 +174,7 @@ void huffman_load(Huffman* hf, const byte* in)
memcpy(hf->buffer, in, sizeof(char) * ARRAY_COUNT(hf->buffer));
}
inline
int64 huffman_encode(Huffman* hf, const byte* in, byte* out)
{
uint64 bit_length = 0;
@ -180,11 +190,11 @@ int64 huffman_encode(Huffman* hf, const byte* in, byte* out)
++code;
++bit_length;
++pos_bit;
if (pos_bit > 7) {
// Make sure it wraps around to 0 for pos_bit > 7
pos_bit = MODULO_2(++pos_bit, 8);
if (pos_bit == 0) {
++out;
pos_bit = 0;
}
}
}
@ -192,29 +202,26 @@ int64 huffman_encode(Huffman* hf, const byte* in, byte* out)
return bit_length;
}
inline
int64 huffman_decode(Huffman* hf, const byte* in, byte* out, uint64 bit_length)
{
HuffmanNode* current = hf->pq[1];
int32 pos_bit = 0;
int64 out_length = 0;
byte* start = out;
while (pos_bit < bit_length) {
if (BITS_GET_8_L2R(*in, pos_bit++, 1)) {
current = current->right;
} else {
current = current->left;
}
// Branchless version of checking if bit is set and then updating current
int32 bit = BITS_GET_8_L2R(*in, pos_bit, 1);
current = (HuffmanNode *) (((uintptr_t) current->left & ~bit) | ((uintptr_t) current->right & bit));
if (current->character) {
*out++ = current->character;
current = hf->pq[1];
}
if (pos_bit > 7) {
pos_bit = MODULO_2(++pos_bit, 8);
if (pos_bit == 0) {
++in;
pos_bit = 0;
}
}

View File

@ -92,10 +92,8 @@ uint32 lzp_decode(const byte* in, size_t length, byte* out)
hash = (hash << 4) ^ c;
}
if (j > 0) {
for (i = 0; i < j; ++i) {
out[out_pos++] = buf[i];
}
for (i = 0; i < j; ++i) {
out[out_pos++] = buf[i];
}
}
@ -106,13 +104,14 @@ int32 find_longest_match(char *window, int32 window_start, char *buffer, int32 b
int32 best_length = 0;
int32 best_offset = 0;
for (int32 i = window_start; i < 4096 && i < buffer_size; ++i) {
for (int32 i = window_start; i < 4096 && i < buffer_size; ++i) {
int32 length = 0;
while (length < 18 &&
i + length < 4096 &&
buffer[length] == window[i + length]) {
length++;
while (length < 18
&& i + length < 4096
&& buffer[length] == window[i + length]
) {
++length;
}
if (length > best_length) {
@ -135,7 +134,12 @@ uint32 lzp3_encode(const byte* in, size_t length, byte* out) {
size_t i = 0;
while (i < length) {
int32 match_position = 0;
int32 match_length = find_longest_match(window, window_start, (char *)&in[i], (int32) (length - i), &match_position);
int32 match_length = find_longest_match(
window,
window_start,
(char *) &in[i], (int32) (length - i),
&match_position
);
if (match_length > 2) {
out[out_size++] = 0xFF;
@ -170,7 +174,7 @@ uint32 lzp3_decode(const byte* in, size_t length, byte* out) {
int32 match_length = in[i + 2];
for (int32 j = 0; j < match_length; j++) {
out[out_size++] = window[(match_position + j) % 4096];
out[out_size++] = window[MODULO_2(match_position + j, 4096)];
}
memmove(window, window + match_length, 4096 - match_length);

View File

@ -21,7 +21,7 @@ uint64 rle_encode(const char* in, size_t length, char* out)
uint64 count;
uint64 j = 0;
for (uint64 i = 0; i < length; i++) {
for (uint64 i = 0; i < length; ++i) {
count = 1;
while (i + 1 < length && in[i] == in[i + 1]) {
++count;
@ -42,7 +42,7 @@ uint64 rle_decode(const char* in, size_t length, char* out)
{
uint64 j = 0;
for (int64 i = 0; i < length; i++) {
for (int64 i = 0; i < length; ++i) {
char current_char = in[i];
++i;
@ -53,7 +53,7 @@ uint64 rle_decode(const char* in, size_t length, char* out)
}
--i;
for (int32 k = 0; k < count; k++) {
for (int32 k = 0; k < count; ++k) {
out[j++] = current_char;
}
}

View File

@ -28,6 +28,7 @@ struct GlyphTextureCoords {
f32 y2;
};
#define GLYPH_SIZE 40
struct Glyph {
uint32 codepoint;
GlyphMetrics metrics;
@ -55,7 +56,7 @@ void font_init(Font* font, byte* data, int count)
inline
Glyph* font_glyph_find(Font* font, uint32 codepoint)
{
for (int i = 0; i < font->glyph_count; ++i) {
for (uint32 i = 0; i < font->glyph_count; ++i) {
if (font->glyphs[i].codepoint == codepoint) {
return &font->glyphs[i];
}
@ -66,10 +67,15 @@ Glyph* font_glyph_find(Font* font, uint32 codepoint)
void font_from_file_txt(
Font* font,
byte* data
const char* path,
RingMemory* ring
)
{
char* pos = (char *) data;
FileBody file;
file_read(path, &file, ring);
ASSERT_SIMPLE(file.size);
char* pos = (char *) file.content;
bool start = true;
char block_name[32];
@ -147,25 +153,21 @@ void font_from_file_txt(
}
}
// Calculates the required size for representing a font definition in memory
inline
uint64 font_size_from_file(const byte* data)
int32 font_data_size(const Font* font)
{
return SWAP_ENDIAN_LITTLE(*((uint32 *) data)) * sizeof(Glyph);
ASSERT_SIMPLE_CONST(sizeof(Glyph) == GLYPH_SIZE);
return font->glyph_count * sizeof(Glyph)
+ sizeof(font->glyph_count)
+ sizeof(font->texture_name)
+ sizeof(font->size)
+ sizeof(font->line_height);
}
inline
uint64 font_size(const Font* font)
{
// We have to remove the size of the pointer which will not be stored
return sizeof(font) - sizeof(Glyph*)
+ font->glyph_count * sizeof(Glyph);
}
void font_from_file(
Font* font,
int32 font_from_data(
const byte* data,
int32 size = 8
Font* font,
int32 steps = 8
)
{
const byte* pos = data;
@ -190,7 +192,7 @@ void font_from_file(
#if OPENGL
// @todo Implement y-offset correction
for (int32 i = 0; i < font->glyph_count; ++i) {
for (uint32 i = 0; i < font->glyph_count; ++i) {
float temp = font->glyphs[i].coords.y1;
font->glyphs[i].coords.y1 = 1.0f - font->glyphs[i].coords.y2;
font->glyphs[i].coords.y2 = 1.0f - temp;
@ -203,26 +205,17 @@ void font_from_file(
font->glyph_count * sizeof(Glyph) / 4, // everything in here is 4 bytes -> super easy to swap
steps
);
return font_data_size(font);
}
inline
int64 font_size_from_font(Font* font)
{
return font->glyph_count * sizeof(Glyph) + sizeof(Font);
}
void font_to_file(
RingMemory* ring,
const char* path,
int32 font_to_data(
const Font* font,
byte* data,
int32 steps = 8
)
{
FileBody file;
file.size = font->glyph_count * sizeof(Glyph) + sizeof(Font);
file.content = ring_get_memory(ring, file.size, 64);
byte* pos = file.content;
byte* pos = data;
// Glyph count
*((uint32 *) pos) = font->glyph_count;
@ -244,16 +237,16 @@ void font_to_file(
memcpy(pos, font->glyphs, font->glyph_count * sizeof(Glyph));
pos += font->glyph_count * sizeof(Glyph);
file.size = pos - file.content;
int32 size = (int32) (pos - data);
SWAP_ENDIAN_LITTLE_SIMD(
(int32 *) file.content,
(int32 *) file.content,
file.size / 4, // everything in here is 4 bytes -> super easy to swap
size / 4, // everything in here is 4 bytes -> super easy to swap
steps
);
file_write(path, &file);
return font_data_size(font);
}
#endif

View File

@ -13,6 +13,7 @@
#include "../../memory/RingMemory.h"
#include "../../utils/TestUtils.h"
#include "../../object/Texture.h"
#include "../../image/Image.cpp"
#include "../../utils/StringUtils.h"
#include "../../log/Log.h"
@ -136,6 +137,8 @@ void load_texture_to_gpu(const Texture* texture, int32 mipmap_level = 0)
if (mipmap_level > -1) {
glGenerateMipmap(GL_TEXTURE_2D);
}
LOG_INCREMENT_BY(DEBUG_COUNTER_GPU_UPLOAD, texture->image.pixel_count * image_pixel_size_from_type(texture->image.pixel_type));
}
inline
@ -162,17 +165,19 @@ GLuint shader_make(GLenum type, const char *source, RingMemory* ring)
GLint status;
glGetShaderiv(shader, GL_COMPILE_STATUS, &status);
if (status == GL_FALSE) {
GLint length;
glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &length);
#if DEBUG || INTERNAL
if (status == GL_FALSE) {
GLint length;
glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &length);
GLchar *info = (GLchar *) ring_get_memory(ring, length * sizeof(GLchar));
GLchar *info = (GLchar *) ring_get_memory(ring, length * sizeof(GLchar));
glGetShaderInfoLog(shader, length, NULL, info);
LOG(info, true, true);
glGetShaderInfoLog(shader, length, NULL, info);
LOG(info, true, true);
ASSERT_SIMPLE(false);
}
ASSERT_SIMPLE(false);
}
#endif
return shader;
}
@ -222,17 +227,19 @@ GLuint program_make(
GLint status;
glGetProgramiv(program, GL_LINK_STATUS, &status);
if (status == GL_FALSE) {
GLint length;
glGetProgramiv(program, GL_INFO_LOG_LENGTH, &length);
#if DEBUG || INTERNAL
if (status == GL_FALSE) {
GLint length;
glGetProgramiv(program, GL_INFO_LOG_LENGTH, &length);
GLchar *info = (GLchar *) ring_get_memory(ring, length * sizeof(GLchar));
GLchar *info = (GLchar *) ring_get_memory(ring, length * sizeof(GLchar));
glGetProgramInfoLog(program, length, NULL, info);
LOG(info, true, true);
glGetProgramInfoLog(program, length, NULL, info);
LOG(info, true, true);
ASSERT_SIMPLE(false);
}
ASSERT_SIMPLE(false);
}
#endif
// @question really?
if (geometry_shader > -1) {
@ -442,6 +449,8 @@ uint32 gpuapi_buffer_generate(int32 size, const void* data)
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glBufferData(GL_ARRAY_BUFFER, size, data, GL_STATIC_DRAW);
LOG_INCREMENT_BY(DEBUG_COUNTER_GPU_UPLOAD, size);
return vbo;
}
@ -454,6 +463,8 @@ uint32 gpuapi_buffer_generate_dynamic(int32 size, const void* data)
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glBufferData(GL_ARRAY_BUFFER, size, data, GL_DYNAMIC_DRAW);
LOG_INCREMENT_BY(DEBUG_COUNTER_GPU_UPLOAD, size);
return vbo;
}
@ -473,6 +484,8 @@ void gpuapi_buffer_update_dynamic(uint32 vbo, int32 size, const void* data)
{
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glBufferData(GL_ARRAY_BUFFER, size, data, GL_DYNAMIC_DRAW);
LOG_INCREMENT_BY(DEBUG_COUNTER_GPU_UPLOAD, size);
}
inline

View File

@ -161,4 +161,87 @@ void shader_check_compile_errors(uint32 id, char* log)
}
}
// Minifies a shader program source: strips comments, line breaks and redundant
// whitespace around operators/punctuation so the string can be shipped compactly.
// Returns the length of the written string excluding the null terminator
// (same semantics as strlen). `output` must hold at least strlen(input) + 1 bytes.
// NOTE(review): relies on the project helpers is_eol()/is_whitespace() — confirm
// they treat '\r' consistently with this function's '\n' handling.
int32 shader_program_optimize(const char* input, char* output)
{
    const char* read_ptr = input;
    char* write_ptr = output;
    bool in_string = false;

    while (*read_ptr) {
        // Remove leading whitespace
        while (*read_ptr == ' ' || *read_ptr == '\t' || is_eol(read_ptr)) {
            ++read_ptr;
        }

        // BUGFIX: if the input ended in trailing whitespace we are done;
        // previously a stray '\n' separator could still be appended below.
        if (!*read_ptr) {
            break;
        }

        // Statement separator: emit a single '\n' unless the previous output
        // character already separates tokens.
        // BUGFIX: never inject a separator inside a string literal.
        if (write_ptr != output && !in_string
            && *(write_ptr - 1) != '\n' && *(write_ptr - 1) != ';' && *(write_ptr - 1) != '{'
            && *(write_ptr - 1) != '('
            && *(write_ptr - 1) != ','
        ) {
            *write_ptr++ = '\n';
        }

        // Handle single-line comments (//)
        if (*read_ptr == '/' && *(read_ptr + 1) == '/' && !in_string) {
            // Go to end of line
            while (*read_ptr && *read_ptr != '\n') {
                ++read_ptr;
            }

            continue;
        }

        // Handle multi-line comments (/* */)
        if (*read_ptr == '/' && *(read_ptr + 1) == '*' && !in_string) {
            // Go to end of comment
            while (*read_ptr && (*read_ptr != '*' || *(read_ptr + 1) != '/')) {
                ++read_ptr;
            }

            if (*read_ptr == '*' && *(read_ptr + 1) == '/') {
                read_ptr += 2;
            }

            continue;
        }

        // Handle strings to avoid removing content within them.
        // BUGFIX: the quote itself must be copied and consumed here; the copy
        // loop below stops at '"', so leaving the quote unconsumed previously
        // caused an infinite loop that just kept toggling in_string.
        if (*read_ptr == '"') {
            in_string = !in_string;
            *write_ptr++ = *read_ptr++;
        }

        // Copy valid characters to write_ptr
        while (*read_ptr && !is_eol(read_ptr) && *read_ptr != '"'
            && !(*read_ptr == '/' && (*(read_ptr + 1) == '/' || *(read_ptr + 1) == '*'))
        ) {
            if (!in_string
                && (*read_ptr == '*' || *read_ptr == '/' || *read_ptr == '=' || *read_ptr == '+' || *read_ptr == '-' || *read_ptr == '%'
                    || *read_ptr == '(' || *read_ptr == ')'
                    || *read_ptr == '{' || *read_ptr == '}'
                    || *read_ptr == ',' || *read_ptr == '?' || *read_ptr == ':' || *read_ptr == ';'
                    || *read_ptr == '&' || *read_ptr == '|'
                    || *read_ptr == '>' || *read_ptr == '<'
                )
            ) {
                // Drop whitespace before the operator.
                // BUGFIX: guard against peeking before the start of `output`
                // when the very first output character is an operator.
                if (write_ptr != output
                    && (is_whitespace(*(write_ptr - 1)) || *(write_ptr - 1) == '\n')
                ) {
                    --write_ptr;
                }

                *write_ptr++ = *read_ptr++;

                // Drop whitespace after the operator
                if (*read_ptr && is_whitespace(*read_ptr)) {
                    ++read_ptr;
                }
            } else {
                *write_ptr++ = *read_ptr++;
            }
        }
    }

    *write_ptr = '\0';

    // Length excludes the terminating '\0', same as strlen
    return (int32) (write_ptr - output);
}
#endif

View File

@ -271,7 +271,7 @@ void generate_default_bitmap_references(const FileBody* file, Bitmap* bitmap)
void image_bmp_generate(const FileBody* src_data, Image* image)
{
// @performance We are generating the struct and then filling the data.
// There is some asignment/copy overhead
// There is some assignment/copy overhead
Bitmap src = {};
generate_default_bitmap_references(src_data, &src);
@ -285,7 +285,13 @@ void image_bmp_generate(const FileBody* src_data, Image* image)
uint32 pixel_bytes = src.dib_header.bits_per_pixel / 8;
byte alpha_offset = pixel_bytes > 3;
image->has_alpha |= (bool) alpha_offset;
if (pixel_bytes == 4) {
image->pixel_type = (byte) PIXEL_TYPE_RGBA;
} else if (pixel_bytes == 3) {
image->pixel_type = (byte) PIXEL_TYPE_RGB;
} else {
ASSERT_SIMPLE(false);
}
if (image->order_pixels == IMAGE_PIXEL_ORDER_BGRA
&& image->order_rows == IMAGE_ROW_ORDER_BOTTOM_TO_TOP
@ -331,7 +337,7 @@ void image_bmp_generate(const FileBody* src_data, Image* image)
// Add alpha channel at end of every RGB value
if (alpha_offset > 0) {
image->pixels[row_pos1 + x * pixel_bytes + 3] = src.pixels[row_pos2 + x * pixel_bytes + pixel_bytes + 3];
} else if (image->has_alpha) {
} else if (image->pixel_type == PIXEL_TYPE_RGBA) {
image->pixels[row_pos1 + x * pixel_bytes + 3] = 0xFF;
}
}

View File

@ -23,7 +23,7 @@
#include "Bitmap.h"
#include "Png.h"
void image_from_file(RingMemory* ring, const char* path, Image* image)
void image_from_file(Image* image, const char* path, RingMemory* ring)
{
FileBody file;
file_read(path, &file, ring);
@ -46,7 +46,7 @@ void image_flip_vertical(RingMemory* ring, Image* image)
// Last row
const byte* end = temp + image->pixel_count * sizeof(uint32) - image->width * sizeof(uint32);
for (int y = 0; y < image->height; ++y) {
for (uint32 y = 0; y < image->height; ++y) {
memcpy(image->pixels + y * stride, end - y * stride, stride);
}
@ -64,4 +64,90 @@ void image_flip_vertical(RingMemory* ring, Image* image)
image->order_rows = (byte) (!((bool) image->order_rows));
}
inline
int32 image_pixel_size_from_type(byte type)
{
    // Number of bytes a single pixel occupies for the given PIXEL_TYPE_* value.
    // Unknown types report 0 so callers can detect invalid data.
    if (type == PIXEL_TYPE_RGBA) {
        return 4;
    }

    if (type == PIXEL_TYPE_RGB) {
        return 3;
    }

    if (type == PIXEL_TYPE_MONO) {
        return 1;
    }

    // Float variants: 4 bytes per channel
    if (type == PIXEL_TYPE_RGBA_F) {
        return 16;
    }

    if (type == PIXEL_TYPE_RGB_F) {
        return 12;
    }

    if (type == PIXEL_TYPE_MONO_F) {
        return 4;
    }

    return 0;
}
// Deserializes an Image from its flat little-endian binary representation.
// Returns the number of bytes consumed from `data`.
// NOTE(review): image->pixels must already point at a buffer large enough for
// the pixel payload — this function does not allocate; confirm with callers.
int32 image_from_data(const byte* data, Image* image)
{
    const byte* in = data;

    image->width = SWAP_ENDIAN_LITTLE(*((uint32 *) in));
    in += sizeof(image->width);

    image->height = SWAP_ENDIAN_LITTLE(*((uint32 *) in));
    in += sizeof(image->height);

    image->pixel_count = SWAP_ENDIAN_LITTLE(*((uint32 *) in));
    in += sizeof(image->pixel_count);

    // Single-byte settings need no endian handling
    image->order_pixels = *in;
    in += sizeof(image->order_pixels);

    image->order_rows = *in;
    in += sizeof(image->order_rows);

    image->pixel_type = *in;
    in += sizeof(image->pixel_type);

    // The pixel payload size depends on the pixel format just parsed
    int32 payload = image_pixel_size_from_type(image->pixel_type) * image->pixel_count;
    memcpy(image->pixels, in, payload);
    in += payload;

    return (int32) (in - data);
}
// Serializes an Image into a flat little-endian binary blob.
// Returns the number of bytes written to `data`.
int32 image_to_data(const Image* image, byte* data)
{
    byte* out = data;

    *((uint32 *) out) = SWAP_ENDIAN_LITTLE(image->width);
    out += sizeof(image->width);

    *((uint32 *) out) = SWAP_ENDIAN_LITTLE(image->height);
    out += sizeof(image->height);

    *((uint32 *) out) = SWAP_ENDIAN_LITTLE(image->pixel_count);
    out += sizeof(image->pixel_count);

    // Single-byte settings need no endian handling
    *out = image->order_pixels;
    out += sizeof(image->order_pixels);

    *out = image->order_rows;
    out += sizeof(image->order_rows);

    *out = image->pixel_type;
    out += sizeof(image->pixel_type);

    // The pixel payload size depends on the pixel format
    int32 payload = image_pixel_size_from_type(image->pixel_type) * image->pixel_count;
    memcpy(out, image->pixels, payload);
    out += payload;

    return (int32) (out - data);
}
#endif

View File

@ -17,6 +17,16 @@
#define IMAGE_ROW_ORDER_TOP_TO_BOTTOM 0
#define IMAGE_ROW_ORDER_BOTTOM_TO_TOP 1
enum PixelType
{
PIXEL_TYPE_RGBA, // 4 bytes
PIXEL_TYPE_RGB, // 3 bytes
PIXEL_TYPE_MONO, // 1 byte
PIXEL_TYPE_RGBA_F, // 16 bytes
PIXEL_TYPE_RGB_F, // 12 bytes
PIXEL_TYPE_MONO_F, // 4 bytes
};
// This struct also functions as a setting on how to load the image data
// has_alpha is defined it forces an alpha channel even for bitmaps
// order_pixels defines how the pixels should be ordered
@ -27,11 +37,11 @@ struct Image {
uint32 pixel_count; // @question Do we even need this?
// Image settings
bool has_alpha;
byte order_pixels; // RGBA vs BGRA
byte order_rows; // top-to-bottom vs bottom-to-top
byte pixel_type; // Usually 4 or 3 bytes unless monochrome data
uint32* pixels; // owner of data
byte* pixels; // owner of data
};
#endif

View File

@ -623,7 +623,7 @@ bool image_png_generate(const FileBody* src_data, Image* image, RingMemory* ring
// essentially overwriting the **current** chunk header data, which doesn't matter since we already parsed it
// then we reset the pos pointer backwards to where we want to start... gg
// https://www.ietf.org/rfc/rfc1951.txt - defalte
// https://www.ietf.org/rfc/rfc1951.txt - deflate
// This data might be stored in the previous IDAT chunk?!
BFINAL = (uint8) BITS_GET_8_R2L(*stream.pos, stream.bit_pos, 1);
bits_walk(&stream, 1);
@ -783,7 +783,7 @@ bool image_png_generate(const FileBody* src_data, Image* image, RingMemory* ring
image->width = src.ihdr.width;
image->height = src.ihdr.height;
image->pixel_count = image->width * image->height;
image->has_alpha = (src.ihdr.color_type == 6);
image->pixel_type = (byte) (src.ihdr.color_type == 6 ? PIXEL_TYPE_RGBA : PIXEL_TYPE_RGB);
image->order_pixels = IMAGE_PIXEL_ORDER_RGBA;
image->order_rows = IMAGE_ROW_ORDER_TOP_TO_BOTTOM;

View File

@ -90,7 +90,13 @@ void image_tga_generate(const FileBody* src_data, Image* image)
uint32 pixel_bytes = src.header.bits_per_pixel / 8;
byte alpha_offset = pixel_bytes > 3;
image->has_alpha |= (bool) alpha_offset;
if (pixel_bytes == 4) {
image->pixel_type = (byte) PIXEL_TYPE_RGBA;
} else if (pixel_bytes == 3) {
image->pixel_type = (byte) PIXEL_TYPE_RGB;
} else {
ASSERT_SIMPLE(false);
}
// We can check same settings through equality since we use the same values
if (image->order_rows == src.header.vertical_ordering
@ -131,7 +137,7 @@ void image_tga_generate(const FileBody* src_data, Image* image)
// Add alpha channel at end of every RGB value
if (alpha_offset > 0) {
image->pixels[row_pos1 + x * pixel_bytes + 3] = src.pixels[row_pos2 + x * pixel_bytes + pixel_bytes + 3];
} else if (image->has_alpha) {
} else if (image->pixel_type == PIXEL_TYPE_RGBA) {
image->pixels[row_pos1 + x * pixel_bytes + 3] = 0xFF;
}
}

View File

@ -10,22 +10,32 @@
#include "../platform/linux/FileUtils.cpp"
#endif
#define LANGUAGE_VERSION 1
struct Language {
// WARNING: the actual start of data is data -= sizeof(count); see file loading below
byte* data;
int32 count;
int64 size;
char** lang;
};
void language_from_file_txt(
Language* language,
byte* data
const char* path,
RingMemory* ring
) {
FileBody file;
file_read(path, &file, ring);
ASSERT_SIMPLE(file.size);
// count elements
language->count = 1;
int64 len = 0;
byte* data = file.content;
while (data[len] != '\0') {
if (data[len] == '\n' && data[len + 1] == '\n') {
++language->count;
@ -36,6 +46,7 @@ void language_from_file_txt(
++len;
}
language->size = len;
language->lang = (char **) language->data;
memcpy(language->data + language->count * sizeof(char *), data, len);
@ -54,22 +65,35 @@ void language_from_file_txt(
}
}
int32 language_data_size(const Language* language)
{
return (int32) (language->size
+ sizeof(language->count)
+ sizeof(language->size)
+ language->count * sizeof(uint64)
);
}
// File layout - binary
// offsets for start of strings
// actual string data
void language_from_file(
int32 language_from_data(
const byte* data,
Language* language
) {
byte* pos = language->data;
const byte* pos = data;
// Count
language->count = SWAP_ENDIAN_LITTLE(*((int32 *) pos));
pos += sizeof(language->count);
language->lang = (char **) pos;
language->size = SWAP_ENDIAN_LITTLE(*((int32 *) pos));
pos += sizeof(language->size);
language->lang = (char **) language->data;
char** pos_lang = language->lang;
byte* start = pos;
byte* start = language->data;
// Load pointers/offsets
for (int32 i = 0; i < language->count; ++i) {
@ -77,28 +101,29 @@ void language_from_file(
pos += sizeof(uint64);
}
// We don't have to load the actual strings, they are already in ->data due to the file reading
memcpy(
language->data + language->count * sizeof(uint64),
pos,
language->size
);
return language_data_size(language);
}
void language_to_file(
RingMemory* ring,
const char* path,
Language* language
int32 language_to_data(
const Language* language,
byte* data
) {
FileBody file;
// Temporary file size for buffer
// @todo This is a bad placeholder. The problem is we don't know how much we actually need without stepping through the elements.
// I also don't want to add a size variable to the theme as it is useless in all other cases
file.size = MEGABYTE * 32;
file.content = ring_get_memory(ring, file.size, 64);
byte* pos = file.content;
byte* pos = data;
// Count
*((int32 *) pos) = SWAP_ENDIAN_LITTLE(language->count);
pos += sizeof(language->count);
// Count
*((int32 *) pos) = SWAP_ENDIAN_LITTLE((int32) language->size);
pos += sizeof(language->size);
byte* start = pos;
// Save pointers
@ -107,19 +132,14 @@ void language_to_file(
pos += sizeof(uint64);
}
int64 len_total = 0;
// Save actual strings
int64 len;
for (int32 i = 0; i < language->count; ++i) {
len = strlen(language->lang[i]);
len_total += len;
memcpy((char *) pos, language->lang[i], len + 1);
pos += len;
}
memcpy(
pos,
language->data + language->count * sizeof(uint64),
language->size
);
file.size = pos - file.content;
file_write(path, &file);
return language_data_size(language);
}
#endif

View File

@ -136,7 +136,12 @@ void update_timing_stat_end_continued(uint32 stat, const char* function)
inline
void update_timing_stat_reset(uint32 stat)
{
atomic_set((int32 *) debug_container->perf_stats[stat].function, 0);
spinlock_start(&debug_container->perf_stats_spinlock);
TimingStat* timing_stat = &debug_container->perf_stats[stat];
timing_stat->function = NULL;
timing_stat->delta_tick = 0;
timing_stat->delta_time = 0;
spinlock_end(&debug_container->perf_stats_spinlock);
}
inline
@ -146,13 +151,13 @@ void reset_counter(int32 id)
}
inline
void log_increment(int32 id, int32 by = 1)
// Atomically adds `by` to the global debug counter identified by `id`
// (e.g. draw-call or GPU-upload counters).
void log_increment(int32 id, int64 by = 1)
{
    atomic_add(&debug_container->counter[id], by);
}
inline
void log_counter(int32 id, int32 value)
// Atomically overwrites the global debug counter identified by `id` with `value`.
void log_counter(int32 id, int64 value)
{
    atomic_set(&debug_container->counter[id], value);
}
@ -215,11 +220,13 @@ void debug_memory_log(uint64 start, uint64 size, int32 type, const char* functio
return;
}
if (mem->action_idx == DEBUG_MEMORY_RANGE_MAX) {
mem->action_idx = 0;
uint64 idx = atomic_add_fetch(&mem->action_idx, 1);
if (idx >= ARRAY_COUNT(mem->last_action)) {
atomic_set(&mem->action_idx, 1);
idx %= ARRAY_COUNT(mem->last_action);
}
DebugMemoryRange* dmr = &mem->last_action[mem->action_idx];
DebugMemoryRange* dmr = &mem->last_action[idx];
dmr->type = type;
dmr->start = start - mem->start;
dmr->size = size;
@ -228,8 +235,6 @@ void debug_memory_log(uint64 start, uint64 size, int32 type, const char* functio
dmr->time = __rdtsc();
dmr->function_name = function;
++mem->action_idx;
if (type < 0 && mem->usage < size * -type) {
mem->usage = 0;
} else {
@ -248,11 +253,13 @@ void debug_memory_reserve(uint64 start, uint64 size, int32 type, const char* fun
return;
}
if (mem->reserve_action_idx == DEBUG_MEMORY_RANGE_MAX) {
mem->reserve_action_idx = 0;
uint64 idx = atomic_add_fetch(&mem->reserve_action_idx, 1);
if (idx >= ARRAY_COUNT(mem->reserve_action)) {
atomic_set(&mem->reserve_action_idx, 1);
idx %= ARRAY_COUNT(mem->last_action);
}
DebugMemoryRange* dmr = &mem->reserve_action[mem->reserve_action_idx];
DebugMemoryRange* dmr = &mem->reserve_action[idx];
dmr->type = type;
dmr->start = start - mem->start;
dmr->size = size;
@ -260,10 +267,9 @@ void debug_memory_reserve(uint64 start, uint64 size, int32 type, const char* fun
// We are using rdtsc since it is faster -> less debugging overhead than using time()
dmr->time = __rdtsc();
dmr->function_name = function;
++mem->reserve_action_idx;
}
// @bug This probably requires thread safety
inline
void debug_memory_reset()
{
@ -271,7 +277,8 @@ void debug_memory_reset()
return;
}
uint64 time = __rdtsc() - 1000000000;
// We remove debug information that are "older" than 1GHz
uint64 time = __rdtsc() - 1 * GHZ;
for (uint64 i = 0; i < debug_container->dmc.memory_element_idx; ++i) {
for (int32 j = 0; j < DEBUG_MEMORY_RANGE_MAX; ++j) {
@ -282,6 +289,7 @@ void debug_memory_reset()
}
}
// @bug This probably requires thread safety
byte* log_get_memory(uint64 size, byte aligned = 1, bool zeroed = false)
{
if (!debug_container) {

View File

@ -45,7 +45,7 @@ struct DebugContainer {
LogMemory log_memory;
// Used to log general int values (e.g. counter for draw calls etc.)
int32* counter;
int64* counter;
#if _WIN32
HANDLE log_fp;

View File

@ -36,8 +36,8 @@ enum LogDataType {
void log_to_file();
void log(const char* str, bool should_log, bool save, const char* file, const char* function, int32 line);
void log(const char* format, LogDataType data_type, void* data, bool should_log, bool save, const char* file, const char* function, int32 line);
void log_increment(int32, int32);
void log_counter(int32, int32);
void log_increment(int32, int64);
void log_counter(int32, int64);
#if (LOG_LEVEL == 0)
// Don't perform any logging at log level 0

View File

@ -23,6 +23,10 @@
// @todo Implement intrinsic versions!
// INFO: I thought we could remove some of the functions. Sometimes we have a function that modifies the original value and then we also have the same function that fills a new result value.
// On gcc the optimized code creates the same assembly if we would just choose to return the new value vs. modifying a value by pointer.
// However, on MSVC this is not the case and the pointer version has more and slower assembly code for the pass-by-value function
inline
void vec2_normalize(f32* __restrict x, f32* __restrict y)
{

View File

@ -38,13 +38,30 @@ void queue_free(Queue* queue)
ring_free(queue);
}
inline
bool queue_is_empty(Queue* queue) {
    // The queue is empty when the read pointer has caught up with the write pointer
    bool no_pending = (queue->head == queue->tail);

    return no_pending;
}
inline
// Empties the queue by rewinding the write pointer (head) back onto the read
// pointer (tail), discarding all pending elements.
// NOTE(review): `=` is an assignment here, not a comparison — presumably
// intentional given the function name, but the bool result is just the
// assigned pointer converted to bool (true for any valid memory); verify that
// callers do not rely on the return value, and confirm head (not tail) is the
// pointer that should move.
bool queue_set_empty(Queue* queue) {
    return queue->head = queue->tail;
}
inline
bool queue_is_full(Queue* queue, uint64 size, byte aligned = 0) {
    // Full means the underlying ring cannot safely commit `size` more bytes
    bool has_room = ring_commit_safe((RingMemory *) queue, size, aligned);

    return !has_room;
}
// Conditional Lock
inline
void queue_enqueue(Queue* queue, byte* data, uint64 size, byte aligned = 0)
byte* queue_enqueue(Queue* queue, byte* data, uint64 size, byte aligned = 0)
{
    // Reserve a slot in the ring, copy the payload into it, and only then
    // publish the element by advancing the head (write) pointer.
    // Returns the address of the enqueued element inside the ring.
    byte* slot = ring_get_memory_nomove(queue, size, aligned);
    memcpy(slot, data, size);

    ring_move_pointer(queue, &queue->head, size, aligned);

    return slot;
}
inline
@ -60,10 +77,34 @@ void queue_enqueue_end(Queue* queue, uint64 size, byte aligned = 0)
}
inline
byte* queue_dequeue(Queue* queue, byte* data, uint64 size, byte aligned = 0)
bool queue_dequeue(Queue* queue, byte* data, uint64 size, byte aligned = 0)
{
memcpy(data, queue->tail, size);
if (queue->head == queue->tail) {
return false;
}
if (size == 4) {
*((int32 *) data) = *((int32 *) queue->tail);
} else {
memcpy(data, queue->tail, size);
}
ring_move_pointer(queue, &queue->tail, size, aligned);
return true;
}
inline
// Dequeues without copying: hands out a pointer into the ring itself and
// advances the read pointer. Returns NULL when the queue is empty.
// The caller must consume the element before the slot gets overwritten.
byte* queue_dequeue_keep(Queue* queue, uint64 size, byte aligned = 0)
{
    if (queue->head == queue->tail) {
        return NULL;
    }

    byte* element = queue->tail;
    ring_move_pointer(queue, &queue->tail, size, aligned);

    return element;
}
inline

View File

@ -58,7 +58,7 @@ void ring_alloc(RingMemory* ring, uint64 size, int32 alignment = 64)
? (byte *) platform_alloc(size)
: (byte *) platform_alloc_aligned(size, alignment);
ring->end = ring->memory + size;;
ring->end = ring->memory + size;
ring->head = ring->memory;
ring->tail = ring->memory;
ring->size = size;
@ -77,7 +77,7 @@ void ring_init(RingMemory* ring, BufferMemory* buf, uint64 size, int32 alignment
ring->memory = buffer_get_memory(buf, size, alignment, true);
ring->end = ring->memory + size;;
ring->end = ring->memory + size;
ring->head = ring->memory;
ring->tail = ring->memory;
ring->size = size;
@ -96,7 +96,7 @@ void ring_init(RingMemory* ring, byte* buf, uint64 size, int32 alignment = 64)
// @bug what if an alignment is defined?
ring->memory = buf;
ring->end = ring->memory + size;;
ring->end = ring->memory + size;
ring->head = ring->memory;
ring->tail = ring->memory;
ring->size = size;
@ -110,12 +110,12 @@ void ring_init(RingMemory* ring, byte* buf, uint64 size, int32 alignment = 64)
}
inline
void ring_free(RingMemory* buf)
void ring_free(RingMemory* ring)
{
if (buf->alignment < 2) {
platform_free((void **) &buf->memory);
if (ring->alignment < 2) {
platform_free((void **) &ring->memory);
} else {
platform_aligned_free((void **) &buf->memory);
platform_aligned_free((void **) &ring->memory);
}
}

View File

@ -6,8 +6,10 @@
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_MEMORY_QUEUE_H
#define TOS_MEMORY_QUEUE_H
#ifndef TOS_MEMORY_THREADED_QUEUE_H
#define TOS_MEMORY_THREADED_QUEUE_H
// @todo This is a horrible implementation. Please implement a lock free solution
#include "../stdlib/Types.h"
#include "../utils/Utils.h"
@ -47,7 +49,7 @@ struct ThreadedQueue {
};
inline
void threaded_queue_alloc(ThreadedQueue* queue, uint32 element_count, uint64 element_size, int32 alignment = 64)
void thrd_queue_alloc(ThreadedQueue* queue, uint32 element_count, uint64 element_size, int32 alignment = 64)
{
ring_alloc((RingMemory *) queue, element_count * element_size, alignment);
@ -59,7 +61,7 @@ void threaded_queue_alloc(ThreadedQueue* queue, uint32 element_count, uint64 ele
}
inline
void threaded_queue_init(ThreadedQueue* queue, BufferMemory* buf, uint32 element_count, uint64 element_size, int32 alignment = 64)
void thrd_queue_init(ThreadedQueue* queue, BufferMemory* buf, uint32 element_count, uint64 element_size, int32 alignment = 64)
{
ring_init((RingMemory *) queue, buf, element_count * element_size, alignment);
@ -71,7 +73,7 @@ void threaded_queue_init(ThreadedQueue* queue, BufferMemory* buf, uint32 element
}
inline
void threaded_queue_init(ThreadedQueue* queue, byte* buf, uint32 element_count, uint64 element_size, int32 alignment = 64)
void thrd_queue_init(ThreadedQueue* queue, byte* buf, uint32 element_count, uint64 element_size, int32 alignment = 64)
{
ring_init((RingMemory *) queue, buf, element_count * element_size, alignment);
@ -83,7 +85,7 @@ void threaded_queue_init(ThreadedQueue* queue, byte* buf, uint32 element_count,
}
inline
void threaded_queue_free(ThreadedQueue* queue)
void thrd_queue_free(ThreadedQueue* queue)
{
ring_free((RingMemory *) queue);
sem_destroy(&queue->empty);
@ -92,9 +94,9 @@ void threaded_queue_free(ThreadedQueue* queue)
pthread_cond_destroy(&queue->cond);
}
// @todo Create enqueue_unique
// @todo Create enqueue_unique and enqueue_unique_sem
inline
void threaded_queue_enqueue_unique_wait(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
void thrd_queue_enqueue_unique_wait(ThreadedQueue* queue, const byte* data, uint64 size, byte aligned = 0)
{
ASSERT_SIMPLE((uint64_t) data % 4 == 0);
pthread_mutex_lock(&queue->mutex);
@ -113,7 +115,7 @@ void threaded_queue_enqueue_unique_wait(ThreadedQueue* queue, byte* data, uint64
ring_move_pointer((RingMemory *) queue, &tail, size, aligned);
}
while (!ring_commit_safe((RingMemory *) queue, size)) {
while (!ring_commit_safe((RingMemory *) queue, size, aligned)) {
pthread_cond_wait(&queue->cond, &queue->mutex);
}
@ -125,7 +127,7 @@ void threaded_queue_enqueue_unique_wait(ThreadedQueue* queue, byte* data, uint64
}
inline
void threaded_queue_enqueue_unique(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
void thrd_queue_enqueue_unique(ThreadedQueue* queue, const byte* data, uint64 size, byte aligned = 0)
{
ASSERT_SIMPLE((uint64_t) data % 4 == 0);
pthread_mutex_lock(&queue->mutex);
@ -144,7 +146,7 @@ void threaded_queue_enqueue_unique(ThreadedQueue* queue, byte* data, uint64 size
ring_move_pointer((RingMemory *) queue, &tail, size, aligned);
}
if (!ring_commit_safe((RingMemory *) queue, size)) {
if (!ring_commit_safe((RingMemory *) queue, size, aligned)) {
pthread_mutex_unlock(&queue->mutex);
return;
@ -159,11 +161,11 @@ void threaded_queue_enqueue_unique(ThreadedQueue* queue, byte* data, uint64 size
// Conditional Lock
inline
void threaded_queue_enqueue(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
void thrd_queue_enqueue(ThreadedQueue* queue, const byte* data, uint64 size, byte aligned = 0)
{
pthread_mutex_lock(&queue->mutex);
if (!ring_commit_safe((RingMemory *) queue, size)) {
if (!ring_commit_safe((RingMemory *) queue, size, aligned)) {
pthread_mutex_unlock(&queue->mutex);
return;
@ -177,11 +179,11 @@ void threaded_queue_enqueue(ThreadedQueue* queue, byte* data, uint64 size, byte
}
inline
void threaded_queue_enqueue_wait(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
void thrd_queue_enqueue_wait(ThreadedQueue* queue, const byte* data, uint64 size, byte aligned = 0)
{
pthread_mutex_lock(&queue->mutex);
while (!ring_commit_safe((RingMemory *) queue, size)) {
while (!ring_commit_safe((RingMemory *) queue, size, aligned)) {
pthread_cond_wait(&queue->cond, &queue->mutex);
}
@ -193,7 +195,7 @@ void threaded_queue_enqueue_wait(ThreadedQueue* queue, byte* data, uint64 size,
}
inline
byte* threaded_queue_enqueue_start_wait(ThreadedQueue* queue, uint64 size, byte aligned = 0)
byte* thrd_queue_enqueue_start_wait(ThreadedQueue* queue, uint64 size, byte aligned = 0)
{
pthread_mutex_lock(&queue->mutex);
@ -205,33 +207,61 @@ byte* threaded_queue_enqueue_start_wait(ThreadedQueue* queue, uint64 size, byte
}
inline
void threaded_queue_enqueue_end_wait(ThreadedQueue* queue)
void thrd_queue_enqueue_end_wait(ThreadedQueue* queue)
{
pthread_cond_signal(&queue->cond);
pthread_mutex_unlock(&queue->mutex);
}
inline
void threaded_queue_dequeue(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
bool thrd_queue_dequeue(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
{
pthread_mutex_lock(&queue->mutex);
if (queue->head == queue->tail) {
return false;
}
// we do this twice because the first one is very fast but may return a false positive
pthread_mutex_lock(&queue->mutex);
if (queue->head == queue->tail) {
pthread_mutex_unlock(&queue->mutex);
return;
return false;
}
memcpy(data, queue->tail, size);
if (size == 4) {
*((int32 *) data) = *((int32 *) queue->tail);
} else {
memcpy(data, queue->tail, size);
}
ring_move_pointer((RingMemory *) queue, &queue->tail, size, aligned);
pthread_cond_signal(&queue->cond);
pthread_mutex_unlock(&queue->mutex);
return true;
}
inline
bool thrd_queue_empty(ThreadedQueue* queue) {
pthread_mutex_lock(&queue->mutex);
bool is_empty = queue->head == queue->tail;
pthread_mutex_unlock(&queue->mutex);
return is_empty;
}
inline
bool thrd_queue_full(ThreadedQueue* queue, uint64 size, byte aligned = 0) {
pthread_mutex_lock(&queue->mutex);
bool is_full = !ring_commit_safe((RingMemory *) queue, size, aligned);
pthread_mutex_unlock(&queue->mutex);
return is_full;
}
// Waits until a dequeue is available
inline
void threaded_queue_dequeue_wait(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
void thrd_queue_dequeue_wait(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
{
pthread_mutex_lock(&queue->mutex);
@ -247,7 +277,7 @@ void threaded_queue_dequeue_wait(ThreadedQueue* queue, byte* data, uint64 size,
}
inline
byte* threaded_queue_dequeue_start_wait(ThreadedQueue* queue)
byte* thrd_queue_dequeue_start_wait(ThreadedQueue* queue)
{
pthread_mutex_lock(&queue->mutex);
@ -259,7 +289,7 @@ byte* threaded_queue_dequeue_start_wait(ThreadedQueue* queue)
}
inline
void threaded_queue_dequeue_end_wait(ThreadedQueue* queue, uint64 size, byte aligned = 0)
void thrd_queue_dequeue_end_wait(ThreadedQueue* queue, uint64 size, byte aligned = 0)
{
ring_move_pointer((RingMemory *) queue, &queue->tail, size, aligned);
@ -269,7 +299,7 @@ void threaded_queue_dequeue_end_wait(ThreadedQueue* queue, uint64 size, byte ali
// Semaphore Lock
inline
void threaded_queue_enqueue_sem_wait(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
void thrd_queue_enqueue_sem_wait(ThreadedQueue* queue, const byte* data, uint64 size, byte aligned = 0)
{
sem_wait(&queue->empty);
pthread_mutex_lock(&queue->mutex);
@ -282,7 +312,25 @@ void threaded_queue_enqueue_sem_wait(ThreadedQueue* queue, byte* data, uint64 si
}
inline
byte* threaded_queue_enqueue_start_sem_wait(ThreadedQueue* queue, uint64 size, byte aligned = 0)
bool thrd_queue_enqueue_sem_timedwait(ThreadedQueue* queue, const byte* data, uint64 size, uint64 wait, byte aligned = 0)
{
if (sem_timedwait(&queue->empty, wait)) {
return false;
}
pthread_mutex_lock(&queue->mutex);
byte* mem = ring_get_memory((RingMemory *) queue, size, aligned);
memcpy(mem, data, size);
pthread_mutex_unlock(&queue->mutex);
sem_post(&queue->full);
return true;
}
inline
byte* thrd_queue_enqueue_start_sem_wait(ThreadedQueue* queue, uint64 size, byte aligned = 0)
{
sem_wait(&queue->empty);
pthread_mutex_lock(&queue->mutex);
@ -291,14 +339,14 @@ byte* threaded_queue_enqueue_start_sem_wait(ThreadedQueue* queue, uint64 size, b
}
inline
void threaded_queue_enqueue_end_sem_wait(ThreadedQueue* queue)
void thrd_queue_enqueue_end_sem_wait(ThreadedQueue* queue)
{
pthread_mutex_unlock(&queue->mutex);
sem_post(&queue->full);
}
inline
byte* threaded_queue_dequeue_sem_wait(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
byte* thrd_queue_dequeue_sem_wait(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
{
sem_wait(&queue->full);
pthread_mutex_lock(&queue->mutex);
@ -311,7 +359,25 @@ byte* threaded_queue_dequeue_sem_wait(ThreadedQueue* queue, byte* data, uint64 s
}
inline
byte* threaded_queue_dequeue_start_sem_wait(ThreadedQueue* queue)
bool thrd_queue_dequeue_sem_timedwait(ThreadedQueue* queue, byte* data, uint64 size, uint64 wait, byte aligned = 0)
{
if (sem_timedwait(&queue->full, wait)) {
return false;
}
pthread_mutex_lock(&queue->mutex);
memcpy(data, queue->tail, size);
ring_move_pointer((RingMemory *) queue, &queue->tail, size, aligned);
pthread_mutex_unlock(&queue->mutex);
sem_post(&queue->empty);
return true;
}
inline
byte* thrd_queue_dequeue_start_sem_wait(ThreadedQueue* queue)
{
sem_wait(&queue->full);
pthread_mutex_lock(&queue->mutex);
@ -320,7 +386,7 @@ byte* threaded_queue_dequeue_start_sem_wait(ThreadedQueue* queue)
}
inline
void threaded_queue_dequeue_end_sem_wait(ThreadedQueue* queue, uint64 size, byte aligned = 0)
void thrd_queue_dequeue_end_sem_wait(ThreadedQueue* queue, uint64 size, byte aligned = 0)
{
ring_move_pointer((RingMemory *) queue, &queue->tail, size, aligned);

163
memory/ThreadedRingMemory.h Normal file
View File

@ -0,0 +1,163 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_MEMORY_THREADED_RING_MEMORY_H
#define TOS_MEMORY_THREADED_RING_MEMORY_H
#include "RingMemory.h"
#if _WIN32
#include "../platform/win32/threading/Thread.h"
#elif __linux__
#include "../platform/linux/threading/Thread.h"
#endif
// @todo This is a horrible implementation. Please implement a lock free solution
// Mutex-guarded variant of RingMemory.
// WARNING: the leading members must match RingMemory exactly, because every
// thrd_ring_* function casts this struct to RingMemory*.
struct ThreadedRingMemory {
    byte* memory; // owner of the allocation (freed by thrd_ring_free)
    byte* end;    // one past the last byte (memory + size)
    byte* head;   // write cursor
    // This variable is usually only used by single producer/consumer code mostly found in threads.
    // One thread inserts elements -> updates head
    // The other thread reads elements -> updates tail
    // This code itself doesn't change this variable
    byte* tail;   // read cursor
    uint64 size;  // total capacity in bytes
    int32 alignment;         // alignment of the backing allocation
    int32 element_alignment; // default alignment for individual elements
    pthread_mutex_t mutex;   // guards all thrd_ring_* operations below
};
// @bug alignment should also include the end point, not just the start
inline
void thrd_ring_alloc(ThreadedRingMemory* ring, uint64 size, int32 alignment = 64)
{
    // Prepare the lock first, then defer the actual allocation to the
    // plain ring implementation (the structs are cast-compatible).
    pthread_mutex_init(&ring->mutex, NULL);
    ring_alloc((RingMemory *) ring, size, alignment);
}
inline
void thrd_ring_init(ThreadedRingMemory* ring, BufferMemory* buf, uint64 size, int32 alignment = 64)
{
    // Create the mutex up front, then carve the ring out of the buffer
    // through the non-threaded implementation.
    pthread_mutex_init(&ring->mutex, NULL);
    ring_init((RingMemory *) ring, buf, size, alignment);
}
inline
void thrd_ring_init(ThreadedRingMemory* ring, byte* buf, uint64 size, int32 alignment = 64)
{
    // Create the mutex up front, then wrap the caller-provided buffer
    // through the non-threaded implementation.
    pthread_mutex_init(&ring->mutex, NULL);
    ring_init((RingMemory *) ring, buf, size, alignment);
}
inline
void thrd_ring_free(ThreadedRingMemory* ring)
{
    // Tear down the lock before releasing the backing memory; no other
    // thread may still be using the ring at this point.
    pthread_mutex_destroy(&ring->mutex);
    ring_free((RingMemory *) ring);
}
inline
byte* thrd_ring_calculate_position(ThreadedRingMemory* ring, uint64 size, byte aligned = 0)
{
    // Locked pass-through to ring_calculate_position.
    pthread_mutex_lock(&ring->mutex);
    byte* pos = ring_calculate_position((RingMemory *) ring, size, aligned);
    pthread_mutex_unlock(&ring->mutex);

    return pos;
}
// Resets head/tail under the lock so concurrent users never observe a
// partially reset ring.
inline
void thrd_ring_reset(ThreadedRingMemory* ring)
{
    pthread_mutex_lock(&ring->mutex);
    ring_reset((RingMemory *) ring);
    pthread_mutex_unlock(&ring->mutex);
}
// Moves a pointer based on the size you want to consume (new position = after consuming size)
// Locked pass-through to ring_move_pointer.
// Marked inline like the sibling functions: this is defined in a header, so a
// non-inline definition would cause multiple-definition link errors when the
// header is included from more than one translation unit.
inline
void thrd_ring_move_pointer(ThreadedRingMemory* ring, byte** pos, uint64 size, byte aligned = 0)
{
    pthread_mutex_lock(&ring->mutex);
    ring_move_pointer((RingMemory *) ring, pos, size, aligned);
    pthread_mutex_unlock(&ring->mutex);
}
// Locked pass-through to ring_get_memory: reserves `size` bytes and moves
// the head past them. Returns the reserved region.
// Marked inline like the sibling functions: this is defined in a header, so a
// non-inline definition would cause multiple-definition link errors when the
// header is included from more than one translation unit.
inline
byte* thrd_ring_get_memory(ThreadedRingMemory* ring, uint64 size, byte aligned = 0, bool zeroed = false)
{
    pthread_mutex_lock(&ring->mutex);
    byte* result = ring_get_memory((RingMemory *) ring, size, aligned, zeroed);
    pthread_mutex_unlock(&ring->mutex);

    return result;
}
// Same as ring_get_memory but DOESN'T move the head
// Marked inline like the sibling functions: this is defined in a header, so a
// non-inline definition would cause multiple-definition link errors when the
// header is included from more than one translation unit.
inline
byte* thrd_ring_get_memory_nomove(ThreadedRingMemory* ring, uint64 size, byte aligned = 0, bool zeroed = false)
{
    pthread_mutex_lock(&ring->mutex);
    byte* result = ring_get_memory_nomove((RingMemory *) ring, size, aligned, zeroed);
    pthread_mutex_unlock(&ring->mutex);

    return result;
}
// Used if the ring only contains elements of a certain size
// This way you can get a certain element
inline
byte* thrd_ring_get_element(ThreadedRingMemory* ring, uint64 element_count, uint64 element, uint64 size)
{
    // Locked pass-through to ring_get_element.
    pthread_mutex_lock(&ring->mutex);
    byte* element_ptr = ring_get_element((RingMemory *) ring, element_count, element, size);
    pthread_mutex_unlock(&ring->mutex);

    return element_ptr;
}
/**
 * Checks if one additional element can be inserted without overwriting the tail index
 */
inline
bool thrd_ring_commit_safe(ThreadedRingMemory* ring, uint64 size, byte aligned = 0)
{
    // Locked pass-through to ring_commit_safe.
    pthread_mutex_lock(&ring->mutex);
    bool committable = ring_commit_safe((RingMemory *) ring, size, aligned);
    pthread_mutex_unlock(&ring->mutex);

    return committable;
}
inline
void thrd_ring_force_head_update(const ThreadedRingMemory* ring)
{
    // Flushes the cache line containing the bytes head points at.
    // NOTE(review): _mm_clflush acts on the pointed-to data, not on the
    // head pointer member itself — confirm this is the intended effect.
    _mm_clflush(ring->head);
}
inline
void thrd_ring_force_tail_update(const ThreadedRingMemory* ring)
{
    // Flushes the cache line containing the bytes tail points at.
    // NOTE(review): _mm_clflush acts on the pointed-to data, not on the
    // tail pointer member itself — confirm this is the intended effect.
    _mm_clflush(ring->tail);
}
inline
int64 thrd_ring_dump(ThreadedRingMemory* ring, byte* data)
{
    // Serialize the ring into `data` while holding the lock so the
    // snapshot is consistent.
    pthread_mutex_lock(&ring->mutex);
    int64 dumped = ring_dump((RingMemory *) ring, data);
    pthread_mutex_unlock(&ring->mutex);

    return dumped;
}
#endif

View File

@ -29,7 +29,7 @@
// maybe make a mesh hold other meshes?
// @todo handle vertices arrays where for example no texture coordinates are defined/used
struct Mesh {
byte* data; // memory owner that subdevides into the pointers below
byte* data; // memory owner that subdivides into the pointers below
// @todo Implement the version into the file, currently not implemented
int32 version;
@ -70,13 +70,17 @@ struct Mesh {
};
// @todo also handle textures etc.
// WARNING: mesh needs to have memory already reserved and asigned to data
// WARNING: mesh needs to have memory already reserved and assigned to data
void mesh_from_file_txt(
Mesh* mesh,
byte* data,
const char* path,
RingMemory* ring
) {
char* pos = (char *) data;
FileBody file;
file_read(path, &file, ring);
ASSERT_SIMPLE(file.size);
char* pos = (char *) file.content;
// move past the version string
pos += 8;
@ -458,19 +462,15 @@ enum MeshLoadingRestriction {
// @todo sometimes we don't care about some data, we should have an option which defines which data should be loaded
// this can improve performance for algorithms on this. e.g.:
// on the server side we only care about the vertex positions for collision (no normals, no color, ...)
int32 mesh_from_file(
RingMemory* ring,
const char* path,
int32 mesh_from_data(
const byte* data,
Mesh* mesh,
const char* group = NULL,
int32 load_format = MESH_LOADING_RESTRICTION_EVERYTHING,
int32 steps = 8
)
{
FileBody file;
file_read(path, &file, ring);
byte* pos = file.content;
const byte* pos = data;
// Read version
mesh->version = *((int32 *) pos);
@ -537,24 +537,24 @@ int32 mesh_from_file(
return offset;
}
void mesh_to_file(
RingMemory* ring,
const char* path,
// @bug this is wrong, since it is the max size
// We would have to check the vertex format to calculate the actual size
int32 mesh_data_size(const Mesh* mesh)
{
return sizeof(mesh->version)
+ sizeof(mesh->vertex_type)
+ sizeof(mesh->vertex_count)
+ 12 * sizeof(f32) * mesh->vertex_count; // 12 is the maximum value
}
int32 mesh_to_data(
const Mesh* mesh,
byte* data,
int32 vertex_save_format = VERTEX_TYPE_ALL,
int32 steps = 8
)
{
FileBody file;
// Temporary file size for buffer
// @todo check the actual size, we are currently more or less guessing
file.size = sizeof(mesh)
+ sizeof(Vertex3D) * mesh->vertex_count
+ 4096;
file.content = ring_get_memory(ring, file.size, 64);
byte* pos = file.content;
byte* pos = data;
// version
memcpy(pos, &mesh->version, sizeof(mesh->version));
@ -571,7 +571,7 @@ void mesh_to_file(
memcpy(pos, &mesh->vertex_count, sizeof(mesh->vertex_count));
pos += sizeof(mesh->vertex_count);
// verticies
// vertices
int32 vertex_size = 0;
if (mesh->vertex_type & VERTEX_TYPE_POSITION) {
vertex_size += 3;
@ -614,16 +614,16 @@ void mesh_to_file(
pos += vertex_size * sizeof(f32) * mesh->vertex_count;
}
file.size = pos - file.content;
int32 size = (int32) (pos - data);
SWAP_ENDIAN_LITTLE_SIMD(
(int32 *) file.content,
(int32 *) file.content,
file.size / 4, // everything in here is 4 bytes -> super easy to swap
(int32 *) data,
(int32 *) data,
size / 4, // everything in here is 4 bytes -> super easy to swap
steps
);
file_write(path, &file);
return size;
}
#endif

View File

@ -13,6 +13,8 @@
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/limits.h>
#include <stdarg.h>
@ -28,7 +30,54 @@
#define MAX_PATH PATH_MAX
#endif
typedef int32 FileHandler;
typedef int32 FileHandle;
typedef int MMFHandle;
inline
MMFHandle file_mmf_handle(FileHandle fp) {
    // On Linux a memory-mapped-file handle is just the file descriptor
    // itself (mmap takes an fd directly); mirrors the Win32 API shape.
    return fp;
}
// Maps [offset, offset + length) of the file read-only and returns a pointer
// to the byte at `offset`. If length == 0, maps from offset to end of file.
// Returns null on failure. Release with mmf_region_release.
inline
void* mmf_region_init(MMFHandle fh, size_t offset, size_t length = 0) {
    if (length == 0) {
        struct stat st;
        // Guard against offset at/behind EOF as well: the original
        // subtraction would underflow to a huge map length.
        if (fstat(fh, &st) != 0 || (size_t) st.st_size <= offset) {
            return null;
        }

        length = st.st_size - offset;
    }

    size_t page_size = sysconf(_SC_PAGESIZE);

    // mmap requires the file offset to be page-aligned; round it down and
    // remember the padding so the caller still gets a pointer to `offset`.
    size_t aligned_offset = offset & ~(page_size - 1);
    size_t offset_diff = offset - aligned_offset;
    size_t map_length = length + offset_diff;

    void *mapped_region = mmap(nullptr, map_length, PROT_READ, MAP_PRIVATE, fh, aligned_offset);
    if (mapped_region == MAP_FAILED) {
        return null;
    }

    return (char *) mapped_region + offset_diff;
}
// Unmaps a region returned by mmf_region_init. `length` must be the same
// logical length the region was mapped with.
inline
void mmf_region_release(void* region, size_t length = 0) {
    size_t page_size = sysconf(_SC_PAGESIZE);

    // mmap returned a page-aligned base; mmf_region_init advanced the
    // pointer past the alignment padding, so round back down to the base.
    void *aligned_region = (void *) ((uintptr_t) region & ~(page_size - 1));
    size_t offset_diff = (size_t) ((uintptr_t) region - (uintptr_t) aligned_region);

    // munmap must cover the padding bytes as well, otherwise the tail of
    // the mapping stays resident (and munmap(addr, 0) always fails with
    // EINVAL, which made the previous default-argument call a no-op leak).
    munmap(aligned_region, length + offset_diff);
}
inline
void file_mmf_close(MMFHandle fh) {
    // The Linux MMF handle is the file descriptor itself, so closing the
    // mapping handle is just closing the fd; per POSIX, regions already
    // mapped from it remain valid until munmap.
    close(fh);
}
inline
void relative_to_absolute(const char* rel, char* path)
@ -77,8 +126,8 @@ uint64 file_last_modified(const char* filename)
}
inline
FileHandler file_append_handle(const char* path) {
FileHandler fp;
FileHandle file_append_handle(const char* path) {
FileHandle fp;
if (*path == '.') {
char full_path[MAX_PATH];
relative_to_absolute(path, full_path);
@ -151,6 +200,9 @@ bool file_copy(const char* src, const char* dst) {
close(src_fd);
close(dst_fd);
LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_READ, bytes_read);
LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_WRITE, bytes_written);
return success;
}
@ -205,6 +257,8 @@ void file_read(const char* path, FileBody* file, RingMemory* ring) {
file->content[bytes_read] = '\0';
file->size = bytes_read;
LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_READ, bytes_read);
close(fp);
}
@ -235,11 +289,13 @@ bool file_write(const char* path, const FileBody* file) {
return false;
}
LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_WRITE, written);
return true;
}
inline
void close_handle(FileHandler fp)
void file_close_handle(FileHandle fp)
{
close(fp);
}

View File

@ -6,8 +6,8 @@
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_PLATFORM_LINUX_SERVER_H
#define TOS_PLATFORM_LINUX_SERVER_H
#ifndef TOS_PLATFORM_LINUX_NETWORK_SERVER_H
#define TOS_PLATFORM_LINUX_NETWORK_SERVER_H
#include <stdio.h>
#include <stdlib.h>
@ -21,9 +21,9 @@
#include <fcntl.h>
#include <errno.h>
#include "../../stdlib/Types.h"
#include "../../network/SocketConnection.h"
#include "../../utils/EndianUtils.h"
#include "../../../stdlib/Types.h"
#include "../../../network/SocketConnection.h"
#include "../../../utils/EndianUtils.h"
// WARNING: requires `sudo setcap cap_net_raw=eip /path/to/your_program`
void socket_server_raw_create(const char* hostname, SocketConnection* con) {

View File

@ -6,8 +6,8 @@
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_PLATFORM_LINUX_SOCKET_H
#define TOS_PLATFORM_LINUX_SOCKET_H
#ifndef TOS_PLATFORM_LINUX_NETWORK_SOCKET_H
#define TOS_PLATFORM_LINUX_NETWORK_SOCKET_H
#define socket_close close

View File

@ -12,6 +12,16 @@
#include <stdatomic.h>
#include "../../../stdlib/Types.h"
inline
void atomic_set(void** target, void* value) {
    // Sequentially-consistent atomic pointer store.
    __atomic_store_n(target, value, __ATOMIC_SEQ_CST);
}
inline
void* atomic_get(void** target) {
    // Sequentially-consistent atomic pointer load.
    return __atomic_load_n(target, __ATOMIC_SEQ_CST);
}
inline
void atomic_set(volatile int32* value, int32 new_value)
{
@ -60,22 +70,42 @@ void atomic_get(volatile byte* value, byte data[16])
inline
void atomic_increment(volatile int32* value) {
__atomic_fetch_add(value, 1, __ATOMIC_SEQ_CST);
__atomic_add_fetch(value, 1, __ATOMIC_SEQ_CST);
}
inline
void atomic_decrement(volatile int32* value) {
__atomic_fetch_sub(value, 1, __ATOMIC_SEQ_CST);
__atomic_sub_fetch(value, 1, __ATOMIC_SEQ_CST);
}
inline
void atomic_increment(volatile int64* value) {
__atomic_add_fetch(value, 1, __ATOMIC_SEQ_CST);
}
inline
void atomic_decrement(volatile int64* value) {
__atomic_sub_fetch(value, 1, __ATOMIC_SEQ_CST);
}
inline
void atomic_add(volatile int32* value, int32 increment) {
__atomic_fetch_add(value, increment, __ATOMIC_SEQ_CST);
__atomic_add_fetch(value, increment, __ATOMIC_SEQ_CST);
}
inline
void atomic_sub(volatile int32* value, int32 decrement) {
__atomic_fetch_sub(value, decrement, __ATOMIC_SEQ_CST);
__atomic_sub_fetch(value, decrement, __ATOMIC_SEQ_CST);
}
inline
void atomic_add(volatile int64* value, int64 increment) {
__atomic_add_fetch(value, increment, __ATOMIC_SEQ_CST);
}
inline
void atomic_sub(volatile int64* value, int64 decrement) {
__atomic_sub_fetch(value, decrement, __ATOMIC_SEQ_CST);
}
inline
@ -85,13 +115,158 @@ int32 atomic_compare_exchange_weak(volatile int32* value, int32* expected, int32
}
inline
int32 atomic_fetch_add(volatile int32* value, int32 operand) {
return __atomic_fetch_add(value, operand, __ATOMIC_SEQ_CST);
int32 atomic_add_fetch(volatile int32* value, int32 operand) {
return __atomic_add_fetch(value, operand, __ATOMIC_SEQ_CST);
}
inline
int32 atomic_fetch_sub(volatile int32* value, int32 operand) {
return __atomic_fetch_sub(value, operand, __ATOMIC_SEQ_CST);
int32 atomic_sub_fetch(volatile int32* value, int32 operand) {
return __atomic_sub_fetch(value, operand, __ATOMIC_SEQ_CST);
}
inline
int64 atomic_add_fetch(volatile int64* value, int64 operand) {
return __atomic_add_fetch(value, operand, __ATOMIC_SEQ_CST);
}
inline
int64 atomic_sub_fetch(volatile int64* value, int64 operand) {
return __atomic_sub_fetch(value, operand, __ATOMIC_SEQ_CST);
}
inline
void atomic_set(volatile uint32* value, uint32 new_value)
{
__atomic_store_n(value, new_value, __ATOMIC_SEQ_CST);
}
inline
void atomic_set(volatile uint64* value, uint64 new_value)
{
__atomic_store_n(value, new_value, __ATOMIC_SEQ_CST);
}
inline
uint32 atomic_set_fetch(volatile uint32* value, uint32 new_value) {
return __atomic_exchange_n(value, new_value, __ATOMIC_SEQ_CST);
}
inline
uint64 atomic_set_fetch(volatile uint64* value, uint64 new_value) {
return __atomic_exchange_n(value, new_value, __ATOMIC_SEQ_CST);
}
inline
void atomic_get(volatile byte* value, byte data[16])
{
    // NOTE(review): despite the name this calls __atomic_store, i.e. it
    // writes `data` INTO `value`; a get would use __atomic_load. It also
    // appears to duplicate the earlier atomic_get(volatile byte*, byte[16])
    // overload — looks like a copy-paste slip, confirm and fix.
    __atomic_store((volatile __uint128 *) value, (__uint128 *) data, __ATOMIC_SEQ_CST);
}
inline
uint32 atomic_get(volatile uint32* value)
{
return __atomic_load_n((uint32 *) value, __ATOMIC_SEQ_CST);
}
inline
uint64 atomic_get(volatile uint64* value)
{
return __atomic_load_n((uint64 *) value, __ATOMIC_SEQ_CST);
}
inline
void atomic_increment(volatile uint32* value) {
__atomic_add_fetch(value, 1, __ATOMIC_SEQ_CST);
}
inline
void atomic_decrement(volatile uint32* value) {
__atomic_sub_fetch(value, 1, __ATOMIC_SEQ_CST);
}
inline
void atomic_increment(volatile uint64* value) {
__atomic_add_fetch(value, 1, __ATOMIC_SEQ_CST);
}
inline
void atomic_decrement(volatile uint64* value) {
__atomic_sub_fetch(value, 1, __ATOMIC_SEQ_CST);
}
inline
void atomic_add(volatile uint32* value, uint32 increment) {
__atomic_add_fetch(value, increment, __ATOMIC_SEQ_CST);
}
inline
void atomic_sub(volatile uint32* value, uint32 decrement) {
__atomic_sub_fetch(value, decrement, __ATOMIC_SEQ_CST);
}
inline
uint32 atomic_compare_exchange_weak(volatile uint32* value, uint32* expected, uint32 desired) {
__atomic_compare_exchange_n(value, expected, desired, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
return *expected;
}
inline
uint32 atomic_add_fetch(volatile uint32* value, uint32 operand) {
return __atomic_add_fetch(value, operand, __ATOMIC_SEQ_CST);
}
inline
uint32 atomic_sub_fetch(volatile uint32* value, uint32 operand) {
return __atomic_sub_fetch(value, operand, __ATOMIC_SEQ_CST);
}
inline
uint64 atomic_add_fetch(volatile uint64* value, uint64 operand) {
return __atomic_add_fetch(value, operand, __ATOMIC_SEQ_CST);
}
inline
uint64 atomic_sub_fetch(volatile uint64* value, uint64 operand) {
return __atomic_sub_fetch(value, operand, __ATOMIC_SEQ_CST);
}
inline
void atomic_and(volatile uint32* value, uint32 mask) {
__atomic_fetch_and(value, mask, __ATOMIC_SEQ_CST);
}
inline
void atomic_and(volatile int32* value, int32 mask) {
__atomic_fetch_and(value, mask, __ATOMIC_SEQ_CST);
}
inline
void atomic_and(volatile uint64* value, uint64 mask) {
__atomic_fetch_and(value, mask, __ATOMIC_SEQ_CST);
}
inline
void atomic_and(volatile int64* value, int64 mask) {
__atomic_fetch_and(value, mask, __ATOMIC_SEQ_CST);
}
inline
void atomic_or(volatile uint32* value, uint32 mask) {
__atomic_fetch_or(value, mask, __ATOMIC_SEQ_CST);
}
inline
void atomic_or(volatile int32* value, int32 mask) {
__atomic_fetch_or(value, mask, __ATOMIC_SEQ_CST);
}
inline
void atomic_or(volatile uint64* value, uint64 mask) {
__atomic_fetch_or(value, mask, __ATOMIC_SEQ_CST);
}
inline
void atomic_or(volatile int64* value, int64 mask) {
__atomic_fetch_or(value, mask, __ATOMIC_SEQ_CST);
}
#endif

View File

@ -98,7 +98,7 @@ int32 pthread_cond_wait(pthread_cond_t* cond, pthread_mutex_t* mutex) {
}
int32 pthread_cond_signal(pthread_cond_t* cond) {
atomic_fetch_add(cond, 1);
atomic_add_fetch(cond, 1);
syscall(SYS_futex, cond, FUTEX_WAKE, 1, NULL, NULL, 0);
return 0;
@ -114,7 +114,7 @@ int32 pthread_rwlock_init(pthread_rwlock_t* rwlock, const pthread_rwlockattr_t*)
int32 pthread_rwlock_rdlock(pthread_rwlock_t* rwlock) {
while (atomic_get(&rwlock->writer)) {}
atomic_fetch_add(&rwlock->readers, 1);
atomic_add_fetch(&rwlock->readers, 1);
return 0;
}
@ -129,7 +129,7 @@ int32 pthread_rwlock_unlock(pthread_rwlock_t* rwlock) {
if (atomic_get(&rwlock->writer)) {
atomic_set(&rwlock->writer, 0);
} else {
atomic_fetch_sub(&rwlock->readers, 1);
atomic_sub_fetch(&rwlock->readers, 1);
}
return 0;

View File

@ -22,7 +22,9 @@
#include "../../utils/TestUtils.h"
#include "../../memory/RingMemory.h"
typedef HANDLE FileHandler;
typedef HANDLE FileHandle;
typedef HANDLE MMFHandle;
typedef OVERLAPPED file_overlapped;
struct FileBodyAsync {
// doesn't include null termination (same as strlen)
@ -31,7 +33,30 @@ struct FileBodyAsync {
OVERLAPPED ov;
};
// @todo Consider to implement directly mapped files (CreateFileMapping) for certain files (e.g. map data or texture data, ...)
inline
MMFHandle file_mmf_handle(FileHandle fp)
{
    // Creates a read-only file-mapping object covering the whole file
    // (both size DWORDs 0). Per Win32 docs this returns NULL on failure,
    // not INVALID_HANDLE_VALUE.
    return CreateFileMappingA(fp, NULL, PAGE_READONLY, 0, 0, NULL);
}
inline
void* mmf_region_init(MMFHandle fh, size_t offset, size_t length = 0)
{
    // Maps `length` bytes starting at `offset` (length 0 = to end of file).
    // NOTE(review): MapViewOfFile requires the offset to be a multiple of
    // the system allocation granularity (typically 64KB); unlike the Linux
    // counterpart this does not round the offset down, so non-aligned
    // offsets will fail — confirm callers only pass aligned offsets.
    DWORD high = (DWORD) ((offset >> 32) & 0xFFFFFFFF);
    DWORD low = (DWORD) (offset & 0xFFFFFFFF);

    return MapViewOfFile(fh, FILE_MAP_READ, high, low, length);
}
inline
void mmf_region_release(void* fh) {
    // Unmaps a view returned by mmf_region_init. The parameter named "fh"
    // is actually the mapped base address, not a handle.
    UnmapViewOfFile(fh);
}
inline
void file_mmf_close(MMFHandle fh) {
    // Destroys the file-mapping object created by file_mmf_handle.
    CloseHandle(fh);
}
inline
void relative_to_absolute(const char* rel, char* path)
@ -63,7 +88,7 @@ inline uint64
file_size(const char* path)
{
// @performance Profile against fseek strategy
FileHandler fp;
FileHandle fp;
if (*path == '.') {
char full_path[MAX_PATH];
relative_to_absolute(path, full_path);
@ -121,7 +146,7 @@ bool file_exists(const char* path)
inline void
file_read(const char* path, FileBody* file, RingMemory* ring = NULL)
{
FileHandler fp;
FileHandle fp;
if (*path == '.') {
char full_path[MAX_PATH];
relative_to_absolute(path, full_path);
@ -159,11 +184,10 @@ file_read(const char* path, FileBody* file, RingMemory* ring = NULL)
}
if (ring != NULL) {
file->content = ring_get_memory(ring, size.QuadPart);
file->content = ring_get_memory(ring, size.QuadPart + 1);
}
DWORD bytes;
ASSERT_SIMPLE(size.QuadPart < MAX_UINT32);
if (!ReadFile(fp, file->content, (uint32) size.QuadPart, &bytes, NULL)) {
CloseHandle(fp);
file->content = NULL;
@ -175,12 +199,14 @@ file_read(const char* path, FileBody* file, RingMemory* ring = NULL)
file->content[bytes] = '\0';
file->size = size.QuadPart;
LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_READ, bytes);
}
inline
void file_read(const char* path, FileBody* file, uint64 offset, uint64 length = MAX_UINT64, RingMemory* ring = NULL)
{
FileHandler fp;
FileHandle fp;
if (*path == '.') {
char full_path[MAX_PATH];
relative_to_absolute(path, full_path);
@ -232,7 +258,7 @@ void file_read(const char* path, FileBody* file, uint64 offset, uint64 length =
uint64 read_length = OMS_MIN(length, file_size - offset);
if (ring != NULL) {
file->content = ring_get_memory(ring, read_length);
file->content = ring_get_memory(ring, read_length + 1);
}
// Move the file pointer to the offset position
@ -246,7 +272,6 @@ void file_read(const char* path, FileBody* file, uint64 offset, uint64 length =
}
DWORD bytes;
ASSERT_SIMPLE(read_length < MAX_UINT32);
if (!ReadFile(fp, file->content, (uint32) read_length, &bytes, NULL)) {
CloseHandle(fp);
file->content = NULL;
@ -258,10 +283,12 @@ void file_read(const char* path, FileBody* file, uint64 offset, uint64 length =
file->content[bytes] = '\0';
file->size = bytes;
LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_READ, bytes);
}
inline
void file_read(FileHandler fp, FileBody* file, uint64 offset = 0, uint64 length = MAX_UINT64, RingMemory* ring = NULL)
void file_read(FileHandle fp, FileBody* file, uint64 offset = 0, uint64 length = MAX_UINT64, RingMemory* ring = NULL)
{
LARGE_INTEGER size;
if (!GetFileSizeEx(fp, &size)) {
@ -285,7 +312,7 @@ void file_read(FileHandler fp, FileBody* file, uint64 offset = 0, uint64 length
uint64 read_length = OMS_MIN(length, file_size - offset);
if (ring != NULL) {
file->content = ring_get_memory(ring, read_length);
file->content = ring_get_memory(ring, read_length + 1);
}
// Move the file pointer to the offset position
@ -299,7 +326,6 @@ void file_read(FileHandler fp, FileBody* file, uint64 offset = 0, uint64 length
}
DWORD bytes;
ASSERT_SIMPLE(read_length < MAX_UINT32);
if (!ReadFile(fp, file->content, (uint32) read_length, &bytes, NULL)) {
CloseHandle(fp);
file->content = NULL;
@ -307,16 +333,16 @@ void file_read(FileHandler fp, FileBody* file, uint64 offset = 0, uint64 length
return;
}
CloseHandle(fp);
file->content[bytes] = '\0';
file->size = bytes;
LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_READ, bytes);
}
inline uint64
file_read_struct(const char* path, void* file, uint32 size)
{
FileHandler fp;
FileHandle fp;
if (*path == '.') {
char full_path[MAX_PATH];
relative_to_absolute(path, full_path);
@ -361,13 +387,15 @@ file_read_struct(const char* path, void* file, uint32 size)
CloseHandle(fp);
LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_READ, read);
return read;
}
inline bool
file_write(const char* path, const FileBody* file)
{
FileHandler fp;
FileHandle fp;
if (*path == '.') {
char full_path[MAX_PATH];
relative_to_absolute(path, full_path);
@ -397,7 +425,6 @@ file_write(const char* path, const FileBody* file)
DWORD written;
DWORD length = (DWORD) file->size;
ASSERT_SIMPLE(file->size < MAX_UINT32);
if (!WriteFile(fp, file->content, length, &written, NULL)) {
CloseHandle(fp);
return false;
@ -405,13 +432,15 @@ file_write(const char* path, const FileBody* file)
CloseHandle(fp);
LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_WRITE, length);
return true;
}
inline bool
file_write_struct(const char* path, const void* file, uint32 size)
{
FileHandler fp;
FileHandle fp;
if (*path == '.') {
char full_path[MAX_PATH];
relative_to_absolute(path, full_path);
@ -444,6 +473,8 @@ file_write_struct(const char* path, const void* file, uint32 size)
CloseHandle(fp);
LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_WRITE, written);
return true;
}
@ -473,7 +504,7 @@ file_copy(const char* src, const char* dst)
}
inline
void close_handle(FileHandler fp)
void file_close_handle(FileHandle fp)
{
CloseHandle(fp);
}
@ -481,7 +512,7 @@ void close_handle(FileHandler fp)
inline
HANDLE file_append_handle(const char* path)
{
FileHandler fp;
FileHandle fp;
if (*path == '.') {
char full_path[MAX_PATH];
relative_to_absolute(path, full_path);
@ -514,10 +545,10 @@ HANDLE file_append_handle(const char* path)
inline
bool file_read_async(
FileHandler fp,
FileHandle fp,
FileBodyAsync* file,
uint64_t offset = 0,
uint64_t length = MAXUINT64,
uint64_t length = MAX_UINT64,
RingMemory* ring = NULL
) {
LARGE_INTEGER size;
@ -559,7 +590,6 @@ bool file_read_async(
file->ov.hEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
DWORD bytes_read = 0;
ASSERT_SIMPLE(read_length < MAXDWORD);
if (!ReadFile(fp, file->content, (DWORD) read_length, &bytes_read, &file->ov)) {
DWORD error = GetLastError();
if (error != ERROR_IO_PENDING) {
@ -573,13 +603,23 @@ bool file_read_async(
}
file->size = read_length;
LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_READ, read_length);
return true;
}
inline
FileHandler file_read_handle(const char* path)
void file_async_wait(FileHandle fp, file_overlapped* overlapped, bool wait)
{
FileHandler fp;
DWORD bytesTransferred;
GetOverlappedResult(fp, overlapped, &bytesTransferred, wait);
}
inline
FileHandle file_read_handle(const char* path)
{
FileHandle fp;
if (*path == '.') {
char full_path[MAX_PATH];
relative_to_absolute(path, full_path);
@ -611,9 +651,9 @@ FileHandler file_read_handle(const char* path)
}
inline
FileHandler file_read_async_handle(const char* path)
FileHandle file_read_async_handle(const char* path)
{
FileHandler fp;
FileHandle fp;
if (*path == '.') {
char full_path[MAX_PATH];
relative_to_absolute(path, full_path);
@ -646,7 +686,7 @@ FileHandler file_read_async_handle(const char* path)
bool file_append(const char* path, const char* file)
{
FileHandler fp;
FileHandle fp;
if (*path == '.') {
char full_path[MAX_PATH];
relative_to_absolute(path, full_path);
@ -675,39 +715,40 @@ bool file_append(const char* path, const char* file)
}
DWORD written;
DWORD length = (DWORD) strlen(file); // @question WHY is WriteFile not supporting larger data?
ASSERT_SIMPLE(length < MAX_UINT32);
DWORD length = (DWORD) strlen(file);
if (!WriteFile(fp, file, length, &written, NULL)) {
CloseHandle(fp);
return false;
}
CloseHandle(fp);
LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_WRITE, written);
return true;
}
inline bool
file_append(FileHandler fp, const char* file)
file_append(FileHandle fp, const char* file)
{
if (fp == INVALID_HANDLE_VALUE) {
return false;
}
DWORD written;
DWORD length = (DWORD) strlen(file); // @question WHY is WriteFile not supporting larger data?
ASSERT_SIMPLE(length < MAX_UINT32);
DWORD length = (DWORD) strlen(file);
if (!WriteFile(fp, file, length, &written, NULL)) {
CloseHandle(fp);
return false;
}
CloseHandle(fp);
LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_WRITE, written);
return true;
}
inline bool
file_append(FileHandler fp, const char* file, size_t length)
file_append(FileHandle fp, const char* file, size_t length)
{
if (fp == INVALID_HANDLE_VALUE) {
return false;
@ -719,13 +760,15 @@ file_append(FileHandler fp, const char* file, size_t length)
return false;
}
LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_WRITE, written);
return true;
}
inline bool
file_append(const char* path, const FileBody* file)
{
FileHandler fp;
FileHandle fp;
if (*path == '.') {
char full_path[MAX_PATH];
relative_to_absolute(path, full_path);
@ -755,13 +798,15 @@ file_append(const char* path, const FileBody* file)
DWORD bytes;
DWORD length = (DWORD) file->size;
ASSERT_SIMPLE(file->size < MAX_UINT32);
if (!WriteFile(fp, file->content, length, &bytes, NULL)) {
CloseHandle(fp);
return false;
}
CloseHandle(fp);
LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_WRITE, bytes);
return true;
}
@ -770,7 +815,7 @@ uint64 file_last_modified(const char* path)
{
WIN32_FIND_DATA find_data;
FileHandler fp;
FileHandle fp;
if (*path == '.') {
char full_path[MAX_PATH];
relative_to_absolute(path, full_path);

View File

@ -72,6 +72,7 @@ bool library_load(Library* lib)
if (function) {
lib->functions[c] = function;
} else {
ASSERT_SIMPLE(false);
lib->is_valid = false;
}
}

View File

@ -25,6 +25,7 @@
#include <wbemidl.h>
#include <comdef.h>
#include <winnls.h>
#include <hidsdi.h>
// @performance Do we really need all these libs, can't we simplify that?!
#include <intrin.h>

View File

@ -21,10 +21,14 @@ struct WindowState {
uint64 style;
};
#define WINDOW_STATE_CHANGE_SIZE 1
#define WINDOW_STATE_CHANGE_POS 2
#define WINDOW_STATE_CHANGE_FOCUS 4
#define WINDOW_STATE_CHANGE_FULLSCREEN 8
// Bit flags describing which window properties changed since the last frame.
// Values are powers of two so multiple changes can be OR-ed into one byte.
enum WindowStateChanges : byte {
    WINDOW_STATE_CHANGE_NONE = 0,
    WINDOW_STATE_CHANGE_SIZE = 1,
    WINDOW_STATE_CHANGE_POS = 2,
    WINDOW_STATE_CHANGE_FOCUS = 4,
    WINDOW_STATE_CHANGE_FULLSCREEN = 8,
    // NOTE(review): 16 is its own bit, not the OR of the flags above (which
    // would be 15). Confirm whether "ALL" is meant as a distinct marker bit
    // or as a mask of all change flags.
    WINDOW_STATE_CHANGE_ALL = 16,
};
struct Window {
uint16 width;

View File

@ -9,8 +9,9 @@
#ifndef TOS_SOUND_DIRECT_SOUND_H
#define TOS_SOUND_DIRECT_SOUND_H
#include <dsound.h>
#include <windows.h>
#include <mmeapi.h>
#include <dsound.h>
#include "../../../stdlib/Types.h"
#include "../../../audio/AudioSetting.h"
@ -172,10 +173,10 @@ void audio_play_buffer(AudioSetting* setting, DirectSoundSetting* api_setting)
return;
}
void *region1;
void* region1;
DWORD region1_size;
void *region2;
void* region2;
DWORD region2_size;
DWORD bytes_to_lock = (setting->sample_index * setting->sample_size) % setting->buffer_size;
@ -203,6 +204,7 @@ void audio_play_buffer(AudioSetting* setting, DirectSoundSetting* api_setting)
api_setting->secondary_buffer->Unlock(region1, region1_size, region2, region2_size);
// @question Do we want to keep this here or move it to the audio mixer?
setting->sample_index += setting->sample_buffer_size / setting->sample_size;
setting->sample_buffer_size = 0;
}

View File

@ -9,9 +9,9 @@
#ifndef TOS_SOUND_XAUDIO2_H
#define TOS_SOUND_XAUDIO2_H
#include <xaudio2.h>
#include <windows.h>
#include <objbase.h>
#include <xaudio2.h>
#include "../../../stdlib/Types.h"
#include "../../../audio/AudioSetting.h"

View File

@ -137,7 +137,7 @@ void hid_init_controllers(Input* __restrict states, int32 state_count, RingMemor
SetupDiDestroyDeviceInfoList(device_info_set);
}
uint32 hid_divice_poll(Input* state, uint64 time) {
uint32 hid_device_poll(Input* state, uint64 time) {
UCHAR buffer[128];
DWORD bytes_read;

View File

@ -6,8 +6,8 @@
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_PLATFORM_WIN32_SERVER_H
#define TOS_PLATFORM_WIN32_SERVER_H
#ifndef TOS_PLATFORM_WIN32_NETWORK_SERVER_H
#define TOS_PLATFORM_WIN32_NETWORK_SERVER_H
#include <stdio.h>
#include <stdlib.h>
@ -16,9 +16,9 @@
#include <winsock2.h>
#include <ws2tcpip.h>
#include "../../stdlib/Types.h"
#include "../../network/SocketConnection.h"
#include "../../utils/EndianUtils.h"
#include "../../../stdlib/Types.h"
#include "../../../network/SocketConnection.h"
#include "../../../utils/EndianUtils.h"
#pragma comment(lib, "Ws2_32.lib")

View File

@ -6,8 +6,8 @@
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_PLATFORM_WIN32_SERVER_H
#define TOS_PLATFORM_WIN32_SERVER_H
#ifndef TOS_PLATFORM_WIN32_NETWORK_SERVER_H
#define TOS_PLATFORM_WIN32_NETWORK_SERVER_H
#include <stdio.h>
#include <stdlib.h>
@ -16,8 +16,8 @@
#include <winsock2.h>
#include <ws2tcpip.h>
#include "../../network/SocketConnection.h"
#include "../../utils/EndianUtils.h"
#include "../../../network/SocketConnection.h"
#include "../../../utils/EndianUtils.h"
#pragma comment(lib, "Ws2_32.lib")

View File

@ -6,8 +6,8 @@
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_PLATFORM_WIN32_SOCKET_H
#define TOS_PLATFORM_WIN32_SOCKET_H
#ifndef TOS_PLATFORM_WIN32_NETWORK_SOCKET_H
#define TOS_PLATFORM_WIN32_NETWORK_SOCKET_H
#define socket_close closesocket

View File

@ -12,6 +12,18 @@
#include <windows.h>
#include "../../../stdlib/Types.h"
// Atomically publish a pointer value (full-fence exchange on Win32).
inline
void atomic_set(void** target, void* new_pointer)
{
    InterlockedExchangePointer(target, new_pointer);
}

// Atomically read a pointer value.
// Implemented as a CAS with identical compare/exchange operands (NULL, NULL):
// the stored pointer is only replaced when it already equals NULL — i.e. it is
// never actually changed — but the read gains full-barrier semantics.
inline
void* atomic_get(void** target)
{
    return InterlockedCompareExchangePointer(target, NULL, NULL);
}
inline
void atomic_set(volatile int32* value, int32 new_value)
{
@ -85,6 +97,16 @@ void atomic_decrement(volatile int32* value) {
InterlockedDecrement((long *) value);
}
inline
void atomic_increment(volatile int64* value) {
InterlockedIncrement((long *) value);
}
inline
void atomic_decrement(volatile int64* value) {
InterlockedDecrement((long *) value);
}
inline
void atomic_add(volatile int32* value, int32 increment) {
InterlockedAdd((long *) value, increment);
@ -95,19 +117,180 @@ void atomic_sub(volatile int32* value, int32 decrement) {
InterlockedAdd((long *) value, -decrement);
}
inline
void atomic_add(volatile int64* value, int64 increment) {
InterlockedAdd((long *) value, (long) increment);
}
inline
void atomic_sub(volatile int64* value, int64 decrement) {
InterlockedAdd((long *) value, -1 * ((long) decrement));
}
// Compare-and-swap on an int32: writes `desired` only if *value == *expected.
// NOTE(review): unlike C11 atomic_compare_exchange_weak this returns the
// PREVIOUS value of *value and does NOT write the observed value back into
// *expected; callers must compare the return value against *expected
// themselves — confirm all call sites expect this convention.
inline
int32 atomic_compare_exchange_weak(volatile int32* value, int32* expected, int32 desired) {
    return (int32) InterlockedCompareExchange((long *) value, desired, *expected);
}
inline
int32 atomic_fetch_add(volatile int32* value, int32 operand) {
int32 atomic_add_fetch(volatile int32* value, int32 operand) {
return (int32) InterlockedExchangeAdd((long *) value, operand);
}
inline
int32 atomic_fetch_sub(volatile int32* value, int32 operand) {
int32 atomic_sub_fetch(volatile int32* value, int32 operand) {
return (int32) InterlockedExchangeSubtract((unsigned long *) value, operand);
}
// Atomically add to a signed 64-bit value, returning the value it held BEFORE
// the addition (fetch-then-add, matching InterlockedExchangeAdd semantics).
// Fix: previously used the 32-bit InterlockedExchangeAdd and truncated the
// operand to `long`, corrupting the high 32 bits.
inline
int64 atomic_add_fetch(volatile int64* value, int64 operand) {
    return (int64) InterlockedExchangeAdd64((volatile LONG64 *) value, (LONG64) operand);
}

// Atomic 64-bit subtract returning the prior value. There is no
// InterlockedExchangeSubtract64, so subtract by adding the negated operand.
inline
int64 atomic_sub_fetch(volatile int64* value, int64 operand) {
    return (int64) InterlockedExchangeAdd64((volatile LONG64 *) value, -((LONG64) operand));
}
inline
void atomic_set(volatile uint32* value, uint32 new_value)
{
InterlockedExchange((long *) value, new_value);
}
inline
void atomic_set(volatile uint64* value, uint64 new_value)
{
InterlockedExchange((long *) value, (long) new_value);
}
inline
uint32 atomic_set_fetch(volatile uint32* value, uint32 new_value)
{
return (uint32) InterlockedExchange((long *) value, new_value);
}
inline
uint64 atomic_set_fetch(volatile uint64* value, uint64 new_value)
{
return (uint64) InterlockedExchange((long *) value, (long) new_value);
}
// Atomically read a 32-bit value via a no-op CAS (full-barrier read).
inline
uint32 atomic_get(volatile uint32* value)
{
    return (uint32) InterlockedCompareExchange((long *) value, 0, 0);
}

// Atomically read a 64-bit value.
// Fix: previously used the 32-bit InterlockedCompareExchange, which read
// only the low half of the value (and could clobber it when it was 0).
inline
uint64 atomic_get(volatile uint64* value)
{
    return (uint64) InterlockedCompareExchange64((volatile LONG64 *) value, 0, 0);
}
inline
void atomic_increment(volatile uint32* value) {
InterlockedIncrement((long *) value);
}
inline
void atomic_decrement(volatile uint32* value) {
InterlockedDecrement((long *) value);
}
inline
void atomic_increment(volatile uint64* value) {
InterlockedIncrement((long *) value);
}
inline
void atomic_decrement(volatile uint64* value) {
InterlockedDecrement((long *) value);
}
inline
void atomic_add(volatile uint32* value, uint32 increment) {
InterlockedAdd((long *) value, increment);
}
inline
void atomic_sub(volatile uint32* value, uint32 decrement) {
InterlockedAdd((long *) value, -1 * ((int32) decrement));
}
inline
void atomic_add(volatile uint64* value, uint64 increment) {
InterlockedAdd((long *) value, (long) increment);
}
inline
void atomic_sub(volatile uint64* value, uint64 decrement) {
InterlockedAdd((long *) value, -1 * ((long) decrement));
}
// Compare-and-swap on a uint32; returns the PREVIOUS value of *value and does
// not update *expected (see the int32 overload's note).
inline
uint32 atomic_compare_exchange_weak(volatile uint32* value, uint32* expected, uint32 desired) {
    return (uint32) InterlockedCompareExchange((long *) value, desired, *expected);
}

// Atomic 32-bit add, returning the value held BEFORE the addition.
inline
uint32 atomic_add_fetch(volatile uint32* value, uint32 operand) {
    return (uint32) InterlockedExchangeAdd((long *) value, operand);
}

// Atomic 32-bit subtract, returning the prior value.
inline
uint32 atomic_sub_fetch(volatile uint32* value, uint32 operand) {
    return (uint32) InterlockedExchangeSubtract((unsigned long *) value, operand);
}

// Atomic 64-bit add, returning the value held BEFORE the addition.
// Fix: previously used the 32-bit InterlockedExchangeAdd and truncated the
// operand to `long`, corrupting the high 32 bits.
inline
uint64 atomic_add_fetch(volatile uint64* value, uint64 operand) {
    return (uint64) InterlockedExchangeAdd64((volatile LONG64 *) value, (LONG64) operand);
}

// Atomic 64-bit subtract, returning the prior value. There is no
// InterlockedExchangeSubtract64, so subtract by adding the negated operand.
inline
uint64 atomic_sub_fetch(volatile uint64* value, uint64 operand) {
    return (uint64) InterlockedExchangeAdd64((volatile LONG64 *) value, -((LONG64) operand));
}
// Atomic bitwise AND on a 32-bit value.
inline
void atomic_and(volatile uint32* value, uint32 mask) {
    InterlockedAnd((volatile LONG *) value, mask);
}

inline
void atomic_and(volatile int32* value, int32 mask) {
    InterlockedAnd((volatile LONG *) value, (LONG)mask);
}

// Atomic bitwise AND on a 64-bit value (uses the dedicated 64-bit intrinsic,
// so the full width is updated atomically).
inline
void atomic_and(volatile uint64* value, uint64 mask) {
    InterlockedAnd64((volatile LONG64 *) value, mask);
}

inline
void atomic_and(volatile int64* value, int64 mask) {
    InterlockedAnd64((volatile LONG64 *) value, mask);
}

// Atomic bitwise OR on a 32-bit value.
inline
void atomic_or(volatile uint32* value, uint32 mask) {
    InterlockedOr((volatile LONG *) value, mask);
}

inline
void atomic_or(volatile int32* value, int32 mask) {
    InterlockedOr((volatile LONG *) value, (LONG)mask);
}

// Atomic bitwise OR on a 64-bit value.
inline
void atomic_or(volatile uint64* value, uint64 mask) {
    InterlockedOr64((volatile LONG64 *) value, mask);
}

inline
void atomic_or(volatile int64* value, int64 mask) {
    InterlockedOr64((volatile LONG64 *) value, mask);
}
#endif

View File

@ -29,6 +29,14 @@ void sem_wait(sem_t* semaphore) {
WaitForSingleObject(*semaphore, INFINITE);
}
// Wait on the semaphore for at most `ms` milliseconds.
// Returns the raw WaitForSingleObject result (WAIT_OBJECT_0, WAIT_TIMEOUT,
// WAIT_FAILED, ...), NOT the POSIX 0 / -1 convention of the same-named
// function — NOTE(review): confirm callers check against the Win32 codes.
int32 sem_timedwait(sem_t* semaphore, uint64 ms) {
    return (int32) WaitForSingleObject(*semaphore, (DWORD) ms);
}

// Non-blocking wait: zero timeout, same Win32 return-code caveat as above.
int32 sem_trywait(sem_t* semaphore) {
    return (int32) WaitForSingleObject(*semaphore, 0);
}
// increment
void sem_post(sem_t* semaphore) {
ReleaseSemaphore(*semaphore, 1, NULL);

23
scene/SceneState.h Normal file
View File

@ -0,0 +1,23 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_SCENE_STATE_H
#define TOS_SCENE_STATE_H
#include "../stdlib/Types.h"
// Lifecycle flags for a scene. Values are powers of two so several states
// (e.g. WINDOW_CHANGED while WAITING_SETUP) can be OR-ed into one byte mask.
enum SceneState : byte {
    SCENE_STATE_DEFAULT = 0,
    SCENE_STATE_WINDOW_CHANGED = 1,
    SCENE_STATE_SHOULD_SWITCH = 2,
    SCENE_STATE_STARTED_SETUP = 4,
    SCENE_STATE_WAITING_SETUP = 8,
    SCENE_STATE_READY = 16,
};
#endif

View File

@ -16,53 +16,53 @@
#include "../memory/ChunkMemory.h"
#include "../utils/StringUtils.h"
#define MAX_KEY_LENGTH 32
#define HASH_MAP_MAX_KEY_LENGTH 32
struct HashEntryInt32 {
int64 element_id;
char key[MAX_KEY_LENGTH];
char key[HASH_MAP_MAX_KEY_LENGTH];
HashEntryInt32* next;
int32 value;
};
struct HashEntryInt64 {
int64 element_id;
char key[MAX_KEY_LENGTH];
char key[HASH_MAP_MAX_KEY_LENGTH];
HashEntryInt64* next;
int64 value;
};
struct HashEntryUIntPtr {
int64 element_id;
char key[MAX_KEY_LENGTH];
char key[HASH_MAP_MAX_KEY_LENGTH];
HashEntryUIntPtr* next;
uintptr_t value;
};
struct HashEntryVoidP {
int64 element_id;
char key[MAX_KEY_LENGTH];
char key[HASH_MAP_MAX_KEY_LENGTH];
HashEntryVoidP* next;
void* value;
};
struct HashEntryFloat {
int64 element_id;
char key[MAX_KEY_LENGTH];
char key[HASH_MAP_MAX_KEY_LENGTH];
HashEntryFloat* next;
f32 value;
};
struct HashEntryStr {
int64 element_id;
char key[MAX_KEY_LENGTH];
char key[HASH_MAP_MAX_KEY_LENGTH];
HashEntryStr* next;
char value[MAX_KEY_LENGTH];
char value[HASH_MAP_MAX_KEY_LENGTH];
};
struct HashEntry {
int64 element_id;
char key[MAX_KEY_LENGTH];
char key[HASH_MAP_MAX_KEY_LENGTH];
HashEntry* next;
byte* value;
};
@ -128,8 +128,8 @@ void hashmap_insert(HashMap* hm, const char* key, int32 value) {
HashEntryInt32* entry = (HashEntryInt32 *) chunk_get_element(&hm->buf, element, true);
entry->element_id = element;
strncpy(entry->key, key, MAX_KEY_LENGTH);
entry->key[MAX_KEY_LENGTH - 1] = '\0';
strncpy(entry->key, key, HASH_MAP_MAX_KEY_LENGTH);
entry->key[HASH_MAP_MAX_KEY_LENGTH - 1] = '\0';
entry->value = value;
entry->next = NULL;
@ -153,8 +153,8 @@ void hashmap_insert(HashMap* hm, const char* key, int64 value) {
HashEntryInt64* entry = (HashEntryInt64 *) chunk_get_element(&hm->buf, element, true);
entry->element_id = element;
strncpy(entry->key, key, MAX_KEY_LENGTH);
entry->key[MAX_KEY_LENGTH - 1] = '\0';
strncpy(entry->key, key, HASH_MAP_MAX_KEY_LENGTH);
entry->key[HASH_MAP_MAX_KEY_LENGTH - 1] = '\0';
entry->value = value;
entry->next = NULL;
@ -178,8 +178,8 @@ void hashmap_insert(HashMap* hm, const char* key, uintptr_t value) {
HashEntryUIntPtr* entry = (HashEntryUIntPtr *) chunk_get_element(&hm->buf, element, true);
entry->element_id = element;
strncpy(entry->key, key, MAX_KEY_LENGTH);
entry->key[MAX_KEY_LENGTH - 1] = '\0';
strncpy(entry->key, key, HASH_MAP_MAX_KEY_LENGTH);
entry->key[HASH_MAP_MAX_KEY_LENGTH - 1] = '\0';
entry->value = value;
entry->next = NULL;
@ -203,8 +203,8 @@ void hashmap_insert(HashMap* hm, const char* key, void* value) {
HashEntryVoidP* entry = (HashEntryVoidP *) chunk_get_element(&hm->buf, element, true);
entry->element_id = element;
strncpy(entry->key, key, MAX_KEY_LENGTH);
entry->key[MAX_KEY_LENGTH - 1] = '\0';
strncpy(entry->key, key, HASH_MAP_MAX_KEY_LENGTH);
entry->key[HASH_MAP_MAX_KEY_LENGTH - 1] = '\0';
entry->value = value;
entry->next = NULL;
@ -228,8 +228,8 @@ void hashmap_insert(HashMap* hm, const char* key, f32 value) {
HashEntryFloat* entry = (HashEntryFloat *) chunk_get_element(&hm->buf, element, true);
entry->element_id = element;
strncpy(entry->key, key, MAX_KEY_LENGTH);
entry->key[MAX_KEY_LENGTH - 1] = '\0';
strncpy(entry->key, key, HASH_MAP_MAX_KEY_LENGTH);
entry->key[HASH_MAP_MAX_KEY_LENGTH - 1] = '\0';
entry->value = value;
entry->next = NULL;
@ -253,11 +253,11 @@ void hashmap_insert(HashMap* hm, const char* key, const char* value) {
HashEntryStr* entry = (HashEntryStr *) chunk_get_element(&hm->buf, element, true);
entry->element_id = element;
strncpy(entry->key, key, MAX_KEY_LENGTH);
entry->key[MAX_KEY_LENGTH - 1] = '\0';
strncpy(entry->key, key, HASH_MAP_MAX_KEY_LENGTH);
entry->key[HASH_MAP_MAX_KEY_LENGTH - 1] = '\0';
strncpy(entry->value, value, MAX_KEY_LENGTH);
entry->value[MAX_KEY_LENGTH - 1] = '\0';
strncpy(entry->value, value, HASH_MAP_MAX_KEY_LENGTH);
entry->value[HASH_MAP_MAX_KEY_LENGTH - 1] = '\0';
entry->next = NULL;
@ -282,8 +282,8 @@ void hashmap_insert(HashMap* hm, const char* key, byte* value) {
entry->value = (byte *) entry + sizeof(HashEntry);
strncpy(entry->key, key, MAX_KEY_LENGTH);
entry->key[MAX_KEY_LENGTH - 1] = '\0';
strncpy(entry->key, key, HASH_MAP_MAX_KEY_LENGTH);
entry->key[HASH_MAP_MAX_KEY_LENGTH - 1] = '\0';
memcpy(entry->value, value, hm->buf.chunk_size - sizeof(HashEntry));
@ -306,7 +306,7 @@ HashEntry* hashmap_get_entry(const HashMap* hm, const char* key) {
HashEntry* entry = (HashEntry *) hm->table[index];
while (entry != NULL) {
if (strncmp(entry->key, key, MAX_KEY_LENGTH) == 0) {
if (strncmp(entry->key, key, HASH_MAP_MAX_KEY_LENGTH) == 0) {
return entry;
}
@ -318,12 +318,12 @@ HashEntry* hashmap_get_entry(const HashMap* hm, const char* key) {
// This function only saves one step (omission of the hash function)
// The reason for this is in some cases we can use compile time hashing
HashEntry* hashmap_get_entry(const HashMap* hm, const char* key, uint64 index) {
index %= hm->buf.count;
HashEntry* entry = (HashEntry *) hm->table[index];
HashEntry* hashmap_get_entry(const HashMap* hm, const char* key, uint64 hash) {
hash %= hm->buf.count;
HashEntry* entry = (HashEntry *) hm->table[hash];
while (entry != NULL) {
if (strncmp(entry->key, key, MAX_KEY_LENGTH) == 0) {
if (strncmp(entry->key, key, HASH_MAP_MAX_KEY_LENGTH) == 0) {
return entry;
}
@ -339,7 +339,7 @@ void hashmap_delete_entry(HashMap* hm, const char* key) {
HashEntry* prev = NULL;
while (entry != NULL) {
if (strncmp(entry->key, key, MAX_KEY_LENGTH) == 0) {
if (strncmp(entry->key, key, HASH_MAP_MAX_KEY_LENGTH) == 0) {
if (prev == NULL) {
hm->table[index] = entry->next;
} else {
@ -370,7 +370,7 @@ int64 hashmap_dump(const HashMap* hm, byte* data)
}
data += sizeof(uint64) * hm->buf.count;
int64 value_size = hm->buf.chunk_size - sizeof(uint64) - sizeof(char) * MAX_KEY_LENGTH - sizeof(uint64);
int64 value_size = hm->buf.chunk_size - sizeof(uint64) - sizeof(char) * HASH_MAP_MAX_KEY_LENGTH - sizeof(uint64);
// Dumb hash map content = buffer memory
int32 free_index = 0;
@ -449,7 +449,7 @@ int64 hashmap_load(HashMap* hm, const byte* data)
// @question don't we have to possibly endian swap check the free array as well?
memcpy(hm->buf.free, data, sizeof(uint64) * CEIL_DIV(hm->buf.count, 64));
int64 value_size = hm->buf.chunk_size - sizeof(uint64) - sizeof(char) * MAX_KEY_LENGTH - sizeof(uint64);
int64 value_size = hm->buf.chunk_size - sizeof(uint64) - sizeof(char) * HASH_MAP_MAX_KEY_LENGTH - sizeof(uint64);
// Switch endian AND turn offsets to pointers
int32 free_index = 0;

View File

@ -31,7 +31,7 @@ struct ThreadedHashMap {
// WARNING: element_size = element size + remaining HashEntry data size
inline
void threaded_hashmap_create(ThreadedHashMap* hm, int32 count, int32 element_size, RingMemory* ring)
void thrd_hashmap_create(ThreadedHashMap* hm, int32 count, int32 element_size, RingMemory* ring)
{
hashmap_create((HashMap *) hm, count, element_size, ring);
pthread_mutex_init(&hm->mutex, NULL);
@ -39,7 +39,7 @@ void threaded_hashmap_create(ThreadedHashMap* hm, int32 count, int32 element_siz
// WARNING: element_size = element size + remaining HashEntry data size
inline
void threaded_hashmap_create(ThreadedHashMap* hm, int32 count, int32 element_size, BufferMemory* buf)
void thrd_hashmap_create(ThreadedHashMap* hm, int32 count, int32 element_size, BufferMemory* buf)
{
hashmap_create((HashMap *) hm, count, element_size, buf);
pthread_mutex_init(&hm->mutex, NULL);
@ -47,69 +47,69 @@ void threaded_hashmap_create(ThreadedHashMap* hm, int32 count, int32 element_siz
// WARNING: element_size = element size + remaining HashEntry data size
inline
void threaded_hashmap_create(ThreadedHashMap* hm, int32 count, int32 element_size, byte* buf)
void thrd_hashmap_create(ThreadedHashMap* hm, int32 count, int32 element_size, byte* buf)
{
hashmap_create((HashMap *) hm, count, element_size, buf);
pthread_mutex_init(&hm->mutex, NULL);
}
inline
void threaded_hashmap_free(ThreadedHashMap* hm)
void thrd_hashmap_free(ThreadedHashMap* hm)
{
pthread_mutex_destroy(&hm->mutex);
}
inline
void threaded_hashmap_insert(ThreadedHashMap* hm, const char* key, int32 value) {
void thrd_hashmap_insert(ThreadedHashMap* hm, const char* key, int32 value) {
pthread_mutex_lock(&hm->mutex);
hashmap_insert((HashMap *) hm, key, value);
pthread_mutex_unlock(&hm->mutex);
}
inline
void threaded_hashmap_insert(ThreadedHashMap* hm, const char* key, int64 value) {
void thrd_hashmap_insert(ThreadedHashMap* hm, const char* key, int64 value) {
pthread_mutex_lock(&hm->mutex);
hashmap_insert((HashMap *) hm, key, value);
pthread_mutex_unlock(&hm->mutex);
}
inline
void threaded_hashmap_insert(ThreadedHashMap* hm, const char* key, uintptr_t value) {
void thrd_hashmap_insert(ThreadedHashMap* hm, const char* key, uintptr_t value) {
pthread_mutex_lock(&hm->mutex);
hashmap_insert((HashMap *) hm, key, value);
pthread_mutex_unlock(&hm->mutex);
}
inline
void threaded_hashmap_insert(ThreadedHashMap* hm, const char* key, void* value) {
void thrd_hashmap_insert(ThreadedHashMap* hm, const char* key, void* value) {
pthread_mutex_lock(&hm->mutex);
hashmap_insert((HashMap *) hm, key, value);
pthread_mutex_unlock(&hm->mutex);
}
inline
void threaded_hashmap_insert(ThreadedHashMap* hm, const char* key, f32 value) {
void thrd_hashmap_insert(ThreadedHashMap* hm, const char* key, f32 value) {
pthread_mutex_lock(&hm->mutex);
hashmap_insert((HashMap *) hm, key, value);
pthread_mutex_unlock(&hm->mutex);
}
inline
void threaded_hashmap_insert(ThreadedHashMap* hm, const char* key, const char* value) {
void thrd_hashmap_insert(ThreadedHashMap* hm, const char* key, const char* value) {
pthread_mutex_lock(&hm->mutex);
hashmap_insert((HashMap *) hm, key, value);
pthread_mutex_unlock(&hm->mutex);
}
inline
void threaded_hashmap_insert(ThreadedHashMap* hm, const char* key, byte* value) {
void thrd_hashmap_insert(ThreadedHashMap* hm, const char* key, byte* value) {
pthread_mutex_lock(&hm->mutex);
hashmap_insert((HashMap *) hm, key, value);
pthread_mutex_unlock(&hm->mutex);
}
inline
void threaded_hashmap_get_entry(ThreadedHashMap* hm, HashEntry* entry, const char* key) {
void thrd_hashmap_get_entry(ThreadedHashMap* hm, HashEntry* entry, const char* key) {
pthread_mutex_lock(&hm->mutex);
HashEntry* temp = hashmap_get_entry((HashMap *) hm, key);
memcpy(entry, temp, hm->buf.chunk_size);
@ -117,7 +117,7 @@ void threaded_hashmap_get_entry(ThreadedHashMap* hm, HashEntry* entry, const cha
}
inline
void threaded_hashmap_get_entry(ThreadedHashMap* hm, HashEntry* entry, const char* key, uint64 index) {
void thrd_hashmap_get_entry(ThreadedHashMap* hm, HashEntry* entry, const char* key, uint64 index) {
pthread_mutex_lock(&hm->mutex);
HashEntry* temp = hashmap_get_entry((HashMap *) hm, key, index);
memcpy(entry, temp, hm->buf.chunk_size);
@ -125,7 +125,7 @@ void threaded_hashmap_get_entry(ThreadedHashMap* hm, HashEntry* entry, const cha
}
inline
void threaded_hashmap_delete_entry(ThreadedHashMap* hm, const char* key) {
void thrd_hashmap_delete_entry(ThreadedHashMap* hm, const char* key) {
pthread_mutex_lock(&hm->mutex);
hashmap_delete_entry((HashMap *) hm, key);
pthread_mutex_unlock(&hm->mutex);

View File

@ -12,12 +12,14 @@
#include <stdint.h>
#ifdef _MSC_VER
#include <windows.h>
#define PACKED_STRUCT __pragma(pack(push, 1))
#define UNPACKED_STRUCT __pragma(pack(pop))
typedef SSIZE_T ssize_t;
#else
#define PACKED_STRUCT __attribute__((__packed__))
#define UNPACKED_STRUCT
#define UNPACKED_STRUCT ((void) 0)
#endif
#define ARRAY_COUNT(a) (sizeof(a) / sizeof((a)[0]))
@ -61,12 +63,49 @@ typedef intptr_t smm;
#define MIN_INT32 0x80000000
#define MIN_INT64 0x8000000000000000
#define SEC_MILLI 1000
#define MILLI_MICRO 1000
#define SEC_MICRO 1000000
#define MHZ 1000000
#define GHZ 1000000000
#define internal static // only allows local "file" access
#define local_persist static
#define global_persist static
// 3-component byte vector. The anonymous structs inside the union (C11/C++)
// alias the same three bytes as position (.x/.y/.z), color (.r/.g/.b) or the
// raw array v[3].
struct v3_byte {
    union {
        struct {
            byte x, y, z;
        };
        struct {
            byte r, g, b;
        };
        byte v[3];
    };
};

// 4-component byte vector with the same aliasing scheme, plus `val`, which
// views all four bytes as a single uint32.
// NOTE(review): the byte order of `val` relative to r,g,b,a depends on host
// endianness — confirm this is only used for same-machine packing.
struct v4_byte {
    union {
        struct {
            byte x, y, z, w;
        };
        struct {
            byte r, g, b, a;
        };
        union {
            byte v[4];
            uint32 val;
        };
    };
};
struct v2_int32 {
union {
struct {
@ -265,19 +304,19 @@ struct m_f64 {
size_t m, n;
};
#define HALF_FLOAT_SIGN_MASK 0x8000
#define HALF_FLOAT_EXP_MASK 0x7C00
#define HALF_FLOAT_FRAC_MASK 0x03FF
#define HALF_FLOAT_SIGN_MASK 0x8000
#define HALF_FLOAT_EXP_MASK 0x7C00
#define HALF_FLOAT_FRAC_MASK 0x03FF
#define HALF_FLOAT_EXP_SHIFT 10
#define HALF_FLOAT_EXP_BIAS 15
#define HALF_FLOAT_EXP_SHIFT 10
#define HALF_FLOAT_EXP_BIAS 15
#define FLOAT32_SIGN_MASK 0x80000000
#define FLOAT32_EXP_MASK 0x7F800000
#define FLOAT32_FRAC_MASK 0x007FFFFF
#define FLOAT32_SIGN_MASK 0x80000000
#define FLOAT32_EXP_MASK 0x7F800000
#define FLOAT32_FRAC_MASK 0x007FFFFF
#define FLOAT32_EXP_SHIFT 23
#define FLOAT32_EXP_BIAS 127
#define FLOAT32_EXP_SHIFT 23
#define FLOAT32_EXP_BIAS 127
uint16 float_to_f16(float f) {
uint32_t f_bits = *((uint32_t*)&f);

View File

@ -1332,7 +1332,6 @@ void simd_div(const int32* a, f32 b, f32* result, int32 size, int32 steps)
result += steps;
}
} else if (steps == 8) {
// @todo this his how all the functions should be implemented that take in baseic types and output basic types
__m256i a_8;
__m256 af_8;
__m256 b_8 = _mm256_set1_ps(b);

View File

@ -18,46 +18,46 @@
inline __m128i _mm_div_epi32(__m128i a, __m128i b) {
alignas(16) int32_t a_array[4], b_array[4], result[4];
_mm_storeu_si128((__m128i*)a_array, a);
_mm_storeu_si128((__m128i*)b_array, b);
_mm_storeu_si128((__m128i*) a_array, a);
_mm_storeu_si128((__m128i*) b_array, b);
for (int i = 0; i < 4; ++i) {
for (int32 i = 0; i < 4; ++i) {
result[i] = a_array[i] / b_array[i];
}
return _mm_load_si128((__m128i*)result);
return _mm_load_si128((__m128i*) result);
}
inline __m256i _mm256_div_epi32(__m256i a, __m256i b) {
alignas(32) int32_t a_array[8], b_array[8], result[8];
_mm256_storeu_si256((__m256i*)a_array, a);
_mm256_storeu_si256((__m256i*)b_array, b);
_mm256_storeu_si256((__m256i*) a_array, a);
_mm256_storeu_si256((__m256i*) b_array, b);
for (int i = 0; i < 8; ++i) {
for (int32 i = 0; i < 8; ++i) {
result[i] = a_array[i] / b_array[i];
}
return _mm256_load_si256((__m256i*)result);
return _mm256_load_si256((__m256i*) result);
}
inline __m512i _mm512_div_epi32(__m512i a, __m512i b) {
alignas(64) int32_t a_array[16], b_array[16], result[16];
_mm512_storeu_si512((__m512i*)a_array, a);
_mm512_storeu_si512((__m512i*)b_array, b);
_mm512_storeu_si512((__m512i*) a_array, a);
_mm512_storeu_si512((__m512i*) b_array, b);
for (int i = 0; i < 16; ++i) {
for (int32 i = 0; i < 16; ++i) {
result[i] = a_array[i] / b_array[i];
}
return _mm512_load_si512((__m512i*)result);
return _mm512_load_si512((__m512i*) result);
}
inline __m128 _mm_sin_ps(__m128 a) {
alignas(16) f32 a_array[4], result[4];
_mm_storeu_ps(a_array, a);
for (int i = 0; i < 4; ++i) {
for (int32 i = 0; i < 4; ++i) {
result[i] = sinf(a_array[i]);
}
return _mm_load_ps(result);
@ -66,7 +66,7 @@
inline __m128 _mm_cos_ps(__m128 a) {
alignas(16) f32 a_array[4], result[4];
_mm_storeu_ps(a_array, a);
for (int i = 0; i < 4; ++i) {
for (int32 i = 0; i < 4; ++i) {
result[i] = cosf(a_array[i]);
}
return _mm_load_ps(result);
@ -75,7 +75,7 @@
inline __m128 _mm_asin_ps(__m128 a) {
alignas(16) f32 a_array[4], result[4];
_mm_storeu_ps(a_array, a);
for (int i = 0; i < 4; ++i) {
for (int32 i = 0; i < 4; ++i) {
result[i] = asinf(a_array[i]);
}
return _mm_load_ps(result);
@ -84,7 +84,7 @@
inline __m128 _mm_acos_ps(__m128 a) {
alignas(16) f32 a_array[4], result[4];
_mm_storeu_ps(a_array, a);
for (int i = 0; i < 4; ++i) {
for (int32 i = 0; i < 4; ++i) {
result[i] = acosf(a_array[i]);
}
return _mm_load_ps(result);
@ -93,7 +93,7 @@
inline __m256 _mm256_sin_ps(__m256 a) {
alignas(32) f32 a_array[8], result[8];
_mm256_storeu_ps(a_array, a);
for (int i = 0; i < 8; ++i) {
for (int32 i = 0; i < 8; ++i) {
result[i] = sinf(a_array[i]);
}
return _mm256_load_ps(result);
@ -102,7 +102,7 @@
inline __m256 _mm256_cos_ps(__m256 a) {
alignas(32) f32 a_array[8], result[8];
_mm256_storeu_ps(a_array, a);
for (int i = 0; i < 8; ++i) {
for (int32 i = 0; i < 8; ++i) {
result[i] = cosf(a_array[i]);
}
return _mm256_load_ps(result);
@ -111,7 +111,7 @@
inline __m256 _mm256_asin_ps(__m256 a) {
alignas(32) f32 a_array[8], result[8];
_mm256_storeu_ps(a_array, a);
for (int i = 0; i < 8; ++i) {
for (int32 i = 0; i < 8; ++i) {
result[i] = asinf(a_array[i]);
}
return _mm256_load_ps(result);
@ -120,7 +120,7 @@
inline __m256 _mm256_acos_ps(__m256 a) {
alignas(32) f32 a_array[8], result[8];
_mm256_storeu_ps(a_array, a);
for (int i = 0; i < 16; ++i) {
for (int32 i = 0; i < 16; ++i) {
result[i] = acosf(a_array[i]);
}
return _mm256_load_ps(result);
@ -129,7 +129,7 @@
inline __m512 _mm512_sin_ps(__m512 a) {
alignas(64) f32 a_array[8], result[8];
_mm512_storeu_ps(a_array, a);
for (int i = 0; i < 16; ++i) {
for (int32 i = 0; i < 16; ++i) {
result[i] = sinf(a_array[i]);
}
return _mm512_load_ps(result);
@ -138,7 +138,7 @@
inline __m512 _mm512_cos_ps(__m512 a) {
alignas(64) f32 a_array[8], result[8];
_mm512_storeu_ps(a_array, a);
for (int i = 0; i < 16; ++i) {
for (int32 i = 0; i < 16; ++i) {
result[i] = cosf(a_array[i]);
}
return _mm512_load_ps(result);
@ -147,7 +147,7 @@
inline __m512 _mm512_asin_ps(__m512 a) {
alignas(64) f32 a_array[8], result[8];
_mm512_storeu_ps(a_array, a);
for (int i = 0; i < 16; ++i) {
for (int32 i = 0; i < 16; ++i) {
result[i] = asinf(a_array[i]);
}
return _mm512_load_ps(result);
@ -156,7 +156,7 @@
inline __m512 _mm512_acos_ps(__m512 a) {
alignas(64) f32 a_array[16], result[16];
_mm512_storeu_ps(a_array, a);
for (int i = 0; i < 16; ++i) {
for (int32 i = 0; i < 16; ++i) {
result[i] = acosf(a_array[i]);
}
return _mm512_load_ps(result);

View File

@ -13,6 +13,7 @@
#include <stdlib.h>
#include "../stdlib/Types.h"
#include "../memory/ThreadedRingMemory.h"
#if _WIN32
#include "../platform/win32/threading/ThreadDefines.h"
@ -20,14 +21,16 @@
#include "../platform/linux/threading/ThreadDefines.h"
#endif
struct PoolWorker {
ThreadJobFunc func;
void *arg;
volatile int32 state;
PoolWorker *next;
};
typedef void (*ThreadPoolJobFunc)(void*);
typedef PoolWorker ThreadJob;
// A single job slot in the thread pool's work queue.
struct PoolWorker {
    int32 id; // 0 = free/reusable slot; assigned from the pool's id_counter on enqueue. NOTE(review): -1 appears to mark a just-completed job (worker resets -1 -> 0) -- confirm full lifecycle
    volatile int32 state; // set to 2 right before func runs and to 1 right after it returns (see thread_pool_worker)
    void* arg; // job input, interpreted by func
    void* result; // job output written by func; ownership/lifetime not visible here -- verify with callers
    RingMemory ring; // per-job scratch memory, presumably for allocations inside func -- TODO confirm
    ThreadPoolJobFunc func; // job entry point; invoked as func(worker) with this PoolWorker* as the argument
};
struct Worker {
volatile int32 state;

View File

@ -13,158 +13,120 @@
#include <stdlib.h>
#include "../stdlib/Types.h"
#include "../memory/Queue.h"
#include "../memory/BufferMemory.h"
#ifdef _WIN32
#include "../platform/win32/threading/Thread.h"
#include "../platform/win32/threading/Atomic.h"
#elif __linux__
#include "../platform/linux/threading/Thread.h"
#include "../platform/linux/threading/Atomic.h"
#endif
#include "ThreadJob.h"
struct ThreadPool {
ThreadJob *work_first;
ThreadJob *work_last;
// This is not a threaded queue since we want to handle the mutex in here, not in the queue for finer control
Queue work_queue;
pthread_mutex_t work_mutex;
pthread_cond_t work_cond;
pthread_cond_t working_cond;
size_t working_cnt;
size_t thread_cnt;
int32 working_cnt;
int32 thread_cnt;
int32 size;
bool stop;
int32 state;
uint32 id_counter;
};
ThreadJob *thread_pool_work_poll(ThreadPool *pool)
{
if (pool == NULL) {
return NULL;
}
ThreadJob *work = pool->work_first;
if (work == NULL) {
return NULL;
}
if (work->next == NULL) {
pool->work_first = NULL;
pool->work_last = NULL;
} else {
pool->work_first = work->next;
}
return work;
}
static THREAD_RETURN thread_pool_worker(void* arg)
{
ThreadPool *pool = (ThreadPool *) arg;
ThreadJob *work;
ThreadPool* pool = (ThreadPool *) arg;
PoolWorker* work;
while (true) {
pthread_mutex_lock(&pool->work_mutex);
while (pool->work_first == NULL && !pool->stop) {
while (queue_is_empty(&pool->work_queue) && !pool->state) {
pthread_cond_wait(&pool->work_cond, &pool->work_mutex);
}
if (pool->stop) {
if (pool->state == 1) {
pthread_mutex_unlock(&pool->work_mutex);
break;
}
work = thread_pool_work_poll(pool);
++(pool->working_cnt);
work = (PoolWorker *) queue_dequeue_keep(&pool->work_queue, sizeof(PoolWorker), 64);
pthread_mutex_unlock(&pool->work_mutex);
if (work != NULL) {
work->func(work);
if (!work) {
continue;
}
pthread_mutex_lock(&pool->work_mutex);
--(pool->working_cnt);
atomic_increment(&pool->working_cnt);
atomic_set(&work->state, 2);
work->func(work);
atomic_set(&work->state, 1);
if (!pool->stop && pool->working_cnt == 0 && pool->work_first == NULL) {
// Job gets marked after completion -> can be overwritten now
if (atomic_get(&work->id) == -1) {
atomic_set(&work->id, 0);
}
atomic_decrement(&pool->working_cnt);
if (atomic_get(&pool->state) == 0 && atomic_get(&pool->working_cnt) == 0) {
pthread_cond_signal(&pool->working_cond);
}
pthread_mutex_unlock(&pool->work_mutex);
}
--(pool->thread_cnt);
pthread_cond_signal(&pool->working_cond);
pthread_mutex_unlock(&pool->work_mutex);
atomic_decrement(&pool->thread_cnt);
return NULL;
}
ThreadPool *thread_pool_create(size_t num, ThreadPool* pool)
void thread_pool_create(ThreadPool* pool, BufferMemory* buf, int32 thread_count)
{
pthread_t thread;
size_t i;
queue_init(&pool->work_queue, buf, 64, sizeof(PoolWorker), 64);
if (num == 0) {
num = 2;
}
pool->thread_cnt = num;
pool->thread_cnt = thread_count;
// @todo switch from pool mutex and pool cond to threadjob mutex/cond
// thread_pool_wait etc. should just itereate over all mutexes
// thread_pool_wait etc. should just iterate over all mutexes
pthread_mutex_init(&pool->work_mutex, NULL);
pthread_cond_init(&pool->work_cond, NULL);
pthread_cond_init(&pool->working_cond, NULL);
pool->work_first = NULL;
pool->work_last = NULL;
for (i = 0; i < num; ++i) {
pthread_t thread;
for (pool->size = 0; pool->size < thread_count; ++pool->size) {
pthread_create(&thread, NULL, thread_pool_worker, pool);
++(pool->size);
pthread_detach(thread);
}
return pool;
}
void thread_pool_wait(ThreadPool *pool)
void thread_pool_wait(ThreadPool* pool)
{
if (pool == NULL) {
return;
}
pthread_mutex_lock(&pool->work_mutex);
while (true) {
if ((!pool->stop && pool->working_cnt != 0) || (pool->stop && pool->thread_cnt != 0)) {
pthread_cond_wait(&pool->working_cond, &pool->work_mutex);
} else {
break;
}
while ((!pool->state && pool->working_cnt != 0) || (pool->state && pool->thread_cnt != 0)) {
pthread_cond_wait(&pool->working_cond, &pool->work_mutex);
}
pthread_mutex_unlock(&pool->work_mutex);
}
void thread_pool_destroy(ThreadPool *pool)
void thread_pool_destroy(ThreadPool* pool)
{
if (pool == NULL) {
return;
}
// This sets the queue to empty
atomic_set((void **) &pool->work_queue.tail, (void **) &pool->work_queue.head);
pthread_mutex_lock(&pool->work_mutex);
ThreadJob *work = pool->work_first;
// This sets the state to "shutdown"
atomic_set(&pool->state, 1);
while (work != NULL) {
work = work->next;
}
pool->stop = true;
pthread_cond_broadcast(&pool->work_cond);
pthread_mutex_unlock(&pool->work_mutex);
thread_pool_wait(pool);
pthread_mutex_destroy(&pool->work_mutex);
@ -172,25 +134,58 @@ void thread_pool_destroy(ThreadPool *pool)
pthread_cond_destroy(&pool->working_cond);
}
ThreadJob* thread_pool_add_work(ThreadPool *pool, ThreadJob* job)
PoolWorker* thread_pool_add_work(ThreadPool* pool, const PoolWorker* job)
{
if (pool == NULL || job == NULL) {
pthread_mutex_lock(&pool->work_mutex);
PoolWorker* temp_job = (PoolWorker *) ring_get_memory_nomove(&pool->work_queue, sizeof(PoolWorker), 64);
if (atomic_get(&temp_job->id) > 0) {
pthread_mutex_unlock(&pool->work_mutex);
ASSERT_SIMPLE(temp_job->id == 0);
return NULL;
}
pthread_mutex_lock(&pool->work_mutex);
if (pool->work_first == NULL) {
pool->work_first = job;
pool->work_last = pool->work_first;
} else {
pool->work_last->next = job;
pool->work_last = job;
memcpy(temp_job, job, sizeof(PoolWorker));
ring_move_pointer(&pool->work_queue, &pool->work_queue.head, sizeof(PoolWorker), 64);
if (temp_job->id == 0) {
temp_job->id = atomic_add_fetch(&pool->id_counter, 1);
}
pthread_cond_broadcast(&pool->work_cond);
pthread_mutex_unlock(&pool->work_mutex);
return job;
return temp_job;
}
// This is basically the same as thread_pool_add_work but allows us to directly write into the memory in the caller
// This makes it faster, since we can avoid a memcpy
// Begins an in-place enqueue: reserves the next queue slot and returns it so the
// caller can fill the PoolWorker directly, avoiding the memcpy done by
// thread_pool_add_work.
//
// IMPORTANT: on success this returns with work_mutex still HELD; the caller MUST
// finish with thread_pool_add_work_end, which publishes the slot, wakes workers
// and releases the mutex. On failure (slot still occupied by a pending job,
// i.e. id > 0) the mutex is released and NULL is returned.
//
// NOTE(review): the "+1" here makes ids start at 2 and differs from
// thread_pool_add_work, which assigns atomic_add_fetch(...) without the +1 --
// confirm which scheme is intended; atomic_add_fetch presumably returns the
// post-increment value, so the first id would already be 1.
PoolWorker* thread_pool_add_work_start(ThreadPool* pool)
{
    pthread_mutex_lock(&pool->work_mutex);
    PoolWorker* temp_job = (PoolWorker *) queue_enqueue_start(&pool->work_queue, sizeof(PoolWorker), 64);
    // id > 0 means the slot still holds an unfinished job -> queue is effectively full
    if (atomic_get(&temp_job->id) > 0) {
        pthread_mutex_unlock(&pool->work_mutex);
        ASSERT_SIMPLE(temp_job->id == 0);
        return NULL;
    }
    if (temp_job->id == 0) {
        // +1 because otherwise the very first job would be id = 0 which is not a valid id
        temp_job->id = atomic_add_fetch(&pool->id_counter, 1) + 1;
    }
    return temp_job;
}
// Completes an enqueue started with thread_pool_add_work_start: commits the
// reserved slot, wakes any waiting workers, and releases the mutex that
// thread_pool_add_work_start left locked. The statement order matters -- the
// slot must be published before workers are signaled.
void thread_pool_add_work_end(ThreadPool* pool)
{
    queue_enqueue_end(&pool->work_queue, sizeof(PoolWorker), 64);
    pthread_cond_broadcast(&pool->work_cond);
    pthread_mutex_unlock(&pool->work_mutex);
}
#endif

View File

@ -108,9 +108,14 @@ int compare_by_attribute_id(const void* a, const void* b) {
// WARNING: theme needs to have memory already reserved and assigned to data
void theme_from_file_txt(
UIThemeStyle* theme,
byte* data
const char* path,
RingMemory* ring
) {
char* pos = (char *) data;
FileBody file;
file_read(path, &file, ring);
ASSERT_SIMPLE(file.size);
char* pos = (char *) file.content;
// move past the version string
pos += 8;
@ -150,11 +155,11 @@ void theme_from_file_txt(
UIAttributeGroup* temp_group = NULL;
pos = (char *) data;
pos = (char *) file.content;
pos += 8; // move past version
while (*pos != '\0') {
str_skip_empty(&pos);
str_skip_whitespace(&pos);
if (*pos == '\n') {
++pos;
@ -200,7 +205,7 @@ void theme_from_file_txt(
str_copy_move_until(&pos, attribute_name, " :\n", sizeof(" :\n") - 1);
// Skip any white spaces or other delimeters
// Skip any white spaces or other delimeter
str_skip_list(&pos, " \t:", sizeof(" \t:") - 1);
ASSERT_SIMPLE((*pos != '\0' && *pos != '\n'));
@ -394,9 +399,9 @@ void theme_from_file_txt(
// The size of theme->data should be the file size.
// Yes, this means we have a little too much data but not by a lot
void theme_from_file(
UIThemeStyle* theme,
const byte* data
int32 theme_from_data(
const byte* data,
UIThemeStyle* theme
) {
const byte* pos = data;
@ -445,13 +450,15 @@ void theme_from_file(
entry = entry->next;
}
}
return (int32) (pos - data);
}
// Calculates the maximum theme size
// Not every group has all the attributes (most likely only a small subset)
// However, an accurate calculation is probably too slow and not needed most of the time
inline
int64 theme_size(const UIThemeStyle* theme)
int64 theme_data_size(const UIThemeStyle* theme)
{
return hashmap_size(&theme->hash_map)
+ theme->hash_map.buf.count * UI_ATTRIBUTE_TYPE_SIZE * sizeof(UIAttribute);
@ -472,20 +479,11 @@ int64 theme_size(const UIThemeStyle* theme)
// attributes ...
// attributes ...
void theme_to_file(
RingMemory* ring,
const char* path,
const UIThemeStyle* theme
int32 theme_to_data(
const UIThemeStyle* theme,
byte* data
) {
FileBody file;
// Temporary file size for buffer
// @todo This is a bad placeholder, The problem is we don't know how much we actually need without stepping through the elements
// I also don't want to add a size variable to the theme as it is useless in all other cases
file.size = theme_size(theme);
file.content = ring_get_memory(ring, file.size, 64, true);
byte* pos = file.content;
byte* pos = data;
// version
*((int32 *) pos) = SWAP_ENDIAN_LITTLE(theme->version);
@ -497,7 +495,7 @@ void theme_to_file(
// theme data
// Layout: first save the size of the group, then save the individual attributes
for (int32 i = 0; i < theme->hash_map.buf.count; ++i) {
for (uint32 i = 0; i < theme->hash_map.buf.count; ++i) {
if (!theme->hash_map.table[i]) {
continue;
}
@ -530,8 +528,7 @@ void theme_to_file(
}
}
file.size = pos - file.content;
file_write(path, &file);
return (int32) (pos - data);
}
#endif

View File

@ -27,6 +27,9 @@
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
#define OMS_CEIL(x) ((x) == (int)(x) ? (int)(x) : ((x) > 0 ? (int)(x) + 1 : (int)(x)))
// Fast modulo for the case where b is a power of 2 (a % b == a & (b - 1)).
// BUG FIX: b is now fully parenthesized; previously an expression argument such
// as MODULO_2(x, 1 << 3) expanded to (x) & (1 << 3 - 1) == (x) & (1 << 2).
#define MODULO_2(a, b) ((a) & ((b) - 1))
#define SQRT_2 1.4142135623730950488016887242097f
#endif

81
utils/RandomUtils.h Normal file
View File

@ -0,0 +1,81 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_UTILS_RANDOM_H
#define TOS_UTILS_RANDOM_H
#include <stdlib.h>
#include "../stdlib/Types.h"
global_persist uint32 fast_seed;
#define FAST_RAND_MAX 32767
// Fast linear congruential generator using the classic constants 214013/2531011.
// Returns a 15-bit value in [0, FAST_RAND_MAX]. Mutates the global fast_seed,
// so it is NOT thread-safe; seed by assigning fast_seed before first use.
inline
uint32 fast_rand1(void) {
    fast_seed = (214013 * fast_seed + 2531011);
    return (fast_seed >> 16) & 0x7FFF;
}
// xorshift32 PRNG (Marsaglia). Produces the full 32-bit range and keeps its
// state in caller-owned memory, so it is safe to use per-thread.
// NOTE: *state must be seeded non-zero -- zero is a fixed point of xorshift and
// the generator would return 0 forever.
// FIX: marked inline like every other function in this header so that including
// it from multiple translation units does not produce duplicate symbols.
inline
uint32 fast_rand2(uint32* state) {
    uint32 x = *state;
    x ^= x << 13;
    x ^= x >> 17;
    x ^= x << 5;

    *state = x;

    return x;
}
// Returns a pseudo-random f32 in [0.0f, 1.0f] with 15-bit resolution.
// Built on fast_rand1, so it advances the global fast_seed and is NOT thread-safe.
inline
f32 fast_rand_percentage(void) {
    return (f32) fast_rand1() / (f32) FAST_RAND_MAX;
}
/**
 * In-place Fisher-Yates shuffle of the whole array.
 *
 * NOTE(review): the previous description ("picks n random elements from end and
 * stores them in begin") did not match the code -- every element is swapped with
 * a uniformly chosen element at an index <= its own, producing a uniform random
 * permutation. Uses rand(), so the caller controls seeding via srand(); note
 * that rand() % (i + 1) carries a slight modulo bias.
 */
inline
void random_unique(int32* array, int32 size) {
    for (int32 i = size - 1; i > 0; --i) {
        int32 j = rand() % (i + 1);
        int32 temp = array[i];
        array[i] = array[j];
        array[j] = temp;
    }
}
/**
 * Picks a random index, where arr[i] is the relative weight of index i;
 * index i is returned with probability arr[i] / sum(arr).
 *
 * BUG FIX: the previous version broke out of the loop when the cumulative sum
 * was still BELOW the random draw, which inverted the selection (with three
 * equal weights the middle index could never be returned). It also computed
 * rand() % 0 (undefined behavior) when all weights were zero. Now the draw is
 * uniform over [0, prob_sum) and the first index whose cumulative weight
 * exceeds it is returned.
 */
int32 random_weighted_index(const int32* arr, int32 array_count)
{
    uint32 prob_sum = 0;
    for (int32 i = 0; i < array_count; ++i) {
        prob_sum += arr[i];
    }

    // All weights zero (or no elements with weight): keep the old fallback of
    // returning the last index instead of dividing by zero.
    if (prob_sum == 0) {
        return array_count - 1;
    }

    uint32 random_prob = rand() % prob_sum;

    uint32 current_rarity = 0;
    for (int32 i = 0; i < array_count; ++i) {
        current_rarity += arr[i];
        if (random_prob < current_rarity) {
            return i;
        }
    }

    // Unreachable: the cumulative sum reaches prob_sum > random_prob.
    return array_count - 1;
}
#endif

View File

@ -10,6 +10,7 @@
#define TOS_UTILS_STRING_UTILS_H
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
@ -430,7 +431,8 @@ void str_replace(const char* str, const char* __restrict search, const char* __r
memcpy(result_ptr, replace, replace_len);
result_ptr += replace_len;
str = current + search_len;
current += search_len;
str = current;
}
strcpy(result_ptr, str);
@ -709,4 +711,16 @@ void hexstr_to_rgba(v4_f32* rgba, const char* hex)
rgba->a = (f32) (value & 0xFF) / 255.0f;
}
// Writes exactly len characters to output: the characters of input up to its
// terminator, then the pad character for the remainder.
// NOTE: output is NOT null-terminated; the caller must reserve len bytes.
inline constexpr
void str_pad(const char* input, char* output, char pad, size_t len) {
    size_t pos = 0;

    // Copy the source until it ends or the field is full
    while (pos < len && input[pos] != '\0') {
        output[pos] = input[pos];
        ++pos;
    }

    // Fill the rest of the field with the pad character
    while (pos < len) {
        output[pos] = pad;
        ++pos;
    }
}
#endif

View File

@ -10,7 +10,6 @@
#define TOS_UTILS_H
#include <stdlib.h>
#include "../stdlib/Types.h"
struct FileBody {
@ -18,76 +17,11 @@ struct FileBody {
byte* content;
};
global_persist uint32 fast_seed;
#define FAST_RAND_MAX 32767
inline
uint32 fast_rand1(void) {
fast_seed = (214013 * fast_seed + 2531011);
return (fast_seed >> 16) & 0x7FFF;
}
uint32 fast_rand2(uint32* state) {
uint32 x = *state;
x ^= x << 13;
x ^= x >> 17;
x ^= x << 5;
*state = x;
return x;
}
inline
f32 fast_rand_percentage(void) {
return (f32) fast_rand1() / (f32) FAST_RAND_MAX;
}
/**
* Picks n random elements from end and stores them in begin.
*/
inline
void random_unique(int32* array, int32 size) {
for (int32 i = size - 1; i > 0; --i) {
int32 j = rand() % (i + 1);
int32 temp = array[i];
array[i] = array[j];
array[j] = temp;
}
}
/**
* Gets random index based value probability
*/
int random_weighted_index(const int32* arr, int32 array_count)
{
uint32 prob_sum = 0;
for (int32 i = 0; i < array_count; ++i) {
prob_sum += arr[i];
}
uint32 random_prob = rand() % (prob_sum + 1);
uint32 current_rarity = 0;
int32 item_rarity = array_count - 1;
for (int32 i = 0; i < array_count - 1; ++i) {
current_rarity += arr[i];
if (current_rarity < random_prob) {
item_rarity = i;
break;
}
}
return item_rarity;
}
// @question Do we want to make the size comparison a step variable?
bool is_equal_aligned(const byte* region1, const byte* region2, uint64 size)
{
while (size > 4) {
if (*(const int32_t*) region1 != *(const int32_t*) region2) {
if (*(const int32 *) region1 != *(const int32 *) region2) {
return false;
}
@ -108,4 +42,27 @@ bool is_equal_aligned(const byte* region1, const byte* region2, uint64 size)
return true;
}
// @question Do we want to make the size comparison a step variable?
// Returns true if the memory region contains only zero bytes.
// BUG FIX: the byte-wise tail loop now dereferences the pointer; it previously
// tested the pointer itself (region != 0), which is always true, so any region
// whose size is not a multiple of 4 (plus the final 4 bytes, since the fast
// path stops at size > 4) was always reported as non-empty.
// NOTE(review): the 4-byte fast path reads through an int32* -- it assumes the
// region is suitably aligned, same as is_equal_aligned; confirm callers
// guarantee this.
bool is_empty(const byte* region, uint64 size)
{
    while (size > 4) {
        if (*(const int32 *) region != 0) {
            return false;
        }

        region += 4;
        size -= 4;
    }

    for (; size > 0; --size) {
        if (*region != 0) {
            return false;
        }

        ++region;
    }

    return true;
}
#endif