From fe054ebb13cad5054ebc5b6411c691e803100b5b Mon Sep 17 00:00:00 2001
From: Dennis Eichhorn <spl1nes.com@googlemail.com>
Date: Wed, 11 Dec 2024 21:03:47 +0100
Subject: [PATCH] implemented asset archives and more threading. kinda working

---
 asset/AssetArchive.h                  | 162 +++++++++++++++++-----
 asset/AssetManagementSystem.h         |  36 ++++-
 asset/AssetType.h                     |   6 +-
 audio/Audio.cpp                       |  70 +++++++++-
 audio/Audio.h                         |   8 +-
 audio/AudioMixer.h                    | 132 ++++++++++++------
 audio/AudioSetting.h                  |  13 +-
 audio/Wav.h                           |  43 +++---
 camera/Camera.h                       |  38 +++--
 compression/Huffman.h                 |  75 +++++-----
 compression/LZP.h                     |  26 ++--
 compression/RLE.h                     |   6 +-
 font/Font.h                           |  63 ++++-----
 gpuapi/opengl/OpenglUtils.h           |  45 +++---
 gpuapi/opengl/ShaderUtils.h           |  83 +++++++++++
 image/Bitmap.h                        |  12 +-
 image/Image.cpp                       |  90 +++++++++++-
 image/Image.h                         |  14 +-
 image/Png.h                           |   4 +-
 image/Tga.h                           |  10 +-
 localization/Language.h               |  80 +++++++----
 log/Debug.cpp                         |  36 +++--
 log/Debug.h                           |   2 +-
 log/Log.h                             |   4 +-
 math/matrix/MatrixFloat32.h           |   4 +
 memory/Queue.h                        |  47 ++++++-
 memory/RingMemory.h                   |  14 +-
 memory/ThreadedQueue.h                | 126 +++++++++++++----
 memory/ThreadedRingMemory.h           | 163 ++++++++++++++++++++++
 object/Mesh.h                         |  60 ++++----
 platform/linux/FileUtils.cpp          |  64 ++++++++-
 platform/linux/{ => network}/Server.h |  10 +-
 platform/linux/{ => network}/Socket.h |   4 +-
 platform/linux/threading/Atomic.h     | 191 ++++++++++++++++++++++++--
 platform/linux/threading/Thread.h     |   6 +-
 platform/win32/FileUtils.cpp          | 123 +++++++++++------
 platform/win32/Library.h              |   1 +
 platform/win32/SystemInfo.cpp         |   1 +
 platform/win32/Window.h               |  12 +-
 platform/win32/audio/DirectSound.h    |   8 +-
 platform/win32/audio/XAudio2.h        |   2 +-
 platform/win32/input/HidInput.h       |   2 +-
 platform/win32/{ => network}/Client.h |  10 +-
 platform/win32/{ => network}/Server.h |   8 +-
 platform/win32/{ => network}/Socket.h |   4 +-
 platform/win32/threading/Atomic.h     | 187 ++++++++++++++++++++++++-
 platform/win32/threading/Semaphore.h  |   8 ++
 scene/SceneState.h                    |  23 ++++
 stdlib/HashMap.h                      |  66 ++++-----
 stdlib/ThreadedHashMap.h              |  28 ++--
 stdlib/Types.h                        |  61 ++++++--
 stdlib/simd/SIMD_I32.h                |   1 -
 stdlib/simd/SIMD_SVML.h               |  48 +++----
 thread/ThreadJob.h                    |  17 ++-
 thread/ThreadPool.h                   | 185 ++++++++++++-------------
 ui/UITheme.h                          |  47 +++----
 utils/MathUtils.h                     |   3 +
 utils/RandomUtils.h                   |  81 +++++++++++
 utils/StringUtils.h                   |  16 ++-
 utils/Utils.h                         |  93 ++++---------
 60 files changed, 2052 insertions(+), 730 deletions(-)
 create mode 100644 memory/ThreadedRingMemory.h
 rename platform/linux/{ => network}/Server.h (91%)
 rename platform/linux/{ => network}/Socket.h (62%)
 rename platform/win32/{ => network}/Client.h (84%)
 rename platform/win32/{ => network}/Server.h (86%)
 rename platform/win32/{ => network}/Socket.h (62%)
 create mode 100644 scene/SceneState.h
 create mode 100644 utils/RandomUtils.h

diff --git a/asset/AssetArchive.h b/asset/AssetArchive.h
index 8183d5d..ddef886 100644
--- a/asset/AssetArchive.h
+++ b/asset/AssetArchive.h
@@ -16,6 +16,13 @@
 #include "../stdlib/simd/SIMD_I32.h"
 #include "../memory/RingMemory.h"
 #include "../memory/BufferMemory.h"
+#include "../image/Image.cpp"
+#include "../object/Mesh.h"
+#include "../object/Texture.h"
+#include "../audio/Audio.cpp"
+#include "../font/Font.h"
+#include "../localization/Language.h"
+#include "../ui/UITheme.h"
 #include "AssetManagementSystem.h"
 
 #if _WIN32
@@ -25,16 +32,21 @@
     #include "../platform/win32/FileUtils.cpp"
 #endif
 
+#define ASSET_ARCHIVE_VERSION 1
+
 struct AssetArchiveElement {
-    int32 type;
+    uint32 type;
 
-    int32 start;
-    int32 length;
+    uint32 start;
+    uint32 length;
 
-    int32 dependency_start; // actual index for asset_dependencies
-    int32 dependency_count;
+    uint32 dependency_start; // actual index for asset_dependencies
+    uint32 dependency_count;
 };
 
+// It is important to understand that, for performance reasons, the asset addresses are stored in an array.
+// This makes it very fast to access because there is only one indirection.
+// On the other hand we can only find assets by their ID/location and not by name.
 struct AssetArchiveHeader {
     int32 version;
 
@@ -49,7 +61,14 @@ struct AssetArchive {
     AssetArchiveHeader header;
     byte* data; // owner of the data
 
-    FileHandler fd;
+    FileHandle fd;
+    FileHandle fd_async;
+
+    // @performance We still need to implement loading via this memory mapped file handle and then profile it to see if it is faster.
+    // If it is not faster, remove it.
+    MMFHandle mmf;
+
+    int32 asset_type_map[ASSET_TYPE_SIZE];
 };
 
 // Calculates how large the header memory has to be to hold all its information
@@ -91,7 +110,9 @@ void asset_archive_header_load(AssetArchiveHeader* header, byte* data, int32 ste
         steps
     );
 
-    header->asset_dependencies = (int32 *) ((byte *) header->asset_element + header->asset_count * sizeof(AssetArchiveElement));
+    if (header->asset_dependency_count) {
+        header->asset_dependencies = (int32 *) ((byte *) header->asset_element + header->asset_count * sizeof(AssetArchiveElement));
+    }
 
     memcpy(header->asset_dependencies, data, header->asset_dependency_count * sizeof(int32));
     SWAP_ENDIAN_LITTLE_SIMD(
@@ -110,17 +131,22 @@ AssetArchiveElement* asset_archive_element_find(const AssetArchive* archive, int
 
 void asset_archive_load(AssetArchive* archive, const char* path, BufferMemory* buf, RingMemory* ring, int32 steps = 8)
 {
-    // Get file handle
-    archive->fd = file_read_async_handle(path);
+    archive->fd = file_read_handle(path);
     if (!archive->fd) {
         return;
     }
 
+    archive->fd_async = file_read_async_handle(path);
+    if (!archive->fd_async) {
+        return;
+    }
+    archive->mmf = file_mmf_handle(archive->fd_async);
+
     FileBody file;
     file.size = 64;
 
     // Find header size
-    file.content = ring_get_memory(ring, file.size);
+    file.content = ring_get_memory(ring, file.size, 4);
     file_read(archive->fd, &file, 0, file.size);
     file.size = asset_archive_header_size(archive, file.content);
 
@@ -134,33 +160,50 @@ void asset_archive_load(AssetArchive* archive, const char* path, BufferMemory* b
         4
     );
 
+    archive->header.asset_element = (AssetArchiveElement *) archive->data;
+
     // Read entire header
     file.content = ring_get_memory(ring, file.size);
     file_read(archive->fd, &file, 0, file.size);
     asset_archive_header_load(&archive->header, file.content, steps);
 }
 
-// @performance This can probably be done much faster by handling the loading of dependencies faster
-void asset_archive_asset_load(const AssetArchive* archive, int32 id, AssetManagementSystem* ams_array, RingMemory* ring)
+// @question Do we want to allow a callback function?
+// Very often we want to do something with the data (e.g. upload it to the gpu)
+// Maybe we could just accept an int value which we set atomically as a flag that the asset is complete?
+// This way the caller could check much faster whether it can already work with this data.
+// The only problem is that we need to pass the pointer to this int in the thrd_queue since we queue the files to load there
+Asset* asset_archive_asset_load(const AssetArchive* archive, int32 id, AssetManagementSystem* ams_array, RingMemory* ring)
 {
-    AssetArchiveElement* element = &archive->header.asset_element[id];
-    AssetManagementSystem* ams = element->type > 0
-        ? &ams_array[element->type]
-        : &ams_array[0];
+    // @todo add calculation from element->type to ams index
 
-    uint64 hash = hash_djb2((const char *) &id);
+    AssetArchiveElement* element = &archive->header.asset_element[id];
+    AssetManagementSystem* ams = &ams_array[archive->asset_type_map[element->type]];
+
+    // @todo This is a little bit stupid, reconsider
+    char id_str[5];
+    id_str[4] = '\0';
+    *((int32 *) id_str) = id;
+
+    uint64 hash = hash_djb2(id_str);
+
+    Asset* asset;
 
     // @performance I think we could optimize the ams_reserver_asset in a way so we don't have to lock it the entire time
     pthread_mutex_lock(&ams->mutex);
-    // @bug this is not how this function works
-    if (hashmap_get_entry(&ams->hash_map, (const char *) &id, hash)) {
+    // @bug If we have multiple archive files the ids repeat across archives, which the hash map cannot handle
+    // Possible solution: also store a string name for every asset. This would add HASH_MAP_MAX_KEY_LENGTH bytes of data to every asset though (see hash map key size = 32)
+
+    asset = ams_get_asset(ams, id_str, hash);
+    if (asset) {
+        // Asset already loaded
         pthread_mutex_unlock(&ams->mutex);
+
+        return asset;
     }
 
     if (element->type == 0) {
-        // @bug We can't just do this, this won't work. Check if we might want to change the asset management directly to hash indices or at least int values
-        Asset* asset = ams_reserve_asset(ams, (const char *) &id, ams_calculate_chunks(ams, element->length));
-        asset->self = (byte *) (asset + 1);
+        asset = ams_reserve_asset(ams, id_str, ams_calculate_chunks(ams, element->length));
 
         FileBody file = {};
         file.content = asset->self;
@@ -168,34 +211,83 @@ void asset_archive_asset_load(const AssetArchive* archive, int32 id, AssetManage
         // We are directly reading into the correct destination
         file_read(archive->fd, &file, element->start, element->length);
     } else {
+        // @performance In this case we may want to check if memory mapped regions are better.
+        // 1. I don't think they work together with async loading
+        // 2. Profile which one is faster
+        // 3. The big benefit of mmf would be that we can avoid one memcpy and directly load the data into the object
+        // 4. Of course the disadvantage would be to no longer have async loading
+
         // We are reading into temp memory since we have to perform transformations on the data
         FileBodyAsync file = {};
-        file_read_async(archive->fd, &file, element->start, element->length, ring);
+        file_read_async(archive->fd_async, &file, element->start, element->length, ring);
 
         // This happens while the file system loads the data
-        Asset* asset = ams_reserve_asset(ams, (const char *) &id, ams_calculate_chunks(ams, element->length));
-        asset->self = (byte *) (asset + 1);
+        asset = ams_reserve_asset(ams, id_str, ams_calculate_chunks(ams, element->length));
+        asset->is_ram = true;
 
-        byte* data = ring_get_memory(ring, element->length, 64);
-        size_t data_size = 0;
-
-        // @todo create platform wrapper
-        GetOverlappedResult(archive->fd, &file.ov, NULL, true);
+        file_async_wait(archive->fd_async, &file.ov, true);
         switch (element->type) {
-            case 1: {
+            case ASSET_TYPE_IMAGE: {
+                // @todo Do we really want to store textures in the asset management system or only images?
+                // If it is only images then we need to somehow also manage textures
+                Texture* texture = (Texture *) asset->self;
+                texture->image.pixels = (byte *) (texture + 1);
+
+                image_from_data(file.content, &texture->image);
+
+                asset->vram_size = texture->image.pixel_count * image_pixel_size_from_type(texture->image.pixel_type);
+                asset->ram_size = asset->vram_size + sizeof(Texture);
+
+                #if OPENGL
+                    // @bug I think order_rows has the wrong value
+                    if (texture->image.order_rows == IMAGE_ROW_ORDER_TOP_TO_BOTTOM) {
+                        image_flip_vertical(ring, &texture->image);
+                        texture->image.order_rows = IMAGE_ROW_ORDER_BOTTOM_TO_TOP;
+                    }
+                #endif
+            } break;
+            case ASSET_TYPE_AUDIO: {
+                Audio* audio = (Audio *) asset->self;
+                audio->data = (byte *) (audio + 1);
+
+                audio_from_data(file.content, audio);
+            } break;
+            case ASSET_TYPE_OBJ: {
+                Mesh* mesh = (Mesh *) asset->self;
+                mesh->data = (byte *) (mesh + 1);
+
+                mesh_from_data(file.content, mesh);
+            } break;
+            case ASSET_TYPE_LANGUAGE: {
+                Language* language = (Language *) asset->self;
+                language->data = (byte *) (language + 1);
+
+                language_from_data(file.content, language);
+            } break;
+            case ASSET_TYPE_FONT: {
+                Font* font = (Font *) asset->self;
+                font->glyphs = (Glyph *) (font + 1);
+
+                font_from_data(file.content, font);
+            } break;
+            case ASSET_TYPE_THEME: {
+                UIThemeStyle* theme = (UIThemeStyle *) asset->self;
+                theme->data = (byte *) (theme + 1);
+
+                theme_from_data(file.content, theme);
             } break;
             default: {
             }
         }
-
-        memcpy(asset->self, data, data_size);
     }
     pthread_mutex_unlock(&ams->mutex);
 
-    // @performance maybe do in worker threads?
-    for (int32 i = 0; i < element->dependency_count; ++i) {
+    // @performance maybe do in worker threads? This just feels very slow
+    for (uint32 i = 0; i < element->dependency_count; ++i) {
         asset_archive_asset_load(archive, id, ams, ring);
     }
+
+    return asset;
 }
 
 #endif
\ No newline at end of file
diff --git a/asset/AssetManagementSystem.h b/asset/AssetManagementSystem.h
index bc39f91..f251feb 100644
--- a/asset/AssetManagementSystem.h
+++ b/asset/AssetManagementSystem.h
@@ -34,9 +34,11 @@ struct AssetManagementSystem {
     // The indices of asset_memory and asset_data_memory are always linked
 
     // General asset memory
+    // Fixed chunk size of sizeof(Asset)
     ChunkMemory asset_memory;
 
     // Actual asset data
+    // Chunk size defined during initialization
     ChunkMemory asset_data_memory;
 
     // @performance Do we really need the linked list, the ChunkMemory should allow us to do some smart stuff
@@ -44,7 +46,11 @@ struct AssetManagementSystem {
     Asset* last;
 
     // @question do we want to create an extra threaded version? Or a combined one, like we have right now.
+    // @question Do we want to add a mutex to each asset? That way we would not have to lock the entire ams.
     pthread_mutex_t mutex;
+
+    // @bug We probably also need an overhead value.
+    // In some cases we need more data than our normal data (see texture, it contains image + texture)
 };
 
 void ams_create(AssetManagementSystem* ams, BufferMemory* buf, int32 chunk_size, int32 count)
@@ -201,9 +207,9 @@ Asset* ams_get_asset(AssetManagementSystem* ams, const char* key)
 }
 
 inline
-Asset* ams_get_asset(AssetManagementSystem* ams, const char* key, uint64 index)
+Asset* ams_get_asset(AssetManagementSystem* ams, const char* key, uint64 hash)
 {
-    HashEntry* entry = hashmap_get_entry(&ams->hash_map, key, index);
+    HashEntry* entry = hashmap_get_entry(&ams->hash_map, key, hash);
 
     // @bug entry->value seems to be an address outside of any known buffer, how?
     DEBUG_MEMORY_READ(
@@ -215,7 +221,7 @@ Asset* ams_get_asset(AssetManagementSystem* ams, const char* key, uint64 index)
 }
 
 // @performance We could probably avoid locking by adding a atomic flag to indicate if the value is valid
-Asset* threaded_ams_get_asset(AssetManagementSystem* ams, uint64 element) {
+Asset* thrd_ams_get_asset(AssetManagementSystem* ams, uint64 element) {
     pthread_mutex_lock(&ams->mutex);
     Asset* asset = ams_get_asset(ams, element);
     pthread_mutex_unlock(&ams->mutex);
@@ -223,7 +229,7 @@ Asset* threaded_ams_get_asset(AssetManagementSystem* ams, uint64 element) {
     return asset;
 }
 
-Asset* threaded_ams_get_asset(AssetManagementSystem* ams, const char* key) {
+Asset* thrd_ams_get_asset(AssetManagementSystem* ams, const char* key) {
     pthread_mutex_lock(&ams->mutex);
     Asset* asset = ams_get_asset(ams, key);
     pthread_mutex_unlock(&ams->mutex);
@@ -231,9 +237,9 @@ Asset* threaded_ams_get_asset(AssetManagementSystem* ams, const char* key) {
     return asset;
 }
 
-Asset* threaded_ams_get_asset(AssetManagementSystem* ams, const char* key, uint64 index) {
+Asset* thrd_ams_get_asset(AssetManagementSystem* ams, const char* key, uint64 hash) {
     pthread_mutex_lock(&ams->mutex);
-    Asset* asset = ams_get_asset(ams, key, index);
+    Asset* asset = ams_get_asset(ams, key, hash);
     pthread_mutex_unlock(&ams->mutex);
 
     return asset;
@@ -309,4 +315,22 @@ Asset* ams_reserve_asset(AssetManagementSystem* ams, const char* name, uint32 el
     return asset;
 }
 
+Asset* thrd_ams_reserve_asset(AssetManagementSystem* ams, const char* name, uint32 elements = 1) {
+    pthread_mutex_lock(&ams->mutex);
+    Asset* asset = ams_reserve_asset(ams, name, elements);
+    pthread_mutex_unlock(&ams->mutex);
+
+    return asset;
+}
+
+Asset* thrd_ams_reserve_asset_start(AssetManagementSystem* ams, const char* name, uint32 elements = 1) {
+    pthread_mutex_lock(&ams->mutex);
+
+    return ams_reserve_asset(ams, name, elements);
+}
+
+void thrd_ams_reserve_asset_end(AssetManagementSystem* ams) {
+    pthread_mutex_unlock(&ams->mutex);
+}
+
 #endif
\ No newline at end of file
diff --git a/asset/AssetType.h b/asset/AssetType.h
index 163d0e3..40f8beb 100644
--- a/asset/AssetType.h
+++ b/asset/AssetType.h
@@ -12,9 +12,11 @@
 enum AssetType {
     ASSET_TYPE_GENERAL,
     ASSET_TYPE_OBJ,
-    ASSET_TYPE_TEXTURE,
     ASSET_TYPE_AUDIO,
-    ASSET_TYPE_ANIM,
+    ASSET_TYPE_LANGUAGE,
+    ASSET_TYPE_FONT,
+    ASSET_TYPE_THEME,
+    ASSET_TYPE_IMAGE,
     ASSET_TYPE_SIZE
 };
 
diff --git a/audio/Audio.cpp b/audio/Audio.cpp
index 22f00b3..682cea1 100644
--- a/audio/Audio.cpp
+++ b/audio/Audio.cpp
@@ -22,14 +22,80 @@
 #include "AudioSetting.h"
 #include "Wav.h"
 
-void audio_from_file(RingMemory* ring, const char* path, Audio* audio)
+void audio_from_file(Audio* audio, const char* path, RingMemory* ring)
 {
     FileBody file;
     file_read(path, &file, ring);
 
+    ASSERT_SIMPLE(file.size);
+
     if (str_ends_with(path, ".wav")) {
-        wav_audio_generate(&file, audio);
+        wav_from_data(file.content, (uint32) file.size, audio, ring);
     }
 }
 
+int32 audio_data_size(const Audio* audio)
+{
+    return (int32) (audio->size
+        + sizeof(audio->sample_rate)
+        + sizeof(audio->sample_size)
+        + sizeof(audio->channels)
+        + sizeof(audio->bloc_size)
+        + sizeof(audio->byte_per_sec)
+        + sizeof(audio->size)
+    );
+}
+
+int32 audio_from_data(const byte* data, Audio* audio)
+{
+    audio->sample_rate = SWAP_ENDIAN_LITTLE(*((uint16 *) data));
+    data += sizeof(audio->sample_rate);
+
+    audio->sample_size = *data;
+    data += sizeof(audio->sample_size);
+
+    audio->channels = *data;
+    data += sizeof(audio->channels);
+
+    audio->bloc_size = *data;
+    data += sizeof(audio->bloc_size);
+
+    audio->byte_per_sec = SWAP_ENDIAN_LITTLE(*((uint32 *) data));
+    data += sizeof(audio->byte_per_sec);
+
+    audio->size = SWAP_ENDIAN_LITTLE(*((uint32 *) data));
+    data += sizeof(audio->size);
+
+    memcpy(audio->data, data, audio->size);
+    data += audio->size;
+
+    return audio_data_size(audio);
+}
+
+int32 audio_to_data(const Audio* audio, byte* data)
+{
+    *((uint16 *) data) = SWAP_ENDIAN_LITTLE(audio->sample_rate);
+    data += sizeof(audio->sample_rate);
+
+    *data = audio->sample_size;
+    data += sizeof(audio->sample_size);
+
+    *data = audio->channels;
+    data += sizeof(audio->channels);
+
+    *data = audio->bloc_size;
+    data += sizeof(audio->bloc_size);
+
+    *((uint32 *) data) = SWAP_ENDIAN_LITTLE(audio->byte_per_sec);
+    data += sizeof(audio->byte_per_sec);
+
+    *((uint32 *) data) = SWAP_ENDIAN_LITTLE(audio->size);
+    data += sizeof(audio->size);
+
+    memcpy(data, audio->data, audio->size);
+    data += audio->size;
+
+    return audio_data_size(audio);
+}
+
 #endif
diff --git a/audio/Audio.h b/audio/Audio.h
index 1ac446f..8bc764a 100644
--- a/audio/Audio.h
+++ b/audio/Audio.h
@@ -15,19 +15,19 @@
 struct Audio {
     // bits per sample
     // usually 48000 or 44100
-    uint32 sample_rate;
+    uint16 sample_rate;
 
     // bytes per bloc
     // channel count * bit
     // usually 2 * 16 = 4
-    uint32 sample_size;
+    byte sample_size;
 
     // audio channels
     // usually 2
-    uint32 channels;
+    byte channels;
 
     // usually 16 = 2
-    uint32 bloc_size;
+    byte bloc_size;
 
     // sample_rate * sample_size
     uint32 byte_per_sec;
diff --git a/audio/AudioMixer.h b/audio/AudioMixer.h
index 9e8677e..f3e2d4e 100644
--- a/audio/AudioMixer.h
+++ b/audio/AudioMixer.h
@@ -48,6 +48,8 @@ struct AudioInstance {
 
     uint32 audio_size;
     byte* audio_data;
+
+    uint32 sample_index;
 };
 
 struct AudioMixer {
@@ -71,6 +73,7 @@ struct AudioMixer {
     // do we need a condition or semaphore?
 };
 
+// @todo expand AudioLocationSetting so that it also includes audio effects, repeat etc.
 void audio_mixer_add(AudioMixer* mixer, int64 id, Audio* audio, AudioLocationSetting* origin)
 {
     int64 index = chunk_reserve(&mixer->audio_instances, 1);
@@ -90,7 +93,7 @@ void audio_mixer_add(AudioMixer* mixer, int64 id, Audio* audio, AudioLocationSet
 
 void audio_mixer_add_unique(AudioMixer* mixer, int64 id, Audio* audio, AudioLocationSetting* origin)
 {
-    for (int32 i = 0; i < mixer->audio_instances.count; ++i) {
+    for (uint32 i = 0; i < mixer->audio_instances.count; ++i) {
         // @performance We are not really utilizing chunk memory.
         // Maybe a simple array would be better
         // Or we need to use more chunk functions / maybe even create a chunk_iterate() function?
@@ -105,7 +108,7 @@ void audio_mixer_add_unique(AudioMixer* mixer, int64 id, Audio* audio, AudioLoca
 
 void audio_mixer_remove(AudioMixer* mixer, int64 id)
 {
-    for (int32 i = 0; i < mixer->audio_instances.count; ++i) {
+    for (uint32 i = 0; i < mixer->audio_instances.count; ++i) {
         AudioInstance* instance = (AudioInstance *) chunk_get_element(&mixer->audio_instances, i);
         if (instance->id == id) {
             instance->id = 0;
@@ -116,38 +119,38 @@ void audio_mixer_remove(AudioMixer* mixer, int64 id)
     }
 }
 
-void apply_echo(int16* buffer, uint16 buffer_size, f32 delay, f32 feedback, int32 sample_rate) {
+void apply_echo(int16* buffer, uint32 buffer_size, f32 delay, f32 feedback, int32 sample_rate) {
     int32 delay_samples = (int32) (delay * sample_rate);
-    for (int32 i = delay_samples; i < buffer_size; ++i) {
+    for (uint32 i = delay_samples; i < buffer_size; ++i) {
         buffer[i] += (int16) (buffer[i - delay_samples] * feedback);
     }
 }
 
-void apply_reverb(int16* buffer, uint16 buffer_size, f32 intensity) {
+void apply_reverb(int16* buffer, uint32 buffer_size, f32 intensity) {
     intensity *= 0.5f;
-    for (int32 i = 1; i < buffer_size; ++i) {
+    for (uint32 i = 1; i < buffer_size; ++i) {
         buffer[i] += (int16) (buffer[i - 1] * intensity); // Simple reverb with decay
     }
 }
 
-void apply_cave(int16* buffer, uint16 buffer_size, int32 sample_rate) {
+void apply_cave(int16* buffer, uint32 buffer_size, int32 sample_rate) {
     f32 echo_delay = 0.1f; // Echo delay in seconds
     f32 feedback = 0.3f;  // Echo feedback level
     apply_echo(buffer, buffer_size, echo_delay, feedback, sample_rate);
     apply_reverb(buffer, buffer_size, 0.4f); // Add mild reverb
 }
 
-void apply_underwater(int16* buffer, uint16 buffer_size) {
-    for (int32 i = 0; i < buffer_size; ++i) {
+void apply_underwater(int16* buffer, uint32 buffer_size) {
+    for (uint32 i = 0; i < buffer_size; ++i) {
         buffer[i] = (int16) sinf(buffer[i] * 0.5f); // Dampen + distortion
     }
 }
 
-void apply_flanger(int16* buffer, uint16 buffer_size, f32 rate, f32 depth, int32 sample_rate) {
-    int32 delay_samples = (int32) (depth * sample_rate);
+void apply_flanger(int16* buffer, uint32 buffer_size, f32 rate, f32 depth, int32 sample_rate) {
+    f32 delay_samples = depth * sample_rate;
     f32 temp = OMS_TWO_PI * rate / sample_rate;
 
-    for (int32 i = 0; i < buffer_size; ++i) {
+    for (uint32 i = 0; i < buffer_size; ++i) {
         int32 delay = (int32) (delay_samples * (0.5f + 0.5f * sinf(i * temp)));
         if (i >= delay) {
             buffer[i] += (int16) (buffer[i - delay] * 0.5f);
@@ -155,27 +158,27 @@ void apply_flanger(int16* buffer, uint16 buffer_size, f32 rate, f32 depth, int32
     }
 }
 
-void apply_tremolo(int16* buffer, uint16 buffer_size, f32 rate, f32 depth, int32 sample_rate) {
+void apply_tremolo(int16* buffer, uint32 buffer_size, f32 rate, f32 depth, int32 sample_rate) {
     f32 temp = OMS_TWO_PI * rate / sample_rate;
     f32 temp2 = (1.0f - depth) + depth;
 
-    for (int32 i = 0; i < buffer_size; ++i) {
+    for (uint32 i = 0; i < buffer_size; ++i) {
         f32 mod = temp2 * (0.5f + 0.5f * sinf(i * temp));
         buffer[i] = (int16) (buffer[i] * mod);
     }
 }
 
-void apply_distortion(int16* buffer, uint16 buffer_size, f32 gain) {
-    for (int32 i = 0; i < buffer_size; ++i) {
+void apply_distortion(int16* buffer, uint32 buffer_size, f32 gain) {
+    for (uint32 i = 0; i < buffer_size; ++i) {
         buffer[i] = (int16) tanh(buffer[i] * gain);
     }
 }
 
-void apply_chorus(int16* buffer, uint16 buffer_size, f32 rate, f32 depth, int32 sample_rate) {
+void apply_chorus(int16* buffer, uint32 buffer_size, f32 rate, f32 depth, int32 sample_rate) {
     f32 temp = OMS_TWO_PI * rate / sample_rate;
 
     int32 max_delay = (int32) (depth * sample_rate);
-    for (int32 i = 0; i < buffer_size; ++i) {
+    for (uint32 i = 0; i < buffer_size; ++i) {
         int32 delay = (int32) (max_delay * (0.5f + 0.5f * sinf(i * temp)));
         if (i >= delay) {
             buffer[i] += (int16) (buffer[i - delay] * 0.5f);
@@ -183,26 +186,26 @@ void apply_chorus(int16* buffer, uint16 buffer_size, f32 rate, f32 depth, int32
     }
 }
 
-void apply_pitch_shift(int16* buffer, uint16 buffer_size, f32 pitch_factor) {
-    for (int32 i = 0; i < buffer_size; ++i) {
+void apply_pitch_shift(int16* buffer, uint32 buffer_size, f32 pitch_factor) {
+    for (uint32 i = 0; i < buffer_size; ++i) {
         buffer[i] = (int16) (buffer[i] * pitch_factor);
     }
 }
 
-void apply_granular_delay(int16* buffer, uint16 buffer_size, f32 delay, f32 granularity, int32 sample_rate) {
+void apply_granular_delay(int16* buffer, uint32 buffer_size, f32 delay, f32 granularity, int32 sample_rate) {
     int32 delay_samples = (int32) (delay * sample_rate);
     int32 limit = (int32) (granularity * sample_rate);
 
-    for (int32 i = 0; i < buffer_size; ++i) {
+    for (uint32 i = 0; i < buffer_size; ++i) {
         if (i % limit == 0 && i >= delay_samples) {
             buffer[i] += (int16) (buffer[i - delay_samples] * 0.6f);
         }
     }
 }
 
-void apply_frequency_modulation(int16* buffer, uint16 buffer_size, f32 mod_freq, f32 mod_depth, int32 sample_rate) {
+void apply_frequency_modulation(int16* buffer, uint32 buffer_size, f32 mod_freq, f32 mod_depth, int32 sample_rate) {
     f32 temp = OMS_TWO_PI * mod_freq / sample_rate;
-    for (int32 i = 0; i < buffer_size; ++i) {
+    for (uint32 i = 0; i < buffer_size; ++i) {
         buffer[i] = (int16) (buffer[i] * sinf(i * temp) * mod_depth);
     }
 }
@@ -211,20 +214,20 @@ void apply_stereo_panning(int16* buffer, int32 buffer_size, f32 pan) {
     f32 left_gain = 1.0f - pan;
     f32 right_gain = pan;
 
-    for (int32 i = 0; i < buffer_size; ++i) {
+    for (uint32 i = 0; i < buffer_size; ++i) {
         buffer[i] = (int16) (buffer[i] * left_gain);
         buffer[i + 1] = (int16) (buffer[i + 1] * right_gain);
     }
 }
 
-void apply_highpass(int16* buffer, uint16 buffer_size, f32 cutoff, int32 sample_rate) {
+void apply_highpass(int16* buffer, uint32 buffer_size, f32 cutoff, int32 sample_rate) {
     f32 rc = 1.0f / (OMS_TWO_PI * cutoff);
     f32 dt = 1.0f / sample_rate;
     f32 alpha = rc / (rc + dt);
     f32 previous = buffer[0];
     f32 previous_output = buffer[0];
 
-    for (int32 i = 1; i < buffer_size; ++i) {
+    for (uint32 i = 1; i < buffer_size; ++i) {
         f32 current = buffer[i];
         buffer[i] = (int16) (alpha * (previous_output + current - previous));
         previous = current;
@@ -232,53 +235,89 @@ void apply_highpass(int16* buffer, uint16 buffer_size, f32 cutoff, int32 sample_
     }
 }
 
-
-void apply_lowpass(int16* buffer, uint16 buffer_size, f32 cutoff, int32 sample_rate) {
+void apply_lowpass(int16* buffer, uint32 buffer_size, f32 cutoff, int32 sample_rate) {
     f32 rc = 1.0f / (OMS_TWO_PI * cutoff);
     f32 dt = 1.0f / sample_rate;
     f32 alpha = dt / (rc + dt);
     f32 previous = buffer[0];
 
-    for (int32 i = 1; i < buffer_size; ++i) {
+    for (uint32 i = 1; i < buffer_size; ++i) {
         buffer[i] = (int16) (previous + alpha * (buffer[i] - previous));
         previous = buffer[i];
     }
 }
 
-void audio_mixer_mix(AudioMixer *mixer) {
-    uint16 limit = (uint16) (mixer->settings.sample_buffer_size / mixer->settings.sample_size);
+void audio_mixer_mix(AudioMixer* mixer) {
+    uint32 limit = OMS_MIN(
+        mixer->settings.sample_buffer_size / mixer->settings.sample_size,
+        mixer->settings.buffer_size / mixer->settings.sample_size
+    );
 
-    for (int32 i = 0; i < mixer->audio_instances.count; ++i) {
+    bool has_location = !is_empty((byte *) &mixer->camera.audio_location, sizeof(mixer->camera.audio_location));
+
+    f32 volume_scale = mixer->settings.master_volume * mixer->settings.master_volume;
+
+    for (uint32 i = 0; i < mixer->audio_instances.count; ++i) {
         AudioInstance* sound = (AudioInstance *) chunk_get_element(&mixer->audio_instances, i);
         if (sound->id == 0) {
             continue;
         }
 
         // Compute the vector from the player to the sound's origin
-        v3_f32 to_sound;
-        vec3_sub(&to_sound, &sound->origin.audio_location, &mixer->camera.audio_location);
-        f32 distance = vec3_length(&to_sound);
-        f32 distance_attenuation = OMS_MAX(0.0f, 1.0f - (distance / 50.0f));
-        vec3_normalize(&to_sound);
-        f32 alignment = vec3_dot(&mixer->camera.audio_lookat, &to_sound);
-        f32 directional_attenuation = OMS_MAX(0.0f, alignment);
-        f32 total_attenuation = distance_attenuation * directional_attenuation;
+        v3_f32 to_sound = {};
+        f32 total_attenuation = 1.0f;
+        bool has_origin = !is_empty((byte *) &sound->origin.audio_location, sizeof(sound->origin.audio_location));
+
+        if (has_location && has_origin) {
+            vec3_sub(&to_sound, &sound->origin.audio_location, &mixer->camera.audio_location);
+
+            f32 distance = vec3_length(&to_sound);
+            if (distance) {
+                f32 distance_attenuation = OMS_MAX(0.0f, 1.0f - (distance / 50.0f));
+
+                vec3_normalize(&to_sound);
+                f32 alignment = vec3_dot(&mixer->camera.audio_lookat, &to_sound);
+                f32 directional_attenuation = OMS_MAX(0.0f, alignment);
+
+                total_attenuation = distance_attenuation * directional_attenuation;
+            }
+        }
+
+        uint32 sound_sample_count = sound->audio_size / mixer->settings.sample_size;
+        uint32 sound_sample_index = sound->sample_index;
+        int16* audio_data = (int16 *) sound->audio_data;
 
         // Temporary buffer for effects processing
         // @performance If there are situations where only one file exists in the mixer that should be played we could directly write to
         // the output buffer improving the performance. Some of those mixers are: music, cinematic, ui
         // Careful, NOT voice since we will probably manually layer them according to their position?
         for (int32 j = 0; j < limit; ++j) {
-            // @todo if repeat handle here
+            if (sound_sample_index >= sound_sample_count) {
+                // @todo if repeat we need to handle part of it here, else quit
 
-            mixer->buffer_temp[j] = (int16) (sound->audio_data[j * 2] * mixer->settings.master_volume * total_attenuation);
-            mixer->buffer_temp[j + 1] = (int16) (sound->audio_data[j * 2 + 2] * mixer->settings.master_volume * total_attenuation);
+                sound_sample_index = 0;
+
+                // @question why are we doing this?
+                mixer->settings.sample_index = 0;
+            }
+
+            mixer->buffer_temp[j * 2] = (int16) (audio_data[sound_sample_index * 2] * volume_scale * total_attenuation);
+            mixer->buffer_temp[j * 2 + 1] = (int16) (audio_data[sound_sample_index * 2 + 1] * volume_scale * total_attenuation);
+
+            ++sound_sample_index;
 
             // @performance Some adjustments could be made right here the question is if this is faster.
             // Probably depends on how likely the adjustment is to happen.
+
+            // @todo if end of file and no repeat -> remove from list
         }
 
+        // @question We also have to set mixer->settings.sample_index = sound_sample_index.
+        // But that currently happens in the sound api. Do we want to keep it there or move it here?
+
         // Apply effects based on sound's effect type
+        // @performance Depending on how we implement effects we could even pull them out of this loop
+        // What I mean is effects could either be sound file dependent (current location correct) or mixer dependent
         if (mixer->effect) {
             if (mixer->effect & AUDIO_EFFECT_ECHO) {
                 apply_echo(mixer->buffer_temp, limit, 0.2f, 0.4f, mixer->settings.sample_rate);
@@ -337,8 +376,11 @@ void audio_mixer_mix(AudioMixer *mixer) {
             }
         }
 
+        // @bug the actual output "limit" could be smaller if sound files end earlier and no repeat is defined
+        // In that case we would also have to adjust mixer->settings.sample_buffer_size
+
         // Add the processed sound to the output buffer
-        for (int32 j = 0; j < limit; j++) {
+        for (uint32 j = 0; j < limit; j++) {
             mixer->settings.buffer[j] += mixer->buffer_temp[j];
         }
     }
diff --git a/audio/AudioSetting.h b/audio/AudioSetting.h
index ba32693..d1031f6 100644
--- a/audio/AudioSetting.h
+++ b/audio/AudioSetting.h
@@ -20,23 +20,19 @@ struct AudioSetting {
     // WARNING: not the byte position, but the index based on the sample size
     uint32 sample_index;
 
-    // @todo add more settings e.g. repeat etc
-
-    uint32 latency;
-
     f32 master_volume;
 
     // bits per sample
     // usually 48000 or 44100
-    uint32 sample_rate;
+    uint16 sample_rate;
 
     // bytes per bloc
     // channel count * bit
     // usually 2 * 16 = 4
-    uint32 sample_size;
+    byte sample_size;
 
     // how often has the audio_play been called (required for xaudio)
-    uint32 sample_output;
+    byte sample_output;
 
     // max buffer content/size
     uint32 buffer_size;
@@ -47,6 +43,9 @@ struct AudioSetting {
     int16* buffer;
 
     byte type = SOUND_API_DIRECT_SOUND;
+    byte latency;
+
+    // @todo add more settings e.g. repeat etc
 };
 
 struct AudioLocationSetting {
diff --git a/audio/Wav.h b/audio/Wav.h
index f48fe19..fd67c62 100644
--- a/audio/Wav.h
+++ b/audio/Wav.h
@@ -46,29 +46,23 @@ struct WavHeader {
 struct Wav {
     WavHeader header;
 
-    byte* sample_data; // WARNING: This is not the owner of the data. The owner is the FileBody
+    byte* sample_data; // WARNING: This is not the owner of the data.
 
     uint32 size;
-    byte* data; // WARNING: This is not the owner of the data. The owner is the FileBody
+    byte* data; // Data owner
 };
 
-void generate_default_wav_references(const FileBody* file, Wav* wav)
+void generate_default_wav_references(const byte* data, uint32 size, Wav* wav)
 {
-    wav->size = (uint32) file->size;
-    wav->data = file->content;
-
-    if (wav->size < WAV_HEADER_SIZE) {
-        // This shouldn't happen
-        return;
-    }
+    wav->size = size;
+    ASSERT_SIMPLE(size >= WAV_HEADER_SIZE);
 
     // Check if we can copy memory directly
     // The struct layout and header size should match on x86, but we still check it
     if constexpr (sizeof(WavHeader) == WAV_HEADER_SIZE) {
-        memcpy(&wav->header, file->content, WAV_HEADER_SIZE);
+        memcpy(&wav->header, data, WAV_HEADER_SIZE);
 
         // swap endian if we are on big endian system
-        // @question Maybe this needs to be a runtime check?
         #if !_WIN32 && !__LITTLE_ENDIAN
             wav->header.size = SWAP_ENDIAN_LITTLE(wav->header.size);
             wav->header.bloc_size = SWAP_ENDIAN_LITTLE(wav->header.bloc_size);
@@ -121,33 +115,32 @@ void generate_default_wav_references(const FileBody* file, Wav* wav)
         wav->header.bits_per_sample = SWAP_ENDIAN_LITTLE(*((uint16 *) (wav->data + 34)));
 
         // Sample data header
-        wav->header.data_bloc_id[0] = *(wav->data + 36);
-        wav->header.data_bloc_id[1] = *(wav->data + 37);
-        wav->header.data_bloc_id[2] = *(wav->data + 38);
-        wav->header.data_bloc_id[3] = *(wav->data + 39);
+        memcpy(wav->header.data_bloc_id, wav->data + 36, 4);
 
-        wav->header.data_size = SWAP_ENDIAN_LITTLE(*((uint32 *) *(wav->data + 40)));
+        wav->header.data_size = SWAP_ENDIAN_LITTLE(*((uint32 *) (wav->data + WAV_HEADER_SIZE - sizeof(wav->header.data_bloc_id)))); // read the uint32 at this offset; the byte stored there must not be used as an address
     }
 
     wav->sample_data = wav->data + WAV_HEADER_SIZE;
+    memcpy(wav->sample_data, data + WAV_HEADER_SIZE, wav->header.data_size);
 }
 
-void wav_audio_generate(const FileBody* src_data, Audio* audio)
+void wav_from_data(const byte* data, uint32 size, Audio* audio, RingMemory* ring)
 {
     // @performance We are generating the struct and then filling the data.
-    //      There is some asignment/copy overhead
+    //      There is some assignment/copy overhead
     Wav src = {};
-    generate_default_wav_references(src_data, &src);
+    src.data = ring_get_memory(ring, size, 4);
+    generate_default_wav_references(data, size, &src);
 
     if (!src.size) {
         return;
     }
 
-    audio->sample_rate = src.header.frequency;
-    audio->sample_size = (src.header.bits_per_sample / 8) * src.header.nbr_channels;
-    audio->channels = src.header.nbr_channels;
-    audio->byte_per_sec = src.header.byte_per_sec;
-    audio->bloc_size = src.header.bloc_size;
+    audio->sample_rate = (uint16) src.header.frequency;
+    audio->sample_size = (byte) ((src.header.bits_per_sample / 8) * src.header.nbr_channels);
+    audio->channels = (byte) src.header.nbr_channels;
+    audio->byte_per_sec = (uint32) src.header.byte_per_sec;
+    audio->bloc_size = (byte) src.header.bloc_size;
     audio->size = src.header.data_size;
 
     memcpy((void *) audio->data, src.sample_data, audio->size);
diff --git a/camera/Camera.h b/camera/Camera.h
index e102f34..e311e97 100644
--- a/camera/Camera.h
+++ b/camera/Camera.h
@@ -19,8 +19,14 @@
 
 // @todo Please check out if we can switch to quaternions. We tried but failed.
 
+enum CameraStateChanges : byte { // Bit flags; combined into Camera::state_changes via |=
+    CAMERA_STATE_CHANGE_NONE = 0, // No change flagged
+    CAMERA_STATE_CHANGE_NORMAL = 1, // Set by camera_rotate() and camera_movement()
+    CAMERA_STATE_CHANGE_WINDOW = 2, // NOTE(review): presumably a window/viewport change - confirm with the code that sets it
+};
+
 struct Camera {
-    bool is_changed;
+    byte state_changes;
 
     v3_f32 location;
     v4_f32 orientation;
@@ -43,6 +49,8 @@ struct Camera {
     f32 aspect;
 
     f32 view[16];
+    f32 projection[16];
+    f32 orth[16];
 };
 
 void
@@ -64,7 +72,7 @@ camera_update_vectors(Camera* camera)
 
 void camera_rotate(Camera* camera, int32 dx, int32 dy, f32 dt)
 {
-    camera->is_changed = true;
+    camera->state_changes |= CAMERA_STATE_CHANGE_NORMAL;
     camera->orientation.x += dy * camera->sensitivity;
     camera->orientation.y -= dx * camera->sensitivity;
 
@@ -88,7 +96,7 @@ void camera_rotate(Camera* camera, int32 dx, int32 dy, f32 dt)
 // you can have up to 4 camera movement inputs at the same time
 void camera_movement(Camera* camera, CameraMovement* movement, f32 dt, bool relative_to_world = true)
 {
-    camera->is_changed = true;
+    camera->state_changes |= CAMERA_STATE_CHANGE_NORMAL;
     f32 velocity = camera->speed * dt;
 
     if (relative_to_world) {
@@ -214,11 +222,11 @@ void camera_movement(Camera* camera, CameraMovement* movement, f32 dt, bool rela
 }
 
 inline
-void camera_orth_matrix_lh(const Camera* __restrict camera, f32* __restrict orth)
+void camera_orth_matrix_lh(Camera* __restrict camera)
 {
-    mat4_identity_sparse(orth);
+    mat4_identity(camera->orth);
     mat4_ortho_sparse_lh(
-        orth,
+        camera->orth,
         0, camera->viewport_width,
         0, camera->viewport_height,
         camera->znear,
@@ -227,11 +235,11 @@ void camera_orth_matrix_lh(const Camera* __restrict camera, f32* __restrict orth
 }
 
 inline
-void camera_orth_matrix_rh(const Camera* __restrict camera, f32* __restrict orth)
+void camera_orth_matrix_rh(Camera* __restrict camera)
 {
-    mat4_identity_sparse(orth);
+    mat4_identity(camera->orth);
     mat4_ortho_sparse_rh(
-        orth,
+        camera->orth,
         0, camera->viewport_width,
         0, camera->viewport_height,
         camera->znear,
@@ -240,11 +248,11 @@ void camera_orth_matrix_rh(const Camera* __restrict camera, f32* __restrict orth
 }
 
 inline
-void camera_projection_matrix_lh(const Camera* __restrict camera, f32* __restrict projection)
+void camera_projection_matrix_lh(Camera* __restrict camera)
 {
-    mat4_identity_sparse(projection);
+    mat4_identity(camera->projection);
     mat4_perspective_sparse_lh(
-        projection,
+        camera->projection,
         camera->fov,
         camera->aspect,
         camera->znear,
@@ -253,11 +261,11 @@ void camera_projection_matrix_lh(const Camera* __restrict camera, f32* __restric
 }
 
 inline
-void camera_projection_matrix_rh(const Camera* __restrict camera, f32* __restrict projection)
+void camera_projection_matrix_rh(Camera* __restrict camera)
 {
-    mat4_identity_sparse(projection);
+    mat4_identity(camera->projection);
     mat4_perspective_sparse_rh(
-        projection,
+        camera->projection,
         camera->fov,
         camera->aspect,
         camera->znear,
diff --git a/compression/Huffman.h b/compression/Huffman.h
index 6cfd080..1cd764d 100644
--- a/compression/Huffman.h
+++ b/compression/Huffman.h
@@ -14,6 +14,7 @@
 
 #include "../stdlib/Types.h"
 #include "../utils/BitUtils.h"
+#include "../utils/MathUtils.h"
 #include "../utils/EndianUtils.h"
 
 struct HuffmanNode {
@@ -34,31 +35,37 @@ struct Huffman {
     char* code[256];   // Contains a pointer per ASCII character to the huffman code sequence
 };
 
+// We could combine this function with the one below, but that would introduce an if (frequency != 0) check.
+// I would assume the current version is faster since we avoid a branch
+inline
 HuffmanNode* huffman_node_create(Huffman* hf, int32 frequency, byte character, HuffmanNode* left, HuffmanNode* right)
 {
     HuffmanNode* node = hf->pool + hf->node_count++;
-    if (frequency) {
-        node->character = character;
-        node->frequency = frequency;
-    } else {
-        node->left = left;
-        node->right = right;
-        node->frequency = left->frequency + right->frequency;
-    }
+    node->character = character;
+    node->frequency = frequency;
 
     return node;
 }
 
+// Creates an inner node from two children: its frequency is the sum of both child frequencies (character is not used)
+inline
+HuffmanNode* huffman_node_create(Huffman* hf, byte character, HuffmanNode* left, HuffmanNode* right)
+{
+    HuffmanNode* node = hf->pool + hf->node_count++;
+    node->left = left;
+    node->right = right;
+    node->frequency = left->frequency + right->frequency;
+
+    return node;
+}
+
+inline
 void huffman_node_insert(Huffman* hf, HuffmanNode* node)
 {
     int32 child_id;
     int32 parent_id = hf->pq_end++;
 
-    while ((child_id = parent_id / 2)) {
-        if (hf->pq[child_id]->frequency <= node->frequency) {
-            break;
-        }
-
+    while ((child_id = parent_id / 2) && hf->pq[child_id]->frequency > node->frequency) { // move up only while the entry above has a higher frequency
         hf->pq[parent_id] = hf->pq[child_id];
         parent_id = child_id;
     }
@@ -111,13 +118,15 @@ int64 huffman_code_build(Huffman* hf, HuffmanNode* root, char* code, int32 lengt
 void huffman_init(Huffman* hf, const byte* in)
 {
     int32 frequency[256] = {0};
-    char temp_code[16];
     int32 buffer_position = 0;
+    char temp_code[16];
 
     // We artificially force the root element (usually the 0 element) to have the index 1.
     hf->pq = (HuffmanNode **) (hf->priority_queue - 1);
 
-    while (*in) frequency[(byte) *in++]++;
+    while (*in) {
+        ++frequency[(byte) *in++];
+    }
 
     for (int32 i = 0; i < 256; ++i) {
         if (frequency[i]) {
@@ -126,21 +135,20 @@ void huffman_init(Huffman* hf, const byte* in)
     }
 
     while (hf->pq_end > 2) {
-        huffman_node_insert(hf, huffman_node_create(hf, 0, 0, huffman_node_remove(hf), huffman_node_remove(hf)));
+        huffman_node_insert(hf, huffman_node_create(hf, 0, huffman_node_remove(hf), huffman_node_remove(hf)));
     }
 
     huffman_code_build(hf, hf->pq[1], temp_code, 0, hf->buffer, &buffer_position);
 }
 
+inline
 void huffman_dump(const Huffman* hf, byte* out)
 {
-    // dump the char -> code relations as relative indeces
+    // dump the char -> code relations as relative indices
     for (int32 i = 0; i < ARRAY_COUNT(hf->code); ++i) {
-        if (hf->code[i]) {
-            *((int64 *) out) = SWAP_ENDIAN_LITTLE(hf->code[i] - hf->buffer);
-        } else {
-            *((int64 *) out) = SWAP_ENDIAN_LITTLE(-1);
-        }
+        *((int64 *) out) = hf->code[i]
+            ? SWAP_ENDIAN_LITTLE(hf->code[i] - hf->buffer)
+            : SWAP_ENDIAN_LITTLE(-1);
 
         out += sizeof(int64);
     }
@@ -149,6 +157,7 @@ void huffman_dump(const Huffman* hf, byte* out)
     memcpy(out, hf->buffer, sizeof(char) * ARRAY_COUNT(hf->buffer));
 }
 
+inline
 void huffman_load(Huffman* hf, const byte* in)
 {
     // load the char -> code relations and convert relative indices to pointers
@@ -165,6 +174,7 @@ void huffman_load(Huffman* hf, const byte* in)
     memcpy(hf->buffer, in, sizeof(char) * ARRAY_COUNT(hf->buffer));
 }
 
+inline
 int64 huffman_encode(Huffman* hf, const byte* in, byte* out)
 {
     uint64 bit_length = 0;
@@ -180,11 +190,11 @@ int64 huffman_encode(Huffman* hf, const byte* in, byte* out)
 
             ++code;
             ++bit_length;
-            ++pos_bit;
 
-            if (pos_bit > 7) {
+            // Advance to the next bit; wraps around to 0 after bit 7 (pos_bit is only modified once per expression)
+            pos_bit = MODULO_2((pos_bit + 1), 8);
+            if (pos_bit == 0) {
                 ++out;
-                pos_bit = 0;
             }
         }
     }
@@ -192,29 +202,26 @@ int64 huffman_encode(Huffman* hf, const byte* in, byte* out)
     return bit_length;
 }
 
+inline
 int64 huffman_decode(Huffman* hf, const byte* in, byte* out, uint64 bit_length)
 {
     HuffmanNode* current = hf->pq[1];
     int32 pos_bit = 0;
-    int64 out_length = 0;
-
     byte* start = out;
 
     while (pos_bit < bit_length) {
-        if (BITS_GET_8_L2R(*in, pos_bit++, 1)) {
-            current = current->right;
-        } else {
-            current = current->left;
-        }
+        // Branchless child selection: mask is all ones if the bit is set (-> right) and all zeros otherwise (-> left)
+        uintptr_t mask = (uintptr_t) 0 - (uintptr_t) (BITS_GET_8_L2R(*in, pos_bit, 1) != 0);
+        current = (HuffmanNode *) (((uintptr_t) current->left & ~mask) | ((uintptr_t) current->right & mask));
 
         if (current->character) {
             *out++ = current->character;
             current = hf->pq[1];
         }
 
-        if (pos_bit > 7) {
+        pos_bit = MODULO_2((pos_bit + 1), 8); // NOTE(review): pos_bit never exceeds 7, so "pos_bit < bit_length" looks unable to end the loop for bit_length >= 8 - confirm
+        if (pos_bit == 0) {
             ++in;
-            pos_bit = 0;
         }
     }
 
diff --git a/compression/LZP.h b/compression/LZP.h
index 8dc7d4a..bbef1ac 100644
--- a/compression/LZP.h
+++ b/compression/LZP.h
@@ -92,10 +92,8 @@ uint32 lzp_decode(const byte* in, size_t length, byte* out)
             hash = (hash << 4) ^ c;
         }
 
-        if (j > 0) {
-            for (i = 0; i < j; ++i) {
-                out[out_pos++] = buf[i];
-            }
+        for (i = 0; i < j; ++i) {
+            out[out_pos++] = buf[i];
         }
     }
 
@@ -106,13 +104,14 @@ int32 find_longest_match(char *window, int32 window_start, char *buffer, int32 b
     int32 best_length = 0;
     int32 best_offset = 0;
 
-    for (int32 i = window_start; i < 4096   && i < buffer_size; ++i) {
+    for (int32 i = window_start; i < 4096 && i < buffer_size; ++i) {
         int32 length = 0;
 
-        while (length < 18 &&
-               i + length < 4096   &&
-               buffer[length] == window[i + length]) {
-            length++;
+        while (length < 18
+            && i + length < 4096
+            && buffer[length] == window[i + length]
+        ) {
+            ++length;
         }
 
         if (length > best_length) {
@@ -135,7 +134,12 @@ uint32 lzp3_encode(const byte* in, size_t length, byte* out) {
     size_t i = 0;
     while (i < length) {
         int32 match_position = 0;
-        int32 match_length = find_longest_match(window, window_start, (char *)&in[i], (int32) (length - i), &match_position);
+        int32 match_length = find_longest_match(
+            window,
+            window_start,
+            (char *) &in[i], (int32) (length - i),
+            &match_position
+        );
 
         if (match_length > 2) {
             out[out_size++] = 0xFF;
@@ -170,7 +174,7 @@ uint32 lzp3_decode(const byte* in, size_t length, byte* out) {
             int32 match_length = in[i + 2];
 
             for (int32 j = 0; j < match_length; j++) {
-                out[out_size++] = window[(match_position + j) % 4096];
+                out[out_size++] = window[MODULO_2(match_position + j, 4096)];
             }
 
             memmove(window, window + match_length, 4096 - match_length);
diff --git a/compression/RLE.h b/compression/RLE.h
index 1e6b3bc..b261d68 100644
--- a/compression/RLE.h
+++ b/compression/RLE.h
@@ -21,7 +21,7 @@ uint64 rle_encode(const char* in, size_t length, char* out)
     uint64 count;
     uint64 j = 0;
 
-    for (uint64 i = 0; i < length; i++) {
+    for (uint64 i = 0; i < length; ++i) {
         count = 1;
         while (i + 1 < length && in[i] == in[i + 1]) {
             ++count;
@@ -42,7 +42,7 @@ uint64 rle_decode(const char* in, size_t length, char* out)
 {
     uint64 j = 0;
 
-    for (int64 i = 0; i < length; i++) {
+    for (int64 i = 0; i < length; ++i) {
         char current_char = in[i];
         ++i;
 
@@ -53,7 +53,7 @@ uint64 rle_decode(const char* in, size_t length, char* out)
         }
         --i;
 
-        for (int32 k = 0; k < count; k++) {
+        for (int32 k = 0; k < count; ++k) {
             out[j++] = current_char;
         }
     }
diff --git a/font/Font.h b/font/Font.h
index 40d4d3f..bc7e8a6 100644
--- a/font/Font.h
+++ b/font/Font.h
@@ -28,6 +28,7 @@ struct GlyphTextureCoords {
     f32 y2;
 };
 
+#define GLYPH_SIZE 40
 struct Glyph {
     uint32 codepoint;
     GlyphMetrics metrics;
@@ -55,7 +56,7 @@ void font_init(Font* font, byte* data, int count)
 inline
 Glyph* font_glyph_find(Font* font, uint32 codepoint)
 {
-    for (int i = 0; i < font->glyph_count; ++i) {
+    for (uint32 i = 0; i < font->glyph_count; ++i) {
         if (font->glyphs[i].codepoint == codepoint) {
             return &font->glyphs[i];
         }
@@ -66,10 +67,15 @@ Glyph* font_glyph_find(Font* font, uint32 codepoint)
 
 void font_from_file_txt(
     Font* font,
-    byte* data
+    const char* path,
+    RingMemory* ring
 )
 {
-    char* pos = (char *) data;
+    FileBody file;
+    file_read(path, &file, ring);
+    ASSERT_SIMPLE(file.size);
+
+    char* pos = (char *) file.content;
 
     bool start = true;
     char block_name[32];
@@ -147,25 +153,21 @@ void font_from_file_txt(
     }
 }
 
-// Calculates the required size for representing a font definition in memory
 inline
-uint64 font_size_from_file(const byte* data)
+int32 font_data_size(const Font* font)
 {
-    return SWAP_ENDIAN_LITTLE(*((uint32 *) data)) * sizeof(Glyph);
+    ASSERT_SIMPLE_CONST(sizeof(Glyph) == GLYPH_SIZE);
+    return font->glyph_count * sizeof(Glyph)
+        + sizeof(font->glyph_count)
+        + sizeof(font->texture_name)
+        + sizeof(font->size)
+        + sizeof(font->line_height);
 }
 
-inline
-uint64 font_size(const Font* font)
-{
-    // We have to remove the size of the pointer which will not be stored
-    return sizeof(font) - sizeof(Glyph*)
-        + font->glyph_count * sizeof(Glyph);
-}
-
-void font_from_file(
-    Font* font,
+int32 font_from_data(
     const byte* data,
-    int32 size = 8
+    Font* font,
+    int32 steps = 8
 )
 {
     const byte* pos = data;
@@ -190,7 +192,7 @@ void font_from_file(
 
     #if OPENGL
         // @todo Implement y-offset correction
-        for (int32 i = 0; i < font->glyph_count; ++i) {
+        for (uint32 i = 0; i < font->glyph_count; ++i) {
             float temp = font->glyphs[i].coords.y1;
             font->glyphs[i].coords.y1 = 1.0f - font->glyphs[i].coords.y2;
             font->glyphs[i].coords.y2 = 1.0f - temp;
@@ -203,26 +205,17 @@ void font_from_file(
         font->glyph_count * sizeof(Glyph) / 4, // everything in here is 4 bytes -> super easy to swap
         steps
     );
+
+    return font_data_size(font);
 }
 
-inline
-int64 font_size_from_font(Font* font)
-{
-    return font->glyph_count * sizeof(Glyph) + sizeof(Font);
-}
-
-void font_to_file(
-    RingMemory* ring,
-    const char* path,
+int32 font_to_data(
     const Font* font,
+    byte* data,
     int32 steps = 8
 )
 {
-    FileBody file;
-    file.size = font->glyph_count * sizeof(Glyph) + sizeof(Font);
-    file.content = ring_get_memory(ring, file.size, 64);
-
-    byte* pos = file.content;
+    byte* pos = data;
 
     // Glyph count
     *((uint32 *) pos) = font->glyph_count;
@@ -244,16 +237,16 @@ void font_to_file(
     memcpy(pos, font->glyphs, font->glyph_count * sizeof(Glyph));
     pos += font->glyph_count * sizeof(Glyph);
 
-    file.size = pos - file.content;
+    int32 size = (int32) (pos - data);
 
     SWAP_ENDIAN_LITTLE_SIMD(
         (int32 *) file.content,
         (int32 *) file.content,
-        file.size / 4, // everything in here is 4 bytes -> super easy to swap
+        size / 4, // everything in here is 4 bytes -> super easy to swap
         steps
     );
 
-    file_write(path, &file);
+    return font_data_size(font);
 }
 
 #endif
\ No newline at end of file
diff --git a/gpuapi/opengl/OpenglUtils.h b/gpuapi/opengl/OpenglUtils.h
index c4571b6..416e0e4 100644
--- a/gpuapi/opengl/OpenglUtils.h
+++ b/gpuapi/opengl/OpenglUtils.h
@@ -13,6 +13,7 @@
 #include "../../memory/RingMemory.h"
 #include "../../utils/TestUtils.h"
 #include "../../object/Texture.h"
+#include "../../image/Image.cpp"
 #include "../../utils/StringUtils.h"
 #include "../../log/Log.h"
 
@@ -136,6 +137,8 @@ void load_texture_to_gpu(const Texture* texture, int32 mipmap_level = 0)
     if (mipmap_level > -1) {
         glGenerateMipmap(GL_TEXTURE_2D);
     }
+
+    LOG_INCREMENT_BY(DEBUG_COUNTER_GPU_UPLOAD, texture->image.pixel_count * image_pixel_size_from_type(texture->image.pixel_type));
 }
 
 inline
@@ -162,17 +165,19 @@ GLuint shader_make(GLenum type, const char *source, RingMemory* ring)
     GLint status;
     glGetShaderiv(shader, GL_COMPILE_STATUS, &status);
 
-    if (status == GL_FALSE) {
-        GLint length;
-        glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &length);
+    #if DEBUG || INTERNAL
+        if (status == GL_FALSE) {
+            GLint length;
+            glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &length);
 
-        GLchar *info = (GLchar *) ring_get_memory(ring, length * sizeof(GLchar));
+            GLchar *info = (GLchar *) ring_get_memory(ring, length * sizeof(GLchar));
 
-        glGetShaderInfoLog(shader, length, NULL, info);
-        LOG(info, true, true);
+            glGetShaderInfoLog(shader, length, NULL, info);
+            LOG(info, true, true);
 
-        ASSERT_SIMPLE(false);
-    }
+            ASSERT_SIMPLE(false);
+        }
+    #endif
 
     return shader;
 }
@@ -222,17 +227,19 @@ GLuint program_make(
     GLint status;
     glGetProgramiv(program, GL_LINK_STATUS, &status);
 
-    if (status == GL_FALSE) {
-        GLint length;
-        glGetProgramiv(program, GL_INFO_LOG_LENGTH, &length);
+    #if DEBUG || INTERNAL
+        if (status == GL_FALSE) {
+            GLint length;
+            glGetProgramiv(program, GL_INFO_LOG_LENGTH, &length);
 
-        GLchar *info = (GLchar *) ring_get_memory(ring, length * sizeof(GLchar));
+            GLchar *info = (GLchar *) ring_get_memory(ring, length * sizeof(GLchar));
 
-        glGetProgramInfoLog(program, length, NULL, info);
-        LOG(info, true, true);
+            glGetProgramInfoLog(program, length, NULL, info);
+            LOG(info, true, true);
 
-        ASSERT_SIMPLE(false);
-    }
+            ASSERT_SIMPLE(false);
+        }
+    #endif
 
     // @question really?
     if (geometry_shader > -1) {
@@ -442,6 +449,8 @@ uint32 gpuapi_buffer_generate(int32 size, const void* data)
     glBindBuffer(GL_ARRAY_BUFFER, vbo);
     glBufferData(GL_ARRAY_BUFFER, size, data, GL_STATIC_DRAW);
 
+    LOG_INCREMENT_BY(DEBUG_COUNTER_GPU_UPLOAD, size);
+
     return vbo;
 }
 
@@ -454,6 +463,8 @@ uint32 gpuapi_buffer_generate_dynamic(int32 size, const void* data)
     glBindBuffer(GL_ARRAY_BUFFER, vbo);
     glBufferData(GL_ARRAY_BUFFER, size, data, GL_DYNAMIC_DRAW);
 
+    LOG_INCREMENT_BY(DEBUG_COUNTER_GPU_UPLOAD, size);
+
     return vbo;
 }
 
@@ -473,6 +484,8 @@ void gpuapi_buffer_update_dynamic(uint32 vbo, int32 size, const void* data)
 {
     glBindBuffer(GL_ARRAY_BUFFER, vbo);
     glBufferData(GL_ARRAY_BUFFER, size, data, GL_DYNAMIC_DRAW);
+
+    LOG_INCREMENT_BY(DEBUG_COUNTER_GPU_UPLOAD, size);
 }
 
 inline
diff --git a/gpuapi/opengl/ShaderUtils.h b/gpuapi/opengl/ShaderUtils.h
index 642ab81..ca858ef 100644
--- a/gpuapi/opengl/ShaderUtils.h
+++ b/gpuapi/opengl/ShaderUtils.h
@@ -161,4 +161,131 @@ void shader_check_compile_errors(uint32 id, char* log)
     }
 }
 
+// Minifies shader source code: strips comments, indentation, empty lines and the
+// whitespace around operators and punctuation.
+// Preprocessor directives (#version, #define, ...) are kept on a line of their own
+// and string literals are copied verbatim.
+//
+// input:  null-terminated shader source
+// output: receives the null-terminated result. Must not overlap input and needs room for
+//         strlen(input) + 1 bytes (the result never gets longer than the input)
+// return: length of the result without the null terminator (same as strlen)
+int32 shader_program_optimize(const char* input, char* output)
+{
+    const char* read_ptr = input;
+    char* write_ptr = output;
+
+    // Nothing in front of this position may be trimmed (buffer start or the end of a directive line)
+    char* trim_floor = output;
+
+    while (*read_ptr) {
+        // Remove leading whitespace
+        while (*read_ptr == ' ' || *read_ptr == '\t' || is_eol(read_ptr)) {
+            ++read_ptr;
+        }
+
+        // Preprocessor directives must start on their own line and keep their inner whitespace,
+        // e.g. "#define A (1)" must not become "#define A(1)"
+        if (*read_ptr == '#') {
+            if (write_ptr != output && *(write_ptr - 1) != '\n') {
+                *write_ptr++ = '\n';
+            }
+
+            while (*read_ptr && !is_eol(read_ptr)
+                && !(*read_ptr == '/' && (*(read_ptr + 1) == '/' || *(read_ptr + 1) == '*'))
+            ) {
+                *write_ptr++ = *read_ptr++;
+            }
+
+            // Terminate the directive line. The line break/comment that follows is dropped anyway,
+            // so this doesn't make the output longer than the input
+            if (*read_ptr) {
+                *write_ptr++ = '\n';
+                trim_floor = write_ptr;
+            }
+
+            continue;
+        }
+
+        // Keep lines separated unless the previous character already ends a token
+        if (write_ptr != output
+            && *(write_ptr - 1) != '\n' && *(write_ptr - 1) != ';' && *(write_ptr - 1) != '{'
+            && *(write_ptr - 1) != '('
+            && *(write_ptr - 1) != ','
+        ) {
+            *write_ptr++ = '\n';
+        }
+
+        // Handle single-line comments (//)
+        if (*read_ptr == '/' && *(read_ptr + 1) == '/') {
+            // Go to end of line
+            while (*read_ptr && *read_ptr != '\n') {
+                ++read_ptr;
+            }
+
+            continue;
+        }
+
+        // Handle multi-line comments (/* */)
+        if (*read_ptr == '/' && *(read_ptr + 1) == '*') {
+            // Skip the opening marker first so that "/*/" isn't mistaken for a complete comment
+            read_ptr += 2;
+
+            // Go to end of comment
+            while (*read_ptr && (*read_ptr != '*' || *(read_ptr + 1) != '/')) {
+                ++read_ptr;
+            }
+
+            if (*read_ptr == '*' && *(read_ptr + 1) == '/') {
+                read_ptr += 2;
+            }
+
+            continue;
+        }
+
+        // String literals are copied verbatim. They are expected to end on the line they start on,
+        // which is why the state is reset for every line
+        bool in_string = false;
+
+        // Copy valid characters to write_ptr
+        while (*read_ptr && !is_eol(read_ptr)
+            && (in_string || !(*read_ptr == '/' && (*(read_ptr + 1) == '/' || *(read_ptr + 1) == '*')))
+        ) {
+            if (*read_ptr == '"') {
+                in_string = !in_string;
+                *write_ptr++ = *read_ptr++;
+            } else if (!in_string
+                && (*read_ptr == '*' || *read_ptr == '/' || *read_ptr == '=' || *read_ptr == '+' || *read_ptr == '-' || *read_ptr == '%'
+                    || *read_ptr == '(' || *read_ptr == ')'
+                    || *read_ptr == '{' || *read_ptr == '}'
+                    || *read_ptr == ',' || *read_ptr == '?' || *read_ptr == ':' || *read_ptr == ';'
+                    || *read_ptr == '&' || *read_ptr == '|'
+                    || *read_ptr == '>' || *read_ptr == '<'
+                )
+            ) {
+                // Drop one whitespace/line break in front of the operator (never reading before the buffer)
+                if (write_ptr != trim_floor
+                    && (is_whitespace(*(write_ptr - 1)) || *(write_ptr - 1) == '\n')
+                ) {
+                    --write_ptr;
+                }
+
+                *write_ptr++ = *read_ptr++;
+
+                // ... and one whitespace behind it
+                if (*read_ptr && is_whitespace(*read_ptr)) {
+                    ++read_ptr;
+                }
+            } else {
+                *write_ptr++ = *read_ptr++;
+            }
+        }
+    }
+
+    *write_ptr = '\0';
+
+    // write_ptr points at the null terminator, so the terminator isn't counted (same as strlen)
+    return (int32) (write_ptr - output);
+}
+
 #endif
\ No newline at end of file
diff --git a/image/Bitmap.h b/image/Bitmap.h
index b2d7194..1f985b1 100644
--- a/image/Bitmap.h
+++ b/image/Bitmap.h
@@ -271,7 +271,7 @@ void generate_default_bitmap_references(const FileBody* file, Bitmap* bitmap)
 void image_bmp_generate(const FileBody* src_data, Image* image)
 {
     // @performance We are generating the struct and then filling the data.
-    //      There is some asignment/copy overhead
+    //      There is some assignment/copy overhead
     Bitmap src = {};
     generate_default_bitmap_references(src_data, &src);
 
@@ -285,7 +285,13 @@ void image_bmp_generate(const FileBody* src_data, Image* image)
     uint32 pixel_bytes = src.dib_header.bits_per_pixel / 8;
     byte alpha_offset = pixel_bytes > 3;
 
-    image->has_alpha |= (bool) alpha_offset;
+    if (pixel_bytes == 4) {
+        image->pixel_type = (byte) PIXEL_TYPE_RGBA;
+    } else if (pixel_bytes == 3) {
+        image->pixel_type = (byte) PIXEL_TYPE_RGB;
+    } else {
+        ASSERT_SIMPLE(false);
+    }
 
     if (image->order_pixels == IMAGE_PIXEL_ORDER_BGRA
         && image->order_rows == IMAGE_ROW_ORDER_BOTTOM_TO_TOP
@@ -331,7 +337,7 @@ void image_bmp_generate(const FileBody* src_data, Image* image)
             // Add alpha channel at end of every RGB value
             if (alpha_offset > 0) {
                 image->pixels[row_pos1 + x * pixel_bytes + 3] = src.pixels[row_pos2 + x * pixel_bytes + pixel_bytes + 3];
-            } else if (image->has_alpha) {
+            } else if (image->pixel_type == PIXEL_TYPE_RGBA) {
                 image->pixels[row_pos1 + x * pixel_bytes + 3] = 0xFF;
             }
         }
diff --git a/image/Image.cpp b/image/Image.cpp
index 3614359..d0109b6 100644
--- a/image/Image.cpp
+++ b/image/Image.cpp
@@ -23,7 +23,7 @@
 #include "Bitmap.h"
 #include "Png.h"
 
-void image_from_file(RingMemory* ring, const char* path, Image* image)
+void image_from_file(Image* image, const char* path, RingMemory* ring)
 {
     FileBody file;
     file_read(path, &file, ring);
@@ -46,7 +46,7 @@ void image_flip_vertical(RingMemory* ring, Image* image)
     // Last row
     const byte* end = temp + image->pixel_count * sizeof(uint32) - image->width * sizeof(uint32);
 
-    for (int y = 0; y < image->height; ++y) {
+    for (uint32 y = 0; y < image->height; ++y) {
         memcpy(image->pixels + y * stride, end - y * stride, stride);
     }
 
@@ -64,4 +64,90 @@ void image_flip_vertical(RingMemory* ring, Image* image)
     image->order_rows = (byte) (!((bool) image->order_rows));
 }
 
+// Bytes a single pixel of the given PixelType occupies; 0 for unknown types
+inline
+int32 image_pixel_size_from_type(byte type)
+{
+    int32 bytes_per_pixel = 0;
+    switch (type) {
+        case PIXEL_TYPE_MONO:
+            bytes_per_pixel = 1;
+            break;
+        case PIXEL_TYPE_RGB:
+            bytes_per_pixel = 3;
+            break;
+        case PIXEL_TYPE_RGBA:
+        case PIXEL_TYPE_MONO_F:
+            bytes_per_pixel = 4;
+            break;
+        case PIXEL_TYPE_RGB_F:
+            bytes_per_pixel = 12;
+            break;
+        case PIXEL_TYPE_RGBA_F:
+            bytes_per_pixel = 16;
+            break;
+        default:
+            break;
+    }
+    return bytes_per_pixel;
+}
+
+// Reads the image header (width, height, pixel_count as uint32 passed through SWAP_ENDIAN_LITTLE;
+// order_pixels, order_rows, pixel_type as single byte values) and copies the pixel data that follows
+// into image->pixels, which is not allocated here. Returns the number of bytes consumed from data.
+int32 image_from_data(const byte* data, Image* image)
+{
+    const byte* cursor = data;
+
+    image->width = SWAP_ENDIAN_LITTLE(*((uint32 *) cursor));
+    cursor += sizeof(image->width);
+    image->height = SWAP_ENDIAN_LITTLE(*((uint32 *) cursor));
+    cursor += sizeof(image->height);
+    image->pixel_count = SWAP_ENDIAN_LITTLE(*((uint32 *) cursor));
+    cursor += sizeof(image->pixel_count);
+
+    image->order_pixels = *cursor;
+    cursor += sizeof(image->order_pixels);
+    image->order_rows = *cursor;
+    cursor += sizeof(image->order_rows);
+    image->pixel_type = *cursor;
+    cursor += sizeof(image->pixel_type);
+
+    // The amount of pixel data depends on the pixel type
+    int32 pixel_bytes = image_pixel_size_from_type(image->pixel_type) * image->pixel_count;
+    memcpy(image->pixels, cursor, pixel_bytes);
+    cursor += pixel_bytes;
+
+    return (int32) (cursor - data);
+}
+
+// Writes the image header (width, height, pixel_count as uint32 passed through SWAP_ENDIAN_LITTLE;
+// order_pixels, order_rows, pixel_type as single byte values) followed by the pixel data to data.
+// data is not allocated here and has to hold header + pixels. Returns the number of bytes written.
+int32 image_to_data(const Image* image, byte* data)
+{
+    byte* cursor = data;
+
+    *((uint32 *) cursor) = SWAP_ENDIAN_LITTLE(image->width);
+    cursor += sizeof(image->width);
+    *((uint32 *) cursor) = SWAP_ENDIAN_LITTLE(image->height);
+    cursor += sizeof(image->height);
+    *((uint32 *) cursor) = SWAP_ENDIAN_LITTLE(image->pixel_count);
+    cursor += sizeof(image->pixel_count);
+
+    *cursor = image->order_pixels;
+    cursor += sizeof(image->order_pixels);
+    *cursor = image->order_rows;
+    cursor += sizeof(image->order_rows);
+    *cursor = image->pixel_type;
+    cursor += sizeof(image->pixel_type);
+
+    // The amount of pixel data depends on the pixel type
+    int32 pixel_bytes = image_pixel_size_from_type(image->pixel_type) * image->pixel_count;
+    memcpy(cursor, image->pixels, pixel_bytes);
+    cursor += pixel_bytes;
+
+    return (int32) (cursor - data);
+}
+
 #endif
\ No newline at end of file
diff --git a/image/Image.h b/image/Image.h
index e5ffa0b..f17ce7a 100644
--- a/image/Image.h
+++ b/image/Image.h
@@ -17,6 +17,16 @@
 #define IMAGE_ROW_ORDER_TOP_TO_BOTTOM 0
 #define IMAGE_ROW_ORDER_BOTTOM_TO_TOP 1
 
+enum PixelType // NOTE: stored as a byte in Image::pixel_type and serialized as-is by image_to_data, so reordering changes the data format
+{
+    PIXEL_TYPE_RGBA, // 4 bytes per pixel
+    PIXEL_TYPE_RGB, // 3 bytes per pixel
+    PIXEL_TYPE_MONO, // 1 byte per pixel
+    PIXEL_TYPE_RGBA_F, // 16 bytes per pixel
+    PIXEL_TYPE_RGB_F, // 12 bytes per pixel
+    PIXEL_TYPE_MONO_F, // 4 bytes per pixel
+};
+
 // This struct also functions as a setting on how to load the image data
 //      has_alpha is defined it forces an alpha channel even for bitmaps
 //      order_pixels defines how the pixels should be ordered
@@ -27,11 +37,11 @@ struct Image {
     uint32 pixel_count; // @question Do we even need this?
 
     // Image settings
-    bool has_alpha;
     byte order_pixels; // RGBA vs BGRA
     byte order_rows; // top-to-bottom vs bottom-to-top
+    byte pixel_type; // Usually 4 or 3 bytes unless monochrome data
 
-    uint32* pixels; // owner of data
+    byte* pixels; // owner of data
 };
 
 #endif
\ No newline at end of file
diff --git a/image/Png.h b/image/Png.h
index 8736bac..94bbb3c 100644
--- a/image/Png.h
+++ b/image/Png.h
@@ -623,7 +623,7 @@ bool image_png_generate(const FileBody* src_data, Image* image, RingMemory* ring
         // essentially overwriting the **current** chunk header data, which doesn't matter since we already parsed it
         // then we reset the pos pointer backwards to where we want to start... gg
 
-        // https://www.ietf.org/rfc/rfc1951.txt - defalte
+        // https://www.ietf.org/rfc/rfc1951.txt - deflate
         // This data might be stored in the prvious IDAT chunk?!
         BFINAL = (uint8) BITS_GET_8_R2L(*stream.pos, stream.bit_pos, 1);
         bits_walk(&stream, 1);
@@ -783,7 +783,7 @@ bool image_png_generate(const FileBody* src_data, Image* image, RingMemory* ring
     image->width = src.ihdr.width;
     image->height = src.ihdr.height;
     image->pixel_count = image->width * image->height;
-    image->has_alpha = (src.ihdr.color_type == 6);
+    image->pixel_type = (byte) (src.ihdr.color_type == 6 ? PIXEL_TYPE_RGBA : PIXEL_TYPE_RGB);
     image->order_pixels = IMAGE_PIXEL_ORDER_RGBA;
     image->order_rows = IMAGE_ROW_ORDER_TOP_TO_BOTTOM;
 
diff --git a/image/Tga.h b/image/Tga.h
index adddb58..735be6a 100644
--- a/image/Tga.h
+++ b/image/Tga.h
@@ -90,7 +90,13 @@ void image_tga_generate(const FileBody* src_data, Image* image)
     uint32 pixel_bytes = src.header.bits_per_pixel / 8;
     byte alpha_offset = pixel_bytes > 3;
 
-    image->has_alpha |= (bool) alpha_offset;
+    if (pixel_bytes == 4) {
+        image->pixel_type = (byte) PIXEL_TYPE_RGBA;
+    } else if (pixel_bytes == 3) {
+        image->pixel_type = (byte) PIXEL_TYPE_RGB;
+    } else {
+        ASSERT_SIMPLE(false);
+    }
 
     // We can check same settings through equality since we use the same values
     if (image->order_rows == src.header.vertical_ordering
@@ -131,7 +137,7 @@ void image_tga_generate(const FileBody* src_data, Image* image)
             // Add alpha channel at end of every RGB value
             if (alpha_offset > 0) {
                 image->pixels[row_pos1 + x * pixel_bytes + 3] = src.pixels[row_pos2 + x * pixel_bytes + pixel_bytes + 3];
-            } else if (image->has_alpha) {
+            } else if (image->pixel_type == PIXEL_TYPE_RGBA) {
                 image->pixels[row_pos1 + x * pixel_bytes + 3] = 0xFF;
             }
         }
diff --git a/localization/Language.h b/localization/Language.h
index 9208e08..200b4fa 100644
--- a/localization/Language.h
+++ b/localization/Language.h
@@ -10,22 +10,32 @@
     #include "../platform/linux/FileUtils.cpp"
 #endif
 
+#define LANGUAGE_VERSION 1
+
 struct Language {
     // WARNING: the actual start of data is data -= sizeof(count); see file loading below
     byte* data;
 
     int32 count;
+    int64 size;
     char** lang;
 };
 
 void language_from_file_txt(
     Language* language,
-    byte* data
+    const char* path,
+    RingMemory* ring
 ) {
+    FileBody file;
+    file_read(path, &file, ring);
+    ASSERT_SIMPLE(file.size);
+
     // count elements
     language->count = 1;
     int64 len = 0;
 
+    byte* data = file.content;
+
     while (data[len] != '\0') {
         if (data[len] == '\n' && data[len + 1] == '\n') {
             ++language->count;
@@ -36,6 +46,7 @@ void language_from_file_txt(
         ++len;
     }
 
+    language->size = len;
     language->lang = (char **) language->data;
     memcpy(language->data + language->count * sizeof(char *), data, len);
 
@@ -54,22 +65,35 @@ void language_from_file_txt(
     }
 }
 
+int32 language_data_size(const Language* language) // serialized size in bytes, as laid out by language_to_data
+{
+    return (int32) (language->size // raw string data (note: the int64 total is narrowed to int32)
+        + sizeof(language->count) // header: element count
+        + sizeof(language->size) // header: string data size
+        + language->count * sizeof(uint64) // one uint64 offset slot per string
+    );
+}
+
 // File layout - binary
 // offsets for start of strings
 // actual string data
-void language_from_file(
+int32 language_from_data(
+    const byte* data,
     Language* language
 ) {
-    byte* pos = language->data;
+    const byte* pos = data;
 
     // Count
     language->count = SWAP_ENDIAN_LITTLE(*((int32 *) pos));
     pos += sizeof(language->count);
 
-    language->lang = (char **) pos;
+    language->size = SWAP_ENDIAN_LITTLE(*((int32 *) pos)); // only an int32 is read although size is int64; mirrors the int32 store in language_to_data
+    pos += sizeof(language->size); // skips the full width of the size field, not just the int32 that was read
+
+    language->lang = (char **) language->data;
     char** pos_lang = language->lang;
 
-    byte* start = pos;
+    byte* start = language->data;
 
     // Load pointers/offsets
     for (int32 i = 0; i < language->count; ++i) {
@@ -77,28 +101,29 @@ void language_from_file(
         pos += sizeof(uint64);
     }
 
-    // We don't have to load the actual strings, they are already in ->data due to the file reading
+    memcpy(
+        language->data + language->count * sizeof(uint64),
+        pos,
+        language->size
+    );
+
+    return language_data_size(language);
 }
 
-void language_to_file(
-    RingMemory* ring,
-    const char* path,
-    Language* language
+int32 language_to_data(
+    const Language* language,
+    byte* data
 ) {
-    FileBody file;
-
-    // Temporary file size for buffer
-    // @todo This is a bad placeholder, The problem is we don't know how much we actually need without stepping through the elements
-    //      I also don't want to add a size variable to the theme as it is useless in all other cases
-    file.size = MEGABYTE * 32;
-
-    file.content = ring_get_memory(ring, file.size, 64);
-    byte* pos = file.content;
+    byte* pos = data;
 
     // Count
     *((int32 *) pos) = SWAP_ENDIAN_LITTLE(language->count);
     pos += sizeof(language->count);
 
+    // Size of the string data; only an int32 is stored, the remaining bytes of the sizeof(language->size) slot are not written
+    *((int32 *) pos) = SWAP_ENDIAN_LITTLE((int32) language->size);
+    pos += sizeof(language->size);
+
     byte* start = pos;
 
     // Save pointers
@@ -107,19 +132,14 @@ void language_to_file(
         pos += sizeof(uint64);
     }
 
-    int64 len_total = 0;
-
     // Save actual strings
-    int64 len;
-    for (int32 i = 0; i < language->count; ++i) {
-        len = strlen(language->lang[i]);
-        len_total += len;
-        memcpy((char *) pos, language->lang[i], len + 1);
-        pos += len;
-    }
+    memcpy(
+        pos,
+        language->data + language->count * sizeof(uint64),
+        language->size
+    );
 
-    file.size = pos - file.content;
-    file_write(path, &file);
+    return language_data_size(language);
 }
 
 #endif
\ No newline at end of file
diff --git a/log/Debug.cpp b/log/Debug.cpp
index 9742417..4aa3ca9 100644
--- a/log/Debug.cpp
+++ b/log/Debug.cpp
@@ -136,7 +136,12 @@ void update_timing_stat_end_continued(uint32 stat, const char* function)
 inline
 void update_timing_stat_reset(uint32 stat)
 {
-    atomic_set((int32 *) debug_container->perf_stats[stat].function, 0);
+    spinlock_start(&debug_container->perf_stats_spinlock);
+    TimingStat* timing_stat = &debug_container->perf_stats[stat];
+    timing_stat->function = NULL;
+    timing_stat->delta_tick = 0;
+    timing_stat->delta_time = 0;
+    spinlock_end(&debug_container->perf_stats_spinlock);
 }
 
 inline
@@ -146,13 +151,13 @@ void reset_counter(int32 id)
 }
 
 inline
-void log_increment(int32 id, int32 by = 1)
+void log_increment(int32 id, int64 by = 1)
 {
     atomic_add(&debug_container->counter[id], by);
 }
 
 inline
-void log_counter(int32 id, int32 value)
+void log_counter(int32 id, int64 value)
 {
     atomic_set(&debug_container->counter[id], value);
 }
@@ -215,11 +220,13 @@ void debug_memory_log(uint64 start, uint64 size, int32 type, const char* functio
         return;
     }
 
-    if (mem->action_idx == DEBUG_MEMORY_RANGE_MAX) {
-        mem->action_idx = 0;
+    uint64 idx = atomic_add_fetch(&mem->action_idx, 1);
+    if (idx >= ARRAY_COUNT(mem->last_action)) {
+        atomic_set(&mem->action_idx, 1);
+        idx %= ARRAY_COUNT(mem->last_action);
     }
 
-    DebugMemoryRange* dmr = &mem->last_action[mem->action_idx];
+    DebugMemoryRange* dmr = &mem->last_action[idx];
     dmr->type = type;
     dmr->start = start - mem->start;
     dmr->size = size;
@@ -228,8 +235,6 @@ void debug_memory_log(uint64 start, uint64 size, int32 type, const char* functio
     dmr->time = __rdtsc();
     dmr->function_name = function;
 
-    ++mem->action_idx;
-
     if (type < 0 && mem->usage < size * -type) {
         mem->usage = 0;
     } else {
@@ -248,11 +253,13 @@ void debug_memory_reserve(uint64 start, uint64 size, int32 type, const char* fun
         return;
     }
 
-    if (mem->reserve_action_idx == DEBUG_MEMORY_RANGE_MAX) {
-        mem->reserve_action_idx = 0;
+    uint64 idx = atomic_add_fetch(&mem->reserve_action_idx, 1);
+    if (idx >= ARRAY_COUNT(mem->reserve_action)) {
+        atomic_set(&mem->reserve_action_idx, 1);
+        idx %= ARRAY_COUNT(mem->reserve_action); // wrap by the same array that is bounds-checked above and indexed below
+    }
 
-    DebugMemoryRange* dmr = &mem->reserve_action[mem->reserve_action_idx];
+    DebugMemoryRange* dmr = &mem->reserve_action[idx];
     dmr->type = type;
     dmr->start = start - mem->start;
     dmr->size = size;
@@ -260,10 +267,9 @@ void debug_memory_reserve(uint64 start, uint64 size, int32 type, const char* fun
     // We are using rdtsc since it is faster -> less debugging overhead than using time()
     dmr->time = __rdtsc();
     dmr->function_name = function;
-
-    ++mem->reserve_action_idx;
 }
 
+// @bug This probably requires thread safety
 inline
 void debug_memory_reset()
 {
@@ -271,7 +277,8 @@ void debug_memory_reset()
         return;
     }
 
-    uint64 time = __rdtsc() - 1000000000;
+    // We remove debug information that is "older" than 1 * GHZ timestamp-counter (rdtsc) ticks; this is a tick count, not a wall-clock duration
+    uint64 time = __rdtsc() - 1 * GHZ;
 
     for (uint64 i = 0; i < debug_container->dmc.memory_element_idx; ++i) {
         for (int32 j = 0; j < DEBUG_MEMORY_RANGE_MAX; ++j) {
@@ -282,6 +289,7 @@ void debug_memory_reset()
     }
 }
 
+// @bug This probably requires thread safety
 byte* log_get_memory(uint64 size, byte aligned = 1, bool zeroed = false)
 {
     if (!debug_container) {
diff --git a/log/Debug.h b/log/Debug.h
index 8c89c0c..566a23c 100644
--- a/log/Debug.h
+++ b/log/Debug.h
@@ -45,7 +45,7 @@ struct DebugContainer {
     LogMemory log_memory;
 
     // Used to log general int values (e.g. counter for draw calls etc.)
-    int32* counter;
+    int64* counter;
 
     #if _WIN32
         HANDLE log_fp;
diff --git a/log/Log.h b/log/Log.h
index d0a997f..7ca55eb 100644
--- a/log/Log.h
+++ b/log/Log.h
@@ -36,8 +36,8 @@ enum LogDataType {
 void log_to_file();
 void log(const char* str, bool should_log, bool save, const char* file, const char* function, int32 line);
 void log(const char* format, LogDataType data_type, void* data, bool should_log, bool save, const char* file, const char* function, int32 line);
-void log_increment(int32, int32);
-void log_counter(int32, int32);
+void log_increment(int32, int64);
+void log_counter(int32, int64);
 
 #if (LOG_LEVEL == 0)
     // Don't perform any logging at log level 0
diff --git a/math/matrix/MatrixFloat32.h b/math/matrix/MatrixFloat32.h
index cb14c56..4ceff1a 100644
--- a/math/matrix/MatrixFloat32.h
+++ b/math/matrix/MatrixFloat32.h
@@ -23,6 +23,10 @@
 
 // @todo Implement intrinsic versions!
 
+// INFO: I thought we could remove some of the functions. Sometimes we have a function that modifies the original value and also a variant of the same function that fills a new result value.
+//      On gcc the optimized build generates the same assembly whether we return the new value or modify a value through a pointer.
+//      However, on MSVC this is not the case: the pointer version produces more and slower assembly code than the pass-by-value version.
+
 inline
 void vec2_normalize(f32* __restrict x, f32* __restrict y)
 {
diff --git a/memory/Queue.h b/memory/Queue.h
index 531426a..8115c6e 100644
--- a/memory/Queue.h
+++ b/memory/Queue.h
@@ -38,13 +38,30 @@ void queue_free(Queue* queue)
     ring_free(queue);
 }
 
+inline
+bool queue_is_empty(Queue* queue) {
+    return queue->head == queue->tail;
+}
+
+inline
+bool queue_set_empty(Queue* queue) {
+    return queue->head = queue->tail; // NOTE(review): assignment, not comparison; head is moved onto tail and the bool is just "tail != NULL" - confirm the return value is intended
+}
+
+inline
+bool queue_is_full(Queue* queue, uint64 size, byte aligned = 0) {
+    return !ring_commit_safe((RingMemory *) queue, size, aligned);
+}
+
 // Conditional Lock
 inline
-void queue_enqueue(Queue* queue, byte* data, uint64 size, byte aligned = 0)
+byte* queue_enqueue(Queue* queue, byte* data, uint64 size, byte aligned = 0)
 {
     byte* mem = ring_get_memory_nomove(queue, size, aligned);
     memcpy(mem, data, size);
     ring_move_pointer(queue, &queue->head, size, aligned);
+
+    return mem;
 }
 
 inline
@@ -60,10 +77,34 @@ void queue_enqueue_end(Queue* queue, uint64 size, byte aligned = 0)
 }
 
 inline
-byte* queue_dequeue(Queue* queue, byte* data, uint64 size, byte aligned = 0)
+bool queue_dequeue(Queue* queue, byte* data, uint64 size, byte aligned = 0)
 {
-    memcpy(data, queue->tail, size);
+    if (queue->head == queue->tail) {
+        return false;
+    }
+
+    if (size == 4) {
+        *((int32 *) data) = *((int32 *) queue->tail);
+    } else {
+        memcpy(data, queue->tail, size);
+    }
+
     ring_move_pointer(queue, &queue->tail, size, aligned);
+
+    return true;
+}
+
+inline
+byte* queue_dequeue_keep(Queue* queue, uint64 size, byte aligned = 0)
+{
+    if (queue->head == queue->tail) { // same emptiness test as queue_is_empty
+        return NULL;
+    }
+
+    byte* data = queue->tail; // element is handed out in place, no copy is made
+    ring_move_pointer(queue, &queue->tail, size, aligned); // tail moves past the element before the caller reads it - NOTE(review): presumably the slot may be reused by later enqueues; confirm lifetime with callers
+
+    return data;
+}
 
 inline
diff --git a/memory/RingMemory.h b/memory/RingMemory.h
index 5a47056..eae684f 100644
--- a/memory/RingMemory.h
+++ b/memory/RingMemory.h
@@ -58,7 +58,7 @@ void ring_alloc(RingMemory* ring, uint64 size, int32 alignment = 64)
         ? (byte *) platform_alloc(size)
         : (byte *) platform_alloc_aligned(size, alignment);
 
-    ring->end = ring->memory + size;;
+    ring->end = ring->memory + size;
     ring->head = ring->memory;
     ring->tail = ring->memory;
     ring->size = size;
@@ -77,7 +77,7 @@ void ring_init(RingMemory* ring, BufferMemory* buf, uint64 size, int32 alignment
 
     ring->memory = buffer_get_memory(buf, size, alignment, true);
 
-    ring->end = ring->memory + size;;
+    ring->end = ring->memory + size;
     ring->head = ring->memory;
     ring->tail = ring->memory;
     ring->size = size;
@@ -96,7 +96,7 @@ void ring_init(RingMemory* ring, byte* buf, uint64 size, int32 alignment = 64)
     // @bug what if an alignment is defined?
     ring->memory = buf;
 
-    ring->end = ring->memory + size;;
+    ring->end = ring->memory + size;
     ring->head = ring->memory;
     ring->tail = ring->memory;
     ring->size = size;
@@ -110,12 +110,12 @@ void ring_init(RingMemory* ring, byte* buf, uint64 size, int32 alignment = 64)
 }
 
 inline
-void ring_free(RingMemory* buf)
+void ring_free(RingMemory* ring)
 {
-    if (buf->alignment < 2) {
-        platform_free((void **) &buf->memory);
+    if (ring->alignment < 2) {
+        platform_free((void **) &ring->memory);
     } else {
-        platform_aligned_free((void **) &buf->memory);
+        platform_aligned_free((void **) &ring->memory);
     }
 }
 
diff --git a/memory/ThreadedQueue.h b/memory/ThreadedQueue.h
index a4d63a6..b9f38f8 100644
--- a/memory/ThreadedQueue.h
+++ b/memory/ThreadedQueue.h
@@ -6,8 +6,10 @@
  * @version   1.0.0
  * @link      https://jingga.app
  */
-#ifndef TOS_MEMORY_QUEUE_H
-#define TOS_MEMORY_QUEUE_H
+#ifndef TOS_MEMORY_THREADED_QUEUE_H
+#define TOS_MEMORY_THREADED_QUEUE_H
+
+// @todo This is a horrible implementation. Please implement a lock free solution
 
 #include "../stdlib/Types.h"
 #include "../utils/Utils.h"
@@ -47,7 +49,7 @@ struct ThreadedQueue {
 };
 
 inline
-void threaded_queue_alloc(ThreadedQueue* queue, uint32 element_count, uint64 element_size, int32 alignment = 64)
+void thrd_queue_alloc(ThreadedQueue* queue, uint32 element_count, uint64 element_size, int32 alignment = 64)
 {
     ring_alloc((RingMemory *) queue, element_count * element_size, alignment);
 
@@ -59,7 +61,7 @@ void threaded_queue_alloc(ThreadedQueue* queue, uint32 element_count, uint64 ele
 }
 
 inline
-void threaded_queue_init(ThreadedQueue* queue, BufferMemory* buf, uint32 element_count, uint64 element_size, int32 alignment = 64)
+void thrd_queue_init(ThreadedQueue* queue, BufferMemory* buf, uint32 element_count, uint64 element_size, int32 alignment = 64)
 {
     ring_init((RingMemory *) queue, buf, element_count * element_size, alignment);
 
@@ -71,7 +73,7 @@ void threaded_queue_init(ThreadedQueue* queue, BufferMemory* buf, uint32 element
 }
 
 inline
-void threaded_queue_init(ThreadedQueue* queue, byte* buf, uint32 element_count, uint64 element_size, int32 alignment = 64)
+void thrd_queue_init(ThreadedQueue* queue, byte* buf, uint32 element_count, uint64 element_size, int32 alignment = 64)
 {
     ring_init((RingMemory *) queue, buf, element_count * element_size, alignment);
 
@@ -83,7 +85,7 @@ void threaded_queue_init(ThreadedQueue* queue, byte* buf, uint32 element_count,
 }
 
 inline
-void threaded_queue_free(ThreadedQueue* queue)
+void thrd_queue_free(ThreadedQueue* queue)
 {
     ring_free((RingMemory *) queue);
     sem_destroy(&queue->empty);
@@ -92,9 +94,9 @@ void threaded_queue_free(ThreadedQueue* queue)
     pthread_cond_destroy(&queue->cond);
 }
 
-// @todo Create enqueue_unique
+// @todo Create enqueue_unique and enqueue_unique_sem
 inline
-void threaded_queue_enqueue_unique_wait(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
+void thrd_queue_enqueue_unique_wait(ThreadedQueue* queue, const byte* data, uint64 size, byte aligned = 0)
 {
     ASSERT_SIMPLE((uint64_t) data % 4 == 0);
     pthread_mutex_lock(&queue->mutex);
@@ -113,7 +115,7 @@ void threaded_queue_enqueue_unique_wait(ThreadedQueue* queue, byte* data, uint64
         ring_move_pointer((RingMemory *) queue, &tail, size, aligned);
     }
 
-    while (!ring_commit_safe((RingMemory *) queue, size)) {
+    while (!ring_commit_safe((RingMemory *) queue, size, aligned)) {
         pthread_cond_wait(&queue->cond, &queue->mutex);
     }
 
@@ -125,7 +127,7 @@ void threaded_queue_enqueue_unique_wait(ThreadedQueue* queue, byte* data, uint64
 }
 
 inline
-void threaded_queue_enqueue_unique(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
+void thrd_queue_enqueue_unique(ThreadedQueue* queue, const byte* data, uint64 size, byte aligned = 0)
 {
     ASSERT_SIMPLE((uint64_t) data % 4 == 0);
     pthread_mutex_lock(&queue->mutex);
@@ -144,7 +146,7 @@ void threaded_queue_enqueue_unique(ThreadedQueue* queue, byte* data, uint64 size
         ring_move_pointer((RingMemory *) queue, &tail, size, aligned);
     }
 
-    if (!ring_commit_safe((RingMemory *) queue, size)) {
+    if (!ring_commit_safe((RingMemory *) queue, size, aligned)) {
         pthread_mutex_unlock(&queue->mutex);
 
         return;
@@ -159,11 +161,11 @@ void threaded_queue_enqueue_unique(ThreadedQueue* queue, byte* data, uint64 size
 
 // Conditional Lock
 inline
-void threaded_queue_enqueue(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
+void thrd_queue_enqueue(ThreadedQueue* queue, const byte* data, uint64 size, byte aligned = 0)
 {
     pthread_mutex_lock(&queue->mutex);
 
-    if (!ring_commit_safe((RingMemory *) queue, size)) {
+    if (!ring_commit_safe((RingMemory *) queue, size, aligned)) {
         pthread_mutex_unlock(&queue->mutex);
 
         return;
@@ -177,11 +179,11 @@ void threaded_queue_enqueue(ThreadedQueue* queue, byte* data, uint64 size, byte
 }
 
 inline
-void threaded_queue_enqueue_wait(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
+void thrd_queue_enqueue_wait(ThreadedQueue* queue, const byte* data, uint64 size, byte aligned = 0)
 {
     pthread_mutex_lock(&queue->mutex);
 
-    while (!ring_commit_safe((RingMemory *) queue, size)) {
+    while (!ring_commit_safe((RingMemory *) queue, size, aligned)) {
         pthread_cond_wait(&queue->cond, &queue->mutex);
     }
 
@@ -193,7 +195,7 @@ void threaded_queue_enqueue_wait(ThreadedQueue* queue, byte* data, uint64 size,
 }
 
 inline
-byte* threaded_queue_enqueue_start_wait(ThreadedQueue* queue, uint64 size, byte aligned = 0)
+byte* thrd_queue_enqueue_start_wait(ThreadedQueue* queue, uint64 size, byte aligned = 0)
 {
     pthread_mutex_lock(&queue->mutex);
 
@@ -205,33 +207,61 @@ byte* threaded_queue_enqueue_start_wait(ThreadedQueue* queue, uint64 size, byte
 }
 
 inline
-void threaded_queue_enqueue_end_wait(ThreadedQueue* queue)
+void thrd_queue_enqueue_end_wait(ThreadedQueue* queue)
 {
     pthread_cond_signal(&queue->cond);
     pthread_mutex_unlock(&queue->mutex);
 }
 
 inline
-void threaded_queue_dequeue(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
+bool thrd_queue_dequeue(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
 {
-    pthread_mutex_lock(&queue->mutex);
+    if (queue->head == queue->tail) {
+        return false;
+    }
 
+    // we do this twice because the first one is very fast but may return a false positive
+    pthread_mutex_lock(&queue->mutex);
     if (queue->head == queue->tail) {
         pthread_mutex_unlock(&queue->mutex);
 
-        return;
+        return false;
     }
 
-    memcpy(data, queue->tail, size);
+    if (size == 4) {
+        *((int32 *) data) = *((int32 *) queue->tail);
+    } else {
+        memcpy(data, queue->tail, size);
+    }
     ring_move_pointer((RingMemory *) queue, &queue->tail, size, aligned);
 
     pthread_cond_signal(&queue->cond);
     pthread_mutex_unlock(&queue->mutex);
+
+    return true;
+}
+
+inline
+bool thrd_queue_empty(ThreadedQueue* queue) {
+    pthread_mutex_lock(&queue->mutex);
+    bool is_empty = queue->head == queue->tail;
+    pthread_mutex_unlock(&queue->mutex);
+
+    return is_empty;
+}
+
+inline
+bool thrd_queue_full(ThreadedQueue* queue, uint64 size, byte aligned = 0) {
+    pthread_mutex_lock(&queue->mutex);
+    bool is_full = !ring_commit_safe((RingMemory *) queue, size, aligned);
+    pthread_mutex_unlock(&queue->mutex);
+
+    return is_full;
 }
 
 // Waits until a dequeue is available
 inline
-void threaded_queue_dequeue_wait(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
+void thrd_queue_dequeue_wait(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
 {
     pthread_mutex_lock(&queue->mutex);
 
@@ -247,7 +277,7 @@ void threaded_queue_dequeue_wait(ThreadedQueue* queue, byte* data, uint64 size,
 }
 
 inline
-byte* threaded_queue_dequeue_start_wait(ThreadedQueue* queue)
+byte* thrd_queue_dequeue_start_wait(ThreadedQueue* queue)
 {
     pthread_mutex_lock(&queue->mutex);
 
@@ -259,7 +289,7 @@ byte* threaded_queue_dequeue_start_wait(ThreadedQueue* queue)
 }
 
 inline
-void threaded_queue_dequeue_end_wait(ThreadedQueue* queue, uint64 size, byte aligned = 0)
+void thrd_queue_dequeue_end_wait(ThreadedQueue* queue, uint64 size, byte aligned = 0)
 {
     ring_move_pointer((RingMemory *) queue, &queue->tail, size, aligned);
 
@@ -269,7 +299,7 @@ void threaded_queue_dequeue_end_wait(ThreadedQueue* queue, uint64 size, byte ali
 
 // Semaphore Lock
 inline
-void threaded_queue_enqueue_sem_wait(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
+void thrd_queue_enqueue_sem_wait(ThreadedQueue* queue, const byte* data, uint64 size, byte aligned = 0)
 {
     sem_wait(&queue->empty);
     pthread_mutex_lock(&queue->mutex);
@@ -282,7 +312,25 @@ void threaded_queue_enqueue_sem_wait(ThreadedQueue* queue, byte* data, uint64 si
 }
 
 inline
-byte* threaded_queue_enqueue_start_sem_wait(ThreadedQueue* queue, uint64 size, byte aligned = 0)
+bool thrd_queue_enqueue_sem_timedwait(ThreadedQueue* queue, const byte* data, uint64 size, uint64 wait, byte aligned = 0)
+{
+    if (sem_timedwait(&queue->empty, wait)) {
+        return false;
+    }
+
+    pthread_mutex_lock(&queue->mutex);
+
+    byte* mem = ring_get_memory((RingMemory *) queue, size, aligned);
+    memcpy(mem, data, size);
+
+    pthread_mutex_unlock(&queue->mutex);
+    sem_post(&queue->full);
+
+    return true;
+}
+
+inline
+byte* thrd_queue_enqueue_start_sem_wait(ThreadedQueue* queue, uint64 size, byte aligned = 0)
 {
     sem_wait(&queue->empty);
     pthread_mutex_lock(&queue->mutex);
@@ -291,14 +339,14 @@ byte* threaded_queue_enqueue_start_sem_wait(ThreadedQueue* queue, uint64 size, b
 }
 
 inline
-void threaded_queue_enqueue_end_sem_wait(ThreadedQueue* queue)
+void thrd_queue_enqueue_end_sem_wait(ThreadedQueue* queue)
 {
     pthread_mutex_unlock(&queue->mutex);
     sem_post(&queue->full);
 }
 
 inline
-byte* threaded_queue_dequeue_sem_wait(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
+byte* thrd_queue_dequeue_sem_wait(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
 {
     sem_wait(&queue->full);
     pthread_mutex_lock(&queue->mutex);
@@ -311,7 +359,25 @@ byte* threaded_queue_dequeue_sem_wait(ThreadedQueue* queue, byte* data, uint64 s
 }
 
 inline
-byte* threaded_queue_dequeue_start_sem_wait(ThreadedQueue* queue)
+bool thrd_queue_dequeue_sem_timedwait(ThreadedQueue* queue, byte* data, uint64 size, uint64 wait, byte aligned = 0)
+{
+    if (sem_timedwait(&queue->full, wait)) {
+        return false;
+    }
+
+    pthread_mutex_lock(&queue->mutex);
+
+    memcpy(data, queue->tail, size);
+    ring_move_pointer((RingMemory *) queue, &queue->tail, size, aligned);
+
+    pthread_mutex_unlock(&queue->mutex);
+    sem_post(&queue->empty);
+
+    return true;
+}
+
+inline
+byte* thrd_queue_dequeue_start_sem_wait(ThreadedQueue* queue)
 {
     sem_wait(&queue->full);
     pthread_mutex_lock(&queue->mutex);
@@ -320,7 +386,7 @@ byte* threaded_queue_dequeue_start_sem_wait(ThreadedQueue* queue)
 }
 
 inline
-void threaded_queue_dequeue_end_sem_wait(ThreadedQueue* queue, uint64 size, byte aligned = 0)
+void thrd_queue_dequeue_end_sem_wait(ThreadedQueue* queue, uint64 size, byte aligned = 0)
 {
     ring_move_pointer((RingMemory *) queue, &queue->tail, size, aligned);
 
diff --git a/memory/ThreadedRingMemory.h b/memory/ThreadedRingMemory.h
new file mode 100644
index 0000000..b9faa6e
--- /dev/null
+++ b/memory/ThreadedRingMemory.h
@@ -0,0 +1,163 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef TOS_MEMORY_THREADED_RING_MEMORY_H
+#define TOS_MEMORY_THREADED_RING_MEMORY_H
+
+#include "RingMemory.h"
+
+#if _WIN32
+    #include "../platform/win32/threading/Thread.h"
+#elif __linux__
+    #include "../platform/linux/threading/Thread.h"
+#endif
+
+// @todo This is a horrible implementation. Please implement a lock free solution
+
+struct ThreadedRingMemory {
+    byte* memory;
+    byte* end;
+
+    byte* head;
+
+    // This variable is usually only used by single producer/consumer code mostly found in threads.
+    // One thread inserts elements -> updates head
+    // The other thread reads elements -> updates tail
+    // This code itself doesn't change this variable
+    byte* tail;
+
+    uint64 size;
+    int32 alignment;
+    int32 element_alignment;
+
+    pthread_mutex_t mutex;
+};
+
+// @bug alignment should also include the end point, not just the start
+
+inline
+void thrd_ring_alloc(ThreadedRingMemory* ring, uint64 size, int32 alignment = 64)
+{
+    ring_alloc((RingMemory *) ring, size, alignment);
+    pthread_mutex_init(&ring->mutex, NULL);
+}
+
+inline
+void thrd_ring_init(ThreadedRingMemory* ring, BufferMemory* buf, uint64 size, int32 alignment = 64)
+{
+    ring_init((RingMemory *) ring, buf, size, alignment);
+    pthread_mutex_init(&ring->mutex, NULL);
+}
+
+inline
+void thrd_ring_init(ThreadedRingMemory* ring, byte* buf, uint64 size, int32 alignment = 64)
+{
+    ring_init((RingMemory *) ring, buf, size, alignment);
+    pthread_mutex_init(&ring->mutex, NULL);
+}
+
+inline
+void thrd_ring_free(ThreadedRingMemory* ring)
+{
+    ring_free((RingMemory *) ring);
+    pthread_mutex_destroy(&ring->mutex);
+}
+
+inline
+byte* thrd_ring_calculate_position(ThreadedRingMemory* ring, uint64 size, byte aligned = 0)
+{
+    pthread_mutex_lock(&ring->mutex);
+    byte* result = ring_calculate_position((RingMemory *) ring, size, aligned);
+    pthread_mutex_unlock(&ring->mutex);
+
+    return result;
+}
+
+inline
+void thrd_ring_reset(ThreadedRingMemory* ring)
+{
+    pthread_mutex_lock(&ring->mutex);
+    ring_reset((RingMemory *) ring);
+    pthread_mutex_unlock(&ring->mutex);
+}
+
+// Moves a pointer based on the size you want to consume (new position = after consuming size); the move runs while holding ring->mutex
+inline void thrd_ring_move_pointer(ThreadedRingMemory* ring, byte** pos, uint64 size, byte aligned = 0)
+{
+    pthread_mutex_lock(&ring->mutex);
+    ring_move_pointer((RingMemory *) ring, pos, size, aligned);
+    pthread_mutex_unlock(&ring->mutex);
+}
+
+inline byte* thrd_ring_get_memory(ThreadedRingMemory* ring, uint64 size, byte aligned = 0, bool zeroed = false) // ring_get_memory executed while holding ring->mutex
+{
+    pthread_mutex_lock(&ring->mutex);
+    byte* result = ring_get_memory((RingMemory *) ring, size, aligned, zeroed);
+    pthread_mutex_unlock(&ring->mutex);
+
+    return result; // the mutex is already released here; it only covers the ring_get_memory call itself
+}
+
+// Same as thrd_ring_get_memory but DOESN'T move the head
+inline byte* thrd_ring_get_memory_nomove(ThreadedRingMemory* ring, uint64 size, byte aligned = 0, bool zeroed = false)
+{
+    pthread_mutex_lock(&ring->mutex);
+    byte* result = ring_get_memory_nomove((RingMemory *) ring, size, aligned, zeroed);
+    pthread_mutex_unlock(&ring->mutex);
+
+    return result; // the mutex is already released here; it only covers the ring_get_memory_nomove call itself
+}
+
+// Used if the ring only contains elements of a certain size
+// This way you can get a certain element
+inline
+byte* thrd_ring_get_element(ThreadedRingMemory* ring, uint64 element_count, uint64 element, uint64 size)
+{
+    pthread_mutex_lock(&ring->mutex);
+    byte* result = ring_get_element((RingMemory *) ring, element_count, element, size);
+    pthread_mutex_unlock(&ring->mutex);
+
+    return result;
+}
+
+/**
+ * Checks if one additional element can be inserted without overwriting the tail index
+ */
+inline
+bool thrd_ring_commit_safe(ThreadedRingMemory* ring, uint64 size, byte aligned = 0)
+{
+    pthread_mutex_lock(&ring->mutex);
+    bool result = ring_commit_safe((RingMemory *) ring, size, aligned);
+    pthread_mutex_unlock(&ring->mutex);
+
+    return result;
+}
+
+inline
+void thrd_ring_force_head_update(const ThreadedRingMemory* ring)
+{
+    _mm_clflush(ring->head); // NOTE(review): flushes the cache line at the address head points to (ring data), not the line that stores the head field itself - confirm intent
+}
+
+inline
+void thrd_ring_force_tail_update(const ThreadedRingMemory* ring)
+{
+    _mm_clflush(ring->tail); // NOTE(review): flushes the cache line at the address tail points to (ring data), not the line that stores the tail field itself - confirm intent
+}
+
+inline
+int64 thrd_ring_dump(ThreadedRingMemory* ring, byte* data)
+{
+    pthread_mutex_lock(&ring->mutex);
+    int64 result = ring_dump((RingMemory *) ring, data);
+    pthread_mutex_unlock(&ring->mutex);
+
+    return result;
+}
+
+#endif
\ No newline at end of file
diff --git a/object/Mesh.h b/object/Mesh.h
index e7a62a5..490a2b9 100644
--- a/object/Mesh.h
+++ b/object/Mesh.h
@@ -29,7 +29,7 @@
 //      maybe make a mesh hold other meshes?
 // @todo handle vertices arrays where for example no texture coordinates are defined/used
 struct Mesh {
-    byte* data; // memory owner that subdevides into the pointers below
+    byte* data; // memory owner that subdivides into the pointers below
 
     // @todo Implement the version into the file, currently not implemented
     int32 version;
@@ -70,13 +70,17 @@ struct Mesh {
 };
 
 // @todo also handle textures etc.
-// WARNING: mesh needs to have memory already reserved and asigned to data
+// WARNING: mesh needs to have memory already reserved and assigned to data
 void mesh_from_file_txt(
     Mesh* mesh,
-    byte* data,
+    const char* path,
     RingMemory* ring
 ) {
-    char* pos = (char *) data;
+    FileBody file;
+    file_read(path, &file, ring);
+    ASSERT_SIMPLE(file.size);
+
+    char* pos = (char *) file.content;
 
     // move past the version string
     pos += 8;
@@ -458,19 +462,15 @@ enum MeshLoadingRestriction {
 // @todo sometimes we don't care about some data, we should have an option which defines which data should be loaded
 //      this can improve performance for algorithms on this. e.g.:
 //      on the server side we only care about the vertex positions for collision (no normals, no color, ...)
-int32 mesh_from_file(
-    RingMemory* ring,
-    const char* path,
+int32 mesh_from_data(
+    const byte* data,
     Mesh* mesh,
     const char* group = NULL,
     int32 load_format = MESH_LOADING_RESTRICTION_EVERYTHING,
     int32 steps = 8
 )
 {
-    FileBody file;
-    file_read(path, &file, ring);
-
-    byte* pos = file.content;
+    const byte* pos = data;
 
     // Read version
     mesh->version = *((int32 *) pos);
@@ -537,24 +537,24 @@ int32 mesh_from_file(
     return offset;
 }
 
-void mesh_to_file(
-    RingMemory* ring,
-    const char* path,
+// @bug this returns an upper bound (three header fields + 12 floats per vertex), not the exact size
+// We would have to check the vertex format (mesh->vertex_type) to calculate the actual size
+int32 mesh_data_size(const Mesh* mesh)
+{
+    return sizeof(mesh->version)
+        + sizeof(mesh->vertex_type)
+        + sizeof(mesh->vertex_count)
+        + 12 * sizeof(f32) * mesh->vertex_count; // 12 floats per vertex is the maximum value
+}
+
+int32 mesh_to_data(
     const Mesh* mesh,
+    byte* data,
     int32 vertex_save_format = VERTEX_TYPE_ALL,
     int32 steps = 8
 )
 {
-    FileBody file;
-
-    // Temporary file size for buffer
-    // @todo check the actual size, we are currently more or less guessing
-    file.size = sizeof(mesh)
-        + sizeof(Vertex3D) * mesh->vertex_count
-        + 4096;
-
-    file.content = ring_get_memory(ring, file.size, 64);
-    byte* pos = file.content;
+    byte* pos = data;
 
     // version
     memcpy(pos, &mesh->version, sizeof(mesh->version));
@@ -571,7 +571,7 @@ void mesh_to_file(
     memcpy(pos, &mesh->vertex_count, sizeof(mesh->vertex_count));
     pos += sizeof(mesh->vertex_count);
 
-    // verticies
+    // vertices
     int32 vertex_size = 0;
     if (mesh->vertex_type & VERTEX_TYPE_POSITION) {
         vertex_size += 3;
@@ -614,16 +614,16 @@ void mesh_to_file(
         pos += vertex_size * sizeof(f32) * mesh->vertex_count;
     }
 
-    file.size = pos - file.content;
+    int32 size = (int32) (pos - data);
 
     SWAP_ENDIAN_LITTLE_SIMD(
-        (int32 *) file.content,
-        (int32 *) file.content,
-        file.size / 4, // everything in here is 4 bytes -> super easy to swap
+        (int32 *) data,
+        (int32 *) data,
+        size / 4, // everything in here is 4 bytes -> super easy to swap
         steps
     );
 
-    file_write(path, &file);
+    return size;
 }
 
 #endif
\ No newline at end of file
diff --git a/platform/linux/FileUtils.cpp b/platform/linux/FileUtils.cpp
index d8f8594..7a5fcfe 100644
--- a/platform/linux/FileUtils.cpp
+++ b/platform/linux/FileUtils.cpp
@@ -13,6 +13,8 @@
 #include <stdlib.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/types.h>
 #include <unistd.h>
 #include <linux/limits.h>
 #include <stdarg.h>
@@ -28,7 +30,54 @@
     #define MAX_PATH PATH_MAX
 #endif
 
-typedef int32 FileHandler;
+typedef int32 FileHandle;
+typedef int MMFHandle;
+
+inline
+MMFHandle file_mmf_handle(FileHandle fp) {
+    return fp;
+}
+
+// Maps [offset, offset + length) of fh read-only; length == 0 maps from offset to the end of the file
+inline
+void* mmf_region_init(MMFHandle fh, size_t offset, size_t length = 0) {
+    if (length == 0) {
+        struct stat st;
+        // An offset at/after the end of the file would wrap the unsigned subtraction below
+        if (fstat(fh, &st) != 0 || offset >= (size_t) st.st_size) {
+            return NULL;
+        }
+
+        length = (size_t) st.st_size - offset;
+    }
+
+    size_t page_size = sysconf(_SC_PAGESIZE);
+
+    // mmap() requires a page-aligned file offset, so map from the page start and skip the difference
+    size_t aligned_offset = offset & ~(page_size - 1);
+    size_t offset_diff = offset - aligned_offset;
+
+    void *mapped_region = mmap(NULL, length + offset_diff, PROT_READ, MAP_PRIVATE, fh, aligned_offset);
+    if (mapped_region == MAP_FAILED) {
+        return NULL;
+    }
+
+    return (char *) mapped_region + offset_diff;
+}
+
+// Unmaps a region returned by mmf_region_init(); length must be the region length (0 cannot be resolved here)
+inline
+void mmf_region_release(void* region, size_t length = 0) {
+    size_t page_size = sysconf(_SC_PAGESIZE);
+    // mmf_region_init() returns mapping start + offset_diff: undo that shift and widen the length by it
+    uintptr_t aligned_region = (uintptr_t) region & ~((uintptr_t) page_size - 1);
+    munmap((void *) aligned_region, length + ((uintptr_t) region - aligned_region));
+}
+
+inline
+void file_mmf_close(MMFHandle fh) {
+    close(fh);
+}
 
 inline
 void relative_to_absolute(const char* rel, char* path)
@@ -77,8 +126,8 @@ uint64 file_last_modified(const char* filename)
 }
 
 inline
-FileHandler file_append_handle(const char* path) {
-    FileHandler fp;
+FileHandle file_append_handle(const char* path) {
+    FileHandle fp;
     if (*path == '.') {
         char full_path[MAX_PATH];
         relative_to_absolute(path, full_path);
@@ -151,6 +200,9 @@ bool file_copy(const char* src, const char* dst) {
     close(src_fd);
     close(dst_fd);
 
+    LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_READ, bytes_read);
+    LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_WRITE, bytes_written);
+
     return success;
 }
 
@@ -205,6 +257,8 @@ void file_read(const char* path, FileBody* file, RingMemory* ring) {
     file->content[bytes_read] = '\0';
     file->size = bytes_read;
 
+    LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_READ, bytes_read);
+
     close(fp);
 }
 
@@ -235,11 +289,13 @@ bool file_write(const char* path, const FileBody* file) {
         return false;
     }
 
+    LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_WRITE, written);
+
     return true;
 }
 
 inline
-void close_handle(FileHandler fp)
+void file_close_handle(FileHandle fp)
 {
     close(fp);
 }
diff --git a/platform/linux/Server.h b/platform/linux/network/Server.h
similarity index 91%
rename from platform/linux/Server.h
rename to platform/linux/network/Server.h
index ebc175b..f25908a 100644
--- a/platform/linux/Server.h
+++ b/platform/linux/network/Server.h
@@ -6,8 +6,8 @@
  * @version   1.0.0
  * @link      https://jingga.app
  */
-#ifndef TOS_PLATFORM_LINUX_SERVER_H
-#define TOS_PLATFORM_LINUX_SERVER_H
+#ifndef TOS_PLATFORM_LINUX_NETWORK_SERVER_H
+#define TOS_PLATFORM_LINUX_NETWORK_SERVER_H
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -21,9 +21,9 @@
 #include <fcntl.h>
 #include <errno.h>
 
-#include "../../stdlib/Types.h"
-#include "../../network/SocketConnection.h"
-#include "../../utils/EndianUtils.h"
+#include "../../../stdlib/Types.h"
+#include "../../../network/SocketConnection.h"
+#include "../../../utils/EndianUtils.h"
 
 // WARNING: requires `sudo setcap cap_net_raw=eip /path/to/your_program`
 void socket_server_raw_create(const char* hostname, SocketConnection* con) {
diff --git a/platform/linux/Socket.h b/platform/linux/network/Socket.h
similarity index 62%
rename from platform/linux/Socket.h
rename to platform/linux/network/Socket.h
index afa619f..f4fa6a3 100644
--- a/platform/linux/Socket.h
+++ b/platform/linux/network/Socket.h
@@ -6,8 +6,8 @@
  * @version   1.0.0
  * @link      https://jingga.app
  */
-#ifndef TOS_PLATFORM_LINUX_SOCKET_H
-#define TOS_PLATFORM_LINUX_SOCKET_H
+#ifndef TOS_PLATFORM_LINUX_NETWORK_SOCKET_H
+#define TOS_PLATFORM_LINUX_NETWORK_SOCKET_H
 
 #define socket_close close
 
diff --git a/platform/linux/threading/Atomic.h b/platform/linux/threading/Atomic.h
index ff9cd24..a5ffda0 100644
--- a/platform/linux/threading/Atomic.h
+++ b/platform/linux/threading/Atomic.h
@@ -12,6 +12,16 @@
 #include <stdatomic.h>
 #include "../../../stdlib/Types.h"
 
+inline
+void atomic_set(void** target, void* value) {
+    __atomic_store_n(target, value, __ATOMIC_SEQ_CST);
+}
+
+inline
+void* atomic_get(void** target) {
+    return __atomic_load_n(target, __ATOMIC_SEQ_CST);
+}
+
 inline
 void atomic_set(volatile int32* value, int32 new_value)
 {
@@ -60,22 +70,42 @@ void atomic_get(volatile byte* value, byte data[16])
 
 inline
 void atomic_increment(volatile int32* value) {
-    __atomic_fetch_add(value, 1, __ATOMIC_SEQ_CST);
+    __atomic_add_fetch(value, 1, __ATOMIC_SEQ_CST);
 }
 
 inline
 void atomic_decrement(volatile int32* value) {
-    __atomic_fetch_sub(value, 1, __ATOMIC_SEQ_CST);
+    __atomic_sub_fetch(value, 1, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_increment(volatile int64* value) {
+    __atomic_add_fetch(value, 1, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_decrement(volatile int64* value) {
+    __atomic_sub_fetch(value, 1, __ATOMIC_SEQ_CST);
 }
 
 inline
 void atomic_add(volatile int32* value, int32 increment) {
-    __atomic_fetch_add(value, increment, __ATOMIC_SEQ_CST);
+    __atomic_add_fetch(value, increment, __ATOMIC_SEQ_CST);
 }
 
 inline
 void atomic_sub(volatile int32* value, int32 decrement) {
-    __atomic_fetch_sub(value, decrement, __ATOMIC_SEQ_CST);
+    __atomic_sub_fetch(value, decrement, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_add(volatile int64* value, int64 increment) {
+    __atomic_add_fetch(value, increment, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_sub(volatile int64* value, int64 decrement) {
+    __atomic_sub_fetch(value, decrement, __ATOMIC_SEQ_CST);
 }
 
 inline
@@ -85,13 +115,158 @@ int32 atomic_compare_exchange_weak(volatile int32* value, int32* expected, int32
 }
 
 inline
-int32 atomic_fetch_add(volatile int32* value, int32 operand) {
-    return __atomic_fetch_add(value, operand, __ATOMIC_SEQ_CST);
+int32 atomic_add_fetch(volatile int32* value, int32 operand) {
+    return __atomic_add_fetch(value, operand, __ATOMIC_SEQ_CST);
 }
 
 inline
-int32 atomic_fetch_sub(volatile int32* value, int32 operand) {
-    return __atomic_fetch_sub(value, operand, __ATOMIC_SEQ_CST);
+int32 atomic_sub_fetch(volatile int32* value, int32 operand) {
+    return __atomic_sub_fetch(value, operand, __ATOMIC_SEQ_CST);
 }
 
+inline
+int64 atomic_add_fetch(volatile int64* value, int64 operand) {
+    return __atomic_add_fetch(value, operand, __ATOMIC_SEQ_CST);
+}
+
+inline
+int64 atomic_sub_fetch(volatile int64* value, int64 operand) {
+    return __atomic_sub_fetch(value, operand, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_set(volatile uint32* value, uint32 new_value)
+{
+    __atomic_store_n(value, new_value, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_set(volatile uint64* value, uint64 new_value)
+{
+    __atomic_store_n(value, new_value, __ATOMIC_SEQ_CST);
+}
+
+inline
+uint32 atomic_set_fetch(volatile uint32* value, uint32 new_value) {
+    return __atomic_exchange_n(value, new_value, __ATOMIC_SEQ_CST);
+}
+
+inline
+uint64 atomic_set_fetch(volatile uint64* value, uint64 new_value) {
+    return __atomic_exchange_n(value, new_value, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_get(volatile byte* value, byte data[16]) // NOTE(review): same signature already appears earlier in this header — confirm duplicate
+{ // Loads the 16 bytes at value into data; the previous __atomic_store call wrote data into value instead
+    __atomic_load((volatile __uint128 *) value, (__uint128 *) data, __ATOMIC_SEQ_CST);
+}
+
+inline
+uint32 atomic_get(volatile uint32* value)
+{
+    return __atomic_load_n((uint32 *) value, __ATOMIC_SEQ_CST);
+}
+
+inline
+uint64 atomic_get(volatile uint64* value)
+{
+    return __atomic_load_n((uint64 *) value, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_increment(volatile uint32* value) {
+    __atomic_add_fetch(value, 1, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_decrement(volatile uint32* value) {
+    __atomic_sub_fetch(value, 1, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_increment(volatile uint64* value) {
+    __atomic_add_fetch(value, 1, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_decrement(volatile uint64* value) {
+    __atomic_sub_fetch(value, 1, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_add(volatile uint32* value, uint32 increment) {
+    __atomic_add_fetch(value, increment, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_sub(volatile uint32* value, uint32 decrement) {
+    __atomic_sub_fetch(value, decrement, __ATOMIC_SEQ_CST);
+}
+
+inline
+uint32 atomic_compare_exchange_weak(volatile uint32* value, uint32* expected, uint32 desired) {
+    __atomic_compare_exchange_n(value, expected, desired, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); // 4th argument 0 selects the strong variant despite the name
+    return *expected; // on failure the builtin has overwritten *expected with the current *value
+}
+
+inline
+uint32 atomic_add_fetch(volatile uint32* value, uint32 operand) {
+    return __atomic_add_fetch(value, operand, __ATOMIC_SEQ_CST);
+}
+
+inline
+uint32 atomic_sub_fetch(volatile uint32* value, uint32 operand) {
+    return __atomic_sub_fetch(value, operand, __ATOMIC_SEQ_CST);
+}
+
+inline
+uint64 atomic_add_fetch(volatile uint64* value, uint64 operand) {
+    return __atomic_add_fetch(value, operand, __ATOMIC_SEQ_CST);
+}
+
+inline
+uint64 atomic_sub_fetch(volatile uint64* value, uint64 operand) {
+    return __atomic_sub_fetch(value, operand, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_and(volatile uint32* value, uint32 mask) {
+    __atomic_fetch_and(value, mask, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_and(volatile int32* value, int32 mask) {
+    __atomic_fetch_and(value, mask, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_and(volatile uint64* value, uint64 mask) {
+    __atomic_fetch_and(value, mask, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_and(volatile int64* value, int64 mask) {
+    __atomic_fetch_and(value, mask, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_or(volatile uint32* value, uint32 mask) {
+    __atomic_fetch_or(value, mask, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_or(volatile int32* value, int32 mask) {
+    __atomic_fetch_or(value, mask, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_or(volatile uint64* value, uint64 mask) {
+    __atomic_fetch_or(value, mask, __ATOMIC_SEQ_CST);
+}
+
+inline
+void atomic_or(volatile int64* value, int64 mask) {
+    __atomic_fetch_or(value, mask, __ATOMIC_SEQ_CST);
+}
 #endif
\ No newline at end of file
diff --git a/platform/linux/threading/Thread.h b/platform/linux/threading/Thread.h
index 0dd9d74..6b36041 100644
--- a/platform/linux/threading/Thread.h
+++ b/platform/linux/threading/Thread.h
@@ -98,7 +98,7 @@ int32 pthread_cond_wait(pthread_cond_t* cond, pthread_mutex_t* mutex) {
 }
 
 int32 pthread_cond_signal(pthread_cond_t* cond) {
-    atomic_fetch_add(cond, 1);
+    atomic_add_fetch(cond, 1);
     syscall(SYS_futex, cond, FUTEX_WAKE, 1, NULL, NULL, 0);
 
     return 0;
@@ -114,7 +114,7 @@ int32 pthread_rwlock_init(pthread_rwlock_t* rwlock, const pthread_rwlockattr_t*)
 int32 pthread_rwlock_rdlock(pthread_rwlock_t* rwlock) {
     while (atomic_get(&rwlock->writer)) {}
 
-    atomic_fetch_add(&rwlock->readers, 1);
+    atomic_add_fetch(&rwlock->readers, 1);
 
     return 0;
 }
@@ -129,7 +129,7 @@ int32 pthread_rwlock_unlock(pthread_rwlock_t* rwlock) {
     if (atomic_get(&rwlock->writer)) {
         atomic_set(&rwlock->writer, 0);
     } else {
-        atomic_fetch_sub(&rwlock->readers, 1);
+        atomic_sub_fetch(&rwlock->readers, 1);
     }
 
     return 0;
diff --git a/platform/win32/FileUtils.cpp b/platform/win32/FileUtils.cpp
index 25feead..26c3bc0 100644
--- a/platform/win32/FileUtils.cpp
+++ b/platform/win32/FileUtils.cpp
@@ -22,7 +22,9 @@
 #include "../../utils/TestUtils.h"
 #include "../../memory/RingMemory.h"
 
-typedef HANDLE FileHandler;
+typedef HANDLE FileHandle;
+typedef HANDLE MMFHandle;
+typedef OVERLAPPED file_overlapped;
 
 struct FileBodyAsync {
     // doesn't include null termination (same as strlen)
@@ -31,7 +33,30 @@ struct FileBodyAsync {
     OVERLAPPED ov;
 };
 
-// @todo Consider to implement directly mapped files (CreateFileMapping) for certain files (e.g. map data or texture data, ...)
+inline
+MMFHandle file_mmf_handle(FileHandle fp)
+{
+    return CreateFileMappingA(fp, NULL, PAGE_READONLY, 0, 0, NULL);
+}
+
+inline
+void* mmf_region_init(MMFHandle fh, size_t offset, size_t length = 0)
+{
+    DWORD high = (DWORD) ((offset >> 32) & 0xFFFFFFFF);
+    DWORD low = (DWORD) (offset & 0xFFFFFFFF);
+    // NOTE(review): MapViewOfFile expects offset to be a multiple of the allocation granularity; nothing aligns it here — confirm callers
+    return MapViewOfFile(fh, FILE_MAP_READ, high, low, length);
+}
+
+inline
+void mmf_region_release(void* fh, size_t /* length, unused: kept for parity with the linux signature */ = 0) {
+    UnmapViewOfFile(fh); // fh is the view address returned by mmf_region_init(), not the mapping handle
+}
+
+inline
+void file_mmf_close(MMFHandle fh) {
+    CloseHandle(fh);
+}
 
 inline
 void relative_to_absolute(const char* rel, char* path)
@@ -63,7 +88,7 @@ inline uint64
 file_size(const char* path)
 {
     // @performance Profile against fseek strategy
-    FileHandler fp;
+    FileHandle fp;
     if (*path == '.') {
         char full_path[MAX_PATH];
         relative_to_absolute(path, full_path);
@@ -121,7 +146,7 @@ bool file_exists(const char* path)
 inline void
 file_read(const char* path, FileBody* file, RingMemory* ring = NULL)
 {
-    FileHandler fp;
+    FileHandle fp;
     if (*path == '.') {
         char full_path[MAX_PATH];
         relative_to_absolute(path, full_path);
@@ -159,11 +184,10 @@ file_read(const char* path, FileBody* file, RingMemory* ring = NULL)
     }
 
     if (ring != NULL) {
-        file->content = ring_get_memory(ring, size.QuadPart);
+        file->content = ring_get_memory(ring, size.QuadPart + 1);
     }
 
     DWORD bytes;
-    ASSERT_SIMPLE(size.QuadPart < MAX_UINT32);
     if (!ReadFile(fp, file->content, (uint32) size.QuadPart, &bytes, NULL)) {
         CloseHandle(fp);
         file->content = NULL;
@@ -175,12 +199,14 @@ file_read(const char* path, FileBody* file, RingMemory* ring = NULL)
 
     file->content[bytes] = '\0';
     file->size = size.QuadPart;
+
+    LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_READ, bytes);
 }
 
 inline
 void file_read(const char* path, FileBody* file, uint64 offset, uint64 length = MAX_UINT64, RingMemory* ring = NULL)
 {
-    FileHandler fp;
+    FileHandle fp;
     if (*path == '.') {
         char full_path[MAX_PATH];
         relative_to_absolute(path, full_path);
@@ -232,7 +258,7 @@ void file_read(const char* path, FileBody* file, uint64 offset, uint64 length =
     uint64 read_length = OMS_MIN(length, file_size - offset);
 
     if (ring != NULL) {
-        file->content = ring_get_memory(ring, read_length);
+        file->content = ring_get_memory(ring, read_length + 1);
     }
 
     // Move the file pointer to the offset position
@@ -246,7 +272,6 @@ void file_read(const char* path, FileBody* file, uint64 offset, uint64 length =
     }
 
     DWORD bytes;
-    ASSERT_SIMPLE(read_length < MAX_UINT32);
     if (!ReadFile(fp, file->content, (uint32) read_length, &bytes, NULL)) {
         CloseHandle(fp);
         file->content = NULL;
@@ -258,10 +283,12 @@ void file_read(const char* path, FileBody* file, uint64 offset, uint64 length =
 
     file->content[bytes] = '\0';
     file->size = bytes;
+
+    LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_READ, bytes);
 }
 
 inline
-void file_read(FileHandler fp, FileBody* file, uint64 offset = 0, uint64 length = MAX_UINT64, RingMemory* ring = NULL)
+void file_read(FileHandle fp, FileBody* file, uint64 offset = 0, uint64 length = MAX_UINT64, RingMemory* ring = NULL)
 {
     LARGE_INTEGER size;
     if (!GetFileSizeEx(fp, &size)) {
@@ -285,7 +312,7 @@ void file_read(FileHandler fp, FileBody* file, uint64 offset = 0, uint64 length
     uint64 read_length = OMS_MIN(length, file_size - offset);
 
     if (ring != NULL) {
-        file->content = ring_get_memory(ring, read_length);
+        file->content = ring_get_memory(ring, read_length + 1);
     }
 
     // Move the file pointer to the offset position
@@ -299,7 +326,6 @@ void file_read(FileHandler fp, FileBody* file, uint64 offset = 0, uint64 length
     }
 
     DWORD bytes;
-    ASSERT_SIMPLE(read_length < MAX_UINT32);
     if (!ReadFile(fp, file->content, (uint32) read_length, &bytes, NULL)) {
         CloseHandle(fp);
         file->content = NULL;
@@ -307,16 +333,16 @@ void file_read(FileHandler fp, FileBody* file, uint64 offset = 0, uint64 length
         return;
     }
 
-    CloseHandle(fp);
-
     file->content[bytes] = '\0';
     file->size = bytes;
+
+    LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_READ, bytes);
 }
 
 inline uint64
 file_read_struct(const char* path, void* file, uint32 size)
 {
-    FileHandler fp;
+    FileHandle fp;
     if (*path == '.') {
         char full_path[MAX_PATH];
         relative_to_absolute(path, full_path);
@@ -361,13 +387,15 @@ file_read_struct(const char* path, void* file, uint32 size)
 
     CloseHandle(fp);
 
+    LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_READ, read);
+
     return read;
 }
 
 inline bool
 file_write(const char* path, const FileBody* file)
 {
-    FileHandler fp;
+    FileHandle fp;
     if (*path == '.') {
         char full_path[MAX_PATH];
         relative_to_absolute(path, full_path);
@@ -397,7 +425,6 @@ file_write(const char* path, const FileBody* file)
 
     DWORD written;
     DWORD length = (DWORD) file->size;
-    ASSERT_SIMPLE(file->size < MAX_UINT32);
     if (!WriteFile(fp, file->content, length, &written, NULL)) {
         CloseHandle(fp);
         return false;
@@ -405,13 +432,15 @@ file_write(const char* path, const FileBody* file)
 
     CloseHandle(fp);
 
+    LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_WRITE, written); // count bytes actually written, like the other write paths
+
     return true;
 }
 
 inline bool
 file_write_struct(const char* path, const void* file, uint32 size)
 {
-    FileHandler fp;
+    FileHandle fp;
     if (*path == '.') {
         char full_path[MAX_PATH];
         relative_to_absolute(path, full_path);
@@ -444,6 +473,8 @@ file_write_struct(const char* path, const void* file, uint32 size)
 
     CloseHandle(fp);
 
+    LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_WRITE, written);
+
     return true;
 }
 
@@ -473,7 +504,7 @@ file_copy(const char* src, const char* dst)
 }
 
 inline
-void close_handle(FileHandler fp)
+void file_close_handle(FileHandle fp)
 {
     CloseHandle(fp);
 }
@@ -481,7 +512,7 @@ void close_handle(FileHandler fp)
 inline
 HANDLE file_append_handle(const char* path)
 {
-    FileHandler fp;
+    FileHandle fp;
     if (*path == '.') {
         char full_path[MAX_PATH];
         relative_to_absolute(path, full_path);
@@ -514,10 +545,10 @@ HANDLE file_append_handle(const char* path)
 
 inline
 bool file_read_async(
-    FileHandler fp,
+    FileHandle fp,
     FileBodyAsync* file,
     uint64_t offset = 0,
-    uint64_t length = MAXUINT64,
+    uint64_t length = MAX_UINT64,
     RingMemory* ring = NULL
 ) {
     LARGE_INTEGER size;
@@ -559,7 +590,6 @@ bool file_read_async(
     file->ov.hEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
 
     DWORD bytes_read = 0;
-    ASSERT_SIMPLE(read_length < MAXDWORD);
     if (!ReadFile(fp, file->content, (DWORD) read_length, &bytes_read, &file->ov)) {
         DWORD error = GetLastError();
         if (error != ERROR_IO_PENDING) {
@@ -573,13 +603,23 @@ bool file_read_async(
     }
 
     file->size = read_length;
+
+    LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_READ, read_length);
+
     return true;
 }
 
 inline
-FileHandler file_read_handle(const char* path)
+void file_async_wait(FileHandle fp, file_overlapped* overlapped, bool wait)
 {
-    FileHandler fp;
+    DWORD bytesTransferred;
+    GetOverlappedResult(fp, overlapped, &bytesTransferred, wait);
+}
+
+inline
+FileHandle file_read_handle(const char* path)
+{
+    FileHandle fp;
     if (*path == '.') {
         char full_path[MAX_PATH];
         relative_to_absolute(path, full_path);
@@ -611,9 +651,9 @@ FileHandler file_read_handle(const char* path)
 }
 
 inline
-FileHandler file_read_async_handle(const char* path)
+FileHandle file_read_async_handle(const char* path)
 {
-    FileHandler fp;
+    FileHandle fp;
     if (*path == '.') {
         char full_path[MAX_PATH];
         relative_to_absolute(path, full_path);
@@ -646,7 +686,7 @@ FileHandler file_read_async_handle(const char* path)
 
 bool file_append(const char* path, const char* file)
 {
-    FileHandler fp;
+    FileHandle fp;
     if (*path == '.') {
         char full_path[MAX_PATH];
         relative_to_absolute(path, full_path);
@@ -675,39 +715,40 @@ bool file_append(const char* path, const char* file)
     }
 
     DWORD written;
-    DWORD length = (DWORD) strlen(file); // @question WHY is WriteFile not supporting larger data?
-    ASSERT_SIMPLE(length < MAX_UINT32);
+    DWORD length = (DWORD) strlen(file);
     if (!WriteFile(fp, file, length, &written, NULL)) {
         CloseHandle(fp);
         return false;
     }
 
     CloseHandle(fp);
+
+    LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_WRITE, written);
+
     return true;
 }
 
 inline bool
-file_append(FileHandler fp, const char* file)
+file_append(FileHandle fp, const char* file)
 {
     if (fp == INVALID_HANDLE_VALUE) {
         return false;
     }
 
     DWORD written;
-    DWORD length = (DWORD) strlen(file); // @question WHY is WriteFile not supporting larger data?
-    ASSERT_SIMPLE(length < MAX_UINT32);
-
+    DWORD length = (DWORD) strlen(file);
     if (!WriteFile(fp, file, length, &written, NULL)) {
         CloseHandle(fp);
         return false;
     }
 
-    CloseHandle(fp);
+    LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_WRITE, written);
+
     return true;
 }
 
 inline bool
-file_append(FileHandler fp, const char* file, size_t length)
+file_append(FileHandle fp, const char* file, size_t length)
 {
     if (fp == INVALID_HANDLE_VALUE) {
         return false;
@@ -719,13 +760,15 @@ file_append(FileHandler fp, const char* file, size_t length)
         return false;
     }
 
+    LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_WRITE, written);
+
     return true;
 }
 
 inline bool
 file_append(const char* path, const FileBody* file)
 {
-    FileHandler fp;
+    FileHandle fp;
     if (*path == '.') {
         char full_path[MAX_PATH];
         relative_to_absolute(path, full_path);
@@ -755,13 +798,15 @@ file_append(const char* path, const FileBody* file)
 
     DWORD bytes;
     DWORD length = (DWORD) file->size;
-    ASSERT_SIMPLE(file->size < MAX_UINT32);
     if (!WriteFile(fp, file->content, length, &bytes, NULL)) {
         CloseHandle(fp);
         return false;
     }
 
     CloseHandle(fp);
+
+    LOG_INCREMENT_BY(DEBUG_COUNTER_DRIVE_WRITE, bytes);
+
     return true;
 }
 
@@ -770,7 +815,7 @@ uint64 file_last_modified(const char* path)
 {
     WIN32_FIND_DATA find_data;
 
-    FileHandler fp;
+    FileHandle fp;
     if (*path == '.') {
         char full_path[MAX_PATH];
         relative_to_absolute(path, full_path);
diff --git a/platform/win32/Library.h b/platform/win32/Library.h
index 86e3e41..de5162e 100644
--- a/platform/win32/Library.h
+++ b/platform/win32/Library.h
@@ -72,6 +72,7 @@ bool library_load(Library* lib)
         if (function) {
             lib->functions[c] = function;
         } else {
+            ASSERT_SIMPLE(false);
             lib->is_valid = false;
         }
     }
diff --git a/platform/win32/SystemInfo.cpp b/platform/win32/SystemInfo.cpp
index 5f46011..11e8e14 100644
--- a/platform/win32/SystemInfo.cpp
+++ b/platform/win32/SystemInfo.cpp
@@ -25,6 +25,7 @@
 #include <wbemidl.h>
 #include <comdef.h>
 #include <winnls.h>
+#include <hidsdi.h>
 
 // @performance Do we really need all these libs, can't we simplify that?!
 #include <intrin.h>
diff --git a/platform/win32/Window.h b/platform/win32/Window.h
index 49605ec..ac72443 100644
--- a/platform/win32/Window.h
+++ b/platform/win32/Window.h
@@ -21,10 +21,14 @@ struct WindowState {
     uint64 style;
 };
 
-#define WINDOW_STATE_CHANGE_SIZE 1
-#define WINDOW_STATE_CHANGE_POS 2
-#define WINDOW_STATE_CHANGE_FOCUS 4
-#define WINDOW_STATE_CHANGE_FULLSCREEN 8
+enum WindowStateChanges : byte { // values are powers of two, presumably combinable as bit flags — confirm
+    WINDOW_STATE_CHANGE_NONE = 0,
+    WINDOW_STATE_CHANGE_SIZE = 1,
+    WINDOW_STATE_CHANGE_POS = 2,
+    WINDOW_STATE_CHANGE_FOCUS = 4,
+    WINDOW_STATE_CHANGE_FULLSCREEN = 8,
+    WINDOW_STATE_CHANGE_ALL = 16, // NOTE(review): a separate bit, not the OR of the flags above (which would be 15) — confirm
+};
 
 struct Window {
     uint16 width;
diff --git a/platform/win32/audio/DirectSound.h b/platform/win32/audio/DirectSound.h
index aec1ae4..8635902 100644
--- a/platform/win32/audio/DirectSound.h
+++ b/platform/win32/audio/DirectSound.h
@@ -9,8 +9,9 @@
 #ifndef TOS_SOUND_DIRECT_SOUND_H
 #define TOS_SOUND_DIRECT_SOUND_H
 
-#include <dsound.h>
 #include <windows.h>
+#include <mmeapi.h>
+#include <dsound.h>
 
 #include "../../../stdlib/Types.h"
 #include "../../../audio/AudioSetting.h"
@@ -172,10 +173,10 @@ void audio_play_buffer(AudioSetting* setting, DirectSoundSetting* api_setting)
         return;
     }
 
-    void *region1;
+    void* region1;
     DWORD region1_size;
 
-    void *region2;
+    void* region2;
     DWORD region2_size;
 
     DWORD bytes_to_lock = (setting->sample_index * setting->sample_size) % setting->buffer_size;
@@ -203,6 +204,7 @@ void audio_play_buffer(AudioSetting* setting, DirectSoundSetting* api_setting)
 
     api_setting->secondary_buffer->Unlock(region1, region1_size, region2, region2_size);
 
+    // @question Do we want to keep this here or move it to the audio mixer?
     setting->sample_index += setting->sample_buffer_size / setting->sample_size;
     setting->sample_buffer_size = 0;
 }
diff --git a/platform/win32/audio/XAudio2.h b/platform/win32/audio/XAudio2.h
index 8f56f60..7fa16ec 100644
--- a/platform/win32/audio/XAudio2.h
+++ b/platform/win32/audio/XAudio2.h
@@ -9,9 +9,9 @@
 #ifndef TOS_SOUND_XAUDIO2_H
 #define TOS_SOUND_XAUDIO2_H
 
-#include <xaudio2.h>
 #include <windows.h>
 #include <objbase.h>
+#include <xaudio2.h>
 
 #include "../../../stdlib/Types.h"
 #include "../../../audio/AudioSetting.h"
diff --git a/platform/win32/input/HidInput.h b/platform/win32/input/HidInput.h
index 5729d9a..4a1f2e1 100644
--- a/platform/win32/input/HidInput.h
+++ b/platform/win32/input/HidInput.h
@@ -137,7 +137,7 @@ void hid_init_controllers(Input* __restrict states, int32 state_count, RingMemor
     SetupDiDestroyDeviceInfoList(device_info_set);
 }
 
-uint32 hid_divice_poll(Input* state, uint64 time) {
+uint32 hid_device_poll(Input* state, uint64 time) {
     UCHAR buffer[128];
     DWORD bytes_read;
 
diff --git a/platform/win32/Client.h b/platform/win32/network/Client.h
similarity index 84%
rename from platform/win32/Client.h
rename to platform/win32/network/Client.h
index 37e52c3..3b25575 100644
--- a/platform/win32/Client.h
+++ b/platform/win32/network/Client.h
@@ -6,8 +6,8 @@
  * @version   1.0.0
  * @link      https://jingga.app
  */
-#ifndef TOS_PLATFORM_WIN32_SERVER_H
-#define TOS_PLATFORM_WIN32_SERVER_H
+#ifndef TOS_PLATFORM_WIN32_NETWORK_CLIENT_H
+#define TOS_PLATFORM_WIN32_NETWORK_CLIENT_H
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -16,9 +16,9 @@
 #include <winsock2.h>
 #include <ws2tcpip.h>
 
-#include "../../stdlib/Types.h"
-#include "../../network/SocketConnection.h"
-#include "../../utils/EndianUtils.h"
+#include "../../../stdlib/Types.h"
+#include "../../../network/SocketConnection.h"
+#include "../../../utils/EndianUtils.h"
 
 #pragma comment(lib, "Ws2_32.lib")
 
diff --git a/platform/win32/Server.h b/platform/win32/network/Server.h
similarity index 86%
rename from platform/win32/Server.h
rename to platform/win32/network/Server.h
index 7ca5c76..3c7a36c 100644
--- a/platform/win32/Server.h
+++ b/platform/win32/network/Server.h
@@ -6,8 +6,8 @@
  * @version   1.0.0
  * @link      https://jingga.app
  */
-#ifndef TOS_PLATFORM_WIN32_SERVER_H
-#define TOS_PLATFORM_WIN32_SERVER_H
+#ifndef TOS_PLATFORM_WIN32_NETWORK_SERVER_H
+#define TOS_PLATFORM_WIN32_NETWORK_SERVER_H
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -16,8 +16,8 @@
 #include <winsock2.h>
 #include <ws2tcpip.h>
 
-#include "../../network/SocketConnection.h"
-#include "../../utils/EndianUtils.h"
+#include "../../../network/SocketConnection.h"
+#include "../../../utils/EndianUtils.h"
 
 #pragma comment(lib, "Ws2_32.lib")
 
diff --git a/platform/win32/Socket.h b/platform/win32/network/Socket.h
similarity index 62%
rename from platform/win32/Socket.h
rename to platform/win32/network/Socket.h
index 472adde..709bb7e 100644
--- a/platform/win32/Socket.h
+++ b/platform/win32/network/Socket.h
@@ -6,8 +6,8 @@
  * @version   1.0.0
  * @link      https://jingga.app
  */
-#ifndef TOS_PLATFORM_WIN32_SOCKET_H
-#define TOS_PLATFORM_WIN32_SOCKET_H
+#ifndef TOS_PLATFORM_WIN32_NETWORK_SOCKET_H
+#define TOS_PLATFORM_WIN32_NETWORK_SOCKET_H
 
 #define socket_close closesocket
 
diff --git a/platform/win32/threading/Atomic.h b/platform/win32/threading/Atomic.h
index 32ab430..93ed91f 100644
--- a/platform/win32/threading/Atomic.h
+++ b/platform/win32/threading/Atomic.h
@@ -12,6 +12,18 @@
 #include <windows.h>
 #include "../../../stdlib/Types.h"
 
+inline // Atomically stores new_pointer into *target; the previous pointer is discarded
+void atomic_set(void** target, void* new_pointer)
+{
+    (void) InterlockedExchangePointer((PVOID volatile *) target, new_pointer);
+}
+
+inline // Atomically reads *target; the NULL/NULL compare-exchange leaves the stored pointer unchanged
+void* atomic_get(void** target)
+{
+    return InterlockedCompareExchangePointer((PVOID volatile *) target, NULL, NULL);
+}
+
 inline
 void atomic_set(volatile int32* value, int32 new_value)
 {
@@ -85,6 +97,16 @@ void atomic_decrement(volatile int32* value) {
     InterlockedDecrement((long *) value);
 }
 
+inline // Atomically adds 1 to the whole 64-bit *value
+void atomic_increment(volatile int64* value) {
+    InterlockedIncrement64((volatile LONG64 *) value); // 64-bit variant: the 32-bit call only updated the low half
+}
+
+inline // Atomically subtracts 1 from the whole 64-bit *value
+void atomic_decrement(volatile int64* value) {
+    InterlockedDecrement64((volatile LONG64 *) value); // 64-bit variant: the 32-bit call only updated the low half
+}
+
 inline
 void atomic_add(volatile int32* value, int32 increment) {
     InterlockedAdd((long *) value, increment);
@@ -95,19 +117,180 @@ void atomic_sub(volatile int32* value, int32 decrement) {
     InterlockedAdd((long *) value, -decrement);
 }
 
+inline // Atomically adds increment to the 64-bit *value
+void atomic_add(volatile int64* value, int64 increment) {
+    InterlockedAdd64((volatile LONG64 *) value, (LONG64) increment); // keep all 64 bits; casting to long truncated the operand
+}
+
+inline // Atomically subtracts decrement from the 64-bit *value
+void atomic_sub(volatile int64* value, int64 decrement) {
+    InterlockedAdd64((volatile LONG64 *) value, -((LONG64) decrement)); // add the negated 64-bit operand, no narrowing to long
+}
+
 inline
 int32 atomic_compare_exchange_weak(volatile int32* value, int32* expected, int32 desired) {
     return (int32) InterlockedCompareExchange((long *) value, desired, *expected);
 }
 
 inline
-int32 atomic_fetch_add(volatile int32* value, int32 operand) {
+int32 atomic_add_fetch(volatile int32* value, int32 operand) { // NOTE(review): still returns InterlockedExchangeAdd's result, presumably the value before the addition - confirm this matches the new name
     return (int32) InterlockedExchangeAdd((long *) value, operand);
 }
 
 inline
-int32 atomic_fetch_sub(volatile int32* value, int32 operand) {
+int32 atomic_sub_fetch(volatile int32* value, int32 operand) { // NOTE(review): still returns InterlockedExchangeSubtract's result, presumably the value before the subtraction - confirm this matches the new name
     return (int32) InterlockedExchangeSubtract((unsigned long *) value, operand);
 }
 
+inline // Atomically adds operand to the 64-bit *value and returns InterlockedExchangeAdd64's result (NOTE(review): the pre-addition value - confirm callers)
+int64 atomic_add_fetch(volatile int64* value, int64 operand) {
+    return (int64) InterlockedExchangeAdd64((volatile LONG64 *) value, (LONG64) operand); // 64-bit variant: no truncation of value or operand
+}
+
+inline // Atomically subtracts operand from the 64-bit *value and returns the exchange-add result (NOTE(review): the pre-subtraction value - confirm callers)
+int64 atomic_sub_fetch(volatile int64* value, int64 operand) {
+    return (int64) InterlockedExchangeAdd64((volatile LONG64 *) value, -((LONG64) operand)); // InterlockedExchangeSubtract is 32-bit only, so add the negated operand
+}
+
+inline // Atomically stores new_value into *value; the previous value is discarded
+void atomic_set(volatile uint32* value, uint32 new_value)
+{
+    (void) InterlockedExchange((volatile LONG *) value, (LONG) new_value);
+}
+
+inline // Atomically stores new_value into the 64-bit *value
+void atomic_set(volatile uint64* value, uint64 new_value)
+{
+    InterlockedExchange64((volatile LONG64 *) value, (LONG64) new_value); // 64-bit variant: the 32-bit exchange wrote only the low half
+}
+
+inline // Atomically stores new_value into *value and returns InterlockedExchange's result
+uint32 atomic_set_fetch(volatile uint32* value, uint32 new_value)
+{
+    return (uint32) InterlockedExchange((volatile LONG *) value, (LONG) new_value);
+}
+
+inline // Atomically stores new_value into the 64-bit *value and returns InterlockedExchange64's result
+uint64 atomic_set_fetch(volatile uint64* value, uint64 new_value)
+{
+    return (uint64) InterlockedExchange64((volatile LONG64 *) value, (LONG64) new_value); // 64-bit variant: no truncation of the stored or returned value
+}
+
+inline // Atomically reads *value; the 0/0 compare-exchange leaves the stored value unchanged
+uint32 atomic_get(volatile uint32* value)
+{
+    return (uint32) InterlockedCompareExchange((volatile LONG *) value, 0L, 0L);
+}
+
+inline // Atomically reads the whole 64-bit *value; the 0/0 compare-exchange leaves it unchanged
+uint64 atomic_get(volatile uint64* value)
+{
+    return (uint64) InterlockedCompareExchange64((volatile LONG64 *) value, 0, 0); // 64-bit variant: the 32-bit call returned only the low half
+}
+
+inline // Atomically adds 1 to *value
+void atomic_increment(volatile uint32* value) {
+    (void) InterlockedIncrement((volatile LONG *) value);
+}
+
+inline // Atomically subtracts 1 from *value
+void atomic_decrement(volatile uint32* value) {
+    (void) InterlockedDecrement((volatile LONG *) value);
+}
+
+inline // Atomically adds 1 to the whole 64-bit *value
+void atomic_increment(volatile uint64* value) {
+    InterlockedIncrement64((volatile LONG64 *) value); // 64-bit variant: the 32-bit call lost the carry into the high half
+}
+
+inline // Atomically subtracts 1 from the whole 64-bit *value
+void atomic_decrement(volatile uint64* value) {
+    InterlockedDecrement64((volatile LONG64 *) value); // 64-bit variant: the 32-bit call lost the borrow from the high half
+}
+
+inline // Atomically adds increment to *value
+void atomic_add(volatile uint32* value, uint32 increment) {
+    (void) InterlockedAdd((volatile LONG *) value, (LONG) increment);
+}
+
+inline // Atomically subtracts decrement from *value by adding its negation
+void atomic_sub(volatile uint32* value, uint32 decrement) {
+    (void) InterlockedAdd((volatile LONG *) value, -((int32) decrement));
+}
+
+inline // Atomically adds increment to the 64-bit *value
+void atomic_add(volatile uint64* value, uint64 increment) {
+    InterlockedAdd64((volatile LONG64 *) value, (LONG64) increment); // keep all 64 bits; casting to long truncated the operand
+}
+
+inline // Atomically subtracts decrement from the 64-bit *value
+void atomic_sub(volatile uint64* value, uint64 decrement) {
+    InterlockedAdd64((volatile LONG64 *) value, (LONG64) (0 - decrement)); // negate in unsigned arithmetic (well defined), then add the full 64-bit operand
+}
+
+inline // Stores desired into *value if *value equals *expected; returns InterlockedCompareExchange's result
+uint32 atomic_compare_exchange_weak(volatile uint32* value, uint32* expected, uint32 desired) { // NOTE(review): *expected is only read, never updated - confirm callers do not rely on it being refreshed on failure
+    return (uint32) InterlockedCompareExchange((long *) value, desired, *expected);
+}
+
+inline // Atomically adds operand to *value and returns InterlockedExchangeAdd's result
+uint32 atomic_add_fetch(volatile uint32* value, uint32 operand) {
+    return (uint32) InterlockedExchangeAdd((volatile LONG *) value, (LONG) operand);
+}
+
+inline // Atomically subtracts operand from *value and returns InterlockedExchangeSubtract's result
+uint32 atomic_sub_fetch(volatile uint32* value, uint32 operand) {
+    return (uint32) InterlockedExchangeSubtract((volatile ULONG *) value, (ULONG) operand);
+}
+
+inline // Atomically adds operand to the 64-bit *value and returns InterlockedExchangeAdd64's result (NOTE(review): the pre-addition value - confirm callers)
+uint64 atomic_add_fetch(volatile uint64* value, uint64 operand) {
+    return (uint64) InterlockedExchangeAdd64((volatile LONG64 *) value, (LONG64) operand); // 64-bit variant: no truncation of value or operand
+}
+
+inline // Atomically subtracts operand from the 64-bit *value and returns the exchange-add result (NOTE(review): the pre-subtraction value - confirm callers)
+uint64 atomic_sub_fetch(volatile uint64* value, uint64 operand) {
+    return (uint64) InterlockedExchangeAdd64((volatile LONG64 *) value, (LONG64) (0 - operand)); // InterlockedExchangeSubtract is 32-bit only; add the unsigned-negated operand instead
+}
+
+inline // Atomically replaces *value with (*value & mask); 32-bit unsigned overload
+void atomic_and(volatile uint32* value, uint32 mask) {
+    (void) InterlockedAnd((LONG volatile *) value, (LONG) mask);
+}
+
+inline // 32-bit signed overload
+void atomic_and(volatile int32* value, int32 mask) {
+    (void) InterlockedAnd((LONG volatile *) value, (LONG) mask);
+}
+
+inline // 64-bit unsigned overload
+void atomic_and(volatile uint64* value, uint64 mask) {
+    (void) InterlockedAnd64((LONG64 volatile *) value, (LONG64) mask);
+}
+
+inline // 64-bit signed overload
+void atomic_and(volatile int64* value, int64 mask) {
+    (void) InterlockedAnd64((LONG64 volatile *) value, (LONG64) mask);
+}
+
+inline // Atomically replaces *value with (*value | mask); 32-bit unsigned overload
+void atomic_or(volatile uint32* value, uint32 mask) {
+    (void) InterlockedOr((LONG volatile *) value, (LONG) mask);
+}
+
+inline // 32-bit signed overload
+void atomic_or(volatile int32* value, int32 mask) {
+    (void) InterlockedOr((LONG volatile *) value, (LONG) mask);
+}
+
+inline // 64-bit unsigned overload
+void atomic_or(volatile uint64* value, uint64 mask) {
+    (void) InterlockedOr64((LONG64 volatile *) value, (LONG64) mask);
+}
+
+inline // 64-bit signed overload
+void atomic_or(volatile int64* value, int64 mask) {
+    (void) InterlockedOr64((LONG64 volatile *) value, (LONG64) mask);
+}
+
 #endif
\ No newline at end of file
diff --git a/platform/win32/threading/Semaphore.h b/platform/win32/threading/Semaphore.h
index 9fb718f..4846a38 100644
--- a/platform/win32/threading/Semaphore.h
+++ b/platform/win32/threading/Semaphore.h
@@ -29,6 +29,14 @@ void sem_wait(sem_t* semaphore) {
     WaitForSingleObject(*semaphore, INFINITE);
 }
 
+int32 sem_timedwait(sem_t* semaphore, uint64 ms) { // Waits on *semaphore with a timeout of ms; returns WaitForSingleObject's result cast to int32
+    return (int32) WaitForSingleObject(*semaphore, (DWORD) ms); // ms is narrowed to DWORD, so values above 32 bits are truncated
+}
+
+int32 sem_trywait(sem_t* semaphore) { // Polls *semaphore with a zero timeout; returns WaitForSingleObject's result cast to int32
+    return (int32) WaitForSingleObject(*semaphore, 0);
+}
+
 // increment
 void sem_post(sem_t* semaphore) {
     ReleaseSemaphore(*semaphore, 1, NULL);
diff --git a/scene/SceneState.h b/scene/SceneState.h
new file mode 100644
index 0000000..b048979
--- /dev/null
+++ b/scene/SceneState.h
@@ -0,0 +1,23 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef TOS_SCENE_STATE_H
+#define TOS_SCENE_STATE_H
+
+#include "../stdlib/Types.h"
+
+enum SceneState : byte { // every non-default value occupies its own bit (presumably so states can be OR-ed together - confirm with users)
+    SCENE_STATE_DEFAULT = 0,
+    SCENE_STATE_WINDOW_CHANGED = 1 << 0,
+    SCENE_STATE_SHOULD_SWITCH = 1 << 1,
+    SCENE_STATE_STARTED_SETUP = 1 << 2,
+    SCENE_STATE_WAITING_SETUP = 1 << 3,
+    SCENE_STATE_READY = 1 << 4,
+};
+
+#endif
\ No newline at end of file
diff --git a/stdlib/HashMap.h b/stdlib/HashMap.h
index 18e2dac..9abcc33 100644
--- a/stdlib/HashMap.h
+++ b/stdlib/HashMap.h
@@ -16,53 +16,53 @@
 #include "../memory/ChunkMemory.h"
 #include "../utils/StringUtils.h"
 
-#define MAX_KEY_LENGTH 32
+#define HASH_MAP_MAX_KEY_LENGTH 32
 
 struct HashEntryInt32 {
     int64 element_id;
-    char key[MAX_KEY_LENGTH];
+    char key[HASH_MAP_MAX_KEY_LENGTH];
     HashEntryInt32* next;
     int32 value;
 };
 
 struct HashEntryInt64 {
     int64 element_id;
-    char key[MAX_KEY_LENGTH];
+    char key[HASH_MAP_MAX_KEY_LENGTH];
     HashEntryInt64* next;
     int64 value;
 };
 
 struct HashEntryUIntPtr {
     int64 element_id;
-    char key[MAX_KEY_LENGTH];
+    char key[HASH_MAP_MAX_KEY_LENGTH];
     HashEntryUIntPtr* next;
     uintptr_t value;
 };
 
 struct HashEntryVoidP {
     int64 element_id;
-    char key[MAX_KEY_LENGTH];
+    char key[HASH_MAP_MAX_KEY_LENGTH];
     HashEntryVoidP* next;
     void* value;
 };
 
 struct HashEntryFloat {
     int64 element_id;
-    char key[MAX_KEY_LENGTH];
+    char key[HASH_MAP_MAX_KEY_LENGTH];
     HashEntryFloat* next;
     f32 value;
 };
 
 struct HashEntryStr {
     int64 element_id;
-    char key[MAX_KEY_LENGTH];
+    char key[HASH_MAP_MAX_KEY_LENGTH];
     HashEntryStr* next;
-    char value[MAX_KEY_LENGTH];
+    char value[HASH_MAP_MAX_KEY_LENGTH];
 };
 
 struct HashEntry {
     int64 element_id;
-    char key[MAX_KEY_LENGTH];
+    char key[HASH_MAP_MAX_KEY_LENGTH];
     HashEntry* next;
     byte* value;
 };
@@ -128,8 +128,8 @@ void hashmap_insert(HashMap* hm, const char* key, int32 value) {
     HashEntryInt32* entry = (HashEntryInt32 *) chunk_get_element(&hm->buf, element, true);
     entry->element_id = element;
 
-    strncpy(entry->key, key, MAX_KEY_LENGTH);
-    entry->key[MAX_KEY_LENGTH - 1] = '\0';
+    strncpy(entry->key, key, HASH_MAP_MAX_KEY_LENGTH);
+    entry->key[HASH_MAP_MAX_KEY_LENGTH - 1] = '\0';
 
     entry->value = value;
     entry->next = NULL;
@@ -153,8 +153,8 @@ void hashmap_insert(HashMap* hm, const char* key, int64 value) {
     HashEntryInt64* entry = (HashEntryInt64 *) chunk_get_element(&hm->buf, element, true);
     entry->element_id = element;
 
-    strncpy(entry->key, key, MAX_KEY_LENGTH);
-    entry->key[MAX_KEY_LENGTH - 1] = '\0';
+    strncpy(entry->key, key, HASH_MAP_MAX_KEY_LENGTH);
+    entry->key[HASH_MAP_MAX_KEY_LENGTH - 1] = '\0';
 
     entry->value = value;
     entry->next = NULL;
@@ -178,8 +178,8 @@ void hashmap_insert(HashMap* hm, const char* key, uintptr_t value) {
     HashEntryUIntPtr* entry = (HashEntryUIntPtr *) chunk_get_element(&hm->buf, element, true);
     entry->element_id = element;
 
-    strncpy(entry->key, key, MAX_KEY_LENGTH);
-    entry->key[MAX_KEY_LENGTH - 1] = '\0';
+    strncpy(entry->key, key, HASH_MAP_MAX_KEY_LENGTH);
+    entry->key[HASH_MAP_MAX_KEY_LENGTH - 1] = '\0';
 
     entry->value = value;
     entry->next = NULL;
@@ -203,8 +203,8 @@ void hashmap_insert(HashMap* hm, const char* key, void* value) {
     HashEntryVoidP* entry = (HashEntryVoidP *) chunk_get_element(&hm->buf, element, true);
     entry->element_id = element;
 
-    strncpy(entry->key, key, MAX_KEY_LENGTH);
-    entry->key[MAX_KEY_LENGTH - 1] = '\0';
+    strncpy(entry->key, key, HASH_MAP_MAX_KEY_LENGTH);
+    entry->key[HASH_MAP_MAX_KEY_LENGTH - 1] = '\0';
 
     entry->value = value;
     entry->next = NULL;
@@ -228,8 +228,8 @@ void hashmap_insert(HashMap* hm, const char* key, f32 value) {
     HashEntryFloat* entry = (HashEntryFloat *) chunk_get_element(&hm->buf, element, true);
     entry->element_id = element;
 
-    strncpy(entry->key, key, MAX_KEY_LENGTH);
-    entry->key[MAX_KEY_LENGTH - 1] = '\0';
+    strncpy(entry->key, key, HASH_MAP_MAX_KEY_LENGTH);
+    entry->key[HASH_MAP_MAX_KEY_LENGTH - 1] = '\0';
 
     entry->value = value;
     entry->next = NULL;
@@ -253,11 +253,11 @@ void hashmap_insert(HashMap* hm, const char* key, const char* value) {
     HashEntryStr* entry = (HashEntryStr *) chunk_get_element(&hm->buf, element, true);
     entry->element_id = element;
 
-    strncpy(entry->key, key, MAX_KEY_LENGTH);
-    entry->key[MAX_KEY_LENGTH - 1] = '\0';
+    strncpy(entry->key, key, HASH_MAP_MAX_KEY_LENGTH);
+    entry->key[HASH_MAP_MAX_KEY_LENGTH - 1] = '\0';
 
-    strncpy(entry->value, value, MAX_KEY_LENGTH);
-    entry->value[MAX_KEY_LENGTH - 1] = '\0';
+    strncpy(entry->value, value, HASH_MAP_MAX_KEY_LENGTH);
+    entry->value[HASH_MAP_MAX_KEY_LENGTH - 1] = '\0';
 
     entry->next = NULL;
 
@@ -282,8 +282,8 @@ void hashmap_insert(HashMap* hm, const char* key, byte* value) {
 
     entry->value = (byte *) entry + sizeof(HashEntry);
 
-    strncpy(entry->key, key, MAX_KEY_LENGTH);
-    entry->key[MAX_KEY_LENGTH - 1] = '\0';
+    strncpy(entry->key, key, HASH_MAP_MAX_KEY_LENGTH);
+    entry->key[HASH_MAP_MAX_KEY_LENGTH - 1] = '\0';
 
     memcpy(entry->value, value, hm->buf.chunk_size - sizeof(HashEntry));
 
@@ -306,7 +306,7 @@ HashEntry* hashmap_get_entry(const HashMap* hm, const char* key) {
     HashEntry* entry = (HashEntry *) hm->table[index];
 
     while (entry != NULL) {
-        if (strncmp(entry->key, key, MAX_KEY_LENGTH) == 0) {
+        if (strncmp(entry->key, key, HASH_MAP_MAX_KEY_LENGTH) == 0) {
             return entry;
         }
 
@@ -318,12 +318,12 @@ HashEntry* hashmap_get_entry(const HashMap* hm, const char* key) {
 
 // This function only saves one step (omission of the hash function)
 // The reason for this is in some cases we can use compile time hashing
-HashEntry* hashmap_get_entry(const HashMap* hm, const char* key, uint64 index) {
-    index %= hm->buf.count;
-    HashEntry* entry = (HashEntry *) hm->table[index];
+HashEntry* hashmap_get_entry(const HashMap* hm, const char* key, uint64 hash) {
+    hash %= hm->buf.count;
+    HashEntry* entry = (HashEntry *) hm->table[hash];
 
     while (entry != NULL) {
-        if (strncmp(entry->key, key, MAX_KEY_LENGTH) == 0) {
+        if (strncmp(entry->key, key, HASH_MAP_MAX_KEY_LENGTH) == 0) {
             return entry;
         }
 
@@ -339,7 +339,7 @@ void hashmap_delete_entry(HashMap* hm, const char* key) {
     HashEntry* prev = NULL;
 
     while (entry != NULL) {
-        if (strncmp(entry->key, key, MAX_KEY_LENGTH) == 0) {
+        if (strncmp(entry->key, key, HASH_MAP_MAX_KEY_LENGTH) == 0) {
             if (prev == NULL) {
                 hm->table[index] = entry->next;
             } else {
@@ -370,7 +370,7 @@ int64 hashmap_dump(const HashMap* hm, byte* data)
     }
     data += sizeof(uint64) * hm->buf.count;
 
-    int64 value_size = hm->buf.chunk_size - sizeof(uint64) - sizeof(char) * MAX_KEY_LENGTH - sizeof(uint64);
+    int64 value_size = hm->buf.chunk_size - sizeof(uint64) - sizeof(char) * HASH_MAP_MAX_KEY_LENGTH - sizeof(uint64);
 
     // Dumb hash map content = buffer memory
     int32 free_index = 0;
@@ -449,7 +449,7 @@ int64 hashmap_load(HashMap* hm, const byte* data)
     // @question don't we have to possibly endian swap check the free array as well?
     memcpy(hm->buf.free, data, sizeof(uint64) * CEIL_DIV(hm->buf.count, 64));
 
-    int64 value_size = hm->buf.chunk_size - sizeof(uint64) - sizeof(char) * MAX_KEY_LENGTH - sizeof(uint64);
+    int64 value_size = hm->buf.chunk_size - sizeof(uint64) - sizeof(char) * HASH_MAP_MAX_KEY_LENGTH - sizeof(uint64);
 
     // Switch endian AND turn offsets to pointers
     int32 free_index = 0;
diff --git a/stdlib/ThreadedHashMap.h b/stdlib/ThreadedHashMap.h
index 9eaf54a..60208c3 100644
--- a/stdlib/ThreadedHashMap.h
+++ b/stdlib/ThreadedHashMap.h
@@ -31,7 +31,7 @@ struct ThreadedHashMap {
 
 // WARNING: element_size = element size + remaining HashEntry data size
 inline
-void threaded_hashmap_create(ThreadedHashMap* hm, int32 count, int32 element_size, RingMemory* ring)
+void thrd_hashmap_create(ThreadedHashMap* hm, int32 count, int32 element_size, RingMemory* ring)
 {
     hashmap_create((HashMap *) hm, count, element_size, ring);
     pthread_mutex_init(&hm->mutex, NULL);
@@ -39,7 +39,7 @@ void threaded_hashmap_create(ThreadedHashMap* hm, int32 count, int32 element_siz
 
 // WARNING: element_size = element size + remaining HashEntry data size
 inline
-void threaded_hashmap_create(ThreadedHashMap* hm, int32 count, int32 element_size, BufferMemory* buf)
+void thrd_hashmap_create(ThreadedHashMap* hm, int32 count, int32 element_size, BufferMemory* buf)
 {
     hashmap_create((HashMap *) hm, count, element_size, buf);
     pthread_mutex_init(&hm->mutex, NULL);
@@ -47,69 +47,69 @@ void threaded_hashmap_create(ThreadedHashMap* hm, int32 count, int32 element_siz
 
 // WARNING: element_size = element size + remaining HashEntry data size
 inline
-void threaded_hashmap_create(ThreadedHashMap* hm, int32 count, int32 element_size, byte* buf)
+void thrd_hashmap_create(ThreadedHashMap* hm, int32 count, int32 element_size, byte* buf)
 {
     hashmap_create((HashMap *) hm, count, element_size, buf);
     pthread_mutex_init(&hm->mutex, NULL);
 }
 
 inline
-void threaded_hashmap_free(ThreadedHashMap* hm)
+void thrd_hashmap_free(ThreadedHashMap* hm)
 {
     pthread_mutex_destroy(&hm->mutex);
 }
 
 inline
-void threaded_hashmap_insert(ThreadedHashMap* hm, const char* key, int32 value) {
+void thrd_hashmap_insert(ThreadedHashMap* hm, const char* key, int32 value) {
     pthread_mutex_lock(&hm->mutex);
     hashmap_insert((HashMap *) hm, key, value);
     pthread_mutex_unlock(&hm->mutex);
 }
 
 inline
-void threaded_hashmap_insert(ThreadedHashMap* hm, const char* key, int64 value) {
+void thrd_hashmap_insert(ThreadedHashMap* hm, const char* key, int64 value) {
     pthread_mutex_lock(&hm->mutex);
     hashmap_insert((HashMap *) hm, key, value);
     pthread_mutex_unlock(&hm->mutex);
 }
 
 inline
-void threaded_hashmap_insert(ThreadedHashMap* hm, const char* key, uintptr_t value) {
+void thrd_hashmap_insert(ThreadedHashMap* hm, const char* key, uintptr_t value) {
     pthread_mutex_lock(&hm->mutex);
     hashmap_insert((HashMap *) hm, key, value);
     pthread_mutex_unlock(&hm->mutex);
 }
 
 inline
-void threaded_hashmap_insert(ThreadedHashMap* hm, const char* key, void* value) {
+void thrd_hashmap_insert(ThreadedHashMap* hm, const char* key, void* value) {
     pthread_mutex_lock(&hm->mutex);
     hashmap_insert((HashMap *) hm, key, value);
     pthread_mutex_unlock(&hm->mutex);
 }
 
 inline
-void threaded_hashmap_insert(ThreadedHashMap* hm, const char* key, f32 value) {
+void thrd_hashmap_insert(ThreadedHashMap* hm, const char* key, f32 value) {
     pthread_mutex_lock(&hm->mutex);
     hashmap_insert((HashMap *) hm, key, value);
     pthread_mutex_unlock(&hm->mutex);
 }
 
 inline
-void threaded_hashmap_insert(ThreadedHashMap* hm, const char* key, const char* value) {
+void thrd_hashmap_insert(ThreadedHashMap* hm, const char* key, const char* value) {
     pthread_mutex_lock(&hm->mutex);
     hashmap_insert((HashMap *) hm, key, value);
     pthread_mutex_unlock(&hm->mutex);
 }
 
 inline
-void threaded_hashmap_insert(ThreadedHashMap* hm, const char* key, byte* value) {
+void thrd_hashmap_insert(ThreadedHashMap* hm, const char* key, byte* value) {
     pthread_mutex_lock(&hm->mutex);
     hashmap_insert((HashMap *) hm, key, value);
     pthread_mutex_unlock(&hm->mutex);
 }
 
 inline
-void threaded_hashmap_get_entry(ThreadedHashMap* hm, HashEntry* entry, const char* key) {
+void thrd_hashmap_get_entry(ThreadedHashMap* hm, HashEntry* entry, const char* key) {
     pthread_mutex_lock(&hm->mutex);
     HashEntry* temp = hashmap_get_entry((HashMap *) hm, key);
     memcpy(entry, temp, hm->buf.chunk_size);
@@ -117,7 +117,7 @@ void threaded_hashmap_get_entry(ThreadedHashMap* hm, HashEntry* entry, const cha
 }
 
 inline
-void threaded_hashmap_get_entry(ThreadedHashMap* hm, HashEntry* entry, const char* key, uint64 index) {
+void thrd_hashmap_get_entry(ThreadedHashMap* hm, HashEntry* entry, const char* key, uint64 index) {
     pthread_mutex_lock(&hm->mutex);
     HashEntry* temp = hashmap_get_entry((HashMap *) hm, key, index);
     memcpy(entry, temp, hm->buf.chunk_size);
@@ -125,7 +125,7 @@ void threaded_hashmap_get_entry(ThreadedHashMap* hm, HashEntry* entry, const cha
 }
 
 inline
-void threaded_hashmap_delete_entry(ThreadedHashMap* hm, const char* key) {
+void thrd_hashmap_delete_entry(ThreadedHashMap* hm, const char* key) {
     pthread_mutex_lock(&hm->mutex);
     hashmap_delete_entry((HashMap *) hm, key);
     pthread_mutex_unlock(&hm->mutex);
diff --git a/stdlib/Types.h b/stdlib/Types.h
index 65d16ab..430ce55 100644
--- a/stdlib/Types.h
+++ b/stdlib/Types.h
@@ -12,12 +12,14 @@
 #include <stdint.h>
 
 #ifdef _MSC_VER
+    #include <windows.h>
+
     #define PACKED_STRUCT  __pragma(pack(push, 1))
     #define UNPACKED_STRUCT __pragma(pack(pop))
     typedef SSIZE_T ssize_t;
 #else
     #define PACKED_STRUCT  __attribute__((__packed__))
-    #define UNPACKED_STRUCT
+    #define UNPACKED_STRUCT ((void) 0)
 #endif
 
 #define ARRAY_COUNT(a) (sizeof(a) / sizeof((a)[0]))
@@ -61,12 +63,49 @@ typedef intptr_t smm;
 #define MIN_INT32 0x80000000
 #define MIN_INT64 0x8000000000000000
 
+#define SEC_MILLI 1000
 #define MILLI_MICRO 1000
+#define SEC_MICRO 1000000
+
+#define MHZ 1000000
+#define GHZ 1000000000
 
 #define internal static // only allows local "file" access
 #define local_persist static
 #define global_persist static
 
+struct v3_byte { // three bytes that can be addressed as x/y/z, r/g/b or v[0..2]
+    union { // all members below share the same storage
+        struct {
+            byte x, y, z;
+        };
+
+        struct {
+            byte r, g, b;
+        };
+
+        byte v[3];
+    };
+};
+
+struct v4_byte { // four bytes that can be addressed as x/y/z/w, r/g/b/a, v[0..3] or one uint32
+    union { // all members below share the same storage
+        struct {
+            byte x, y, z, w;
+        };
+
+        struct {
+            byte r, g, b, a;
+        };
+
+        union {
+            byte v[4];
+            uint32 val; // same four bytes as one integer; which component lands in which bits depends on host byte order
+        };
+    };
+};
+
+
 struct v2_int32 {
     union {
         struct {
@@ -265,19 +304,19 @@ struct m_f64 {
     size_t m, n;
 };
 
-#define HALF_FLOAT_SIGN_MASK   0x8000
-#define HALF_FLOAT_EXP_MASK    0x7C00
-#define HALF_FLOAT_FRAC_MASK   0x03FF
+#define HALF_FLOAT_SIGN_MASK 0x8000
+#define HALF_FLOAT_EXP_MASK 0x7C00
+#define HALF_FLOAT_FRAC_MASK 0x03FF
 
-#define HALF_FLOAT_EXP_SHIFT   10
-#define HALF_FLOAT_EXP_BIAS    15
+#define HALF_FLOAT_EXP_SHIFT 10
+#define HALF_FLOAT_EXP_BIAS 15
 
-#define FLOAT32_SIGN_MASK      0x80000000
-#define FLOAT32_EXP_MASK       0x7F800000
-#define FLOAT32_FRAC_MASK      0x007FFFFF
+#define FLOAT32_SIGN_MASK 0x80000000
+#define FLOAT32_EXP_MASK 0x7F800000
+#define FLOAT32_FRAC_MASK 0x007FFFFF
 
-#define FLOAT32_EXP_SHIFT      23
-#define FLOAT32_EXP_BIAS       127
+#define FLOAT32_EXP_SHIFT 23
+#define FLOAT32_EXP_BIAS 127
 
 uint16 float_to_f16(float f) {
     uint32_t f_bits = *((uint32_t*)&f);
diff --git a/stdlib/simd/SIMD_I32.h b/stdlib/simd/SIMD_I32.h
index 95c9eea..b43ce9c 100644
--- a/stdlib/simd/SIMD_I32.h
+++ b/stdlib/simd/SIMD_I32.h
@@ -1332,7 +1332,6 @@ void simd_div(const int32* a, f32 b, f32* result, int32 size, int32 steps)
             result += steps;
        }
     } else if (steps == 8) {
-        // @todo this his how all the functions should be implemented that take in baseic types and output basic types
         __m256i a_8;
         __m256 af_8;
         __m256 b_8 = _mm256_set1_ps(b);
diff --git a/stdlib/simd/SIMD_SVML.h b/stdlib/simd/SIMD_SVML.h
index e863957..0308ada 100644
--- a/stdlib/simd/SIMD_SVML.h
+++ b/stdlib/simd/SIMD_SVML.h
@@ -18,46 +18,46 @@
     inline __m128i _mm_div_epi32(__m128i a, __m128i b) {
         alignas(16) int32_t a_array[4], b_array[4], result[4];
 
-        _mm_storeu_si128((__m128i*)a_array, a);
-        _mm_storeu_si128((__m128i*)b_array, b);
+        _mm_storeu_si128((__m128i*) a_array, a);
+        _mm_storeu_si128((__m128i*) b_array, b);
 
-        for (int i = 0; i < 4; ++i) {
+        for (int32 i = 0; i < 4; ++i) {
             result[i] = a_array[i] / b_array[i];
         }
 
-        return _mm_load_si128((__m128i*)result);
+        return _mm_load_si128((__m128i*) result);
     }
 
     inline __m256i _mm256_div_epi32(__m256i a, __m256i b) {
         alignas(32) int32_t a_array[8], b_array[8], result[8];
 
-        _mm256_storeu_si256((__m256i*)a_array, a);
-        _mm256_storeu_si256((__m256i*)b_array, b);
+        _mm256_storeu_si256((__m256i*) a_array, a);
+        _mm256_storeu_si256((__m256i*) b_array, b);
 
-        for (int i = 0; i < 8; ++i) {
+        for (int32 i = 0; i < 8; ++i) {
             result[i] = a_array[i] / b_array[i];
         }
 
-        return _mm256_load_si256((__m256i*)result);
+        return _mm256_load_si256((__m256i*) result);
     }
 
     inline __m512i _mm512_div_epi32(__m512i a, __m512i b) {
         alignas(64) int32_t a_array[16], b_array[16], result[16];
 
-        _mm512_storeu_si512((__m512i*)a_array, a);
-        _mm512_storeu_si512((__m512i*)b_array, b);
+        _mm512_storeu_si512((__m512i*) a_array, a);
+        _mm512_storeu_si512((__m512i*) b_array, b);
 
-        for (int i = 0; i < 16; ++i) {
+        for (int32 i = 0; i < 16; ++i) {
             result[i] = a_array[i] / b_array[i];
         }
 
-        return _mm512_load_si512((__m512i*)result);
+        return _mm512_load_si512((__m512i*) result);
     }
 
     inline __m128 _mm_sin_ps(__m128 a) {
         alignas(16) f32 a_array[4], result[4];
         _mm_storeu_ps(a_array, a);
-        for (int i = 0; i < 4; ++i) {
+        for (int32 i = 0; i < 4; ++i) {
             result[i] = sinf(a_array[i]);
         }
         return _mm_load_ps(result);
@@ -66,7 +66,7 @@
     inline __m128 _mm_cos_ps(__m128 a) {
         alignas(16) f32 a_array[4], result[4];
         _mm_storeu_ps(a_array, a);
-        for (int i = 0; i < 4; ++i) {
+        for (int32 i = 0; i < 4; ++i) {
             result[i] = cosf(a_array[i]);
         }
         return _mm_load_ps(result);
@@ -75,7 +75,7 @@
     inline __m128 _mm_asin_ps(__m128 a) {
         alignas(16) f32 a_array[4], result[4];
         _mm_storeu_ps(a_array, a);
-        for (int i = 0; i < 4; ++i) {
+        for (int32 i = 0; i < 4; ++i) {
             result[i] = asinf(a_array[i]);
         }
         return _mm_load_ps(result);
@@ -84,7 +84,7 @@
     inline __m128 _mm_acos_ps(__m128 a) {
         alignas(16) f32 a_array[4], result[4];
         _mm_storeu_ps(a_array, a);
-        for (int i = 0; i < 4; ++i) {
+        for (int32 i = 0; i < 4; ++i) {
             result[i] = acosf(a_array[i]);
         }
         return _mm_load_ps(result);
@@ -93,7 +93,7 @@
     inline __m256 _mm256_sin_ps(__m256 a) {
         alignas(32) f32 a_array[8], result[8];
         _mm256_storeu_ps(a_array, a);
-        for (int i = 0; i < 8; ++i) {
+        for (int32 i = 0; i < 8; ++i) {
             result[i] = sinf(a_array[i]);
         }
         return _mm256_load_ps(result);
@@ -102,7 +102,7 @@
     inline __m256 _mm256_cos_ps(__m256 a) {
         alignas(32) f32 a_array[8], result[8];
         _mm256_storeu_ps(a_array, a);
-        for (int i = 0; i < 8; ++i) {
+        for (int32 i = 0; i < 8; ++i) {
             result[i] = cosf(a_array[i]);
         }
         return _mm256_load_ps(result);
@@ -111,7 +111,7 @@
     inline __m256 _mm256_asin_ps(__m256 a) {
         alignas(32) f32 a_array[8], result[8];
         _mm256_storeu_ps(a_array, a);
-        for (int i = 0; i < 8; ++i) {
+        for (int32 i = 0; i < 8; ++i) {
             result[i] = asinf(a_array[i]);
         }
         return _mm256_load_ps(result);
@@ -120,7 +120,7 @@
     inline __m256 _mm256_acos_ps(__m256 a) {
         alignas(32) f32 a_array[8], result[8];
         _mm256_storeu_ps(a_array, a);
-        for (int i = 0; i < 16; ++i) {
+        for (int32 i = 0; i < 8; ++i) { // a __m256 holds 8 floats; iterating to 16 ran past a_array/result
             result[i] = acosf(a_array[i]);
         }
         return _mm256_load_ps(result);
@@ -129,7 +129,7 @@
     inline __m512 _mm512_sin_ps(__m512 a) {
-        alignas(64) f32 a_array[8], result[8];
+        alignas(64) f32 a_array[16], result[16]; // a __m512 holds 16 floats; 8-element buffers were overrun by the store, loop and load
         _mm512_storeu_ps(a_array, a);
-        for (int i = 0; i < 16; ++i) {
+        for (int32 i = 0; i < 16; ++i) {
             result[i] = sinf(a_array[i]);
         }
         return _mm512_load_ps(result);
@@ -138,7 +138,7 @@
     inline __m512 _mm512_cos_ps(__m512 a) {
-        alignas(64) f32 a_array[8], result[8];
+        alignas(64) f32 a_array[16], result[16]; // a __m512 holds 16 floats; 8-element buffers were overrun by the store, loop and load
         _mm512_storeu_ps(a_array, a);
-        for (int i = 0; i < 16; ++i) {
+        for (int32 i = 0; i < 16; ++i) {
             result[i] = cosf(a_array[i]);
         }
         return _mm512_load_ps(result);
@@ -147,7 +147,7 @@
     inline __m512 _mm512_asin_ps(__m512 a) {
-        alignas(64) f32 a_array[8], result[8];
+        alignas(64) f32 a_array[16], result[16]; // a __m512 holds 16 floats; 8-element buffers were overrun by the store, loop and load
         _mm512_storeu_ps(a_array, a);
-        for (int i = 0; i < 16; ++i) {
+        for (int32 i = 0; i < 16; ++i) {
             result[i] = asinf(a_array[i]);
         }
         return _mm512_load_ps(result);
@@ -156,7 +156,7 @@
     inline __m512 _mm512_acos_ps(__m512 a) {
         alignas(64) f32 a_array[16], result[16];
         _mm512_storeu_ps(a_array, a);
-        for (int i = 0; i < 16; ++i) {
+        for (int32 i = 0; i < 16; ++i) {
             result[i] = acosf(a_array[i]);
         }
         return _mm512_load_ps(result);
diff --git a/thread/ThreadJob.h b/thread/ThreadJob.h
index 7b8c7f2..e8b17ca 100644
--- a/thread/ThreadJob.h
+++ b/thread/ThreadJob.h
@@ -13,6 +13,7 @@
 #include <stdlib.h>
 
 #include "../stdlib/Types.h"
+#include "../memory/ThreadedRingMemory.h"
 
 #if _WIN32
     #include "../platform/win32/threading/ThreadDefines.h"
@@ -20,14 +21,16 @@
     #include "../platform/linux/threading/ThreadDefines.h"
 #endif
 
-struct PoolWorker {
-    ThreadJobFunc func;
-    void *arg;
-    volatile int32 state;
-    PoolWorker *next;
-};
+typedef void (*ThreadPoolJobFunc)(void*); // job entry point: takes one untyped pointer and returns nothing
 
-typedef PoolWorker ThreadJob;
+struct PoolWorker { // one unit of work for the thread pool; no longer carries an intrusive next pointer
+    int32 id;
+    volatile int32 state;
+    void* arg; // presumably the pointer handed to func - confirm against the pool's dispatch code
+    void* result;
+    RingMemory ring;
+    ThreadPoolJobFunc func;
+};
 
 struct Worker {
     volatile int32 state;
diff --git a/thread/ThreadPool.h b/thread/ThreadPool.h
index 5468284..f47a4fa 100644
--- a/thread/ThreadPool.h
+++ b/thread/ThreadPool.h
@@ -13,158 +13,120 @@
 #include <stdlib.h>
 
 #include "../stdlib/Types.h"
+#include "../memory/Queue.h"
+#include "../memory/BufferMemory.h"
 
 #ifdef _WIN32
     #include "../platform/win32/threading/Thread.h"
+    #include "../platform/win32/threading/Atomic.h"
 #elif __linux__
     #include "../platform/linux/threading/Thread.h"
+    #include "../platform/linux/threading/Atomic.h"
 #endif
 
 #include "ThreadJob.h"
 
 struct ThreadPool {
-    ThreadJob *work_first;
-    ThreadJob *work_last;
+    // This is not a threaded queue since we want to handle the mutex in here, not in the queue for finer control
+    Queue work_queue;
 
     pthread_mutex_t work_mutex;
     pthread_cond_t work_cond;
     pthread_cond_t working_cond;
 
-    size_t working_cnt;
-    size_t thread_cnt;
+    int32 working_cnt;
+    int32 thread_cnt;
 
     int32 size;
-    bool stop;
+    int32 state;
+
+    uint32 id_counter;
 };
 
-ThreadJob *thread_pool_work_poll(ThreadPool *pool)
-{
-    if (pool == NULL) {
-        return NULL;
-    }
-
-    ThreadJob *work = pool->work_first;
-    if (work == NULL) {
-        return NULL;
-    }
-
-    if (work->next == NULL) {
-        pool->work_first = NULL;
-        pool->work_last  = NULL;
-    } else {
-        pool->work_first = work->next;
-    }
-
-    return work;
-}
-
 static THREAD_RETURN thread_pool_worker(void* arg)
 {
-    ThreadPool *pool = (ThreadPool *) arg;
-    ThreadJob *work;
+    ThreadPool* pool = (ThreadPool *) arg;
+    PoolWorker* work;
 
     while (true) {
         pthread_mutex_lock(&pool->work_mutex);
-
-        while (pool->work_first == NULL && !pool->stop) {
+        while (queue_is_empty(&pool->work_queue) && !pool->state) {
             pthread_cond_wait(&pool->work_cond, &pool->work_mutex);
         }
 
-        if (pool->stop) {
+        if (pool->state == 1) {
+            pthread_mutex_unlock(&pool->work_mutex);
+
             break;
         }
 
-        work = thread_pool_work_poll(pool);
-        ++(pool->working_cnt);
+        work = (PoolWorker *) queue_dequeue_keep(&pool->work_queue, sizeof(PoolWorker), 64);
         pthread_mutex_unlock(&pool->work_mutex);
 
-        if (work != NULL) {
-            work->func(work);
+        if (!work) {
+            continue;
         }
 
-        pthread_mutex_lock(&pool->work_mutex);
-        --(pool->working_cnt);
+        atomic_increment(&pool->working_cnt);
+        atomic_set(&work->state, 2);
+        work->func(work);
+        atomic_set(&work->state, 1);
 
-        if (!pool->stop && pool->working_cnt == 0 && pool->work_first == NULL) {
+        // Job gets marked after completion -> can be overwritten now
+        if (atomic_get(&work->id) == -1) {
+            atomic_set(&work->id, 0);
+        }
+
+        atomic_decrement(&pool->working_cnt);
+
+        if (atomic_get(&pool->state) == 0 && atomic_get(&pool->working_cnt) == 0) {
             pthread_cond_signal(&pool->working_cond);
         }
-
-        pthread_mutex_unlock(&pool->work_mutex);
     }
 
-    --(pool->thread_cnt);
     pthread_cond_signal(&pool->working_cond);
-    pthread_mutex_unlock(&pool->work_mutex);
+    atomic_decrement(&pool->thread_cnt);
 
     return NULL;
 }
 
-ThreadPool *thread_pool_create(size_t num, ThreadPool* pool)
+void thread_pool_create(ThreadPool* pool, BufferMemory* buf, int32 thread_count)
 {
-    pthread_t thread;
-    size_t i;
+    queue_init(&pool->work_queue, buf, 64, sizeof(PoolWorker), 64);
 
-    if (num == 0) {
-        num = 2;
-    }
-
-    pool->thread_cnt = num;
+    pool->thread_cnt = thread_count;
 
     // @todo switch from pool mutex and pool cond to threadjob mutex/cond
-    //      thread_pool_wait etc. should just itereate over all mutexes
+    //      thread_pool_wait etc. should just iterate over all mutexes
     pthread_mutex_init(&pool->work_mutex, NULL);
     pthread_cond_init(&pool->work_cond, NULL);
     pthread_cond_init(&pool->working_cond, NULL);
 
-    pool->work_first = NULL;
-    pool->work_last  = NULL;
-
-    for (i = 0; i < num; ++i) {
+    pthread_t thread;
+    for (pool->size = 0; pool->size < thread_count; ++pool->size) {
         pthread_create(&thread, NULL, thread_pool_worker, pool);
-        ++(pool->size);
-
         pthread_detach(thread);
     }
-
-    return pool;
 }
 
-void thread_pool_wait(ThreadPool *pool)
+void thread_pool_wait(ThreadPool* pool)
 {
-    if (pool == NULL) {
-        return;
-    }
-
     pthread_mutex_lock(&pool->work_mutex);
-
-    while (true) {
-        if ((!pool->stop && pool->working_cnt != 0) || (pool->stop && pool->thread_cnt != 0)) {
-            pthread_cond_wait(&pool->working_cond, &pool->work_mutex);
-        } else {
-            break;
-        }
+    while ((!pool->state && pool->working_cnt != 0) || (pool->state && pool->thread_cnt != 0)) {
+        pthread_cond_wait(&pool->working_cond, &pool->work_mutex);
     }
-
     pthread_mutex_unlock(&pool->work_mutex);
 }
 
-void thread_pool_destroy(ThreadPool *pool)
+void thread_pool_destroy(ThreadPool* pool)
 {
-    if (pool == NULL) {
-        return;
-    }
+    // This sets the queue to empty
+    atomic_set((void **) &pool->work_queue.tail, (void **) &pool->work_queue.head);
 
-    pthread_mutex_lock(&pool->work_mutex);
-    ThreadJob *work = pool->work_first;
+    // This sets the state to "shutdown"
+    atomic_set(&pool->state, 1);
 
-    while (work != NULL) {
-        work = work->next;
-    }
-
-    pool->stop = true;
     pthread_cond_broadcast(&pool->work_cond);
-    pthread_mutex_unlock(&pool->work_mutex);
-
     thread_pool_wait(pool);
 
     pthread_mutex_destroy(&pool->work_mutex);
@@ -172,25 +134,58 @@ void thread_pool_destroy(ThreadPool *pool)
     pthread_cond_destroy(&pool->working_cond);
 }
 
-ThreadJob* thread_pool_add_work(ThreadPool *pool, ThreadJob* job)
+PoolWorker* thread_pool_add_work(ThreadPool* pool, const PoolWorker* job)
 {
-    if (pool == NULL || job == NULL) {
+    pthread_mutex_lock(&pool->work_mutex);
+    PoolWorker* temp_job = (PoolWorker *) ring_get_memory_nomove(&pool->work_queue, sizeof(PoolWorker), 64);
+    if (atomic_get(&temp_job->id) > 0) {
+        pthread_mutex_unlock(&pool->work_mutex);
+        ASSERT_SIMPLE(temp_job->id == 0);
+
         return NULL;
     }
 
-    pthread_mutex_lock(&pool->work_mutex);
-    if (pool->work_first == NULL) {
-        pool->work_first = job;
-        pool->work_last  = pool->work_first;
-    } else {
-        pool->work_last->next = job;
-        pool->work_last       = job;
+    memcpy(temp_job, job, sizeof(PoolWorker));
+    ring_move_pointer(&pool->work_queue, &pool->work_queue.head, sizeof(PoolWorker), 64);
+
+    if (temp_job->id == 0) {
+        temp_job->id = atomic_add_fetch(&pool->id_counter, 1);
     }
 
     pthread_cond_broadcast(&pool->work_cond);
     pthread_mutex_unlock(&pool->work_mutex);
 
-    return job;
+    return temp_job;
 }
 
+// Basically the same as thread_pool_add_work but returns the queue slot so the caller can write the job in place (avoids a memcpy)
+// On success work_mutex stays locked until thread_pool_add_work_end is called; when NULL is returned it is already unlocked
+PoolWorker* thread_pool_add_work_start(ThreadPool* pool)
+{
+    pthread_mutex_lock(&pool->work_mutex);
+
+    PoolWorker* temp_job = (PoolWorker *) queue_enqueue_start(&pool->work_queue, sizeof(PoolWorker), 64);
+    if (atomic_get(&temp_job->id) > 0) { // slot id > 0 is treated as still occupied -> refuse to overwrite it
+        pthread_mutex_unlock(&pool->work_mutex);
+        ASSERT_SIMPLE(temp_job->id == 0);
+
+        return NULL;
+    }
+
+    if (temp_job->id == 0) {
+        // +1 because otherwise the very first job would be id = 0 which is not a valid id
+        temp_job->id = atomic_add_fetch(&pool->id_counter, 1) + 1;
+    }
+
+    return temp_job;
+}
+
+void thread_pool_add_work_end(ThreadPool* pool)
+{
+    queue_enqueue_end(&pool->work_queue, sizeof(PoolWorker), 64);
+    pthread_cond_broadcast(&pool->work_cond); // wakes workers blocked on work_cond in thread_pool_worker
+    pthread_mutex_unlock(&pool->work_mutex); // releases the lock taken in thread_pool_add_work_start
+}
+
+
 #endif
\ No newline at end of file
diff --git a/ui/UITheme.h b/ui/UITheme.h
index 9e48dc1..54a566c 100644
--- a/ui/UITheme.h
+++ b/ui/UITheme.h
@@ -108,9 +108,14 @@ int compare_by_attribute_id(const void* a, const void* b) {
 // WARNING: theme needs to have memory already reserved and assigned to data
 void theme_from_file_txt(
     UIThemeStyle* theme,
-    byte* data
+    const char* path,
+    RingMemory* ring
 ) {
-    char* pos = (char *) data;
+    FileBody file;
+    file_read(path, &file, ring);
+    ASSERT_SIMPLE(file.size);
+
+    char* pos = (char *) file.content;
 
     // move past the version string
     pos += 8;
@@ -150,11 +155,11 @@ void theme_from_file_txt(
 
     UIAttributeGroup* temp_group = NULL;
 
-    pos = (char *) data;
+    pos = (char *) file.content;
     pos += 8; // move past version
 
     while (*pos != '\0') {
-        str_skip_empty(&pos);
+        str_skip_whitespace(&pos);
 
         if (*pos == '\n') {
             ++pos;
@@ -200,7 +205,7 @@ void theme_from_file_txt(
 
         str_copy_move_until(&pos, attribute_name, " :\n", sizeof(" :\n") - 1);
 
-        // Skip any white spaces or other delimeters
+        // Skip any white spaces or other delimiters
         str_skip_list(&pos, " \t:", sizeof(" \t:") - 1);
 
         ASSERT_SIMPLE((*pos != '\0' && *pos != '\n'));
@@ -394,9 +399,9 @@ void theme_from_file_txt(
 
 // The size of theme->data should be the file size.
 // Yes, this means we have a little too much data but not by a lot
-void theme_from_file(
-    UIThemeStyle* theme,
-    const byte* data
+int32 theme_from_data(
+    const byte* data,
+    UIThemeStyle* theme
 ) {
     const byte* pos = data;
 
@@ -445,13 +450,15 @@ void theme_from_file(
             entry = entry->next;
         }
     }
+
+    return (int32) (pos - data);
 }
 
 // Calculates the maximum theme size
 // Not every group has all the attributes (most likely only a small subset)
 // However, an accurate calculation is probably too slow and not needed most of the time
 inline
-int64 theme_size(const UIThemeStyle* theme)
+int64 theme_data_size(const UIThemeStyle* theme)
 {
     return hashmap_size(&theme->hash_map)
         + theme->hash_map.buf.count * UI_ATTRIBUTE_TYPE_SIZE * sizeof(UIAttribute);
@@ -472,20 +479,11 @@ int64 theme_size(const UIThemeStyle* theme)
 //      attributes ...
 //      attributes ...
 
-void theme_to_file(
-    RingMemory* ring,
-    const char* path,
-    const UIThemeStyle* theme
+int32 theme_to_data(
+    const UIThemeStyle* theme,
+    byte* data
 ) {
-    FileBody file;
-
-    // Temporary file size for buffer
-    // @todo This is a bad placeholder, The problem is we don't know how much we actually need without stepping through the elements
-    //      I also don't want to add a size variable to the theme as it is useless in all other cases
-    file.size = theme_size(theme);
-
-    file.content = ring_get_memory(ring, file.size, 64, true);
-    byte* pos = file.content;
+    byte* pos = data;
 
     // version
     *((int32 *) pos) = SWAP_ENDIAN_LITTLE(theme->version);
@@ -497,7 +495,7 @@ void theme_to_file(
 
     // theme data
     // Layout: first save the size of the group, then save the individual attributes
-    for (int32 i = 0; i < theme->hash_map.buf.count; ++i) {
+    for (uint32 i = 0; i < theme->hash_map.buf.count; ++i) {
         if (!theme->hash_map.table[i]) {
             continue;
         }
@@ -530,8 +528,7 @@ void theme_to_file(
         }
     }
 
-    file.size = pos - file.content;
-    file_write(path, &file);
+    return (int32) (pos - data);
 }
 
 #endif
\ No newline at end of file
diff --git a/utils/MathUtils.h b/utils/MathUtils.h
index 926998b..9e2e0ca 100644
--- a/utils/MathUtils.h
+++ b/utils/MathUtils.h
@@ -27,6 +27,9 @@
 #define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
 #define OMS_CEIL(x) ((x) == (int)(x) ? (int)(x) : ((x) > 0 ? (int)(x) + 1 : (int)(x)))
 
+// Modulo function when b is a power of 2 (a & (b - 1) equals a % b for non-negative a); b is parenthesized so expressions like 4 + 4 work
+#define MODULO_2(a, b) ((a) & ((b) - 1))
+
 #define SQRT_2 1.4142135623730950488016887242097f
 
 #endif
diff --git a/utils/RandomUtils.h b/utils/RandomUtils.h
new file mode 100644
index 0000000..808ce16
--- /dev/null
+++ b/utils/RandomUtils.h
@@ -0,0 +1,81 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef TOS_UTILS_RANDOM_H
+#define TOS_UTILS_RANDOM_H
+
+#include <stdlib.h>
+#include "../stdlib/Types.h"
+
+global_persist uint32 fast_seed;
+#define FAST_RAND_MAX 32767
+
+inline
+uint32 fast_rand1(void) {
+    fast_seed = (214013 * fast_seed + 2531011); // linear congruential step on the global fast_seed
+
+    return (fast_seed >> 16) & 0x7FFF; // keeps 15 bits -> result is in [0, FAST_RAND_MAX]
+}
+
+uint32 fast_rand2(uint32* state) { // xorshift step on caller-owned state; a state of 0 stays 0 forever
+    uint32 x = *state;
+
+    x ^= x << 13;
+    x ^= x >> 17;
+    x ^= x << 5;
+
+    *state = x; // the returned value is also the next state
+
+    return x;
+}
+
+inline
+f32 fast_rand_percentage(void) {
+    return (f32) fast_rand1() / (f32) FAST_RAND_MAX; // fast_rand1() is in [0, FAST_RAND_MAX] -> result in [0.0, 1.0]
+}
+
+/**
+ * Shuffles the first size elements of array in place (Fisher-Yates) using rand().
+ */
+inline
+void random_unique(int32* array, int32 size) {
+    for (int32 i = size - 1; i > 0; --i) {
+        int32 j = rand() % (i + 1); // j in [0, i]
+
+        int32 temp = array[i];
+        array[i] = array[j];
+        array[j] = temp;
+    }
+}
+
+/**
+ * Picks index i with probability arr[i] / sum(arr) (weights must be >= 0); returns the last index if all weights are 0
+ */
+int32 random_weighted_index(const int32* arr, int32 array_count)
+{
+    uint32 prob_sum = 0;
+    for (int32 i = 0; i < array_count; ++i) {
+        prob_sum += arr[i];
+    }
+
+    uint32 random_prob = prob_sum ? rand() % prob_sum : 0; // draw in [0, prob_sum); guards against % 0
+    uint32 current_rarity = 0;
+    int32 item_rarity = array_count - 1;
+    for (int32 i = 0; i < array_count - 1; ++i) {
+        current_rarity += arr[i];
+
+        if (random_prob < current_rarity) { // first bucket whose cumulative weight exceeds the draw
+            item_rarity = i;
+            break;
+        }
+    }
+
+    return item_rarity;
+}
+
+#endif
\ No newline at end of file
diff --git a/utils/StringUtils.h b/utils/StringUtils.h
index 29d5564..b8d8c34 100644
--- a/utils/StringUtils.h
+++ b/utils/StringUtils.h
@@ -10,6 +10,7 @@
 #define TOS_UTILS_STRING_UTILS_H
 
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
 
@@ -430,7 +431,8 @@ void str_replace(const char* str, const char* __restrict search, const char* __r
         memcpy(result_ptr, replace, replace_len);
         result_ptr += replace_len;
 
-        str = current + search_len;
+        current += search_len;
+        str = current;
     }
 
     strcpy(result_ptr, str);
@@ -709,4 +711,16 @@ void hexstr_to_rgba(v4_f32* rgba, const char* hex)
     rgba->a = (f32) (value & 0xFF) / 255.0f;
 }
 
+inline constexpr
+void str_pad(const char* input, char* output, char pad, size_t len) {
+    size_t i = 0;
+    for (; i < len && input[i] != '\0'; ++i) { // copy input, cut off after len chars
+        output[i] = input[i];
+    }
+
+    for (; i < len; ++i) { // fill the rest with pad; exactly len chars are written, no '\0' is appended
+        output[i] = pad;
+    }
+}
+
 #endif
\ No newline at end of file
diff --git a/utils/Utils.h b/utils/Utils.h
index 0457c17..a032bd4 100644
--- a/utils/Utils.h
+++ b/utils/Utils.h
@@ -10,7 +10,6 @@
 #define TOS_UTILS_H
 
 #include <stdlib.h>
-
 #include "../stdlib/Types.h"
 
 struct FileBody {
@@ -18,76 +17,11 @@ struct FileBody {
     byte* content;
 };
 
-global_persist uint32 fast_seed;
-#define FAST_RAND_MAX 32767
-
-inline
-uint32 fast_rand1(void) {
-    fast_seed = (214013 * fast_seed + 2531011);
-
-    return (fast_seed >> 16) & 0x7FFF;
-}
-
-uint32 fast_rand2(uint32* state) {
-    uint32 x = *state;
-
-    x ^= x << 13;
-    x ^= x >> 17;
-    x ^= x << 5;
-
-    *state = x;
-
-    return x;
-}
-
-inline
-f32 fast_rand_percentage(void) {
-    return (f32) fast_rand1() / (f32) FAST_RAND_MAX;
-}
-
-/**
- * Picks n random elements from end and stores them in begin.
- */
-inline
-void random_unique(int32* array, int32 size) {
-    for (int32 i = size - 1; i > 0; --i) {
-        int32 j = rand() % (i + 1);
-
-        int32 temp = array[i];
-        array[i] = array[j];
-        array[j] = temp;
-    }
-}
-
-/**
- * Gets random index based value probability
- */
-int random_weighted_index(const int32* arr, int32 array_count)
-{
-    uint32 prob_sum = 0;
-    for (int32 i = 0; i < array_count; ++i) {
-        prob_sum += arr[i];
-    }
-
-    uint32 random_prob = rand() % (prob_sum + 1);
-    uint32 current_rarity = 0;
-    int32 item_rarity = array_count - 1;
-    for (int32 i = 0; i < array_count - 1; ++i) {
-        current_rarity += arr[i];
-
-        if (current_rarity < random_prob) {
-            item_rarity = i;
-            break;
-        }
-    }
-
-    return item_rarity;
-}
-
+// @question Do we want to make the size comparison a step variable?
 bool is_equal_aligned(const byte* region1, const byte* region2, uint64 size)
 {
     while (size > 4) {
-        if (*(const int32_t*) region1 != *(const int32_t*) region2) {
+        if (*(const int32 *) region1 != *(const int32 *) region2) {
             return false;
         }
 
@@ -108,4 +42,27 @@ bool is_equal_aligned(const byte* region1, const byte* region2, uint64 size)
     return true;
 }
 
+// Returns true if all size bytes of region are 0. @question Do we want to make the size comparison a step variable?
+bool is_empty(const byte* region, uint64 size)
+{
+    while (size > 4) { // 4 bytes per step; presumably region must be int32-aligned - TODO confirm with callers
+        if (*(const int32 *) region != 0) {
+            return false;
+        }
+
+        region += 4;
+        size -= 4;
+    }
+
+    for (; size > 0; --size) {
+        if (*region != 0) { // check the byte itself; the old pointer comparison tested the address
+            return false;
+        }
+
+        ++region;
+    }
+
+    return true;
+}
+
 #endif
\ No newline at end of file