Started a massive thread improvement journey. The AMS is still outstanding; handling multiple AssetArchives is still an open todo, and the same goes for multiple gpuapis

This commit is contained in:
Dennis Eichhorn 2024-12-18 07:43:11 +01:00
parent 8c034c4a26
commit fa9fcb6394
25 changed files with 3206 additions and 386 deletions

View File

@ -22,6 +22,8 @@ struct Asset {
// Could be 0 if there is no official id
uint64 official_id;
// @performance This is bad: it stores the same name that is already used as the hashmap key
// We effectively store the asset name twice, which shouldn't be the case
char name[MAX_ASSET_NAME_LENGTH];
AssetType type;
@ -32,13 +34,17 @@ struct Asset {
// Describes how much ram/vram the asset uses
// E.g. vram_size = 0 but ram_size > 0 means that it never uses any gpu memory
uint64 ram_size;
uint64 vram_size;
uint32 ram_size;
uint32 vram_size;
uint64 last_access;
// Usually 1, but in some cases an AMS may hold entities of variable chunk length
// For textures, for example, a 128x128 texture is of size 1 but a 256x256 texture is of size 4
uint32 size;
// Variable used for thread safety
bool is_loaded;
// Describes if the memory is currently available in ram/vram
// E.g. an asset might be uploaded to the gpu and no longer held in ram (or the other way around)
bool is_ram;
@ -49,10 +55,6 @@ struct Asset {
bool can_garbage_collect_ram;
bool can_garbage_collect_vram;
// Describes if the asset should be removed/garbage collected during CPU/GPU down time
bool should_garbage_collect_ram;
bool should_garbage_collect_vram;
Asset* next;
Asset* prev;
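
To make the chunk-based size field above concrete, a small illustrative helper; the 128x128 base chunk follows the comment, while the helper itself is an assumption and not part of this commit:

// Illustration only: size counts base chunks, here assuming a 128x128 base chunk.
inline uint32 asset_texture_chunk_count(uint32 width, uint32 height) {
    const uint32 base_pixels = 128 * 128;
    return (width * height + base_pixels - 1) / base_pixels; // 128x128 -> 1, 256x256 -> 4
}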

View File

@ -22,6 +22,7 @@
// @question Asset component systems could be created per region -> easy to simulate a specific region
// @bug This means players might not be able to transition from one area to another?!
// @performance There is a huge performance flaw. We CANNOT have an asset only in vram because it always also allocates the ram (asset_data_memory)
struct AssetManagementSystem {
// @question is this even necessary or could we integrate this directly into the system here?
HashMap hash_map;
@ -29,6 +30,7 @@ struct AssetManagementSystem {
uint64 ram_size;
uint64 vram_size;
uint64 asset_count;
int32 overhead;
bool has_changed;
// The indices of asset_memory and asset_data_memory are always linked
@ -48,16 +50,15 @@ struct AssetManagementSystem {
// @question do we want to create an extra threaded version? Or a combined one, like we have right now.
// @question Do we want to add a mutex to assets? This way we don't have to lock the entire ams.
pthread_mutex_t mutex;
// @bug We probably also need a overhead value.
// In some cases we need more data than our normal data (see texture, it contains image + texture)
};
void ams_create(AssetManagementSystem* ams, BufferMemory* buf, int32 chunk_size, int32 count)
void ams_create(AssetManagementSystem* ams, BufferMemory* buf, int32 chunk_size, int32 count, int32 overhead = 0)
{
// setup hash_map
hashmap_create(&ams->hash_map, count, sizeof(HashEntryInt64), buf);
ams->overhead = overhead;
// setup asset_memory
chunk_init(&ams->asset_memory, buf, count, sizeof(Asset), 64);
@ -71,13 +72,15 @@ void ams_create(AssetManagementSystem* ams, BufferMemory* buf, int32 chunk_size,
}
// WARNING: buf size see ams_get_buffer_size
void ams_create(AssetManagementSystem* ams, byte* buf, int32 chunk_size, int32 count)
void ams_create(AssetManagementSystem* ams, byte* buf, int32 chunk_size, int32 count, int32 overhead = 0)
{
ASSERT_SIMPLE(chunk_size);
// setup hash_map
hashmap_create(&ams->hash_map, count, sizeof(HashEntryInt64), buf);
ams->overhead = overhead;
// setup asset_memory
ams->asset_memory.count = count;
ams->asset_memory.chunk_size = sizeof(Asset);
@ -108,7 +111,7 @@ void ams_free(AssetManagementSystem* ams)
inline
int32 ams_calculate_chunks(const AssetManagementSystem* ams, int32 byte_size)
{
return (int32) CEIL_DIV(byte_size, ams->asset_data_memory.chunk_size);
return (int32) CEIL_DIV(byte_size + ams->overhead, ams->asset_data_memory.chunk_size);
}
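
A quick worked example of how the new overhead value feeds into the chunk calculation; the concrete numbers are assumptions for illustration only:

// chunk_size = 4096, overhead = 192 (e.g. a Texture header stored alongside the pixel data)
// image payload of 8100 bytes:
//   without overhead: CEIL_DIV(8100, 4096)       = 2 chunks
//   with overhead:    CEIL_DIV(8100 + 192, 4096) = 3 chunks
int32 chunks = ams_calculate_chunks(ams, 8100);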
inline
@ -122,8 +125,6 @@ int64 ams_get_buffer_size(int32 count, int32 chunk_size)
inline
void ams_update_stats(AssetManagementSystem* ams)
{
// @bug We should check the hash map or the memory status, we could still have old values in here
ams->vram_size = 0;
ams->ram_size = 0;
ams->asset_count = 0;
@ -197,10 +198,14 @@ Asset* ams_get_asset(AssetManagementSystem* ams, const char* key)
{
HashEntry* entry = hashmap_get_entry(&ams->hash_map, key);
// @bug entry->value seems to be an address outside of any known buffer, how?
DEBUG_MEMORY_READ(
(uint64) (entry ? (Asset *) entry->value : 0),
entry ? ((Asset *) entry->value)->ram_size + sizeof(Asset) : 0
entry ? sizeof(Asset) : 0
);
DEBUG_MEMORY_READ(
(uint64) (entry ? (Asset *) entry->value : 0),
entry ? ((Asset *) entry->value)->self + ((Asset *) entry->value)->ram_size : 0
);
return entry ? (Asset *) entry->value : NULL;
@ -211,10 +216,14 @@ Asset* ams_get_asset(AssetManagementSystem* ams, const char* key, uint64 hash)
{
HashEntry* entry = hashmap_get_entry(&ams->hash_map, key, hash);
// @bug entry->value seems to be an address outside of any known buffer, how?
DEBUG_MEMORY_READ(
(uint64) (entry ? (Asset *) entry->value : 0),
entry ? ((Asset *) entry->value)->ram_size + sizeof(Asset) : 0
entry ? sizeof(Asset) : 0
);
DEBUG_MEMORY_READ(
(uint64) (entry ? (Asset *) entry->value : 0),
entry ? ((Asset *) entry->value)->self + ((Asset *) entry->value)->ram_size : 0
);
return entry ? (Asset *) entry->value : NULL;
@ -248,6 +257,7 @@ Asset* thrd_ams_get_asset(AssetManagementSystem* ams, const char* key, uint64 ha
// @todo implement defragment command to optimize memory layout since the memory layout will become fragmented over time
// @performance This function is VERY important, check if we can optimize it
// We could probably optimize the threaded version by adding an atomic_set_release(asset->is_loaded, true);
Asset* ams_reserve_asset(AssetManagementSystem* ams, const char* name, uint32 elements = 1)
{
int64 free_asset = chunk_reserve(&ams->asset_memory, elements, true);
@ -315,6 +325,48 @@ Asset* ams_reserve_asset(AssetManagementSystem* ams, const char* name, uint32 el
return asset;
}
void ams_garbage_collect(AssetManagementSystem* ams, uint64 time, uint64 dt)
{
Asset* asset = ams->first;
while (asset) {
// @performance We cannot just remove ram and keep vram. This is a huge flaw
if (asset->can_garbage_collect_ram && asset->can_garbage_collect_vram && time - asset->last_access >= dt) {
ams_free_asset(ams, asset);
}
asset = asset->next;
}
}
void ams_garbage_collect(AssetManagementSystem* ams)
{
Asset* asset = ams->first;
while (asset) {
// @performance We cannot just remove ram and keep vram. This is a huge flaw
if (asset->can_garbage_collect_ram && asset->can_garbage_collect_vram) {
ams_free_asset(ams, asset);
}
asset = asset->next;
}
}
void thrd_ams_garbage_collect(AssetManagementSystem* ams, uint64 time, uint64 dt)
{
pthread_mutex_lock(&ams->mutex);
ams_garbage_collect(ams, time, dt);
pthread_mutex_unlock(&ams->mutex);
}
void thrd_ams_garbage_collect(AssetManagementSystem* ams)
{
pthread_mutex_lock(&ams->mutex);
ams_garbage_collect(ams);
pthread_mutex_unlock(&ams->mutex);
}
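
For context, a hedged sketch of how the threaded garbage collection above might be driven from a frame/update loop; the time source name and the threshold value are assumptions, not part of this commit:

// Hypothetical downtime hook (time unit and threshold are assumed):
uint64 now = platform_time_now();          // assumed time source, not from this diff
thrd_ams_garbage_collect(&ams, now, 5000); // dt threshold in the same unit as now
// or, ignoring access times entirely:
thrd_ams_garbage_collect(&ams);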
Asset* thrd_ams_reserve_asset(AssetManagementSystem* ams, const char* name, uint32 elements = 1) {
pthread_mutex_lock(&ams->mutex);
Asset* asset = ams_reserve_asset(ams, name, elements);

View File

@ -89,17 +89,18 @@ struct AudioMixer {
// @todo add mutex for locking and create threaded functions
// do we need a condition or semaphore?
// Wait, why do we even need threading? Isn't the threading handled by the file loading?
};
bool audio_mixer_is_active(AudioMixer* mixer) {
if (mixer->state_new == AUDIO_MIXER_STATE_ACTIVE
&& atomic_get((int32 *) &mixer->state_new) == AUDIO_MIXER_STATE_ACTIVE
&& atomic_get_relaxed((int32 *) &mixer->state_new) == AUDIO_MIXER_STATE_ACTIVE
) {
return true;
}
AudioMixerState mixer_state;
if ((mixer_state = (AudioMixerState) atomic_get((int32 *) &mixer->state_new)) != mixer->state_old) {
if ((mixer_state = (AudioMixerState) atomic_get_relaxed((int32 *) &mixer->state_new)) != mixer->state_old) {
if (mixer_state != AUDIO_MIXER_STATE_UNINITIALIZED) {
audio_load(
mixer->window,

View File

@ -23,6 +23,7 @@ uint64 hash_djb2(const char* key) {
return hash;
}
inline
uint64 hash_sdbm(const byte* key)
{
uint64 hash = 0;
@ -35,6 +36,7 @@ uint64 hash_sdbm(const byte* key)
return hash;
}
inline
uint64 hash_lose_lose(const byte* key)
{
uint64 hash = 0;
@ -47,7 +49,9 @@ uint64 hash_lose_lose(const byte* key)
return hash;
}
uint64 hash_polynomial_rolling(const char* str) {
inline
uint64 hash_polynomial_rolling(const char* str)
{
const int32 p = 31;
const int32 m = 1000000009;
uint64 hash = 0;
@ -62,7 +66,9 @@ uint64 hash_polynomial_rolling(const char* str) {
return hash;
}
uint64 hash_fnv1a(const char* str) {
inline
uint64 hash_fnv1a(const char* str)
{
const uint64 FNV_OFFSET_BASIS = 14695981039346656037UL;
const uint64 FNV_PRIME = 1099511628211UL;
uint64 hash = FNV_OFFSET_BASIS;
@ -76,34 +82,144 @@ uint64 hash_fnv1a(const char* str) {
return hash;
}
inline
uint32 hash_oat(const char* str)
{
uint32 h = 0;
uint32 hash = 0;
while(*str) {
h += *str++;
h += (h << 10);
h ^= (h >> 6);
hash += *str++;
hash += (hash << 10);
hash ^= (hash >> 6);
}
h += (h << 3);
h ^= (h >> 11);
h += (h << 15);
hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);
return h;
return hash;
}
inline
uint32 hash_ejb(const char* str)
{
const uint32 PRIME1 = 37;
const uint32 PRIME2 = 1048583;
uint32 h = 0;
uint32 hash = 0;
while (*str) {
h = h * PRIME1 ^ (*str++ - ' ');
hash = hash * PRIME1 ^ (*str++ - ' ');
}
return h % PRIME2;
return hash % PRIME2;
}
////////////////////////////////////
// Seeded hash functions
////////////////////////////////////
inline constexpr
uint64 hash_djb2_seeded(const char* key, int32 seed)
{
uint64 hash = 5381;
int32 c;
while ((c = *key++)) {
hash = ((hash << 5) + hash) + c;
}
return hash ^ (seed + (seed << 6) + (seed >> 2));
}
inline
uint64 hash_sdbm_seeded(const char* key, int32 seed)
{
uint64 hash = 0;
int32 c;
while ((c = *key++)) {
hash = c + (hash << 6) + (hash << 16) - hash;
}
return hash ^ (seed + (seed << 6) + (seed >> 2));
}
inline
uint64 hash_lose_lose_seeded(const char* key, int32 seed)
{
uint64 hash = 0;
int32 c;
while ((c = *key++)) {
hash += c;
}
return hash ^ (seed + (seed << 6) + (seed >> 2));
}
inline
uint64 hash_polynomial_rolling_seeded(const char* str, int32 seed)
{
const int32 p = 31;
const int32 m = 1000000009;
uint64 hash = 0;
uint64 p_pow = 1;
while (*str) {
hash = (hash + (*str - 'a' + 1) * p_pow) % m;
p_pow = (p_pow * p) % m;
str++;
}
return hash ^ (seed + (seed << 6) + (seed >> 2));
}
inline
uint64 hash_fnv1a_seeded(const char* str, int32 seed)
{
const uint64 FNV_OFFSET_BASIS = 14695981039346656037UL;
const uint64 FNV_PRIME = 1099511628211UL;
uint64 hash = FNV_OFFSET_BASIS;
while (*str) {
hash ^= (byte) *str;
hash *= FNV_PRIME;
str++;
}
return hash ^ (seed + (seed << 6) + (seed >> 2));
}
inline
uint64 hash_oat_seeded(const char* str, int32 seed)
{
uint64 hash = 0;
while(*str) {
hash += *str++;
hash += (hash << 10);
hash ^= (hash >> 6);
}
hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);
return hash ^ (seed + (seed << 6) + (seed >> 2));
}
inline
uint64 hash_ejb_seeded(const char* str, int32 seed)
{
const uint64 PRIME1 = 37;
const uint64 PRIME2 = 1048583;
uint64 hash = 0;
while (*str) {
hash = hash * PRIME1 ^ (*str++ - ' ');
}
return (hash % PRIME2) ^ (seed + (seed << 6) + (seed >> 2));
}
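
The seeded variants are what the perfect hash map further down uses to retry with different seeds until a fixed key set maps without collisions. A minimal usage sketch; the key and table size here are made up:

// Pick a slot for a key under a given seed; a perfect-hash search retries with new seeds on collision.
int32 slot = (int32) (hash_djb2_seeded("assets/grass_diffuse", 12345) % 64);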
#endif

View File

@ -161,19 +161,19 @@ void update_timing_stat_reset(uint32 stat)
inline
void reset_counter(int32 id)
{
atomic_set(&debug_container->counter[id], 0);
atomic_set_acquire(&debug_container->counter[id], 0);
}
inline
void log_increment(int32 id, int64 by = 1)
{
atomic_add(&debug_container->counter[id], by);
atomic_add_acquire(&debug_container->counter[id], by);
}
inline
void log_counter(int32 id, int64 value)
{
atomic_set(&debug_container->counter[id], value);
atomic_set_acquire(&debug_container->counter[id], value);
}
// @todo don't use a pointer to this should be in a global together with other logging data (see Log.h)
@ -234,9 +234,9 @@ void debug_memory_log(uint64 start, uint64 size, int32 type, const char* functio
return;
}
uint64 idx = atomic_fetch_add(&mem->action_idx, 1);
uint64 idx = atomic_fetch_add_relaxed(&mem->action_idx, 1);
if (idx >= ARRAY_COUNT(mem->last_action)) {
atomic_set(&mem->action_idx, 1);
atomic_set_acquire(&mem->action_idx, 1);
idx %= ARRAY_COUNT(mem->last_action);
}
@ -266,9 +266,9 @@ void debug_memory_reserve(uint64 start, uint64 size, int32 type, const char* fun
return;
}
uint64 idx = atomic_fetch_add(&mem->reserve_action_idx, 1);
uint64 idx = atomic_fetch_add_relaxed(&mem->reserve_action_idx, 1);
if (idx >= ARRAY_COUNT(mem->reserve_action)) {
atomic_set(&mem->reserve_action_idx, 1);
atomic_set_acquire(&mem->reserve_action_idx, 1);
idx %= ARRAY_COUNT(mem->last_action);
}

View File

@ -170,9 +170,6 @@ int64 buffer_load(BufferMemory* buf, const byte* data)
buf->alignment = SWAP_ENDIAN_LITTLE(*((int32 *) data));
data += sizeof(buf->alignment);
buf->element_alignment = SWAP_ENDIAN_LITTLE(*((int32 *) data));
data += sizeof(buf->element_alignment);
// End
buf->end = buf->memory + SWAP_ENDIAN_LITTLE(*((uint64 *) data));
data += sizeof(uint64);

View File

@ -43,23 +43,6 @@ struct ChunkMemory {
uint64* free;
};
struct ThreadedChunkMemory {
byte* memory;
uint64 count;
uint64 size;
uint64 chunk_size;
int64 last_pos;
int32 alignment;
// length = count
// free describes which locations are used and which are free
uint64* free;
pthread_mutex_t mutex;
pthread_cond_t cond;
};
inline
void chunk_alloc(ChunkMemory* buf, uint64 count, uint64 chunk_size, int32 alignment = 64)
{
@ -69,11 +52,11 @@ void chunk_alloc(ChunkMemory* buf, uint64 count, uint64 chunk_size, int32 alignm
chunk_size = ROUND_TO_NEAREST(chunk_size, alignment);
buf->memory = alignment < 2
? (byte *) platform_alloc(count * chunk_size + sizeof(buf->free) * CEIL_DIV(count, 64))
: (byte *) platform_alloc_aligned(count * chunk_size + sizeof(buf->free) * CEIL_DIV(count, 64), alignment);
? (byte *) platform_alloc(count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64))
: (byte *) platform_alloc_aligned(count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64), alignment);
buf->count = count;
buf->size = count * chunk_size + sizeof(buf->free) * CEIL_DIV(count, 64);
buf->size = count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64);
buf->chunk_size = chunk_size;
buf->last_pos = -1;
buf->alignment = alignment;
@ -86,17 +69,6 @@ void chunk_alloc(ChunkMemory* buf, uint64 count, uint64 chunk_size, int32 alignm
DEBUG_MEMORY_INIT((uint64) buf->memory, buf->size);
}
inline
void chunk_free(ChunkMemory* buf)
{
DEBUG_MEMORY_DELETE((uint64) buf->memory, buf->size);
if (buf->alignment < 2) {
platform_free((void **) &buf->memory);
} else {
platform_aligned_free((void **) &buf->memory);
}
}
inline
void chunk_init(ChunkMemory* buf, BufferMemory* data, uint64 count, uint64 chunk_size, int32 alignment = 64)
{
@ -105,10 +77,10 @@ void chunk_init(ChunkMemory* buf, BufferMemory* data, uint64 count, uint64 chunk
chunk_size = ROUND_TO_NEAREST(chunk_size, alignment);
buf->memory = buffer_get_memory(data, count * chunk_size + sizeof(buf->free) * CEIL_DIV(count, 64));
buf->memory = buffer_get_memory(data, count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64));
buf->count = count;
buf->size = count * chunk_size + sizeof(buf->free) * CEIL_DIV(count, 64);
buf->size = count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64);
buf->chunk_size = chunk_size;
buf->last_pos = -1;
buf->alignment = alignment;
@ -134,7 +106,7 @@ void chunk_init(ChunkMemory* buf, byte* data, uint64 count, uint64 chunk_size, i
buf->memory = data;
buf->count = count;
buf->size = count * chunk_size + sizeof(buf->free) * CEIL_DIV(count, 64);
buf->size = count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64);
buf->chunk_size = chunk_size;
buf->last_pos = -1;
buf->alignment = alignment;
@ -148,10 +120,22 @@ void chunk_init(ChunkMemory* buf, byte* data, uint64 count, uint64 chunk_size, i
DEBUG_MEMORY_RESERVE((uint64) buf->memory, buf->size, 187);
}
inline
void chunk_free(ChunkMemory* buf)
{
DEBUG_MEMORY_DELETE((uint64) buf->memory, buf->size);
if (buf->alignment < 2) {
platform_free((void **) &buf->memory);
} else {
platform_aligned_free((void **) &buf->memory);
}
}
inline
byte* chunk_get_element(ChunkMemory* buf, uint64 element, bool zeroed = false)
{
byte* offset = buf->memory + element * buf->chunk_size;
ASSERT_SIMPLE(offset);
if (zeroed) {
memset((void *) offset, 0, buf->chunk_size);
@ -159,8 +143,6 @@ byte* chunk_get_element(ChunkMemory* buf, uint64 element, bool zeroed = false)
DEBUG_MEMORY_READ((uint64) offset, buf->chunk_size);
ASSERT_SIMPLE(offset);
return offset;
}

View File

@ -132,6 +132,8 @@ void heap_pop(Heap* heap, void* out) {
return;
}
DEBUG_MEMORY_READ((uint64) heap->elements, heap->element_size);
memcpy(out, heap->elements, heap->element_size);
void* last_element = heap->elements + ((heap->size - 1) * heap->element_size);
memcpy(heap->elements, last_element, heap->element_size);
@ -141,6 +143,8 @@ void heap_pop(Heap* heap, void* out) {
inline
void* heap_peek(Heap* heap) {
DEBUG_MEMORY_READ((uint64) heap->elements, heap->element_size);
return heap->elements;
}

View File

@ -12,35 +12,57 @@
#include "../stdlib/Types.h"
#include "RingMemory.h"
typedef RingMemory Queue;
// WARNING: Structure needs to be the same as RingMemory
struct Queue {
byte* memory;
byte* end;
// @question Consider adding the element size to the Queue struct -> then we don't need to pass it as a parameter after initialization
byte* head;
// This variable is usually only used by single producer/consumer code mostly found in threads.
// One thread inserts elements -> updates head
// The other thread reads elements -> updates tail
// This code itself doesn't change this variable
byte* tail;
uint64 size;
uint32 alignment;
// The ring memory ends here
uint32 element_size;
};
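
Since Queue has to mirror RingMemory field for field (see the warning above), one option is a compile-time guard; a minimal sketch, assuming offsetof from <cstddef> and static_assert are acceptable in this codebase:

// Hypothetical layout guard; fails to compile if the shared prefix ever diverges.
static_assert(offsetof(Queue, size) == offsetof(RingMemory, size), "Queue must mirror RingMemory");
static_assert(offsetof(Queue, alignment) == offsetof(RingMemory, alignment), "Queue must mirror RingMemory");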
inline
void queue_alloc(Queue* queue, uint64 element_count, uint32 element_size, int32 alignment = 64)
void queue_alloc(Queue* queue, uint64 element_count, uint32 element_size, uint32 alignment = 64)
{
// @bug The alignment needs to be included in EVERY element
ring_alloc(queue, element_count * element_size, alignment);
element_size = ROUND_TO_NEAREST(element_size, alignment);
ring_alloc((RingMemory *) queue, element_count * element_size, alignment);
queue->element_size = element_size;
}
inline
void queue_init(Queue* queue, BufferMemory* buf, uint64 element_count, uint32 element_size, int32 alignment = 64)
void queue_init(Queue* queue, BufferMemory* buf, uint64 element_count, uint32 element_size, uint32 alignment = 64)
{
// @bug The alignment needs to be included in EVERY element
ring_init(queue, buf, element_count * element_size, alignment);
element_size = ROUND_TO_NEAREST(element_size, alignment);
ring_init((RingMemory *) queue, buf, element_count * element_size, alignment);
queue->element_size = element_size;
}
inline
void queue_init(Queue* queue, byte* buf, uint64 element_count, uint32 element_size, int32 alignment = 64)
void queue_init(Queue* queue, byte* buf, uint64 element_count, uint32 element_size, uint32 alignment = 64)
{
// @bug The alignment needs to be included in EVERY element
ring_init(queue, buf, element_count * element_size, alignment);
element_size = ROUND_TO_NEAREST(element_size, alignment);
ring_init((RingMemory *) queue, buf, element_count * element_size, alignment);
queue->element_size = element_size;
}
inline
void queue_free(Queue* queue)
{
ring_free(queue);
ring_free((RingMemory *) queue);
}
inline
@ -54,60 +76,145 @@ bool queue_set_empty(Queue* queue) {
}
inline
bool queue_is_full(Queue* queue, uint64 size, byte aligned = 0) {
return !ring_commit_safe((RingMemory *) queue, size, aligned);
bool queue_is_full(Queue* queue) {
return !ring_commit_safe((RingMemory *) queue, queue->element_size, queue->alignment);
}
// Conditional Lock
inline
byte* queue_enqueue(Queue* queue, byte* data, uint64 size, byte aligned = 0)
void queue_enqueue_unique(Queue* queue, const byte* data)
{
byte* mem = ring_get_memory_nomove(queue, size, aligned);
memcpy(mem, data, size);
ring_move_pointer(queue, &queue->head, size, aligned);
ASSERT_SIMPLE((uint64_t) data % 4 == 0);
byte* tail = queue->tail;
while (tail != queue->head) {
ASSERT_SIMPLE((uint64_t) tail % 4 == 0);
// @performance we could probably make this faster since we don't need to compare the entire range
if (is_equal_aligned(tail, data, queue->element_size) == 0) {
return;
}
ring_move_pointer((RingMemory *) queue, &tail, queue->element_size, queue->alignment);
}
if (!ring_commit_safe((RingMemory *) queue, queue->element_size, queue->alignment)) {
return;
}
byte* mem = ring_get_memory((RingMemory *) queue, queue->element_size, queue->alignment);
memcpy(mem, data, queue->element_size);
}
inline
byte* queue_enqueue(Queue* queue, byte* data)
{
byte* mem = ring_get_memory_nomove((RingMemory *) queue, queue->element_size, queue->alignment);
memcpy(mem, data, queue->element_size);
ring_move_pointer((RingMemory *) queue, &queue->head, queue->element_size, queue->alignment);
return mem;
}
inline
byte* queue_enqueue_start(Queue* queue, uint64 size, byte aligned = 0)
byte* queue_enqueue_safe(Queue* queue, byte* data)
{
return ring_get_memory_nomove(queue, size, aligned);
if (queue_is_full(queue)) {
return NULL;
}
byte* mem = ring_get_memory_nomove((RingMemory *) queue, queue->element_size, queue->alignment);
memcpy(mem, data, queue->element_size);
ring_move_pointer((RingMemory *) queue, &queue->head, queue->element_size, queue->alignment);
return mem;
}
// WARNING: Only useful for single producer single consumer
inline
byte* queue_enqueue_wait_atomic(Queue* queue, byte* data)
{
while (!ring_commit_safe_atomic((RingMemory *) queue, queue->element_size, queue->alignment)) {}
byte* mem = ring_get_memory_nomove((RingMemory *) queue, queue->element_size, queue->alignment);
memcpy(mem, data, queue->element_size);
ring_move_pointer((RingMemory *) queue, &queue->head, queue->element_size, queue->alignment);
return mem;
}
// WARNING: Only useful for single producer single consumer
inline
byte* queue_enqueue_safe_atomic(Queue* queue, byte* data)
{
if (!ring_commit_safe_atomic((RingMemory *) queue, queue->element_size, queue->alignment)) {
return NULL;
}
byte* mem = ring_get_memory_nomove((RingMemory *) queue, queue->element_size, queue->alignment);
memcpy(mem, data, queue->element_size);
ring_move_pointer((RingMemory *) queue, &queue->head, queue->element_size, queue->alignment);
return mem;
}
inline
void queue_enqueue_end(Queue* queue, uint64 size, byte aligned = 0)
byte* queue_enqueue_start(Queue* queue)
{
ring_move_pointer(queue, &queue->head, size, aligned);
return ring_get_memory_nomove((RingMemory *) queue, queue->element_size, queue->alignment);
}
inline
bool queue_dequeue(Queue* queue, byte* data, uint64 size, byte aligned = 0)
void queue_enqueue_end(Queue* queue)
{
ring_move_pointer((RingMemory *) queue, &queue->head, queue->element_size, queue->alignment);
}
inline
bool queue_dequeue(Queue* queue, byte* data)
{
if (queue->head == queue->tail) {
return false;
}
if (size == 4) {
if (queue->element_size == 4) {
*((int32 *) data) = *((int32 *) queue->tail);
} else {
memcpy(data, queue->tail, size);
memcpy(data, queue->tail, queue->element_size);
}
ring_move_pointer(queue, &queue->tail, size, aligned);
ring_move_pointer((RingMemory *) queue, &queue->tail, queue->element_size, queue->alignment);
return true;
}
// WARNING: Only useful for single producer single consumer
inline
bool queue_dequeue_atomic(Queue* queue, byte* data)
{
if (atomic_get_relaxed((uint64 *) &queue->head) == (uint64) queue->tail) {
return false;
}
if (queue->element_size == 4) {
*((int32 *) data) = *((int32 *) queue->tail);
} else {
memcpy(data, queue->tail, queue->element_size);
}
ring_move_pointer((RingMemory *) queue, &queue->tail, queue->element_size, queue->alignment);
return true;
}
inline
byte* queue_dequeue_keep(Queue* queue, uint64 size, byte aligned = 0)
byte* queue_dequeue_keep(Queue* queue)
{
if (queue->head == queue->tail) {
return NULL;
}
byte* data = queue->tail;
ring_move_pointer(queue, &queue->tail, size, aligned);
ring_move_pointer((RingMemory *) queue, &queue->tail, queue->element_size, queue->alignment);
return data;
}
@ -119,9 +226,9 @@ byte* queue_dequeue_start(Queue* queue)
}
inline
void queue_dequeue_end(Queue* queue, uint64 size, byte aligned = 0)
void queue_dequeue_end(Queue* queue)
{
ring_move_pointer(queue, &queue->tail, size, aligned);
ring_move_pointer((RingMemory *) queue, &queue->tail, queue->element_size, queue->alignment);
}
#endif
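
A short usage sketch of the new fixed-element-size queue API; the element type and counts are illustrative only:

Queue commands;
queue_alloc(&commands, 128, sizeof(int32), 4); // 128 elements, 4-byte alignment

int32 cmd = 7;
queue_enqueue(&commands, (byte *) &cmd);

int32 out;
if (queue_dequeue(&commands, (byte *) &out)) {
    // handle out
}

queue_free(&commands);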

View File

@ -24,12 +24,16 @@
#include "../platform/win32/Allocator.h"
#include "../platform/win32/threading/ThreadDefines.h"
#include "../platform/win32/threading/Semaphore.h"
#include "../platform/win32/threading/Atomic.h"
#elif __linux__
#include "../platform/linux/Allocator.h"
#include "../platform/linux/threading/ThreadDefines.h"
#include "../platform/linux/threading/Semaphore.h"
#include "../platform/linux/threading/Atomic.h"
#endif
// WARNING: Changing this structure has effects on other data structures (e.g. Queue)
// When changing it make sure you understand what you are doing
struct RingMemory {
byte* memory;
byte* end;
@ -43,14 +47,11 @@ struct RingMemory {
byte* tail;
uint64 size;
int32 alignment;
int32 element_alignment;
uint32 alignment;
};
// @bug alignment should also include the end point, not just the start
inline
void ring_alloc(RingMemory* ring, uint64 size, int32 alignment = 64)
void ring_alloc(RingMemory* ring, uint64 size, uint32 alignment = 64)
{
ASSERT_SIMPLE(size);
@ -63,7 +64,6 @@ void ring_alloc(RingMemory* ring, uint64 size, int32 alignment = 64)
ring->tail = ring->memory;
ring->size = size;
ring->alignment = alignment;
ring->element_alignment = 0;
memset(ring->memory, 0, ring->size);
@ -71,7 +71,7 @@ void ring_alloc(RingMemory* ring, uint64 size, int32 alignment = 64)
}
inline
void ring_init(RingMemory* ring, BufferMemory* buf, uint64 size, int32 alignment = 64)
void ring_init(RingMemory* ring, BufferMemory* buf, uint64 size, uint32 alignment = 64)
{
ASSERT_SIMPLE(size);
@ -82,26 +82,23 @@ void ring_init(RingMemory* ring, BufferMemory* buf, uint64 size, int32 alignment
ring->tail = ring->memory;
ring->size = size;
ring->alignment = alignment;
ring->element_alignment = 0;
DEBUG_MEMORY_INIT((uint64) ring->memory, ring->size);
DEBUG_MEMORY_RESERVE((uint64) ring->memory, ring->size, 187);
}
inline
void ring_init(RingMemory* ring, byte* buf, uint64 size, int32 alignment = 64)
void ring_init(RingMemory* ring, byte* buf, uint64 size, uint32 alignment = 64)
{
ASSERT_SIMPLE(size);
// @bug what if an alignment is defined?
ring->memory = buf;
ring->memory = (byte *) ROUND_TO_NEAREST((uintptr_t) buf, alignment);
ring->end = ring->memory + size;
ring->head = ring->memory;
ring->tail = ring->memory;
ring->size = size;
ring->alignment = alignment;
ring->element_alignment = 0;
memset(ring->memory, 0, ring->size);
@ -122,10 +119,6 @@ void ring_free(RingMemory* ring)
inline
byte* ring_calculate_position(const RingMemory* ring, uint64 size, byte aligned = 0)
{
if (aligned == 0) {
aligned = (byte) OMS_MAX(ring->element_alignment, 1);
}
byte* head = ring->head;
if (aligned > 1) {
@ -158,9 +151,9 @@ void ring_move_pointer(RingMemory* ring, byte** pos, uint64 size, byte aligned =
{
ASSERT_SIMPLE(size <= ring->size);
if (aligned == 0) {
aligned = (byte) OMS_MAX(ring->element_alignment, 1);
}
// Actually, we cannot be sure that this is a read, it could also be a write.
// However, it is better to do it once here than manually in every place that uses this function
DEBUG_MEMORY_READ((uint64) *pos, size);
if (aligned > 1) {
uintptr_t address = (uintptr_t) *pos;
@ -184,10 +177,6 @@ byte* ring_get_memory(RingMemory* ring, uint64 size, byte aligned = 0, bool zero
{
ASSERT_SIMPLE(size <= ring->size);
if (aligned == 0) {
aligned = (byte) OMS_MAX(ring->element_alignment, 1);
}
if (aligned > 1) {
uintptr_t address = (uintptr_t) ring->head;
ring->head += (aligned - (address& (aligned - 1))) % aligned;
@ -222,10 +211,6 @@ byte* ring_get_memory_nomove(RingMemory* ring, uint64 size, byte aligned = 0, bo
{
ASSERT_SIMPLE(size <= ring->size);
if (aligned == 0) {
aligned = (byte) OMS_MAX(ring->element_alignment, 1);
}
byte* pos = ring->head;
if (aligned > 1) {
@ -285,6 +270,27 @@ bool ring_commit_safe(const RingMemory* ring, uint64 size, byte aligned = 0)
}
}
inline
bool ring_commit_safe_atomic(const RingMemory* ring, uint64 size, byte aligned = 0)
{
// aligned * 2 since that should be the maximum overhead for an element
// @bug could this result in a case where the ring is considered empty/full (false positive/negative)?
// The "correct" version would probably to use ring_move_pointer in some form
uint64 max_mem_required = size + aligned * 2;
uint64 tail = atomic_get_relaxed((uint64 *) &ring->tail);
uint64 head = atomic_get_relaxed((uint64 *) &ring->head);
if (tail < head) {
return ((uint64) (ring->end - head)) > max_mem_required
|| ((uint64) (tail - (uint64) ring->memory)) > max_mem_required;
} else if (tail > head) {
return ((uint64) (tail - head)) > max_mem_required;
} else {
return true;
}
}
inline
void ring_force_head_update(const RingMemory* ring)
{
@ -314,9 +320,6 @@ int64 ring_dump(const RingMemory* ring, byte* data)
*((int32 *) data) = SWAP_ENDIAN_LITTLE(ring->alignment);
data += sizeof(ring->alignment);
*((int32 *) data) = SWAP_ENDIAN_LITTLE(ring->element_alignment);
data += sizeof(ring->element_alignment);
// tail/End
*((uint64 *) data) = SWAP_ENDIAN_LITTLE((uint64) (ring->tail - ring->memory));
data += sizeof(ring->tail);

View File

@ -0,0 +1,42 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_MEMORY_THREADED_CHUNK_MEMORY_H
#define TOS_MEMORY_THREADED_CHUNK_MEMORY_H
#include <string.h>
#include "../stdlib/Types.h"
#if _WIN32
#include "../platform/win32/threading/Thread.h"
#elif __linux__
#include "../platform/linux/threading/Thread.h"
#endif
struct ThreadedChunkMemory {
byte* memory;
uint64 count;
uint64 size;
uint64 chunk_size;
int64 last_pos;
int32 alignment;
// length = count
// free describes which locations are used and which are free
uint64* free;
// Chunk implementation ends here
// completeness indicates whether the data of a chunk has been completely written
uint64* completeness;
pthread_mutex_t mutex;
pthread_cond_t cond;
};
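
Going by the comment above, completeness appears to be a per-chunk bitmask like free; a hedged sketch of how a reader might test whether chunk i is fully written (the helper name and bit layout are assumptions):

// Assumption: one bit per chunk, packed into uint64 words like the free bitmask.
inline bool thrd_chunk_is_complete(const ThreadedChunkMemory* buf, uint64 i) {
    return (buf->completeness[i / 64] >> (i % 64)) & 1;
}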
#endif

View File

@ -36,8 +36,10 @@ struct ThreadedQueue {
byte* tail;
uint64 size;
int32 alignment;
int32 element_alignment;
uint32 alignment;
// The ring memory ends here
uint32 element_size;
// We support both conditional locking and semaphore locking
// These values are not initialized and not used unless you use the queue
@ -48,26 +50,14 @@ struct ThreadedQueue {
sem_t full;
};
// @question Consider to add the element size into the Queue struct -> we don't need to pass it after initialization as parameter
inline
void thrd_queue_alloc(ThreadedQueue* queue, uint32 element_count, uint64 element_size, int32 alignment = 64)
void thrd_queue_alloc(ThreadedQueue* queue, uint32 element_count, uint64 element_size, uint32 alignment = 64)
{
// @bug The alignment needs to be included in EVERY element
element_size = ROUND_TO_NEAREST(element_size, alignment);
ring_alloc((RingMemory *) queue, element_count * element_size, alignment);
pthread_mutex_init(&queue->mutex, NULL);
pthread_cond_init(&queue->cond, NULL);
sem_init(&queue->empty, element_count);
sem_init(&queue->full, 0);
}
inline
void thrd_queue_init(ThreadedQueue* queue, BufferMemory* buf, uint32 element_count, uint64 element_size, int32 alignment = 64)
{
// @bug The alignment needs to be included in EVERY element
ring_init((RingMemory *) queue, buf, element_count * element_size, alignment);
queue->element_size = element_size;
pthread_mutex_init(&queue->mutex, NULL);
pthread_cond_init(&queue->cond, NULL);
@ -77,11 +67,30 @@ void thrd_queue_init(ThreadedQueue* queue, BufferMemory* buf, uint32 element_cou
}
inline
void thrd_queue_init(ThreadedQueue* queue, byte* buf, uint32 element_count, uint64 element_size, int32 alignment = 64)
void thrd_queue_init(ThreadedQueue* queue, BufferMemory* buf, uint32 element_count, uint64 element_size, uint32 alignment = 64)
{
// @bug The alignment needs to be included in EVERY element
element_size = ROUND_TO_NEAREST(element_size, alignment);
ring_init((RingMemory *) queue, buf, element_count * element_size, alignment);
queue->element_size = element_size;
pthread_mutex_init(&queue->mutex, NULL);
pthread_cond_init(&queue->cond, NULL);
sem_init(&queue->empty, element_count);
sem_init(&queue->full, 0);
}
inline
void thrd_queue_init(ThreadedQueue* queue, byte* buf, uint32 element_count, uint64 element_size, uint32 alignment = 64)
{
element_size = ROUND_TO_NEAREST(element_size, alignment);
ring_init((RingMemory *) queue, buf, element_count * element_size, alignment);
queue->element_size = element_size;
pthread_mutex_init(&queue->mutex, NULL);
pthread_cond_init(&queue->cond, NULL);
@ -101,7 +110,7 @@ void thrd_queue_free(ThreadedQueue* queue)
// @todo Create enqueue_unique and enqueue_unique_sem
inline
void thrd_queue_enqueue_unique_wait(ThreadedQueue* queue, const byte* data, uint64 size, byte aligned = 0)
void thrd_queue_enqueue_unique_wait(ThreadedQueue* queue, const byte* data)
{
ASSERT_SIMPLE((uint64_t) data % 4 == 0);
pthread_mutex_lock(&queue->mutex);
@ -111,28 +120,28 @@ void thrd_queue_enqueue_unique_wait(ThreadedQueue* queue, const byte* data, uint
ASSERT_SIMPLE((uint64_t) tail % 4 == 0);
// @performance we could probably make this faster since we don't need to compare the entire range
if (is_equal_aligned(tail, data, size) == 0) {
if (is_equal_aligned(tail, data, queue->element_size) == 0) {
pthread_mutex_unlock(&queue->mutex);
return;
}
ring_move_pointer((RingMemory *) queue, &tail, size, aligned);
ring_move_pointer((RingMemory *) queue, &tail, queue->element_size, queue->alignment);
}
while (!ring_commit_safe((RingMemory *) queue, size, aligned)) {
while (!ring_commit_safe((RingMemory *) queue, queue->element_size, queue->alignment)) {
pthread_cond_wait(&queue->cond, &queue->mutex);
}
byte* mem = ring_get_memory((RingMemory *) queue, size, aligned);
memcpy(mem, data, size);
byte* mem = ring_get_memory((RingMemory *) queue, queue->element_size, queue->alignment);
memcpy(mem, data, queue->element_size);
pthread_cond_signal(&queue->cond);
pthread_mutex_unlock(&queue->mutex);
}
inline
void thrd_queue_enqueue_unique(ThreadedQueue* queue, const byte* data, uint64 size, byte aligned = 0)
void thrd_queue_enqueue_unique(ThreadedQueue* queue, const byte* data)
{
ASSERT_SIMPLE((uint64_t) data % 4 == 0);
pthread_mutex_lock(&queue->mutex);
@ -142,23 +151,23 @@ void thrd_queue_enqueue_unique(ThreadedQueue* queue, const byte* data, uint64 si
ASSERT_SIMPLE((uint64_t) tail % 4 == 0);
// @performance we could probably make this faster since we don't need to compare the entire range
if (is_equal_aligned(tail, data, size) == 0) {
if (is_equal_aligned(tail, data, queue->element_size) == 0) {
pthread_mutex_unlock(&queue->mutex);
return;
}
ring_move_pointer((RingMemory *) queue, &tail, size, aligned);
ring_move_pointer((RingMemory *) queue, &tail, queue->element_size, queue->alignment);
}
if (!ring_commit_safe((RingMemory *) queue, size, aligned)) {
if (!ring_commit_safe((RingMemory *) queue, queue->element_size, queue->alignment)) {
pthread_mutex_unlock(&queue->mutex);
return;
}
byte* mem = ring_get_memory((RingMemory *) queue, size, aligned);
memcpy(mem, data, size);
byte* mem = ring_get_memory((RingMemory *) queue, queue->element_size, queue->alignment);
memcpy(mem, data, queue->element_size);
pthread_cond_signal(&queue->cond);
pthread_mutex_unlock(&queue->mutex);
@ -166,49 +175,49 @@ void thrd_queue_enqueue_unique(ThreadedQueue* queue, const byte* data, uint64 si
// Conditional Lock
inline
void thrd_queue_enqueue(ThreadedQueue* queue, const byte* data, uint64 size, byte aligned = 0)
void thrd_queue_enqueue(ThreadedQueue* queue, const byte* data)
{
pthread_mutex_lock(&queue->mutex);
if (!ring_commit_safe((RingMemory *) queue, size, aligned)) {
if (!ring_commit_safe((RingMemory *) queue, queue->element_size, queue->alignment)) {
pthread_mutex_unlock(&queue->mutex);
return;
}
byte* mem = ring_get_memory((RingMemory *) queue, size, aligned);
memcpy(mem, data, size);
byte* mem = ring_get_memory((RingMemory *) queue, queue->element_size, queue->alignment);
memcpy(mem, data, queue->element_size);
pthread_cond_signal(&queue->cond);
pthread_mutex_unlock(&queue->mutex);
}
inline
void thrd_queue_enqueue_wait(ThreadedQueue* queue, const byte* data, uint64 size, byte aligned = 0)
void thrd_queue_enqueue_wait(ThreadedQueue* queue, const byte* data)
{
pthread_mutex_lock(&queue->mutex);
while (!ring_commit_safe((RingMemory *) queue, size, aligned)) {
while (!ring_commit_safe((RingMemory *) queue, queue->element_size, queue->alignment)) {
pthread_cond_wait(&queue->cond, &queue->mutex);
}
byte* mem = ring_get_memory((RingMemory *) queue, size, aligned);
memcpy(mem, data, size);
byte* mem = ring_get_memory((RingMemory *) queue, queue->element_size, queue->alignment);
memcpy(mem, data, queue->element_size);
pthread_cond_signal(&queue->cond);
pthread_mutex_unlock(&queue->mutex);
}
inline
byte* thrd_queue_enqueue_start_wait(ThreadedQueue* queue, uint64 size, byte aligned = 0)
byte* thrd_queue_enqueue_start_wait(ThreadedQueue* queue)
{
pthread_mutex_lock(&queue->mutex);
while (!ring_commit_safe((RingMemory *) queue, size, aligned)) {
while (!ring_commit_safe((RingMemory *) queue, queue->element_size, queue->alignment)) {
pthread_cond_wait(&queue->cond, &queue->mutex);
}
return ring_get_memory((RingMemory *) queue, size, aligned);
return ring_get_memory((RingMemory *) queue, queue->element_size, queue->alignment);
}
inline
@ -219,7 +228,7 @@ void thrd_queue_enqueue_end_wait(ThreadedQueue* queue)
}
inline
bool thrd_queue_dequeue(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
bool thrd_queue_dequeue(ThreadedQueue* queue, byte* data)
{
if (queue->head == queue->tail) {
return false;
@ -233,12 +242,12 @@ bool thrd_queue_dequeue(ThreadedQueue* queue, byte* data, uint64 size, byte alig
return false;
}
if (size == 4) {
if (queue->element_size == 4) {
*((int32 *) data) = *((int32 *) queue->tail);
} else {
memcpy(data, queue->tail, size);
memcpy(data, queue->tail, queue->element_size);
}
ring_move_pointer((RingMemory *) queue, &queue->tail, size, aligned);
ring_move_pointer((RingMemory *) queue, &queue->tail, queue->element_size, queue->alignment);
pthread_cond_signal(&queue->cond);
pthread_mutex_unlock(&queue->mutex);
@ -256,9 +265,9 @@ bool thrd_queue_empty(ThreadedQueue* queue) {
}
inline
bool thrd_queue_full(ThreadedQueue* queue, uint64 size, byte aligned = 0) {
bool thrd_queue_full(ThreadedQueue* queue) {
pthread_mutex_lock(&queue->mutex);
bool is_full = !ring_commit_safe((RingMemory *) queue, size, aligned);
bool is_full = !ring_commit_safe((RingMemory *) queue, queue->element_size, queue->alignment);
pthread_mutex_unlock(&queue->mutex);
return is_full;
@ -266,7 +275,7 @@ bool thrd_queue_full(ThreadedQueue* queue, uint64 size, byte aligned = 0) {
// Waits until a dequeue is available
inline
void thrd_queue_dequeue_wait(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
void thrd_queue_dequeue_wait(ThreadedQueue* queue, byte* data)
{
pthread_mutex_lock(&queue->mutex);
@ -274,8 +283,8 @@ void thrd_queue_dequeue_wait(ThreadedQueue* queue, byte* data, uint64 size, byte
pthread_cond_wait(&queue->cond, &queue->mutex);
}
memcpy(data, queue->tail, size);
ring_move_pointer((RingMemory *) queue, &queue->tail, size, aligned);
memcpy(data, queue->tail, queue->element_size);
ring_move_pointer((RingMemory *) queue, &queue->tail, queue->element_size, queue->alignment);
pthread_cond_signal(&queue->cond);
pthread_mutex_unlock(&queue->mutex);
@ -294,9 +303,9 @@ byte* thrd_queue_dequeue_start_wait(ThreadedQueue* queue)
}
inline
void thrd_queue_dequeue_end_wait(ThreadedQueue* queue, uint64 size, byte aligned = 0)
void thrd_queue_dequeue_end_wait(ThreadedQueue* queue)
{
ring_move_pointer((RingMemory *) queue, &queue->tail, size, aligned);
ring_move_pointer((RingMemory *) queue, &queue->tail, queue->element_size, queue->alignment);
pthread_cond_signal(&queue->cond);
pthread_mutex_unlock(&queue->mutex);
@ -304,20 +313,20 @@ void thrd_queue_dequeue_end_wait(ThreadedQueue* queue, uint64 size, byte aligned
// Semaphore Lock
inline
void thrd_queue_enqueue_sem_wait(ThreadedQueue* queue, const byte* data, uint64 size, byte aligned = 0)
void thrd_queue_enqueue_sem_wait(ThreadedQueue* queue, const byte* data)
{
sem_wait(&queue->empty);
pthread_mutex_lock(&queue->mutex);
byte* mem = ring_get_memory((RingMemory *) queue, size, aligned);
memcpy(mem, data, size);
byte* mem = ring_get_memory((RingMemory *) queue, queue->element_size, queue->alignment);
memcpy(mem, data, queue->element_size);
pthread_mutex_unlock(&queue->mutex);
sem_post(&queue->full);
}
inline
bool thrd_queue_enqueue_sem_timedwait(ThreadedQueue* queue, const byte* data, uint64 size, uint64 wait, byte aligned = 0)
bool thrd_queue_enqueue_sem_timedwait(ThreadedQueue* queue, const byte* data, uint64 wait)
{
if (sem_timedwait(&queue->empty, wait)) {
return false;
@ -325,8 +334,8 @@ bool thrd_queue_enqueue_sem_timedwait(ThreadedQueue* queue, const byte* data, ui
pthread_mutex_lock(&queue->mutex);
byte* mem = ring_get_memory((RingMemory *) queue, size, aligned);
memcpy(mem, data, size);
byte* mem = ring_get_memory((RingMemory *) queue, queue->element_size, queue->alignment);
memcpy(mem, data, queue->element_size);
pthread_mutex_unlock(&queue->mutex);
sem_post(&queue->full);
@ -335,12 +344,12 @@ bool thrd_queue_enqueue_sem_timedwait(ThreadedQueue* queue, const byte* data, ui
}
inline
byte* thrd_queue_enqueue_start_sem_wait(ThreadedQueue* queue, uint64 size, byte aligned = 0)
byte* thrd_queue_enqueue_start_sem_wait(ThreadedQueue* queue)
{
sem_wait(&queue->empty);
pthread_mutex_lock(&queue->mutex);
return ring_get_memory((RingMemory *) queue, size, aligned);
return ring_get_memory((RingMemory *) queue, queue->element_size, queue->alignment);
}
inline
@ -351,20 +360,20 @@ void thrd_queue_enqueue_end_sem_wait(ThreadedQueue* queue)
}
inline
byte* thrd_queue_dequeue_sem_wait(ThreadedQueue* queue, byte* data, uint64 size, byte aligned = 0)
byte* thrd_queue_dequeue_sem_wait(ThreadedQueue* queue, byte* data)
{
sem_wait(&queue->full);
pthread_mutex_lock(&queue->mutex);
memcpy(data, queue->tail, size);
ring_move_pointer((RingMemory *) queue, &queue->tail, size, aligned);
memcpy(data, queue->tail, queue->element_size);
ring_move_pointer((RingMemory *) queue, &queue->tail, queue->element_size, queue->alignment);
pthread_mutex_unlock(&queue->mutex);
sem_post(&queue->empty);
return data;
}
inline
bool thrd_queue_dequeue_sem_timedwait(ThreadedQueue* queue, byte* data, uint64 size, uint64 wait, byte aligned = 0)
bool thrd_queue_dequeue_sem_timedwait(ThreadedQueue* queue, byte* data, uint64 wait)
{
if (sem_timedwait(&queue->full, wait)) {
return false;
@ -372,8 +381,8 @@ bool thrd_queue_dequeue_sem_timedwait(ThreadedQueue* queue, byte* data, uint64 s
pthread_mutex_lock(&queue->mutex);
memcpy(data, queue->tail, size);
ring_move_pointer((RingMemory *) queue, &queue->tail, size, aligned);
memcpy(data, queue->tail, queue->element_size);
ring_move_pointer((RingMemory *) queue, &queue->tail, queue->element_size, queue->alignment);
pthread_mutex_unlock(&queue->mutex);
sem_post(&queue->empty);
@ -391,9 +400,9 @@ byte* thrd_queue_dequeue_start_sem_wait(ThreadedQueue* queue)
}
inline
void thrd_queue_dequeue_end_sem_wait(ThreadedQueue* queue, uint64 size, byte aligned = 0)
void thrd_queue_dequeue_end_sem_wait(ThreadedQueue* queue)
{
ring_move_pointer((RingMemory *) queue, &queue->tail, size, aligned);
ring_move_pointer((RingMemory *) queue, &queue->tail, queue->element_size, queue->alignment);
pthread_mutex_unlock(&queue->mutex);
sem_post(&queue->empty);
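
A hedged single-producer/single-consumer sketch using the semaphore-based API of this queue; the element type and sizes are assumptions:

ThreadedQueue jobs;
thrd_queue_alloc(&jobs, 256, sizeof(int32), 4);

// Producer thread: blocks while the queue is full.
int32 job_id = 42;
thrd_queue_enqueue_sem_wait(&jobs, (const byte *) &job_id);

// Consumer thread: blocks until an element is available.
int32 next_job;
thrd_queue_dequeue_sem_wait(&jobs, (byte *) &next_job);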

View File

@ -33,8 +33,8 @@ struct ThreadedRingMemory {
uint64 size;
int32 alignment;
int32 element_alignment;
// The ring memory ends here
pthread_mutex_t mutex;
};

View File

@ -45,7 +45,8 @@ void* platform_alloc_aligned(size_t size, int32 alignment)
alignment = page_size;
}
size += alignment - 1 + sizeof(void *) + sizeof(size_t);
size = ROUND_TO_NEAREST(size, alignment);
size += alignment + sizeof(void *) + sizeof(size_t);
void* ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
ASSERT_SIMPLE(ptr != MAP_FAILED);

File diff suppressed because it is too large

View File

@ -61,7 +61,7 @@ int32 pthread_join(pthread_t thread, void** retval) {
}
int32 pthread_mutex_init(pthread_mutex_t* mutex, pthread_mutexattr_t*) {
atomic_set(mutex, 0);
atomic_set_acquire(mutex, 0);
return 0;
}
@ -77,44 +77,43 @@ int32 pthread_mutex_lock(pthread_mutex_t* mutex) {
}
int32 pthread_mutex_unlock(pthread_mutex_t* mutex) {
atomic_set(mutex, 0);
atomic_set_release(mutex, 0);
syscall(SYS_futex, mutex, FUTEX_WAKE, 1, NULL, NULL, 0);
return 0;
}
int32 pthread_cond_init(pthread_cond_t* cond, pthread_condattr_t*) {
atomic_set(cond, 0);
atomic_set_release(cond, 0);
return 0;
}
int32 pthread_cond_wait(pthread_cond_t* cond, pthread_mutex_t* mutex) {
pthread_mutex_unlock(mutex);
syscall(SYS_futex, cond, FUTEX_WAIT, atomic_get(cond), NULL, NULL, 0);
syscall(SYS_futex, cond, FUTEX_WAIT, atomic_get_acquire(cond), NULL, NULL, 0);
pthread_mutex_lock(mutex);
return 0;
}
int32 pthread_cond_signal(pthread_cond_t* cond) {
atomic_fetch_add(cond, 1);
atomic_fetch_add_acquire(cond, 1);
syscall(SYS_futex, cond, FUTEX_WAKE, 1, NULL, NULL, 0);
return 0;
}
int32 pthread_rwlock_init(pthread_rwlock_t* rwlock, const pthread_rwlockattr_t*) {
atomic_set((int64 *) &rwlock->readers, 0);
//atomic_set(&rwlock->writer, 0);
atomic_set_release((int64 *) &rwlock->readers, 0);
return 0;
}
int32 pthread_rwlock_rdlock(pthread_rwlock_t* rwlock) {
while (atomic_get(&rwlock->writer)) {}
while (atomic_get_acquire_release(&rwlock->writer)) {}
atomic_fetch_add(&rwlock->readers, 1);
atomic_fetch_add_acquire(&rwlock->readers, 1);
return 0;
}
@ -126,10 +125,10 @@ int32 pthread_rwlock_wrlock(pthread_rwlock_t* rwlock) {
}
int32 pthread_rwlock_unlock(pthread_rwlock_t* rwlock) {
if (atomic_get(&rwlock->writer)) {
atomic_set(&rwlock->writer, 0);
if (atomic_get_acquire(&rwlock->writer)) {
atomic_set_release(&rwlock->writer, 0);
} else {
atomic_fetch_sub(&rwlock->readers, 1);
atomic_fetch_sub_acquire(&rwlock->readers, 1);
}
return 0;

View File

@ -26,6 +26,8 @@ void* platform_alloc(size_t size)
inline
void* platform_alloc_aligned(size_t size, int32 alignment)
{
size = ROUND_TO_NEAREST(size, alignment);
void* ptr = VirtualAlloc(NULL, size + alignment + sizeof(void*), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
ASSERT_SIMPLE(ptr);

File diff suppressed because it is too large

View File

@ -72,10 +72,6 @@ struct HashMap {
ChunkMemory buf;
};
// @performance Implement more like gperf, our implementation is slow. However, keep it around since it is very general purpose
// Alternatively, also create a version that creates perfect hashes (input requires a hash function and a seed for that hash function)
// Both would be saved in the hash impl.
// WARNING: element_size = element size + remaining HashEntry data size
void hashmap_create(HashMap* hm, int32 count, int32 element_size, RingMemory* ring)
{
@ -132,6 +128,7 @@ void hashmap_insert(HashMap* hm, const char* key, int32 value) {
HashEntryInt32* entry = (HashEntryInt32 *) chunk_get_element(&hm->buf, element, true);
entry->element_id = element;
// @performance Do we really need strncpy? Either use memcpy or strcpy?! Same goes for all the other cases below
strncpy(entry->key, key, HASH_MAP_MAX_KEY_LENGTH);
entry->key[HASH_MAP_MAX_KEY_LENGTH - 1] = '\0';
@ -364,7 +361,7 @@ inline
int64 hashmap_dump(const HashMap* hm, byte* data)
{
*((uint64 *) data) = SWAP_ENDIAN_LITTLE(hm->buf.count);
data += sizeof(uint64);
data += sizeof(hm->buf.count);
// Dump the table content where the elements are relative indices/pointers
for (int32 i = 0; i < hm->buf.count; ++i) {

View File

@ -52,17 +52,38 @@ inline f32 oms_floor(f32 a) { return _mm_cvtss_f32(_mm_floor_ss(_mm_setzero_ps()
inline f32 oms_ceil(f32 a) { return _mm_cvtss_f32(_mm_ceil_ss(_mm_setzero_ps(), _mm_set_ss(a))); }
inline uint32 hash(uint64 a, uint64 b = 0)
inline uint32 oms_hash(uint64 a, uint64 b = 0)
{
uint8 seed[16] = {
0xaa, 0x9b, 0xbd, 0xb8, 0xa1, 0x98, 0xac, 0x3f, 0x1f, 0x94, 0x07, 0xb3, 0x8c, 0x27, 0x93, 0x69,
};
__m128i hash = _mm_set_epi64x(a, b);
hash = _mm_aesdec_si128(hash, _mm_loadu_si128((__m128i *) seed));
hash = _mm_aesdec_si128(hash, _mm_loadu_si128((__m128i *) seed));
hash = _mm_aesdec_si128(hash, _mm_loadu_si128((__m128i *) seed));
hash = _mm_aesdec_si128(hash, _mm_loadu_si128((__m128i *) seed));
return _mm_extract_epi32(hash, 0);
}
inline void oms_fence_memory()
{
_mm_mfence();
}
inline void oms_fence_write()
{
_mm_sfence();
}
inline void oms_fence_load()
{
_mm_lfence();
}
inline
void oms_invalidate_cache(void* address)
{
_mm_clflush(address);
}
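
For orientation, a rough sketch of the publish/consume ordering these fences are intended for; the variable names are illustrative and this is not a complete synchronization recipe:

// Illustrative single-producer/single-consumer publish (names are made up):
static int32 g_payload;
static volatile int32 g_ready;

void producer_publish() {
    g_payload = 42;     // write the data first
    oms_fence_write();  // store fence: payload becomes visible before the flag
    g_ready = 1;
}

void consumer_poll() {
    if (g_ready) {
        oms_fence_load(); // load fence: don't read payload before the flag
        int32 value = g_payload;
        (void) value;
    }
}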
#endif

View File

@ -10,6 +10,7 @@
#define TOS_STDLIB_INTRINSICS_ARM_H
#include <arm_sve.h>
#include <arm_acle.h>
inline float oms_sqrt(float a) {
svfloat32_t input = svdup_f32(a);
@ -67,4 +68,25 @@ inline float oms_ceil(float a) {
return svget1_f32(result);
}
inline void oms_fence_memory()
{
__dmb(0xF);
}
inline void oms_fence_write()
{
__dmb(0xB);
}
inline void oms_fence_load()
{
__dmb(0x7);
}
inline
void oms_invalidate_cache(void* address)
{
asm volatile("dc ivac, %0" : : "r"(address) : "memory");
}
#endif

View File

@ -0,0 +1,363 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_STDLIB_PERFECT_HASH_MAP_H
#define TOS_STDLIB_PERFECT_HASH_MAP_H
#include "Types.h"
#include "HashMap.h"
#include "../hash/GeneralHash.h"
#include "../memory/RingMemory.h"
#define PERFECT_HASH_MAP_MAX_KEY_LENGTH 32
typedef uint64 (*perfect_hash_function)(const char* key, int32 seed);
const perfect_hash_function PERFECT_HASH_FUNCTIONS[] = {
hash_djb2_seeded,
hash_sdbm_seeded,
hash_lose_lose_seeded,
hash_polynomial_rolling_seeded,
hash_fnv1a_seeded,
hash_oat_seeded,
hash_ejb_seeded
};
struct PerfectHashEntryInt32 {
int64 element_id;
char key[PERFECT_HASH_MAP_MAX_KEY_LENGTH];
int32 value;
};
struct PerfectHashEntryInt64 {
int64 element_id;
char key[PERFECT_HASH_MAP_MAX_KEY_LENGTH];
int64 value;
};
struct PerfectHashEntryUIntPtr {
int64 element_id;
char key[PERFECT_HASH_MAP_MAX_KEY_LENGTH];
uintptr_t value;
};
struct PerfectHashEntryVoidP {
int64 element_id;
char key[PERFECT_HASH_MAP_MAX_KEY_LENGTH];
void* value;
};
struct PerfectHashEntryFloat {
int64 element_id;
char key[PERFECT_HASH_MAP_MAX_KEY_LENGTH];
f32 value;
};
struct PerfectHashEntryStr {
int64 element_id;
char key[PERFECT_HASH_MAP_MAX_KEY_LENGTH];
char value[PERFECT_HASH_MAP_MAX_KEY_LENGTH];
};
struct PerfectHashEntry {
int64 element_id;
char key[PERFECT_HASH_MAP_MAX_KEY_LENGTH];
byte* value;
};
struct PerfectHashMap {
int32 hash_seed;
perfect_hash_function hash_function;
int32 entry_size;
int32 map_size;
byte* hash_entries;
};
bool set_perfect_hashmap(PerfectHashMap* hm, const char** keys, int32 key_count, perfect_hash_function hash_func, int32 seed_tries, RingMemory* ring)
{
int32* indices = (int32 *) ring_get_memory(ring, hm->map_size * sizeof(int32), 4);
bool is_unique = false;
int32 seed;
int32 c = 0;
while (!is_unique && c < seed_tries) {
is_unique = true;
seed = rand();
memset(indices, 0, hm->map_size * sizeof(int32));
for (int32 j = 0; j < key_count; ++j) {
int32 index = hash_func(keys[j], seed) % hm->map_size;
if (indices[index]) {
is_unique = false;
break;
} else {
indices[index] = 1;
}
}
++c;
}
if (is_unique) {
hm->hash_seed = seed;
hm->hash_function = hash_func;
}
return is_unique;
}
bool perfect_hashmap_find_perfect_hash(PerfectHashMap* hm, const char** keys, int32 key_count, int32 seed_trys, RingMemory* ring)
{
int32* indices = (int32 *) ring_get_memory(ring, hm->map_size * sizeof(int32), 4);
bool is_unique = false;
for (int32 i = 0; i < ARRAY_COUNT(PERFECT_HASH_FUNCTIONS); ++i) {
int32 seed;
int32 c = 0;
while (!is_unique && c < seed_trys) {
is_unique = true;
seed = rand();
memset(indices, 0, hm->map_size * sizeof(int32));
for (int32 j = 0; j < key_count; ++j) {
int32 index = (PERFECT_HASH_FUNCTIONS[i])(keys[j], seed) % hm->map_size;
if (indices[index]) {
is_unique = false;
break;
} else {
indices[index] = 1;
}
}
++c;
}
if (is_unique) {
hm->hash_seed = seed;
hm->hash_function = PERFECT_HASH_FUNCTIONS[i];
break;
}
}
return is_unique;
}
// WARNING: element_size = element size + remaining HashEntry data size
void perfect_hashmap_create(PerfectHashMap* hm, int32 count, int32 element_size, RingMemory* ring)
{
hm->map_size = count;
hm->entry_size = element_size;
hm->hash_entries = ring_get_memory(
ring,
count * element_size,
0, true
);
}
// WARNING: element_size = element size + remaining HashEntry data size
void perfect_hashmap_create(PerfectHashMap* hm, int32 count, int32 element_size, BufferMemory* buf)
{
hm->map_size = count;
hm->entry_size = element_size;
hm->hash_entries = buffer_get_memory(
buf,
count * element_size,
0, true
);
}
// WARNING: element_size = element size + remaining HashEntry data size
void perfect_hashmap_create(PerfectHashMap* hm, int32 count, int32 element_size, byte* buf)
{
hm->map_size = count;
hm->entry_size = element_size;
hm->hash_entries = buf;
}
// Calculates how large a hashmap will be
inline
int64 perfect_hashmap_size(int count, int32 element_size)
{
return count * element_size;
}
inline
int64 perfect_hashmap_size(const PerfectHashMap* hm)
{
return hm->entry_size * hm->map_size;
}
inline
void perfect_hashmap_insert(PerfectHashMap* hm, const char* key, int32 value) {
int32 index = hm->hash_function(key, hm->hash_seed) % hm->map_size;
PerfectHashEntryInt32* entry = (PerfectHashEntryInt32 *) (hm->hash_entries + hm->entry_size * index);
entry->element_id = index;
strcpy(entry->key, key);
entry->value = value;
}
inline
void perfect_hashmap_insert(PerfectHashMap* hm, const char* key, int64 value) {
int32 index = hm->hash_function(key, hm->hash_seed) % hm->map_size;
PerfectHashEntryInt64* entry = (PerfectHashEntryInt64 *) (hm->hash_entries + hm->entry_size * index);
entry->element_id = index;
strcpy(entry->key, key);
entry->value = value;
}
inline
void perfect_hashmap_insert(PerfectHashMap* hm, const char* key, uintptr_t value) {
int32 index = hm->hash_function(key, hm->hash_seed) % hm->map_size;
PerfectHashEntryUIntPtr* entry = (PerfectHashEntryUIntPtr *) (hm->hash_entries + hm->entry_size * index);
entry->element_id = index;
strcpy(entry->key, key);
entry->value = value;
}
inline
void perfect_hashmap_insert(PerfectHashMap* hm, const char* key, void* value) {
int32 index = hm->hash_function(key, hm->hash_seed) % hm->map_size;
PerfectHashEntryVoidP* entry = (PerfectHashEntryVoidP *) (hm->hash_entries + hm->entry_size * index);
entry->element_id = index;
strcpy(entry->key, key);
entry->value = value;
}
inline
void perfect_hashmap_insert(PerfectHashMap* hm, const char* key, f32 value) {
int32 index = hm->hash_function(key, hm->hash_seed) % hm->map_size;
PerfectHashEntryFloat* entry = (PerfectHashEntryFloat *) (hm->hash_entries + hm->entry_size * index);
entry->element_id = index;
strcpy(entry->key, key);
entry->value = value;
}
inline
void perfect_hashmap_insert(PerfectHashMap* hm, const char* key, const char* value) {
int32 index = hm->hash_function(key, hm->hash_seed) % hm->map_size;
PerfectHashEntryStr* entry = (PerfectHashEntryStr *) (hm->hash_entries + hm->entry_size * index);
entry->element_id = index;
strcpy(entry->key, key);
memcpy(entry->value, value, PERFECT_HASH_MAP_MAX_KEY_LENGTH);
}
inline
void perfect_hashmap_insert(PerfectHashMap* hm, const char* key, byte* value) {
int32 index = hm->hash_function(key, hm->hash_seed) % hm->map_size;
PerfectHashEntryStr* entry = (PerfectHashEntryStr *) (hm->hash_entries + hm->entry_size * index);
entry->element_id = index;
strcpy(entry->key, key);
memcpy(entry->value, value, hm->entry_size - sizeof(PerfectHashEntry));
}
inline
PerfectHashEntry* perfect_hashmap_get_entry(const PerfectHashMap* hm, const char* key) {
int32 index = hm->hash_function(key, hm->hash_seed) % hm->map_size;
PerfectHashEntry* entry = (PerfectHashEntry *) (hm->hash_entries + hm->entry_size * index);
return *entry->key == '\0' ? NULL : entry;
}
inline
void perfect_hashmap_delete_entry(PerfectHashMap* hm, const char* key) {
int32 index = hm->hash_function(key, hm->hash_seed) % hm->map_size;
PerfectHashEntry* entry = (PerfectHashEntry *) (hm->hash_entries + hm->entry_size * index);
// This depends on where we check if an element exists (if we change perfect_hashmap_get_entry this also needs changing)
*entry->key = '\0';
}
inline
int64 perfect_hashmap_dump(const PerfectHashMap* hm, byte* data)
{
byte* start = data;
*((int32 *) data) = SWAP_ENDIAN_LITTLE(hm->map_size);
data += sizeof(hm->map_size);
*((int32 *) data) = SWAP_ENDIAN_LITTLE(hm->hash_seed);
data += sizeof(hm->hash_seed);
for (int32 i = 0; i < ARRAY_COUNT(PERFECT_HASH_FUNCTIONS); ++i) {
if (hm->hash_function == PERFECT_HASH_FUNCTIONS[i]) {
*((int32 *) data) = SWAP_ENDIAN_LITTLE(i);
data += sizeof(i);
break;
}
}
*((int32 *) data) = SWAP_ENDIAN_LITTLE(hm->entry_size);
data += sizeof(hm->entry_size);
memcpy(data, hm->hash_entries, hm->map_size * hm->entry_size);
data += hm->map_size * hm->entry_size;
return (int64) (data - start);
}
// WARNING: Requires perfect_hashmap_create first
inline
int64 perfect_hashmap_load(PerfectHashMap* hm, const byte* data)
{
const byte* start = data;
hm->map_size = SWAP_ENDIAN_LITTLE(*((int32 *) data));
data += sizeof(hm->map_size);
hm->hash_seed = SWAP_ENDIAN_LITTLE(*((int32 *) data));
data += sizeof(hm->hash_seed);
hm->hash_function = PERFECT_HASH_FUNCTIONS[SWAP_ENDIAN_LITTLE(*((int32 *) data))];
data += sizeof(int32);
hm->entry_size = SWAP_ENDIAN_LITTLE(*((int32 *) data));
data += sizeof(hm->entry_size);
memcpy(hm->hash_entries, data, hm->map_size * hm->entry_size);
data += hm->map_size * hm->entry_size;
return (int64) (data - start);
}
// WARNING: Requires the phm to be initialized already, incl. element count and element size etc.
inline
bool perfect_hashmap_from_hashmap(PerfectHashMap* phm, const HashMap* hm, int32 seed_trys, RingMemory* ring)
{
char** keys = (char **) ring_get_memory(ring, sizeof(char *) * hm->buf.count, 8);
// Find all keys
int32 key_index = 0;
for (int32 i = 0; i < hm->buf.count; ++i) {
HashEntry* entry = (HashEntry *) hm->table[i];
while (entry != NULL) {
keys[key_index++] = entry->key;
entry = (HashEntry *) entry->next;
}
}
// Check if we can turn it into a perfect hash map
bool is_perfect = perfect_hashmap_find_perfect_hash(phm, (const char **) keys, key_index, seed_trys, ring);
if (!is_perfect) {
return false;
}
// Fill perfect hash map
for (int32 i = 0; i < hm->buf.count; ++i) {
HashEntry* entry = (HashEntry *) hm->table[i];
while (entry != NULL) {
perfect_hashmap_insert(phm, entry->key, entry->value);
entry = (HashEntry *) entry->next;
}
}
return true;
}
#endif
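
A hedged end-to-end sketch of the new perfect hash map: create the storage, search for a collision-free hash function and seed for a fixed key set, then insert and look up; the key set, sizes, and the ring variable are assumptions:

const char* keys[] = { "pos", "uv", "normal", "color" };

PerfectHashMap phm;
perfect_hashmap_create(&phm, 8, sizeof(PerfectHashEntryInt32), &ring); // ring: assumed RingMemory

if (perfect_hashmap_find_perfect_hash(&phm, keys, 4, 1000, &ring)) {
    perfect_hashmap_insert(&phm, "uv", (int32) 1);

    PerfectHashEntryInt32* entry = (PerfectHashEntryInt32 *) perfect_hashmap_get_entry(&phm, "uv");
    // entry->value == 1
}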

View File

@ -32,7 +32,7 @@ void thread_create(Worker* worker, ThreadJobFunc routine, void* arg)
void thread_stop(Worker* worker)
{
atomic_set(&worker->state, 0);
atomic_set_acquire(&worker->state, 0);
pthread_join(worker->thread, NULL);
}

View File

@ -60,32 +60,32 @@ static THREAD_RETURN thread_pool_worker(void* arg)
break;
}
work = (PoolWorker *) queue_dequeue_keep(&pool->work_queue, sizeof(PoolWorker), 64);
work = (PoolWorker *) queue_dequeue_keep(&pool->work_queue);
pthread_mutex_unlock(&pool->work_mutex);
if (!work) {
continue;
}
atomic_increment(&pool->working_cnt);
atomic_set(&work->state, 2);
atomic_increment_relaxed(&pool->working_cnt);
atomic_set_release(&work->state, 2);
work->func(work);
atomic_set(&work->state, 1);
atomic_set_release(&work->state, 1);
// Job gets marked after completion -> can be overwritten now
if (atomic_get(&work->id) == -1) {
atomic_set(&work->id, 0);
if (atomic_get_relaxed(&work->id) == -1) {
atomic_set_release(&work->id, 0);
}
atomic_decrement(&pool->working_cnt);
atomic_decrement_relaxed(&pool->working_cnt);
if (atomic_get(&pool->state) == 0 && atomic_get(&pool->working_cnt) == 0) {
if (atomic_get_relaxed(&pool->state) == 0 && atomic_get_relaxed(&pool->working_cnt) == 0) {
pthread_cond_signal(&pool->working_cond);
}
}
pthread_cond_signal(&pool->working_cond);
atomic_decrement(&pool->thread_cnt);
atomic_decrement_relaxed(&pool->thread_cnt);
return NULL;
}
@ -121,10 +121,10 @@ void thread_pool_wait(ThreadPool* pool)
void thread_pool_destroy(ThreadPool* pool)
{
// This sets the queue to empty
atomic_set((void **) &pool->work_queue.tail, (void **) &pool->work_queue.head);
atomic_set_acquire((void **) &pool->work_queue.tail, (void **) &pool->work_queue.head);
// This sets the state to "shutdown"
atomic_set(&pool->state, 1);
atomic_set_release(&pool->state, 1);
pthread_cond_broadcast(&pool->work_cond);
thread_pool_wait(pool);
@ -137,8 +137,8 @@ void thread_pool_destroy(ThreadPool* pool)
PoolWorker* thread_pool_add_work(ThreadPool* pool, const PoolWorker* job)
{
pthread_mutex_lock(&pool->work_mutex);
PoolWorker* temp_job = (PoolWorker *) ring_get_memory_nomove(&pool->work_queue, sizeof(PoolWorker), 64);
if (atomic_get(&temp_job->id) > 0) {
PoolWorker* temp_job = (PoolWorker *) ring_get_memory_nomove((RingMemory *) &pool->work_queue, sizeof(PoolWorker), 64);
if (atomic_get_relaxed(&temp_job->id) > 0) {
pthread_mutex_unlock(&pool->work_mutex);
ASSERT_SIMPLE(temp_job->id == 0);
@ -146,10 +146,10 @@ PoolWorker* thread_pool_add_work(ThreadPool* pool, const PoolWorker* job)
}
memcpy(temp_job, job, sizeof(PoolWorker));
ring_move_pointer(&pool->work_queue, &pool->work_queue.head, sizeof(PoolWorker), 64);
ring_move_pointer((RingMemory *) &pool->work_queue, &pool->work_queue.head, sizeof(PoolWorker), 64);
if (temp_job->id == 0) {
temp_job->id = atomic_fetch_add(&pool->id_counter, 1);
temp_job->id = atomic_fetch_add_acquire(&pool->id_counter, 1);
}
pthread_cond_broadcast(&pool->work_cond);
@ -164,8 +164,8 @@ PoolWorker* thread_pool_add_work_start(ThreadPool* pool)
{
pthread_mutex_lock(&pool->work_mutex);
PoolWorker* temp_job = (PoolWorker *) queue_enqueue_start(&pool->work_queue, sizeof(PoolWorker), 64);
if (atomic_get(&temp_job->id) > 0) {
PoolWorker* temp_job = (PoolWorker *) queue_enqueue_start(&pool->work_queue);
if (atomic_get_relaxed(&temp_job->id) > 0) {
pthread_mutex_unlock(&pool->work_mutex);
ASSERT_SIMPLE(temp_job->id == 0);
@ -174,7 +174,7 @@ PoolWorker* thread_pool_add_work_start(ThreadPool* pool)
if (temp_job->id == 0) {
// +1 because otherwise the very first job would be id = 0 which is not a valid id
temp_job->id = atomic_fetch_add(&pool->id_counter, 1) + 1;
temp_job->id = atomic_fetch_add_acquire(&pool->id_counter, 1) + 1;
}
return temp_job;
@ -182,7 +182,7 @@ PoolWorker* thread_pool_add_work_start(ThreadPool* pool)
void thread_pool_add_work_end(ThreadPool* pool)
{
queue_enqueue_end(&pool->work_queue, sizeof(PoolWorker), 64);
queue_enqueue_end(&pool->work_queue);
pthread_cond_broadcast(&pool->work_cond);
pthread_mutex_unlock(&pool->work_mutex);
}

View File

@ -31,6 +31,7 @@ struct UIThemeStyle {
// A theme may have N named styles
// The hashmap contains the offset where the respective style can be found
// @performance Switch to perfect hash map
HashMap hash_map;
};