diff --git a/asset/AssetArchive.h b/asset/AssetArchive.h index 4bcd24e..9c5fe4a 100644 --- a/asset/AssetArchive.h +++ b/asset/AssetArchive.h @@ -73,6 +73,8 @@ struct AssetArchive { // If not remove MMFHandle mmf; + // Maps each asset type to the index of the AssetManagementSystem (AMS) that holds that type. + // Remember, many AMS only contain one asset type (e.g. image, audio, ...) int32 asset_type_map[ASSET_TYPE_SIZE]; }; @@ -182,24 +184,19 @@ Asset* asset_archive_asset_load(const AssetArchive* archive, int32 id, AssetMana { // @todo add calculation from element->type to ams index - AssetArchiveElement* element = &archive->header.asset_element[id]; + // We have to mask 0x00FFFFFF since the highest bits define the archive id, not the element id + AssetArchiveElement* element = &archive->header.asset_element[id & 0x00FFFFFF]; AssetManagementSystem* ams = &ams_array[archive->asset_type_map[element->type]]; // @todo This is a little bit stupid, reconsider - char id_str[5]; - id_str[4] = '\0'; - *((int32 *) id_str) = id; - - uint64 hash = hash_djb2(id_str); + char id_str[32]; + _itoa(id, id_str, 16); Asset* asset; // @performance I think we could optimize the ams_reserve_asset in a way so we don't have to lock it the entire time pthread_mutex_lock(&ams->mutex); - // @bug If we have multiple archive files the ids also repeat, which is not possible for the hash map - // Possible solution: also store a string name for every asset. This would add HASH_MAP_MAX_KEY_LENGTH bytes of data to every asset though (see hash map key size = 32) - - asset = ams_get_asset(ams, id_str, hash); + asset = ams_get_asset(ams, id_str); if (asset) { // Asset already loaded pthread_mutex_unlock(&ams->mutex); @@ -238,17 +235,15 @@ Asset* asset_archive_asset_load(const AssetArchive* archive, int32 id, AssetMana Texture* texture = (Texture *) asset->self; texture->image.pixels = (byte *) (texture + 1); + // @todo implement qoi encoding image_from_data(file.content, &texture->image); asset->vram_size = texture->image.pixel_count * image_pixel_size_from_type(texture->image.pixel_type); asset->ram_size = asset->vram_size + sizeof(Texture); #if OPENGL - // @bug I think order_rows has the wrong value - if (texture->image.order_rows == IMAGE_ROW_ORDER_TOP_TO_BOTTOM) { - image_flip_vertical(ring, &texture->image); - texture->image.order_rows = IMAGE_ROW_ORDER_BOTTOM_TO_TOP; - } + image_flip_vertical(ring, &texture->image); + texture->image.order_rows = IMAGE_ROW_ORDER_BOTTOM_TO_TOP; #endif } break; case ASSET_TYPE_AUDIO: { @@ -289,6 +284,7 @@ Asset* asset_archive_asset_load(const AssetArchive* archive, int32 id, AssetMana pthread_mutex_unlock(&ams->mutex); // @performance maybe do in worker threads? This just feels very slow + // @question dependencies might be stored in different archives?
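+ // @bug The call below passes the parent id again, so every iteration just re-finds the already
+ // loaded parent asset in the AMS and the actual dependencies are never loaded; the dependency ids
+ // have to come from the element here once they are stored on AssetArchiveElement.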
for (uint32 i = 0; i < element->dependency_count; ++i) { asset_archive_asset_load(archive, id, ams, ring); } diff --git a/asset/AssetManagementSystem.h b/asset/AssetManagementSystem.h index 025b1e7..e99b746 100644 --- a/asset/AssetManagementSystem.h +++ b/asset/AssetManagementSystem.h @@ -84,7 +84,7 @@ void ams_create(AssetManagementSystem* ams, byte* buf, int32 chunk_size, int32 c // setup asset_memory ams->asset_memory.count = count; ams->asset_memory.chunk_size = sizeof(Asset); - ams->asset_memory.last_pos = -1; + ams->asset_memory.last_pos = 0; ams->asset_memory.alignment = 64; ams->asset_memory.memory = buf; ams->asset_memory.free = (uint64 *) (ams->asset_memory.memory + ams->asset_memory.chunk_size * count); @@ -92,7 +92,7 @@ void ams_create(AssetManagementSystem* ams, byte* buf, int32 chunk_size, int32 c // setup asset_data_memory ams->asset_data_memory.count = count; ams->asset_data_memory.chunk_size = chunk_size; - ams->asset_data_memory.last_pos = -1; + ams->asset_data_memory.last_pos = 0; ams->asset_data_memory.alignment = 64; ams->asset_data_memory.memory = (byte *) (ams->asset_memory.free + CEIL_DIV(count, 64)); ams->asset_data_memory.free = (uint64 *) (ams->asset_data_memory.memory + ams->asset_data_memory.chunk_size * count); @@ -204,8 +204,8 @@ Asset* ams_get_asset(AssetManagementSystem* ams, const char* key) ); DEBUG_MEMORY_READ( - (uint64) (entry ? (Asset *) entry->value : 0), - entry ? ((Asset *) entry->value)->self + ((Asset *) entry->value)->ram_size : 0 + (uint64) (entry ? ((Asset *) entry->value)->self : 0), + entry ? ((Asset *) entry->value)->ram_size : 0 ); return entry ? (Asset *) entry->value : NULL; @@ -222,8 +222,8 @@ Asset* ams_get_asset(AssetManagementSystem* ams, const char* key, uint64 hash) ); DEBUG_MEMORY_READ( - (uint64) (entry ? (Asset *) entry->value : 0), - entry ? ((Asset *) entry->value)->self + ((Asset *) entry->value)->ram_size : 0 + (uint64) (entry ? ((Asset *) entry->value)->self : 0), + entry ? ((Asset *) entry->value)->ram_size : 0 ); return entry ? 
(Asset *) entry->value : NULL; diff --git a/audio/AudioMixer.h b/audio/AudioMixer.h index 0e8b68f..89f79d5 100644 --- a/audio/AudioMixer.h +++ b/audio/AudioMixer.h @@ -31,22 +31,23 @@ enum AudioEffect { AUDIO_EFFECT_NONE, - AUDIO_EFFECT_ECHO = 1, - AUDIO_EFFECT_REVERB = 2, - AUDIO_EFFECT_UNDERWATER = 4, - AUDIO_EFFECT_CAVE = 8, - AUDIO_EFFECT_LOWPASS = 16, - AUDIO_EFFECT_HIGHPASS = 32, - AUDIO_EFFECT_FLANGER = 64, - AUDIO_EFFECT_TREMOLO = 128, - AUDIO_EFFECT_DISTORTION = 256, - AUDIO_EFFECT_CHORUS = 512, - AUDIO_EFFECT_PITCH_SHIFT = 1024, - AUDIO_EFFECT_GRANULAR_DELAY = 2048, - AUDIO_EFFECT_FM = 4096, - AUDIO_EFFECT_STEREO_PANNING = 8192, - AUDIO_EFFECT_EASE_IN = 16384, - AUDIO_EFFECT_EASE_OUT = 32768, + AUDIO_EFFECT_ECHO = 1 << 0, + AUDIO_EFFECT_REVERB = 1 << 1, + AUDIO_EFFECT_UNDERWATER = 1 << 2, + AUDIO_EFFECT_CAVE = 1 << 3, + AUDIO_EFFECT_LOWPASS = 1 << 4, + AUDIO_EFFECT_HIGHPASS = 1 << 5, + AUDIO_EFFECT_FLANGER = 1 << 6, + AUDIO_EFFECT_TREMOLO = 1 << 7, + AUDIO_EFFECT_DISTORTION = 1 << 8, + AUDIO_EFFECT_CHORUS = 1 << 9, + AUDIO_EFFECT_PITCH_SHIFT = 1 << 10, + AUDIO_EFFECT_GRANULAR_DELAY = 1 << 11, + AUDIO_EFFECT_FM = 1 << 12, + AUDIO_EFFECT_STEREO_PANNING = 1 << 13, + AUDIO_EFFECT_EASE_IN = 1 << 14, + AUDIO_EFFECT_EASE_OUT = 1 << 15, + AUDIO_EFFECT_SPEED = 1 << 16, }; struct AudioInstance { @@ -56,7 +57,13 @@ struct AudioInstance { uint32 audio_size; byte* audio_data; + uint64 effect; uint32 sample_index; + byte channels; + bool repeat; + + // @todo How to implement audio that is only supposed to be played after a certain other sound file is finished + // e.g. queueing soundtracks/ambient noise }; enum AudioMixerState { @@ -128,10 +135,13 @@ void audio_mixer_add(AudioMixer* mixer, int64 id, Audio* audio, AudioLocationSet return; } + // @question Do I really want to use AudioInstance? Wouldn't Audio* be sufficient? + // Well AudioInstance is a little bit smaller but is this really worth it, probably yes?! AudioInstance* instance = (AudioInstance *) chunk_get_element(&mixer->audio_instances, index); instance->id = id; instance->audio_size = audio->size; instance->audio_data = audio->data; + instance->channels = audio->channels; if (origin) { memcpy(&instance->origin, origin, sizeof(AudioLocationSetting)); @@ -166,6 +176,41 @@ void audio_mixer_remove(AudioMixer* mixer, int64 id) } } +int32 apply_speed(int16* buffer, uint32 buffer_size, f32 speed) { + if (speed == 1.0f) { + return 0; + } + + // Has to be multiple of 2 to ensure stereo is implemented correctly + uint32 new_size = ROUND_TO_NEAREST((uint32) (buffer_size / speed), 2); + + // Speed up + if (speed > 1.0f) { + for (uint32 i = 0; i < new_size; ++i) { + // @bug What if 2 consecutive values fall onto the same int index for stereo. This would break it. + // The problem is, even by doing this as stereo calculation we would still have the same issue just not on the current value but the next loop + uint32 src_index = (uint32) (i * speed); + buffer[i] = buffer[src_index]; + } + + // A speed up reduces the sample_index -> we reduce the data in the buffer + // Cast before subtracting, otherwise the uint32 difference underflows + return (int32) new_size - (int32) buffer_size; + } + + // Slow down + for (int32 i = buffer_size - 1; i > 0; --i) { + uint32 src_index = (uint32) (i * speed); + buffer[i] = buffer[src_index]; + } + + return 0; +} + +// @performance Whenever we handle left and right the same we could halve the buffer_size +// This allows us to re-use existing helper variables without re-calculating them for the next loop (e.g.
delay below) + // Or, if the multiplier is an int we can even perform the multiplication on int32 through casting instead of 2 operations on int16 + // We might have to adjust some of the values to ensure correct multiplication if possible (e.g. feedback, intensity, ...) + // @todo We probably want to handle left and right channel differently to add some depth (see the apply_echo_stereo sketch further below) void apply_echo(int16* buffer, uint32 buffer_size, f32 delay, f32 feedback, int32 sample_rate) { int32 delay_samples = (int32) (delay * sample_rate); for (uint32 i = delay_samples; i < buffer_size; ++i) { @@ -173,6 +218,7 @@ void apply_echo(int16* buffer, uint32 buffer_size, f32 delay, f32 feedback, int3 } } +// @todo We probably want to handle left and right channel differently to add some depth void apply_reverb(int16* buffer, uint32 buffer_size, f32 intensity) { intensity *= 0.5f; for (uint32 i = 1; i < buffer_size; ++i) { @@ -294,11 +340,93 @@ void apply_lowpass(int16* buffer, uint32 buffer_size, f32 cutoff, int32 sample_r } } -void audio_mixer_mix(AudioMixer* mixer) { - uint32 limit = OMS_MIN( - mixer->settings.sample_buffer_size / mixer->settings.sample_size, - mixer->settings.buffer_size / mixer->settings.sample_size - ); +int32 mixer_effects_mono(AudioMixer* mixer, uint64 effect, int32 samples) +{ + int32 sound_sample_index = 0; + + if (effect & AUDIO_EFFECT_ECHO) { + apply_echo(mixer->buffer_temp, samples * 2, 0.2f, 0.4f, mixer->settings.sample_rate); + } + + if (effect & AUDIO_EFFECT_REVERB) { + apply_reverb(mixer->buffer_temp, samples * 2, 0.3f); + } + + if (effect & AUDIO_EFFECT_UNDERWATER) { + apply_underwater(mixer->buffer_temp, samples * 2); + } + + if (effect & AUDIO_EFFECT_CAVE) { + apply_cave(mixer->buffer_temp, samples * 2, mixer->settings.sample_rate); + } + + if (effect & AUDIO_EFFECT_LOWPASS) { + apply_lowpass(mixer->buffer_temp, samples * 2, 500.0f, mixer->settings.sample_rate); // Cutoff frequency 500 + } + + if (effect & AUDIO_EFFECT_HIGHPASS) { + apply_highpass(mixer->buffer_temp, samples * 2, 2000.0f, mixer->settings.sample_rate); // Cutoff frequency 2 kHz + } + + if (effect & AUDIO_EFFECT_FLANGER) { + apply_flanger(mixer->buffer_temp, samples * 2, 0.25f, 0.005f, mixer->settings.sample_rate); + } + + if (effect & AUDIO_EFFECT_TREMOLO) { + apply_tremolo(mixer->buffer_temp, samples * 2, 5.0f, 0.8f, mixer->settings.sample_rate); + } + + if (effect & AUDIO_EFFECT_DISTORTION) { + apply_distortion(mixer->buffer_temp, samples * 2, 10.0f); + } + + if (effect & AUDIO_EFFECT_CHORUS) { + apply_chorus(mixer->buffer_temp, samples * 2, 0.25f, 0.005f, mixer->settings.sample_rate); + } + + if (effect & AUDIO_EFFECT_PITCH_SHIFT) { + apply_pitch_shift(mixer->buffer_temp, samples * 2, 1.2f); // Slight pitch increase + } + + if (effect & AUDIO_EFFECT_GRANULAR_DELAY) { + apply_granular_delay(mixer->buffer_temp, samples * 2, 0.1f, 0.2f, mixer->settings.sample_rate); + } + + if (effect & AUDIO_EFFECT_FM) { + apply_frequency_modulation(mixer->buffer_temp, samples * 2, 2.0f, 0.5f, mixer->settings.sample_rate); + } + + if (effect & AUDIO_EFFECT_STEREO_PANNING) { + apply_stereo_panning(mixer->buffer_temp, samples * 2, 0.5f); + } + + /* + if (effect & AUDIO_EFFECT_EASE_IN) { + apply_ease_in(mixer->buffer_temp, samples * 2, 0.5f); + } + + if (effect & AUDIO_EFFECT_EASE_OUT) { + apply_ease_out(mixer->buffer_temp, samples * 2, 0.5f); + } + */ + + if (effect & AUDIO_EFFECT_SPEED) { + sound_sample_index += apply_speed(mixer->buffer_temp, samples * 2, 1.0f); + } + + return sound_sample_index; +} + +int32 mixer_effects_stereo() +{ + return 0; +}
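+ // A minimal sketch of the @todo above, not part of the actual patch: apply slightly different
+ // delays per channel to add depth. delay_l/delay_r are illustrative, untuned values; the buffer
+ // is assumed to be interleaved stereo (left = even, right = odd int16) with `frames` frame pairs.
+ inline
+ void apply_echo_stereo(int16* buffer, uint32 frames, f32 delay_l, f32 delay_r, f32 feedback, int32 sample_rate) {
+     uint32 dl = (uint32) (delay_l * sample_rate); // delay in frames for the left channel
+     uint32 dr = (uint32) (delay_r * sample_rate); // delay in frames for the right channel
+
+     for (uint32 f = dl; f < frames; ++f) {
+         buffer[f * 2] = (int16) (buffer[f * 2] + buffer[(f - dl) * 2] * feedback);
+     }
+
+     for (uint32 f = dr; f < frames; ++f) {
+         buffer[f * 2 + 1] = (int16) (buffer[f * 2 + 1] + buffer[(f - dr) * 2 + 1] * feedback);
+     }
+ }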
+ +void audio_mixer_mix(AudioMixer* mixer, uint32 size) { + memset(mixer->settings.buffer, 0, size); + + mixer->settings.sample_buffer_size = 0; + uint32 limit_max = size / mixer->settings.sample_size; bool has_location = !is_empty((byte *) &mixer->camera.audio_location, sizeof(mixer->camera.audio_location)); @@ -310,6 +438,8 @@ void audio_mixer_mix(AudioMixer* mixer) { continue; } + uint32 limit = limit_max; + // Compute the vector from the player to the sound's origin v3_f32 to_sound = {}; f32 total_attenuation = 1.0f; @@ -331,105 +461,93 @@ void audio_mixer_mix(AudioMixer* mixer) { } uint32 sound_sample_count = sound->audio_size / mixer->settings.sample_size; - uint32 sound_sample_index = sound->sample_index; + int32 sound_sample_index = sound->sample_index; int16* audio_data = (int16 *) sound->audio_data; // Temporary buffer for effects processing // @performance If there are situations where only one file exists in the mixer that should be played we could directly write to // the output buffer improving the performance. Some of those mixers are: music, cinematic, ui // Careful, NOT voice since we will probably manually layer them according to their position? - for (int32 j = 0; j < limit; ++j) { - if (sound_sample_index >= sound_sample_count) { - // @todo if repeat we need to handle part of it here, else quit + if (sound->channels == 1) { + // We make it stereo + for (uint32 j = 0; j < limit; ++j) { + if (sound_sample_index >= sound_sample_count) { + if (!sound->repeat) { + limit = j; + break; + } - sound_sample_index = 0; + sound_sample_index = 0; + } - // @question why are we doing this? - mixer->settings.sample_index = 0; + // We could make the temp buffer stereo here but we later on have to touch the array anyways. + // This way we can easily perform mixer effects on a mono output. + mixer->buffer_temp[j] = (int16) (audio_data[sound_sample_index] * volume_scale * total_attenuation); + + ++sound_sample_index; + + // @performance Some adjustments could be made right here; the question is whether this is faster. + // Probably depends on how likely the adjustment is to happen. Orientation effects are probably very likely. } - mixer->buffer_temp[j * 2] = (int16) (audio_data[sound_sample_index * 2] * volume_scale * total_attenuation); - mixer->buffer_temp[j * 2 + 1] = (int16) (audio_data[sound_sample_index * 2 + 1] * volume_scale * total_attenuation); + // Apply effects based on sound's effect type + if (sound->effect) { + // The effects operate on the limit samples just written to buffer_temp, not on the absolute source index + int32 sample_adjustment = mixer_effects_mono(mixer, sound->effect, limit); + sound_sample_index += sample_adjustment; + limit += sample_adjustment; + } + } else { + for (uint32 j = 0; j < limit; ++j) { + if (sound_sample_index >= sound_sample_count) { + if (!sound->repeat) { + limit = j; + break; + } - ++sound_sample_index; + sound_sample_index = 0; + } - // @performance Some adjustments could be made right here the question is if this is faster. - // Probably depends on how likely the adjustment is to happen. + mixer->buffer_temp[j * 2] = (int16) (audio_data[sound_sample_index * 2] * volume_scale * total_attenuation); + mixer->buffer_temp[j * 2 + 1] = (int16) (audio_data[sound_sample_index * 2 + 1] * volume_scale * total_attenuation); - // @todo if end of file and no repeat -> remove from list - } + ++sound_sample_index; - // @question We also have to set setting->sample_index = sound_sample_index. - // But that currently happens in the sound api.
Do we want to keep it there or move it here - - // Apply effects based on sound's effect type - // @performance Depending on how we implement effects we could even pull them out of this loop - // What I mean is effects could either be sound file dependent (current location correct) or mixer dependent - if (mixer->effect) { - if (mixer->effect & AUDIO_EFFECT_ECHO) { - apply_echo(mixer->buffer_temp, limit, 0.2f, 0.4f, mixer->settings.sample_rate); + // @performance Some adjustments could be made right here; the question is whether this is faster. + // Probably depends on how likely the adjustment is to happen. Orientation effects are probably very likely. } - if (mixer->effect & AUDIO_EFFECT_REVERB) { - apply_reverb(mixer->buffer_temp, limit, 0.3f); - } - - if (mixer->effect & AUDIO_EFFECT_UNDERWATER) { - apply_underwater(mixer->buffer_temp, limit); - } - - if (mixer->effect & AUDIO_EFFECT_CAVE) { - apply_cave(mixer->buffer_temp, limit, mixer->settings.sample_rate); - } - - if (mixer->effect & AUDIO_EFFECT_LOWPASS) { - apply_lowpass(mixer->buffer_temp, limit, 500.0f, mixer->settings.sample_rate); // Cutoff frequency 500 - } - - if (mixer->effect & AUDIO_EFFECT_HIGHPASS) { - apply_highpass(mixer->buffer_temp, limit, 2000.0f, mixer->settings.sample_rate); // Cutoff frequency 2 kHz - } - - if (mixer->effect & AUDIO_EFFECT_FLANGER) { - apply_flanger(mixer->buffer_temp, limit, 0.25f, 0.005f, mixer->settings.sample_rate); - } - - if (mixer->effect & AUDIO_EFFECT_TREMOLO) { - apply_tremolo(mixer->buffer_temp, limit, 5.0f, 0.8f, mixer->settings.sample_rate); - } - - if (mixer->effect & AUDIO_EFFECT_DISTORTION) { - apply_distortion(mixer->buffer_temp, limit, 10.0f); - } - - if (mixer->effect & AUDIO_EFFECT_CHORUS) { - apply_chorus(mixer->buffer_temp, limit, 0.25f, 0.005f, mixer->settings.sample_rate); - } - - if (mixer->effect & AUDIO_EFFECT_PITCH_SHIFT) { - apply_pitch_shift(mixer->buffer_temp, limit, 1.2f); // Slight pitch increase - } - - if (mixer->effect & AUDIO_EFFECT_GRANULAR_DELAY) { - apply_granular_delay(mixer->buffer_temp, limit, 0.1f, 0.2f, mixer->settings.sample_rate); - } - - if (mixer->effect & AUDIO_EFFECT_FM) { - apply_frequency_modulation(mixer->buffer_temp, limit, 2.0f, 0.5f, mixer->settings.sample_rate); - } - - if (mixer->effect & AUDIO_EFFECT_STEREO_PANNING) { - apply_stereo_panning(mixer->buffer_temp, limit, 0.5f); + // Apply effects based on sound's effect type + if (sound->effect) { + int32 sample_adjustment = mixer_effects_stereo() / 2; + sound_sample_index += sample_adjustment; + limit += sample_adjustment; + } } } - // @bug the actual output "limit" could be smaller if sound files end earlier and no repeat is defined - // In that case we would also have to adjust mixer->settings.sample_buffer_size + // @bug if we use speed up effect, this value could be negative. Fix.
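+ // A possible fix as an untested sketch: clamp before the assignment below, e.g.
+ // sound_sample_index = OMS_MAX(sound_sample_index, 0);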
+ sound->sample_index = sound_sample_index; // Add the processed sound to the output buffer - for (uint32 j = 0; j < limit; j++) { - mixer->settings.buffer[j] += mixer->buffer_temp[j]; + if (sound->channels == 1) { + // We turn it stereo here + for (uint32 j = 0; j < limit; ++j) { + mixer->settings.buffer[j * 2] += mixer->buffer_temp[j]; + mixer->settings.buffer[j * 2 + 1] += mixer->buffer_temp[j]; + } + } else { + for (uint32 j = 0; j < limit * 2; ++j) { + mixer->settings.buffer[j] += mixer->buffer_temp[j]; + } } + + mixer->settings.sample_buffer_size = OMS_MAX( + mixer->settings.sample_buffer_size, + limit * mixer->settings.sample_size + ); + } + + if (mixer->effect) { + mixer_effects_stereo(); } } diff --git a/audio/AudioSetting.h b/audio/AudioSetting.h index d1031f6..f1a3985 100644 --- a/audio/AudioSetting.h +++ b/audio/AudioSetting.h @@ -16,10 +16,6 @@ #define SOUND_API_XAUDIO2 1 struct AudioSetting { - // position in the audio data - // WARNING: not the byte position, but the index based on the sample size - uint32 sample_index; - f32 master_volume; // bits per sample diff --git a/font/Font.h b/font/Font.h index 2fe27c1..4416382 100644 --- a/font/Font.h +++ b/font/Font.h @@ -59,11 +59,28 @@ void font_init(Font* font, byte* data, int count) } inline -Glyph* font_glyph_find(Font* font, uint32 codepoint) +Glyph* font_glyph_find(const Font* font, uint32 codepoint) { - for (uint32 i = 0; i < font->glyph_count; ++i) { - if (font->glyphs[i].codepoint == codepoint) { - return &font->glyphs[i]; + int32 perfect_glyph_pos = codepoint - font->glyphs[0].codepoint; + + // Codepoints below the first glyph cannot be in the table (a negative index would read out of bounds) + if (perfect_glyph_pos < 0) { + return NULL; + } + + int32 limit = OMS_MIN(perfect_glyph_pos, font->glyph_count - 1); + + // We try to jump to the correct glyph based on the glyph codepoint + if (font->glyphs[limit].codepoint == codepoint) { + return &font->glyphs[limit]; + } + + // If that doesn't work we binary search the glyph list BUT only up to the last possible position. + // Glyphs must be sorted ascending.
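+ // Since codepoints are unique and sorted, a match can never sit past its "perfect" slot,
+ // so [0, limit] is a sufficient range for the search below.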
+ int32 low = 0; + int32 high = limit; + while (low <= high) { + int32 mid = low + (high - low) / 2; + if (font->glyphs[mid].codepoint == codepoint) { + return &font->glyphs[mid]; + } else if (font->glyphs[mid].codepoint < codepoint) { + low = mid + 1; + } else { + high = mid - 1; } } @@ -254,9 +271,21 @@ int32 font_to_data( return size; } +inline f32 font_line_height(Font* font, f32 size) { return font->line_height * size / font->size; } +inline +void font_invert_coordinates(Font* font) +{ + // @todo Implement y-offset correction + for (uint32 i = 0; i < font->glyph_count; ++i) { + float temp = font->glyphs[i].coords.y1; + font->glyphs[i].coords.y1 = 1.0f - font->glyphs[i].coords.y2; + font->glyphs[i].coords.y2 = 1.0f - temp; + } +} + #endif \ No newline at end of file diff --git a/gpuapi/RenderUtils.h b/gpuapi/RenderUtils.h index 22002b5..fda7006 100644 --- a/gpuapi/RenderUtils.h +++ b/gpuapi/RenderUtils.h @@ -299,8 +299,6 @@ f32 text_calculate_dimensions_width( f32 x = 0; f32 offset_x = 0; - uint32 first_glyph = font->glyphs[0].codepoint; - // @todo remember to restrict to width/height if value > 0 -> force width to remain below certain value for (int32 i = 0; i < length; ++i) { @@ -313,25 +311,7 @@ f32 text_calculate_dimensions_width( continue; } - Glyph* glyph = NULL; - // We try to jump to the correct glyph based on the glyph codepoint - // If that doesn't work we iterate the glyph list BUT only until the last possible match (glyphs must be sorted ascending) - int32 perfect_glyph_pos = character - first_glyph; - if (font->glyph_count > perfect_glyph_pos - && font->glyphs[perfect_glyph_pos].codepoint == character - ) { - glyph = &font->glyphs[perfect_glyph_pos]; - } else { - // @performance consider to do binary search - for (int32 j = 0; j <= perfect_glyph_pos && j < font->glyph_count; ++j) { - if (font->glyphs[j].codepoint == character) { - glyph = &font->glyphs[j]; - - break; - } - } - } - + Glyph* glyph = font_glyph_find(font, character); if (!glyph) { continue; } @@ -353,8 +333,6 @@ void text_calculate_dimensions( f32 offset_x = 0; - uint32 first_glyph = font->glyphs[0].codepoint; - // @todo remember to restrict to width/height if value > 0 -> force width to remain below certain value for (int32 i = 0; i < length; ++i) { @@ -369,25 +347,7 @@ void text_calculate_dimensions( continue; } - Glyph* glyph = NULL; - // We try to jump to the correct glyph based on the glyph codepoint - // If that doesn't work we iterate the glyph list BUT only until the last possible match (glyphs must be sorted ascending) - int32 perfect_glyph_pos = character - first_glyph; - if (font->glyph_count > perfect_glyph_pos - && font->glyphs[perfect_glyph_pos].codepoint == character - ) { - glyph = &font->glyphs[perfect_glyph_pos]; - } else { - // @performance consider to do binary search - for (int32 j = 0; j <= perfect_glyph_pos && j < font->glyph_count; ++j) { - if (font->glyphs[j].codepoint == character) { - glyph = &font->glyphs[j]; - - break; - } - } - } - + Glyph* glyph = font_glyph_find(font, character); if (!glyph) { continue; } @@ -433,10 +393,6 @@ v2_f32 vertex_text_create( } } - uint32 first_glyph = font->glyphs[0].codepoint; - - int32 first_char = is_ascii ? text[0] : utf8_get_char_at(text, 0); - f32 offset_x = x; for (int32 i = 0; i < length; ++i) { int32 character = is_ascii ? 
text[i] : utf8_get_char_at(text, i); @@ -447,25 +403,7 @@ v2_f32 vertex_text_create( continue; } - Glyph* glyph = NULL; - // We try to jump to the correct glyph based on the glyph codepoint - // If that doesn't work we iterate the glyph list BUT only until the last possible match (glyphs must be sorted ascending) - int32 perfect_glyph_pos = character - first_glyph; - if (font->glyph_count > perfect_glyph_pos - && font->glyphs[perfect_glyph_pos].codepoint == character - ) { - glyph = &font->glyphs[perfect_glyph_pos]; - } else { - // @performance consider to do binary search - for (uint32 j = 0; j <= perfect_glyph_pos && j < font->glyph_count; ++j) { - if (font->glyphs[j].codepoint == character) { - glyph = &font->glyphs[j]; - - break; - } - } - } - + Glyph* glyph = font_glyph_find(font, character); if (!glyph) { continue; } @@ -577,8 +515,6 @@ f32 ui_text_create( } } - uint32 first_glyph = theme->font.glyphs[0].codepoint; - int32 start = *index; f32 offset_x = (f32) x->value_int; f32 offset_y = (f32) y->value_int; @@ -594,25 +530,7 @@ f32 ui_text_create( continue; } - Glyph* glyph = NULL; - // We try to jump to the correct glyph based on the glyph codepoint - // If that doesn't work we iterate the glyph list BUT only until the last possible match (glyphs must be sorted ascending) - int32 perfect_glyph_pos = character - first_glyph; - if (theme->font.glyph_count > perfect_glyph_pos - && theme->font.glyphs[perfect_glyph_pos].codepoint == character - ) { - glyph = &theme->font.glyphs[perfect_glyph_pos]; - } else { - // @performance consider to do binary search - for (int32 j = 0; j <= perfect_glyph_pos && j < theme->font.glyph_count; ++j) { - if (theme->font.glyphs[j].codepoint == character) { - glyph = &theme->font.glyphs[j]; - - break; - } - } - } - + Glyph* glyph = font_glyph_find(&theme->font, character); if (!glyph) { continue; } @@ -721,7 +639,7 @@ void ui_button_create( vertex_text_create( vertices, index, zindex, - x->value_float, y->value_float, width->value_float, height->value_float, align_h->value_float, align_v->value_float, + x->value_float, y->value_float, width->value_float, height->value_float, align_h->value_int, align_v->value_int, &theme->font, text->value_str, size->value_float, color_index->value_float ); diff --git a/gpuapi/opengl/OpenglUtils.h b/gpuapi/opengl/OpenglUtils.h index 53977b9..8f8f904 100644 --- a/gpuapi/opengl/OpenglUtils.h +++ b/gpuapi/opengl/OpenglUtils.h @@ -156,7 +156,7 @@ void texture_use_1D(const Texture* texture, uint32 texture_unit) glBindTexture(GL_TEXTURE_1D, (GLuint) texture->id); } -GLuint shader_make(GLenum type, const char *source, RingMemory* ring) +GLuint shader_make(GLenum type, const char* source, RingMemory* ring) { GLuint shader = glCreateShader(type); glShaderSource(shader, 1, (GLchar **) &source, NULL); diff --git a/image/Image.h b/image/Image.h index f17ce7a..80f2395 100644 --- a/image/Image.h +++ b/image/Image.h @@ -31,6 +31,7 @@ enum PixelType // has_alpha is defined it forces an alpha channel even for bitmaps // order_pixels defines how the pixels should be ordered // order_rows defines how the rows should be ordered +// @question Do we really ever need int32 for width/height? 
struct Image { uint32 width; uint32 height; diff --git a/image/Qoi.h b/image/Qoi.h new file mode 100644 index 0000000..dc9e65a --- /dev/null +++ b/image/Qoi.h @@ -0,0 +1,230 @@ +/** + * Jingga + * + * @copyright Jingga + * @license OMS License 2.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef TOS_IMAGE_QOI_H +#define TOS_IMAGE_QOI_H + +#include "../stdlib/Types.h" +#include "../memory/RingMemory.h" + +#define QOI_OP_INDEX 0b00000000 +#define QOI_OP_DIFF 0b01000000 +#define QOI_OP_LUMA 0b10000000 +#define QOI_OP_RUN 0b11000000 // @todo There is a HUGE step from here to QOI_OP_RGB this leaves room for more cases or using this data +#define QOI_OP_RGB 0b11111110 +#define QOI_OP_RGBA 0b11111111 +#define QOI_MASK_2 0b11000000 + +#define QOI_COLOR_HASH(color) (color.r * 3 + color.g * 5 + color.b * 7 + color.a * 11) +#define QOI_COLOR_HASH_2(color) ((((uint32)(color)) * 0x9E3779B1U) >> 26) +#define QOI_HEADER_SIZE 9 + +// @question Do we really ever need int32 for width/height? +struct QoiDescription { + uint32 width; + uint32 height; + byte channels; + byte colorspace; +}; + +uint32 qoi_encode_size(QoiDescription* desc) +{ + return desc->width * desc->height * (desc->channels + 1) + QOI_HEADER_SIZE; +} + +int32 qoi_encode(const byte* data, byte* output, const QoiDescription* desc) { + if (desc->width == 0 || desc->height == 0 || + desc->channels < 3 || desc->channels > 4 || + desc->colorspace > 1 + ) { + return -1; + } + + int32 p = 0; + *((uint32 *) &output[p]) = SWAP_ENDIAN_LITTLE(desc->width); p += 4; + *((uint32 *) &output[p]) = SWAP_ENDIAN_LITTLE(desc->height); p += 4; + + // Channel count 1-4 requires 3 bits, colorspace requires 1 bit + output[p++] = ((desc->channels - 1) << 1) | (desc->colorspace & 0x01); + + v4_byte index[64]; + memset(index, 0, sizeof(index)); + + v4_byte px_prev = {0, 0, 0, 255}; + v4_byte px = px_prev; + + int32 px_len = desc->width * desc->height * desc->channels; + int32 px_end = px_len - desc->channels; + int32 channels = desc->channels; + + int32 run = 0; + for (int32 px_pos = 0; px_pos < px_len; px_pos += channels) { + memcpy(&px, &data[px_pos], channels * sizeof(byte)); + + if (px.val == px_prev.val) { + ++run; + if (run == 62 || px_pos == px_end) { + output[p++] = QOI_OP_RUN | (run - 1); + run = 0; + } + } else { + if (run) { + output[p++] = QOI_OP_RUN | (run - 1); + run = 0; + } + + int32 index_pos = QOI_COLOR_HASH(px) % 64; + //int32 index_pos = QOI_COLOR_HASH_2(px); + + if (index[index_pos].val == px.val) { + output[p++] = QOI_OP_INDEX | index_pos; + } else { + index[index_pos] = px; + + if (px.a == px_prev.a) { + signed char vr = px.r - px_prev.r; + signed char vg = px.g - px_prev.g; + signed char vb = px.b - px_prev.b; + + signed char vg_r = vr - vg; + signed char vg_b = vb - vg; + + if (vr > -3 && vr < 2 + && vg > -3 && vg < 2 + && vb > -3 && vb < 2 + ) { + output[p++] = QOI_OP_DIFF | (vr + 2) << 4 | (vg + 2) << 2 | (vb + 2); + } else if (vg_r > -9 && vg_r < 8 + && vg > -33 && vg < 32 + && vg_b > -9 && vg_b < 8 + ) { + output[p++] = QOI_OP_LUMA | (vg + 32); + output[p++] = (vg_r + 8) << 4 | (vg_b + 8); + } else { + output[p++] = QOI_OP_RGB; + output[p++] = px.r; + output[p++] = px.g; + output[p++] = px.b; + } + } else { + output[p++] = QOI_OP_RGBA; + *((uint32 *) &output[p]) = SWAP_ENDIAN_LITTLE(px.val); + p += 4; + } + } + } + + px_prev = px; + } + + return p; +} + +uint32 qoi_decode_size(QoiDescription* desc, int32 channels) +{ + return desc->width * desc->height * channels; +} + +void qoi_decode(const byte* data, byte* output, int32 steps = 8) +{ + 
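+    // Header layout, mirroring qoi_encode above: 4 bytes width (little endian),
+    // 4 bytes height (little endian), 1 byte packing ((channels - 1) << 1) | colorspace.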
int32 p = 0; + uint32 width = SWAP_ENDIAN_LITTLE(*((uint32 *) &data[p])); p += 4; + uint32 height = SWAP_ENDIAN_LITTLE(*((uint32 *) &data[p])); p += 4; + + // Channel count 1-4 requires 3 bits, colorspace requires 1 bit + int32 colorspace = data[p] & 0x01; + uint32 channels = ((data[p] >> 1) & 0x07) + 1; + ++p; + + uint32 px_len = width * height * channels; + + v4_byte px = {0, 0, 0, 255}; + + v4_byte index[64]; + memset(index, 0, sizeof(index)); + + int32 run = 0; + + for (uint32 px_pos = 0; px_pos < px_len; px_pos += channels) { + int32 b1 = data[p++]; + + if (b1 == QOI_OP_RGB) { + px.r = data[p++]; + px.g = data[p++]; + px.b = data[p++]; + } else if (b1 == QOI_OP_RGBA) { + px.val = SWAP_ENDIAN_LITTLE(*((uint32 *) &data[p])); + p += 4; + } else if ((b1 & QOI_MASK_2) == QOI_OP_INDEX) { + px = index[b1]; + } else if ((b1 & QOI_MASK_2) == QOI_OP_DIFF) { + px.r += ((b1 >> 4) & 0x03) - 2; + px.g += ((b1 >> 2) & 0x03) - 2; + px.b += ( b1 & 0x03) - 2; + } else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA) { + int32 b2 = data[p++]; + int32 vg = (b1 & 0x3f) - 32; + px.r += vg - 8 + ((b2 >> 4) & 0x0f); + px.g += vg; + px.b += vg - 8 + (b2 & 0x0f); + } else if ((b1 & QOI_MASK_2) == QOI_OP_RUN) { + run = (b1 & 0x3f) + 1; // the encoder stores run - 1 + + if (channels == 4) { + uint32 px_little_endian = SWAP_ENDIAN_LITTLE(px.val); + int32 pixel_step_size = steps * 4; + int32 i = 0; + + if (steps == 16) { + __m512i simd_value = _mm512_set1_epi32(px_little_endian); + for(; i <= run - steps; i += steps, px_pos += pixel_step_size) { + _mm512_storeu_si512((__m512i *) &output[px_pos], simd_value); + } + } else if (steps >= 8) { + __m256i simd_value = _mm256_set1_epi32(px_little_endian); + for (; i <= run - steps; i += steps, px_pos += pixel_step_size) { + _mm256_storeu_si256((__m256i *) &output[px_pos], simd_value); + } + } else if (steps >= 4) { + __m128i simd_value = _mm_set1_epi32(px_little_endian); + for(; i <= run - steps; i += steps, px_pos += pixel_step_size) { + _mm_storeu_si128((__m128i *) &output[px_pos], simd_value); + } + } + + for (; i < run; ++i) { + *((uint32 *) &output[px_pos]) = px_little_endian; + px_pos += channels; + } + } else if (channels == 3) { + for (int32 i = 0; i < run; ++i) { + output[px_pos++] = px.r; + output[px_pos++] = px.g; + output[px_pos++] = px.b; + } + } else if (channels == 1) { + memset(&output[px_pos], px.r, run * sizeof(byte)); + px_pos += run; + } + + // Correction, since the loop increments by channels count as well + px_pos -= channels; + + index[QOI_COLOR_HASH(px) % 64] = px; + //index[QOI_COLOR_HASH_2(px)] = px; + + continue; + } + + index[QOI_COLOR_HASH(px) % 64] = px; + //index[QOI_COLOR_HASH_2(px)] = px; + + memcpy(&output[px_pos], &px, channels * sizeof(byte)); + } +} + +#endif \ No newline at end of file diff --git a/memory/BufferMemory.h b/memory/BufferMemory.h index 8ca0557..ac9d3c3 100644 --- a/memory/BufferMemory.h +++ b/memory/BufferMemory.h @@ -92,7 +92,7 @@ void buffer_reset(BufferMemory* buf) } inline -byte* buffer_get_memory(BufferMemory* buf, uint64 size, int32 aligned = 0, bool zeroed = false) +byte* buffer_get_memory(BufferMemory* buf, uint64 size, int32 aligned = 4, bool zeroed = false) { ASSERT_SIMPLE(size <= buf->size); diff --git a/memory/ChunkMemory.h b/memory/ChunkMemory.h index 86ffc8a..3ab6597 100644 --- a/memory/ChunkMemory.h +++ b/memory/ChunkMemory.h @@ -34,9 +34,9 @@ struct ChunkMemory { uint64 count; uint64 size; - uint64 chunk_size; - int64 last_pos; - int32 alignment; + uint64 last_pos; + uint32 chunk_size; + uint32 alignment; // length = count // free describes which locations are used and
which are free @@ -44,7 +44,7 @@ struct ChunkMemory { }; inline -void chunk_alloc(ChunkMemory* buf, uint64 count, uint64 chunk_size, int32 alignment = 64) +void chunk_alloc(ChunkMemory* buf, uint64 count, uint32 chunk_size, int32 alignment = 64) { ASSERT_SIMPLE(chunk_size); ASSERT_SIMPLE(count); @@ -58,7 +58,7 @@ void chunk_alloc(ChunkMemory* buf, uint64 count, uint64 chunk_size, int32 alignm buf->count = count; buf->size = count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64); buf->chunk_size = chunk_size; - buf->last_pos = -1; + buf->last_pos = 0; buf->alignment = alignment; // @question Could it be beneficial to have this before the element data? @@ -70,7 +70,7 @@ void chunk_alloc(ChunkMemory* buf, uint64 count, uint64 chunk_size, int32 alignm } inline -void chunk_init(ChunkMemory* buf, BufferMemory* data, uint64 count, uint64 chunk_size, int32 alignment = 64) +void chunk_init(ChunkMemory* buf, BufferMemory* data, uint64 count, uint32 chunk_size, int32 alignment = 64) { ASSERT_SIMPLE(chunk_size); ASSERT_SIMPLE(count); @@ -82,7 +82,7 @@ void chunk_init(ChunkMemory* buf, BufferMemory* data, uint64 count, uint64 chunk buf->count = count; buf->size = count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64); buf->chunk_size = chunk_size; - buf->last_pos = -1; + buf->last_pos = 0; buf->alignment = alignment; // @question Could it be beneficial to have this before the element data? @@ -95,7 +95,7 @@ void chunk_init(ChunkMemory* buf, BufferMemory* data, uint64 count, uint64 chunk } inline -void chunk_init(ChunkMemory* buf, byte* data, uint64 count, uint64 chunk_size, int32 alignment = 64) +void chunk_init(ChunkMemory* buf, byte* data, uint64 count, uint32 chunk_size, int32 alignment = 64) { ASSERT_SIMPLE(chunk_size); ASSERT_SIMPLE(count); @@ -108,7 +108,7 @@ void chunk_init(ChunkMemory* buf, byte* data, uint64 count, uint64 chunk_size, i buf->count = count; buf->size = count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64); buf->chunk_size = chunk_size; - buf->last_pos = -1; + buf->last_pos = 0; buf->alignment = alignment; // @question Could it be beneficial to have this before the element data? 
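+// A minimal sketch of how the free bitmap is meant to be addressed (assumption: one bit per chunk
+// and a set bit marks a used chunk; the actual reserve/release functions are outside this diff):
+inline
+bool chunk_is_used(const ChunkMemory* buf, uint64 index) {
+    return (buf->free[index / 64] >> (index & 63)) & 1ULL;
+}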
@@ -320,15 +320,15 @@ int64 chunk_dump(const ChunkMemory* buf, byte* data) data += sizeof(buf->size); // Chunk Size - *((uint64 *) data) = SWAP_ENDIAN_LITTLE(buf->chunk_size); + *((uint32 *) data) = SWAP_ENDIAN_LITTLE(buf->chunk_size); data += sizeof(buf->chunk_size); // Last pos - *((int64 *) data) = SWAP_ENDIAN_LITTLE(buf->last_pos); + *((uint64 *) data) = SWAP_ENDIAN_LITTLE(buf->last_pos); data += sizeof(buf->last_pos); // Alignment - *((int32 *) data) = SWAP_ENDIAN_LITTLE(buf->alignment); + *((uint32 *) data) = SWAP_ENDIAN_LITTLE(buf->alignment); data += sizeof(buf->alignment); // All memory is handled in the buffer -> simply copy the buffer @@ -351,15 +351,15 @@ int64 chunk_load(ChunkMemory* buf, const byte* data) data += sizeof(buf->size); // Chunk Size - buf->chunk_size = SWAP_ENDIAN_LITTLE(*((uint64 *) data)); + buf->chunk_size = SWAP_ENDIAN_LITTLE(*((uint32 *) data)); data += sizeof(buf->chunk_size); // Last pos - buf->last_pos = SWAP_ENDIAN_LITTLE(*((int64 *) data)); + buf->last_pos = SWAP_ENDIAN_LITTLE(*((uint64 *) data)); data += sizeof(buf->last_pos); // Alignment - buf->alignment = SWAP_ENDIAN_LITTLE(*((int32 *) data)); + buf->alignment = SWAP_ENDIAN_LITTLE(*((uint32 *) data)); data += sizeof(buf->alignment); memcpy(buf->memory, data, buf->size); diff --git a/memory/Queue.h b/memory/Queue.h index b3e6fcf..e93bcdd 100644 --- a/memory/Queue.h +++ b/memory/Queue.h @@ -10,6 +10,7 @@ #define TOS_MEMORY_QUEUE_H #include "../stdlib/Types.h" +#include "../utils/Utils.h" #include "RingMemory.h" // WARNING: Structure needs to be the same as RingMemory @@ -81,7 +82,7 @@ bool queue_is_full(Queue* queue) { } inline -void queue_enqueue_unique(ThreadedQueue* queue, const byte* data) +void queue_enqueue_unique(Queue* queue, const byte* data) { ASSERT_SIMPLE((uint64_t) data % 4 == 0); @@ -191,7 +192,7 @@ bool queue_dequeue(Queue* queue, byte* data) inline bool queue_dequeue_atomic(Queue* queue, byte* data) { - if (atomic_get_relaxed((uint64 *) &queue->head) == (uint64) queue->tail) { + if (atomic_get_acquire_release((volatile uint64 *) &queue->head) == (uint64) queue->tail) { return false; } diff --git a/memory/RingMemory.h b/memory/RingMemory.h index 8188728..583f502 100644 --- a/memory/RingMemory.h +++ b/memory/RingMemory.h @@ -92,7 +92,7 @@ void ring_init(RingMemory* ring, byte* buf, uint64 size, uint32 alignment = 64) { ASSERT_SIMPLE(size); - ring->memory = (byte *) ROUND_TO_NEAREST((uintptr_t) buf, alignment); + ring->memory = (byte *) ROUND_TO_NEAREST((uintptr_t) buf, (uint64) alignment); ring->end = ring->memory + size; ring->head = ring->memory; @@ -117,7 +117,7 @@ void ring_free(RingMemory* ring) } inline -byte* ring_calculate_position(const RingMemory* ring, uint64 size, byte aligned = 0) +byte* ring_calculate_position(const RingMemory* ring, uint64 size, uint32 aligned = 4) { byte* head = ring->head; @@ -126,7 +126,7 @@ byte* ring_calculate_position(const RingMemory* ring, uint64 size, byte aligned head += (aligned - (address & (aligned - 1))) % aligned; } - size = ROUND_TO_NEAREST(size, aligned); + size = ROUND_TO_NEAREST(size, (uint64) aligned); if (head + size > ring->end) { head = ring->memory; @@ -147,7 +147,7 @@ void ring_reset(RingMemory* ring) } // Moves a pointer based on the size you want to consume (new position = after consuming size) -void ring_move_pointer(RingMemory* ring, byte** pos, uint64 size, byte aligned = 0) +void ring_move_pointer(RingMemory* ring, byte** pos, uint64 size, uint32 aligned = 4) { ASSERT_SIMPLE(size <= ring->size); @@ -160,7 +160,7 @@ void 
ring_move_pointer(RingMemory* ring, byte** pos, uint64 size, byte aligned = *pos += (aligned - (address& (aligned - 1))) % aligned; } - size = ROUND_TO_NEAREST(size, aligned); + size = ROUND_TO_NEAREST(size, (uint64) aligned); if (*pos + size > ring->end) { *pos = ring->memory; @@ -173,7 +173,7 @@ void ring_move_pointer(RingMemory* ring, byte** pos, uint64 size, byte aligned = *pos += size; } -byte* ring_get_memory(RingMemory* ring, uint64 size, byte aligned = 0, bool zeroed = false) +byte* ring_get_memory(RingMemory* ring, uint64 size, uint32 aligned = 4, bool zeroed = false) { ASSERT_SIMPLE(size <= ring->size); @@ -182,7 +182,7 @@ byte* ring_get_memory(RingMemory* ring, uint64 size, byte aligned = 0, bool zero ring->head += (aligned - (address& (aligned - 1))) % aligned; } - size = ROUND_TO_NEAREST(size, aligned); + size = ROUND_TO_NEAREST(size, (uint64) aligned); if (ring->head + size > ring->end) { ring_reset(ring); @@ -207,7 +207,7 @@ byte* ring_get_memory(RingMemory* ring, uint64 size, byte aligned = 0, bool zero } // Same as ring_get_memory but DOESN'T move the head -byte* ring_get_memory_nomove(RingMemory* ring, uint64 size, byte aligned = 0, bool zeroed = false) +byte* ring_get_memory_nomove(RingMemory* ring, uint64 size, uint32 aligned = 4, bool zeroed = false) { ASSERT_SIMPLE(size <= ring->size); @@ -218,7 +218,7 @@ byte* ring_get_memory_nomove(RingMemory* ring, uint64 size, byte aligned = 0, bo pos += (aligned - (address& (aligned - 1))) % aligned; } - size = ROUND_TO_NEAREST(size, aligned); + size = ROUND_TO_NEAREST(size, (uint64) aligned); if (pos + size > ring->end) { ring_reset(ring); @@ -253,11 +253,10 @@ byte* ring_get_element(const RingMemory* ring, uint64 element_count, uint64 elem * Checks if one additional element can be inserted without overwriting the tail index */ inline -bool ring_commit_safe(const RingMemory* ring, uint64 size, byte aligned = 0) +bool ring_commit_safe(const RingMemory* ring, uint64 size, uint32 aligned = 4) { // aligned * 2 since that should be the maximum overhead for an element - // @bug could this result in a case where the ring is considered empty/full (false positive/negative)? - // The "correct" version would probably to use ring_move_pointer in some form + // This is not 100% correct BUT it is way faster than any correct version I can come up with uint64 max_mem_required = size + aligned * 2; if (ring->tail < ring->head) { @@ -271,15 +270,17 @@ bool ring_commit_safe(const RingMemory* ring, uint64 size, byte aligned = 0) } inline -bool ring_commit_safe_atomic(const RingMemory* ring, uint64 size, byte aligned = 0) +bool ring_commit_safe_atomic(const RingMemory* ring, uint64 size, uint32 aligned = 4) { // aligned * 2 since that should be the maximum overhead for an element - // @bug could this result in a case where the ring is considered empty/full (false positive/negative)? 
- // The "correct" version would probably to use ring_move_pointer in some form + // This is not 100% correct BUT it is way faster than any correct version I can come up with uint64 max_mem_required = size + aligned * 2; + // @todo consider to switch to uintptr_t uint64 tail = atomic_get_relaxed((uint64 *) &ring->tail); - uint64 head = atomic_get_relaxed((uint64 *) &ring->head); + + // This doesn't have to be atomic since we assume single producer/consumer and a commit is performed by the consumer + uint64 head = (uint64) ring->head; if (tail < head) { return ((uint64) (ring->end - head)) > max_mem_required @@ -291,18 +292,6 @@ bool ring_commit_safe_atomic(const RingMemory* ring, uint64 size, byte aligned = } } -inline -void ring_force_head_update(const RingMemory* ring) -{ - _mm_clflush(ring->head); -} - -inline -void ring_force_tail_update(const RingMemory* ring) -{ - _mm_clflush(ring->tail); -} - inline int64 ring_dump(const RingMemory* ring, byte* data) { diff --git a/memory/ThreadedChunkMemory.h b/memory/ThreadedChunkMemory.h index 9987d4b..d1cb426 100644 --- a/memory/ThreadedChunkMemory.h +++ b/memory/ThreadedChunkMemory.h @@ -23,8 +23,8 @@ struct ThreadedChunkMemory { uint64 count; uint64 size; - uint64 chunk_size; int64 last_pos; + uint32 chunk_size; int32 alignment; // length = count diff --git a/memory/ThreadedQueue.h b/memory/ThreadedQueue.h index e21a337..6e7dfd3 100644 --- a/memory/ThreadedQueue.h +++ b/memory/ThreadedQueue.h @@ -51,7 +51,7 @@ struct ThreadedQueue { }; inline -void thrd_queue_alloc(ThreadedQueue* queue, uint32 element_count, uint64 element_size, uint32 alignment = 64) +void thrd_queue_alloc(ThreadedQueue* queue, uint32 element_count, uint32 element_size, uint32 alignment = 64) { element_size = ROUND_TO_NEAREST(element_size, alignment); @@ -67,7 +67,7 @@ void thrd_queue_alloc(ThreadedQueue* queue, uint32 element_count, uint64 element } inline -void thrd_queue_init(ThreadedQueue* queue, BufferMemory* buf, uint32 element_count, uint64 element_size, uint32 alignment = 64) +void thrd_queue_init(ThreadedQueue* queue, BufferMemory* buf, uint32 element_count, uint32 element_size, uint32 alignment = 64) { element_size = ROUND_TO_NEAREST(element_size, alignment); @@ -83,7 +83,7 @@ void thrd_queue_init(ThreadedQueue* queue, BufferMemory* buf, uint32 element_cou } inline -void thrd_queue_init(ThreadedQueue* queue, byte* buf, uint32 element_count, uint64 element_size, uint32 alignment = 64) +void thrd_queue_init(ThreadedQueue* queue, byte* buf, uint32 element_count, uint32 element_size, uint32 alignment = 64) { element_size = ROUND_TO_NEAREST(element_size, alignment); diff --git a/memory/ThreadedRingMemory.h b/memory/ThreadedRingMemory.h index 3dcbf83..284b98f 100644 --- a/memory/ThreadedRingMemory.h +++ b/memory/ThreadedRingMemory.h @@ -69,7 +69,7 @@ void thrd_ring_free(ThreadedRingMemory* ring) } inline -byte* thrd_ring_calculate_position(ThreadedRingMemory* ring, uint64 size, byte aligned = 0) +byte* thrd_ring_calculate_position(ThreadedRingMemory* ring, uint64 size, byte aligned = 4) { pthread_mutex_lock(&ring->mutex); byte* result = ring_calculate_position((RingMemory *) ring, size, aligned); @@ -87,14 +87,14 @@ void thrd_ring_reset(ThreadedRingMemory* ring) } // Moves a pointer based on the size you want to consume (new position = after consuming size) -void thrd_ring_move_pointer(ThreadedRingMemory* ring, byte** pos, uint64 size, byte aligned = 0) +void thrd_ring_move_pointer(ThreadedRingMemory* ring, byte** pos, uint64 size, byte aligned = 4) { 
pthread_mutex_lock(&ring->mutex); ring_move_pointer((RingMemory *) ring, pos, size, aligned); pthread_mutex_unlock(&ring->mutex); } -byte* thrd_ring_get_memory(ThreadedRingMemory* ring, uint64 size, byte aligned = 0, bool zeroed = false) +byte* thrd_ring_get_memory(ThreadedRingMemory* ring, uint64 size, byte aligned = 4, bool zeroed = false) { pthread_mutex_lock(&ring->mutex); byte* result = ring_get_memory((RingMemory *) ring, size, aligned, zeroed); @@ -104,7 +104,7 @@ byte* thrd_ring_get_memory(ThreadedRingMemory* ring, uint64 size, byte aligned = } // Same as ring_get_memory but DOESN'T move the head -byte* thrd_ring_get_memory_nomove(ThreadedRingMemory* ring, uint64 size, byte aligned = 0, bool zeroed = false) +byte* thrd_ring_get_memory_nomove(ThreadedRingMemory* ring, uint64 size, byte aligned = 4, bool zeroed = false) { pthread_mutex_lock(&ring->mutex); byte* result = ring_get_memory_nomove((RingMemory *) ring, size, aligned, zeroed); @@ -129,7 +129,7 @@ byte* thrd_ring_get_element(ThreadedRingMemory* ring, uint64 element_count, uint * Checks if one additional element can be inserted without overwriting the tail index */ inline -bool thrd_ring_commit_safe(ThreadedRingMemory* ring, uint64 size, byte aligned = 0) +bool thrd_ring_commit_safe(ThreadedRingMemory* ring, uint64 size, byte aligned = 4) { pthread_mutex_lock(&ring->mutex); bool result = ring_commit_safe((RingMemory *) ring, size, aligned); diff --git a/object/Mesh.h b/object/Mesh.h index 284d67b..e7ce776 100644 --- a/object/Mesh.h +++ b/object/Mesh.h @@ -23,9 +23,9 @@ #include "../utils/StringUtils.h" #if __aarch64__ - #include "../../../GameEngine/stdlib/sve/SVE_I32.h" + #include "../stdlib/sve/SVE_I32.h" #else - #include "../../../GameEngine/stdlib/simd/SIMD_I32.h" + #include "../stdlib/simd/SIMD_I32.h" #endif #define MESH_VERSION 1 @@ -178,7 +178,7 @@ void mesh_from_file_txt( continue; } - // NOTE: we always load a file in the format: POSITON + NORMAL + TEXTURE + COLOR + // NOTE: we always load a file in the format: POSITION + NORMAL + TEXTURE + COLOR // EVEN if some of the data is missing. This is necessary to keep the memory kinda in line. // The actual binary file later will have the minimized layout. @@ -558,7 +558,7 @@ int32 mesh_data_size(const Mesh* mesh) int32 mesh_to_data( const Mesh* mesh, byte* data, - int32 vertex_save_format = VERTEX_TYPE_ALL, + uint32 vertex_save_format = VERTEX_TYPE_ALL, int32 steps = 8 ) { diff --git a/object/Texture.h b/object/Texture.h index 257a88d..3437abf 100644 --- a/object/Texture.h +++ b/object/Texture.h @@ -36,7 +36,7 @@ #include "../image/Image.h" struct Texture { - uint64 id; + uint32 id; byte sample_id; // @question Should the texture hold the texture unit? diff --git a/platform/win32/audio/DirectSound.h b/platform/win32/audio/DirectSound.h index 8635902..22ddb6e 100644 --- a/platform/win32/audio/DirectSound.h +++ b/platform/win32/audio/DirectSound.h @@ -148,11 +148,12 @@ uint32 audio_buffer_fillable(const AudioSetting* setting, const DirectSoundSetti return 0; } - DWORD bytes_to_lock = (setting->sample_index * setting->sample_size) % setting->buffer_size; + DWORD bytes_to_lock = setting->sample_buffer_size; DWORD bytes_to_write = 0; DWORD target_cursor = (player_cursor + (setting->latency * setting->sample_size)) % setting->buffer_size; + // @bug Why does this case even exist? if (bytes_to_lock == player_cursor) { // @bug What if just started? 
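// NOTE: lock position == play cursor is ambiguous in a ring buffer: it can mean "just started /
// completely empty" (everything would be fillable) or "completely full"; writing 0 bytes is the
// conservative interpretation.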
bytes_to_write = 0; @@ -179,7 +180,7 @@ void audio_play_buffer(AudioSetting* setting, DirectSoundSetting* api_setting) void* region2; DWORD region2_size; - DWORD bytes_to_lock = (setting->sample_index * setting->sample_size) % setting->buffer_size; + DWORD bytes_to_lock = setting->sample_buffer_size; api_setting->secondary_buffer->Lock( bytes_to_lock, setting->sample_buffer_size, @@ -204,8 +205,6 @@ void audio_play_buffer(AudioSetting* setting, DirectSoundSetting* api_setting) api_setting->secondary_buffer->Unlock(region1, region1_size, region2, region2_size); - // @question Do we want to keep this here or move it to the audio mixer? - setting->sample_index += setting->sample_buffer_size / setting->sample_size; setting->sample_buffer_size = 0; } diff --git a/platform/win32/audio/XAudio2.h b/platform/win32/audio/XAudio2.h index 7fa16ec..4190112 100644 --- a/platform/win32/audio/XAudio2.h +++ b/platform/win32/audio/XAudio2.h @@ -106,8 +106,6 @@ void audio_load(HWND hwnd, AudioSetting* setting, XAudio2Setting* api_setting) { api_setting->internal_buffer[1].LoopLength = 0; api_setting->internal_buffer[1].LoopCount = 0; api_setting->internal_buffer[1].pContext = NULL; - - setting->sample_index = 0; } inline @@ -117,10 +115,7 @@ void audio_play(AudioSetting* setting, XAudio2Setting* api_setting) { } api_setting->source_voice->Start(0, XAUDIO2_COMMIT_NOW); - - if (setting->sample_index > 1) { - setting->sample_index = 0; - } } inline @@ -199,7 +194,9 @@ void audio_play_buffer(AudioSetting* setting, XAudio2Setting* api_setting) { } ++setting->sample_output; - setting->sample_index += setting->sample_buffer_size / setting->sample_size; + + // @performance Why do I even need this? + //setting->sample_index += setting->sample_buffer_size / setting->sample_size; setting->sample_buffer_size = 0; } diff --git a/platform/win32/threading/Atomic.h b/platform/win32/threading/Atomic.h index 99d304a..21d2df1 100644 --- a/platform/win32/threading/Atomic.h +++ b/platform/win32/threading/Atomic.h @@ -12,54 +12,55 @@ #include #include "../../../stdlib/Types.h" -// WARNING: Windows doesn't really support all the relaxed implementations, we therefore often use acquire as alternative. +// WARNING: Windows doesn't really have separate relaxed, release and acquire functions on x86_64.
+// You can see that by checking out how they are defined inline void atomic_set_relaxed(void** target, void* new_pointer) { - InterlockedExchangePointerAcquire(target, new_pointer); + InterlockedExchangePointerNoFence(target, new_pointer); } inline void* atomic_get_relaxed(void** target) { - return InterlockedCompareExchangePointer(target, NULL, NULL); + return InterlockedCompareExchangePointerNoFence(target, NULL, NULL); } inline void atomic_set_relaxed(volatile int32* value, int32 new_value) { - InterlockedExchangeAcquire((long *) value, new_value); + InterlockedExchangeNoFence((long *) value, new_value); } inline void atomic_set_relaxed(volatile int64* value, int64 new_value) { - InterlockedExchangeAcquire((long *) value, (long) new_value); + InterlockedExchangeNoFence64((LONG64 *) value, (LONG64) new_value); } inline void atomic_set_relaxed(volatile f32* value, f32 new_value) { - InterlockedExchangeAcquire((long *) value, (long) new_value); + InterlockedExchangeNoFence((long *) value, (long) new_value); } inline void atomic_set_relaxed(volatile f64* value, f64 new_value) { - InterlockedExchangeAcquire((long *) value, (long) new_value); + InterlockedExchangeNoFence64((LONG64 *) value, (LONG64) new_value); } inline int32 atomic_fetch_set_relaxed(volatile int32* value, int32 new_value) { - return (int32) InterlockedExchangeAcquire((long *) value, new_value); + return (int32) InterlockedExchangeNoFence((long *) value, new_value); } inline int64 atomic_fetch_set_relaxed(volatile int64* value, int64 new_value) { - return (int64) InterlockedExchangeAcquire((long *) value, (long) new_value); + return (int64) InterlockedExchangeNoFence64((LONG64 *) value, (LONG64) new_value); } inline @@ -86,25 +87,25 @@ void atomic_set_relaxed(volatile byte* value, const byte new_value[16]) inline int32 atomic_get_relaxed(volatile int32* value) { - return (int32) InterlockedCompareExchangeAcquire((long *) value, 0, 0); + return (int32) InterlockedCompareExchangeNoFence((long *) value, 0, 0); } inline int64 atomic_get_relaxed(volatile int64* value) { - return (int64) InterlockedCompareExchangeAcquire((long *) value, 0, 0); + return (int64) InterlockedCompareExchangeNoFence64((LONG64 *) value, 0, 0); } inline f32 atomic_get_relaxed(volatile f32* value) { - return (f32) InterlockedCompareExchangeAcquire((long *) value, 0, 0); + return (f32) InterlockedCompareExchangeNoFence((long *) value, 0, 0); } inline f64 atomic_get_relaxed(volatile f64* value) { - return (f64) InterlockedCompareExchangeAcquire((long *) value, 0, 0); + return (f64) InterlockedCompareExchangeNoFence64((LONG64 *) value, 0, 0); } inline @@ -116,79 +117,79 @@ void atomic_get_relaxed(volatile byte* value, byte data[16]) inline void atomic_increment_relaxed(volatile int32* value) { - InterlockedIncrementAcquire((long *) value); + InterlockedIncrementNoFence((long *) value); } inline void atomic_decrement_relaxed(volatile int32* value) { - InterlockedDecrementAcquire((long *) value); + InterlockedDecrementNoFence((long *) value); } inline void atomic_increment_relaxed(volatile int64* value) { - InterlockedIncrementAcquire((long *) value); + InterlockedIncrementNoFence64((LONG64 *) value); } inline void atomic_decrement_relaxed(volatile int64* value) { - InterlockedDecrementAcquire((long *) value); + InterlockedDecrementNoFence64((LONG64 *) value); } inline void atomic_add_relaxed(volatile int32* value, int32 increment) { - InterlockedAddAcquire((long *) value, increment); + InterlockedAddNoFence((long *) value, increment); } inline void 
atomic_sub_relaxed(volatile int32* value, int32 decrement) { - InterlockedAddAcquire((long *) value, -decrement); + InterlockedAddNoFence((long *) value, -decrement); } inline void atomic_add_relaxed(volatile int64* value, int64 increment) { - InterlockedAddAcquire((long *) value, (long) increment); + InterlockedAddNoFence64((LONG64 *) value, (LONG64) increment); } inline void atomic_sub_relaxed(volatile int64* value, int64 decrement) { - InterlockedAddAcquire((long *) value, -1 * ((long) decrement)); + InterlockedAddNoFence64((LONG64 *) value, -((LONG64) decrement)); } inline f32 atomic_compare_exchange_weak_relaxed(volatile f32* value, f32* expected, f32 desired) { - return (f32) InterlockedCompareExchangeRelease((long *) value, (long) desired, (long) *expected); + return (f32) InterlockedCompareExchangeNoFence((long *) value, (long) desired, (long) *expected); } inline f64 atomic_compare_exchange_weak_relaxed(volatile f64* value, f64* expected, f64 desired) { - return (f64) InterlockedCompareExchangeRelease((long *) value, (long) desired, (long) *expected); + return (f64) InterlockedCompareExchangeNoFence64((LONG64 *) value, (LONG64) desired, (LONG64) *expected); } inline int32 atomic_compare_exchange_weak_relaxed(volatile int32* value, int32* expected, int32 desired) { - return (int32) InterlockedCompareExchangeRelease((long *) value, desired, *expected); + return (int32) InterlockedCompareExchangeNoFence((long *) value, desired, *expected); } inline int64 atomic_compare_exchange_weak_relaxed(volatile int64* value, int64* expected, int64 desired) { - return (int64) InterlockedCompareExchangeRelease((long *) value, (long) desired, (long) *expected); + return (int64) InterlockedCompareExchangeNoFence64((LONG64 *) value, (LONG64) desired, (LONG64) *expected); } inline int32 atomic_fetch_add_relaxed(volatile int32* value, int32 operand) { - return (int32) InterlockedExchangeAddRelease((long *) value, operand); + return (int32) InterlockedExchangeAddNoFence((long *) value, operand); } inline @@ -200,115 +201,115 @@ int32 atomic_fetch_sub_relaxed(volatile int32* value, int32 operand) inline int64 atomic_fetch_add_relaxed(volatile int64* value, int64 operand) { - return (int64) InterlockedExchangeAddRelease((long *) value, (long) operand); + return (int64) InterlockedExchangeAddNoFence64((LONG64 *) value, (LONG64) operand); } inline int64 atomic_fetch_sub_relaxed(volatile int64* value, int64 operand) { - return (int64) InterlockedExchangeSubtract((unsigned long *) value, (long) operand); + return (int64) InterlockedExchangeAdd64((LONG64 *) value, -((LONG64) operand)); } inline void atomic_set_relaxed(volatile uint32* value, uint32 new_value) { - InterlockedExchangeAcquire((long *) value, new_value); + InterlockedExchangeNoFence((long *) value, new_value); } inline void atomic_set_relaxed(volatile uint64* value, uint64 new_value) { - InterlockedExchangeAcquire((long *) value, (long) new_value); + InterlockedExchangeNoFence64((LONG64 *) value, (LONG64) new_value); } inline uint32 atomic_fetch_set_relaxed(volatile uint32* value, uint32 new_value) { - return (uint32) InterlockedExchangeAcquire((long *) value, new_value); + return (uint32) InterlockedExchangeNoFence((long *) value, new_value); } inline uint64 atomic_fetch_set_relaxed(volatile uint64* value, uint64 new_value) { - return (uint64) InterlockedExchangeAcquire((long *) value, (long) new_value); + return (uint64) InterlockedExchangeNoFence64((LONG64 *) value, (LONG64) new_value); } inline uint32 atomic_get_relaxed(volatile uint32* value) 
{ - return (uint32) InterlockedCompareExchangeAcquire((long *) value, 0, 0); + return (uint32) InterlockedCompareExchangeNoFence((long *) value, 0, 0); } inline uint64 atomic_get_relaxed(volatile uint64* value) { - return (uint64) InterlockedCompareExchangeAcquire((long *) value, 0, 0); + return (uint64) InterlockedCompareExchangeNoFence64((LONG64 *) value, 0, 0); } inline void atomic_increment_relaxed(volatile uint32* value) { - InterlockedIncrementRelease((long *) value); + InterlockedIncrementNoFence((long *) value); } inline void atomic_decrement_relaxed(volatile uint32* value) { - InterlockedDecrementRelease((long *) value); + InterlockedDecrementNoFence((long *) value); } inline void atomic_increment_relaxed(volatile uint64* value) { - InterlockedIncrementRelease((long *) value); + InterlockedIncrementNoFence64((LONG64 *) value); } inline void atomic_decrement_relaxed(volatile uint64* value) { - InterlockedDecrementRelease((long *) value); + InterlockedDecrementNoFence64((LONG64 *) value); } inline void atomic_add_relaxed(volatile uint32* value, uint32 increment) { - InterlockedAddRelease((long *) value, increment); + InterlockedAddNoFence((long *) value, increment); } inline void atomic_sub_relaxed(volatile uint32* value, uint32 decrement) { - InterlockedAddRelease((long *) value, -1 * ((int32) decrement)); + InterlockedAddNoFence((long *) value, -1 * ((int32) decrement)); } inline void atomic_add_relaxed(volatile uint64* value, uint64 increment) { - InterlockedAddRelease((long *) value, (long) increment); + InterlockedAddNoFence64((LONG64 *) value, (LONG64) increment); } inline void atomic_sub_relaxed(volatile uint64* value, uint64 decrement) { - InterlockedAddRelease((long *) value, -1 * ((long) decrement)); + InterlockedAddNoFence64((LONG64 *) value, -((LONG64) decrement)); } inline uint32 atomic_compare_exchange_weak_relaxed(volatile uint32* value, uint32* expected, uint32 desired) { - return (uint32) InterlockedCompareExchangeAcquire((long *) value, desired, *expected); + return (uint32) InterlockedCompareExchangeNoFence((long *) value, desired, *expected); } inline uint64 atomic_compare_exchange_weak_relaxed(volatile uint64* value, uint64* expected, uint64 desired) { - return (uint64) InterlockedCompareExchangeAcquire((unsigned long long *) value, (unsigned long long) desired, (unsigned long long) *expected); + return (uint64) InterlockedCompareExchangeNoFence64((LONG64 *) value, (LONG64) desired, (LONG64) *expected); } inline uint32 atomic_fetch_add_relaxed(volatile uint32* value, uint32 operand) { - return (uint32) InterlockedExchangeAddRelease((long *) value, operand); + return (uint32) InterlockedExchangeAddNoFence((long *) value, operand); } inline @@ -320,61 +321,61 @@ uint32 atomic_fetch_sub_relaxed(volatile uint32* value, uint32 operand) inline uint64 atomic_fetch_add_relaxed(volatile uint64* value, uint64 operand) { - return (uint64) InterlockedExchangeAddRelease((long *) value, (long) operand); + return (uint64) InterlockedExchangeAddNoFence64((LONG64 *) value, (LONG64) operand); } inline uint64 atomic_fetch_sub_relaxed(volatile uint64* value, uint64 operand) { - return (uint64) InterlockedExchangeSubtract((unsigned long *) value, (long) operand); + return (uint64) InterlockedExchangeAdd64((LONG64 *) value, -((LONG64) operand)); } inline void atomic_and_relaxed(volatile uint32* value, uint32 mask) { - InterlockedAndRelease((volatile LONG *) value, mask); + InterlockedAndNoFence((volatile LONG *) value, mask); } inline void atomic_and_relaxed(volatile int32* value, 
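/* @performance The *_relaxed fetch_sub overloads fall back to the plain, fully fencing
   InterlockedExchangeAdd64 because InterlockedExchangeSubtract has no NoFence variant;
   the acquire/release fetch_sub fallbacks further down share the same pattern. On x86/x64
   every variant emits the same lock xadd, but on ARM the NoFence add with a negated
   operand keeps the ordering consistent with the function name. Sketch:

inline
int64 atomic_fetch_sub_relaxed(volatile int64* value, int64 operand)
{
    // NoFence so "relaxed" means relaxed on ARM as well
    return (int64) InterlockedExchangeAddNoFence64((LONG64 *) value, -((LONG64) operand));
}
*/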
int32 mask) { - InterlockedAndRelease((volatile LONG *) value, (LONG)mask); + InterlockedAndNoFence((volatile LONG *) value, (LONG)mask); } inline void atomic_and_relaxed(volatile uint64* value, uint64 mask) { - InterlockedAnd64Release((volatile LONG64 *) value, mask); + InterlockedAnd64NoFence((volatile LONG64 *) value, mask); } inline void atomic_and_relaxed(volatile int64* value, int64 mask) { - InterlockedAnd64Release((volatile LONG64 *) value, mask); + InterlockedAnd64NoFence((volatile LONG64 *) value, mask); } inline void atomic_or_relaxed(volatile uint32* value, uint32 mask) { - InterlockedOrRelease((volatile LONG *) value, mask); + InterlockedOrNoFence((volatile LONG *) value, mask); } inline void atomic_or_relaxed(volatile int32* value, int32 mask) { - InterlockedOrRelease((volatile LONG *) value, (LONG)mask); + InterlockedOrNoFence((volatile LONG *) value, (LONG)mask); } inline void atomic_or_relaxed(volatile uint64* value, uint64 mask) { - InterlockedOr64Release((volatile LONG64 *) value, mask); + InterlockedOr64NoFence((volatile LONG64 *) value, mask); } inline void atomic_or_relaxed(volatile int64* value, int64 mask) { - InterlockedOr64Release((volatile LONG64 *) value, mask); + InterlockedOr64NoFence((volatile LONG64 *) value, mask); } inline @@ -398,7 +399,7 @@ void atomic_set_acquire(volatile int32* value, int32 new_value) inline void atomic_set_acquire(volatile int64* value, int64 new_value) { - InterlockedExchangeAcquire((long *) value, (long) new_value); + InterlockedExchangeAcquire64((LONG64 *) value, (LONG64) new_value); } inline @@ -410,7 +411,7 @@ void atomic_set_acquire(volatile f32* value, f32 new_value) inline void atomic_set_acquire(volatile f64* value, f64 new_value) { - InterlockedExchangeAcquire((long *) value, (long) new_value); + InterlockedExchangeAcquire64((LONG64 *) value, (LONG64) new_value); } inline @@ -422,7 +423,7 @@ int32 atomic_fetch_set_acquire(volatile int32* value, int32 new_value) inline int64 atomic_fetch_set_acquire(volatile int64* value, int64 new_value) { - return (int64) InterlockedExchangeAcquire((long *) value, (long) new_value); + return (int64) InterlockedExchangeAcquire64((LONG64 *) value, (LONG64) new_value); } inline @@ -467,7 +468,7 @@ f32 atomic_get_acquire(volatile f32* value) inline f64 atomic_get_acquire(volatile f64* value) { - return (f64) InterlockedCompareExchangeAcquire((long *) value, 0, 0); + return (f64) InterlockedCompareExchangeAcquire64((LONG64 *) value, 0, 0); } inline @@ -533,7 +534,7 @@ f32 atomic_compare_exchange_weak_acquire(volatile f32* value, f32* expected, f32 inline f64 atomic_compare_exchange_weak_acquire(volatile f64* value, f64* expected, f64 desired) { - return (f64) InterlockedCompareExchangeAcquire((long *) value, (long) desired, (long) *expected); + return (f64) InterlockedCompareExchangeAcquire64((LONG64 *) value, (LONG64) desired, (LONG64) *expected); } inline @@ -545,7 +546,7 @@ int32 atomic_compare_exchange_weak_acquire(volatile int32* value, int32* expecte inline int64 atomic_compare_exchange_weak_acquire(volatile int64* value, int64* expected, int64 desired) { - return (int64) InterlockedCompareExchangeAcquire((long *) value, (long) desired, (long) *expected); + return (int64) InterlockedCompareExchangeAcquire64((LONG64 *) value, (LONG64) desired, (LONG64) *expected); } inline @@ -563,13 +564,13 @@ int32 atomic_fetch_sub_acquire(volatile int32* value, int32 operand) inline int64 atomic_fetch_add_acquire(volatile int64* value, int64 operand) { - return (int64) 
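/* The and/or wrappers are typically used on shared flag words. A small usage sketch with
   the relaxed variants; ENTITY_FLAG_DIRTY and the helpers are illustrative, not existing code:

#define ENTITY_FLAG_DIRTY 0x01

inline
void entity_mark_dirty(volatile uint32* flags)
{
    atomic_or_relaxed(flags, (uint32) ENTITY_FLAG_DIRTY);
}

inline
void entity_clear_dirty(volatile uint32* flags)
{
    atomic_and_relaxed(flags, (uint32) ~ENTITY_FLAG_DIRTY);
}
*/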
InterlockedExchangeSubtract((unsigned long *) value, operand); + return (int64) InterlockedExchangeAddAcquire64((LONG64 *) value, (LONG64) operand); } inline int64 atomic_fetch_sub_acquire(volatile int64* value, int64 operand) { - return (int64) InterlockedExchangeSubtract((unsigned long *) value, (long) operand); + return (int64) InterlockedExchangeAddAcquire64((LONG64 *) value, -((LONG64) operand)); } inline @@ -581,7 +582,7 @@ void atomic_set_acquire(volatile uint32* value, uint32 new_value) inline void atomic_set_acquire(volatile uint64* value, uint64 new_value) { - InterlockedExchangeAcquire((long *) value, (long) new_value); + InterlockedExchangeAcquire64((LONG64 *) value, (LONG64) new_value); } inline @@ -593,7 +594,7 @@ uint32 atomic_fetch_set_acquire(volatile uint32* value, uint32 new_value) inline uint64 atomic_fetch_set_acquire(volatile uint64* value, uint64 new_value) { - return (uint64) InterlockedExchangeAcquire((long *) value, (long) new_value); + return (uint64) InterlockedExchangeAcquire64((LONG64 *) value, (LONG64) new_value); } inline @@ -605,7 +606,7 @@ uint32 atomic_get_acquire(volatile uint32* value) inline uint64 atomic_get_acquire(volatile uint64* value) { - return (uint64) InterlockedCompareExchangeAcquire((long *) value, 0, 0); + return (uint64) InterlockedCompareExchangeAcquire64((LONG64 *) value, 0, 0); } inline @@ -623,13 +624,13 @@ void atomic_decrement_acquire(volatile uint32* value) inline void atomic_increment_acquire(volatile uint64* value) { - InterlockedIncrementAcquire((long *) value); + InterlockedIncrementAcquire64((LONG64 *) value); } inline void atomic_decrement_acquire(volatile uint64* value) { - InterlockedDecrementAcquire((long *) value); + InterlockedDecrementAcquire64((LONG64 *) value); } inline @@ -647,13 +648,13 @@ void atomic_sub_acquire(volatile uint32* value, uint32 decrement) inline void atomic_add_acquire(volatile uint64* value, uint64 increment) { - InterlockedAddAcquire((long *) value, (long) increment); + InterlockedAddAcquire64((LONG64 *) value, (LONG64) increment); } inline void atomic_sub_acquire(volatile uint64* value, uint64 decrement) { - InterlockedAddAcquire((long *) value, -1 * ((long) decrement)); + InterlockedAddAcquire64((LONG64 *) value, -((LONG64) decrement)); } inline @@ -665,7 +666,7 @@ uint32 atomic_compare_exchange_weak_acquire(volatile uint32* value, uint32* expe inline uint64 atomic_compare_exchange_weak_acquire(volatile uint64* value, uint64* expected, uint64 desired) { - return (uint64) InterlockedCompareExchangeAcquire((unsigned long long *) value, (unsigned long long) desired, (unsigned long long) *expected); + return (uint64) InterlockedCompareExchangeAcquire64((LONG64 *) value, (LONG64) desired, (LONG64) *expected); } inline @@ -683,13 +684,13 @@ uint32 atomic_fetch_sub_acquire(volatile uint32* value, uint32 operand) inline uint64 atomic_fetch_add_acquire(volatile uint64* value, uint64 operand) { - return (uint64) InterlockedExchangeAddAcquire((long *) value, (long) operand); + return (uint64) InterlockedExchangeAddAcquire64((LONG64 *) value, (LONG64) operand); } inline uint64 atomic_fetch_sub_acquire(volatile uint64* value, uint64 operand) { - return (uint64) InterlockedExchangeSubtract((unsigned long *) value, (long) operand); + return (uint64) InterlockedExchangeAddAcquire64((LONG64 *) value, -((LONG64) operand)); }
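/* Because fetch_add/fetch_sub are emulated through a single InterlockedExchangeAdd call
   with an optionally negated operand, a sign slip silently turns an add into a sub; the
   int64 fetch_add overloads are the easy ones to get wrong. A single-threaded smoke test
   catches that at startup; ASSERT stands in for whatever assert macro this codebase uses:

inline
void atomic_fetch_add_sanity_check()
{
    volatile int64 v = 10;

    int64 old = atomic_fetch_add_acquire(&v, 5);
    ASSERT(old == 10 && v == 15); // fetch_* returns the previous value

    old = atomic_fetch_sub_acquire(&v, 3);
    ASSERT(old == 15 && v == 12);
}
*/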
inline @@ -761,7 +762,7 @@ void atomic_set_release(volatile int32* value, int32 new_value) inline void atomic_set_release(volatile int64* value, int64 new_value) { - InterlockedExchange((long *) value, (long) new_value); + InterlockedExchange64((LONG64 *) value, (LONG64) new_value); } inline @@ -773,7 +774,7 @@ void atomic_set_release(volatile f32* value, f32 new_value) inline void atomic_set_release(volatile f64* value, f64 new_value) { - InterlockedExchange((long *) value, (long) new_value); + InterlockedExchange64((LONG64 *) value, (LONG64) new_value); } inline @@ -785,7 +786,7 @@ int32 atomic_fetch_set_release(volatile int32* value, int32 new_value) inline int64 atomic_fetch_set_release(volatile int64* value, int64 new_value) { - return (int64) InterlockedExchange((long *) value, (long) new_value); + return (int64) InterlockedExchange64((LONG64 *) value, (LONG64) new_value); } inline @@ -830,7 +831,7 @@ f32 atomic_get_release(volatile f32* value) inline f64 atomic_get_release(volatile f64* value) { - return (f64) InterlockedCompareExchangeRelease((long *) value, 0, 0); + return (f64) InterlockedCompareExchangeRelease64((LONG64 *) value, 0, 0); } inline @@ -854,13 +855,13 @@ void atomic_decrement_release(volatile int32* value) inline void atomic_increment_release(volatile int64* value) { - InterlockedIncrementRelease((long *) value); + InterlockedIncrementRelease64((LONG64 *) value); } inline void atomic_decrement_release(volatile int64* value) { - InterlockedDecrementRelease((long *) value); + InterlockedDecrementRelease64((LONG64 *) value); } inline @@ -878,13 +879,13 @@ void atomic_sub_release(volatile int32* value, int32 decrement) inline void atomic_add_release(volatile int64* value, int64 increment) { - InterlockedAddRelease((long *) value, (long) increment); + InterlockedAddRelease64((LONG64 *) value, (LONG64) increment); } inline void atomic_sub_release(volatile int64* value, int64 decrement) { - InterlockedAddRelease((long *) value, -1 * ((long) decrement)); + InterlockedAddRelease64((LONG64 *) value, -((LONG64) decrement)); } inline @@ -896,7 +897,7 @@ f32 atomic_compare_exchange_weak_release(volatile f32* value, f32* expected, f32 inline f64 atomic_compare_exchange_weak_release(volatile f64* value, f64* expected, f64 desired) { - return (f64) InterlockedCompareExchangeRelease((long *) value, (long) desired, (long) *expected); + return (f64) InterlockedCompareExchangeRelease64((LONG64 *) value, (LONG64) desired, (LONG64) *expected); } inline @@ -908,7 +909,7 @@ int32 atomic_compare_exchange_weak_release(volatile int32* value, int32* expecte inline int64 atomic_compare_exchange_weak_release(volatile int64* value, int64* expected, int64 desired) { - return (int64) InterlockedCompareExchangeRelease((long *) value, (long) desired, (long) *expected); + return (int64) InterlockedCompareExchangeRelease64((LONG64 *) value, (LONG64) desired, (LONG64) *expected); } inline @@ -926,13 +927,13 @@ int32 atomic_fetch_sub_release(volatile int32* value, int32 operand) inline int64 atomic_fetch_add_release(volatile int64* value, int64 operand) { - return (int64) InterlockedExchangeSubtract((unsigned long *) value, operand); + return (int64) InterlockedExchangeAddRelease64((LONG64 *) value, (LONG64) operand); } inline int64 atomic_fetch_sub_release(volatile int64* value, int64 operand) { - return (int64) InterlockedExchangeSubtract((unsigned long *) value, (long) operand); + return (int64) InterlockedExchangeAddRelease64((LONG64 *) value, -((LONG64) operand)); } inline @@ -944,7 +945,7 @@ void atomic_set_release(volatile uint32* value, uint32 new_value) inline void atomic_set_release(volatile uint64* value, uint64 new_value) { - InterlockedExchange((long *) value, (long) new_value); + 
InterlockedExchange64((LONG64 *) value, (LONG64) new_value); } inline @@ -956,7 +957,7 @@ uint32 atomic_fetch_set_release(volatile uint32* value, uint32 new_value) inline uint64 atomic_fetch_set_release(volatile uint64* value, uint64 new_value) { - return (uint64) InterlockedExchange((long *) value, (long) new_value); + return (uint64) InterlockedExchange64((LONG64 *) value, (LONG64) new_value); } inline @@ -968,7 +969,7 @@ uint32 atomic_get_release(volatile uint32* value) inline uint64 atomic_get_release(volatile uint64* value) { - return (uint64) InterlockedCompareExchangeRelease((long *) value, 0, 0); + return (uint64) InterlockedCompareExchangeRelease64((LONG64 *) value, 0, 0); } inline @@ -986,13 +987,13 @@ void atomic_decrement_release(volatile uint32* value) inline void atomic_increment_release(volatile uint64* value) { - InterlockedIncrementRelease((long *) value); + InterlockedIncrementRelease64((LONG64 *) value); } inline void atomic_decrement_release(volatile uint64* value) { - InterlockedDecrementRelease((long *) value); + InterlockedDecrementRelease64((LONG64 *) value); } inline @@ -1010,13 +1011,13 @@ void atomic_sub_release(volatile uint32* value, uint32 decrement) inline void atomic_add_release(volatile uint64* value, uint64 increment) { - InterlockedAddRelease((long *) value, (long) increment); + InterlockedAddRelease64((LONG64 *) value, (LONG64) increment); } inline void atomic_sub_release(volatile uint64* value, uint64 decrement) { - InterlockedAddRelease((long *) value, -1 * ((long) decrement)); + InterlockedAddRelease64((LONG64 *) value, -((LONG64) decrement)); } inline @@ -1028,7 +1029,7 @@ uint32 atomic_compare_exchange_weak_release(volatile uint32* value, uint32* expe inline uint64 atomic_compare_exchange_weak_release(volatile uint64* value, uint64* expected, uint64 desired) { - return (uint64) InterlockedCompareExchangeRelease((unsigned long long *) value, (unsigned long long) desired, (unsigned long long) *expected); + return (uint64) InterlockedCompareExchangeRelease64((LONG64 *) value, (LONG64) desired, (LONG64) *expected); } inline @@ -1046,13 +1047,13 @@ uint32 atomic_fetch_sub_release(volatile uint32* value, uint32 operand) inline uint64 atomic_fetch_add_release(volatile uint64* value, uint64 operand) { - return (uint64) InterlockedExchangeAddRelease((long *) value, (long) operand); + return (uint64) InterlockedExchangeAddRelease64((LONG64 *) value, (LONG64) operand); } inline uint64 atomic_fetch_sub_release(volatile uint64* value, uint64 operand) { - return (uint64) InterlockedExchangeSubtract((unsigned long *) value, (long) operand); + return (uint64) InterlockedExchangeAdd64((LONG64 *) value, -((LONG64) operand)); } inline @@ -1124,7 +1125,7 @@ void atomic_set_acquire_release(volatile int32* value, int32 new_value) inline void atomic_set_acquire_release(volatile int64* value, int64 new_value) { - InterlockedExchange((long *) value, (long) new_value); + InterlockedExchange64((LONG64 *) value, (LONG64) new_value); } inline @@ -1136,7 +1137,7 @@ void atomic_set_acquire_release(volatile f32* value, f32 new_value) inline void atomic_set_acquire_release(volatile f64* value, f64 new_value) { - InterlockedExchange((long *) value, (long) new_value); + InterlockedExchange64((LONG64 *) value, (LONG64) new_value); } inline @@ -1148,7 +1149,7 @@ int32 atomic_fetch_set_acquire_release(volatile int32* value, int32 new_value) inline int64 atomic_fetch_set_acquire_release(volatile int64* value, int64 new_value) { - return (int64) InterlockedExchange((long *) value, (long) 
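/* @performance atomic_get_* implemented as InterlockedCompareExchange*(value, 0, 0)
   always executes a lock-prefixed read-modify-write, even though nothing needs to be
   written. On x64 an aligned 8-byte load is already atomic, and winnt.h ships plain-load
   helpers with ordering semantics. A cheaper read sketch; the _fast suffix is illustrative:

inline
uint64 atomic_get_acquire_fast(volatile uint64* value)
{
    // ReadAcquire64 is a plain load on x64 (acquire ordering is free on that ISA)
    // and a load/barrier sequence on ARM64, so no bus-locked CAS either way
    return (uint64) ReadAcquire64((volatile LONG64 *) value);
}
*/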
new_value); + return (int64) InterlockedExchange64((LONG64 *) value, (LONG64) new_value); } inline @@ -1181,7 +1182,7 @@ int32 atomic_get_acquire_release(volatile int32* value) inline int64 atomic_get_acquire_release(volatile int64* value) { - return (int64) InterlockedCompareExchange((long *) value, 0, 0); + return (int64) InterlockedCompareExchange64((LONG64 *) value, 0, 0); } inline @@ -1193,7 +1194,7 @@ f32 atomic_get_acquire_release(volatile f32* value) inline f64 atomic_get_acquire_release(volatile f64* value) { - return (f64) InterlockedCompareExchange((long *) value, 0, 0); + return (f64) InterlockedCompareExchange64((LONG64 *) value, 0, 0); } inline @@ -1217,13 +1218,13 @@ void atomic_decrement_acquire_release(volatile int32* value) inline void atomic_increment_acquire_release(volatile int64* value) { - InterlockedIncrement((long *) value); + InterlockedIncrement64((LONG64 *) value); } inline void atomic_decrement_acquire_release(volatile int64* value) { - InterlockedDecrement((long *) value); + InterlockedDecrement64((LONG64 *) value); } inline @@ -1241,13 +1242,13 @@ void atomic_sub_acquire_release(volatile int32* value, int32 decrement) inline void atomic_add_acquire_release(volatile int64* value, int64 increment) { - InterlockedAdd((long *) value, (long) increment); + InterlockedAdd64((LONG64 *) value, (LONG64) increment); } inline void atomic_sub_acquire_release(volatile int64* value, int64 decrement) { - InterlockedAdd((long *) value, -1 * ((long) decrement)); + InterlockedAdd64((LONG64 *) value, -((LONG64) decrement)); } inline @@ -1259,7 +1260,7 @@ f32 atomic_compare_exchange_weak_acquire_release(volatile f32* value, f32* expec inline f64 atomic_compare_exchange_weak_acquire_release(volatile f64* value, f64* expected, f64 desired) { - return (f64) InterlockedCompareExchange((long *) value, (long) desired, (long) *expected); + return (f64) InterlockedCompareExchange64((LONG64 *) value, (LONG64) desired, (LONG64) *expected); } inline @@ -1271,7 +1272,7 @@ int32 atomic_compare_exchange_weak_acquire_release(volatile int32* value, int32* inline int64 atomic_compare_exchange_weak_acquire_release(volatile int64* value, int64* expected, int64 desired) { - return (int64) InterlockedCompareExchange((long *) value, (long) desired, (long) *expected); + return (int64) InterlockedCompareExchange64((LONG64 *) value, (LONG64) desired, (LONG64) *expected); } inline @@ -1291,7 +1292,7 @@ int32 atomic_fetch_sub_acquire_release(volatile int32* value, int32 operand) inline int64 atomic_fetch_add_acquire_release(volatile int64* value, int64 operand) { - int64 ret = (int64) InterlockedExchangeSubtract((unsigned long *) value, operand); + int64 ret = (int64) InterlockedExchangeAdd64((LONG64 *) value, (LONG64) operand); return ret; } @@ -1299,7 +1300,7 @@ int64 atomic_fetch_add_acquire_release(volatile int64* value, int64 operand) inline int64 atomic_fetch_sub_acquire_release(volatile int64* value, int64 operand) { - int64 ret = (int64) InterlockedExchangeSubtract((unsigned long *) value, (long) operand); + int64 ret = (int64) InterlockedExchangeAdd64((LONG64 *) value, -((LONG64) operand)); return ret; } @@ -1325,7 +1326,7 @@ uint32 atomic_fetch_set_acquire_release(volatile uint32* value, uint32 new_value inline uint64 atomic_fetch_set_acquire_release(volatile uint64* value, uint64 new_value) { - return (uint64) InterlockedExchange((long *) value, (long) new_value); + return (uint64) InterlockedExchange64((LONG64 *) value, (LONG64) new_value); } inline @@ -1337,7 +1338,7 @@ uint32 
atomic_get_acquire_release(volatile uint32* value) inline uint64 atomic_get_acquire_release(volatile uint64* value) { - return (uint64) InterlockedCompareExchange((long *) value, 0, 0); + return (uint64) InterlockedCompareExchange64((LONG64 *) value, 0, 0); } inline @@ -1355,13 +1356,13 @@ void atomic_decrement_acquire_release(volatile uint32* value) inline void atomic_increment_acquire_release(volatile uint64* value) { - InterlockedIncrement((long *) value); + InterlockedIncrement64((LONG64 *) value); } inline void atomic_decrement_acquire_release(volatile uint64* value) { - InterlockedDecrement((long *) value); + InterlockedDecrement64((LONG64 *) value); } inline @@ -1379,13 +1380,13 @@ void atomic_sub_acquire_release(volatile uint32* value, uint32 decrement) inline void atomic_add_acquire_release(volatile uint64* value, uint64 increment) { - InterlockedAdd((long *) value, (long) increment); + InterlockedAdd64((LONG64 *) value, (LONG64) increment); } inline void atomic_sub_acquire_release(volatile uint64* value, uint64 decrement) { - InterlockedAdd((long *) value, -1 * ((long) decrement)); + InterlockedAdd64((LONG64 *) value, -((LONG64) decrement)); } inline @@ -1397,7 +1398,7 @@ uint32 atomic_compare_exchange_weak_acquire_release(volatile uint32* value, uint inline uint64 atomic_compare_exchange_weak_acquire_release(volatile uint64* value, uint64* expected, uint64 desired) { - return (uint64) InterlockedCompareExchange((unsigned long long *) value, (unsigned long long) desired, (unsigned long long) *expected); + return (uint64) InterlockedCompareExchange64((LONG64 *) value, (LONG64) desired, (LONG64) *expected); } inline @@ -1417,13 +1418,13 @@ uint32 atomic_fetch_sub_acquire_release(volatile uint32* value, uint32 operand) inline uint64 atomic_fetch_add_acquire_release(volatile uint64* value, uint64 operand) { - return (uint64) InterlockedExchangeAdd((long *) value, (long) operand); + return (uint64) InterlockedExchangeAdd64((LONG64 *) value, (LONG64) operand); } inline uint64 atomic_fetch_sub_acquire_release(volatile uint64* value, uint64 operand) { - uint64 ret = (uint64) InterlockedExchangeSubtract((unsigned long *) value, (long) operand); + uint64 ret = (uint64) InterlockedExchangeAdd64((LONG64 *) value, -((LONG64) operand)); return ret; } diff --git a/stdlib/HashMap.h b/stdlib/HashMap.h index e7ca2aa..b64d599 100644 --- a/stdlib/HashMap.h +++ b/stdlib/HashMap.h @@ -376,7 +376,7 @@ int64 hashmap_dump(const HashMap* hm, byte* data) // Dump hash map content = buffer memory int32 free_index = 0; int32 bit_index = 0; - for (int32 i = 0; i < hm->buf.count; ++i) { + for (uint32 i = 0; i < hm->buf.count; ++i) { if ((hm->buf.free[free_index] & (1ULL << bit_index)) > 0) { HashEntry* entry = (HashEntry *) chunk_get_element((ChunkMemory *) &hm->buf, i); @@ -434,7 +434,7 @@ int64 hashmap_load(HashMap* hm, const byte* data) data += sizeof(uint64); // Load the table content - for (int i = 0; i < count; ++i) { + for (uint32 i = 0; i < count; ++i) { uint64 offset = SWAP_ENDIAN_LITTLE(*((uint64 *) data)); data += sizeof(offset); @@ -455,7 +455,7 @@ int64 hashmap_load(HashMap* hm, const byte* data) // Switch endian AND turn offsets to pointers int32 free_index = 0; int32 bit_index = 0; - for (int32 i = 0; i < hm->buf.count; ++i) { + for (uint32 i = 0; i < hm->buf.count; ++i) { if ((hm->buf.free[free_index] & (1ULL << bit_index)) > 0) { HashEntry* entry = (HashEntry *) chunk_get_element((ChunkMemory *) &hm->buf, i); diff --git a/stdlib/Types.h b/stdlib/Types.h index 461fe79..ce9f41f 100644 --- 
a/stdlib/Types.h +++ b/stdlib/Types.h @@ -109,7 +109,6 @@ struct v4_byte { }; }; - struct v2_int32 { union { struct { diff --git a/stdlib/simd/SIMD_I16.h b/stdlib/simd/SIMD_I16.h index a9dc646..3c3c73e 100644 --- a/stdlib/simd/SIMD_I16.h +++ b/stdlib/simd/SIMD_I16.h @@ -438,7 +438,8 @@ inline int16_16 operator<=(int16_16 a, int16_16 b) inline int16_32 operator<=(int16_32 a, int16_32 b) { int16_32 simd; - simd.s = _mm512_mask_blend_epi16(_mm512_knot(_mm512_cmpgt_epi16_mask(b.s, a.s)), b.s, a.s); + __mmask32 mask = _mm512_cmp_epi16_mask(a.s, b.s, _MM_CMPINT_LE); + simd.s = _mm512_mask_blend_epi16(mask, b.s, a.s); return simd; } @@ -716,25 +717,19 @@ inline int16_32 clamp(int16_32 min_value, int16_32 a, int16_32 max_value) return simd_min(simd_max(a, min_value), max_value); } -inline int16 which_true(int16_8 a) +inline int32 which_true(int16_8 a) { - int16 which_true = _mm_movemask_epi8(a.s); - - return which_true; + return _mm_movemask_epi8(a.s); } -inline int16 which_true(int16_16 a) +inline int32 which_true(int16_16 a) { - int16 which_true = _mm256_movemask_epi8(a.s); - - return which_true; + return _mm256_movemask_epi8(a.s); } -inline int16 which_true(int16_32 a) +inline int32 which_true(int16_32 a) { - int16 which_true = _mm512_movepi16_mask(a.s); - - return which_true; + return _mm512_movepi16_mask(a.s); } inline bool any_true(int16_8 a) diff --git a/utils/MathUtils.h b/utils/MathUtils.h index 9e2e0ca..2f83cb3 100644 --- a/utils/MathUtils.h +++ b/utils/MathUtils.h @@ -26,6 +26,8 @@ #define ROUND_TO_NEAREST(a, b) (((a) + ((b) - 1)) & ~((b) - 1)) #define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) #define OMS_CEIL(x) ((x) == (int)(x) ? (int)(x) : ((x) > 0 ? (int)(x) + 1 : (int)(x))) +#define OMS_ROUND(x) (((x) >= 0) ? ((int)((x) + 0.5f)) : ((int)((x) - 0.5f))) +#define OMS_ROUND_POSITIVE(x) ((int)((x) + 0.5f)) // Modulo function when b is a power of 2 #define MODULO_2(a, b) ((a) & (b - 1)) diff --git a/utils/StringUtils.h b/utils/StringUtils.h index 515a753..1295f1e 100644 --- a/utils/StringUtils.h +++ b/utils/StringUtils.h @@ -15,6 +15,7 @@ #include #include "../stdlib/Types.h" +#include "MathUtils.h" inline int32 utf8_encode(uint32 codepoint, char* out) @@ -821,9 +822,9 @@ void sprintf_fast(char *buffer, const char* format, ...) { case 'f': { f64 val = va_arg(args, f64); - int32 precision = 6; // Default precision + // Default precision + int32 precision = 5; - // @question Consider to implement rounding // Check for optional precision specifier const char* prec_ptr = ptr + 1; if (*prec_ptr >= '0' && *prec_ptr <= '9') { @@ -841,6 +842,15 @@ void sprintf_fast(char *buffer, const char* format, ...) 
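/* @question which_true now mixes two mask granularities: the int16_8/int16_16 overloads
   return a byte mask (_mm*_movemask_epi8 produces two bits per 16-bit lane), while the
   int16_32 overload returns a lane mask (_mm512_movepi16_mask produces one bit per lane).
   Any caller that popcounts or bit-scans the result gets a different scale per width.
   A sketch that normalizes the 256-bit case to one count per lane, assuming immintrin.h
   is already pulled in; true_count is illustrative, not an existing function:

inline
int32 true_count(int16_16 a)
{
    // two movemask bits per 16-bit lane, so halve the popcount
    return _mm_popcnt_u32((uint32) _mm256_movemask_epi8(a.s)) >> 1;
}
*/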
{ val = -val; } + if (precision < 6) { + static const float powers_of_ten[] = { + 1.0f, 10.0f, 100.0f, 1000.0f, 10000.0f, 100000.0f + }; + + f32 scale = powers_of_ten[precision]; + val = OMS_ROUND_POSITIVE(val * scale) / scale; + } + // Handle integer part int32 int_part = (int32) val; f64 frac_part = val - int_part; @@ -896,7 +906,7 @@ void format_time_hh_mm_ss(char* time_str, int32 hours, int32 minutes, int32 secs } inline -void format_time_hh_mm_ss(char* time_str, int32 time) { +void format_time_hh_mm_ss(char* time_str, uint64 time) { int32 hours = (time / 3600) % 24; int32 minutes = (time / 60) % 60; int32 secs = time % 60; @@ -915,7 +925,7 @@ void format_time_hh_mm(char* time_str, int32 hours, int32 minutes) { } inline -void format_time_hh_mm(char* time_str, int32 time) { +void format_time_hh_mm(char* time_str, uint64 time) { int32 hours = (time / 3600) % 24; int32 minutes = (time / 60) % 60; diff --git a/utils/Utils.h b/utils/Utils.h index a032bd4..52ae558 100644 --- a/utils/Utils.h +++ b/utils/Utils.h @@ -18,6 +18,7 @@ struct FileBody { }; // @question Do we want to make the size comparison a step variable? +inline bool is_equal_aligned(const byte* region1, const byte* region2, uint64 size) { while (size > 4) {
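/* @bug The rounding added to sprintf_fast above goes through OMS_ROUND_POSITIVE, i.e.
   through an int cast, so val * scale overflows once val exceeds roughly 2^31 / 10^precision
   and produces garbage digits. A guard keeps large magnitudes on the unrounded path; sketch,
   reusing the locals from that hunk (the f64 table is an assumption, the original uses f32):

if (precision < 6) {
    static const f64 powers_of_ten[] = {
        1.0, 10.0, 100.0, 1000.0, 10000.0, 100000.0
    };

    f64 scale = powers_of_ten[precision];

    // only round while the scaled value still fits an int32
    if (val * scale < 2147483647.0) {
        val = OMS_ROUND_POSITIVE(val * scale) / scale;
    }
}
*/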