OpenGL rendering working again, also improving some code

This commit is contained in:
Dennis Eichhorn 2024-12-21 03:46:52 +01:00
parent fa9fcb6394
commit 5d7943016d
27 changed files with 706 additions and 424 deletions


@ -73,6 +73,8 @@ struct AssetArchive {
// If not remove
MMFHandle mmf;
// This is used to tell the asset archive in which AssetManagementSystem (AMS) each asset type is located.
// Remember, many AMS only contain one asset type (e.g. image, audio, ...)
int32 asset_type_map[ASSET_TYPE_SIZE];
};
@ -182,24 +184,19 @@ Asset* asset_archive_asset_load(const AssetArchive* archive, int32 id, AssetMana
{
// @todo add calculation from element->type to ams index
AssetArchiveElement* element = &archive->header.asset_element[id];
// We have to mask with 0x00FFFFFF since the highest bits define the archive id, not the element id
AssetArchiveElement* element = &archive->header.asset_element[id & 0x00FFFFFF];
AssetManagementSystem* ams = &ams_array[archive->asset_type_map[element->type]];
// @todo This is a little bit stupid, reconsider
char id_str[5];
id_str[4] = '\0';
*((int32 *) id_str) = id;
uint64 hash = hash_djb2(id_str);
char id_str[32];
_itoa(id, id_str, 16);
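For context, a small sketch of what the combined id and the new hex key look like (a sketch under the assumption that the top 8 bits hold the archive id, which the mask comment above implies; the concrete values are illustrative):
// Assumed layout: top 8 bits = archive id, low 24 bits = element index (see the 0x00FFFFFF mask above)
int32 archive_id = (id >> 24) & 0xFF;
int32 element_id = id & 0x00FFFFFF;
// The hash map key is now the hex string of the full id, e.g. id = 255 -> _itoa(255, id_str, 16) -> "ff"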
Asset* asset;
// @performance I think we could optimize ams_reserver_asset so that we don't have to hold the lock the entire time
pthread_mutex_lock(&ams->mutex);
// @bug If we have multiple archive files the ids also repeat, which is not possible for the hash map
// Possible solution: also store a string name for every asset. This would add HASH_MAP_MAX_KEY_LENGTH bytes of data to every asset though (see hash map key size = 32)
asset = ams_get_asset(ams, id_str, hash);
asset = ams_get_asset(ams, id_str);
if (asset) {
// Asset already loaded
pthread_mutex_unlock(&ams->mutex);
@ -238,17 +235,15 @@ Asset* asset_archive_asset_load(const AssetArchive* archive, int32 id, AssetMana
Texture* texture = (Texture *) asset->self;
texture->image.pixels = (byte *) (texture + 1);
// @todo implement qoi encoding
image_from_data(file.content, &texture->image);
asset->vram_size = texture->image.pixel_count * image_pixel_size_from_type(texture->image.pixel_type);
asset->ram_size = asset->vram_size + sizeof(Texture);
#if OPENGL
// @bug I think order_rows has the wrong value
if (texture->image.order_rows == IMAGE_ROW_ORDER_TOP_TO_BOTTOM) {
image_flip_vertical(ring, &texture->image);
texture->image.order_rows = IMAGE_ROW_ORDER_BOTTOM_TO_TOP;
}
image_flip_vertical(ring, &texture->image);
texture->image.order_rows = IMAGE_ROW_ORDER_BOTTOM_TO_TOP;
#endif
} break;
case ASSET_TYPE_AUDIO: {
@ -289,6 +284,7 @@ Asset* asset_archive_asset_load(const AssetArchive* archive, int32 id, AssetMana
pthread_mutex_unlock(&ams->mutex);
// @performance maybe do in worker threads? This just feels very slow
// @question dependencies might be stored in different archives?
for (uint32 i = 0; i < element->dependency_count; ++i) {
asset_archive_asset_load(archive, id, ams, ring);
}


@ -84,7 +84,7 @@ void ams_create(AssetManagementSystem* ams, byte* buf, int32 chunk_size, int32 c
// setup asset_memory
ams->asset_memory.count = count;
ams->asset_memory.chunk_size = sizeof(Asset);
ams->asset_memory.last_pos = -1;
ams->asset_memory.last_pos = 0;
ams->asset_memory.alignment = 64;
ams->asset_memory.memory = buf;
ams->asset_memory.free = (uint64 *) (ams->asset_memory.memory + ams->asset_memory.chunk_size * count);
@ -92,7 +92,7 @@ void ams_create(AssetManagementSystem* ams, byte* buf, int32 chunk_size, int32 c
// setup asset_data_memory
ams->asset_data_memory.count = count;
ams->asset_data_memory.chunk_size = chunk_size;
ams->asset_data_memory.last_pos = -1;
ams->asset_data_memory.last_pos = 0;
ams->asset_data_memory.alignment = 64;
ams->asset_data_memory.memory = (byte *) (ams->asset_memory.free + CEIL_DIV(count, 64));
ams->asset_data_memory.free = (uint64 *) (ams->asset_data_memory.memory + ams->asset_data_memory.chunk_size * count);
@ -204,8 +204,8 @@ Asset* ams_get_asset(AssetManagementSystem* ams, const char* key)
);
DEBUG_MEMORY_READ(
(uint64) (entry ? (Asset *) entry->value : 0),
entry ? ((Asset *) entry->value)->self + ((Asset *) entry->value)->ram_size : 0
(uint64) (entry ? ((Asset *) entry->value)->self : 0),
entry ? ((Asset *) entry->value)->ram_size : 0
);
return entry ? (Asset *) entry->value : NULL;
@ -222,8 +222,8 @@ Asset* ams_get_asset(AssetManagementSystem* ams, const char* key, uint64 hash)
);
DEBUG_MEMORY_READ(
(uint64) (entry ? (Asset *) entry->value : 0),
entry ? ((Asset *) entry->value)->self + ((Asset *) entry->value)->ram_size : 0
(uint64) (entry ? ((Asset *) entry->value)->self : 0),
entry ? ((Asset *) entry->value)->ram_size : 0
);
return entry ? (Asset *) entry->value : NULL;


@ -31,22 +31,23 @@
enum AudioEffect {
AUDIO_EFFECT_NONE,
AUDIO_EFFECT_ECHO = 1,
AUDIO_EFFECT_REVERB = 2,
AUDIO_EFFECT_UNDERWATER = 4,
AUDIO_EFFECT_CAVE = 8,
AUDIO_EFFECT_LOWPASS = 16,
AUDIO_EFFECT_HIGHPASS = 32,
AUDIO_EFFECT_FLANGER = 64,
AUDIO_EFFECT_TREMOLO = 128,
AUDIO_EFFECT_DISTORTION = 256,
AUDIO_EFFECT_CHORUS = 512,
AUDIO_EFFECT_PITCH_SHIFT = 1024,
AUDIO_EFFECT_GRANULAR_DELAY = 2048,
AUDIO_EFFECT_FM = 4096,
AUDIO_EFFECT_STEREO_PANNING = 8192,
AUDIO_EFFECT_EASE_IN = 16384,
AUDIO_EFFECT_EASE_OUT = 32768,
AUDIO_EFFECT_ECHO = 1 << 0,
AUDIO_EFFECT_REVERB = 1 << 1,
AUDIO_EFFECT_UNDERWATER = 1 << 2,
AUDIO_EFFECT_CAVE = 1 << 3,
AUDIO_EFFECT_LOWPASS = 1 << 4,
AUDIO_EFFECT_HIGHPASS = 1 << 5,
AUDIO_EFFECT_FLANGER = 1 << 6,
AUDIO_EFFECT_TREMOLO = 1 << 7,
AUDIO_EFFECT_DISTORTION = 1 << 8,
AUDIO_EFFECT_CHORUS = 1 << 9,
AUDIO_EFFECT_PITCH_SHIFT = 1 << 10,
AUDIO_EFFECT_GRANULAR_DELAY = 1 << 11,
AUDIO_EFFECT_FM = 1 << 12,
AUDIO_EFFECT_STEREO_PANNING = 1 << 13,
AUDIO_EFFECT_EASE_IN = 1 << 14,
AUDIO_EFFECT_EASE_OUT = 1 << 15,
AUDIO_EFFECT_SPEED = 1 << 16,
};
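With the effects expressed as bit flags, several of them can be combined in one mask and tested independently; a minimal usage sketch (variable names are illustrative):
uint64 effect = AUDIO_EFFECT_ECHO | AUDIO_EFFECT_LOWPASS | AUDIO_EFFECT_SPEED;
if (effect & AUDIO_EFFECT_LOWPASS) {
// only the lowpass branch runs here; echo and speed have their own checks
}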
struct AudioInstance {
@ -56,7 +57,13 @@ struct AudioInstance {
uint32 audio_size;
byte* audio_data;
uint64 effect;
uint32 sample_index;
byte channels;
bool repeat;
// @todo How to implement audio that is only supposed to be played after a certain other sound file is finished
// e.g. queueing soundtracks/ambient noise
};
enum AudioMixerState {
@ -128,10 +135,13 @@ void audio_mixer_add(AudioMixer* mixer, int64 id, Audio* audio, AudioLocationSet
return;
}
// @question Do I really want to use AudioInstance? Wouldn't Audio* be sufficient?
// Well, AudioInstance is a little bit smaller, but is this really worth it? Probably yes.
AudioInstance* instance = (AudioInstance *) chunk_get_element(&mixer->audio_instances, index);
instance->id = id;
instance->audio_size = audio->size;
instance->audio_data = audio->data;
instance->channels = audio->channels;
if (origin) {
memcpy(&instance->origin, origin, sizeof(AudioLocationSetting));
@ -166,6 +176,41 @@ void audio_mixer_remove(AudioMixer* mixer, int64 id)
}
}
int32 apply_speed(int16* buffer, uint32 buffer_size, f32 speed) {
if (speed == 1.0f) {
return 0;
}
// Has to be a multiple of 2 to ensure stereo is handled correctly
uint32 new_size = ROUND_TO_NEAREST((uint32) (buffer_size / speed), 2);
// Speed up
if (speed > 1.0f) {
for (int32 i = 0; i < new_size; ++i) {
// @bug What if 2 consecutive values fall onto the same source index for stereo? This would break it.
// The problem is that even if we did this as a stereo calculation we would still have the same issue, just not on the current value but on the next loop iteration
uint32 src_index = (uint32) (i * speed);
buffer[i] = buffer[src_index];
}
// A speed up reduces the sample_index -> we reduce the data in the buffer
return new_size - buffer_size;
}
// Slow down
for (int32 i = buffer_size - 1; i > 0; --i) {
uint32 src_index = (uint32) (i * speed);
buffer[i] = buffer[src_index];
}
return 0;
}
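A worked example of the mapping above, assuming buffer_size = 1024 int16 samples and speed = 2.0f:
// new_size = ROUND_TO_NEAREST((uint32) (1024 / 2.0f), 2) = 512
// speed-up loop: buffer[i] = buffer[i * 2] for i = 0..511
// return value: 512 - 1024 = -512, which the caller adds to its sample index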
// @performance Whenever we handle left and right the same we could halve the buffer_size
// This allows us to re-use existing helper variables without re-calculating them for the next loop (e.g. delay below)
// Or, if the multiplier is an int we can even perform the multiplication on int32 through casting instead of 2 operations on int16
// We might have to adjust some of the values to ensure correct multiplication if possible (e.g. feedback, intensity, ...)
// @todo We probably want to handle left and right channel differently to add some depth
void apply_echo(int16* buffer, uint32 buffer_size, f32 delay, f32 feedback, int32 sample_rate) {
int32 delay_samples = (int32) (delay * sample_rate);
for (uint32 i = delay_samples; i < buffer_size; ++i) {
@ -173,6 +218,7 @@ void apply_echo(int16* buffer, uint32 buffer_size, f32 delay, f32 feedback, int3
}
}
// @todo We probably want to handle left and right channel differently to add some depth
void apply_reverb(int16* buffer, uint32 buffer_size, f32 intensity) {
intensity *= 0.5f;
for (uint32 i = 1; i < buffer_size; ++i) {
@ -294,11 +340,93 @@ void apply_lowpass(int16* buffer, uint32 buffer_size, f32 cutoff, int32 sample_r
}
}
void audio_mixer_mix(AudioMixer* mixer) {
uint32 limit = OMS_MIN(
mixer->settings.sample_buffer_size / mixer->settings.sample_size,
mixer->settings.buffer_size / mixer->settings.sample_size
);
int32 mixer_effects_mono(AudioMixer* mixer, uint64 effect, int32 samples)
{
int32 sound_sample_index = 0;
if (effect & AUDIO_EFFECT_ECHO) {
apply_echo(mixer->buffer_temp, samples * 2, 0.2f, 0.4f, mixer->settings.sample_rate);
}
if (effect & AUDIO_EFFECT_REVERB) {
apply_reverb(mixer->buffer_temp, samples * 2, 0.3f);
}
if (effect & AUDIO_EFFECT_UNDERWATER) {
apply_underwater(mixer->buffer_temp, samples * 2);
}
if (effect & AUDIO_EFFECT_CAVE) {
apply_cave(mixer->buffer_temp, samples * 2, mixer->settings.sample_rate);
}
if (effect & AUDIO_EFFECT_LOWPASS) {
apply_lowpass(mixer->buffer_temp, samples * 2, 500.0f, mixer->settings.sample_rate); // Cutoff frequency 500
}
if (effect & AUDIO_EFFECT_HIGHPASS) {
apply_highpass(mixer->buffer_temp, samples * 2, 2000.0f, mixer->settings.sample_rate); // Cutoff frequency 2 kHz
}
if (effect & AUDIO_EFFECT_FLANGER) {
apply_flanger(mixer->buffer_temp, samples * 2, 0.25f, 0.005f, mixer->settings.sample_rate);
}
if (effect & AUDIO_EFFECT_TREMOLO) {
apply_tremolo(mixer->buffer_temp, samples * 2, 5.0f, 0.8f, mixer->settings.sample_rate);
}
if (effect & AUDIO_EFFECT_DISTORTION) {
apply_distortion(mixer->buffer_temp, samples * 2, 10.0f);
}
if (effect & AUDIO_EFFECT_CHORUS) {
apply_chorus(mixer->buffer_temp, samples * 2, 0.25f, 0.005f, mixer->settings.sample_rate);
}
if (effect & AUDIO_EFFECT_PITCH_SHIFT) {
apply_pitch_shift(mixer->buffer_temp, samples * 2, 1.2f); // Slight pitch increase
}
if (effect & AUDIO_EFFECT_GRANULAR_DELAY) {
apply_granular_delay(mixer->buffer_temp, samples * 2, 0.1f, 0.2f, mixer->settings.sample_rate);
}
if (effect & AUDIO_EFFECT_FM) {
apply_frequency_modulation(mixer->buffer_temp, samples * 2, 2.0f, 0.5f, mixer->settings.sample_rate);
}
if (effect & AUDIO_EFFECT_STEREO_PANNING) {
apply_stereo_panning(mixer->buffer_temp, samples * 2, 0.5f);
}
/*
if (effect & AUDIO_EFFECT_EASE_IN) {
apply_ease_in(mixer->buffer_temp, samples * 2, 0.5f);
}
if (effect & AUDIO_EFFECT_EASE_IN) {
apply_ease_out(mixer->buffer_temp, samples * 2, 0.5f);
}
*/
if (effect & AUDIO_EFFECT_SPEED) {
sound_sample_index += apply_speed(mixer->buffer_temp, samples * 2, 1.0f);
}
return sound_sample_index;
}
int32 mixer_effects_stereo()
{
return 0;
}
void audio_mixer_mix(AudioMixer* mixer, uint32 size) {
memset(mixer->settings.buffer, 0, size);
mixer->settings.sample_buffer_size = 0;
uint32 limit_max = size / mixer->settings.sample_size;
bool has_location = !is_empty((byte *) &mixer->camera.audio_location, sizeof(mixer->camera.audio_location));
@ -310,6 +438,8 @@ void audio_mixer_mix(AudioMixer* mixer) {
continue;
}
uint32 limit = limit_max;
// Compute the vector from the player to the sound's origin
v3_f32 to_sound = {};
f32 total_attenuation = 1.0f;
@ -331,105 +461,93 @@ void audio_mixer_mix(AudioMixer* mixer) {
}
uint32 sound_sample_count = sound->audio_size / mixer->settings.sample_size;
uint32 sound_sample_index = sound->sample_index;
int32 sound_sample_index = sound->sample_index;
int16* audio_data = (int16 *) sound->audio_data;
// Temporary buffer for effects processing
// @performance If there are situations where only one file in the mixer should be played we could write directly to
// the output buffer, improving performance. Some of those mixers are: music, cinematic, ui
// Careful, NOT voice since we will probably manually layer them according to their position?
for (int32 j = 0; j < limit; ++j) {
if (sound_sample_index >= sound_sample_count) {
// @todo if repeat we need to handle part of it here, else quit
if (sound->channels == 1) {
// We make it stereo
for (int32 j = 0; j < limit; ++j) {
if (sound_sample_index >= sound_sample_count) {
if (!sound->repeat) {
limit = j;
break;
}
sound_sample_index = 0;
sound_sample_index = 0;
}
// @question why are we doing this?
mixer->settings.sample_index = 0;
// We could make the temp buffer stereo here but we have to touch the array again later anyway.
// This way we can easily perform mixer effects on a mono output.
mixer->buffer_temp[j] = (int16) (audio_data[sound_sample_index] * volume_scale * total_attenuation);
++sound_sample_index;
// @performance Some adjustments could be made right here; the question is whether this is faster.
// Probably depends on how likely the adjustment is to happen. Orientation effects are probably very likely.
}
mixer->buffer_temp[j * 2] = (int16) (audio_data[sound_sample_index * 2] * volume_scale * total_attenuation);
mixer->buffer_temp[j * 2 + 1] = (int16) (audio_data[sound_sample_index * 2 + 1] * volume_scale * total_attenuation);
// Apply effects based on sound's effect type
if (sound->effect) {
int32 sample_adjustment = mixer_effects_mono(mixer, sound->effect, sound_sample_index);
sound_sample_index += sample_adjustment;
limit += sample_adjustment;
}
} else {
for (int32 j = 0; j < limit; ++j) {
if (sound_sample_index >= sound_sample_count) {
if (!sound->repeat) {
limit = j;
break;
}
++sound_sample_index;
sound_sample_index = 0;
}
// @performance Some adjustments could be made right here; the question is whether this is faster.
// Probably depends on how likely the adjustment is to happen.
mixer->buffer_temp[j * 2] = (int16) (audio_data[sound_sample_index * 2] * volume_scale * total_attenuation);
mixer->buffer_temp[j * 2 + 1] = (int16) (audio_data[sound_sample_index * 2 + 1] * volume_scale * total_attenuation);
// @todo if end of file and no repeat -> remove from list
}
++sound_sample_index;
// @question We also have to set setting->sample_index = sound_sample_index.
// But that currently happens in the sound API. Do we want to keep it there or move it here?
// Apply effects based on sound's effect type
// @performance Depending on how we implement effects we could even pull them out of this loop
// What I mean is effects could either be sound file dependent (current location correct) or mixer dependent
if (mixer->effect) {
if (mixer->effect & AUDIO_EFFECT_ECHO) {
apply_echo(mixer->buffer_temp, limit, 0.2f, 0.4f, mixer->settings.sample_rate);
// @performance Some adjustments could be made right here; the question is whether this is faster.
// Probably depends on how likely the adjustment is to happen. Orientation effects are probably very likely.
}
if (mixer->effect & AUDIO_EFFECT_REVERB) {
apply_reverb(mixer->buffer_temp, limit, 0.3f);
}
if (mixer->effect & AUDIO_EFFECT_UNDERWATER) {
apply_underwater(mixer->buffer_temp, limit);
}
if (mixer->effect & AUDIO_EFFECT_CAVE) {
apply_cave(mixer->buffer_temp, limit, mixer->settings.sample_rate);
}
if (mixer->effect & AUDIO_EFFECT_LOWPASS) {
apply_lowpass(mixer->buffer_temp, limit, 500.0f, mixer->settings.sample_rate); // Cutoff frequency 500
}
if (mixer->effect & AUDIO_EFFECT_HIGHPASS) {
apply_highpass(mixer->buffer_temp, limit, 2000.0f, mixer->settings.sample_rate); // Cutoff frequency 2 kHz
}
if (mixer->effect & AUDIO_EFFECT_FLANGER) {
apply_flanger(mixer->buffer_temp, limit, 0.25f, 0.005f, mixer->settings.sample_rate);
}
if (mixer->effect & AUDIO_EFFECT_TREMOLO) {
apply_tremolo(mixer->buffer_temp, limit, 5.0f, 0.8f, mixer->settings.sample_rate);
}
if (mixer->effect & AUDIO_EFFECT_DISTORTION) {
apply_distortion(mixer->buffer_temp, limit, 10.0f);
}
if (mixer->effect & AUDIO_EFFECT_CHORUS) {
apply_chorus(mixer->buffer_temp, limit, 0.25f, 0.005f, mixer->settings.sample_rate);
}
if (mixer->effect & AUDIO_EFFECT_PITCH_SHIFT) {
apply_pitch_shift(mixer->buffer_temp, limit, 1.2f); // Slight pitch increase
}
if (mixer->effect & AUDIO_EFFECT_GRANULAR_DELAY) {
apply_granular_delay(mixer->buffer_temp, limit, 0.1f, 0.2f, mixer->settings.sample_rate);
}
if (mixer->effect & AUDIO_EFFECT_FM) {
apply_frequency_modulation(mixer->buffer_temp, limit, 2.0f, 0.5f, mixer->settings.sample_rate);
}
if (mixer->effect & AUDIO_EFFECT_STEREO_PANNING) {
apply_stereo_panning(mixer->buffer_temp, limit, 0.5f);
// Apply effects based on sound's effect type
if (sound->effect) {
int32 sample_adjustment = mixer_effects_stereo() / 2;
sound_sample_index += sample_adjustment;
limit += sample_adjustment;
}
}
// @bug the actual output "limit" could be smaller if sound files end earlier and no repeat is defined
// In that case we would also have to adjust mixer->settings.sample_buffer_size
// @bug if we use speed up effect, this value could be negative. Fix.
sound->sample_index = sound_sample_index;
// Add the processed sound to the output buffer
for (uint32 j = 0; j < limit; j++) {
mixer->settings.buffer[j] += mixer->buffer_temp[j];
if (sound->channels == 1) {
// We turn it into stereo here
for (uint32 j = 0; j < limit; ++j) {
mixer->settings.buffer[j * 2] += mixer->buffer_temp[j];
mixer->settings.buffer[j * 2 + 1] += mixer->buffer_temp[j];
}
} else {
for (uint32 j = 0; j < limit * 2; ++j) {
mixer->settings.buffer[j] += mixer->buffer_temp[j];
}
}
mixer->settings.sample_buffer_size = OMS_MAX(
mixer->settings.sample_buffer_size,
limit * mixer->settings.sample_size
);
}
if (mixer->effect) {
mixer_effects_stereo();
}
}


@ -16,10 +16,6 @@
#define SOUND_API_XAUDIO2 1
struct AudioSetting {
// position in the audio data
// WARNING: not the byte position, but the index based on the sample size
uint32 sample_index;
f32 master_volume;
// bits per sample


@ -59,11 +59,28 @@ void font_init(Font* font, byte* data, int count)
}
inline
Glyph* font_glyph_find(Font* font, uint32 codepoint)
Glyph* font_glyph_find(const Font* font, uint32 codepoint)
{
for (uint32 i = 0; i < font->glyph_count; ++i) {
if (font->glyphs[i].codepoint == codepoint) {
return &font->glyphs[i];
int32 perfect_glyph_pos = codepoint - font->glyphs[0].codepoint;
int32 limit = OMS_MIN(perfect_glyph_pos, font->glyph_count - 1);
// We try to jump to the correct glyph based on the glyph codepoint
if (font->glyphs[limit].codepoint == codepoint) {
return &font->glyphs[limit];
}
// If that doesn't work we binary search the glyph list, but only up to the last possible match.
// Glyphs must be sorted ascending.
int32 low = 0;
int32 high = limit;
while (low <= high) {
int32 mid = low + (high - low) / 2;
if (font->glyphs[mid].codepoint == codepoint) {
return &font->glyphs[mid];
} else if (font->glyphs[mid].codepoint < codepoint) {
low = mid + 1;
} else {
high = mid - 1;
}
}
@ -254,9 +271,21 @@ int32 font_to_data(
return size;
}
inline
f32 font_line_height(Font* font, f32 size)
{
return font->line_height * size / font->size;
}
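A quick worked example of the scaling above (numbers are illustrative): a font baked at size = 32 with line_height = 38, rendered at size = 16, gives
// font_line_height(font, 16.0f) = 38 * 16 / 32 = 19 pixels per line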
inline
void font_invert_coordinates(Font* font)
{
// @todo Implement y-offset correction
for (uint32 i = 0; i < font->glyph_count; ++i) {
float temp = font->glyphs[i].coords.y1;
font->glyphs[i].coords.y1 = 1.0f - font->glyphs[i].coords.y2;
font->glyphs[i].coords.y2 = 1.0f - temp;
}
}
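A small worked example of the flip above (illustrative values): a glyph with coords.y1 = 0.20f and coords.y2 = 0.35f becomes
// temp = 0.20f; y1 = 1.0f - 0.35f = 0.65f; y2 = 1.0f - 0.20f = 0.80f
// the vertical extent stays the same, only the origin moves from top to bottom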
#endif


@ -299,8 +299,6 @@ f32 text_calculate_dimensions_width(
f32 x = 0;
f32 offset_x = 0;
uint32 first_glyph = font->glyphs[0].codepoint;
// @todo remember to restrict to width/height if value > 0 -> force width to remain below certain value
for (int32 i = 0; i < length; ++i) {
@ -313,25 +311,7 @@ f32 text_calculate_dimensions_width(
continue;
}
Glyph* glyph = NULL;
// We try to jump to the correct glyph based on the glyph codepoint
// If that doesn't work we iterate the glyph list BUT only until the last possible match (glyphs must be sorted ascending)
int32 perfect_glyph_pos = character - first_glyph;
if (font->glyph_count > perfect_glyph_pos
&& font->glyphs[perfect_glyph_pos].codepoint == character
) {
glyph = &font->glyphs[perfect_glyph_pos];
} else {
// @performance consider to do binary search
for (int32 j = 0; j <= perfect_glyph_pos && j < font->glyph_count; ++j) {
if (font->glyphs[j].codepoint == character) {
glyph = &font->glyphs[j];
break;
}
}
}
Glyph* glyph = font_glyph_find(font, character);
if (!glyph) {
continue;
}
@ -353,8 +333,6 @@ void text_calculate_dimensions(
f32 offset_x = 0;
uint32 first_glyph = font->glyphs[0].codepoint;
// @todo remember to restrict to width/height if value > 0 -> force width to remain below certain value
for (int32 i = 0; i < length; ++i) {
@ -369,25 +347,7 @@ void text_calculate_dimensions(
continue;
}
Glyph* glyph = NULL;
// We try to jump to the correct glyph based on the glyph codepoint
// If that doesn't work we iterate the glyph list BUT only until the last possible match (glyphs must be sorted ascending)
int32 perfect_glyph_pos = character - first_glyph;
if (font->glyph_count > perfect_glyph_pos
&& font->glyphs[perfect_glyph_pos].codepoint == character
) {
glyph = &font->glyphs[perfect_glyph_pos];
} else {
// @performance consider to do binary search
for (int32 j = 0; j <= perfect_glyph_pos && j < font->glyph_count; ++j) {
if (font->glyphs[j].codepoint == character) {
glyph = &font->glyphs[j];
break;
}
}
}
Glyph* glyph = font_glyph_find(font, character);
if (!glyph) {
continue;
}
@ -433,10 +393,6 @@ v2_f32 vertex_text_create(
}
}
uint32 first_glyph = font->glyphs[0].codepoint;
int32 first_char = is_ascii ? text[0] : utf8_get_char_at(text, 0);
f32 offset_x = x;
for (int32 i = 0; i < length; ++i) {
int32 character = is_ascii ? text[i] : utf8_get_char_at(text, i);
@ -447,25 +403,7 @@ v2_f32 vertex_text_create(
continue;
}
Glyph* glyph = NULL;
// We try to jump to the correct glyph based on the glyph codepoint
// If that doesn't work we iterate the glyph list BUT only until the last possible match (glyphs must be sorted ascending)
int32 perfect_glyph_pos = character - first_glyph;
if (font->glyph_count > perfect_glyph_pos
&& font->glyphs[perfect_glyph_pos].codepoint == character
) {
glyph = &font->glyphs[perfect_glyph_pos];
} else {
// @performance consider to do binary search
for (uint32 j = 0; j <= perfect_glyph_pos && j < font->glyph_count; ++j) {
if (font->glyphs[j].codepoint == character) {
glyph = &font->glyphs[j];
break;
}
}
}
Glyph* glyph = font_glyph_find(font, character);
if (!glyph) {
continue;
}
@ -577,8 +515,6 @@ f32 ui_text_create(
}
}
uint32 first_glyph = theme->font.glyphs[0].codepoint;
int32 start = *index;
f32 offset_x = (f32) x->value_int;
f32 offset_y = (f32) y->value_int;
@ -594,25 +530,7 @@ f32 ui_text_create(
continue;
}
Glyph* glyph = NULL;
// We try to jump to the correct glyph based on the glyph codepoint
// If that doesn't work we iterate the glyph list BUT only until the last possible match (glyphs must be sorted ascending)
int32 perfect_glyph_pos = character - first_glyph;
if (theme->font.glyph_count > perfect_glyph_pos
&& theme->font.glyphs[perfect_glyph_pos].codepoint == character
) {
glyph = &theme->font.glyphs[perfect_glyph_pos];
} else {
// @performance consider to do binary search
for (int32 j = 0; j <= perfect_glyph_pos && j < theme->font.glyph_count; ++j) {
if (theme->font.glyphs[j].codepoint == character) {
glyph = &theme->font.glyphs[j];
break;
}
}
}
Glyph* glyph = font_glyph_find(&theme->font, character);
if (!glyph) {
continue;
}
@ -721,7 +639,7 @@ void ui_button_create(
vertex_text_create(
vertices, index, zindex,
x->value_float, y->value_float, width->value_float, height->value_float, align_h->value_float, align_v->value_float,
x->value_float, y->value_float, width->value_float, height->value_float, align_h->value_int, align_v->value_int,
&theme->font, text->value_str, size->value_float, color_index->value_float
);


@ -156,7 +156,7 @@ void texture_use_1D(const Texture* texture, uint32 texture_unit)
glBindTexture(GL_TEXTURE_1D, (GLuint) texture->id);
}
GLuint shader_make(GLenum type, const char *source, RingMemory* ring)
GLuint shader_make(GLenum type, const char* source, RingMemory* ring)
{
GLuint shader = glCreateShader(type);
glShaderSource(shader, 1, (GLchar **) &source, NULL);


@ -31,6 +31,7 @@ enum PixelType
// If has_alpha is defined it forces an alpha channel even for bitmaps
// order_pixels defines how the pixels should be ordered
// order_rows defines how the rows should be ordered
// @question Do we really ever need int32 for width/height?
struct Image {
uint32 width;
uint32 height;

image/Qoi.h (new file, +230 lines)

@ -0,0 +1,230 @@
/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_IMAGE_QOI_H
#define TOS_IMAGE_QOI_H
#include "../stdlib/Types.h"
#include "../memory/RingMemory.h"
#define QOI_OP_INDEX 0b00000000
#define QOI_OP_DIFF 0b01000000
#define QOI_OP_LUMA 0b10000000
#define QOI_OP_RUN 0b11000000 // @todo There is a HUGE step from here to QOI_OP_RGB; this leaves room for more cases or for using this range
#define QOI_OP_RGB 0b11111110
#define QOI_OP_RGBA 0b11111111
#define QOI_MASK_2 0b11000000
#define QOI_COLOR_HASH(color) (color.r * 3 + color.g * 5 + color.b * 7 + color.a * 11)
#define QOI_COLOR_HASH_2(color) ((((uint32)(color)) * 0x9E3779B1U) >> 26)
#define QOI_HEADER_SIZE 9
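For reference, a worked example of how QOI_OP_DIFF packs its three 2-bit deltas (this follows the encoder below; the deltas are illustrative): dr = +1, dg = 0, db = -1, each biased by +2, become
// QOI_OP_DIFF | (dr + 2) << 4 | (dg + 2) << 2 | (db + 2)
// = 0b01000000 | 0b00110000 | 0b00001000 | 0b00000001 = 0b01111001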
// @question Do we really ever need int32 for width/height?
struct QoiDescription {
uint32 width;
uint32 height;
byte channels;
byte colorspace;
};
uint32 qoi_encode_size(QoiDescription* desc)
{
return desc->width * desc->height * (desc->channels + 1) + QOI_HEADER_SIZE;
}
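A minimal usage sketch for the encoder, assuming the worst-case output buffer comes from the ring allocator used elsewhere in this codebase (the image and ring variables are illustrative):
QoiDescription desc = { image->width, image->height, 4, 0 };
byte* encoded = ring_get_memory(ring, qoi_encode_size(&desc)); // worst-case size
int32 encoded_size = qoi_encode(image->pixels, encoded, &desc);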
int32 qoi_encode(const byte* data, byte* output, const QoiDescription* desc) {
if (desc->width == 0 || desc->height == 0 ||
desc->channels < 3 || desc->channels > 4 ||
desc->colorspace > 1
) {
return -1; // invalid description
}
int32 p = 0;
*((uint32 *) &output[p]) = SWAP_ENDIAN_LITTLE(desc->width); p += 4;
*((uint32 *) &output[p]) = SWAP_ENDIAN_LITTLE(desc->height); p += 4;
// Channel count 1-4 requires 3 bits, colorspace requires 1 bit
output[p++] = ((desc->channels - 1) << 1) | (desc->colorspace & 0x01);
v4_byte index[64];
memset(index, 0, sizeof(index));
v4_byte px_prev = {0, 0, 0, 255};
v4_byte px = px_prev;
int32 px_len = desc->width * desc->height * desc->channels;
int32 px_end = px_len - desc->channels;
int32 channels = desc->channels;
int32 run = 0;
for (int32 px_pos = 0; px_pos < px_len; px_pos += channels) {
memcpy(&px, &data[px_pos], channels * sizeof(byte));
if (px.v == px_prev.v) {
++run;
if (run == 62 || px_pos == px_end) {
output[p++] = QOI_OP_RUN | (run - 1);
run = 0;
}
} else {
if (run) {
output[p++] = QOI_OP_RUN | (run - 1);
run = 0;
}
int32 index_pos = QOI_COLOR_HASH(px) % 64;
//int32 index_pos = QOI_COLOR_HASH_2(px);
if (index[index_pos].v == px.v) {
output[p++] = QOI_OP_INDEX | index_pos;
} else {
index[index_pos] = px;
if (px.a == px_prev.a) {
signed char vr = px.r - px_prev.r;
signed char vg = px.g - px_prev.g;
signed char vb = px.b - px_prev.b;
signed char vg_r = vr - vg;
signed char vg_b = vb - vg;
if (vr > -3 && vr < 2
&& vg > -3 && vg < 2
&& vb > -3 && vb < 2
) {
output[p++] = QOI_OP_DIFF | (vr + 2) << 4 | (vg + 2) << 2 | (vb + 2);
} else if (vg_r > -9 && vg_r < 8
&& vg > -33 && vg < 32
&& vg_b > -9 && vg_b < 8
) {
output[p++] = QOI_OP_LUMA | (vg + 32);
output[p++] = (vg_r + 8) << 4 | (vg_b + 8);
} else {
output[p++] = QOI_OP_RGB;
output[p++] = px.r;
output[p++] = px.g;
output[p++] = px.b;
}
} else {
output[p++] = QOI_OP_RGBA;
*((uint32 *) &output[p]) = SWAP_ENDIAN_LITTLE(px.val);
p += 4;
}
}
}
px_prev = px;
}
return p;
}
uint32 qoi_decode_size(QoiDescription* desc, int32 channels)
{
return desc->width * desc->height * channels;
}
void qoi_decode(const byte* data, byte* output, int32 steps = 8)
{
int32 p = 0;
uint32 width = SWAP_ENDIAN_LITTLE(*((uint32 *) &data[p])); p += 4;
uint32 height = SWAP_ENDIAN_LITTLE(*((uint32 *) &data[p])); p += 4;
// Channel count 1-4 requires 3 bits, colorspace requires 1 bit
int32 colorspace = data[p] & 0x01;
uint32 channels = ((data[p] >> 1) & 0x07) + 1;
++p;
uint32 px_len = width * height * channels;
v4_byte px = {0, 0, 0, 255};
v4_byte index[64];
memset(index, 0, sizeof(index));
int32 run = 0;
for (uint32 px_pos = 0; px_pos < px_len; px_pos += channels) {
int32 b1 = data[p++];
if (b1 == QOI_OP_RGB) {
px.r = data[p++];
px.g = data[p++];
px.b = data[p++];
} else if (b1 == QOI_OP_RGBA) {
px.val = SWAP_ENDIAN_LITTLE(*((uint32 *) &data[p]));
p += 4;
} else if ((b1 & QOI_MASK_2) == QOI_OP_INDEX) {
px = index[b1];
} else if ((b1 & QOI_MASK_2) == QOI_OP_DIFF) {
px.r += ((b1 >> 4) & 0x03) - 2;
px.g += ((b1 >> 2) & 0x03) - 2;
px.b += ( b1 & 0x03) - 2;
} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA) {
int32 b2 = data[p++];
int32 vg = (b1 & 0x3f) - 32;
px.r += vg - 8 + ((b2 >> 4) & 0x0f);
px.g += vg;
px.b += vg - 8 + (b2 & 0x0f);
} else if ((b1 & QOI_MASK_2) == QOI_OP_RUN) {
run = (b1 & 0x3f);
if (channels == 4) {
uint32 px_little_endian = SWAP_ENDIAN_LITTLE(px.val);
int32 pixel_step_size = steps * 4;
int32 i = 0;
if (steps == 16) {
__m512i simd_value = _mm512_set1_epi32(px_little_endian);
for(; i <= run - steps; i += steps, px_pos += pixel_step_size) {
_mm512_storeu_si512((__m512i *) &output[px_pos], simd_value);
}
} else if (steps >= 8) {
__m256i simd_value = _mm256_set1_epi32(px_little_endian);
for (; i <= run - steps; i += steps, px_pos += pixel_step_size) {
_mm256_storeu_si256((__m256i *) &output[px_pos], simd_value);
}
} else if (steps >= 4) {
__m128i simd_value = _mm_set1_epi32(px_little_endian);
for(; i <= run - steps; i += steps, px_pos += pixel_step_size) {
_mm_storeu_si128((__m128i *) &output[px_pos], simd_value);
}
}
for (; i < run; ++i) {
*((uint32 *) &output[px_pos]) = px_little_endian;
px_pos += channels;
}
} else if (channels == 3) {
for (int32 i = 0; i < run; ++i) {
output[px_pos++] = px.r;
output[px_pos++] = px.g;
output[px_pos++] = px.b;
}
} else if (channels == 1) {
memset(&output[px_pos], px.r, run * sizeof(byte));
px_pos += run;
}
// Correction, since the loop increments by the channel count as well
px_pos -= channels;
index[QOI_COLOR_HASH(px) % 64] = px;
//index[QOI_COLOR_HASH_2(px)] = px;
continue;
}
index[QOI_COLOR_HASH(px) % 64] = px;
//index[QOI_COLOR_HASH_2(px)] = px;
memcpy(&output[px_pos], &px, channels * sizeof(byte));
}
}
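And a matching decode sketch: the header is read first so the output buffer can be sized before calling qoi_decode (illustrative code, not part of this commit):
uint32 width = SWAP_ENDIAN_LITTLE(*((uint32 *) &encoded[0]));
uint32 height = SWAP_ENDIAN_LITTLE(*((uint32 *) &encoded[4]));
int32 channels = ((encoded[8] >> 1) & 0x07) + 1;
QoiDescription desc = { width, height };
byte* pixels = ring_get_memory(ring, qoi_decode_size(&desc, channels));
qoi_decode(encoded, pixels);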
#endif


@ -92,7 +92,7 @@ void buffer_reset(BufferMemory* buf)
}
inline
byte* buffer_get_memory(BufferMemory* buf, uint64 size, int32 aligned = 0, bool zeroed = false)
byte* buffer_get_memory(BufferMemory* buf, uint64 size, int32 aligned = 4, bool zeroed = false)
{
ASSERT_SIMPLE(size <= buf->size);


@ -34,9 +34,9 @@ struct ChunkMemory {
uint64 count;
uint64 size;
uint64 chunk_size;
int64 last_pos;
int32 alignment;
uint64 last_pos;
uint32 chunk_size;
uint32 alignment;
// length = count
// free describes which locations are used and which are free
@ -44,7 +44,7 @@ struct ChunkMemory {
};
inline
void chunk_alloc(ChunkMemory* buf, uint64 count, uint64 chunk_size, int32 alignment = 64)
void chunk_alloc(ChunkMemory* buf, uint64 count, uint32 chunk_size, int32 alignment = 64)
{
ASSERT_SIMPLE(chunk_size);
ASSERT_SIMPLE(count);
@ -58,7 +58,7 @@ void chunk_alloc(ChunkMemory* buf, uint64 count, uint64 chunk_size, int32 alignm
buf->count = count;
buf->size = count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64);
buf->chunk_size = chunk_size;
buf->last_pos = -1;
buf->last_pos = 0;
buf->alignment = alignment;
// @question Could it be beneficial to have this before the element data?
@ -70,7 +70,7 @@ void chunk_alloc(ChunkMemory* buf, uint64 count, uint64 chunk_size, int32 alignm
}
inline
void chunk_init(ChunkMemory* buf, BufferMemory* data, uint64 count, uint64 chunk_size, int32 alignment = 64)
void chunk_init(ChunkMemory* buf, BufferMemory* data, uint64 count, uint32 chunk_size, int32 alignment = 64)
{
ASSERT_SIMPLE(chunk_size);
ASSERT_SIMPLE(count);
@ -82,7 +82,7 @@ void chunk_init(ChunkMemory* buf, BufferMemory* data, uint64 count, uint64 chunk
buf->count = count;
buf->size = count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64);
buf->chunk_size = chunk_size;
buf->last_pos = -1;
buf->last_pos = 0;
buf->alignment = alignment;
// @question Could it be beneficial to have this before the element data?
@ -95,7 +95,7 @@ void chunk_init(ChunkMemory* buf, BufferMemory* data, uint64 count, uint64 chunk
}
inline
void chunk_init(ChunkMemory* buf, byte* data, uint64 count, uint64 chunk_size, int32 alignment = 64)
void chunk_init(ChunkMemory* buf, byte* data, uint64 count, uint32 chunk_size, int32 alignment = 64)
{
ASSERT_SIMPLE(chunk_size);
ASSERT_SIMPLE(count);
@ -108,7 +108,7 @@ void chunk_init(ChunkMemory* buf, byte* data, uint64 count, uint64 chunk_size, i
buf->count = count;
buf->size = count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64);
buf->chunk_size = chunk_size;
buf->last_pos = -1;
buf->last_pos = 0;
buf->alignment = alignment;
// @question Could it be beneficial to have this before the element data?
@ -320,15 +320,15 @@ int64 chunk_dump(const ChunkMemory* buf, byte* data)
data += sizeof(buf->size);
// Chunk Size
*((uint64 *) data) = SWAP_ENDIAN_LITTLE(buf->chunk_size);
*((uint32 *) data) = SWAP_ENDIAN_LITTLE(buf->chunk_size);
data += sizeof(buf->chunk_size);
// Last pos
*((int64 *) data) = SWAP_ENDIAN_LITTLE(buf->last_pos);
*((uint64 *) data) = SWAP_ENDIAN_LITTLE(buf->last_pos);
data += sizeof(buf->last_pos);
// Alignment
*((int32 *) data) = SWAP_ENDIAN_LITTLE(buf->alignment);
*((uint32 *) data) = SWAP_ENDIAN_LITTLE(buf->alignment);
data += sizeof(buf->alignment);
// All memory is handled in the buffer -> simply copy the buffer
@ -351,15 +351,15 @@ int64 chunk_load(ChunkMemory* buf, const byte* data)
data += sizeof(buf->size);
// Chunk Size
buf->chunk_size = SWAP_ENDIAN_LITTLE(*((uint64 *) data));
buf->chunk_size = SWAP_ENDIAN_LITTLE(*((uint32 *) data));
data += sizeof(buf->chunk_size);
// Last pos
buf->last_pos = SWAP_ENDIAN_LITTLE(*((int64 *) data));
buf->last_pos = SWAP_ENDIAN_LITTLE(*((uint64 *) data));
data += sizeof(buf->last_pos);
// Alignment
buf->alignment = SWAP_ENDIAN_LITTLE(*((int32 *) data));
buf->alignment = SWAP_ENDIAN_LITTLE(*((uint32 *) data));
data += sizeof(buf->alignment);
memcpy(buf->memory, data, buf->size);


@ -10,6 +10,7 @@
#define TOS_MEMORY_QUEUE_H
#include "../stdlib/Types.h"
#include "../utils/Utils.h"
#include "RingMemory.h"
// WARNING: Structure needs to be the same as RingMemory
@ -81,7 +82,7 @@ bool queue_is_full(Queue* queue) {
}
inline
void queue_enqueue_unique(ThreadedQueue* queue, const byte* data)
void queue_enqueue_unique(Queue* queue, const byte* data)
{
ASSERT_SIMPLE((uint64_t) data % 4 == 0);
@ -191,7 +192,7 @@ bool queue_dequeue(Queue* queue, byte* data)
inline
bool queue_dequeue_atomic(Queue* queue, byte* data)
{
if (atomic_get_relaxed((uint64 *) &queue->head) == (uint64) queue->tail) {
if (atomic_get_acquire_release((volatile uint64 *) &queue->head) == (uint64) queue->tail) {
return false;
}


@ -92,7 +92,7 @@ void ring_init(RingMemory* ring, byte* buf, uint64 size, uint32 alignment = 64)
{
ASSERT_SIMPLE(size);
ring->memory = (byte *) ROUND_TO_NEAREST((uintptr_t) buf, alignment);
ring->memory = (byte *) ROUND_TO_NEAREST((uintptr_t) buf, (uint64) alignment);
ring->end = ring->memory + size;
ring->head = ring->memory;
@ -117,7 +117,7 @@ void ring_free(RingMemory* ring)
}
inline
byte* ring_calculate_position(const RingMemory* ring, uint64 size, byte aligned = 0)
byte* ring_calculate_position(const RingMemory* ring, uint64 size, uint32 aligned = 4)
{
byte* head = ring->head;
@ -126,7 +126,7 @@ byte* ring_calculate_position(const RingMemory* ring, uint64 size, byte aligned
head += (aligned - (address & (aligned - 1))) % aligned;
}
size = ROUND_TO_NEAREST(size, aligned);
size = ROUND_TO_NEAREST(size, (uint64) aligned);
if (head + size > ring->end) {
head = ring->memory;
@ -147,7 +147,7 @@ void ring_reset(RingMemory* ring)
}
// Moves a pointer based on the size you want to consume (new position = after consuming size)
void ring_move_pointer(RingMemory* ring, byte** pos, uint64 size, byte aligned = 0)
void ring_move_pointer(RingMemory* ring, byte** pos, uint64 size, uint32 aligned = 4)
{
ASSERT_SIMPLE(size <= ring->size);
@ -160,7 +160,7 @@ void ring_move_pointer(RingMemory* ring, byte** pos, uint64 size, byte aligned =
*pos += (aligned - (address& (aligned - 1))) % aligned;
}
size = ROUND_TO_NEAREST(size, aligned);
size = ROUND_TO_NEAREST(size, (uint64) aligned);
if (*pos + size > ring->end) {
*pos = ring->memory;
@ -173,7 +173,7 @@ void ring_move_pointer(RingMemory* ring, byte** pos, uint64 size, byte aligned =
*pos += size;
}
byte* ring_get_memory(RingMemory* ring, uint64 size, byte aligned = 0, bool zeroed = false)
byte* ring_get_memory(RingMemory* ring, uint64 size, uint32 aligned = 4, bool zeroed = false)
{
ASSERT_SIMPLE(size <= ring->size);
@ -182,7 +182,7 @@ byte* ring_get_memory(RingMemory* ring, uint64 size, byte aligned = 0, bool zero
ring->head += (aligned - (address& (aligned - 1))) % aligned;
}
size = ROUND_TO_NEAREST(size, aligned);
size = ROUND_TO_NEAREST(size, (uint64) aligned);
if (ring->head + size > ring->end) {
ring_reset(ring);
@ -207,7 +207,7 @@ byte* ring_get_memory(RingMemory* ring, uint64 size, byte aligned = 0, bool zero
}
// Same as ring_get_memory but DOESN'T move the head
byte* ring_get_memory_nomove(RingMemory* ring, uint64 size, byte aligned = 0, bool zeroed = false)
byte* ring_get_memory_nomove(RingMemory* ring, uint64 size, uint32 aligned = 4, bool zeroed = false)
{
ASSERT_SIMPLE(size <= ring->size);
@ -218,7 +218,7 @@ byte* ring_get_memory_nomove(RingMemory* ring, uint64 size, byte aligned = 0, bo
pos += (aligned - (address& (aligned - 1))) % aligned;
}
size = ROUND_TO_NEAREST(size, aligned);
size = ROUND_TO_NEAREST(size, (uint64) aligned);
if (pos + size > ring->end) {
ring_reset(ring);
@ -253,11 +253,10 @@ byte* ring_get_element(const RingMemory* ring, uint64 element_count, uint64 elem
* Checks if one additional element can be inserted without overwriting the tail index
*/
inline
bool ring_commit_safe(const RingMemory* ring, uint64 size, byte aligned = 0)
bool ring_commit_safe(const RingMemory* ring, uint64 size, uint32 aligned = 4)
{
// aligned * 2 since that should be the maximum overhead for an element
// @bug could this result in a case where the ring is considered empty/full (false positive/negative)?
// The "correct" version would probably to use ring_move_pointer in some form
// This is not 100% correct BUT it is way faster than any correct version I can come up with
uint64 max_mem_required = size + aligned * 2;
if (ring->tail < ring->head) {
@ -271,15 +270,17 @@ bool ring_commit_safe(const RingMemory* ring, uint64 size, byte aligned = 0)
}
inline
bool ring_commit_safe_atomic(const RingMemory* ring, uint64 size, byte aligned = 0)
bool ring_commit_safe_atomic(const RingMemory* ring, uint64 size, uint32 aligned = 4)
{
// aligned * 2 since that should be the maximum overhead for an element
// @bug could this result in a case where the ring is considered empty/full (false positive/negative)?
// The "correct" version would probably to use ring_move_pointer in some form
// This is not 100% correct BUT it is way faster than any correct version I can come up with
uint64 max_mem_required = size + aligned * 2;
// @todo consider to switch to uintptr_t
uint64 tail = atomic_get_relaxed((uint64 *) &ring->tail);
uint64 head = atomic_get_relaxed((uint64 *) &ring->head);
// This doesn't have to be atomic since we assume single producer/consumer and a commit is performed by the consumer
uint64 head = (uint64) ring->head;
if (tail < head) {
return ((uint64) (ring->end - head)) > max_mem_required
@ -291,18 +292,6 @@ bool ring_commit_safe_atomic(const RingMemory* ring, uint64 size, byte aligned =
}
}
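A quick worked example of the conservative bound used by both commit checks above (values are illustrative):
// size = 100, aligned = 64 -> max_mem_required = 100 + 64 * 2 = 228
// the commit only counts as safe if the relevant contiguous region (head..end, or the gap up to the tail) is larger than 228 bytes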
inline
void ring_force_head_update(const RingMemory* ring)
{
_mm_clflush(ring->head);
}
inline
void ring_force_tail_update(const RingMemory* ring)
{
_mm_clflush(ring->tail);
}
inline
int64 ring_dump(const RingMemory* ring, byte* data)
{


@ -23,8 +23,8 @@ struct ThreadedChunkMemory {
uint64 count;
uint64 size;
uint64 chunk_size;
int64 last_pos;
uint32 chunk_size;
int32 alignment;
// length = count


@ -51,7 +51,7 @@ struct ThreadedQueue {
};
inline
void thrd_queue_alloc(ThreadedQueue* queue, uint32 element_count, uint64 element_size, uint32 alignment = 64)
void thrd_queue_alloc(ThreadedQueue* queue, uint32 element_count, uint32 element_size, uint32 alignment = 64)
{
element_size = ROUND_TO_NEAREST(element_size, alignment);
@ -67,7 +67,7 @@ void thrd_queue_alloc(ThreadedQueue* queue, uint32 element_count, uint64 element
}
inline
void thrd_queue_init(ThreadedQueue* queue, BufferMemory* buf, uint32 element_count, uint64 element_size, uint32 alignment = 64)
void thrd_queue_init(ThreadedQueue* queue, BufferMemory* buf, uint32 element_count, uint32 element_size, uint32 alignment = 64)
{
element_size = ROUND_TO_NEAREST(element_size, alignment);
@ -83,7 +83,7 @@ void thrd_queue_init(ThreadedQueue* queue, BufferMemory* buf, uint32 element_cou
}
inline
void thrd_queue_init(ThreadedQueue* queue, byte* buf, uint32 element_count, uint64 element_size, uint32 alignment = 64)
void thrd_queue_init(ThreadedQueue* queue, byte* buf, uint32 element_count, uint32 element_size, uint32 alignment = 64)
{
element_size = ROUND_TO_NEAREST(element_size, alignment);


@ -69,7 +69,7 @@ void thrd_ring_free(ThreadedRingMemory* ring)
}
inline
byte* thrd_ring_calculate_position(ThreadedRingMemory* ring, uint64 size, byte aligned = 0)
byte* thrd_ring_calculate_position(ThreadedRingMemory* ring, uint64 size, byte aligned = 4)
{
pthread_mutex_lock(&ring->mutex);
byte* result = ring_calculate_position((RingMemory *) ring, size, aligned);
@ -87,14 +87,14 @@ void thrd_ring_reset(ThreadedRingMemory* ring)
}
// Moves a pointer based on the size you want to consume (new position = after consuming size)
void thrd_ring_move_pointer(ThreadedRingMemory* ring, byte** pos, uint64 size, byte aligned = 0)
void thrd_ring_move_pointer(ThreadedRingMemory* ring, byte** pos, uint64 size, byte aligned = 4)
{
pthread_mutex_lock(&ring->mutex);
ring_move_pointer((RingMemory *) ring, pos, size, aligned);
pthread_mutex_unlock(&ring->mutex);
}
byte* thrd_ring_get_memory(ThreadedRingMemory* ring, uint64 size, byte aligned = 0, bool zeroed = false)
byte* thrd_ring_get_memory(ThreadedRingMemory* ring, uint64 size, byte aligned = 4, bool zeroed = false)
{
pthread_mutex_lock(&ring->mutex);
byte* result = ring_get_memory((RingMemory *) ring, size, aligned, zeroed);
@ -104,7 +104,7 @@ byte* thrd_ring_get_memory(ThreadedRingMemory* ring, uint64 size, byte aligned =
}
// Same as ring_get_memory but DOESN'T move the head
byte* thrd_ring_get_memory_nomove(ThreadedRingMemory* ring, uint64 size, byte aligned = 0, bool zeroed = false)
byte* thrd_ring_get_memory_nomove(ThreadedRingMemory* ring, uint64 size, byte aligned = 4, bool zeroed = false)
{
pthread_mutex_lock(&ring->mutex);
byte* result = ring_get_memory_nomove((RingMemory *) ring, size, aligned, zeroed);
@ -129,7 +129,7 @@ byte* thrd_ring_get_element(ThreadedRingMemory* ring, uint64 element_count, uint
* Checks if one additional element can be inserted without overwriting the tail index
*/
inline
bool thrd_ring_commit_safe(ThreadedRingMemory* ring, uint64 size, byte aligned = 0)
bool thrd_ring_commit_safe(ThreadedRingMemory* ring, uint64 size, byte aligned = 4)
{
pthread_mutex_lock(&ring->mutex);
bool result = ring_commit_safe((RingMemory *) ring, size, aligned);


@ -23,9 +23,9 @@
#include "../utils/StringUtils.h"
#if __aarch64__
#include "../../../GameEngine/stdlib/sve/SVE_I32.h"
#include "../stdlib/sve/SVE_I32.h"
#else
#include "../../../GameEngine/stdlib/simd/SIMD_I32.h"
#include "../stdlib/simd/SIMD_I32.h"
#endif
#define MESH_VERSION 1
@ -178,7 +178,7 @@ void mesh_from_file_txt(
continue;
}
// NOTE: we always load a file in the format: POSITON + NORMAL + TEXTURE + COLOR
// NOTE: we always load a file in the format: POSITION + NORMAL + TEXTURE + COLOR
// EVEN if some of the data is missing. This is necessary to keep the memory layout roughly consistent.
// The actual binary file later will have the minimized layout.
@ -558,7 +558,7 @@ int32 mesh_data_size(const Mesh* mesh)
int32 mesh_to_data(
const Mesh* mesh,
byte* data,
int32 vertex_save_format = VERTEX_TYPE_ALL,
uint32 vertex_save_format = VERTEX_TYPE_ALL,
int32 steps = 8
)
{


@ -36,7 +36,7 @@
#include "../image/Image.h"
struct Texture {
uint64 id;
uint32 id;
byte sample_id;
// @question Should the texture hold the texture unit?


@ -148,11 +148,12 @@ uint32 audio_buffer_fillable(const AudioSetting* setting, const DirectSoundSetti
return 0;
}
DWORD bytes_to_lock = (setting->sample_index * setting->sample_size) % setting->buffer_size;
DWORD bytes_to_lock = setting->sample_buffer_size;
DWORD bytes_to_write = 0;
DWORD target_cursor = (player_cursor + (setting->latency * setting->sample_size)) % setting->buffer_size;
// @bug Why does this case even exist?
if (bytes_to_lock == player_cursor) {
// @bug What if just started?
bytes_to_write = 0;
@ -179,7 +180,7 @@ void audio_play_buffer(AudioSetting* setting, DirectSoundSetting* api_setting)
void* region2;
DWORD region2_size;
DWORD bytes_to_lock = (setting->sample_index * setting->sample_size) % setting->buffer_size;
DWORD bytes_to_lock = setting->sample_buffer_size;
api_setting->secondary_buffer->Lock(
bytes_to_lock, setting->sample_buffer_size,
@ -204,8 +205,6 @@ void audio_play_buffer(AudioSetting* setting, DirectSoundSetting* api_setting)
api_setting->secondary_buffer->Unlock(region1, region1_size, region2, region2_size);
// @question Do we want to keep this here or move it to the audio mixer?
setting->sample_index += setting->sample_buffer_size / setting->sample_size;
setting->sample_buffer_size = 0;
}


@ -106,8 +106,6 @@ void audio_load(HWND hwnd, AudioSetting* setting, XAudio2Setting* api_setting) {
api_setting->internal_buffer[1].LoopLength = 0;
api_setting->internal_buffer[1].LoopCount = 0;
api_setting->internal_buffer[1].pContext = NULL;
setting->sample_index = 0;
}
inline
@ -117,10 +115,7 @@ void audio_play(AudioSetting* setting, XAudio2Setting* api_setting) {
}
api_setting->source_voice->Start(0, XAUDIO2_COMMIT_NOW);
if (setting->sample_index > 1) {
setting->sample_index = 0;
}
setting->sample_index = 0;
}
inline
@ -199,7 +194,9 @@ void audio_play_buffer(AudioSetting* setting, XAudio2Setting* api_setting) {
}
++setting->sample_output;
setting->sample_index += setting->sample_buffer_size / setting->sample_size;
// @performance Why do I even need this?
//setting->sample_index += setting->sample_buffer_size / setting->sample_size;
setting->sample_buffer_size = 0;
}


@ -12,54 +12,55 @@
#include <windows.h>
#include "../../../stdlib/Types.h"
// WARNING: Windows doesn't really support all the relaxed implementations, we therefore often use acquire as alternative.
// WARNING: Windows doesn't really have distinct relaxed, release, and acquire functions on x86_64.
// You can see that by checking how they are defined
inline
void atomic_set_relaxed(void** target, void* new_pointer)
{
InterlockedExchangePointerAcquire(target, new_pointer);
InterlockedExchangePointerNoFence(target, new_pointer);
}
inline
void* atomic_get_relaxed(void** target)
{
return InterlockedCompareExchangePointer(target, NULL, NULL);
return InterlockedCompareExchangePointerNoFence(target, NULL, NULL);
}
inline
void atomic_set_relaxed(volatile int32* value, int32 new_value)
{
InterlockedExchangeAcquire((long *) value, new_value);
InterlockedExchangeNoFence((long *) value, new_value);
}
inline
void atomic_set_relaxed(volatile int64* value, int64 new_value)
{
InterlockedExchangeAcquire((long *) value, (long) new_value);
InterlockedExchangeNoFence64((LONG64 *) value, (LONG64) new_value);
}
inline
void atomic_set_relaxed(volatile f32* value, f32 new_value)
{
InterlockedExchangeAcquire((long *) value, (long) new_value);
InterlockedExchangeNoFence((long *) value, (long) new_value);
}
inline
void atomic_set_relaxed(volatile f64* value, f64 new_value)
{
InterlockedExchangeAcquire((long *) value, (long) new_value);
InterlockedExchangeNoFence64((LONG64 *) value, (LONG64) new_value);
}
inline
int32 atomic_fetch_set_relaxed(volatile int32* value, int32 new_value)
{
return (int32) InterlockedExchangeAcquire((long *) value, new_value);
return (int32) InterlockedExchangeNoFence((long *) value, new_value);
}
inline
int64 atomic_fetch_set_relaxed(volatile int64* value, int64 new_value)
{
return (int64) InterlockedExchangeAcquire((long *) value, (long) new_value);
return (int64) InterlockedExchangeNoFence64((LONG64 *) value, (LONG64) new_value);
}
inline
@ -86,25 +87,25 @@ void atomic_set_relaxed(volatile byte* value, const byte new_value[16])
inline
int32 atomic_get_relaxed(volatile int32* value)
{
return (int32) InterlockedCompareExchangeAcquire((long *) value, 0, 0);
return (int32) InterlockedCompareExchangeNoFence((long *) value, 0, 0);
}
inline
int64 atomic_get_relaxed(volatile int64* value)
{
return (int64) InterlockedCompareExchangeAcquire((long *) value, 0, 0);
return (int64) InterlockedCompareExchangeNoFence64((LONG64 *) value, 0, 0);
}
inline
f32 atomic_get_relaxed(volatile f32* value)
{
return (f32) InterlockedCompareExchangeAcquire((long *) value, 0, 0);
return (f32) InterlockedCompareExchangeNoFence((long *) value, 0, 0);
}
inline
f64 atomic_get_relaxed(volatile f64* value)
{
return (f64) InterlockedCompareExchangeAcquire((long *) value, 0, 0);
return (f64) InterlockedCompareExchangeNoFence64((LONG64 *) value, 0, 0);
}
inline
@ -116,79 +117,79 @@ void atomic_get_relaxed(volatile byte* value, byte data[16])
inline
void atomic_increment_relaxed(volatile int32* value)
{
InterlockedIncrementAcquire((long *) value);
InterlockedIncrementNoFence((long *) value);
}
inline
void atomic_decrement_relaxed(volatile int32* value)
{
InterlockedDecrementAcquire((long *) value);
InterlockedDecrementNoFence((long *) value);
}
inline
void atomic_increment_relaxed(volatile int64* value)
{
InterlockedIncrementAcquire((long *) value);
InterlockedIncrementNoFence64((LONG64 *) value);
}
inline
void atomic_decrement_relaxed(volatile int64* value)
{
InterlockedDecrementAcquire((long *) value);
InterlockedDecrementNoFence64((LONG64 *) value);
}
inline
void atomic_add_relaxed(volatile int32* value, int32 increment)
{
InterlockedAddAcquire((long *) value, increment);
InterlockedAddNoFence((long *) value, increment);
}
inline
void atomic_sub_relaxed(volatile int32* value, int32 decrement)
{
InterlockedAddAcquire((long *) value, -decrement);
InterlockedAddNoFence((long *) value, -decrement);
}
inline
void atomic_add_relaxed(volatile int64* value, int64 increment)
{
InterlockedAddAcquire((long *) value, (long) increment);
InterlockedAddNoFence64((LONG64 *) value, (LONG64) increment);
}
inline
void atomic_sub_relaxed(volatile int64* value, int64 decrement)
{
InterlockedAddAcquire((long *) value, -1 * ((long) decrement));
InterlockedAddNoFence64((LONG64 *) value, -((LONG64) decrement));
}
inline
f32 atomic_compare_exchange_weak_relaxed(volatile f32* value, f32* expected, f32 desired)
{
return (f32) InterlockedCompareExchangeRelease((long *) value, (long) desired, (long) *expected);
return (f32) InterlockedCompareExchangeNoFence((long *) value, (long) desired, (long) *expected);
}
inline
f64 atomic_compare_exchange_weak_relaxed(volatile f64* value, f64* expected, f64 desired)
{
return (f64) InterlockedCompareExchangeRelease((long *) value, (long) desired, (long) *expected);
return (f64) InterlockedCompareExchangeNoFence64((LONG64 *) value, (LONG64) desired, (LONG64) *expected);
}
inline
int32 atomic_compare_exchange_weak_relaxed(volatile int32* value, int32* expected, int32 desired)
{
return (int32) InterlockedCompareExchangeRelease((long *) value, desired, *expected);
return (int32) InterlockedCompareExchangeNoFence((long *) value, desired, *expected);
}
inline
int64 atomic_compare_exchange_weak_relaxed(volatile int64* value, int64* expected, int64 desired)
{
return (int64) InterlockedCompareExchangeRelease((long *) value, (long) desired, (long) *expected);
return (int64) InterlockedCompareExchangeNoFence64((LONG64 *) value, (LONG64) desired, (LONG64) *expected);
}
inline
int32 atomic_fetch_add_relaxed(volatile int32* value, int32 operand)
{
return (int32) InterlockedExchangeAddRelease((long *) value, operand);
return (int32) InterlockedExchangeAddNoFence((long *) value, operand);
}
inline
@ -200,115 +201,115 @@ int32 atomic_fetch_sub_relaxed(volatile int32* value, int32 operand)
inline
int64 atomic_fetch_add_relaxed(volatile int64* value, int64 operand)
{
return (int64) InterlockedExchangeAddRelease((long *) value, (long) operand);
return (int64) InterlockedExchangeAddNoFence64((LONG64 *) value, (LONG64) operand);
}
inline
int64 atomic_fetch_sub_relaxed(volatile int64* value, int64 operand)
{
return (int64) InterlockedExchangeSubtract((unsigned long *) value, (long) operand);
return (int64) InterlockedExchangeAdd64((LONG64 *) value, -((LONG64) operand));
}
inline
void atomic_set_relaxed(volatile uint32* value, uint32 new_value)
{
InterlockedExchangeAcquire((long *) value, new_value);
InterlockedExchangeNoFence((long *) value, new_value);
}
inline
void atomic_set_relaxed(volatile uint64* value, uint64 new_value)
{
InterlockedExchangeAcquire((long *) value, (long) new_value);
InterlockedExchangeNoFence64((LONG64 *) value, (LONG64) new_value);
}
inline
uint32 atomic_fetch_set_relaxed(volatile uint32* value, uint32 new_value)
{
return (uint32) InterlockedExchangeAcquire((long *) value, new_value);
return (uint32) InterlockedExchangeNoFence((long *) value, new_value);
}
inline
uint64 atomic_fetch_set_relaxed(volatile uint64* value, uint64 new_value)
{
return (uint64) InterlockedExchangeAcquire((long *) value, (long) new_value);
return (uint64) InterlockedExchangeNoFence64((LONG64 *) value, (LONG64) new_value);
}
inline
uint32 atomic_get_relaxed(volatile uint32* value)
{
return (uint32) InterlockedCompareExchangeAcquire((long *) value, 0, 0);
return (uint32) InterlockedCompareExchangeNoFence((long *) value, 0, 0);
}
inline
uint64 atomic_get_relaxed(volatile uint64* value)
{
return (uint64) InterlockedCompareExchangeAcquire((long *) value, 0, 0);
return (uint64) InterlockedCompareExchangeNoFence64((LONG64 *) value, 0, 0);
}
inline
void atomic_increment_relaxed(volatile uint32* value)
{
InterlockedIncrementRelease((long *) value);
InterlockedIncrementNoFence((long *) value);
}
inline
void atomic_decrement_relaxed(volatile uint32* value)
{
InterlockedDecrementRelease((long *) value);
InterlockedDecrementNoFence((long *) value);
}
inline
void atomic_increment_relaxed(volatile uint64* value)
{
InterlockedIncrementRelease((long *) value);
InterlockedIncrementNoFence64((LONG64 *) value);
}
inline
void atomic_decrement_relaxed(volatile uint64* value)
{
InterlockedDecrementRelease((long *) value);
InterlockedDecrementNoFence64((LONG64 *) value);
}
inline
void atomic_add_relaxed(volatile uint32* value, uint32 increment)
{
InterlockedAddRelease((long *) value, increment);
InterlockedAddNoFence((long *) value, increment);
}
inline
void atomic_sub_relaxed(volatile uint32* value, uint32 decrement)
{
InterlockedAddRelease((long *) value, -1 * ((int32) decrement));
InterlockedAddNoFence((long *) value, -1 * ((int32) decrement));
}
inline
void atomic_add_relaxed(volatile uint64* value, uint64 increment)
{
InterlockedAddRelease((long *) value, (long) increment);
InterlockedAddNoFence64((LONG64 *) value, (LONG64) increment);
}
inline
void atomic_sub_relaxed(volatile uint64* value, uint64 decrement)
{
InterlockedAddRelease((long *) value, -1 * ((long) decrement));
InterlockedAddNoFence64((LONG64 *) value, -((LONG64) decrement));
}
inline
uint32 atomic_compare_exchange_weak_relaxed(volatile uint32* value, uint32* expected, uint32 desired)
{
return (uint32) InterlockedCompareExchangeAcquire((long *) value, desired, *expected);
return (uint32) InterlockedCompareExchangeNoFence((long *) value, desired, *expected);
}
inline
uint64 atomic_compare_exchange_weak_relaxed(volatile uint64* value, uint64* expected, uint64 desired)
{
return (uint64) InterlockedCompareExchangeAcquire((unsigned long long *) value, (unsigned long long) desired, (unsigned long long) *expected);
return (uint64) InterlockedCompareExchangeNoFence64((LONG64 *) value, (LONG64) desired, (LONG64) *expected);
}
inline
uint32 atomic_fetch_add_relaxed(volatile uint32* value, uint32 operand)
{
return (uint32) InterlockedExchangeAddRelease((long *) value, operand);
return (uint32) InterlockedExchangeAddNoFence((long *) value, operand);
}
inline
@ -320,61 +321,61 @@ uint32 atomic_fetch_sub_relaxed(volatile uint32* value, uint32 operand)
inline
uint64 atomic_fetch_add_relaxed(volatile uint64* value, uint64 operand)
{
return (uint64) InterlockedExchangeAddRelease((long *) value, (long) operand);
return (uint64) InterlockedExchangeAddNoFence64((LONG64 *) value, (LONG64) operand);
}
inline
uint64 atomic_fetch_sub_relaxed(volatile uint64* value, uint64 operand)
{
return (uint64) InterlockedExchangeSubtract((unsigned long *) value, (long) operand);
return (uint64) InterlockedExchangeAdd64((LONG64 *) value, -((LONG64) operand));
}
inline
void atomic_and_relaxed(volatile uint32* value, uint32 mask)
{
InterlockedAndRelease((volatile LONG *) value, mask);
InterlockedAndNoFence((volatile LONG *) value, mask);
}
inline
void atomic_and_relaxed(volatile int32* value, int32 mask)
{
InterlockedAndRelease((volatile LONG *) value, (LONG)mask);
InterlockedAndNoFence((volatile LONG *) value, (LONG)mask);
}
inline
void atomic_and_relaxed(volatile uint64* value, uint64 mask)
{
InterlockedAnd64Release((volatile LONG64 *) value, mask);
InterlockedAnd64NoFence((volatile LONG64 *) value, mask);
}
inline
void atomic_and_relaxed(volatile int64* value, int64 mask)
{
InterlockedAnd64Release((volatile LONG64 *) value, mask);
InterlockedAnd64NoFence((volatile LONG64 *) value, mask);
}
inline
void atomic_or_relaxed(volatile uint32* value, uint32 mask)
{
InterlockedOrRelease((volatile LONG *) value, mask);
InterlockedOrNoFence((volatile LONG *) value, mask);
}
inline
void atomic_or_relaxed(volatile int32* value, int32 mask)
{
InterlockedOrRelease((volatile LONG *) value, (LONG)mask);
InterlockedOrNoFence((volatile LONG *) value, (LONG)mask);
}
inline
void atomic_or_relaxed(volatile uint64* value, uint64 mask)
{
InterlockedOr64Release((volatile LONG64 *) value, mask);
InterlockedOr64NoFence((volatile LONG64 *) value, mask);
}
inline
void atomic_or_relaxed(volatile int64* value, int64 mask)
{
InterlockedOr64Release((volatile LONG64 *) value, mask);
InterlockedOr64NoFence((volatile LONG64 *) value, mask);
}
inline
@ -398,7 +399,7 @@ void atomic_set_acquire(volatile int32* value, int32 new_value)
inline
void atomic_set_acquire(volatile int64* value, int64 new_value)
{
InterlockedExchangeAcquire((long *) value, (long) new_value);
InterlockedExchangeAcquire64((LONG64 *) value, (LONG64) new_value);
}
inline
@ -410,7 +411,7 @@ void atomic_set_acquire(volatile f32* value, f32 new_value)
inline
void atomic_set_acquire(volatile f64* value, f64 new_value)
{
InterlockedExchangeAcquire((long *) value, (long) new_value);
InterlockedExchangeAcquire64((LONG64 *) value, (LONG64) new_value);
}
inline
@ -422,7 +423,7 @@ int32 atomic_fetch_set_acquire(volatile int32* value, int32 new_value)
inline
int64 atomic_fetch_set_acquire(volatile int64* value, int64 new_value)
{
return (int64) InterlockedExchangeAcquire((long *) value, (long) new_value);
return (int64) InterlockedExchangeAcquire64((LONG64 *) value, (LONG64) new_value);
}
inline
@ -467,7 +468,7 @@ f32 atomic_get_acquire(volatile f32* value)
inline
f64 atomic_get_acquire(volatile f64* value)
{
return (f64) InterlockedCompareExchangeAcquire((long *) value, 0, 0);
return (f64) InterlockedCompareExchangeAcquire64((LONG64 *) value, 0, 0);
}
inline
@ -533,7 +534,7 @@ f32 atomic_compare_exchange_weak_acquire(volatile f32* value, f32* expected, f32
inline
f64 atomic_compare_exchange_weak_acquire(volatile f64* value, f64* expected, f64 desired)
{
return (f64) InterlockedCompareExchangeAcquire((long *) value, (long) desired, (long) *expected);
return (f64) InterlockedCompareExchangeAcquire64((LONG64 *) value, (LONG64) desired, (LONG64) *expected);
}
inline
@ -545,7 +546,7 @@ int32 atomic_compare_exchange_weak_acquire(volatile int32* value, int32* expecte
inline
int64 atomic_compare_exchange_weak_acquire(volatile int64* value, int64* expected, int64 desired)
{
return (int64) InterlockedCompareExchangeAcquire((long *) value, (long) desired, (long) *expected);
return (int64) InterlockedCompareExchangeAcquire64((LONG64 *) value, (LONG64) desired, (LONG64) *expected);
}
inline
@ -563,13 +564,13 @@ int32 atomic_fetch_sub_acquire(volatile int32* value, int32 operand)
inline
int64 atomic_fetch_add_acquire(volatile int64* value, int64 operand)
{
return (int64) InterlockedExchangeSubtract((unsigned long *) value, operand);
return (int64) InterlockedExchangeAddAcquire64((LONG64 *) value, (LONG64) operand);
}
inline
int64 atomic_fetch_sub_acquire(volatile int64* value, int64 operand)
{
return (int64) InterlockedExchangeSubtract((unsigned long *) value, (long) operand);
return (int64) InterlockedExchangeAdd64((LONG64 *) value, -((LONG64) operand));
}
inline
@ -581,7 +582,7 @@ void atomic_set_acquire(volatile uint32* value, uint32 new_value)
inline
void atomic_set_acquire(volatile uint64* value, uint64 new_value)
{
InterlockedExchangeAcquire((long *) value, (long) new_value);
InterlockedExchangeAcquire64((LONG64 *) value, (LONG64) new_value);
}
inline
@ -593,7 +594,7 @@ uint32 atomic_fetch_set_acquire(volatile uint32* value, uint32 new_value)
inline
uint64 atomic_fetch_set_acquire(volatile uint64* value, uint64 new_value)
{
return (uint64) InterlockedExchangeAcquire((long *) value, (long) new_value);
return (uint64) InterlockedExchangeAcquire64((LONG64 *) value, (LONG64) new_value);
}
inline
@ -605,7 +606,7 @@ uint32 atomic_get_acquire(volatile uint32* value)
inline
uint64 atomic_get_acquire(volatile uint64* value)
{
return (uint64) InterlockedCompareExchangeAcquire((long *) value, 0, 0);
return (uint64) InterlockedCompareExchangeAcquire64((LONG64 *) value, 0, 0);
}
inline
@ -623,13 +624,13 @@ void atomic_decrement_acquire(volatile uint32* value)
inline
void atomic_increment_acquire(volatile uint64* value)
{
InterlockedIncrementAcquire((long *) value);
InterlockedIncrementAcquire64((LONG64 *) value);
}
inline
void atomic_decrement_acquire(volatile uint64* value)
{
InterlockedDecrementAcquire((long *) value);
InterlockedDecrementAcquire64((LONG64 *) value);
}
inline
@ -647,13 +648,13 @@ void atomic_sub_acquire(volatile uint32* value, uint32 decrement)
inline
void atomic_add_acquire(volatile uint64* value, uint64 increment)
{
InterlockedAddAcquire((long *) value, (long) increment);
InterlockedAddAcquire64((LONG64 *) value, (LONG64) increment);
}
inline
void atomic_sub_acquire(volatile uint64* value, uint64 decrement)
{
InterlockedAddAcquire((long *) value, -1 * ((long) decrement));
InterlockedAddAcquire64((LONG64 *) value, -((LONG64) decrement));
}
inline
@ -665,7 +666,7 @@ uint32 atomic_compare_exchange_weak_acquire(volatile uint32* value, uint32* expe
inline
uint64 atomic_compare_exchange_weak_acquire(volatile uint64* value, uint64* expected, uint64 desired)
{
return (uint64) InterlockedCompareExchangeAcquire((unsigned long long *) value, (unsigned long long) desired, (unsigned long long) *expected);
return (uint64) InterlockedCompareExchangeAcquire64((LONG64 *) value, (LONG64) desired, (LONG64) *expected);
}
inline
@ -683,13 +684,13 @@ uint32 atomic_fetch_sub_acquire(volatile uint32* value, uint32 operand)
inline
uint64 atomic_fetch_add_acquire(volatile uint64* value, uint64 operand)
{
return (uint64) InterlockedExchangeAddAcquire((long *) value, (long) operand);
return (uint64) InterlockedExchangeAddAcquire64((LONG64 *) value, (LONG64) operand);
}
inline
uint64 atomic_fetch_sub_acquire(volatile uint64* value, uint64 operand)
{
return (uint64) InterlockedExchangeSubtract((unsigned long *) value, (long) operand);
return (uint64) InterlockedExchangeAdd64((LONG64 *) value, -((LONG64) operand));
}
inline
@ -761,7 +762,7 @@ void atomic_set_release(volatile int32* value, int32 new_value)
inline
void atomic_set_release(volatile int64* value, int64 new_value)
{
InterlockedExchange((long *) value, (long) new_value);
InterlockedExchange64((LONG64 *) value, (LONG64) new_value);
}
inline
@ -773,7 +774,7 @@ void atomic_set_release(volatile f32* value, f32 new_value)
inline
void atomic_set_release(volatile f64* value, f64 new_value)
{
InterlockedExchange((long *) value, (long) new_value);
InterlockedExchange64((LONG64 *) value, (LONG64) new_value);
}
inline
@ -785,7 +786,7 @@ int32 atomic_fetch_set_release(volatile int32* value, int32 new_value)
inline
int64 atomic_fetch_set_release(volatile int64* value, int64 new_value)
{
return (int64) InterlockedExchange((long *) value, (long) new_value);
return (int64) InterlockedExchange64((LONG64 *) value, (LONG64) new_value);
}
inline
@ -830,7 +831,7 @@ f32 atomic_get_release(volatile f32* value)
inline
f64 atomic_get_release(volatile f64* value)
{
return (f64) InterlockedCompareExchangeRelease((long *) value, 0, 0);
return (f64) InterlockedCompareExchangeRelease64((LONG64 *) value, 0, 0);
}
inline
@ -854,13 +855,13 @@ void atomic_decrement_release(volatile int32* value)
inline
void atomic_increment_release(volatile int64* value)
{
InterlockedIncrementRelease((long *) value);
InterlockedIncrementRelease64((LONG64 *) value);
}
inline
void atomic_decrement_release(volatile int64* value)
{
InterlockedDecrementRelease((long *) value);
InterlockedDecrementRelease64((LONG64 *) value);
}
inline
@ -878,13 +879,13 @@ void atomic_sub_release(volatile int32* value, int32 decrement)
inline
void atomic_add_release(volatile int64* value, int64 increment)
{
InterlockedAddRelease((long *) value, (long) increment);
InterlockedAddRelease64((LONG64 *) value, (LONG64) increment);
}
inline
void atomic_sub_release(volatile int64* value, int64 decrement)
{
InterlockedAddRelease((long *) value, -1 * ((long) decrement));
InterlockedAddRelease64((LONG64 *) value, -((LONG64) decrement));
}
inline
@ -896,7 +897,7 @@ f32 atomic_compare_exchange_weak_release(volatile f32* value, f32* expected, f32
inline
f64 atomic_compare_exchange_weak_release(volatile f64* value, f64* expected, f64 desired)
{
return (f64) InterlockedCompareExchangeRelease((long *) value, (long) desired, (long) *expected);
return (f64) InterlockedCompareExchangeRelease64((LONG64 *) value, (LONG64) desired, (LONG64) *expected);
}
inline
@ -908,7 +909,7 @@ int32 atomic_compare_exchange_weak_release(volatile int32* value, int32* expecte
inline
int64 atomic_compare_exchange_weak_release(volatile int64* value, int64* expected, int64 desired)
{
return (int64) InterlockedCompareExchangeRelease((long *) value, (long) desired, (long) *expected);
return (int64) InterlockedCompareExchangeRelease64((LONG64 *) value, (LONG64) desired, (LONG64) *expected);
}
inline
@ -926,13 +927,13 @@ int32 atomic_fetch_sub_release(volatile int32* value, int32 operand)
inline
int64 atomic_fetch_add_release(volatile int64* value, int64 operand)
{
return (int64) InterlockedExchangeSubtract((unsigned long *) value, operand);
return (int64) InterlockedExchangeAddRelease64((LONG64 *) value, (LONG64) operand);
}
inline
int64 atomic_fetch_sub_release(volatile int64* value, int64 operand)
{
return (int64) InterlockedExchangeSubtract((unsigned long *) value, (long) operand);
return (int64) InterlockedExchangeAdd64((LONG64 *) value, -((LONG64) operand));
}
inline
@ -944,7 +945,7 @@ void atomic_set_release(volatile uint32* value, uint32 new_value)
inline
void atomic_set_release(volatile uint64* value, uint64 new_value)
{
InterlockedExchange((long *) value, (long) new_value);
InterlockedExchange64((LONG64 *) value, (LONG64) new_value);
}
inline
@ -956,7 +957,7 @@ uint32 atomic_fetch_set_release(volatile uint32* value, uint32 new_value)
inline
uint64 atomic_fetch_set_release(volatile uint64* value, uint64 new_value)
{
return (uint64) InterlockedExchange((long *) value, (long) new_value);
return (uint64) InterlockedExchange64((LONG64 *) value, (LONG64) new_value);
}
inline
@ -968,7 +969,7 @@ uint32 atomic_get_release(volatile uint32* value)
inline
uint64 atomic_get_release(volatile uint64* value)
{
return (uint64) InterlockedCompareExchangeRelease((long *) value, 0, 0);
return (uint64) InterlockedCompareExchangeRelease64((LONG64 *) value, 0, 0);
}
inline
@ -986,13 +987,13 @@ void atomic_decrement_release(volatile uint32* value)
inline
void atomic_increment_release(volatile uint64* value)
{
InterlockedIncrementRelease((long *) value);
InterlockedIncrementRelease64((LONG64 *) value);
}
inline
void atomic_decrement_release(volatile uint64* value)
{
InterlockedDecrementRelease((long *) value);
InterlockedDecrementRelease64((LONG64 *) value);
}
inline
@ -1010,13 +1011,13 @@ void atomic_sub_release(volatile uint32* value, uint32 decrement)
inline
void atomic_add_release(volatile uint64* value, uint64 increment)
{
InterlockedAddRelease((long *) value, (long) increment);
InterlockedAddRelease64((LONG64 *) value, (LONG64) increment);
}
inline
void atomic_sub_release(volatile uint64* value, uint64 decrement)
{
InterlockedAddRelease((long *) value, -1 * ((long) decrement));
InterlockedAddRelease64((LONG64 *) value, -((LONG64) decrement));
}
inline
@ -1028,7 +1029,7 @@ uint32 atomic_compare_exchange_weak_release(volatile uint32* value, uint32* expe
inline
uint64 atomic_compare_exchange_weak_release(volatile uint64* value, uint64* expected, uint64 desired)
{
return (uint64) InterlockedCompareExchangeRelease((unsigned long long *) value, (unsigned long long) desired, (unsigned long long) *expected);
return (uint64) InterlockedCompareExchangeRelease64((LONG64 *) value, (LONG64) desired, (LONG64) *expected);
}
inline
@ -1046,13 +1047,13 @@ uint32 atomic_fetch_sub_release(volatile uint32* value, uint32 operand)
inline
uint64 atomic_fetch_add_release(volatile uint64* value, uint64 operand)
{
return (uint64) InterlockedExchangeAddRelease((long *) value, (long) operand);
return (uint64) InterlockedExchangeAddRelease64((LONG64 *) value, (LONG64) operand);
}
inline
uint64 atomic_fetch_sub_release(volatile uint64* value, uint64 operand)
{
return (uint64) InterlockedExchangeSubtract((unsigned long *) value, (long) operand);
return (uint64) InterlockedExchangeAdd64((LONG64 *) value, -((LONG64) operand));
}
inline
@ -1124,7 +1125,7 @@ void atomic_set_acquire_release(volatile int32* value, int32 new_value)
inline
void atomic_set_acquire_release(volatile int64* value, int64 new_value)
{
InterlockedExchange((long *) value, (long) new_value);
InterlockedExchange64((LONG64 *) value, (LONG64) new_value);
}
inline
@ -1136,7 +1137,7 @@ void atomic_set_acquire_release(volatile f32* value, f32 new_value)
inline
void atomic_set_acquire_release(volatile f64* value, f64 new_value)
{
InterlockedExchange((long *) value, (long) new_value);
InterlockedExchange64((LONG64 *) value, (LONG64) new_value);
}
inline
@ -1148,7 +1149,7 @@ int32 atomic_fetch_set_acquire_release(volatile int32* value, int32 new_value)
inline
int64 atomic_fetch_set_acquire_release(volatile int64* value, int64 new_value)
{
return (int64) InterlockedExchange((long *) value, (long) new_value);
return (int64) InterlockedExchange64((LONG64 *) value, (LONG64) new_value);
}
inline
@ -1181,7 +1182,7 @@ int32 atomic_get_acquire_release(volatile int32* value)
inline
int64 atomic_get_acquire_release(volatile int64* value)
{
return (int64) InterlockedCompareExchange((long *) value, 0, 0);
return (int64) InterlockedCompareExchange64((LONG64 *) value, 0, 0);
}
inline
@ -1193,7 +1194,7 @@ f32 atomic_get_acquire_release(volatile f32* value)
inline
f64 atomic_get_acquire_release(volatile f64* value)
{
return (f64) InterlockedCompareExchange((long *) value, 0, 0);
return (f64) InterlockedCompareExchange64((LONG64 *) value, 0, 0);
}
inline
@ -1217,13 +1218,13 @@ void atomic_decrement_acquire_release(volatile int32* value)
inline
void atomic_increment_acquire_release(volatile int64* value)
{
InterlockedIncrement((long *) value);
InterlockedIncrement64((LONG64 *) value);
}
inline
void atomic_decrement_acquire_release(volatile int64* value)
{
InterlockedDecrement((long *) value);
InterlockedDecrement64((LONG64 *) value);
}
inline
@ -1241,13 +1242,13 @@ void atomic_sub_acquire_release(volatile int32* value, int32 decrement)
inline
void atomic_add_acquire_release(volatile int64* value, int64 increment)
{
InterlockedAdd((long *) value, (long) increment);
InterlockedAdd64((LONG64 *) value, (LONG64) increment);
}
inline
void atomic_sub_acquire_release(volatile int64* value, int64 decrement)
{
InterlockedAdd((long *) value, -1 * ((long) decrement));
InterlockedAdd64((LONG64 *) value, -((LONG64) decrement));
}
inline
@ -1259,7 +1260,7 @@ f32 atomic_compare_exchange_weak_acquire_release(volatile f32* value, f32* expec
inline
f64 atomic_compare_exchange_weak_acquire_release(volatile f64* value, f64* expected, f64 desired)
{
return (f64) InterlockedCompareExchange((long *) value, (long) desired, (long) *expected);
return (f64) InterlockedCompareExchange64((LONG64 *) value, (LONG64) desired, (LONG64) *expected);
}
inline
@ -1271,7 +1272,7 @@ int32 atomic_compare_exchange_weak_acquire_release(volatile int32* value, int32*
inline
int64 atomic_compare_exchange_weak_acquire_release(volatile int64* value, int64* expected, int64 desired)
{
return (int64) InterlockedCompareExchange((long *) value, (long) desired, (long) *expected);
return (int64) InterlockedCompareExchange64((LONG64 *) value, (LONG64) desired, (LONG64) *expected);
}
inline
@ -1291,7 +1292,7 @@ int32 atomic_fetch_sub_acquire_release(volatile int32* value, int32 operand)
inline
int64 atomic_fetch_add_acquire_release(volatile int64* value, int64 operand)
{
int64 ret = (int64) InterlockedExchangeSubtract((unsigned long *) value, operand);
int64 ret = (int64) InterlockedExchangeAdd64((LONG64 *) value, (LONG64) operand);
return ret;
}
@ -1299,7 +1300,7 @@ int64 atomic_fetch_add_acquire_release(volatile int64* value, int64 operand)
inline
int64 atomic_fetch_sub_acquire_release(volatile int64* value, int64 operand)
{
int64 ret = (int64) InterlockedExchangeSubtract((unsigned long *) value, (long) operand);
int64 ret = (int64) InterlockedExchangeAdd64((LONG64 *) value, -((LONG64) operand));
return ret;
}
@ -1325,7 +1326,7 @@ uint32 atomic_fetch_set_acquire_release(volatile uint32* value, uint32 new_value
inline
uint64 atomic_fetch_set_acquire_release(volatile uint64* value, uint64 new_value)
{
return (uint64) InterlockedExchange((long *) value, (long) new_value);
return (uint64) InterlockedExchange64((LONG64 *) value, (LONG64) new_value);
}
inline
@ -1337,7 +1338,7 @@ uint32 atomic_get_acquire_release(volatile uint32* value)
inline
uint64 atomic_get_acquire_release(volatile uint64* value)
{
return (uint64) InterlockedCompareExchange((long *) value, 0, 0);
return (uint64) InterlockedCompareExchange64((LONG64 *) value, 0, 0);
}
inline
@ -1355,13 +1356,13 @@ void atomic_decrement_acquire_release(volatile uint32* value)
inline
void atomic_increment_acquire_release(volatile uint64* value)
{
InterlockedIncrement((long *) value);
InterlockedIncrement64((LONG64 *) value);
}
inline
void atomic_decrement_acquire_release(volatile uint64* value)
{
InterlockedDecrement((long *) value);
InterlockedDecrement64((LONG64 *) value);
}
inline
@ -1379,13 +1380,13 @@ void atomic_sub_acquire_release(volatile uint32* value, uint32 decrement)
inline
void atomic_add_acquire_release(volatile uint64* value, uint64 increment)
{
InterlockedAdd((long *) value, (long) increment);
InterlockedAdd64((LONG64 *) value, (LONG64) increment);
}
inline
void atomic_sub_acquire_release(volatile uint64* value, uint64 decrement)
{
InterlockedAdd((long *) value, -1 * ((long) decrement));
InterlockedAdd64((LONG64 *) value, -((LONG64) decrement));
}
inline
@ -1397,7 +1398,7 @@ uint32 atomic_compare_exchange_weak_acquire_release(volatile uint32* value, uint
inline
uint64 atomic_compare_exchange_weak_acquire_release(volatile uint64* value, uint64* expected, uint64 desired)
{
return (uint64) InterlockedCompareExchange((unsigned long long *) value, (unsigned long long) desired, (unsigned long long) *expected);
return (uint64) InterlockedCompareExchange64((LONG64 *) value, (LONG64) desired, (LONG64) *expected);
}
inline
@ -1417,13 +1418,13 @@ uint32 atomic_fetch_sub_acquire_release(volatile uint32* value, uint32 operand)
inline
uint64 atomic_fetch_add_acquire_release(volatile uint64* value, uint64 operand)
{
return (uint64) InterlockedExchangeAdd((long *) value, (long) operand);
return (uint64) InterlockedExchangeAdd64((LONG64 *) value, (LONG64) operand);
}
inline
uint64 atomic_fetch_sub_acquire_release(volatile uint64* value, uint64 operand)
{
uint64 ret = (uint64) InterlockedExchangeSubtract((unsigned long *) value, (long) operand);
uint64 ret = (uint64) InterlockedExchangeAdd64((LONG64 *) value, -((LONG64) operand));
return ret;
}
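For reference, a minimal sketch of how the relaxed vs. acquire/release wrappers above are typically paired; the names (g_data_ready, g_payload, g_frame_counter) are hypothetical and not part of this codebase:
// Producer: write the payload first, then publish the flag with release semantics.
static volatile uint32 g_data_ready = 0;
static uint64 g_payload = 0;
static volatile uint32 g_frame_counter = 0;
void producer_sketch() {
    g_payload = 42;
    atomic_set_release(&g_data_ready, 1);
}
// Consumer: the acquire load pairs with the release store above,
// so once the flag reads 1 the payload write is guaranteed to be visible.
void consumer_sketch() {
    if (atomic_get_acquire(&g_data_ready)) {
        uint64 local = g_payload;
        (void) local;
    }
    // Pure statistics counters only need the relaxed variants.
    atomic_increment_relaxed(&g_frame_counter);
}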

View File

@ -376,7 +376,7 @@ int64 hashmap_dump(const HashMap* hm, byte* data)
// Dump the hash map content into the buffer memory
int32 free_index = 0;
int32 bit_index = 0;
for (int32 i = 0; i < hm->buf.count; ++i) {
for (uint32 i = 0; i < hm->buf.count; ++i) {
if ((hm->buf.free[free_index] & (1ULL << bit_index)) > 0) {
HashEntry* entry = (HashEntry *) chunk_get_element((ChunkMemory *) &hm->buf, i);
@ -434,7 +434,7 @@ int64 hashmap_load(HashMap* hm, const byte* data)
data += sizeof(uint64);
// Load the table content
for (int i = 0; i < count; ++i) {
for (uint32 i = 0; i < count; ++i) {
uint64 offset = SWAP_ENDIAN_LITTLE(*((uint64 *) data));
data += sizeof(offset);
@ -455,7 +455,7 @@ int64 hashmap_load(HashMap* hm, const byte* data)
// Switch endian AND turn offsets to pointers
int32 free_index = 0;
int32 bit_index = 0;
for (int32 i = 0; i < hm->buf.count; ++i) {
for (uint32 i = 0; i < hm->buf.count; ++i) {
if ((hm->buf.free[free_index] & (1ULL << bit_index)) > 0) {
HashEntry* entry = (HashEntry *) chunk_get_element((ChunkMemory *) &hm->buf, i);
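The two loops above walk the chunk buffer together with its free bitmap; a minimal sketch of the per-element cursor advance they rely on (assuming 64-bit bitmap words, as the 1ULL shift suggests; illustrative only, not the code from this file):
bit_index++;
if (bit_index > 63) {
    bit_index = 0;
    free_index++;
}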

View File

@ -109,7 +109,6 @@ struct v4_byte {
};
};
struct v2_int32 {
union {
struct {

View File

@ -438,7 +438,8 @@ inline int16_16 operator<=(int16_16 a, int16_16 b)
inline int16_32 operator<=(int16_32 a, int16_32 b)
{
int16_32 simd;
simd.s = _mm512_mask_blend_epi16(_mm512_knot(_mm512_cmpgt_epi16_mask(b.s, a.s)), b.s, a.s);
__mmask32 mask = _mm512_cmp_epi16_mask(a.s, b.s, _MM_CMPINT_LE);
simd.s = _mm512_mask_blend_epi16(mask, b.s, a.s);
return simd;
}
@ -716,25 +717,19 @@ inline int16_32 clamp(int16_32 min_value, int16_32 a, int16_32 max_value)
return simd_min(simd_max(a, min_value), max_value);
}
inline int16 which_true(int16_8 a)
inline int32 which_true(int16_8 a)
{
int16 which_true = _mm_movemask_epi8(a.s);
return which_true;
return _mm_movemask_epi8(a.s);
}
inline int16 which_true(int16_16 a)
inline int32 which_true(int16_16 a)
{
int16 which_true = _mm256_movemask_epi8(a.s);
return which_true;
return _mm256_movemask_epi8(a.s);
}
inline int16 which_true(int16_32 a)
inline int32 which_true(int16_32 a)
{
int16 which_true = _mm512_movepi16_mask(a.s);
return which_true;
return _mm512_movepi16_mask(a.s);
}
inline bool any_true(int16_8 a)

View File

@ -26,6 +26,8 @@
#define ROUND_TO_NEAREST(a, b) (((a) + ((b) - 1)) & ~((b) - 1))
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
#define OMS_CEIL(x) ((x) == (int)(x) ? (int)(x) : ((x) > 0 ? (int)(x) + 1 : (int)(x)))
#define OMS_ROUND(x) (((x) >= 0) ? ((int)((x) + 0.5f)) : ((int)((x) - 0.5f)))
#define OMS_ROUND_POSITIVE(x) ((int)((x) + 0.5f))
// Modulo function when b is a power of 2
#define MODULO_2(a, b) ((a) & (b - 1))
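A few worked values for the macros above, for reference (illustrative only):
// ROUND_TO_NEAREST(13, 8)  -> (13 + 7) & ~7 == 16
// CEIL_DIV(13, 8)          -> (13 + 7) / 8  == 2
// OMS_CEIL(2.3)            -> 3,  OMS_CEIL(-2.3)   -> -2
// OMS_ROUND(2.6f)          -> 3,  OMS_ROUND(-2.6f) -> -3
// OMS_ROUND_POSITIVE(2.6f) -> 3
// MODULO_2(13, 8)          -> 13 & 7 == 5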

View File

@ -15,6 +15,7 @@
#include <ctype.h>
#include "../stdlib/Types.h"
#include "MathUtils.h"
inline
int32 utf8_encode(uint32 codepoint, char* out)
@ -821,9 +822,9 @@ void sprintf_fast(char *buffer, const char* format, ...) {
case 'f': {
f64 val = va_arg(args, f64);
int32 precision = 6; // Default precision
// Default precision
int32 precision = 5;
// @question Consider implementing rounding
// Check for optional precision specifier
const char* prec_ptr = ptr + 1;
if (*prec_ptr >= '0' && *prec_ptr <= '9') {
@ -841,6 +842,15 @@ void sprintf_fast(char *buffer, const char* format, ...) {
val = -val;
}
if (precision < 6) {
static const float powers_of_ten[] = {
1.0f, 10.0f, 100.0f, 1000.0f, 10000.0f, 100000.0f
};
f32 scale = powers_of_ten[precision];
val = OMS_ROUND_POSITIVE(val * scale) / scale;
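// e.g. precision = 2: 3.14159 * 100.0f = 314.159 -> OMS_ROUND_POSITIVE -> 314 -> / 100.0f -> 3.14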
}
// Handle integer part
int32 int_part = (int32) val;
f64 frac_part = val - int_part;
@ -896,7 +906,7 @@ void format_time_hh_mm_ss(char* time_str, int32 hours, int32 minutes, int32 secs
}
inline
void format_time_hh_mm_ss(char* time_str, int32 time) {
void format_time_hh_mm_ss(char* time_str, uint64 time) {
int32 hours = (time / 3600) % 24;
int32 minutes = (time / 60) % 60;
int32 secs = time % 60;
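// e.g. time = 3725 -> hours = 1, minutes = 2, secs = 5 (presumably rendered as "01:02:05")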
@ -915,7 +925,7 @@ void format_time_hh_mm(char* time_str, int32 hours, int32 minutes) {
}
inline
void format_time_hh_mm(char* time_str, int32 time) {
void format_time_hh_mm(char* time_str, uint64 time) {
int32 hours = (time / 3600) % 24;
int32 minutes = (time / 60) % 60;

View File

@ -18,6 +18,7 @@ struct FileBody {
};
// @question Do we want to make the size comparison a step variable?
inline
bool is_equal_aligned(const byte* region1, const byte* region2, uint64 size)
{
while (size > 4) {