From b916506f899412410c3d2cb4a7616f075db58780 Mon Sep 17 00:00:00 2001
From: Dennis Eichhorn <spl1nes.com@googlemail.com>
Date: Tue, 24 Dec 2024 02:57:40 +0100
Subject: [PATCH] sound kinda working again but choppy, probably an interval
 issue or sample index issue

---
 audio/Audio.cpp                    |  32 +-
 audio/AudioMixer.h                 |   7 +-
 audio/AudioSetting.h               |   3 +
 audio/Qoa.h                        | 436 ++++++++++++++++++++++++++
 audio/QoaSimd.h                    | 470 +++++++++++++++++++++++++++++
 image/Image.cpp                    |  41 +--
 image/Qoi.h                        | 411 +++++++++++++++----------
 platform/win32/FileUtils.cpp       |   1 +
 platform/win32/audio/DirectSound.h |   6 +-
 platform/win32/audio/XAudio2.h     |   4 -
 10 files changed, 1219 insertions(+), 192 deletions(-)
 create mode 100644 audio/Qoa.h
 create mode 100644 audio/QoaSimd.h

diff --git a/audio/Audio.cpp b/audio/Audio.cpp
index 48e7b38..13604e4 100644
--- a/audio/Audio.cpp
+++ b/audio/Audio.cpp
@@ -44,8 +44,11 @@ int32 audio_data_size(const Audio* audio)
     );
 }
 
-int32 audio_from_data(const byte* data, Audio* audio)
+inline
+uint32 audio_header_from_data(const byte* data, Audio* audio)
 {
+    const byte* start = data;
+
     audio->sample_rate = SWAP_ENDIAN_LITTLE(*((uint16 *) data));
     data += sizeof(audio->sample_rate);
 
@@ -60,14 +63,14 @@ int32 audio_from_data(const byte* data, Audio* audio)
     audio->size = SWAP_ENDIAN_LITTLE(*((uint32 *) data));
     data += sizeof(audio->size);
 
-    memcpy(audio->data, data, audio->size);
-    data += audio->size;
-
-    return audio_data_size(audio);
+    return (int32) (data - start);
 }
 
-int32 audio_to_data(const Audio* audio, byte* data)
+inline
+uint32 audio_header_to_data(const Audio* audio, byte* data)
 {
+    byte* start = data;
+
     *((uint16 *) data) = SWAP_ENDIAN_LITTLE(audio->sample_rate);
     data += sizeof(audio->sample_rate);
 
@@ -78,6 +81,23 @@ int32 audio_to_data(const Audio* audio, byte* data)
     *((uint32 *) data) = SWAP_ENDIAN_LITTLE(audio->size);
     data += sizeof(audio->size);
 
+    return (int32) (data - start);
+}
+
+uint32 audio_from_data(const byte* data, Audio* audio)
+{
+    // The header parser fills audio->size and reports how many bytes it consumed
+    const uint32 header_size = audio_header_from_data(data, audio);
+
+    memcpy(audio->data, data + header_size, audio->size);
+
+    return audio_data_size(audio);
+}
+
+uint32 audio_to_data(const Audio* audio, byte* data)
+{
+    data += audio_header_to_data(audio, data);
+
     memcpy(data, audio->data, audio->size);
     data += audio->size;
 
diff --git a/audio/AudioMixer.h b/audio/AudioMixer.h
index 89f79d5..f4496d8 100644
--- a/audio/AudioMixer.h
+++ b/audio/AudioMixer.h
@@ -69,7 +69,6 @@ struct AudioInstance {
 enum AudioMixerState {
     AUDIO_MIXER_STATE_UNINITIALIZED,
     AUDIO_MIXER_STATE_INACTIVE,
-    AUDIO_MIXER_STATE_SHOULD_PLAY,
     AUDIO_MIXER_STATE_ACTIVE,
 };
 
@@ -100,7 +99,7 @@ struct AudioMixer {
 };
 
 bool audio_mixer_is_active(AudioMixer* mixer) {
-    if (mixer->state_new == AUDIO_MIXER_STATE_ACTIVE
+    if (mixer->state_old == AUDIO_MIXER_STATE_ACTIVE
         && atomic_get_relaxed((int32 *) &mixer->state_new) == AUDIO_MIXER_STATE_ACTIVE
     ) {
         return true;
@@ -108,14 +107,14 @@ bool audio_mixer_is_active(AudioMixer* mixer) {
 
     AudioMixerState mixer_state;
     if ((mixer_state = (AudioMixerState) atomic_get_relaxed((int32 *) &mixer->state_new)) != mixer->state_old) {
-        if (mixer_state != AUDIO_MIXER_STATE_UNINITIALIZED) {
+        if (mixer->state_old == AUDIO_MIXER_STATE_UNINITIALIZED) {
             audio_load(
                 mixer->window,
                 &mixer->settings,
                 &mixer->api_setting
             );
 
-            mixer_state = AUDIO_MIXER_STATE_INACTIVE;
+            mixer->state_old = AUDIO_MIXER_STATE_INACTIVE;
         }
 
         if (mixer_state == AUDIO_MIXER_STATE_ACTIVE) {
diff --git a/audio/AudioSetting.h b/audio/AudioSetting.h
index f1a3985..b761c8b 100644
--- a/audio/AudioSetting.h
+++ b/audio/AudioSetting.h
@@ -22,6 +22,9 @@ struct AudioSetting {
     // usually 48000 or 44100
     uint16 sample_rate;
 
+    // This sample index is used to calculate the position in a ring buffer
+    uint16 sample_index;
+
     // bytes per bloc
     // channel count * bit
     // usually 2 * 16 = 4
diff --git a/audio/Qoa.h b/audio/Qoa.h
new file mode 100644
index 0000000..496e913
--- /dev/null
+++ b/audio/Qoa.h
@@ -0,0 +1,436 @@
+/**
+ * Jingga
+ *
+ * @copyright 2023, Dominic Szablewski - https://phoboslab.org
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef TOS_AUDIO_QOA_H
+#define TOS_AUDIO_QOA_H
+
+#include "../stdlib/Types.h"
+#include "../utils/EndianUtils.h"
+#include "../audio/Audio.cpp"
+
+#define QOA_SLICE_LEN 20
+#define QOA_SLICES_PER_FRAME 256
+#define QOA_FRAME_LEN (QOA_SLICES_PER_FRAME * QOA_SLICE_LEN)
+#define QOA_LMS_LEN 4
+#define QOA_MAX_CHANNELS 8
+
+#define QOA_FRAME_SIZE(channels, slices) (4 + QOA_LMS_LEN * 4 * (channels) + 8 * (slices) * (channels))
+
+struct alignas(16) QoaLms { // State of the LMS predictor for one channel
+	int32 history[QOA_LMS_LEN]; // most recent reconstructed samples, newest at the last index
+	int32 weights[QOA_LMS_LEN]; // weights[i] is multiplied with history[i] for the prediction
+};
+
+/*
+The quant_tab provides an index into the dequant_tab for scaled residuals in
+the range of -8 .. 8; it is indexed with (clamped residual + 8). It maps this
+range to just 3 bits and becomes less accurate at the higher end. Note that the
+residual zero is identical to the lowest positive value. This is mostly fine,
+since the qoa_div() function always rounds away from zero.
+*/
+static const int32 qoa_quant_tab[17] = {
+	7, 7, 7, 5, 5, 3, 3, 1, /* -8..-1 */
+	0,                      /*  0     */
+	0, 2, 2, 4, 4, 6, 6, 6  /*  1.. 8 */
+};
+
+/*
+We have 16 different scalefactors. Like the quantized residuals these become
+less accurate at the higher end. In theory, the highest scalefactor that we
+would need to encode the highest 16 bit residual is (2**16)/8 = 8192. However,
+we rely on the LMS filter to predict samples accurately enough that a maximum
+residual of one quarter of the 16 bit range is sufficient. I.e. with the
+scalefactor 2048 times the quant range of 8 we can encode residuals up to 2**14.
+
+The scalefactor values are computed as:
+scalefactor_tab[s] <- round(pow(s + 1, 2.75))
+*/
+static const int32 qoa_scalefactor_tab[16] = {
+	1, 7, 21, 45, 84, 138, 211, 304, 421, 562, 731, 928, 1157, 1419, 1715, 2048
+};
+
+
+/*
+The reciprocal_tab maps each of the 16 scalefactors to its rounded reciprocal
+1/scalefactor. This allows qoa_div() to calculate the scaled residuals in the
+encoder with just one multiplication instead of an expensive division. We do
+this in .16 fixed point with integers, instead of floats.
+
+The reciprocal_tab is computed as:
+reciprocal_tab[s] <- ((1<<16) + scalefactor_tab[s] - 1) / scalefactor_tab[s]
+*/
+static const int32 qoa_reciprocal_tab[16] = {
+	65536, 9363, 3121, 1457, 781, 475, 311, 216, 156, 117, 90, 71, 57, 47, 39, 32
+};
+
+/*
+The dequant_tab maps each [scalefactor index][quantized residual] pair to the
+unscaled & dequantized residual.
+
+Since qoa_div rounds away from zero, the smallest entries are mapped to 3/4
+instead of 1. The dequant_tab assumes the following dequantized values for each
+of the quant_tab indices and is computed as:
+float dqt[8] = {0.75, -0.75, 2.5, -2.5, 4.5, -4.5, 7, -7};
+dequant_tab[s][q] <- round_ties_away_from_zero(scalefactor_tab[s] * dqt[q])
+
+The rounding employed here is "to nearest, ties away from zero", i.e. positive
+and negative values are treated symmetrically.
+*/
+static const int32 qoa_dequant_tab[16][8] = {
+	{   1,    -1,    3,    -3,    5,    -5,     7,     -7},
+	{   5,    -5,   18,   -18,   32,   -32,    49,    -49},
+	{  16,   -16,   53,   -53,   95,   -95,   147,   -147},
+	{  34,   -34,  113,  -113,  203,  -203,   315,   -315},
+	{  63,   -63,  210,  -210,  378,  -378,   588,   -588},
+	{ 104,  -104,  345,  -345,  621,  -621,   966,   -966},
+	{ 158,  -158,  528,  -528,  950,  -950,  1477,  -1477},
+	{ 228,  -228,  760,  -760, 1368, -1368,  2128,  -2128},
+	{ 316,  -316, 1053, -1053, 1895, -1895,  2947,  -2947},
+	{ 422,  -422, 1405, -1405, 2529, -2529,  3934,  -3934},
+	{ 548,  -548, 1828, -1828, 3290, -3290,  5117,  -5117},
+	{ 696,  -696, 2320, -2320, 4176, -4176,  6496,  -6496},
+	{ 868,  -868, 2893, -2893, 5207, -5207,  8099,  -8099},
+	{1064, -1064, 3548, -3548, 6386, -6386,  9933,  -9933},
+	{1286, -1286, 4288, -4288, 7718, -7718, 12005, -12005},
+	{1536, -1536, 5120, -5120, 9216, -9216, 14336, -14336},
+};
+
+
+/*
+The Least Mean Squares Filter is the heart of QOA. It predicts the next
+sample based on the previous 4 reconstructed samples. It does so by continuously
+adjusting 4 weights based on the residual of the previous prediction.
+
+The next sample is predicted as the sum of (weight[i] * history[i]).
+
+The adjustment of the weights is done with a "Sign-Sign-LMS" that adds or
+subtracts the residual to each weight, based on the corresponding sample from
+the history. This, surprisingly, is sufficient to get worthwhile predictions.
+
+This is all done with fixed point integers. Hence the right-shifts when updating
+the weights and calculating the prediction.
+*/
+// Returns the LMS prediction: sum of weights[i] * history[i], scaled down by the 13 bit fixed point fraction
+static inline
+int32 qoa_lms_predict(QoaLms* lms)
+{
+	const int32* weight = lms->weights;
+	const int32* past = lms->history;
+	int32 sum = 0;
+	for (int32 tap = 0; tap < QOA_LMS_LEN; ++tap) { sum += weight[tap] * past[tap]; }
+	// @performance Depending on context most likely SIMDable
+	return sum >> 13;
+}
+
+// Adjusts each weight exactly once by +-delta (sign of its history sample), then appends `sample` to the history.
+// @performance Depending on context most likely SIMDable
+static inline
+void qoa_lms_update(QoaLms* lms, int32 sample, int32 residual) {
+	int32 delta = residual >> 4;
+
+	for (int32 i = 0; i < QOA_LMS_LEN - 1; ++i) {
+		lms->weights[i] += lms->history[i] < 0 ? -delta : delta;
+		lms->history[i] = lms->history[i + 1];
+	}
+	lms->weights[QOA_LMS_LEN - 1] += lms->history[QOA_LMS_LEN - 1] < 0 ? -delta : delta;
+	lms->history[QOA_LMS_LEN - 1] = sample;
+}
+
+/*
+qoa_div() implements a rounding division, but avoids rounding to zero for
+small numbers. E.g. 0.1 will be rounded to 1. Note that 0 itself still
+returns as 0, which is handled in the qoa_quant_tab[].
+qoa_div() takes an index into the .16 fixed point qoa_reciprocal_tab as an
+argument, so it can do the division with a cheaper integer multiplication.
+*/
+static inline
+int32 qoa_div(int32 v, int32 scalefactor) {
+	// Multiply by the .16 fixed point reciprocal and round to nearest
+	int32 quotient = (v * qoa_reciprocal_tab[scalefactor] + (1 << 15)) >> 16;
+	int32 sign_v = (v > 0) - (v < 0);
+	int32 sign_q = (quotient > 0) - (quotient < 0);
+
+	// Moves a quotient that rounded to 0 one step away from zero if v itself is not 0
+	return quotient + sign_v - sign_q;
+}
+
+static inline
+int32 qoa_clamp(int32 v, int32 min, int32 max) {
+	// The lower bound is tested before the upper bound
+	int32 upper_limited = v > max ? max : v;
+	return v < min ? min : upper_limited;
+}
+
+/*
+This specialized clamp function for the signed 16 bit range improves decode
+performance quite a bit. The extra if() statement works nicely with the CPUs
+branch prediction as this branch is rarely taken.
+*/
+static inline
+int32 qoa_clamp_s16(int32 v) {
+	// Fast path: a single unsigned compare covers both bounds for in-range values
+	if ((uint32) (v + 32768) <= 65535) {
+		return v;
+	}
+
+	return v < -32768 ? -32768 : 32767;
+}
+
+// Encodes frame_samples samples per channel from the interleaved int16 sample_data into one frame.
+// Written to bytes: uint32 sample count, 8 bytes history + 8 bytes weights per channel, 8 bytes per slice.
+// lms holds one state per channel and is left at the state after the last encoded slice.
+// Returns the number of bytes written; no bounds check is done on bytes.
+uint32 qoa_encode_frame(const int16* sample_data, int32 channels, uint32 frame_samples, QoaLms* lms, byte* bytes)
+{
+    byte* start = bytes;
+
+	int32 prev_scalefactor[QOA_MAX_CHANNELS] = {0};
+
+	// Write the frame header (only the sample count of this frame)
+    *((uint32 *) bytes) = SWAP_ENDIAN_LITTLE(frame_samples);
+    bytes += sizeof(frame_samples);
+
+    // @performance SIMDable
+	for (uint32 c = 0; c < channels; ++c) {
+		// Write the current LMS state, entry 0 ends up in the most significant 16 bits
+		uint64 history = 0;
+		uint64 weights = 0;
+		for (int32 i = 0; i < QOA_LMS_LEN; ++i) {
+			history = (history << 16) | (lms[c].history[i] & 0xffff);
+			weights = (weights << 16) | (lms[c].weights[i] & 0xffff);
+		}
+
+        *((uint64 *) bytes) = SWAP_ENDIAN_LITTLE(history);
+        bytes += sizeof(history);
+
+        *((uint64 *) bytes) = SWAP_ENDIAN_LITTLE(weights);
+        bytes += sizeof(weights);
+	}
+
+	// We encode all samples with the channels interleaved on a slice level.
+	// E.g. for stereo: (ch 0, slice 0), (ch 1, slice 0), (ch 0, slice 1), ...
+	for (uint32 sample_index = 0; sample_index < frame_samples; sample_index += QOA_SLICE_LEN) {
+        // @performance SIMDable
+		for (uint32 c = 0; c < channels; ++c) {
+			int32 slice_len = qoa_clamp(QOA_SLICE_LEN, 0, frame_samples - sample_index);
+			int32 slice_start = sample_index * channels + c;
+			int32 slice_end = (sample_index + slice_len) * channels + c;
+
+			/*
+			Brute force search for the best scalefactor. Just go through all
+			16 scalefactors, encode all samples for the current slice and
+			measure the total squared error.
+			*/
+			uint64 best_rank = -1; // wraps to the largest uint64, so the first candidate always wins
+			uint64 best_slice = 0;
+			QoaLms best_lms;
+			int32 best_scalefactor = 0;
+
+			for (int32 sfi = 0; sfi < 16; ++sfi) {
+				/*
+				There is a strong correlation between the scalefactors of
+				neighboring slices. As an optimization, start testing
+				the best scalefactor of the previous slice first.
+				*/
+				int32 scalefactor = (sfi + prev_scalefactor[c]) % 16;
+
+				/*
+				We have to reset the LMS state to the last known good one
+				before trying each scalefactor, as each pass updates the LMS
+				state when encoding.
+				*/
+				QoaLms lms_temp = lms[c];
+				uint64 slice = scalefactor; // ends up in the top 4 bits once 20 x 3 bits are shifted in
+				uint64 current_rank = 0;
+
+				for (int32 si = slice_start; si < slice_end; si += channels) {
+					int32 sample = sample_data[si];
+					int32 predicted = qoa_lms_predict(&lms_temp);
+
+					int32 residual = sample - predicted;
+					int32 scaled = qoa_div(residual, scalefactor);
+					int32 clamped = qoa_clamp(scaled, -8, 8);
+					int32 quantized = qoa_quant_tab[clamped + 8];
+					int32 dequantized = qoa_dequant_tab[scalefactor][quantized];
+					int32 reconstructed = qoa_clamp_s16(predicted + dequantized);
+
+					// If the weights have grown too large, we introduce a penalty
+					// here. This prevents pops/clicks in certain problem cases
+					int32 weights_penalty = ((
+						lms_temp.weights[0] * lms_temp.weights[0]
+						+ lms_temp.weights[1] * lms_temp.weights[1]
+						+ lms_temp.weights[2] * lms_temp.weights[2]
+						+ lms_temp.weights[3] * lms_temp.weights[3]
+					) >> 18) - 0x8ff;
+
+					if (weights_penalty < 0) {
+						weights_penalty = 0;
+					}
+
+					int64 error = (sample - reconstructed);
+					uint64 error_sq = error * error;
+
+					current_rank += error_sq + weights_penalty * weights_penalty;
+					if (current_rank > best_rank) {
+						break;
+					}
+
+					qoa_lms_update(&lms_temp, reconstructed, dequantized);
+					slice = (slice << 3) | quantized;
+				}
+
+				if (current_rank < best_rank) {
+					best_rank = current_rank;
+					best_slice = slice;
+					best_lms = lms_temp;
+					best_scalefactor = scalefactor;
+				}
+			}
+
+			prev_scalefactor[c] = best_scalefactor;
+
+			lms[c] = best_lms;
+
+			/*
+			If this slice was shorter than QOA_SLICE_LEN, we have to left-
+			shift all encoded data, to ensure the rightmost bits are the empty
+			ones. This should only happen in the last frame of a file as all
+			slices are completely filled otherwise.
+			*/
+			best_slice <<= (QOA_SLICE_LEN - slice_len) * 3;
+
+            *((uint64 *) bytes) = SWAP_ENDIAN_LITTLE(best_slice);
+            bytes += sizeof(best_slice);
+		}
+	}
+
+	return (uint32) (bytes - start);
+}
+
+// Encodes all samples of audio into consecutive frames at data and returns the number of bytes written.
+// NOTE(review): only frames are written, no audio header, although qoa_decode() reads one first - confirm the caller adds it.
+uint32 qoa_encode(const Audio* audio, byte* data) {
+    byte* start = data;
+
+    // The LMS array below has a fixed size and the sample count math divides by these values
+    if (audio->channels < 1 || audio->channels > QOA_MAX_CHANNELS || audio->bloc_size < 1) {
+        return 0;
+    }
+
+    // NOTE(review): assumes channels * bloc_size is the byte size of one sample over all channels - confirm
+    uint32 sample_count = audio->size / (audio->channels * audio->bloc_size);
+
+    QoaLms lms[QOA_MAX_CHANNELS];
+    for (int32 i = 0; i < audio->channels; ++i) {
+        // Initial weights {0, 0, -1, 2} (13 bit fixed point) help with the prediction of the first few ms of a file
+        lms[i].weights[0] = 0;
+        lms[i].weights[1] = 0;
+        lms[i].weights[2] = -(1 << 13);
+        lms[i].weights[3] = (1 << 14);
+
+        // Explicitly set the history samples to 0, as we might have some garbage in there.
+        memset(lms[i].history, 0, QOA_LMS_LEN * sizeof(int32));
+    }
+
+	// Go through all frames; only the last one may hold fewer than QOA_FRAME_LEN samples
+	int32 frame_samples = QOA_FRAME_LEN;
+	for (uint32 sample_index = 0; sample_index < sample_count; sample_index += frame_samples) {
+		frame_samples = qoa_clamp(QOA_FRAME_LEN, 0, sample_count - sample_index);
+		data += qoa_encode_frame(
+            (int16 *) (audio->data + sample_index * audio->channels * audio->bloc_size),
+            audio->channels, frame_samples, lms, data
+        );
+	}
+
+	return (uint32) (data - start);
+}
+
+// Decodes one frame starting at bytes into interleaved 16 bit samples at sample_data.
+// The LMS state of every channel is loaded from the frame itself, lms only provides the storage.
+// Returns the number of encoded bytes consumed (NOT the number of bytes written to sample_data).
+uint32 qoa_decode_frame(const byte* bytes, int32 channels, QoaLms* lms, byte* sample_data)
+{
+    const byte* start = bytes;
+
+    // The encoder consumes int16 samples, so full 16 bit samples have to be stored here as well
+    int16* samples = (int16 *) sample_data;
+
+	// Read the frame header (sample count per channel)
+    uint32 frame_samples = SWAP_ENDIAN_LITTLE(*((uint32 *) bytes));
+    bytes += sizeof(frame_samples);
+
+	// Read the LMS state: 4 x 2 bytes history, 4 x 2 bytes weights per channel
+	for (uint32 c = 0; c < channels; ++c) {
+		uint64 history = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes));
+        bytes += sizeof(history);
+
+		uint64 weights = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes));
+        bytes += sizeof(weights);
+
+		for (int32 i = 0; i < QOA_LMS_LEN; ++i) {
+			lms[c].history[i] = ((int16) (history >> 48));
+			history <<= 16;
+
+			lms[c].weights[i] = ((int16) (weights >> 48));
+			weights <<= 16;
+		}
+	}
+
+	// Decode all slices for all channels in this frame
+	for (uint32 sample_index = 0; sample_index < frame_samples; sample_index += QOA_SLICE_LEN) {
+		for (uint32 c = 0; c < channels; c++) {
+			uint64 slice = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes));
+            bytes += sizeof(slice);
+
+			int32 scalefactor = (slice >> 60) & 0xf;
+			slice <<= 4;
+
+			int32 slice_start = sample_index * channels + c;
+			int32 slice_end = qoa_clamp(sample_index + QOA_SLICE_LEN, 0, frame_samples) * channels + c;
+
+			for (int32 si = slice_start; si < slice_end; si += channels) {
+				int32 predicted = qoa_lms_predict(&lms[c]);
+				int32 quantized = (slice >> 61) & 0x7;
+				int32 dequantized = qoa_dequant_tab[scalefactor][quantized];
+				int32 reconstructed = qoa_clamp_s16(predicted + dequantized);
+
+				samples[si] = (int16) reconstructed;
+				slice <<= 3;
+
+				qoa_lms_update(&lms[c], reconstructed, dequantized);
+			}
+		}
+	}
+
+	return (uint32) (bytes - start);
+}
+
+// Reads the audio header at data, then decodes all frames behind it into audio->data as 16 bit samples.
+uint32 qoa_decode(const byte* data, Audio* audio)
+{
+    uint32 header_length = audio_header_from_data(data, audio);
+    uint32 p = header_length;
+	uint32 frame_size;
+    byte* sample_ptr = audio->data;
+
+    QoaLms lms[QOA_MAX_CHANNELS];
+    uint32 limit = 4 + QOA_LMS_LEN * 4 * audio->channels;
+
+	do {
+        // The output advances by the decoded PCM size, which is unrelated to the encoded frame size
+        uint32 frame_samples = SWAP_ENDIAN_LITTLE(*((uint32 *) (data + p)));
+		frame_size = qoa_decode_frame(data + p, audio->channels, lms, sample_ptr);
+        sample_ptr += frame_samples * audio->channels * sizeof(int16);
+		p += frame_size;
+	} while (frame_size && p < audio->size && audio->size - p >= limit);
+    // @question do we really need the audio->size - p >= limit check?
+	return header_length + audio->size;
+}
+
+#endif
\ No newline at end of file
diff --git a/audio/QoaSimd.h b/audio/QoaSimd.h
new file mode 100644
index 0000000..11d408c
--- /dev/null
+++ b/audio/QoaSimd.h
@@ -0,0 +1,470 @@
+/**
+ * Jingga
+ *
+ * @copyright 2023, Dominic Szablewski - https://phoboslab.org
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef TOS_AUDIO_QOA_H
+#define TOS_AUDIO_QOA_H
+
+#include "../stdlib/Types.h"
+#include "../utils/EndianUtils.h"
+#include "../audio/Audio.cpp"
+#include "../stdlib/simd/SIMD_I32.h"
+
+#define QOA_SLICE_LEN 20
+#define QOA_SLICES_PER_FRAME 256
+#define QOA_FRAME_LEN (QOA_SLICES_PER_FRAME * QOA_SLICE_LEN)
+#define QOA_LMS_LEN 4
+#define QOA_MAX_CHANNELS 8
+
+#define QOA_FRAME_SIZE(channels, slices) (4 + QOA_LMS_LEN * 4 * (channels) + 8 * (slices) * (channels))
+
+struct QoaLms { // State of the LMS predictor for one channel
+	int32_4 history; // automatically QOA_LMS_LEN size; lane 3 holds the newest sample
+	int32_4 weights; // automatically QOA_LMS_LEN size; lane i is multiplied with history lane i
+};
+
+/*
+The quant_tab provides an index into the dequant_tab for scaled residuals in
+the range of -8 .. 8; it is indexed with (clamped residual + 8). It maps this
+range to just 3 bits and becomes less accurate at the higher end. Note that the
+residual zero is identical to the lowest positive value. This is mostly fine,
+since the qoa_div() function always rounds away from zero.
+*/
+static const int32 qoa_quant_tab[17] = {
+	7, 7, 7, 5, 5, 3, 3, 1, /* -8..-1 */
+	0,                      /*  0     */
+	0, 2, 2, 4, 4, 6, 6, 6  /*  1.. 8 */
+};
+
+/*
+We have 16 different scalefactors. Like the quantized residuals these become
+less accurate at the higher end. In theory, the highest scalefactor that we
+would need to encode the highest 16 bit residual is (2**16)/8 = 8192. However,
+we rely on the LMS filter to predict samples accurately enough that a maximum
+residual of one quarter of the 16 bit range is sufficient. I.e. with the
+scalefactor 2048 times the quant range of 8 we can encode residuals up to 2**14.
+
+The scalefactor values are computed as:
+scalefactor_tab[s] <- round(pow(s + 1, 2.75))
+*/
+static const int32 qoa_scalefactor_tab[16] = {
+	1, 7, 21, 45, 84, 138, 211, 304, 421, 562, 731, 928, 1157, 1419, 1715, 2048
+};
+
+
+/*
+The reciprocal_tab maps each of the 16 scalefactors to its rounded reciprocal
+1/scalefactor. This allows qoa_div() to calculate the scaled residuals in the
+encoder with just one multiplication instead of an expensive division. We do
+this in .16 fixed point with integers, instead of floats.
+
+The reciprocal_tab is computed as:
+reciprocal_tab[s] <- ((1<<16) + scalefactor_tab[s] - 1) / scalefactor_tab[s]
+*/
+static const int32 qoa_reciprocal_tab[16] = {
+	65536, 9363, 3121, 1457, 781, 475, 311, 216, 156, 117, 90, 71, 57, 47, 39, 32
+};
+
+/*
+The dequant_tab maps each [scalefactor index][quantized residual] pair to the
+unscaled & dequantized residual.
+
+Since qoa_div rounds away from zero, the smallest entries are mapped to 3/4
+instead of 1. The dequant_tab assumes the following dequantized values for each
+of the quant_tab indices and is computed as:
+float dqt[8] = {0.75, -0.75, 2.5, -2.5, 4.5, -4.5, 7, -7};
+dequant_tab[s][q] <- round_ties_away_from_zero(scalefactor_tab[s] * dqt[q])
+
+The rounding employed here is "to nearest, ties away from zero", i.e. positive
+and negative values are treated symmetrically.
+*/
+static const int32 qoa_dequant_tab[16][8] = {
+	{   1,    -1,    3,    -3,    5,    -5,     7,     -7},
+	{   5,    -5,   18,   -18,   32,   -32,    49,    -49},
+	{  16,   -16,   53,   -53,   95,   -95,   147,   -147},
+	{  34,   -34,  113,  -113,  203,  -203,   315,   -315},
+	{  63,   -63,  210,  -210,  378,  -378,   588,   -588},
+	{ 104,  -104,  345,  -345,  621,  -621,   966,   -966},
+	{ 158,  -158,  528,  -528,  950,  -950,  1477,  -1477},
+	{ 228,  -228,  760,  -760, 1368, -1368,  2128,  -2128},
+	{ 316,  -316, 1053, -1053, 1895, -1895,  2947,  -2947},
+	{ 422,  -422, 1405, -1405, 2529, -2529,  3934,  -3934},
+	{ 548,  -548, 1828, -1828, 3290, -3290,  5117,  -5117},
+	{ 696,  -696, 2320, -2320, 4176, -4176,  6496,  -6496},
+	{ 868,  -868, 2893, -2893, 5207, -5207,  8099,  -8099},
+	{1064, -1064, 3548, -3548, 6386, -6386,  9933,  -9933},
+	{1286, -1286, 4288, -4288, 7718, -7718, 12005, -12005},
+	{1536, -1536, 5120, -5120, 9216, -9216, 14336, -14336},
+};
+
+
+/*
+The Least Mean Squares Filter is the heart of QOA. It predicts the next
+sample based on the previous 4 reconstructed samples. It does so by continuously
+adjusting 4 weights based on the residual of the previous prediction.
+
+The next sample is predicted as the sum of (weight[i] * history[i]).
+
+The adjustment of the weights is done with a "Sign-Sign-LMS" that adds or
+subtracts the residual to each weight, based on the corresponding sample from
+the history. This, surprisingly, is sufficient to get worthwhile predictions.
+
+This is all done with fixed point integers. Hence the right-shifts when updating
+the weights and calculating the prediction.
+*/
+static inline
+int32 qoa_lms_predict(QoaLms* lms)
+{
+    // Lane-wise weight * history, keeping the low 32 bits of every product
+    __m128i sum = _mm_mullo_epi32(lms->weights.s, lms->history.s);
+    // Two horizontal adds leave the total of all four lanes in lane 0
+    sum = _mm_hadd_epi32(sum, sum);
+    sum = _mm_hadd_epi32(sum, sum);
+
+    return _mm_cvtsi128_si32(sum) >> 13;
+}
+
+/*
+Adapts the LMS state after a sample was coded: every weight moves by +-delta depending on
+the sign of the history sample it belongs to, afterwards the history is shifted by one
+entry and `sample` becomes its newest entry.
+
+All four weights are adjusted by the single vector add below. The new sample must not
+adjust the last weight a second time, it only takes part in the update of the next call.
+*/
+static inline
+void qoa_lms_update(QoaLms* lms, int32 sample, int32 residual) {
+	int32 delta = residual >> 4;
+
+    __m128i delta_vec = _mm_set1_epi32(delta);
+    __m128i zero_vec = _mm_setzero_si128();
+
+    // Select -delta for lanes with a negative history sample, +delta for all others
+    __m128i sign_mask = _mm_cmpgt_epi32(zero_vec, lms->history.s); // history < 0
+    __m128i delta_adjust = _mm_blendv_epi8(delta_vec, _mm_sub_epi32(zero_vec, delta_vec), sign_mask);
+
+    // Update weights
+    lms->weights.s = _mm_add_epi32(lms->weights.s, delta_adjust);
+
+    // Rotate the lanes down by one int32 (4 bytes): {h0, h1, h2, h3} -> {h1, h2, h3, h0}
+    lms->history.s = _mm_alignr_epi8(lms->history.s, lms->history.s, 4);
+
+    // Overwrite the stale h0 in the last lane with the new sample
+    lms->history.s = _mm_insert_epi32(lms->history.s, sample, QOA_LMS_LEN - 1);
+}
+
+/*
+qoa_div() implements a rounding division, but avoids rounding to zero for
+small numbers. E.g. 0.1 will be rounded to 1. Note that 0 itself still
+returns as 0, which is handled in the qoa_quant_tab[].
+qoa_div() takes an index into the .16 fixed point qoa_reciprocal_tab as an
+argument, so it can do the division with a cheaper integer multiplication.
+*/
+static inline
+int32 qoa_div(int32 v, int32 scalefactor) {
+	// Multiply by the .16 fixed point reciprocal and round to nearest
+	int32 quotient = (v * qoa_reciprocal_tab[scalefactor] + (1 << 15)) >> 16;
+	int32 sign_v = (v > 0) - (v < 0);
+	int32 sign_q = (quotient > 0) - (quotient < 0);
+
+	// Moves a quotient that rounded to 0 one step away from zero if v itself is not 0
+	return quotient + sign_v - sign_q;
+}
+
+static inline
+int32 qoa_clamp(int32 v, int32 min, int32 max) {
+	// The lower bound is tested before the upper bound
+	int32 upper_limited = v > max ? max : v;
+	return v < min ? min : upper_limited;
+}
+
+/*
+This specialized clamp function for the signed 16 bit range improves decode
+performance quite a bit. The extra if() statement works nicely with the CPUs
+branch prediction as this branch is rarely taken.
+*/
+static inline
+int32 qoa_clamp_s16(int32 v) {
+	// Fast path: a single unsigned compare covers both bounds for in-range values
+	if ((uint32) (v + 32768) <= 65535) {
+		return v;
+	}
+
+	return v < -32768 ? -32768 : 32767;
+}
+
+// Encodes frame_samples samples per channel from the interleaved int16 sample_data into one frame.
+// Written to bytes: uint32 sample count, 8 bytes history + 8 bytes weights per channel, 8 bytes per slice.
+// lms holds one state per channel and is left at the state after the last encoded slice.
+// Returns the number of bytes written; no bounds check is done on bytes.
+uint32 qoa_encode_frame(const int16* sample_data, int32 channels, uint32 frame_samples, QoaLms* lms, byte* bytes)
+{
+    byte* start = bytes;
+
+	int32 prev_scalefactor[QOA_MAX_CHANNELS] = {0};
+
+	// Write the frame header
+    *((uint32 *) bytes) = SWAP_ENDIAN_LITTLE(frame_samples);
+    bytes += sizeof(frame_samples);
+
+	for (uint32 c = 0; c < channels; ++c) {
+		// Keep only the low 16 bits of every lane (two's complement truncation to 16 bit)
+        __m128i history = _mm_and_si128(lms[c].history.s, _mm_set1_epi32(0xFFFF));
+        __m128i weights = _mm_and_si128(lms[c].weights.s, _mm_set1_epi32(0xFFFF));
+
+        // Entry 0 has to end up in the most significant 16 bits, so reverse the lane order first
+        history = _mm_shuffle_epi32(history, _MM_SHUFFLE(0, 1, 2, 3));
+        weights = _mm_shuffle_epi32(weights, _MM_SHUFFLE(0, 1, 2, 3));
+
+        // Narrow 4 x 32 bit to 4 x 16 bit; lanes are <= 0xFFFF so the unsigned saturation never triggers
+        __m128i history_64 = _mm_packus_epi32(history, history);
+        __m128i weights_64 = _mm_packus_epi32(weights, weights);
+
+        // The low 64 bits now hold (e0 << 48) | (e1 << 32) | (e2 << 16) | e3
+        uint64 final_history = _mm_extract_epi64(history_64, 0);
+        uint64 final_weights = _mm_extract_epi64(weights_64, 0);
+
+        // Store results with endian swap
+        *((uint64*) bytes) = SWAP_ENDIAN_LITTLE(final_history);
+        bytes += sizeof(final_history);
+
+        *((uint64*) bytes) = SWAP_ENDIAN_LITTLE(final_weights);
+        bytes += sizeof(final_weights);
+	}
+
+	// We encode all samples with the channels interleaved on a slice level.
+	// E.g. for stereo: (ch 0, slice 0), (ch 1, slice 0), (ch 0, slice 1), ...
+	for (uint32 sample_index = 0; sample_index < frame_samples; sample_index += QOA_SLICE_LEN) {
+        // @performance SIMDable
+		for (uint32 c = 0; c < channels; ++c) {
+			int32 slice_len = qoa_clamp(QOA_SLICE_LEN, 0, frame_samples - sample_index);
+			int32 slice_start = sample_index * channels + c;
+			int32 slice_end = (sample_index + slice_len) * channels + c;
+
+			/*
+			Brute force search for the best scalefactor. Just go through all
+			16 scalefactors, encode all samples for the current slice and
+			measure the total squared error.
+			*/
+			uint64 best_rank = -1; // wraps to the largest uint64, so the first candidate always wins
+			uint64 best_slice = 0;
+			QoaLms best_lms;
+			int32 best_scalefactor = 0;
+
+			for (int32 sfi = 0; sfi < 16; ++sfi) {
+				/*
+				There is a strong correlation between the scalefactors of
+				neighboring slices. As an optimization, start testing
+				the best scalefactor of the previous slice first.
+				*/
+				int32 scalefactor = (sfi + prev_scalefactor[c]) % 16;
+
+				/*
+				We have to reset the LMS state to the last known good one
+				before trying each scalefactor, as each pass updates the LMS
+				state when encoding.
+				*/
+				QoaLms lms_temp = lms[c];
+				uint64 slice = scalefactor;
+				uint64 current_rank = 0;
+
+				for (int32 si = slice_start; si < slice_end; si += channels) {
+					int32 sample = sample_data[si];
+					int32 predicted = qoa_lms_predict(&lms_temp);
+
+					int32 residual = sample - predicted;
+					int32 scaled = qoa_div(residual, scalefactor);
+					int32 clamped = qoa_clamp(scaled, -8, 8);
+					int32 quantized = qoa_quant_tab[clamped + 8];
+					int32 dequantized = qoa_dequant_tab[scalefactor][quantized];
+					int32 reconstructed = qoa_clamp_s16(predicted + dequantized);
+
+					// If the weights have grown too large, we introduce a penalty
+					// here. This prevents pops/clicks in certain problem cases
+                    // Compute weights squared: w^2
+                    __m128i weights_squared = _mm_mullo_epi32(lms_temp.weights.s, lms_temp.weights.s);
+
+                    // Perform horizontal addition to sum all squared weights
+                    __m128i sum1 = _mm_hadd_epi32(weights_squared, weights_squared);
+                    __m128i sum2 = _mm_hadd_epi32(sum1, sum1);
+
+                    // Extract the final sum (scalar)
+                    int32 sum_of_squares = _mm_cvtsi128_si32(sum2);
+
+                    // Apply the shift and subtraction
+                    int32 weights_penalty = (sum_of_squares >> 18) - 0x8FF;
+					if (weights_penalty < 0) {
+						weights_penalty = 0;
+					}
+
+					int64 error = (sample - reconstructed);
+					uint64 error_sq = error * error;
+
+					current_rank += error_sq + weights_penalty * weights_penalty;
+					if (current_rank > best_rank) {
+						break;
+					}
+
+					qoa_lms_update(&lms_temp, reconstructed, dequantized);
+					slice = (slice << 3) | quantized;
+				}
+
+				if (current_rank < best_rank) {
+					best_rank = current_rank;
+					best_slice = slice;
+					best_lms = lms_temp;
+					best_scalefactor = scalefactor;
+				}
+			}
+
+			prev_scalefactor[c] = best_scalefactor;
+
+			lms[c] = best_lms;
+
+			/*
+			If this slice was shorter than QOA_SLICE_LEN, we have to left-
+			shift all encoded data, to ensure the rightmost bits are the empty
+			ones. This should only happen in the last frame of a file as all
+			slices are completely filled otherwise.
+			*/
+			best_slice <<= (QOA_SLICE_LEN - slice_len) * 3;
+
+            *((uint64 *) bytes) = SWAP_ENDIAN_LITTLE(best_slice);
+            bytes += sizeof(best_slice);
+		}
+	}
+
+	return (uint32) (bytes - start);
+}
+
+// Encodes the 16-bit PCM samples in audio->data as a sequence of QOA frames written to data.
+// Only frames are emitted here; this function writes no Audio header of its own.
+// Returns the number of bytes written to data.
+uint32 qoa_encode(const Audio* audio, byte* data)
+{
+    byte* start = data;
+
+    // Number of samples per channel
+    uint32 sample_count = audio->size / (audio->channels * audio->bloc_size);
+
+    QoaLms lms[QOA_MAX_CHANNELS];
+    __m128i weights_init = _mm_set_epi32(1 << 14, -(1 << 13), 0, 0);
+    __m128i history_init = _mm_setzero_si128();
+
+    for (int32 i = 0; i < audio->channels; ++i) {
+        /*
+        Set the initial LMS weights to {0, 0, -1, 2}. This helps with the
+        prediction of the first few ms of a file.
+        */
+        lms[i].weights.s = weights_init;
+        lms[i].history.s = history_init;
+    }
+
+    // Go through all frames, the last frame may hold fewer than QOA_FRAME_LEN samples
+    int32 frame_samples = QOA_FRAME_LEN;
+
+    for (uint32 sample_index = 0; sample_index < sample_count; sample_index += frame_samples) {
+        frame_samples = qoa_clamp(QOA_FRAME_LEN, 0, sample_count - sample_index);
+        data += qoa_encode_frame(
+            (int16 *) (audio->data + sample_index * audio->channels * audio->bloc_size),
+            audio->channels, frame_samples, lms, data
+        );
+    }
+
+    return (uint32) (data - start);
+}
+
+// Decodes one QOA frame from bytes into interleaved signed 16-bit samples at sample_data.
+// Layout read here: uint32 sample count, 8 bytes history + 8 bytes weights per channel, then 8 byte slices.
+// lms[c] is overwritten with the state stored in the frame before any slice is decoded.
+// Returns the number of ENCODED bytes consumed from bytes, not the number of bytes written.
+uint32 qoa_decode_frame(const byte* bytes, int32 channels, QoaLms* lms, byte* sample_data)
+{
+    const byte* start = bytes;
+    // Samples are 16 bit wide (qoa_clamp_s16), storing them through the byte pointer would keep only the low 8 bits
+    int16* samples = (int16 *) sample_data;
+
+    // Read the frame header: number of samples per channel in this frame
+    uint32 frame_samples = SWAP_ENDIAN_LITTLE(*((uint32 *) bytes));
+    bytes += sizeof(frame_samples);
+
+    // Read the LMS state: 4 x 2 bytes history, 4 x 2 bytes weights per channel
+    for (uint32 c = 0; c < channels; ++c) {
+        uint64 history = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes));
+        bytes += sizeof(history);
+
+        uint64 weights = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes));
+        bytes += sizeof(weights);
+
+        alignas(16) int32 history_array[4];
+        alignas(16) int32 weights_array[4];
+
+        for (int32 i = 0; i < QOA_LMS_LEN; ++i) {
+            history_array[i] = ((int16) (history >> 48));
+            history <<= 16;
+
+            weights_array[i] = ((int16) (weights >> 48));
+            weights <<= 16;
+        }
+
+        lms[c].history.s = _mm_set_epi32(history_array[3], history_array[2], history_array[1], history_array[0]);
+        lms[c].weights.s = _mm_set_epi32(weights_array[3], weights_array[2], weights_array[1], weights_array[0]);
+    }
+
+    // Decode all slices for all channels in this frame
+    for (uint32 sample_index = 0; sample_index < frame_samples; sample_index += QOA_SLICE_LEN) {
+        for (uint32 c = 0; c < channels; c++) {
+            uint64 slice = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes));
+            bytes += sizeof(slice);
+
+            int32 scalefactor = (slice >> 60) & 0xf;
+            slice <<= 4;
+
+            int32 slice_start = sample_index * channels + c;
+            int32 slice_end = qoa_clamp(sample_index + QOA_SLICE_LEN, 0, frame_samples) * channels + c;
+
+            for (int32 si = slice_start; si < slice_end; si += channels) {
+                int32 predicted = qoa_lms_predict(&lms[c]);
+                int32 quantized = (slice >> 61) & 0x7;
+                int32 dequantized = qoa_dequant_tab[scalefactor][quantized];
+                int32 reconstructed = qoa_clamp_s16(predicted + dequantized);
+
+                samples[si] = (int16) reconstructed;
+                slice <<= 3;
+
+                qoa_lms_update(&lms[c], reconstructed, dequantized);
+            }
+        }
+    }
+
+    return (uint32) (bytes - start);
+}
+
+
+// Reads the Audio header from data and decodes all following QOA frames into audio->data.
+uint32 qoa_decode(const byte* data, Audio* audio)
+{
+    uint32 header_length = audio_header_from_data(data, audio);
+    uint32 p = header_length;
+    uint32 frame_size;
+    byte* sample_ptr = audio->data;
+    QoaLms lms[QOA_MAX_CHANNELS];
+    uint32 limit = 4 + QOA_LMS_LEN * 4 * audio->channels;
+
+    do {
+        uint32 frame_samples = SWAP_ENDIAN_LITTLE(*((uint32 *) (data + p)));
+        frame_size = qoa_decode_frame(data + p, audio->channels, lms, sample_ptr);
+        // frame_size counts ENCODED bytes; the output advances by the per-channel sample count from the frame header
+        sample_ptr += frame_samples * audio->channels * sizeof(int16);
+        p += frame_size;
+    } while (frame_size && audio->size - p >= limit);
+    // @question do we really need the audio->size - p >= limit check or would p < audio->size be sufficient?
+    return header_length + audio->size;
+}
+
+#endif
\ No newline at end of file
diff --git a/image/Image.cpp b/image/Image.cpp
index 4f14499..b8505ae 100644
--- a/image/Image.cpp
+++ b/image/Image.cpp
@@ -81,25 +81,26 @@ int32 image_data_size(const Image* image)
         + sizeof(image->image_settings);
 }
 
-int32 image_header_from_data(const byte* data, Image* image)
+inline
+uint32 image_header_from_data(const byte* data, Image* image)
 {
-    const byte* pos = data;
+    const byte* start = data;
 
-    image->width = SWAP_ENDIAN_LITTLE(*((uint32 *) pos));
-    pos += sizeof(image->width);
+    image->width = SWAP_ENDIAN_LITTLE(*((uint32 *) data));
+    data += sizeof(image->width);
 
-    image->height = SWAP_ENDIAN_LITTLE(*((uint32 *) pos));
-    pos += sizeof(image->height);
+    image->height = SWAP_ENDIAN_LITTLE(*((uint32 *) data));
+    data += sizeof(image->height);
 
     image->pixel_count = image->width * image->height;
 
-    image->image_settings = *pos;
-    pos += sizeof(image->image_settings);
+    image->image_settings = *data;
+    data += sizeof(image->image_settings);
 
-    return (int32) (pos - data);
+    return (int32) (data - start);
 }
 
-int32 image_from_data(const byte* data, Image* image)
+uint32 image_from_data(const byte* data, Image* image)
 {
     const byte* pos = data;
     pos += image_header_from_data(data, image);
@@ -112,23 +113,23 @@ int32 image_from_data(const byte* data, Image* image)
 }
 
 inline
-int32 image_header_to_data(const Image* image, byte* data)
+uint32 image_header_to_data(const Image* image, byte* data)
 {
-    byte* pos = data;
+    byte* start = data;
 
-    *((uint32 *) pos) = SWAP_ENDIAN_LITTLE(image->width);
-    pos += sizeof(image->width);
+    *((uint32 *) data) = SWAP_ENDIAN_LITTLE(image->width);
+    data += sizeof(image->width);
 
-    *((uint32 *) pos) = SWAP_ENDIAN_LITTLE(image->height);
-    pos += sizeof(image->height);
+    *((uint32 *) data) = SWAP_ENDIAN_LITTLE(image->height);
+    data += sizeof(image->height);
 
-    *pos = image->image_settings;
-    pos += sizeof(image->image_settings);
+    *data = image->image_settings;
+    data += sizeof(image->image_settings);
 
-    return (int32) (pos - data);
+    return (int32) (data - start);
 }
 
-int32 image_to_data(const Image* image, byte* data)
+uint32 image_to_data(const Image* image, byte* data)
 {
     byte* pos = data;
     pos += image_header_to_data(image, data);
diff --git a/image/Qoi.h b/image/Qoi.h
index 9876aae..044e18a 100644
--- a/image/Qoi.h
+++ b/image/Qoi.h
@@ -1,6 +1,7 @@
 /**
  * Jingga
  *
+ * @copyright 2021, Dominic Szablewski - https://phoboslab.org
  * @copyright Jingga
  * @license   OMS License 2.0
  * @version   1.0.0
@@ -13,20 +14,36 @@
 #include <string.h>
 #include "Image.cpp"
 
-#define QOI_OP_INDEX  0b00000000
-#define QOI_OP_DIFF   0b01000000
-#define QOI_OP_LUMA   0b10000000
-#define QOI_OP_RUN    0b11000000 // @todo There is a HUGE step from here to QOI_OP_RGB this leaves room for more cases or using this data
-#define QOI_OP_RGB    0b11111110
-#define QOI_OP_RGBA   0b11111111
-#define QOI_MASK_2    0b11000000
+#define QOI_OP_LUMA555 0b00000000
+#define QOI_OP_LUMA222 0b10000000
+#define QOI_OP_LUMA777 0b01000000
 
-#define QOI_COLOR_HASH(color) (color.r * 3 + color.g * 5 + color.b * 7 + color.a * 11)
-#define QOI_COLOR_HASH_2(color) ((((uint32)(color)) * 0x9E3779B1U) >> 26)
+#define QOI_OP_RUN 0b11000000
+
+// These definitions are important and impact how large our run can be:
+// Run has 6 free bits -> 2^6 = 64 possible values
+// However, the two largest values (0b11111110 and 0b11111111) are taken by QOI_OP_RGB and QOI_OP_RGBA -> 64 - 2 = 62
+#define QOI_OP_RGB  0b11111110
+#define QOI_OP_RGBA 0b11111111
+
+#define QOI_MASK_1 0b10000000
+#define QOI_MASK_2 0b11000000
+#define QOI_MASK_3 0b11100000
+
+// @performance I feel like there is some more optimization possible by handling fully transparent pixels in a special way
+// @todo We need to implement monochrome handling, which is very important for game assets that often use monochrome assets for all kinds of things (e.g. translucency)
+
+const byte optable[128] = {
+    0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
+};
 
 int32 qoi_encode(const Image* image, byte* data)
 {
-    int32 p = image_header_to_data(image, data);
+    byte* start = data;
+    data += image_header_to_data(image, data);
 
     v4_byte index[64];
 	memset(index, 0, sizeof(index));
@@ -34,173 +51,257 @@ int32 qoi_encode(const Image* image, byte* data)
     v4_byte px_prev = {0, 0, 0, 255};
 	v4_byte px = px_prev;
 
-	int32 channels = (image->image_settings & IMAGE_SETTING_CHANNEL_COUNT);
+	const int32 channels = (image->image_settings & IMAGE_SETTING_CHANNEL_COUNT);
 
-    // Only works with 1 byte channel size -> we don't have to multiply channel count with channel size
 	int32 px_len = image->width * image->height * channels;
 	int32 px_end = px_len - channels;
 
     int32 run = 0;
-	for (int32 px_pos = 0; px_pos < px_len; px_pos += channels) {
-        // @performance could We just use int32 type cast? The problem would be the last pixel which would be out of bounds by 1 byte
-        memcpy(&px, &image->pixels[px_pos], channels * sizeof(byte));
+	if (channels == 4) {
+		for (int32 px_pos = 0; px_pos < px_len; px_pos += 4) {
+			px.val = SWAP_ENDIAN_LITTLE(*((uint32 *) (image->pixels + px_pos)));
 
-		if (px.val == px_prev.val) {
-			++run;
-			if (run == 62 || px_pos == px_end) {
-				data[p++] = (byte) (QOI_OP_RUN | (run - 1));
-				run = 0;
-			}
-		} else {
-			if (run) {
-				data[p++] = (byte) (QOI_OP_RUN | (run - 1));
-				run = 0;
-			}
+			while(px.val == px_prev.val) {
+				++run;
+				if(px_pos == px_end) {
+					*data++ = (byte) (QOI_OP_RUN | (run - 1));
+					px_pos = px_len;
 
-			int32 index_pos = QOI_COLOR_HASH(px) % 64;
-			//int32 index_pos = QOI_COLOR_HASH_2(px);
-
-			if (index[index_pos].val == px.val) {
-				data[p++] = (byte) (QOI_OP_INDEX | index_pos);
-			} else {
-				index[index_pos] = px;
-
-				if (px.a == px_prev.a) {
-					signed char vr = px.r - px_prev.r;
-					signed char vg = px.g - px_prev.g;
-					signed char vb = px.b - px_prev.b;
-
-					signed char vg_r = vr - vg;
-					signed char vg_b = vb - vg;
-
-					if (vr > -3 && vr < 2
-						&& vg > -3 && vg < 2
-						&& vb > -3 && vb < 2
-					) {
-						data[p++] = QOI_OP_DIFF | (vr + 2) << 4 | (vg + 2) << 2 | (vb + 2);
-					} else if (vg_r > -9 && vg_r < 8
-						&& vg > -33 && vg < 32
-						&& vg_b > -9 && vg_b < 8
-					) {
-						data[p++] = QOI_OP_LUMA | (vg + 32);
-						data[p++] = (vg_r + 8) << 4 | (vg_b +  8);
-					} else {
-						data[p++] = QOI_OP_RGB;
-						data[p++] = px.r;
-						data[p++] = px.g;
-						data[p++] = px.b;
-					}
-				} else {
-					data[p++] = QOI_OP_RGBA;
-                    *((uint32 *) &data[p]) = SWAP_ENDIAN_LITTLE(px.val);
-                    p += 4;
+                    break;
+				} else if (run == 62) {
+					*data++ = (byte) (QOI_OP_RUN | (run - 1));
+					run = 0;
 				}
+
+				px_pos += 4;
+                px.val = SWAP_ENDIAN_LITTLE(*((uint32 *) (image->pixels + px_pos)));
+			}
+
+			if (run) {
+				*data++ = (byte) (QOI_OP_RUN | (run - 1));
+				run = 0;
+			}
+
+			if(px.a != px_prev.a){
+				*data++ = QOI_OP_RGBA;
+				*data++ = px.a;
+			}
+
+			signed char vr = px.r - px_prev.r;
+            signed char vg = px.g - px_prev.g;
+            signed char vb = px.b - px_prev.b;
+
+            signed char vg_r = vr - vg;
+            signed char vg_b = vb - vg;
+
+            byte ar = vg_r < 0 ? -vg_r - 1 : vg_r;
+            byte ag = vg < 0 ? -vg - 1 : vg;
+            byte ab = vg_b < 0 ? -vg_b - 1 : vg_b;
+            byte argb = ar | ag | ab;
+
+            switch(optable[argb]) {
+                case 0:
+                    *data++ = QOI_OP_LUMA222 | ((vg_r + 2) << 4) | ((vg_b + 2) << 2) | (vg + 2);
+                    break;
+                case 1:
+                    *data++ = QOI_OP_LUMA555 | ((vg_b + 16) << 2) | ((vg_r + 16) >> 3);
+                    *data++ = (((vg_r + 16) & 7) << 5) | (vg + 16);
+                    break;
+                case 2:
+                    *data++ = QOI_OP_LUMA777 | ((vg_b + 64) >> 2);
+                    *data++ = (((vg_b + 64) & 3) << 6) | ((vg_r + 64) >> 1);
+                    *data++ = (((vg_r + 64) & 1) << 7) | (vg + 64);
+                    break;
+                case 3:
+                    *data++ = QOI_OP_RGB;
+                    *data++ = px.r;
+                    *data++ = px.g;
+                    *data++ = px.b;
+                    break;
+            }
+
+			px_prev = px;
+		}
+	} else {
+		for (int32 px_pos = 0; px_pos < px_len; px_pos += 3) {
+			px.r = image->pixels[px_pos];
+			px.g = image->pixels[px_pos + 1];
+			px.b = image->pixels[px_pos + 2];
+
+			while(px.val == px_prev.val) {
+				++run;
+				if(px_pos == px_end) {
+					*data++ = (byte) (QOI_OP_RUN | (run - 1));
+					px_pos = px_len;
+
+                    break;
+				} else if (run == 62) {
+					*data++ = (byte) (QOI_OP_RUN | (run - 1));
+					run = 0;
+				}
+
+				px_pos += 3;
+				px.r = image->pixels[px_pos];
+				px.g = image->pixels[px_pos + 1];
+				px.b = image->pixels[px_pos + 2];
+			}
+
+			if (run) {
+				*data++ = (byte) (QOI_OP_RUN | (run - 1));
+				run = 0;
+			}
+
+            signed char vr = px.r - px_prev.r;
+            signed char vg = px.g - px_prev.g;
+            signed char vb = px.b - px_prev.b;
+
+            signed char vg_r = vr - vg;
+            signed char vg_b = vb - vg;
+
+            byte ar = vg_r < 0 ? -vg_r - 1 : vg_r;
+            byte ag = vg < 0 ? -vg - 1 : vg;
+            byte ab = vg_b < 0 ? -vg_b - 1 : vg_b;
+            byte argb = ar | ag | ab;
+
+            switch(optable[argb]) {
+                case 0:
+                    *data++ = QOI_OP_LUMA222 | ((vg_r + 2) << 4) | ((vg_b + 2) << 2) | (vg + 2);
+                    break;
+                case 1:
+                    *data++ = QOI_OP_LUMA555 | ((vg_b + 16) << 2) | ((vg_r + 16) >> 3);
+                    *data++ = (((vg_r + 16) & 7) << 5) | (vg + 16);
+                    break;
+                case 2:
+                    *data++ = QOI_OP_LUMA777 | ((vg_b + 64) >> 2);
+                    *data++ = (((vg_b + 64) & 3) << 6) | ((vg_r + 64) >> 1);
+                    *data++ = (((vg_r + 64) & 1) << 7) | (vg + 64);
+                    break;
+                case 3:
+                    *data++ = QOI_OP_RGB;
+                    *data++ = px.r;
+                    *data++ = px.g;
+                    *data++ = px.b;
+                    break;
+            }
+
+			px_prev = px;
+		}
+	}
+
+	return (int32) (data - start);
+}
+
+int32 qoi_decode_4(const byte* data, Image* image)
+{
+    uint32 px_len = image->width * image->height * 4;
+    v4_byte px = {0, 0, 0, 255};
+    v4_byte index[64] = {0};
+    int32 run = 0;
+
+    for (int32 px_pos = 0; px_pos < px_len; px_pos += 4) {
+		if (run > 0) {
+			--run;
+		} else {
+            OP_RGBA_GOTO:
+			byte b1 = *data++;
+
+			if (b1 == QOI_OP_RGB) {
+				px.r = *data++;
+				px.g = *data++;
+				px.b = *data++;
+            } else if (b1 == QOI_OP_RGBA) {
+				px.a = *data++;
+				goto OP_RGBA_GOTO;
+			} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA222) {
+				byte vg = (b1 & 3) - 2;
+				px.r += vg - 2 + ((b1 >> 4) & 3);
+				px.g += vg;
+				px.b += vg - 2 + ((b1 >> 2) & 3);
+			} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA555) {
+				byte b2 = *data++;
+				byte vg = (b2 & 31) - 16;
+				px.r += vg - 16 + (((b1 & 3) << 3) | (b2 >> 5));
+				px.g += vg;
+				px.b += vg - 16 + ((b1 >> 2) & 31);
+			} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA777) {
+				byte b2 = *data++;
+				byte b3 = *data++;
+				byte vg = (b3 & 0x7f) - 64;
+				px.r += vg - 64 + ((b2 & 0x3f) << 1) + (b3 >> 7);
+				px.g += vg;
+				px.b += vg - 64 + ((b1 & 0x1f) << 2) + (b2 >> 6);
+			} else if ((b1 & QOI_MASK_2) == QOI_OP_RUN) {
+				run = (b1 & 0x3f);
 			}
 		}
 
-		px_prev = px;
+		*((uint32 *) &image->pixels[px_pos]) = SWAP_ENDIAN_LITTLE(px.val);
 	}
 
-	return p;
+    return px_len;
 }
 
-int32 qoi_decode(const byte* data, Image* image, int32 steps = 8)
+int32 qoi_decode_3(const byte* data, Image* image)
 {
-    int32 header_length = image_header_from_data(data, image);
-    int32 p = header_length;
-
-    int32 channels = (image->image_settings & IMAGE_SETTING_CHANNEL_COUNT);
-	uint32 px_len = image->width * image->height * channels;
-
-    v4_byte px = {0, 0, 0, 255};
-
-    v4_byte index[64];
-    memset(index, 0, sizeof(index));
-
+	uint32 px_len = image->width * image->height * 3;
+    v3_byte px = {0, 0, 0};
     int32 run = 0;
 
-	for (uint32 px_pos = 0; px_pos < px_len; px_pos += channels) {
-        int32 b1 = data[p++];
+	for (int32 px_pos = 0; px_pos < px_len; px_pos += 3) {
+		if (run > 0) {
+			--run;
+		} else {
+			byte b1 = *data++;
 
-        if (b1 == QOI_OP_RGB) {
-            px.r = data[p++];
-            px.g = data[p++];
-            px.b = data[p++];
-        } else if (b1 == QOI_OP_RGBA) {
-            px.val = SWAP_ENDIAN_LITTLE(*((uint32 *) &data[p]));
-            p += 4;
-        } else if ((b1 & QOI_MASK_2) == QOI_OP_INDEX) {
-            px = index[b1];
-        } else if ((b1 & QOI_MASK_2) == QOI_OP_DIFF) {
-            px.r += ((b1 >> 4) & 0x03) - 2;
-            px.g += ((b1 >> 2) & 0x03) - 2;
-            px.b += ( b1 & 0x03) - 2;
-        } else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA) {
-            int32 b2 = data[p++];
-            byte vg = (b1 & 0x3f) - 32;
-            px.r += vg - 8 + ((b2 >> 4) & 0x0f);
-            px.g += vg;
-            px.b += vg - 8 + (b2 & 0x0f);
-        } else if ((b1 & QOI_MASK_2) == QOI_OP_RUN) {
-            run = (b1 & 0x3f);
+			if (b1 == QOI_OP_RGB) {
+				px.r = *data++;
+				px.g = *data++;
+				px.b = *data++;
+			} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA222) {
+				byte vg = (b1 & 3) - 2;
+				px.r += vg - 2 + ((b1 >> 4) & 3);
+				px.g += vg;
+				px.b += vg - 2 + ((b1 >> 2) & 3);
+			} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA555) {
+				byte b2 = *data++;
+				byte vg = (b2 & 31) - 16;
+				px.r += vg - 16 + (((b1 & 3) << 3) | (b2 >> 5));
+				px.g += vg;
+				px.b += vg - 16 + ((b1 >> 2) & 31);
+			} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA777) {
+				byte b2 = *data++;
+				byte b3 = *data++;
+				byte vg = (b3 & 0x7f) - 64;
+				px.r += vg - 64 + ((b2 & 0x3f) << 1) + (b3 >> 7);
+				px.g += vg;
+				px.b += vg - 64 + ((b1 & 0x1f) << 2) + (b2 >> 6);
+			} else if ((b1 & QOI_MASK_2) == QOI_OP_RUN) {
+				run = (b1 & 0x3f);
+			}
+		}
 
-            if (channels == 4) {
-                uint32 px_little_endian = SWAP_ENDIAN_LITTLE(px.val);
-                int32 pixel_step_size = steps * 4;
-                int32 i = 0;
-
-                // @performance Implement for ARM
-                #if ARM
-                #else
-                    if (steps == 16) {
-                        __m512i simd_value = _mm512_set1_epi32(px_little_endian);
-                        for(; i <= run - steps; i += steps, px_pos += pixel_step_size) {
-                            _mm512_storeu_si512((__m512i *) &image->pixels[px_pos], simd_value);
-                        }
-                    } else if (steps >= 8) {
-                        __m256i simd_value = _mm256_set1_epi32(px_little_endian);
-                        for (; i <= run - steps; i += steps, px_pos += pixel_step_size) {
-                            _mm256_storeu_si256((__m256i *) &image->pixels[px_pos], simd_value);
-                        }
-                    } else if (steps >= 4) {
-                        __m128i simd_value = _mm_set1_epi32(px_little_endian);
-                        for(; i <= run - steps; i += steps, px_pos += pixel_step_size) {
-                            _mm_storeu_si128((__m128i *) &image->pixels[px_pos], simd_value);
-                        }
-                    }
-                #endif
-
-                for (; i < run; ++i) {
-                    *((uint32 *) &image->pixels[px_pos]) = px_little_endian;
-                    px_pos += channels;
-                }
-            } else if (channels == 3) {
-                for (int32 i = 0; i < run; ++i) {
-                    image->pixels[px_pos++] = px.r;
-                    image->pixels[px_pos++] = px.g;
-                    image->pixels[px_pos++] = px.b;
-                }
-            } else if (channels == 1) {
-                memset(&image->pixels[px_pos], px.r, run * sizeof(byte));
-                px_pos += run;
-            }
-
-            // Correction, since the loop increments by channels count as well
-            px_pos -= channels;
-
-            index[QOI_COLOR_HASH(px) % 64] = px;
-            //index[QOI_COLOR_HASH_2(px)] = px;
-
-            continue;
-        }
-
-        index[QOI_COLOR_HASH(px) % 64] = px;
-        //index[QOI_COLOR_HASH_2(px)] = px;
-
-        memcpy(&image->pixels[px_pos], &px, channels * sizeof(byte));
+		image->pixels[px_pos] = px.r;
+		image->pixels[px_pos + 1] = px.g;
+		image->pixels[px_pos + 2] = px.b;
 	}
 
-    return header_length + px_len;
+    return px_len;
+}
+
+int32 qoi_decode(const byte* data, Image* image)
+{
+    int32 header_length = image_header_from_data(data, image);
+
+    const int32 channels = (image->image_settings & IMAGE_SETTING_CHANNEL_COUNT);
+
+    int32 len = 0;
+    if (channels == 4) {
+        len = qoi_decode_4(data + header_length, image);
+    } else if (channels == 3) {
+        len = qoi_decode_3(data + header_length, image);
+    }
+
+    return header_length + len;
 }
 
 #endif
\ No newline at end of file
diff --git a/platform/win32/FileUtils.cpp b/platform/win32/FileUtils.cpp
index 26c3bc0..9d71aa8 100644
--- a/platform/win32/FileUtils.cpp
+++ b/platform/win32/FileUtils.cpp
@@ -21,6 +21,7 @@
 #include "../../utils/Utils.h"
 #include "../../utils/TestUtils.h"
 #include "../../memory/RingMemory.h"
+#include "../../log/Log.h"
 
 typedef HANDLE FileHandle;
 typedef HANDLE MMFHandle;
diff --git a/platform/win32/audio/DirectSound.h b/platform/win32/audio/DirectSound.h
index 22ddb6e..06f1d3a 100644
--- a/platform/win32/audio/DirectSound.h
+++ b/platform/win32/audio/DirectSound.h
@@ -148,7 +148,7 @@ uint32 audio_buffer_fillable(const AudioSetting* setting, const DirectSoundSetti
         return 0;
     }
 
-    DWORD bytes_to_lock = setting->sample_buffer_size;
+    DWORD bytes_to_lock = (setting->sample_index * setting->sample_size) % setting->buffer_size;
     DWORD bytes_to_write = 0;
 
     DWORD target_cursor = (player_cursor + (setting->latency * setting->sample_size)) % setting->buffer_size;
@@ -180,7 +180,7 @@ void audio_play_buffer(AudioSetting* setting, DirectSoundSetting* api_setting)
     void* region2;
     DWORD region2_size;
 
-    DWORD bytes_to_lock = setting->sample_buffer_size;
+    DWORD bytes_to_lock = (setting->sample_index * setting->sample_size) % setting->buffer_size;
 
     api_setting->secondary_buffer->Lock(
         bytes_to_lock, setting->sample_buffer_size,
@@ -204,7 +204,7 @@ void audio_play_buffer(AudioSetting* setting, DirectSoundSetting* api_setting)
     }
 
     api_setting->secondary_buffer->Unlock(region1, region1_size, region2, region2_size);
-
+    setting->sample_index += (uint16) (setting->sample_buffer_size / setting->sample_size);
     setting->sample_buffer_size = 0;
 }
 
diff --git a/platform/win32/audio/XAudio2.h b/platform/win32/audio/XAudio2.h
index 4190112..7d9b466 100644
--- a/platform/win32/audio/XAudio2.h
+++ b/platform/win32/audio/XAudio2.h
@@ -115,7 +115,6 @@ void audio_play(AudioSetting* setting, XAudio2Setting* api_setting) {
     }
 
     api_setting->source_voice->Start(0, XAUDIO2_COMMIT_NOW);
-    setting->sample_index = 0;
 }
 
 inline
@@ -194,9 +193,6 @@ void audio_play_buffer(AudioSetting* setting, XAudio2Setting* api_setting) {
     }
 
     ++setting->sample_output;
-
-    // @performance Why do I even need this?
-    //setting->sample_index += setting->sample_buffer_size / setting->sample_size;
     setting->sample_buffer_size = 0;
 }