From b916506f899412410c3d2cb4a7616f075db58780 Mon Sep 17 00:00:00 2001 From: Dennis Eichhorn Date: Tue, 24 Dec 2024 02:57:40 +0100 Subject: [PATCH] sound kinda working again but a choppy, probably an interval issue or smaple index issue --- audio/Audio.cpp | 32 +- audio/AudioMixer.h | 7 +- audio/AudioSetting.h | 3 + audio/Qoa.h | 436 ++++++++++++++++++++++++++ audio/QoaSimd.h | 470 +++++++++++++++++++++++++++++ image/Image.cpp | 41 +-- image/Qoi.h | 411 +++++++++++++++---------- platform/win32/FileUtils.cpp | 1 + platform/win32/audio/DirectSound.h | 6 +- platform/win32/audio/XAudio2.h | 4 - 10 files changed, 1219 insertions(+), 192 deletions(-) create mode 100644 audio/Qoa.h create mode 100644 audio/QoaSimd.h diff --git a/audio/Audio.cpp b/audio/Audio.cpp index 48e7b38..13604e4 100644 --- a/audio/Audio.cpp +++ b/audio/Audio.cpp @@ -44,8 +44,11 @@ int32 audio_data_size(const Audio* audio) ); } -int32 audio_from_data(const byte* data, Audio* audio) +inline +uint32 audio_header_from_data(const byte* data, Audio* audio) { + const byte* start = data; + audio->sample_rate = SWAP_ENDIAN_LITTLE(*((uint16 *) data)); data += sizeof(audio->sample_rate); @@ -60,14 +63,14 @@ int32 audio_from_data(const byte* data, Audio* audio) audio->size = SWAP_ENDIAN_LITTLE(*((uint32 *) data)); data += sizeof(audio->size); - memcpy(audio->data, data, audio->size); - data += audio->size; - - return audio_data_size(audio); + return (int32) (data - start); } -int32 audio_to_data(const Audio* audio, byte* data) +inline +uint32 audio_header_to_data(const Audio* audio, byte* data) { + byte* start = data; + *((uint16 *) data) = SWAP_ENDIAN_LITTLE(audio->sample_rate); data += sizeof(audio->sample_rate); @@ -78,6 +81,23 @@ int32 audio_to_data(const Audio* audio, byte* data) *((uint32 *) data) = SWAP_ENDIAN_LITTLE(audio->size); data += sizeof(audio->size); + return (int32) (data - start); +} + +uint32 audio_from_data(const byte* data, Audio* audio) +{ + data += audio_header_from_data(data, audio); + + 
memcpy(audio->data, data, audio->size); + data += audio->size; + + return audio_data_size(audio); +} + +uint32 audio_to_data(const Audio* audio, byte* data) +{ + data += audio_header_to_data(audio, data); + memcpy(data, audio->data, audio->size); data += audio->size; diff --git a/audio/AudioMixer.h b/audio/AudioMixer.h index 89f79d5..f4496d8 100644 --- a/audio/AudioMixer.h +++ b/audio/AudioMixer.h @@ -69,7 +69,6 @@ struct AudioInstance { enum AudioMixerState { AUDIO_MIXER_STATE_UNINITIALIZED, AUDIO_MIXER_STATE_INACTIVE, - AUDIO_MIXER_STATE_SHOULD_PLAY, AUDIO_MIXER_STATE_ACTIVE, }; @@ -100,7 +99,7 @@ struct AudioMixer { }; bool audio_mixer_is_active(AudioMixer* mixer) { - if (mixer->state_new == AUDIO_MIXER_STATE_ACTIVE + if (mixer->state_old == AUDIO_MIXER_STATE_ACTIVE && atomic_get_relaxed((int32 *) &mixer->state_new) == AUDIO_MIXER_STATE_ACTIVE ) { return true; @@ -108,14 +107,14 @@ bool audio_mixer_is_active(AudioMixer* mixer) { AudioMixerState mixer_state; if ((mixer_state = (AudioMixerState) atomic_get_relaxed((int32 *) &mixer->state_new)) != mixer->state_old) { - if (mixer_state != AUDIO_MIXER_STATE_UNINITIALIZED) { + if (mixer->state_old == AUDIO_MIXER_STATE_UNINITIALIZED) { audio_load( mixer->window, &mixer->settings, &mixer->api_setting ); - mixer_state = AUDIO_MIXER_STATE_INACTIVE; + mixer->state_old = AUDIO_MIXER_STATE_INACTIVE; } if (mixer_state == AUDIO_MIXER_STATE_ACTIVE) { diff --git a/audio/AudioSetting.h b/audio/AudioSetting.h index f1a3985..b761c8b 100644 --- a/audio/AudioSetting.h +++ b/audio/AudioSetting.h @@ -22,6 +22,9 @@ struct AudioSetting { // usually 48000 or 44100 uint16 sample_rate; + // This sample index is used to calculate the position in a ring buffer + uint16 sample_index; + // bytes per bloc // channel count * bit // usually 2 * 16 = 4 diff --git a/audio/Qoa.h b/audio/Qoa.h new file mode 100644 index 0000000..496e913 --- /dev/null +++ b/audio/Qoa.h @@ -0,0 +1,436 @@ +/** + * Jingga + * + * @copyright 2023, Dominic Szablewski - 
https://phoboslab.org + * @copyright Jingga + * @license OMS License 2.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef TOS_AUDIO_QOA_H +#define TOS_AUDIO_QOA_H + +#include "../stdlib/Types.h" +#include "../utils/EndianUtils.h" +#include "../audio/Audio.cpp" + +#define QOA_SLICE_LEN 20 +#define QOA_SLICES_PER_FRAME 256 +#define QOA_FRAME_LEN (QOA_SLICES_PER_FRAME * QOA_SLICE_LEN) +#define QOA_LMS_LEN 4 +#define QOA_MAX_CHANNELS 8 + +#define QOA_FRAME_SIZE(channels, slices) (4 + QOA_LMS_LEN * 4 * (channels) + 8 * (slices) * (channels)) + +struct alignas(16) QoaLms { + int32 history[QOA_LMS_LEN]; + int32 weights[QOA_LMS_LEN]; +}; + +/* +The quant_tab provides an index into the dequant_tab for residuals in the +range of -8 .. 8. It maps this range to just 3bits and becomes less accurate at +the higher end. Note that the residual zero is identical to the lowest positive +value. This is mostly fine, since the qoa_div() function always rounds away +from zero. +*/ +static const int32 qoa_quant_tab[17] = { + 7, 7, 7, 5, 5, 3, 3, 1, /* -8..-1 */ + 0, /* 0 */ + 0, 2, 2, 4, 4, 6, 6, 6 /* 1.. 8 */ +}; + +/* +We have 16 different scalefactors. Like the quantized residuals these become +less accurate at the higher end. In theory, the highest scalefactor that we +would need to encode the highest 16bit residual is (2**16)/8 = 8192. However we +rely on the LMS filter to predict samples accurately enough that a maximum +residual of one quarter of the 16 bit range is sufficient. I.e. with the +scalefactor 2048 times the quant range of 8 we can encode residuals up to 2**14. + +The scalefactor values are computed as: +scalefactor_tab[s] <- round(pow(s + 1, 2.75)) +*/ +static const int32 qoa_scalefactor_tab[16] = { + 1, 7, 21, 45, 84, 138, 211, 304, 421, 562, 731, 928, 1157, 1419, 1715, 2048 +}; + + +/* +The reciprocal_tab maps each of the 16 scalefactors to their rounded +reciprocals 1/scalefactor. 
This allows us to calculate the scaled residuals in +the encoder with just one multiplication instead of an expensive division. We +do this in .16 fixed point with integers, instead of floats. + +The reciprocal_tab is computed as: +reciprocal_tab[s] <- ((1<<16) + scalefactor_tab[s] - 1) / scalefactor_tab[s] +*/ +static const int32 qoa_reciprocal_tab[16] = { + 65536, 9363, 3121, 1457, 781, 475, 311, 216, 156, 117, 90, 71, 57, 47, 39, 32 +}; + +/* +The dequant_tab maps each of the scalefactors and quantized residuals to +their unscaled & dequantized version. + +Since qoa_div rounds away from the zero, the smallest entries are mapped to 3/4 +instead of 1. The dequant_tab assumes the following dequantized values for each +of the quant_tab indices and is computed as: +float dqt[8] = {0.75, -0.75, 2.5, -2.5, 4.5, -4.5, 7, -7}; +dequant_tab[s][q] <- round_ties_away_from_zero(scalefactor_tab[s] * dqt[q]) + +The rounding employed here is "to nearest, ties away from zero", i.e. positive +and negative values are treated symmetrically. +*/ +static const int32 qoa_dequant_tab[16][8] = { + { 1, -1, 3, -3, 5, -5, 7, -7}, + { 5, -5, 18, -18, 32, -32, 49, -49}, + { 16, -16, 53, -53, 95, -95, 147, -147}, + { 34, -34, 113, -113, 203, -203, 315, -315}, + { 63, -63, 210, -210, 378, -378, 588, -588}, + { 104, -104, 345, -345, 621, -621, 966, -966}, + { 158, -158, 528, -528, 950, -950, 1477, -1477}, + { 228, -228, 760, -760, 1368, -1368, 2128, -2128}, + { 316, -316, 1053, -1053, 1895, -1895, 2947, -2947}, + { 422, -422, 1405, -1405, 2529, -2529, 3934, -3934}, + { 548, -548, 1828, -1828, 3290, -3290, 5117, -5117}, + { 696, -696, 2320, -2320, 4176, -4176, 6496, -6496}, + { 868, -868, 2893, -2893, 5207, -5207, 8099, -8099}, + {1064, -1064, 3548, -3548, 6386, -6386, 9933, -9933}, + {1286, -1286, 4288, -4288, 7718, -7718, 12005, -12005}, + {1536, -1536, 5120, -5120, 9216, -9216, 14336, -14336}, +}; + + +/* +The Least Mean Squares Filter is the heart of QOA. 
It predicts the next +sample based on the previous 4 reconstructed samples. It does so by continuously +adjusting 4 weights based on the residual of the previous prediction. + +The next sample is predicted as the sum of (weight[i] * history[i]). + +The adjustment of the weights is done with a "Sign-Sign-LMS" that adds or +subtracts the residual to each weight, based on the corresponding sample from +the history. This, surprisingly, is sufficient to get worthwhile predictions. + +This is all done with fixed point integers. Hence the right-shifts when updating +the weights and calculating the prediction. +*/ +// @performance Depending on context most likely SIMDable +static inline +int32 qoa_lms_predict(QoaLms* lms) +{ + int32 prediction = 0; + for (int32 i = 0; i < QOA_LMS_LEN; ++i) { + prediction += lms->weights[i] * lms->history[i]; + } + + return prediction >> 13; +} + +// @performance Depending on context most likely SIMDable +static inline +void qoa_lms_update(QoaLms* lms, int32 sample, int32 residual) { + int32 delta = residual >> 4; + + lms->weights[0] += lms->history[0] < 0 ? -delta : delta; + for (int32 i = 0; i < QOA_LMS_LEN - 1; ++i) { + lms->history[i] = lms->history[i + 1]; + lms->weights[i + 1] += lms->history[i + 1] < 0 ? -delta : delta; + } + lms->weights[QOA_LMS_LEN - 1] += lms->history[QOA_LMS_LEN - 1] < 0 ? -delta : delta; + lms->history[QOA_LMS_LEN - 1] = sample; +} + +/* +qoa_div() implements a rounding division, but avoids rounding to zero for +small numbers. E.g. 0.1 will be rounded to 1. Note that 0 itself still +returns as 0, which is handled in the qoa_quant_tab[]. +qoa_div() takes an index into the .16 fixed point qoa_reciprocal_tab as an +argument, so it can do the division with a cheaper integer multiplication. 
+*/ +static inline +int32 qoa_div(int32 v, int32 scalefactor) { + int32 reciprocal = qoa_reciprocal_tab[scalefactor]; + int32 n = (v * reciprocal + (1 << 15)) >> 16; + + /* round away from 0 */ + n = n + ((v > 0) - (v < 0)) - ((n > 0) - (n < 0)); + + return n; +} + +static inline +int32 qoa_clamp(int32 v, int32 min, int32 max) { + if (v < min) { return min; } + if (v > max) { return max; } + return v; +} + +/* +This specialized clamp function for the signed 16 bit range improves decode +performance quite a bit. The extra if() statement works nicely with the CPUs +branch prediction as this branch is rarely taken. +*/ +static inline +int32 qoa_clamp_s16(int32 v) { + if ((uint32) (v + 32768) > 65535) { + if (v < -32768) { return -32768; } + if (v > 32767) { return 32767; } + } + + return v; +} + +uint32 qoa_encode_frame(const int16* sample_data, int32 channels, uint32 frame_samples, QoaLms* lms, byte* bytes) +{ + byte* start = bytes; + + int32 prev_scalefactor[QOA_MAX_CHANNELS] = {0}; + + // Write the frame header + *((uint32 *) bytes) = SWAP_ENDIAN_LITTLE(frame_samples); + bytes += sizeof(frame_samples); + + // @performance SIMDable + for (uint32 c = 0; c < channels; ++c) { + // Write the current LMS state + uint64 history = 0; + uint64 weights = 0; + for (int32 i = 0; i < QOA_LMS_LEN; ++i) { + history = (history << 16) | (lms[c].history[i] & 0xffff); + weights = (weights << 16) | (lms[c].weights[i] & 0xffff); + } + + *((uint64 *) bytes) = SWAP_ENDIAN_LITTLE(history); + bytes += sizeof(history); + + *((uint64 *) bytes) = SWAP_ENDIAN_LITTLE(weights); + bytes += sizeof(weights); + } + + /* + We encode all samples with the channels interleaved on a slice level. + E.g. for stereo: (ch-0, slice 0), (ch 1, slice 0), (ch 0, slice 1), ... 
+ */ + for (uint32 sample_index = 0; sample_index < frame_samples; sample_index += QOA_SLICE_LEN) { + // @performance SIMDable + for (uint32 c = 0; c < channels; ++c) { + int32 slice_len = qoa_clamp(QOA_SLICE_LEN, 0, frame_samples - sample_index); + int32 slice_start = sample_index * channels + c; + int32 slice_end = (sample_index + slice_len) * channels + c; + + /* + Brute-force search for the best scalefactor. Just go through all + 16 scalefactors, encode all samples for the current slice and + measure the total squared error. + */ + uint64 best_rank = -1; + uint64 best_slice = 0; + QoaLms best_lms; + int32 best_scalefactor = 0; + + for (int32 sfi = 0; sfi < 16; ++sfi) { + /* + There is a strong correlation between the scalefactors of + neighboring slices. As an optimization, start testing + the best scalefactor of the previous slice first. + */ + int32 scalefactor = (sfi + prev_scalefactor[c]) % 16; + + /* + We have to reset the LMS state to the last known good one + before trying each scalefactor, as each pass updates the LMS + state when encoding. + */ + QoaLms lms_temp = lms[c]; + uint64 slice = scalefactor; + uint64 current_rank = 0; + + for (int32 si = slice_start; si < slice_end; si += channels) { + int32 sample = sample_data[si]; + int32 predicted = qoa_lms_predict(&lms_temp); + + int32 residual = sample - predicted; + int32 scaled = qoa_div(residual, scalefactor); + int32 clamped = qoa_clamp(scaled, -8, 8); + int32 quantized = qoa_quant_tab[clamped + 8]; + int32 dequantized = qoa_dequant_tab[scalefactor][quantized]; + int32 reconstructed = qoa_clamp_s16(predicted + dequantized); + + /* + If the weights have grown too large, we introduce a penalty + here. 
This prevents pops/clicks in certain problem cases + */ + int32 weights_penalty = (( + lms_temp.weights[0] * lms_temp.weights[0] + + lms_temp.weights[1] * lms_temp.weights[1] + + lms_temp.weights[2] * lms_temp.weights[2] + + lms_temp.weights[3] * lms_temp.weights[3] + ) >> 18) - 0x8ff; + + if (weights_penalty < 0) { + weights_penalty = 0; + } + + int64 error = (sample - reconstructed); + uint64 error_sq = error * error; + + current_rank += error_sq + weights_penalty * weights_penalty; + if (current_rank > best_rank) { + break; + } + + qoa_lms_update(&lms_temp, reconstructed, dequantized); + slice = (slice << 3) | quantized; + } + + if (current_rank < best_rank) { + best_rank = current_rank; + best_slice = slice; + best_lms = lms_temp; + best_scalefactor = scalefactor; + } + } + + prev_scalefactor[c] = best_scalefactor; + + lms[c] = best_lms; + + /* + If this slice was shorter than QOA_SLICE_LEN, we have to left- + shift all encoded data, to ensure the rightmost bits are the empty + ones. This should only happen in the last frame of a file as all + slices are completely filled otherwise. + */ + best_slice <<= (QOA_SLICE_LEN - slice_len) * 3; + + *((uint64 *) bytes) = SWAP_ENDIAN_LITTLE(best_slice); + bytes += sizeof(best_slice); + } + } + + return (uint32) (bytes - start); +} + +uint32 qoa_encode(const Audio* audio, byte* data) { + byte* start = data; + + /* Calculate the encoded size and allocate */ + uint32 sample_count = audio->size / (audio->channels * audio->bloc_size); + uint32 num_frames = (sample_count + QOA_FRAME_LEN - 1) / QOA_FRAME_LEN; + uint32 num_slices = (sample_count + QOA_SLICE_LEN - 1) / QOA_SLICE_LEN; + + QoaLms lms[QOA_MAX_CHANNELS]; + for (int32 i = 0; i < audio->channels; ++i) { + /* + Set the initial LMS weights to {0, 0, -1, 2}. This helps with the + prediction of the first few ms of a file. 
+ */ + lms[i].weights[0] = 0; + lms[i].weights[1] = 0; + lms[i].weights[2] = -(1 << 13); + lms[i].weights[3] = (1 << 14); + + // Explicitly set the history samples to 0, as we might have some garbage in there. + memset(lms[i].history, 0, QOA_LMS_LEN * sizeof(int32)); + } + + // Go through all frames + int32 frame_samples = QOA_FRAME_LEN; + int32 p = 0; + + for (uint32 sample_index = 0; sample_index < sample_count; sample_index += frame_samples) { + frame_samples = qoa_clamp(QOA_FRAME_LEN, 0, sample_count - sample_index); + data += qoa_encode_frame( + (int16 *) (audio->data + sample_index * audio->channels * audio->bloc_size), + audio->channels, frame_samples, lms, data + ); + } + + return (uint32) (data - start); +} + +uint32 qoa_decode_frame(const byte* bytes, int32 channels, QoaLms* lms, byte* sample_data) +{ + const byte* start = bytes; + + // Read and verify the frame header + uint32 frame_samples = SWAP_ENDIAN_LITTLE(*((uint32 *) bytes)); + bytes += sizeof(frame_samples); + + uint32 slices = (frame_samples + QOA_SLICE_LEN - 1) / QOA_SLICE_LEN; + uint32 frame_size = QOA_FRAME_SIZE(channels, slices); + uint32 data_size = frame_size - 4 - QOA_LMS_LEN * 4 * channels; + uint32 num_slices = data_size / 8; + uint32 max_total_samples = num_slices * QOA_SLICE_LEN; + + // Read the LMS state: 4 x 2 bytes history, 4 x 2 bytes weights per channel + for (uint32 c = 0; c < channels; ++c) { + uint64 history = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes)); + bytes += sizeof(history); + + uint64 weights = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes)); + bytes += sizeof(weights); + + for (int32 i = 0; i < QOA_LMS_LEN; ++i) { + lms[c].history[i] = ((int16) (history >> 48)); + history <<= 16; + + lms[c].weights[i] = ((int16) (weights >> 48)); + weights <<= 16; + } + } + + // Decode all slices for all channels in this frame + for (uint32 sample_index = 0; sample_index < frame_samples; sample_index += QOA_SLICE_LEN) { + for (uint32 c = 0; c < channels; c++) { + uint64 slice = 
SWAP_ENDIAN_LITTLE(*((uint64 *) bytes)); + bytes += sizeof(slice); + + int32 scalefactor = (slice >> 60) & 0xf; + slice <<= 4; + + int32 slice_start = sample_index * channels + c; + int32 slice_end = qoa_clamp(sample_index + QOA_SLICE_LEN, 0, frame_samples) * channels + c; + + for (int32 si = slice_start; si < slice_end; si += channels) { + int32 predicted = qoa_lms_predict(&lms[c]); + int32 quantized = (slice >> 61) & 0x7; + int32 dequantized = qoa_dequant_tab[scalefactor][quantized]; + int32 reconstructed = qoa_clamp_s16(predicted + dequantized); + + sample_data[si] = reconstructed; + slice <<= 3; + + qoa_lms_update(&lms[c], reconstructed, dequantized); + } + } + } + + return (uint32) (bytes - start); +} + + +uint32 qoa_decode(const byte* data, Audio* audio) +{ + uint32 header_length = audio_header_from_data(data, audio); + uint32 p = header_length; + uint32 frame_size; + byte* sample_ptr = audio->data; + + QoaLms lms[QOA_MAX_CHANNELS]; + + uint32 limit = 4 + QOA_LMS_LEN * 4 * audio->channels; + + do { + frame_size = qoa_decode_frame(data + p, audio->channels, lms, sample_ptr); + sample_ptr += frame_size; + p += frame_size; + } while (frame_size && p < audio->size && audio->size - p >= limit); + // @question do we really need the audio->size - p >= limit check? 
+ + return header_length + audio->size; +} + +#endif \ No newline at end of file diff --git a/audio/QoaSimd.h b/audio/QoaSimd.h new file mode 100644 index 0000000..11d408c --- /dev/null +++ b/audio/QoaSimd.h @@ -0,0 +1,470 @@ +/** + * Jingga + * + * @copyright 2023, Dominic Szablewski - https://phoboslab.org + * @copyright Jingga + * @license OMS License 2.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef TOS_AUDIO_QOA_H +#define TOS_AUDIO_QOA_H + +#include "../stdlib/Types.h" +#include "../utils/EndianUtils.h" +#include "../audio/Audio.cpp" +#include "../stdlib/simd/SIMD_I32.h" + +#define QOA_SLICE_LEN 20 +#define QOA_SLICES_PER_FRAME 256 +#define QOA_FRAME_LEN (QOA_SLICES_PER_FRAME * QOA_SLICE_LEN) +#define QOA_LMS_LEN 4 +#define QOA_MAX_CHANNELS 8 + +#define QOA_FRAME_SIZE(channels, slices) (4 + QOA_LMS_LEN * 4 * (channels) + 8 * (slices) * (channels)) + +struct QoaLms { + int32_4 history; // automatically QOA_LMS_LEN size + int32_4 weights; // automatically QOA_LMS_LEN size +}; + +/* +The quant_tab provides an index into the dequant_tab for residuals in the +range of -8 .. 8. It maps this range to just 3bits and becomes less accurate at +the higher end. Note that the residual zero is identical to the lowest positive +value. This is mostly fine, since the qoa_div() function always rounds away +from zero. +*/ +static const int32 qoa_quant_tab[17] = { + 7, 7, 7, 5, 5, 3, 3, 1, /* -8..-1 */ + 0, /* 0 */ + 0, 2, 2, 4, 4, 6, 6, 6 /* 1.. 8 */ +}; + +/* +We have 16 different scalefactors. Like the quantized residuals these become +less accurate at the higher end. In theory, the highest scalefactor that we +would need to encode the highest 16bit residual is (2**16)/8 = 8192. However we +rely on the LMS filter to predict samples accurately enough that a maximum +residual of one quarter of the 16 bit range is sufficient. I.e. with the +scalefactor 2048 times the quant range of 8 we can encode residuals up to 2**14. 
+ +The scalefactor values are computed as: +scalefactor_tab[s] <- round(pow(s + 1, 2.75)) +*/ +static const int32 qoa_scalefactor_tab[16] = { + 1, 7, 21, 45, 84, 138, 211, 304, 421, 562, 731, 928, 1157, 1419, 1715, 2048 +}; + + +/* +The reciprocal_tab maps each of the 16 scalefactors to their rounded +reciprocals 1/scalefactor. This allows us to calculate the scaled residuals in +the encoder with just one multiplication instead of an expensive division. We +do this in .16 fixed point with integers, instead of floats. + +The reciprocal_tab is computed as: +reciprocal_tab[s] <- ((1<<16) + scalefactor_tab[s] - 1) / scalefactor_tab[s] +*/ +static const int32 qoa_reciprocal_tab[16] = { + 65536, 9363, 3121, 1457, 781, 475, 311, 216, 156, 117, 90, 71, 57, 47, 39, 32 +}; + +/* +The dequant_tab maps each of the scalefactors and quantized residuals to +their unscaled & dequantized version. + +Since qoa_div rounds away from the zero, the smallest entries are mapped to 3/4 +instead of 1. The dequant_tab assumes the following dequantized values for each +of the quant_tab indices and is computed as: +float dqt[8] = {0.75, -0.75, 2.5, -2.5, 4.5, -4.5, 7, -7}; +dequant_tab[s][q] <- round_ties_away_from_zero(scalefactor_tab[s] * dqt[q]) + +The rounding employed here is "to nearest, ties away from zero", i.e. positive +and negative values are treated symmetrically. 
+*/ +static const int32 qoa_dequant_tab[16][8] = { + { 1, -1, 3, -3, 5, -5, 7, -7}, + { 5, -5, 18, -18, 32, -32, 49, -49}, + { 16, -16, 53, -53, 95, -95, 147, -147}, + { 34, -34, 113, -113, 203, -203, 315, -315}, + { 63, -63, 210, -210, 378, -378, 588, -588}, + { 104, -104, 345, -345, 621, -621, 966, -966}, + { 158, -158, 528, -528, 950, -950, 1477, -1477}, + { 228, -228, 760, -760, 1368, -1368, 2128, -2128}, + { 316, -316, 1053, -1053, 1895, -1895, 2947, -2947}, + { 422, -422, 1405, -1405, 2529, -2529, 3934, -3934}, + { 548, -548, 1828, -1828, 3290, -3290, 5117, -5117}, + { 696, -696, 2320, -2320, 4176, -4176, 6496, -6496}, + { 868, -868, 2893, -2893, 5207, -5207, 8099, -8099}, + {1064, -1064, 3548, -3548, 6386, -6386, 9933, -9933}, + {1286, -1286, 4288, -4288, 7718, -7718, 12005, -12005}, + {1536, -1536, 5120, -5120, 9216, -9216, 14336, -14336}, +}; + + +/* +The Least Mean Squares Filter is the heart of QOA. It predicts the next +sample based on the previous 4 reconstructed samples. It does so by continuously +adjusting 4 weights based on the residual of the previous prediction. + +The next sample is predicted as the sum of (weight[i] * history[i]). + +The adjustment of the weights is done with a "Sign-Sign-LMS" that adds or +subtracts the residual to each weight, based on the corresponding sample from +the history. This, surprisingly, is sufficient to get worthwhile predictions. + +This is all done with fixed point integers. Hence the right-shifts when updating +the weights and calculating the prediction. 
+*/ +static inline +int32 qoa_lms_predict(QoaLms* lms) +{ + __m128i products = _mm_mullo_epi32(lms->weights.s, lms->history.s); + __m128i sum1 = _mm_hadd_epi32(products, products); + __m128i sum2 = _mm_hadd_epi32(sum1, sum1); + + int32 prediction = _mm_cvtsi128_si32(sum2); + + return prediction >> 13; +} + +static inline +void qoa_lms_update(QoaLms* lms, int32 sample, int32 residual) { + int32 delta = residual >> 4; + + __m128i delta_vec = _mm_set1_epi32(delta); + __m128i zero_vec = _mm_setzero_si128(); + + // Calculate adjustments for weights based on the sign of history + __m128i sign_mask = _mm_cmpgt_epi32(zero_vec, lms->history.s); // history < 0 + __m128i delta_adjust = _mm_blendv_epi8(delta_vec, _mm_sub_epi32(zero_vec, delta_vec), sign_mask); + + // Update weights + lms->weights.s = _mm_add_epi32(lms->weights.s, delta_adjust); + + // Shift history left + lms->history.s = _mm_alignr_epi8(lms->history.s, lms->history.s, 4); // Shift left by 1 int32 (4 bytes) + + // Insert the new sample into the last position of history + lms->history.s = _mm_insert_epi32(lms->history.s, sample, QOA_LMS_LEN - 1); + + // Update the last weight based on the sign of the new sample + int32 sample_sign_adjust = (sample < 0) ? -delta : delta; + lms->weights.s = _mm_insert_epi32( + lms->weights.s, + _mm_extract_epi32(lms->weights.s, QOA_LMS_LEN - 1) + sample_sign_adjust, + QOA_LMS_LEN - 1 + ); +} + +/* +qoa_div() implements a rounding division, but avoids rounding to zero for +small numbers. E.g. 0.1 will be rounded to 1. Note that 0 itself still +returns as 0, which is handled in the qoa_quant_tab[]. +qoa_div() takes an index into the .16 fixed point qoa_reciprocal_tab as an +argument, so it can do the division with a cheaper integer multiplication. 
+*/ +static inline +int32 qoa_div(int32 v, int32 scalefactor) { + int32 reciprocal = qoa_reciprocal_tab[scalefactor]; + int32 n = (v * reciprocal + (1 << 15)) >> 16; + + /* round away from 0 */ + n = n + ((v > 0) - (v < 0)) - ((n > 0) - (n < 0)); + + return n; +} + +static inline +int32 qoa_clamp(int32 v, int32 min, int32 max) { + if (v < min) { return min; } + if (v > max) { return max; } + return v; +} + +/* +This specialized clamp function for the signed 16 bit range improves decode +performance quite a bit. The extra if() statement works nicely with the CPUs +branch prediction as this branch is rarely taken. +*/ +static inline +int32 qoa_clamp_s16(int32 v) { + if ((uint32) (v + 32768) > 65535) { + if (v < -32768) { return -32768; } + if (v > 32767) { return 32767; } + } + + return v; +} + +uint32 qoa_encode_frame(const int16* sample_data, int32 channels, uint32 frame_samples, QoaLms* lms, byte* bytes) +{ + byte* start = bytes; + + int32 prev_scalefactor[QOA_MAX_CHANNELS] = {0}; + + // Write the frame header + *((uint32 *) bytes) = SWAP_ENDIAN_LITTLE(frame_samples); + bytes += sizeof(frame_samples); + + for (uint32 c = 0; c < channels; ++c) { + // Load the history and weights as 16-bit masked values + __m128i history = _mm_and_si128(lms[c].history.s, _mm_set1_epi32(0xFFFF)); + __m128i weights = _mm_and_si128(lms[c].weights.s, _mm_set1_epi32(0xFFFF)); + + // Permute and pack 16-bit values into 64-bit results + __m128i packed_history = _mm_packus_epi32(history, history); // Pack 16-bit values + __m128i packed_weights = _mm_packus_epi32(weights, weights); + + // Shuffle packed values into the correct order for 64-bit construction + __m128i history_64 = _mm_packus_epi16(packed_history, packed_history); + __m128i weights_64 = _mm_packus_epi16(packed_weights, packed_weights); + + // Extract 64-bit results and swap endian if necessary + uint64 final_history = _mm_extract_epi64(history_64, 0); + uint64 final_weights = _mm_extract_epi64(weights_64, 0); + + // Store 
results with endian swap + *((uint64*) bytes) = SWAP_ENDIAN_LITTLE(final_history); + bytes += sizeof(final_history); + + *((uint64*) bytes) = SWAP_ENDIAN_LITTLE(final_weights); + bytes += sizeof(final_weights); + } + + /* + We encode all samples with the channels interleaved on a slice level. + E.g. for stereo: (ch-0, slice 0), (ch 1, slice 0), (ch 0, slice 1), ... + */ + for (uint32 sample_index = 0; sample_index < frame_samples; sample_index += QOA_SLICE_LEN) { + // @performance SIMDable + for (uint32 c = 0; c < channels; ++c) { + int32 slice_len = qoa_clamp(QOA_SLICE_LEN, 0, frame_samples - sample_index); + int32 slice_start = sample_index * channels + c; + int32 slice_end = (sample_index + slice_len) * channels + c; + + /* + Brute-force search for the best scalefactor. Just go through all + 16 scalefactors, encode all samples for the current slice and + measure the total squared error. + */ + uint64 best_rank = -1; + uint64 best_slice = 0; + QoaLms best_lms; + int32 best_scalefactor = 0; + + for (int32 sfi = 0; sfi < 16; ++sfi) { + /* + There is a strong correlation between the scalefactors of + neighboring slices. As an optimization, start testing + the best scalefactor of the previous slice first. + */ + int32 scalefactor = (sfi + prev_scalefactor[c]) % 16; + + /* + We have to reset the LMS state to the last known good one + before trying each scalefactor, as each pass updates the LMS + state when encoding. 
+ */ + QoaLms lms_temp = lms[c]; + uint64 slice = scalefactor; + uint64 current_rank = 0; + + for (int32 si = slice_start; si < slice_end; si += channels) { + int32 sample = sample_data[si]; + int32 predicted = qoa_lms_predict(&lms_temp); + + int32 residual = sample - predicted; + int32 scaled = qoa_div(residual, scalefactor); + int32 clamped = qoa_clamp(scaled, -8, 8); + int32 quantized = qoa_quant_tab[clamped + 8]; + int32 dequantized = qoa_dequant_tab[scalefactor][quantized]; + int32 reconstructed = qoa_clamp_s16(predicted + dequantized); + + /* + If the weights have grown too large, we introduce a penalty + here. This prevents pops/clicks in certain problem cases + */ + // Compute weights squared: w^2 + __m128i weights_squared = _mm_mullo_epi32(lms_temp.weights.s, lms_temp.weights.s); + + // Perform horizontal addition to sum all squared weights + __m128i sum1 = _mm_hadd_epi32(weights_squared, weights_squared); + __m128i sum2 = _mm_hadd_epi32(sum1, sum1); + + // Extract the final sum (scalar) + int32 sum_of_squares = _mm_cvtsi128_si32(sum2); + + // Apply the shift and subtraction + int32 weights_penalty = (sum_of_squares >> 18) - 0x8FF; + if (weights_penalty < 0) { + weights_penalty = 0; + } + + int64 error = (sample - reconstructed); + uint64 error_sq = error * error; + + current_rank += error_sq + weights_penalty * weights_penalty; + if (current_rank > best_rank) { + break; + } + + qoa_lms_update(&lms_temp, reconstructed, dequantized); + slice = (slice << 3) | quantized; + } + + if (current_rank < best_rank) { + best_rank = current_rank; + best_slice = slice; + best_lms = lms_temp; + best_scalefactor = scalefactor; + } + } + + prev_scalefactor[c] = best_scalefactor; + + lms[c] = best_lms; + + /* + If this slice was shorter than QOA_SLICE_LEN, we have to left- + shift all encoded data, to ensure the rightmost bits are the empty + ones. This should only happen in the last frame of a file as all + slices are completely filled otherwise. 
+ */ + best_slice <<= (QOA_SLICE_LEN - slice_len) * 3; + + *((uint64 *) bytes) = SWAP_ENDIAN_LITTLE(best_slice); + bytes += sizeof(best_slice); + } + } + + return (uint32) (bytes - start); +} + +uint32 qoa_encode(const Audio* audio, byte* data) +{ + byte* start = data; + + /* Calculate the encoded size and allocate */ + uint32 sample_count = audio->size / (audio->channels * audio->bloc_size); + uint32 num_frames = (sample_count + QOA_FRAME_LEN - 1) / QOA_FRAME_LEN; + uint32 num_slices = (sample_count + QOA_SLICE_LEN - 1) / QOA_SLICE_LEN; + + QoaLms lms[QOA_MAX_CHANNELS]; + __m128i weights_init = _mm_set_epi32(1 << 14, -(1 << 13), 0, 0); + __m128i history_init = _mm_setzero_si128(); + + for (int32 i = 0; i < audio->channels; ++i) { + /* + Set the initial LMS weights to {0, 0, -1, 2}. This helps with the + prediction of the first few ms of a file. + */ + lms[i].weights.s = weights_init; + lms[i].history.s = history_init; + } + + // Go through all frames + int32 frame_samples = QOA_FRAME_LEN; + int32 p = 0; + + for (uint32 sample_index = 0; sample_index < sample_count; sample_index += frame_samples) { + frame_samples = qoa_clamp(QOA_FRAME_LEN, 0, sample_count - sample_index); + data += qoa_encode_frame( + (int16 *) (audio->data + sample_index * audio->channels * audio->bloc_size), + audio->channels, frame_samples, lms, data + ); + } + + return (uint32) (data - start); +} + +uint32 qoa_decode_frame(const byte* bytes, int32 channels, QoaLms* lms, byte* sample_data) +{ + const byte* start = bytes; + + // Read and verify the frame header + uint32 frame_samples = SWAP_ENDIAN_LITTLE(*((uint32 *) bytes)); + bytes += sizeof(frame_samples); + + uint32 slices = (frame_samples + QOA_SLICE_LEN - 1) / QOA_SLICE_LEN; + uint32 frame_size = QOA_FRAME_SIZE(channels, slices); + uint32 data_size = frame_size - 4 - QOA_LMS_LEN * 4 * channels; + uint32 num_slices = data_size / 8; + uint32 max_total_samples = num_slices * QOA_SLICE_LEN; + + // Read the LMS state: 4 x 2 bytes history, 4 x 2 
bytes weights per channel + for (uint32 c = 0; c < channels; ++c) { + uint64 history = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes)); + bytes += sizeof(history); + + uint64 weights = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes)); + bytes += sizeof(weights); + + alignas(16) int32 history_array[4]; + alignas(16) int32 weights_array[4]; + + for (int32 i = 0; i < QOA_LMS_LEN; ++i) { + history_array[i] = ((int16) (history >> 48)); + history <<= 16; + + weights_array[i] = ((int16) (weights >> 48)); + weights <<= 16; + } + + lms[c].history.s = _mm_set_epi32(history_array[3], history_array[2], history_array[1], history_array[0]); + lms[c].weights.s = _mm_set_epi32(weights_array[3], weights_array[2], weights_array[1], weights_array[0]); + } + + // Decode all slices for all channels in this frame + for (uint32 sample_index = 0; sample_index < frame_samples; sample_index += QOA_SLICE_LEN) { + for (uint32 c = 0; c < channels; c++) { + uint64 slice = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes)); + bytes += sizeof(slice); + + int32 scalefactor = (slice >> 60) & 0xf; + slice <<= 4; + + int32 slice_start = sample_index * channels + c; + int32 slice_end = qoa_clamp(sample_index + QOA_SLICE_LEN, 0, frame_samples) * channels + c; + + for (int32 si = slice_start; si < slice_end; si += channels) { + int32 predicted = qoa_lms_predict(&lms[c]); + int32 quantized = (slice >> 61) & 0x7; + int32 dequantized = qoa_dequant_tab[scalefactor][quantized]; + int32 reconstructed = qoa_clamp_s16(predicted + dequantized); + + sample_data[si] = reconstructed; + slice <<= 3; + + qoa_lms_update(&lms[c], reconstructed, dequantized); + } + } + } + + return (uint32) (bytes - start); +} + + +uint32 qoa_decode(const byte* data, Audio* audio) +{ + uint32 header_length = audio_header_from_data(data, audio); + uint32 p = header_length; + uint32 frame_size; + byte* sample_ptr = audio->data; + + QoaLms lms[QOA_MAX_CHANNELS]; + + uint32 limit = 4 + QOA_LMS_LEN * 4 * audio->channels; + + do { + frame_size = qoa_decode_frame(data + 
p, audio->channels, lms, sample_ptr); + sample_ptr += frame_size; + p += frame_size; + } while (frame_size && audio->size - p >= limit); + // @question do we really need the audio->size - p >= limit check or would p < audio->size be sufficient? + + return header_length + audio->size; +} + +#endif \ No newline at end of file diff --git a/image/Image.cpp b/image/Image.cpp index 4f14499..b8505ae 100644 --- a/image/Image.cpp +++ b/image/Image.cpp @@ -81,25 +81,26 @@ int32 image_data_size(const Image* image) + sizeof(image->image_settings); } -int32 image_header_from_data(const byte* data, Image* image) +inline +uint32 image_header_from_data(const byte* data, Image* image) { - const byte* pos = data; + const byte* start = data; - image->width = SWAP_ENDIAN_LITTLE(*((uint32 *) pos)); - pos += sizeof(image->width); + image->width = SWAP_ENDIAN_LITTLE(*((uint32 *) data)); + data += sizeof(image->width); - image->height = SWAP_ENDIAN_LITTLE(*((uint32 *) pos)); - pos += sizeof(image->height); + image->height = SWAP_ENDIAN_LITTLE(*((uint32 *) data)); + data += sizeof(image->height); image->pixel_count = image->width * image->height; - image->image_settings = *pos; - pos += sizeof(image->image_settings); + image->image_settings = *data; + data += sizeof(image->image_settings); - return (int32) (pos - data); + return (int32) (data - start); } -int32 image_from_data(const byte* data, Image* image) +uint32 image_from_data(const byte* data, Image* image) { const byte* pos = data; pos += image_header_from_data(data, image); @@ -112,23 +113,23 @@ int32 image_from_data(const byte* data, Image* image) } inline -int32 image_header_to_data(const Image* image, byte* data) +uint32 image_header_to_data(const Image* image, byte* data) { - byte* pos = data; + byte* start = data; - *((uint32 *) pos) = SWAP_ENDIAN_LITTLE(image->width); - pos += sizeof(image->width); + *((uint32 *) data) = SWAP_ENDIAN_LITTLE(image->width); + data += sizeof(image->width); - *((uint32 *) pos) = 
SWAP_ENDIAN_LITTLE(image->height); - pos += sizeof(image->height); + *((uint32 *) data) = SWAP_ENDIAN_LITTLE(image->height); + data += sizeof(image->height); - *pos = image->image_settings; - pos += sizeof(image->image_settings); + *data = image->image_settings; + data += sizeof(image->image_settings); - return (int32) (pos - data); + return (int32) (data - start); } -int32 image_to_data(const Image* image, byte* data) +uint32 image_to_data(const Image* image, byte* data) { byte* pos = data; pos += image_header_to_data(image, data); diff --git a/image/Qoi.h b/image/Qoi.h index 9876aae..044e18a 100644 --- a/image/Qoi.h +++ b/image/Qoi.h @@ -1,6 +1,7 @@ /** * Jingga * + * @copyright 2021, Dominic Szablewski - https://phoboslab.org * @copyright Jingga * @license OMS License 2.0 * @version 1.0.0 @@ -13,20 +14,36 @@ #include #include "Image.cpp" -#define QOI_OP_INDEX 0b00000000 -#define QOI_OP_DIFF 0b01000000 -#define QOI_OP_LUMA 0b10000000 -#define QOI_OP_RUN 0b11000000 // @todo There is a HUGE step from here to QOI_OP_RGB this leaves room for more cases or using this data -#define QOI_OP_RGB 0b11111110 -#define QOI_OP_RGBA 0b11111111 -#define QOI_MASK_2 0b11000000 +#define QOI_OP_LUMA555 0b00000000 +#define QOI_OP_LUMA222 0b10000000 +#define QOI_OP_LUMA777 0b01000000 -#define QOI_COLOR_HASH(color) (color.r * 3 + color.g * 5 + color.b * 7 + color.a * 11) -#define QOI_COLOR_HASH_2(color) ((((uint32)(color)) * 0x9E3779B1U) >> 26) +#define QOI_OP_RUN 0b11000000 + +// These definitions are important and impact how large our run can be: +// Run has 6 free bits -> 2^6 = 64 +// However, the first bit is used to indicate RGB or RGBA -> 64 - 2^1 = 62 +#define QOI_OP_RGB 0b11111110 +#define QOI_OP_RGBA 0b11111111 + +#define QOI_MASK_1 0b10000000 +#define QOI_MASK_2 0b11000000 +#define QOI_MASK_3 0b11100000 + +// @performance I feel like there is some more optimization possible by handling fully transparent pixels in a special way +// @todo We need to implement monochrome 
handling, which is very important for game assets that often use monochrome assets for all kinds of things (e.g. translucency) + +const byte optable[128] = { + 0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3 +}; int32 qoi_encode(const Image* image, byte* data) { - int32 p = image_header_to_data(image, data); + byte* start = data; + data += image_header_to_data(image, data); v4_byte index[64]; memset(index, 0, sizeof(index)); @@ -34,173 +51,257 @@ int32 qoi_encode(const Image* image, byte* data) v4_byte px_prev = {0, 0, 0, 255}; v4_byte px = px_prev; - int32 channels = (image->image_settings & IMAGE_SETTING_CHANNEL_COUNT); + const int32 channels = (image->image_settings & IMAGE_SETTING_CHANNEL_COUNT); - // Only works with 1 byte channel size -> we don't have to multiply channel count with channel size int32 px_len = image->width * image->height * channels; int32 px_end = px_len - channels; int32 run = 0; - for (int32 px_pos = 0; px_pos < px_len; px_pos += channels) { - // @performance could We just use int32 type cast? 
The problem would be the last pixel which would be out of bounds by 1 byte - memcpy(&px, &image->pixels[px_pos], channels * sizeof(byte)); + if (channels == 4) { + for (int32 px_pos = 0; px_pos < px_len; px_pos += 4) { + px.val = SWAP_ENDIAN_LITTLE(*((uint32 *) (image->pixels + px_pos))); - if (px.val == px_prev.val) { - ++run; - if (run == 62 || px_pos == px_end) { - data[p++] = (byte) (QOI_OP_RUN | (run - 1)); - run = 0; - } - } else { - if (run) { - data[p++] = (byte) (QOI_OP_RUN | (run - 1)); - run = 0; - } + while(px.val == px_prev.val) { + ++run; + if(px_pos == px_end) { + *data++ = (byte) (QOI_OP_RUN | (run - 1)); + px_pos = px_len; - int32 index_pos = QOI_COLOR_HASH(px) % 64; - //int32 index_pos = QOI_COLOR_HASH_2(px); - - if (index[index_pos].val == px.val) { - data[p++] = (byte) (QOI_OP_INDEX | index_pos); - } else { - index[index_pos] = px; - - if (px.a == px_prev.a) { - signed char vr = px.r - px_prev.r; - signed char vg = px.g - px_prev.g; - signed char vb = px.b - px_prev.b; - - signed char vg_r = vr - vg; - signed char vg_b = vb - vg; - - if (vr > -3 && vr < 2 - && vg > -3 && vg < 2 - && vb > -3 && vb < 2 - ) { - data[p++] = QOI_OP_DIFF | (vr + 2) << 4 | (vg + 2) << 2 | (vb + 2); - } else if (vg_r > -9 && vg_r < 8 - && vg > -33 && vg < 32 - && vg_b > -9 && vg_b < 8 - ) { - data[p++] = QOI_OP_LUMA | (vg + 32); - data[p++] = (vg_r + 8) << 4 | (vg_b + 8); - } else { - data[p++] = QOI_OP_RGB; - data[p++] = px.r; - data[p++] = px.g; - data[p++] = px.b; - } - } else { - data[p++] = QOI_OP_RGBA; - *((uint32 *) &data[p]) = SWAP_ENDIAN_LITTLE(px.val); - p += 4; + break; + } else if (run == 62) { + *data++ = (byte) (QOI_OP_RUN | (run - 1)); + run = 0; } + + px_pos += 4; + px.val = SWAP_ENDIAN_LITTLE(*((uint32 *) (image->pixels + px_pos))); + } + + if (run) { + *data++ = (byte) (QOI_OP_RUN | (run - 1)); + run = 0; + } + + if(px.a != px_prev.a){ + *data++ = QOI_OP_RGBA; + *data++ = px.a; + } + + signed char vr = px.r - px_prev.r; + signed char vg = px.g - 
px_prev.g; + signed char vb = px.b - px_prev.b; + + signed char vg_r = vr - vg; + signed char vg_b = vb - vg; + + byte ar = vg_r < 0 ? -vg_r - 1 : vg_r; + byte ag = vg < 0 ? -vg - 1 : vg; + byte ab = vg_b < 0 ? -vg_b - 1 : vg_b; + byte argb = ar | ag | ab; + + switch(optable[argb]) { + case 0: + *data++ = QOI_OP_LUMA222 | ((vg_r + 2) << 4) | ((vg_b + 2) << 2) | (vg + 2); + break; + case 1: + *data++ = QOI_OP_LUMA555 | ((vg_b + 16) << 2) | ((vg_r + 16) >> 3); + *data++ = (((vg_r + 16) & 7) << 5) | (vg + 16); + break; + case 2: + *data++ = QOI_OP_LUMA777 | ((vg_b + 64) >> 2); + *data++ = (((vg_b + 64) & 3) << 6) | ((vg_r + 64) >> 1); + *data++ = (((vg_r + 64) & 1) << 7) | (vg + 64); + break; + case 3: + *data++ = QOI_OP_RGB; + *data++ = px.r; + *data++ = px.g; + *data++ = px.b; + break; + } + + px_prev = px; + } + } else { + for (int32 px_pos = 0; px_pos < px_len; px_pos += 3) { + px.r = image->pixels[px_pos]; + px.g = image->pixels[px_pos + 1]; + px.b = image->pixels[px_pos + 2]; + + while(px.val == px_prev.val) { + ++run; + if(px_pos == px_end) { + *data++ = (byte) (QOI_OP_RUN | (run - 1)); + px_pos = px_len; + + break; + } else if (run == 62) { + *data++ = (byte) (QOI_OP_RUN | (run - 1)); + run = 0; + } + + px_pos += 3; + px.r = image->pixels[px_pos]; + px.g = image->pixels[px_pos + 1]; + px.b = image->pixels[px_pos + 2]; + } + + if (run) { + *data++ = (byte) (QOI_OP_RUN | (run - 1)); + run = 0; + } + + signed char vr = px.r - px_prev.r; + signed char vg = px.g - px_prev.g; + signed char vb = px.b - px_prev.b; + + signed char vg_r = vr - vg; + signed char vg_b = vb - vg; + + byte ar = vg_r < 0 ? -vg_r - 1 : vg_r; + byte ag = vg < 0 ? -vg - 1 : vg; + byte ab = vg_b < 0 ? 
-vg_b - 1 : vg_b; + byte argb = ar | ag | ab; + + switch(optable[argb]) { + case 0: + *data++ = QOI_OP_LUMA222 | ((vg_r + 2) << 4) | ((vg_b + 2) << 2) | (vg + 2); + break; + case 1: + *data++ = QOI_OP_LUMA555 | ((vg_b + 16) << 2) | ((vg_r + 16) >> 3); + *data++ = (((vg_r + 16) & 7) << 5) | (vg + 16); + break; + case 2: + *data++ = QOI_OP_LUMA777 | ((vg_b + 64) >> 2); + *data++ = (((vg_b + 64) & 3) << 6) | ((vg_r + 64) >> 1); + *data++ = (((vg_r + 64) & 1) << 7) | (vg + 64); + break; + case 3: + *data++ = QOI_OP_RGB; + *data++ = px.r; + *data++ = px.g; + *data++ = px.b; + break; + } + + px_prev = px; + } + } + + return (int32) (data - start); +} + +int32 qoi_decode_4(const byte* data, Image* image) +{ + uint32 px_len = image->width * image->height * 4; + v4_byte px = {0, 0, 0, 255}; + v4_byte index[64] = {0}; + int32 run = 0; + + for (int32 px_pos = 0; px_pos < px_len; px_pos += 4) { + if (run > 0) { + --run; + } else { + OP_RGBA_GOTO: + byte b1 = *data++; + + if (b1 == QOI_OP_RGB) { + px.r = *data++; + px.g = *data++; + px.b = *data++; + } else if (b1 == QOI_OP_RGBA) { + px.a = *data++; + goto OP_RGBA_GOTO; + } else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA222) { + byte vg = (b1 & 3) - 2; + px.r += vg - 2 + ((b1 >> 4) & 3); + px.g += vg; + px.b += vg - 2 + ((b1 >> 2) & 3); + } else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA555) { + byte b2 = *data++; + byte vg = (b2 & 31) - 16; + px.r += vg - 16 + (((b1 & 3) << 3) | (b2 >> 5)); + px.g += vg; + px.b += vg - 16 + ((b1 >> 2) & 31); + } else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA777) { + byte b2 = *data++; + byte b3 = *data++; + byte vg = (b3 & 0x7f) - 64; + px.r += vg - 64 + ((b2 & 0x3f) << 1) + (b3 >> 7); + px.g += vg; + px.b += vg - 64 + ((b1 & 0x1f) << 2) + (b2 >> 6); + } else if ((b1 & QOI_MASK_2) == QOI_OP_RUN) { + run = (b1 & 0x3f); } } - px_prev = px; + *((uint32 *) &image->pixels[px_pos]) = SWAP_ENDIAN_LITTLE(px.val); } - return p; + return px_len; } -int32 qoi_decode(const byte* data, Image* image, int32 steps = 8) +int32 
qoi_decode_3(const byte* data, Image* image) { - int32 header_length = image_header_from_data(data, image); - int32 p = header_length; - - int32 channels = (image->image_settings & IMAGE_SETTING_CHANNEL_COUNT); - uint32 px_len = image->width * image->height * channels; - - v4_byte px = {0, 0, 0, 255}; - - v4_byte index[64]; - memset(index, 0, sizeof(index)); - + uint32 px_len = image->width * image->height * 3; + v3_byte px = {0, 0, 0}; int32 run = 0; - for (uint32 px_pos = 0; px_pos < px_len; px_pos += channels) { - int32 b1 = data[p++]; + for (int32 px_pos = 0; px_pos < px_len; px_pos += 3) { + if (run > 0) { + --run; + } else { + byte b1 = *data++; - if (b1 == QOI_OP_RGB) { - px.r = data[p++]; - px.g = data[p++]; - px.b = data[p++]; - } else if (b1 == QOI_OP_RGBA) { - px.val = SWAP_ENDIAN_LITTLE(*((uint32 *) &data[p])); - p += 4; - } else if ((b1 & QOI_MASK_2) == QOI_OP_INDEX) { - px = index[b1]; - } else if ((b1 & QOI_MASK_2) == QOI_OP_DIFF) { - px.r += ((b1 >> 4) & 0x03) - 2; - px.g += ((b1 >> 2) & 0x03) - 2; - px.b += ( b1 & 0x03) - 2; - } else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA) { - int32 b2 = data[p++]; - byte vg = (b1 & 0x3f) - 32; - px.r += vg - 8 + ((b2 >> 4) & 0x0f); - px.g += vg; - px.b += vg - 8 + (b2 & 0x0f); - } else if ((b1 & QOI_MASK_2) == QOI_OP_RUN) { - run = (b1 & 0x3f); + if (b1 == QOI_OP_RGB) { + px.r = *data++; + px.g = *data++; + px.b = *data++; + } else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA222) { + byte vg = (b1 & 3) - 2; + px.r += vg - 2 + ((b1 >> 4) & 3); + px.g += vg; + px.b += vg - 2 + ((b1 >> 2) & 3); + } else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA555) { + byte b2 = *data++; + byte vg = (b2 & 31) - 16; + px.r += vg - 16 + (((b1 & 3) << 3) | (b2 >> 5)); + px.g += vg; + px.b += vg - 16 + ((b1 >> 2) & 31); + } else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA777) { + byte b2 = *data++; + byte b3 = *data++; + byte vg = (b3 & 0x7f) - 64; + px.r += vg - 64 + ((b2 & 0x3f) << 1) + (b3 >> 7); + px.g += vg; + px.b += vg - 64 + ((b1 & 0x1f) << 2) + (b2 
>> 6); + } else if ((b1 & QOI_MASK_2) == QOI_OP_RUN) { + run = (b1 & 0x3f); + } + } - if (channels == 4) { - uint32 px_little_endian = SWAP_ENDIAN_LITTLE(px.val); - int32 pixel_step_size = steps * 4; - int32 i = 0; - - // @performance Implement for ARM - #if ARM - #else - if (steps == 16) { - __m512i simd_value = _mm512_set1_epi32(px_little_endian); - for(; i <= run - steps; i += steps, px_pos += pixel_step_size) { - _mm512_storeu_si512((__m512i *) &image->pixels[px_pos], simd_value); - } - } else if (steps >= 8) { - __m256i simd_value = _mm256_set1_epi32(px_little_endian); - for (; i <= run - steps; i += steps, px_pos += pixel_step_size) { - _mm256_storeu_si256((__m256i *) &image->pixels[px_pos], simd_value); - } - } else if (steps >= 4) { - __m128i simd_value = _mm_set1_epi32(px_little_endian); - for(; i <= run - steps; i += steps, px_pos += pixel_step_size) { - _mm_storeu_si128((__m128i *) &image->pixels[px_pos], simd_value); - } - } - #endif - - for (; i < run; ++i) { - *((uint32 *) &image->pixels[px_pos]) = px_little_endian; - px_pos += channels; - } - } else if (channels == 3) { - for (int32 i = 0; i < run; ++i) { - image->pixels[px_pos++] = px.r; - image->pixels[px_pos++] = px.g; - image->pixels[px_pos++] = px.b; - } - } else if (channels == 1) { - memset(&image->pixels[px_pos], px.r, run * sizeof(byte)); - px_pos += run; - } - - // Correction, since the loop increments by channels count as well - px_pos -= channels; - - index[QOI_COLOR_HASH(px) % 64] = px; - //index[QOI_COLOR_HASH_2(px)] = px; - - continue; - } - - index[QOI_COLOR_HASH(px) % 64] = px; - //index[QOI_COLOR_HASH_2(px)] = px; - - memcpy(&image->pixels[px_pos], &px, channels * sizeof(byte)); + image->pixels[px_pos] = px.r; + image->pixels[px_pos + 1] = px.g; + image->pixels[px_pos + 2] = px.b; } - return header_length + px_len; + return px_len; +} + +int32 qoi_decode(const byte* data, Image* image) +{ + int32 header_length = image_header_from_data(data, image); + + const int32 channels = 
(image->image_settings & IMAGE_SETTING_CHANNEL_COUNT); + + int32 len = 0; + if (channels == 4) { + len = qoi_decode_4(data + header_length, image); + } else if (channels == 3) { + len = qoi_decode_3(data + header_length, image); + } + + return header_length + len; } #endif \ No newline at end of file diff --git a/platform/win32/FileUtils.cpp b/platform/win32/FileUtils.cpp index 26c3bc0..9d71aa8 100644 --- a/platform/win32/FileUtils.cpp +++ b/platform/win32/FileUtils.cpp @@ -21,6 +21,7 @@ #include "../../utils/Utils.h" #include "../../utils/TestUtils.h" #include "../../memory/RingMemory.h" +#include "../../log/Log.h" typedef HANDLE FileHandle; typedef HANDLE MMFHandle; diff --git a/platform/win32/audio/DirectSound.h b/platform/win32/audio/DirectSound.h index 22ddb6e..06f1d3a 100644 --- a/platform/win32/audio/DirectSound.h +++ b/platform/win32/audio/DirectSound.h @@ -148,7 +148,7 @@ uint32 audio_buffer_fillable(const AudioSetting* setting, const DirectSoundSetti return 0; } - DWORD bytes_to_lock = setting->sample_buffer_size; + DWORD bytes_to_lock = (setting->sample_index * setting->sample_size) % setting->buffer_size; DWORD bytes_to_write = 0; DWORD target_cursor = (player_cursor + (setting->latency * setting->sample_size)) % setting->buffer_size; @@ -180,7 +180,7 @@ void audio_play_buffer(AudioSetting* setting, DirectSoundSetting* api_setting) void* region2; DWORD region2_size; - DWORD bytes_to_lock = setting->sample_buffer_size; + DWORD bytes_to_lock = (setting->sample_index * setting->sample_size) % setting->buffer_size; api_setting->secondary_buffer->Lock( bytes_to_lock, setting->sample_buffer_size, @@ -204,7 +204,7 @@ void audio_play_buffer(AudioSetting* setting, DirectSoundSetting* api_setting) } api_setting->secondary_buffer->Unlock(region1, region1_size, region2, region2_size); - + setting->sample_index += (uint16) (setting->sample_buffer_size / setting->sample_size); setting->sample_buffer_size = 0; } diff --git a/platform/win32/audio/XAudio2.h 
b/platform/win32/audio/XAudio2.h index 4190112..7d9b466 100644 --- a/platform/win32/audio/XAudio2.h +++ b/platform/win32/audio/XAudio2.h @@ -115,7 +115,6 @@ void audio_play(AudioSetting* setting, XAudio2Setting* api_setting) { } api_setting->source_voice->Start(0, XAUDIO2_COMMIT_NOW); - setting->sample_index = 0; } inline @@ -194,9 +193,6 @@ void audio_play_buffer(AudioSetting* setting, XAudio2Setting* api_setting) { } ++setting->sample_output; - - // @performance Why do I even need this? - //setting->sample_index += setting->sample_buffer_size / setting->sample_size; setting->sample_buffer_size = 0; }