sound kinda working again but a bit choppy, probably an interval issue or sample index issue

This commit is contained in:
Dennis Eichhorn 2024-12-24 02:57:40 +01:00
parent c1c028149f
commit b916506f89
10 changed files with 1219 additions and 192 deletions

View File

@ -44,8 +44,11 @@ int32 audio_data_size(const Audio* audio)
);
}
int32 audio_from_data(const byte* data, Audio* audio)
inline
uint32 audio_header_from_data(const byte* data, Audio* audio)
{
const byte* start = data;
audio->sample_rate = SWAP_ENDIAN_LITTLE(*((uint16 *) data));
data += sizeof(audio->sample_rate);
@ -60,14 +63,14 @@ int32 audio_from_data(const byte* data, Audio* audio)
audio->size = SWAP_ENDIAN_LITTLE(*((uint32 *) data));
data += sizeof(audio->size);
memcpy(audio->data, data, audio->size);
data += audio->size;
return audio_data_size(audio);
return (int32) (data - start);
}
int32 audio_to_data(const Audio* audio, byte* data)
inline
uint32 audio_header_to_data(const Audio* audio, byte* data)
{
byte* start = data;
*((uint16 *) data) = SWAP_ENDIAN_LITTLE(audio->sample_rate);
data += sizeof(audio->sample_rate);
@ -78,6 +81,23 @@ int32 audio_to_data(const Audio* audio, byte* data)
*((uint32 *) data) = SWAP_ENDIAN_LITTLE(audio->size);
data += sizeof(audio->size);
return (int32) (data - start);
}
// Deserializes an Audio object: the header fields first, then audio->size
// bytes of payload copied into audio->data.
// Returns the value of audio_data_size() for the filled object.
uint32 audio_from_data(const byte* data, Audio* audio)
{
    const byte* payload = data + audio_header_from_data(data, audio);

    memcpy(audio->data, payload, audio->size);

    return audio_data_size(audio);
}
uint32 audio_to_data(const Audio* audio, byte* data)
{
data += audio_header_to_data(audio, data);
memcpy(data, audio->data, audio->size);
data += audio->size;

View File

@ -69,7 +69,6 @@ struct AudioInstance {
enum AudioMixerState {
AUDIO_MIXER_STATE_UNINITIALIZED,
AUDIO_MIXER_STATE_INACTIVE,
AUDIO_MIXER_STATE_SHOULD_PLAY,
AUDIO_MIXER_STATE_ACTIVE,
};
@ -100,7 +99,7 @@ struct AudioMixer {
};
bool audio_mixer_is_active(AudioMixer* mixer) {
if (mixer->state_new == AUDIO_MIXER_STATE_ACTIVE
if (mixer->state_old == AUDIO_MIXER_STATE_ACTIVE
&& atomic_get_relaxed((int32 *) &mixer->state_new) == AUDIO_MIXER_STATE_ACTIVE
) {
return true;
@ -108,14 +107,14 @@ bool audio_mixer_is_active(AudioMixer* mixer) {
AudioMixerState mixer_state;
if ((mixer_state = (AudioMixerState) atomic_get_relaxed((int32 *) &mixer->state_new)) != mixer->state_old) {
if (mixer_state != AUDIO_MIXER_STATE_UNINITIALIZED) {
if (mixer->state_old == AUDIO_MIXER_STATE_UNINITIALIZED) {
audio_load(
mixer->window,
&mixer->settings,
&mixer->api_setting
);
mixer_state = AUDIO_MIXER_STATE_INACTIVE;
mixer->state_old = AUDIO_MIXER_STATE_INACTIVE;
}
if (mixer_state == AUDIO_MIXER_STATE_ACTIVE) {

View File

@ -22,6 +22,9 @@ struct AudioSetting {
// usually 48000 or 44100
uint16 sample_rate;
// This sample index is used to calculate the position in a ring buffer
uint16 sample_index;
// bytes per bloc
// channel count * bits per sample / 8
// usually 2 * 16 / 8 = 4

436
audio/Qoa.h Normal file
View File

@ -0,0 +1,436 @@
/**
* Jingga
*
* @copyright 2023, Dominic Szablewski - https://phoboslab.org
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_AUDIO_QOA_H
#define TOS_AUDIO_QOA_H
#include "../stdlib/Types.h"
#include "../utils/EndianUtils.h"
#include "../audio/Audio.cpp"
#define QOA_SLICE_LEN 20
#define QOA_SLICES_PER_FRAME 256
#define QOA_FRAME_LEN (QOA_SLICES_PER_FRAME * QOA_SLICE_LEN)
#define QOA_LMS_LEN 4
#define QOA_MAX_CHANNELS 8
#define QOA_FRAME_SIZE(channels, slices) (4 + QOA_LMS_LEN * 4 * (channels) + 8 * (slices) * (channels))
// State of the LMS predictor for a single channel.
// Read by qoa_lms_predict() and advanced by qoa_lms_update().
struct alignas(16) QoaLms {
// The QOA_LMS_LEN most recent reconstructed samples, oldest first
int32 history[QOA_LMS_LEN];
// Fixed point filter weights, one per history entry
int32 weights[QOA_LMS_LEN];
};
/*
The quant_tab provides an index into the dequant_tab for residuals in the
range of -8 .. 8. It maps this range to just 3bits and becomes less accurate at
the higher end. Note that the residual zero is identical to the lowest positive
value. This is mostly fine, since the qoa_div() function always rounds away
from zero.
*/
// Indexed with (clamped scaled residual + 8) in qoa_encode_frame()
static const int32 qoa_quant_tab[17] = {
7, 7, 7, 5, 5, 3, 3, 1, /* -8..-1 */
0, /* 0 */
0, 2, 2, 4, 4, 6, 6, 6 /* 1.. 8 */
};
/*
We have 16 different scalefactors. Like the quantized residuals these become
less accurate at the higher end. In theory, the highest scalefactor that we
would need to encode the highest 16bit residual is (2**16)/8 = 8192. However we
rely on the LMS filter to predict samples accurately enough that a maximum
residual of one quarter of the 16 bit range is sufficient. I.e. with the
scalefactor 2048 times the quant range of 8 we can encode residuals up to 2**14.
The scalefactor values are computed as:
scalefactor_tab[s] <- round(pow(s + 1, 2.75))
*/
// Not read by the encoder/decoder in this header; the reciprocal and dequant
// tables below are derived from these values.
static const int32 qoa_scalefactor_tab[16] = {
1, 7, 21, 45, 84, 138, 211, 304, 421, 562, 731, 928, 1157, 1419, 1715, 2048
};
/*
The reciprocal_tab maps each of the 16 scalefactors to their rounded
reciprocals 1/scalefactor. This allows us to calculate the scaled residuals in
the encoder with just one multiplication instead of an expensive division. We
do this in .16 fixed point with integers, instead of floats.
The reciprocal_tab is computed as:
reciprocal_tab[s] <- ((1<<16) + scalefactor_tab[s] - 1) / scalefactor_tab[s]
*/
// Indexed by the scalefactor index passed to qoa_div()
static const int32 qoa_reciprocal_tab[16] = {
65536, 9363, 3121, 1457, 781, 475, 311, 216, 156, 117, 90, 71, 57, 47, 39, 32
};
/*
The dequant_tab maps each of the scalefactors and quantized residuals to
their unscaled & dequantized version.
Since qoa_div rounds away from the zero, the smallest entries are mapped to 3/4
instead of 1. The dequant_tab assumes the following dequantized values for each
of the quant_tab indices and is computed as:
float dqt[8] = {0.75, -0.75, 2.5, -2.5, 4.5, -4.5, 7, -7};
dequant_tab[s][q] <- round_ties_away_from_zero(scalefactor_tab[s] * dqt[q])
The rounding employed here is "to nearest, ties away from zero", i.e. positive
and negative values are treated symmetrically.
*/
// Indexed as [scalefactor index][3 bit quantized residual]
static const int32 qoa_dequant_tab[16][8] = {
{ 1, -1, 3, -3, 5, -5, 7, -7},
{ 5, -5, 18, -18, 32, -32, 49, -49},
{ 16, -16, 53, -53, 95, -95, 147, -147},
{ 34, -34, 113, -113, 203, -203, 315, -315},
{ 63, -63, 210, -210, 378, -378, 588, -588},
{ 104, -104, 345, -345, 621, -621, 966, -966},
{ 158, -158, 528, -528, 950, -950, 1477, -1477},
{ 228, -228, 760, -760, 1368, -1368, 2128, -2128},
{ 316, -316, 1053, -1053, 1895, -1895, 2947, -2947},
{ 422, -422, 1405, -1405, 2529, -2529, 3934, -3934},
{ 548, -548, 1828, -1828, 3290, -3290, 5117, -5117},
{ 696, -696, 2320, -2320, 4176, -4176, 6496, -6496},
{ 868, -868, 2893, -2893, 5207, -5207, 8099, -8099},
{1064, -1064, 3548, -3548, 6386, -6386, 9933, -9933},
{1286, -1286, 4288, -4288, 7718, -7718, 12005, -12005},
{1536, -1536, 5120, -5120, 9216, -9216, 14336, -14336},
};
/*
The Least Mean Squares Filter is the heart of QOA. It predicts the next
sample based on the previous 4 reconstructed samples. It does so by continuously
adjusting 4 weights based on the residual of the previous prediction.
The next sample is predicted as the sum of (weight[i] * history[i]).
The adjustment of the weights is done with a "Sign-Sign-LMS" that adds or
subtracts the residual to each weight, based on the corresponding sample from
the history. This, surprisingly, is sufficient to get worthwhile predictions.
This is all done with fixed point integers. Hence the right-shifts when updating
the weights and calculating the prediction.
*/
// Predicts the next sample as the weighted sum of the sample history.
// @performance Depending on context most likely SIMDable
static inline
int32 qoa_lms_predict(QoaLms* lms)
{
    int32 acc = 0;
    int32 tap = 0;

    while (tap < QOA_LMS_LEN) {
        acc += lms->weights[tap] * lms->history[tap];
        ++tap;
    }

    // The weights are fixed point values, drop the fractional bits
    return acc >> 13;
}
// Adapts the LMS filter after one sample has been reconstructed.
//
// Every weight is moved by (residual >> 4), with the direction taken from the
// sign of its history entry (Sign-Sign-LMS). Afterwards the history is shifted
// by one and `sample` becomes the newest entry.
//
// Each weight must be adjusted exactly once per call, using the history value
// from BEFORE the shift. The previous version adjusted the last weight twice.
//
// @param lms      Filter state of the channel, modified in place
// @param sample   The reconstructed sample that enters the history
// @param residual The dequantized residual of that sample
//
// @performance Depending on context most likely SIMDable
static inline
void qoa_lms_update(QoaLms* lms, int32 sample, int32 residual) {
    int32 delta = residual >> 4;

    for (int32 i = 0; i < QOA_LMS_LEN; ++i) {
        lms->weights[i] += lms->history[i] < 0 ? -delta : delta;
    }

    for (int32 i = 0; i < QOA_LMS_LEN - 1; ++i) {
        lms->history[i] = lms->history[i + 1];
    }

    lms->history[QOA_LMS_LEN - 1] = sample;
}
/*
qoa_div() implements a rounding division, but avoids rounding to zero for
small numbers. E.g. 0.1 will be rounded to 1. Note that 0 itself still
returns as 0, which is handled in the qoa_quant_tab[].
qoa_div() takes an index into the .16 fixed point qoa_reciprocal_tab as an
argument, so it can do the division with a cheaper integer multiplication.
*/
// Divides v by the scalefactor selected through its table index using a
// .16 fixed point reciprocal multiplication.
static inline
int32 qoa_div(int32 v, int32 scalefactor) {
    // Multiply by the reciprocal and round to nearest
    int32 quotient = (v * qoa_reciprocal_tab[scalefactor] + (1 << 15)) >> 16;

    int32 sign_v = (v > 0) - (v < 0);
    int32 sign_q = (quotient > 0) - (quotient < 0);

    // Adds sign(v) - sign(quotient): a quotient that rounded to 0 for a
    // non-zero v becomes +1 / -1, everything else is left untouched
    return quotient + sign_v - sign_q;
}
// Limits v to the range [min, max]; the lower bound is checked first.
static inline
int32 qoa_clamp(int32 v, int32 min, int32 max) {
    return v < min
        ? min
        : (v > max ? max : v);
}
/*
This specialized clamp function for the signed 16 bit range improves decode
performance quite a bit. The extra if() statement works nicely with the CPUs
branch prediction as this branch is rarely taken.
*/
// Limits v to the signed 16 bit range [-32768, 32767].
static inline
int32 qoa_clamp_s16(int32 v) {
    // One unsigned comparison detects both out-of-range directions at once
    if ((uint32) (v + 32768) <= 65535) {
        return v;
    }

    return v < -32768 ? -32768 : 32767;
}
// Encodes one frame of interleaved 16 bit samples.
//
// Output layout as written below:
//   4 bytes           frame_samples
//   16 bytes/channel  LMS state (4 x 16 bit history, 4 x 16 bit weights)
//   8 bytes/slice     one slice per channel, channels interleaved per slice
//
// @param sample_data   Interleaved samples of this frame
// @param channels      Channel count; indexes prev_scalefactor[] and lms[],
//                      which hold QOA_MAX_CHANNELS entries (not checked here)
// @param frame_samples Samples per channel in this frame
// @param lms           Per channel LMS state, updated in place
// @param bytes         Output buffer
// @return Number of bytes written to bytes
uint32 qoa_encode_frame(const int16* sample_data, int32 channels, uint32 frame_samples, QoaLms* lms, byte* bytes)
{
byte* start = bytes;
int32 prev_scalefactor[QOA_MAX_CHANNELS] = {0};
// Write the frame header
*((uint32 *) bytes) = SWAP_ENDIAN_LITTLE(frame_samples);
bytes += sizeof(frame_samples);
// @performance SIMDable
for (uint32 c = 0; c < channels; ++c) {
// Write the current LMS state
uint64 history = 0;
uint64 weights = 0;
// Element 0 ends up in the most significant 16 bits
for (int32 i = 0; i < QOA_LMS_LEN; ++i) {
history = (history << 16) | (lms[c].history[i] & 0xffff);
weights = (weights << 16) | (lms[c].weights[i] & 0xffff);
}
*((uint64 *) bytes) = SWAP_ENDIAN_LITTLE(history);
bytes += sizeof(history);
*((uint64 *) bytes) = SWAP_ENDIAN_LITTLE(weights);
bytes += sizeof(weights);
}
/*
We encode all samples with the channels interleaved on a slice level.
E.g. for stereo: (ch-0, slice 0), (ch 1, slice 0), (ch 0, slice 1), ...
*/
for (uint32 sample_index = 0; sample_index < frame_samples; sample_index += QOA_SLICE_LEN) {
// @performance SIMDable
for (uint32 c = 0; c < channels; ++c) {
// The last slice of a frame may hold fewer than QOA_SLICE_LEN samples
int32 slice_len = qoa_clamp(QOA_SLICE_LEN, 0, frame_samples - sample_index);
int32 slice_start = sample_index * channels + c;
int32 slice_end = (sample_index + slice_len) * channels + c;
/*
Brute force search for the best scalefactor. Just go through all
16 scalefactors, encode all samples for the current slice and
measure the total squared error.
*/
// -1 converts to the largest uint64, so the first candidate always wins
uint64 best_rank = -1;
uint64 best_slice = 0;
QoaLms best_lms;
int32 best_scalefactor = 0;
for (int32 sfi = 0; sfi < 16; ++sfi) {
/*
There is a strong correlation between the scalefactors of
neighboring slices. As an optimization, start testing
the best scalefactor of the previous slice first.
*/
int32 scalefactor = (sfi + prev_scalefactor[c]) % 16;
/*
We have to reset the LMS state to the last known good one
before trying each scalefactor, as each pass updates the LMS
state when encoding.
*/
QoaLms lms_temp = lms[c];
// The slice starts with the scalefactor index; every sample appends 3 bits
uint64 slice = scalefactor;
uint64 current_rank = 0;
for (int32 si = slice_start; si < slice_end; si += channels) {
int32 sample = sample_data[si];
int32 predicted = qoa_lms_predict(&lms_temp);
int32 residual = sample - predicted;
int32 scaled = qoa_div(residual, scalefactor);
int32 clamped = qoa_clamp(scaled, -8, 8);
int32 quantized = qoa_quant_tab[clamped + 8];
int32 dequantized = qoa_dequant_tab[scalefactor][quantized];
int32 reconstructed = qoa_clamp_s16(predicted + dequantized);
/*
If the weights have grown too large, we introduce a penalty
here. This prevents pops/clicks in certain problem cases
*/
int32 weights_penalty = ((
lms_temp.weights[0] * lms_temp.weights[0]
+ lms_temp.weights[1] * lms_temp.weights[1]
+ lms_temp.weights[2] * lms_temp.weights[2]
+ lms_temp.weights[3] * lms_temp.weights[3]
) >> 18) - 0x8ff;
if (weights_penalty < 0) {
weights_penalty = 0;
}
int64 error = (sample - reconstructed);
uint64 error_sq = error * error;
current_rank += error_sq + weights_penalty * weights_penalty;
// This candidate is already worse than the best one, stop early
if (current_rank > best_rank) {
break;
}
qoa_lms_update(&lms_temp, reconstructed, dequantized);
slice = (slice << 3) | quantized;
}
if (current_rank < best_rank) {
best_rank = current_rank;
best_slice = slice;
best_lms = lms_temp;
best_scalefactor = scalefactor;
}
}
prev_scalefactor[c] = best_scalefactor;
// Continue with the filter state the winning candidate left behind
lms[c] = best_lms;
/*
If this slice was shorter than QOA_SLICE_LEN, we have to left-
shift all encoded data, to ensure the rightmost bits are the empty
ones. This should only happen in the last frame of a file as all
slices are completely filled otherwise.
*/
best_slice <<= (QOA_SLICE_LEN - slice_len) * 3;
*((uint64 *) bytes) = SWAP_ENDIAN_LITTLE(best_slice);
bytes += sizeof(best_slice);
}
}
return (uint32) (bytes - start);
}
// Encodes the samples of an Audio object into a sequence of QOA frames.
//
// Only the frames are written; no audio header is emitted by this function.
// audio->data is read as 16 bit samples.
//
// @param audio Source audio
// @param data  Output buffer, must be large enough for all frames
// @return Number of bytes written to data
uint32 qoa_encode(const Audio* audio, byte* data) {
    byte* start = data;

    // NOTE(review): this treats bloc_size as the byte size of ONE channel's sample.
    // If bloc_size already includes the channel count, the sample count and the
    // frame offset below are off by a factor of channels - confirm against Audio.
    uint32 sample_count = audio->size / (audio->channels * audio->bloc_size);

    QoaLms lms[QOA_MAX_CHANNELS];
    for (int32 i = 0; i < audio->channels; ++i) {
        /*
        Set the initial LMS weights to {0, 0, -1, 2}. This helps with the
        prediction of the first few ms of a file.
        */
        lms[i].weights[0] = 0;
        lms[i].weights[1] = 0;
        lms[i].weights[2] = -(1 << 13);
        lms[i].weights[3] = (1 << 14);

        // Explicitly set the history samples to 0, as we might have some garbage in there.
        memset(lms[i].history, 0, QOA_LMS_LEN * sizeof(int32));
    }

    // Go through all frames; the LMS state carries over from frame to frame
    int32 frame_samples = QOA_FRAME_LEN;
    for (uint32 sample_index = 0; sample_index < sample_count; sample_index += frame_samples) {
        // The last frame may be shorter than QOA_FRAME_LEN
        frame_samples = qoa_clamp(QOA_FRAME_LEN, 0, sample_count - sample_index);

        data += qoa_encode_frame(
            (int16 *) (audio->data + sample_index * audio->channels * audio->bloc_size),
            audio->channels, frame_samples, lms, data
        );
    }

    return (uint32) (data - start);
}
// Decodes one QOA frame into interleaved 16 bit samples.
//
// Expected input layout (as produced by qoa_encode_frame):
//   4 bytes           frame_samples
//   16 bytes/channel  LMS state (4 x 16 bit history, 4 x 16 bit weights)
//   8 bytes/slice     one slice per channel, channels interleaved per slice
//
// @param bytes       Start of the encoded frame
// @param channels    Channel count of the stream
// @param lms         Per channel LMS state, overwritten with the state stored in the frame
// @param sample_data Output buffer; receives frame_samples * channels int16 values
// @return Number of encoded bytes consumed from bytes
uint32 qoa_decode_frame(const byte* bytes, int32 channels, QoaLms* lms, byte* sample_data)
{
    const byte* start = bytes;

    // The output consists of 16 bit samples. Storing through the byte pointer
    // would truncate every sample to 8 bits and pack them at byte stride.
    int16* samples = (int16 *) sample_data;

    // Read the frame header
    uint32 frame_samples = SWAP_ENDIAN_LITTLE(*((uint32 *) bytes));
    bytes += sizeof(frame_samples);

    // Read the LMS state: 4 x 2 bytes history, 4 x 2 bytes weights per channel
    for (uint32 c = 0; c < channels; ++c) {
        uint64 history = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes));
        bytes += sizeof(history);

        uint64 weights = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes));
        bytes += sizeof(weights);

        // Element 0 is stored in the most significant 16 bits
        for (int32 i = 0; i < QOA_LMS_LEN; ++i) {
            lms[c].history[i] = ((int16) (history >> 48));
            history <<= 16;

            lms[c].weights[i] = ((int16) (weights >> 48));
            weights <<= 16;
        }
    }

    // Decode all slices for all channels in this frame
    for (uint32 sample_index = 0; sample_index < frame_samples; sample_index += QOA_SLICE_LEN) {
        for (uint32 c = 0; c < channels; c++) {
            uint64 slice = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes));
            bytes += sizeof(slice);

            // Top 4 bits: scalefactor index, followed by 3 bits per sample
            int32 scalefactor = (slice >> 60) & 0xf;
            slice <<= 4;

            int32 slice_start = sample_index * channels + c;
            int32 slice_end = qoa_clamp(sample_index + QOA_SLICE_LEN, 0, frame_samples) * channels + c;

            for (int32 si = slice_start; si < slice_end; si += channels) {
                int32 predicted = qoa_lms_predict(&lms[c]);
                int32 quantized = (slice >> 61) & 0x7;
                int32 dequantized = qoa_dequant_tab[scalefactor][quantized];
                int32 reconstructed = qoa_clamp_s16(predicted + dequantized);

                samples[si] = (int16) reconstructed;
                slice <<= 3;

                qoa_lms_update(&lms[c], reconstructed, dequantized);
            }
        }
    }

    return (uint32) (bytes - start);
}

// Decodes a QOA stream that starts with an audio header into audio->data.
//
// @param data  Encoded input: audio header followed by QOA frames
// @param audio Receives the header fields; audio->data receives the decoded
//              16 bit samples and must be large enough to hold them
// @return header length + audio->size
uint32 qoa_decode(const byte* data, Audio* audio)
{
    uint32 header_length = audio_header_from_data(data, audio);
    uint32 p = header_length;
    uint32 frame_size;

    byte* sample_ptr = audio->data;

    // No initialization required, every frame carries its own LMS state
    QoaLms lms[QOA_MAX_CHANNELS];

    // Smallest possible frame: frame header + LMS state of all channels
    uint32 limit = 4 + QOA_LMS_LEN * 4 * audio->channels;

    do {
        // Peek at the frame header: the output advances by the number of
        // DECODED bytes, which differs from the encoded bytes consumed
        uint32 frame_samples = SWAP_ENDIAN_LITTLE(*((uint32 *) (data + p)));

        frame_size = qoa_decode_frame(data + p, audio->channels, lms, sample_ptr);

        sample_ptr += frame_samples * audio->channels * sizeof(int16);
        p += frame_size;
    } while (frame_size && p < audio->size && audio->size - p >= limit);
    // NOTE(review): p includes header_length and counts encoded bytes. This
    // termination check is only correct if audio->size describes the encoded
    // stream in the same terms - confirm against the writer of the header.

    // @question do we really need the audio->size - p >= limit check?
    return header_length + audio->size;
}
#endif

470
audio/QoaSimd.h Normal file
View File

@ -0,0 +1,470 @@
/**
* Jingga
*
* @copyright 2023, Dominic Szablewski - https://phoboslab.org
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef TOS_AUDIO_QOA_H
#define TOS_AUDIO_QOA_H
#include "../stdlib/Types.h"
#include "../utils/EndianUtils.h"
#include "../audio/Audio.cpp"
#include "../stdlib/simd/SIMD_I32.h"
#define QOA_SLICE_LEN 20
#define QOA_SLICES_PER_FRAME 256
#define QOA_FRAME_LEN (QOA_SLICES_PER_FRAME * QOA_SLICE_LEN)
#define QOA_LMS_LEN 4
#define QOA_MAX_CHANNELS 8
#define QOA_FRAME_SIZE(channels, slices) (4 + QOA_LMS_LEN * 4 * (channels) + 8 * (slices) * (channels))
// State of the LMS predictor for a single channel, stored in SIMD registers.
// Read by qoa_lms_predict() and advanced by qoa_lms_update().
struct QoaLms {
// The most recent reconstructed samples, oldest in lane 0
int32_4 history; // automatically QOA_LMS_LEN size
// Fixed point filter weights, one lane per history entry
int32_4 weights; // automatically QOA_LMS_LEN size
};
/*
The quant_tab provides an index into the dequant_tab for residuals in the
range of -8 .. 8. It maps this range to just 3bits and becomes less accurate at
the higher end. Note that the residual zero is identical to the lowest positive
value. This is mostly fine, since the qoa_div() function always rounds away
from zero.
*/
// Indexed with (clamped scaled residual + 8) in qoa_encode_frame()
static const int32 qoa_quant_tab[17] = {
7, 7, 7, 5, 5, 3, 3, 1, /* -8..-1 */
0, /* 0 */
0, 2, 2, 4, 4, 6, 6, 6 /* 1.. 8 */
};
/*
We have 16 different scalefactors. Like the quantized residuals these become
less accurate at the higher end. In theory, the highest scalefactor that we
would need to encode the highest 16bit residual is (2**16)/8 = 8192. However we
rely on the LMS filter to predict samples accurately enough that a maximum
residual of one quarter of the 16 bit range is sufficient. I.e. with the
scalefactor 2048 times the quant range of 8 we can encode residuals up to 2**14.
The scalefactor values are computed as:
scalefactor_tab[s] <- round(pow(s + 1, 2.75))
*/
// Not read by the encoder/decoder in this header; the reciprocal and dequant
// tables below are derived from these values.
static const int32 qoa_scalefactor_tab[16] = {
1, 7, 21, 45, 84, 138, 211, 304, 421, 562, 731, 928, 1157, 1419, 1715, 2048
};
/*
The reciprocal_tab maps each of the 16 scalefactors to their rounded
reciprocals 1/scalefactor. This allows us to calculate the scaled residuals in
the encoder with just one multiplication instead of an expensive division. We
do this in .16 fixed point with integers, instead of floats.
The reciprocal_tab is computed as:
reciprocal_tab[s] <- ((1<<16) + scalefactor_tab[s] - 1) / scalefactor_tab[s]
*/
// Indexed by the scalefactor index passed to qoa_div()
static const int32 qoa_reciprocal_tab[16] = {
65536, 9363, 3121, 1457, 781, 475, 311, 216, 156, 117, 90, 71, 57, 47, 39, 32
};
/*
The dequant_tab maps each of the scalefactors and quantized residuals to
their unscaled & dequantized version.
Since qoa_div rounds away from the zero, the smallest entries are mapped to 3/4
instead of 1. The dequant_tab assumes the following dequantized values for each
of the quant_tab indices and is computed as:
float dqt[8] = {0.75, -0.75, 2.5, -2.5, 4.5, -4.5, 7, -7};
dequant_tab[s][q] <- round_ties_away_from_zero(scalefactor_tab[s] * dqt[q])
The rounding employed here is "to nearest, ties away from zero", i.e. positive
and negative values are treated symmetrically.
*/
// Indexed as [scalefactor index][3 bit quantized residual]
static const int32 qoa_dequant_tab[16][8] = {
{ 1, -1, 3, -3, 5, -5, 7, -7},
{ 5, -5, 18, -18, 32, -32, 49, -49},
{ 16, -16, 53, -53, 95, -95, 147, -147},
{ 34, -34, 113, -113, 203, -203, 315, -315},
{ 63, -63, 210, -210, 378, -378, 588, -588},
{ 104, -104, 345, -345, 621, -621, 966, -966},
{ 158, -158, 528, -528, 950, -950, 1477, -1477},
{ 228, -228, 760, -760, 1368, -1368, 2128, -2128},
{ 316, -316, 1053, -1053, 1895, -1895, 2947, -2947},
{ 422, -422, 1405, -1405, 2529, -2529, 3934, -3934},
{ 548, -548, 1828, -1828, 3290, -3290, 5117, -5117},
{ 696, -696, 2320, -2320, 4176, -4176, 6496, -6496},
{ 868, -868, 2893, -2893, 5207, -5207, 8099, -8099},
{1064, -1064, 3548, -3548, 6386, -6386, 9933, -9933},
{1286, -1286, 4288, -4288, 7718, -7718, 12005, -12005},
{1536, -1536, 5120, -5120, 9216, -9216, 14336, -14336},
};
/*
The Least Mean Squares Filter is the heart of QOA. It predicts the next
sample based on the previous 4 reconstructed samples. It does so by continuously
adjusting 4 weights based on the residual of the previous prediction.
The next sample is predicted as the sum of (weight[i] * history[i]).
The adjustment of the weights is done with a "Sign-Sign-LMS" that adds or
subtracts the residual to each weight, based on the corresponding sample from
the history. This, surprisingly, is sufficient to get worthwhile predictions.
This is all done with fixed point integers. Hence the right-shifts when updating
the weights and calculating the prediction.
*/
// Predicts the next sample as the weighted sum of the sample history.
static inline
int32 qoa_lms_predict(QoaLms* lms)
{
    // Lane-wise weight * history
    __m128i lanes = _mm_mullo_epi32(lms->weights.s, lms->history.s);

    // Two horizontal adds fold the four products so lane 0 holds the total
    lanes = _mm_hadd_epi32(lanes, lanes);
    lanes = _mm_hadd_epi32(lanes, lanes);

    // The weights are fixed point values, drop the fractional bits
    return _mm_cvtsi128_si32(lanes) >> 13;
}
// Adapts the LMS filter after one sample has been reconstructed.
//
// Every weight is moved by (residual >> 4), with the direction taken from the
// sign of its history lane (Sign-Sign-LMS). Afterwards the history is shifted
// by one lane and `sample` becomes the newest entry.
//
// The vector update below already adjusts all four weights exactly once. The
// previous version adjusted the last weight a second time based on the sign of
// the new sample, which is not part of the algorithm.
//
// @param lms      Filter state of the channel, modified in place
// @param sample   The reconstructed sample that enters the history
// @param residual The dequantized residual of that sample
static inline
void qoa_lms_update(QoaLms* lms, int32 sample, int32 residual) {
    int32 delta = residual >> 4;

    __m128i delta_vec = _mm_set1_epi32(delta);
    __m128i zero_vec = _mm_setzero_si128();

    // Select -delta for lanes whose history is negative, +delta otherwise
    __m128i sign_mask = _mm_cmpgt_epi32(zero_vec, lms->history.s); // history < 0
    __m128i delta_adjust = _mm_blendv_epi8(delta_vec, _mm_sub_epi32(zero_vec, delta_vec), sign_mask);

    // Update weights
    lms->weights.s = _mm_add_epi32(lms->weights.s, delta_adjust);

    // Rotate the history by one int32 (4 bytes) so lane i receives lane i + 1 ...
    lms->history.s = _mm_alignr_epi8(lms->history.s, lms->history.s, 4);

    // ... and overwrite the wrapped-around last lane with the new sample
    lms->history.s = _mm_insert_epi32(lms->history.s, sample, QOA_LMS_LEN - 1);
}
/*
qoa_div() implements a rounding division, but avoids rounding to zero for
small numbers. E.g. 0.1 will be rounded to 1. Note that 0 itself still
returns as 0, which is handled in the qoa_quant_tab[].
qoa_div() takes an index into the .16 fixed point qoa_reciprocal_tab as an
argument, so it can do the division with a cheaper integer multiplication.
*/
// Divides v by the scalefactor selected through its table index using a
// .16 fixed point reciprocal multiplication.
static inline
int32 qoa_div(int32 v, int32 scalefactor) {
    // Multiply by the reciprocal and round to nearest
    int32 quotient = (v * qoa_reciprocal_tab[scalefactor] + (1 << 15)) >> 16;

    int32 sign_v = (v > 0) - (v < 0);
    int32 sign_q = (quotient > 0) - (quotient < 0);

    // Adds sign(v) - sign(quotient): a quotient that rounded to 0 for a
    // non-zero v becomes +1 / -1, everything else is left untouched
    return quotient + sign_v - sign_q;
}
// Limits v to the range [min, max]; the lower bound is checked first.
static inline
int32 qoa_clamp(int32 v, int32 min, int32 max) {
    return v < min
        ? min
        : (v > max ? max : v);
}
/*
This specialized clamp function for the signed 16 bit range improves decode
performance quite a bit. The extra if() statement works nicely with the CPUs
branch prediction as this branch is rarely taken.
*/
// Limits v to the signed 16 bit range [-32768, 32767].
static inline
int32 qoa_clamp_s16(int32 v) {
    // One unsigned comparison detects both out-of-range directions at once
    if ((uint32) (v + 32768) <= 65535) {
        return v;
    }

    return v < -32768 ? -32768 : 32767;
}
// Encodes one frame of interleaved 16 bit samples.
//
// Output layout:
//   4 bytes           frame_samples
//   16 bytes/channel  LMS state (4 x 16 bit history, 4 x 16 bit weights),
//                     element 0 in the most significant 16 bits
//   8 bytes/slice     one slice per channel, channels interleaved per slice
//
// @param sample_data   Interleaved samples of this frame
// @param channels      Channel count; indexes prev_scalefactor[] and lms[],
//                      which hold QOA_MAX_CHANNELS entries (not checked here)
// @param frame_samples Samples per channel in this frame
// @param lms           Per channel LMS state, updated in place
// @param bytes         Output buffer
// @return Number of bytes written to bytes
uint32 qoa_encode_frame(const int16* sample_data, int32 channels, uint32 frame_samples, QoaLms* lms, byte* bytes)
{
    byte* start = bytes;

    int32 prev_scalefactor[QOA_MAX_CHANNELS] = {0};

    // Write the frame header
    *((uint32 *) bytes) = SWAP_ENDIAN_LITTLE(frame_samples);
    bytes += sizeof(frame_samples);

    const __m128i low16_mask = _mm_set1_epi32(0xFFFF);

    for (uint32 c = 0; c < channels; ++c) {
        // Reverse the lane order (0x1B = lanes 3,2,1,0) so that element 0 ends up
        // in the most significant 16 bits of the packed 64 bit value, which is
        // the order qoa_decode_frame reads the state back in.
        __m128i history = _mm_shuffle_epi32(lms[c].history.s, 0x1B);
        __m128i weights = _mm_shuffle_epi32(lms[c].weights.s, 0x1B);

        // Keep the low 16 bits of every lane
        history = _mm_and_si128(history, low16_mask);
        weights = _mm_and_si128(weights, low16_mask);

        // Narrow 4 x 32 bit to 4 x 16 bit. The masked lanes are in 0..0xFFFF, so
        // the unsigned saturation never alters a value. No further narrowing must
        // happen: packing to 8 bit would destroy the 16 bit values.
        __m128i packed_history = _mm_packus_epi32(history, history);
        __m128i packed_weights = _mm_packus_epi32(weights, weights);

        // The low 64 bits now hold the four 16 bit values
        uint64 final_history = _mm_extract_epi64(packed_history, 0);
        uint64 final_weights = _mm_extract_epi64(packed_weights, 0);

        // Store results with endian swap
        *((uint64*) bytes) = SWAP_ENDIAN_LITTLE(final_history);
        bytes += sizeof(final_history);

        *((uint64*) bytes) = SWAP_ENDIAN_LITTLE(final_weights);
        bytes += sizeof(final_weights);
    }

    /*
    We encode all samples with the channels interleaved on a slice level.
    E.g. for stereo: (ch-0, slice 0), (ch 1, slice 0), (ch 0, slice 1), ...
    */
    for (uint32 sample_index = 0; sample_index < frame_samples; sample_index += QOA_SLICE_LEN) {
        // @performance SIMDable
        for (uint32 c = 0; c < channels; ++c) {
            // The last slice of a frame may hold fewer than QOA_SLICE_LEN samples
            int32 slice_len = qoa_clamp(QOA_SLICE_LEN, 0, frame_samples - sample_index);
            int32 slice_start = sample_index * channels + c;
            int32 slice_end = (sample_index + slice_len) * channels + c;

            /*
            Brute force search for the best scalefactor. Just go through all
            16 scalefactors, encode all samples for the current slice and
            measure the total squared error.
            */
            // -1 converts to the largest uint64, so the first candidate always wins
            uint64 best_rank = -1;
            uint64 best_slice = 0;
            QoaLms best_lms;
            int32 best_scalefactor = 0;

            for (int32 sfi = 0; sfi < 16; ++sfi) {
                /*
                There is a strong correlation between the scalefactors of
                neighboring slices. As an optimization, start testing
                the best scalefactor of the previous slice first.
                */
                int32 scalefactor = (sfi + prev_scalefactor[c]) % 16;

                /*
                We have to reset the LMS state to the last known good one
                before trying each scalefactor, as each pass updates the LMS
                state when encoding.
                */
                QoaLms lms_temp = lms[c];

                // The slice starts with the scalefactor index; every sample appends 3 bits
                uint64 slice = scalefactor;
                uint64 current_rank = 0;

                for (int32 si = slice_start; si < slice_end; si += channels) {
                    int32 sample = sample_data[si];
                    int32 predicted = qoa_lms_predict(&lms_temp);

                    int32 residual = sample - predicted;
                    int32 scaled = qoa_div(residual, scalefactor);
                    int32 clamped = qoa_clamp(scaled, -8, 8);
                    int32 quantized = qoa_quant_tab[clamped + 8];
                    int32 dequantized = qoa_dequant_tab[scalefactor][quantized];
                    int32 reconstructed = qoa_clamp_s16(predicted + dequantized);

                    /*
                    If the weights have grown too large, we introduce a penalty
                    here. This prevents pops/clicks in certain problem cases
                    */
                    // Compute weights squared: w^2
                    __m128i weights_squared = _mm_mullo_epi32(lms_temp.weights.s, lms_temp.weights.s);

                    // Perform horizontal addition to sum all squared weights
                    __m128i sum1 = _mm_hadd_epi32(weights_squared, weights_squared);
                    __m128i sum2 = _mm_hadd_epi32(sum1, sum1);

                    // Extract the final sum (scalar)
                    int32 sum_of_squares = _mm_cvtsi128_si32(sum2);

                    // Apply the shift and subtraction
                    int32 weights_penalty = (sum_of_squares >> 18) - 0x8FF;
                    if (weights_penalty < 0) {
                        weights_penalty = 0;
                    }

                    int64 error = (sample - reconstructed);
                    uint64 error_sq = error * error;

                    current_rank += error_sq + weights_penalty * weights_penalty;

                    // This candidate is already worse than the best one, stop early
                    if (current_rank > best_rank) {
                        break;
                    }

                    qoa_lms_update(&lms_temp, reconstructed, dequantized);
                    slice = (slice << 3) | quantized;
                }

                if (current_rank < best_rank) {
                    best_rank = current_rank;
                    best_slice = slice;
                    best_lms = lms_temp;
                    best_scalefactor = scalefactor;
                }
            }

            prev_scalefactor[c] = best_scalefactor;

            // Continue with the filter state the winning candidate left behind
            lms[c] = best_lms;

            /*
            If this slice was shorter than QOA_SLICE_LEN, we have to left-
            shift all encoded data, to ensure the rightmost bits are the empty
            ones. This should only happen in the last frame of a file as all
            slices are completely filled otherwise.
            */
            best_slice <<= (QOA_SLICE_LEN - slice_len) * 3;

            *((uint64 *) bytes) = SWAP_ENDIAN_LITTLE(best_slice);
            bytes += sizeof(best_slice);
        }
    }

    return (uint32) (bytes - start);
}
// Encodes the samples of an Audio object into a sequence of QOA frames.
//
// Only the frames are written; no audio header is emitted by this function.
// audio->data is read as 16 bit samples.
//
// @param audio Source audio
// @param data  Output buffer, must be large enough for all frames
// @return Number of bytes written to data
uint32 qoa_encode(const Audio* audio, byte* data)
{
    byte* start = data;

    // NOTE(review): this treats bloc_size as the byte size of ONE channel's sample.
    // If bloc_size already includes the channel count, the sample count and the
    // frame offset below are off by a factor of channels - confirm against Audio.
    uint32 sample_count = audio->size / (audio->channels * audio->bloc_size);

    QoaLms lms[QOA_MAX_CHANNELS];

    // _mm_set_epi32 takes the highest lane first: lanes 0..3 = {0, 0, -(1 << 13), 1 << 14}
    __m128i weights_init = _mm_set_epi32(1 << 14, -(1 << 13), 0, 0);
    __m128i history_init = _mm_setzero_si128();

    for (int32 i = 0; i < audio->channels; ++i) {
        /*
        Set the initial LMS weights to {0, 0, -1, 2}. This helps with the
        prediction of the first few ms of a file.
        */
        lms[i].weights.s = weights_init;
        lms[i].history.s = history_init;
    }

    // Go through all frames; the LMS state carries over from frame to frame
    int32 frame_samples = QOA_FRAME_LEN;
    for (uint32 sample_index = 0; sample_index < sample_count; sample_index += frame_samples) {
        // The last frame may be shorter than QOA_FRAME_LEN
        frame_samples = qoa_clamp(QOA_FRAME_LEN, 0, sample_count - sample_index);

        data += qoa_encode_frame(
            (int16 *) (audio->data + sample_index * audio->channels * audio->bloc_size),
            audio->channels, frame_samples, lms, data
        );
    }

    return (uint32) (data - start);
}
// Decodes one QOA frame into interleaved 16 bit samples.
//
// Expected input layout:
//   4 bytes           frame_samples
//   16 bytes/channel  LMS state (4 x 16 bit history, 4 x 16 bit weights),
//                     element 0 in the most significant 16 bits
//   8 bytes/slice     one slice per channel, channels interleaved per slice
//
// @param bytes       Start of the encoded frame
// @param channels    Channel count of the stream
// @param lms         Per channel LMS state, overwritten with the state stored in the frame
// @param sample_data Output buffer; receives frame_samples * channels int16 values
// @return Number of encoded bytes consumed from bytes
uint32 qoa_decode_frame(const byte* bytes, int32 channels, QoaLms* lms, byte* sample_data)
{
    const byte* start = bytes;

    // The output consists of 16 bit samples. Storing through the byte pointer
    // would truncate every sample to 8 bits and pack them at byte stride.
    int16* samples = (int16 *) sample_data;

    // Read the frame header
    uint32 frame_samples = SWAP_ENDIAN_LITTLE(*((uint32 *) bytes));
    bytes += sizeof(frame_samples);

    // Read the LMS state: 4 x 2 bytes history, 4 x 2 bytes weights per channel
    for (uint32 c = 0; c < channels; ++c) {
        uint64 history = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes));
        bytes += sizeof(history);

        uint64 weights = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes));
        bytes += sizeof(weights);

        alignas(16) int32 history_array[4];
        alignas(16) int32 weights_array[4];

        // Element 0 is stored in the most significant 16 bits
        for (int32 i = 0; i < QOA_LMS_LEN; ++i) {
            history_array[i] = ((int16) (history >> 48));
            history <<= 16;

            weights_array[i] = ((int16) (weights >> 48));
            weights <<= 16;
        }

        // _mm_set_epi32 takes the highest lane first, so element i lands in lane i
        lms[c].history.s = _mm_set_epi32(history_array[3], history_array[2], history_array[1], history_array[0]);
        lms[c].weights.s = _mm_set_epi32(weights_array[3], weights_array[2], weights_array[1], weights_array[0]);
    }

    // Decode all slices for all channels in this frame
    for (uint32 sample_index = 0; sample_index < frame_samples; sample_index += QOA_SLICE_LEN) {
        for (uint32 c = 0; c < channels; c++) {
            uint64 slice = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes));
            bytes += sizeof(slice);

            // Top 4 bits: scalefactor index, followed by 3 bits per sample
            int32 scalefactor = (slice >> 60) & 0xf;
            slice <<= 4;

            int32 slice_start = sample_index * channels + c;
            int32 slice_end = qoa_clamp(sample_index + QOA_SLICE_LEN, 0, frame_samples) * channels + c;

            for (int32 si = slice_start; si < slice_end; si += channels) {
                int32 predicted = qoa_lms_predict(&lms[c]);
                int32 quantized = (slice >> 61) & 0x7;
                int32 dequantized = qoa_dequant_tab[scalefactor][quantized];
                int32 reconstructed = qoa_clamp_s16(predicted + dequantized);

                samples[si] = (int16) reconstructed;
                slice <<= 3;

                qoa_lms_update(&lms[c], reconstructed, dequantized);
            }
        }
    }

    return (uint32) (bytes - start);
}

// Decodes a QOA stream that starts with an audio header into audio->data.
//
// @param data  Encoded input: audio header followed by QOA frames
// @param audio Receives the header fields; audio->data receives the decoded
//              16 bit samples and must be large enough to hold them
// @return header length + audio->size
uint32 qoa_decode(const byte* data, Audio* audio)
{
    uint32 header_length = audio_header_from_data(data, audio);
    uint32 p = header_length;
    uint32 frame_size;

    byte* sample_ptr = audio->data;

    // No initialization required, every frame carries its own LMS state
    QoaLms lms[QOA_MAX_CHANNELS];

    // Smallest possible frame: frame header + LMS state of all channels
    uint32 limit = 4 + QOA_LMS_LEN * 4 * audio->channels;

    do {
        // Peek at the frame header: the output advances by the number of
        // DECODED bytes, which differs from the encoded bytes consumed
        uint32 frame_samples = SWAP_ENDIAN_LITTLE(*((uint32 *) (data + p)));

        frame_size = qoa_decode_frame(data + p, audio->channels, lms, sample_ptr);

        sample_ptr += frame_samples * audio->channels * sizeof(int16);
        p += frame_size;

        // p < audio->size guards the unsigned subtraction: without it
        // audio->size - p wraps around once p has passed audio->size
    } while (frame_size && p < audio->size && audio->size - p >= limit);
    // NOTE(review): p includes header_length and counts encoded bytes. This
    // termination check is only correct if audio->size describes the encoded
    // stream in the same terms - confirm against the writer of the header.

    return header_length + audio->size;
}
#endif

View File

@ -81,25 +81,26 @@ int32 image_data_size(const Image* image)
+ sizeof(image->image_settings);
}
// Parses the serialized image header from data into image:
// width (uint32), height (uint32) and image_settings (1 byte), each passed through
// SWAP_ENDIAN_LITTLE where multi-byte, and derives pixel_count = width * height.
// Returns the number of header bytes that were read.
// NOTE(review): this span carries two revisions of every statement - one walking a
// separate `pos` cursor and one advancing `data` itself against a saved `start`.
// Only one of the two variants is meant to remain.
int32 image_header_from_data(const byte* data, Image* image)
inline
uint32 image_header_from_data(const byte* data, Image* image)
{
const byte* pos = data;
const byte* start = data;
image->width = SWAP_ENDIAN_LITTLE(*((uint32 *) pos));
pos += sizeof(image->width);
image->width = SWAP_ENDIAN_LITTLE(*((uint32 *) data));
data += sizeof(image->width);
image->height = SWAP_ENDIAN_LITTLE(*((uint32 *) pos));
pos += sizeof(image->height);
image->height = SWAP_ENDIAN_LITTLE(*((uint32 *) data));
data += sizeof(image->height);
// Not stored in the stream, always recomputed from the dimensions
image->pixel_count = image->width * image->height;
image->image_settings = *pos;
pos += sizeof(image->image_settings);
image->image_settings = *data;
data += sizeof(image->image_settings);
// NOTE(review): the result is cast to int32 although the newer signature returns uint32
return (int32) (pos - data);
return (int32) (data - start);
}
int32 image_from_data(const byte* data, Image* image)
uint32 image_from_data(const byte* data, Image* image)
{
const byte* pos = data;
pos += image_header_from_data(data, image);
@ -112,23 +113,23 @@ int32 image_from_data(const byte* data, Image* image)
}
// Serializes the image header into data: width (uint32), height (uint32) and
// image_settings (1 byte); the multi-byte fields go through SWAP_ENDIAN_LITTLE.
// pixel_count is not written. Returns the number of header bytes written.
// NOTE(review): this span carries two revisions of every statement - one walking a
// separate `pos` cursor and one advancing `data` itself against a saved `start`.
// Only one of the two variants is meant to remain.
inline
int32 image_header_to_data(const Image* image, byte* data)
uint32 image_header_to_data(const Image* image, byte* data)
{
byte* pos = data;
byte* start = data;
*((uint32 *) pos) = SWAP_ENDIAN_LITTLE(image->width);
pos += sizeof(image->width);
*((uint32 *) data) = SWAP_ENDIAN_LITTLE(image->width);
data += sizeof(image->width);
*((uint32 *) pos) = SWAP_ENDIAN_LITTLE(image->height);
pos += sizeof(image->height);
*((uint32 *) data) = SWAP_ENDIAN_LITTLE(image->height);
data += sizeof(image->height);
*pos = image->image_settings;
pos += sizeof(image->image_settings);
*data = image->image_settings;
data += sizeof(image->image_settings);
// NOTE(review): the result is cast to int32 although the newer signature returns uint32
return (int32) (pos - data);
return (int32) (data - start);
}
int32 image_to_data(const Image* image, byte* data)
uint32 image_to_data(const Image* image, byte* data)
{
byte* pos = data;
pos += image_header_to_data(image, data);

View File

@ -1,6 +1,7 @@
/**
* Jingga
*
* @copyright 2021, Dominic Szablewski - https://phoboslab.org
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
@ -13,20 +14,36 @@
#include <string.h>
#include "Image.cpp"
#define QOI_OP_INDEX 0b00000000
#define QOI_OP_DIFF 0b01000000
#define QOI_OP_LUMA 0b10000000
#define QOI_OP_RUN 0b11000000 // @todo There is a HUGE step from here to QOI_OP_RGB this leaves room for more cases or using this data
#define QOI_OP_RGB 0b11111110
#define QOI_OP_RGBA 0b11111111
#define QOI_MASK_2 0b11000000
#define QOI_OP_LUMA555 0b00000000
#define QOI_OP_LUMA222 0b10000000
#define QOI_OP_LUMA777 0b01000000
#define QOI_COLOR_HASH(color) (color.r * 3 + color.g * 5 + color.b * 7 + color.a * 11)
#define QOI_COLOR_HASH_2(color) ((((uint32)(color)) * 0x9E3779B1U) >> 26)
#define QOI_OP_RUN 0b11000000
// These definitions are important and impact how large our run can be:
// Run has 6 free bits -> 2^6 = 64 possible values
// However, the two highest values (0b111110, 0b111111) would collide with QOI_OP_RGB and QOI_OP_RGBA -> 64 - 2 = 62
#define QOI_OP_RGB 0b11111110
#define QOI_OP_RGBA 0b11111111
#define QOI_MASK_1 0b10000000
#define QOI_MASK_2 0b11000000
#define QOI_MASK_3 0b11100000
// @performance I feel like there is some more optimization possible by handling fully transparent pixels in a special way
// @todo We need to implement monochrome handling, which is very important for game assets that often use monochrome assets for all kinds of things (e.g. translucency)
// Opcode selection table for the encoder, 128 entries.
// The index is the OR of the three folded delta magnitudes (ar | ag | ab in qoi_encode),
// the value is the switch case that picks the smallest op able to hold those deltas:
//      0 -   1 -> 0 (QOI_OP_LUMA222, 1 byte)
//      2 -  15 -> 1 (QOI_OP_LUMA555, 2 bytes)
//     16 -  63 -> 2 (QOI_OP_LUMA777, 3 bytes)
//     64 - 127 -> 3 (QOI_OP_RGB, 4 bytes)
const byte optable[128] = {
    //   0 -  15
    0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    //  16 -  63
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    //  64 - 127
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
};
// Encodes image into data: writes the image header via image_header_to_data and then
// the pixel stream, with one loop for 4-channel and one for 3-channel images.
// Returns the number of bytes written (data - start).
// NOTE(review): this span interleaves the previous implementation (index cache,
// QOI_OP_INDEX/DIFF/LUMA, writes through data[p++]) with the new one (LUMA222/555/777,
// writes through *data++). The comments below describe the *data++ variant.
int32 qoi_encode(const Image* image, byte* data)
{
int32 p = image_header_to_data(image, data);
byte* start = data;
data += image_header_to_data(image, data);
v4_byte index[64];
memset(index, 0, sizeof(index));
@ -34,173 +51,257 @@ int32 qoi_encode(const Image* image, byte* data)
v4_byte px_prev = {0, 0, 0, 255};
v4_byte px = px_prev;
int32 channels = (image->image_settings & IMAGE_SETTING_CHANNEL_COUNT);
const int32 channels = (image->image_settings & IMAGE_SETTING_CHANNEL_COUNT);
// Only works with 1 byte channel size -> we don't have to multiply channel count with channel size
int32 px_len = image->width * image->height * channels;
// Offset of the last pixel, used to flush a pending run at the end of the image
int32 px_end = px_len - channels;
int32 run = 0;
for (int32 px_pos = 0; px_pos < px_len; px_pos += channels) {
// @performance could We just use int32 type cast? The problem would be the last pixel which would be out of bounds by 1 byte
memcpy(&px, &image->pixels[px_pos], channels * sizeof(byte));
if (channels == 4) {
for (int32 px_pos = 0; px_pos < px_len; px_pos += 4) {
px.val = SWAP_ENDIAN_LITTLE(*((uint32 *) (image->pixels + px_pos)));
if (px.val == px_prev.val) {
++run;
if (run == 62 || px_pos == px_end) {
data[p++] = (byte) (QOI_OP_RUN | (run - 1));
run = 0;
}
} else {
if (run) {
data[p++] = (byte) (QOI_OP_RUN | (run - 1));
run = 0;
}
// Count consecutive pixels equal to the previous one; a run op stores (run - 1)
// in its low 6 bits and is flushed at 62 repeats or at the last pixel
while(px.val == px_prev.val) {
++run;
if(px_pos == px_end) {
*data++ = (byte) (QOI_OP_RUN | (run - 1));
px_pos = px_len;
int32 index_pos = QOI_COLOR_HASH(px) % 64;
//int32 index_pos = QOI_COLOR_HASH_2(px);
if (index[index_pos].val == px.val) {
data[p++] = (byte) (QOI_OP_INDEX | index_pos);
} else {
index[index_pos] = px;
if (px.a == px_prev.a) {
signed char vr = px.r - px_prev.r;
signed char vg = px.g - px_prev.g;
signed char vb = px.b - px_prev.b;
signed char vg_r = vr - vg;
signed char vg_b = vb - vg;
if (vr > -3 && vr < 2
&& vg > -3 && vg < 2
&& vb > -3 && vb < 2
) {
data[p++] = QOI_OP_DIFF | (vr + 2) << 4 | (vg + 2) << 2 | (vb + 2);
} else if (vg_r > -9 && vg_r < 8
&& vg > -33 && vg < 32
&& vg_b > -9 && vg_b < 8
) {
data[p++] = QOI_OP_LUMA | (vg + 32);
data[p++] = (vg_r + 8) << 4 | (vg_b + 8);
} else {
data[p++] = QOI_OP_RGB;
data[p++] = px.r;
data[p++] = px.g;
data[p++] = px.b;
}
} else {
data[p++] = QOI_OP_RGBA;
*((uint32 *) &data[p]) = SWAP_ENDIAN_LITTLE(px.val);
p += 4;
break;
} else if (run == 62) {
*data++ = (byte) (QOI_OP_RUN | (run - 1));
run = 0;
}
px_pos += 4;
px.val = SWAP_ENDIAN_LITTLE(*((uint32 *) (image->pixels + px_pos)));
}
// NOTE(review): when the loop above exits through the px_pos == px_end branch, run is
// still non-zero, so this looks like it emits the same run op a second time (and a
// zero-delta op afterwards) - confirm
if (run) {
*data++ = (byte) (QOI_OP_RUN | (run - 1));
run = 0;
}
// Alpha changed: emit the RGBA marker plus the new alpha byte, the colour op follows
if(px.a != px_prev.a){
*data++ = QOI_OP_RGBA;
*data++ = px.a;
}
// Colour is stored as the green delta plus red/blue deltas relative to green
signed char vr = px.r - px_prev.r;
signed char vg = px.g - px_prev.g;
signed char vb = px.b - px_prev.b;
signed char vg_r = vr - vg;
signed char vg_b = vb - vg;
// Fold each delta onto a non-negative magnitude (v < 0 ? -v - 1 : v);
// the OR of all three indexes optable to pick the smallest op that fits
byte ar = vg_r < 0 ? -vg_r - 1 : vg_r;
byte ag = vg < 0 ? -vg - 1 : vg;
byte ab = vg_b < 0 ? -vg_b - 1 : vg_b;
byte argb = ar | ag | ab;
// NOTE(review): the first byte of a LUMA555 op carries 7 payload bits and the first
// byte of a LUMA777 op carries 5, so their 2-bit tags (00 / 01) can overlap with
// payload bits - verify the ops stay distinguishable for the decoder
switch(optable[argb]) {
case 0:
*data++ = QOI_OP_LUMA222 | ((vg_r + 2) << 4) | ((vg_b + 2) << 2) | (vg + 2);
break;
case 1:
*data++ = QOI_OP_LUMA555 | ((vg_b + 16) << 2) | ((vg_r + 16) >> 3);
*data++ = (((vg_r + 16) & 7) << 5) | (vg + 16);
break;
case 2:
*data++ = QOI_OP_LUMA777 | ((vg_b + 64) >> 2);
*data++ = (((vg_b + 64) & 3) << 6) | ((vg_r + 64) >> 1);
*data++ = (((vg_r + 64) & 1) << 7) | (vg + 64);
break;
case 3:
*data++ = QOI_OP_RGB;
*data++ = px.r;
*data++ = px.g;
*data++ = px.b;
break;
}
px_prev = px;
}
} else {
// 3-channel path: same scheme as above without the alpha op; px.a keeps its initial value
for (int32 px_pos = 0; px_pos < px_len; px_pos += 3) {
px.r = image->pixels[px_pos];
px.g = image->pixels[px_pos + 1];
px.b = image->pixels[px_pos + 2];
while(px.val == px_prev.val) {
++run;
if(px_pos == px_end) {
*data++ = (byte) (QOI_OP_RUN | (run - 1));
px_pos = px_len;
break;
} else if (run == 62) {
*data++ = (byte) (QOI_OP_RUN | (run - 1));
run = 0;
}
px_pos += 3;
px.r = image->pixels[px_pos];
px.g = image->pixels[px_pos + 1];
px.b = image->pixels[px_pos + 2];
}
if (run) {
*data++ = (byte) (QOI_OP_RUN | (run - 1));
run = 0;
}
signed char vr = px.r - px_prev.r;
signed char vg = px.g - px_prev.g;
signed char vb = px.b - px_prev.b;
signed char vg_r = vr - vg;
signed char vg_b = vb - vg;
byte ar = vg_r < 0 ? -vg_r - 1 : vg_r;
byte ag = vg < 0 ? -vg - 1 : vg;
byte ab = vg_b < 0 ? -vg_b - 1 : vg_b;
byte argb = ar | ag | ab;
switch(optable[argb]) {
case 0:
*data++ = QOI_OP_LUMA222 | ((vg_r + 2) << 4) | ((vg_b + 2) << 2) | (vg + 2);
break;
case 1:
*data++ = QOI_OP_LUMA555 | ((vg_b + 16) << 2) | ((vg_r + 16) >> 3);
*data++ = (((vg_r + 16) & 7) << 5) | (vg + 16);
break;
case 2:
*data++ = QOI_OP_LUMA777 | ((vg_b + 64) >> 2);
*data++ = (((vg_b + 64) & 3) << 6) | ((vg_r + 64) >> 1);
*data++ = (((vg_r + 64) & 1) << 7) | (vg + 64);
break;
case 3:
*data++ = QOI_OP_RGB;
*data++ = px.r;
*data++ = px.g;
*data++ = px.b;
break;
}
px_prev = px;
}
}
return (int32) (data - start);
}
// Decodes a 4-channel pixel stream into image->pixels.
// data must point at the first op (qoi_decode passes data + header_length);
// image->width and image->height must already be set.
// Returns px_len = width * height * 4, the number of pixel bytes produced.
int32 qoi_decode_4(const byte* data, Image* image)
{
uint32 px_len = image->width * image->height * 4;
// Start pixel mirrors the encoder's initial px_prev {0, 0, 0, 255}
v4_byte px = {0, 0, 0, 255};
// NOTE(review): index is not referenced anywhere else in this function as shown
v4_byte index[64] = {0};
int32 run = 0;
for (int32 px_pos = 0; px_pos < px_len; px_pos += 4) {
if (run > 0) {
// Inside a run: repeat the current pixel without reading input
--run;
} else {
OP_RGBA_GOTO:
byte b1 = *data++;
if (b1 == QOI_OP_RGB) {
px.r = *data++;
px.g = *data++;
px.b = *data++;
} else if (b1 == QOI_OP_RGBA) {
px.a = *data++;
// The alpha byte is followed by another op carrying the colour, so read again
goto OP_RGBA_GOTO;
} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA222) {
// 1 byte: green delta in bits 0-1, red/blue deltas (relative to green) in bits 4-5 / 2-3, bias 2
byte vg = (b1 & 3) - 2;
px.r += vg - 2 + ((b1 >> 4) & 3);
px.g += vg;
px.b += vg - 2 + ((b1 >> 2) & 3);
} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA555) {
// 2 bytes, 5 bits per delta, bias 16
byte b2 = *data++;
byte vg = (b2 & 31) - 16;
px.r += vg - 16 + (((b1 & 3) << 3) | (b2 >> 5));
px.g += vg;
px.b += vg - 16 + ((b1 >> 2) & 31);
} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA777) {
// 3 bytes, 7 bits per delta, bias 64
byte b2 = *data++;
byte b3 = *data++;
byte vg = (b3 & 0x7f) - 64;
px.r += vg - 64 + ((b2 & 0x3f) << 1) + (b3 >> 7);
px.g += vg;
px.b += vg - 64 + ((b1 & 0x1f) << 2) + (b2 >> 6);
} else if ((b1 & QOI_MASK_2) == QOI_OP_RUN) {
// Low 6 bits = number of additional repeats; this iteration already writes one pixel
run = (b1 & 0x3f);
}
}
// NOTE(review): px_prev is not declared in this function as shown; this line and the
// `return p;` below look like leftovers of the removed encoder tail - confirm
px_prev = px;
*((uint32 *) &image->pixels[px_pos]) = SWAP_ENDIAN_LITTLE(px.val);
}
return p;
return px_len;
}
// Decodes a 3-channel pixel stream into image->pixels (qoi_decode_3).
// data must point at the first op (qoi_decode passes data + header_length);
// image->width and image->height must already be set.
// Returns px_len = width * height * 3, the number of pixel bytes produced.
// NOTE(review): this span interleaves the removed qoi_decode(data, image, steps)
// implementation (header parsing, index cache, SIMD run fill through data[p++]) with the
// new qoi_decode_3 body (ops read through *data++). Only the latter is meant to remain.
int32 qoi_decode(const byte* data, Image* image, int32 steps = 8)
int32 qoi_decode_3(const byte* data, Image* image)
{
int32 header_length = image_header_from_data(data, image);
int32 p = header_length;
int32 channels = (image->image_settings & IMAGE_SETTING_CHANNEL_COUNT);
uint32 px_len = image->width * image->height * channels;
v4_byte px = {0, 0, 0, 255};
v4_byte index[64];
memset(index, 0, sizeof(index));
uint32 px_len = image->width * image->height * 3;
v3_byte px = {0, 0, 0};
int32 run = 0;
for (uint32 px_pos = 0; px_pos < px_len; px_pos += channels) {
int32 b1 = data[p++];
for (int32 px_pos = 0; px_pos < px_len; px_pos += 3) {
if (run > 0) {
// Inside a run: repeat the current pixel without reading input
--run;
} else {
byte b1 = *data++;
if (b1 == QOI_OP_RGB) {
px.r = data[p++];
px.g = data[p++];
px.b = data[p++];
} else if (b1 == QOI_OP_RGBA) {
px.val = SWAP_ENDIAN_LITTLE(*((uint32 *) &data[p]));
p += 4;
} else if ((b1 & QOI_MASK_2) == QOI_OP_INDEX) {
px = index[b1];
} else if ((b1 & QOI_MASK_2) == QOI_OP_DIFF) {
px.r += ((b1 >> 4) & 0x03) - 2;
px.g += ((b1 >> 2) & 0x03) - 2;
px.b += ( b1 & 0x03) - 2;
} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA) {
int32 b2 = data[p++];
byte vg = (b1 & 0x3f) - 32;
px.r += vg - 8 + ((b2 >> 4) & 0x0f);
px.g += vg;
px.b += vg - 8 + (b2 & 0x0f);
} else if ((b1 & QOI_MASK_2) == QOI_OP_RUN) {
run = (b1 & 0x3f);
if (b1 == QOI_OP_RGB) {
px.r = *data++;
px.g = *data++;
px.b = *data++;
} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA222) {
// 1 byte: green delta in bits 0-1, red/blue deltas (relative to green) in bits 4-5 / 2-3, bias 2
byte vg = (b1 & 3) - 2;
px.r += vg - 2 + ((b1 >> 4) & 3);
px.g += vg;
px.b += vg - 2 + ((b1 >> 2) & 3);
} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA555) {
// 2 bytes, 5 bits per delta, bias 16
byte b2 = *data++;
byte vg = (b2 & 31) - 16;
px.r += vg - 16 + (((b1 & 3) << 3) | (b2 >> 5));
px.g += vg;
px.b += vg - 16 + ((b1 >> 2) & 31);
} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA777) {
// 3 bytes, 7 bits per delta, bias 64
byte b2 = *data++;
byte b3 = *data++;
byte vg = (b3 & 0x7f) - 64;
px.r += vg - 64 + ((b2 & 0x3f) << 1) + (b3 >> 7);
px.g += vg;
px.b += vg - 64 + ((b1 & 0x1f) << 2) + (b2 >> 6);
} else if ((b1 & QOI_MASK_2) == QOI_OP_RUN) {
// Low 6 bits = number of additional repeats; this iteration already writes one pixel
run = (b1 & 0x3f);
}
}
if (channels == 4) {
uint32 px_little_endian = SWAP_ENDIAN_LITTLE(px.val);
int32 pixel_step_size = steps * 4;
int32 i = 0;
// @performance Implement for ARM
#if ARM
#else
if (steps == 16) {
__m512i simd_value = _mm512_set1_epi32(px_little_endian);
for(; i <= run - steps; i += steps, px_pos += pixel_step_size) {
_mm512_storeu_si512((__m512i *) &image->pixels[px_pos], simd_value);
}
} else if (steps >= 8) {
__m256i simd_value = _mm256_set1_epi32(px_little_endian);
for (; i <= run - steps; i += steps, px_pos += pixel_step_size) {
_mm256_storeu_si256((__m256i *) &image->pixels[px_pos], simd_value);
}
} else if (steps >= 4) {
__m128i simd_value = _mm_set1_epi32(px_little_endian);
for(; i <= run - steps; i += steps, px_pos += pixel_step_size) {
_mm_storeu_si128((__m128i *) &image->pixels[px_pos], simd_value);
}
}
#endif
for (; i < run; ++i) {
*((uint32 *) &image->pixels[px_pos]) = px_little_endian;
px_pos += channels;
}
} else if (channels == 3) {
for (int32 i = 0; i < run; ++i) {
image->pixels[px_pos++] = px.r;
image->pixels[px_pos++] = px.g;
image->pixels[px_pos++] = px.b;
}
} else if (channels == 1) {
memset(&image->pixels[px_pos], px.r, run * sizeof(byte));
px_pos += run;
}
// Correction, since the loop increments by channels count as well
px_pos -= channels;
index[QOI_COLOR_HASH(px) % 64] = px;
//index[QOI_COLOR_HASH_2(px)] = px;
continue;
}
index[QOI_COLOR_HASH(px) % 64] = px;
//index[QOI_COLOR_HASH_2(px)] = px;
memcpy(&image->pixels[px_pos], &px, channels * sizeof(byte));
// Store the current pixel channel by channel
image->pixels[px_pos] = px.r;
image->pixels[px_pos + 1] = px.g;
image->pixels[px_pos + 2] = px.b;
}
return header_length + px_len;
return px_len;
}
/**
 * Decodes a QOI-style encoded image.
 *
 * Parses the image header from data into image and hands the remaining bytes to
 * the decoder that matches the channel count stored in image_settings.
 *
 * @param data  Encoded input, beginning with the serialized image header
 * @param image Receives the header fields and the decoded pixels
 *
 * @return header length + length reported by the channel specific decoder
 *         (header length only if the channel count is neither 3 nor 4)
 */
int32 qoi_decode(const byte* data, Image* image)
{
    const int32 header_size = image_header_from_data(data, image);
    const byte* payload = data + header_size;

    int32 decoded = 0;
    switch (image->image_settings & IMAGE_SETTING_CHANNEL_COUNT) {
        case 4:
            decoded = qoi_decode_4(payload, image);
            break;
        case 3:
            decoded = qoi_decode_3(payload, image);
            break;
        default:
            // Any other channel count is left undecoded
            break;
    }

    return header_size + decoded;
}
#endif

View File

@ -21,6 +21,7 @@
#include "../../utils/Utils.h"
#include "../../utils/TestUtils.h"
#include "../../memory/RingMemory.h"
#include "../../log/Log.h"
typedef HANDLE FileHandle;
typedef HANDLE MMFHandle;

View File

@ -148,7 +148,7 @@ uint32 audio_buffer_fillable(const AudioSetting* setting, const DirectSoundSetti
return 0;
}
DWORD bytes_to_lock = setting->sample_buffer_size;
DWORD bytes_to_lock = (setting->sample_index * setting->sample_size) % setting->buffer_size;
DWORD bytes_to_write = 0;
DWORD target_cursor = (player_cursor + (setting->latency * setting->sample_size)) % setting->buffer_size;
@ -180,7 +180,7 @@ void audio_play_buffer(AudioSetting* setting, DirectSoundSetting* api_setting)
void* region2;
DWORD region2_size;
DWORD bytes_to_lock = setting->sample_buffer_size;
DWORD bytes_to_lock = (setting->sample_index * setting->sample_size) % setting->buffer_size;
api_setting->secondary_buffer->Lock(
bytes_to_lock, setting->sample_buffer_size,
@ -204,7 +204,7 @@ void audio_play_buffer(AudioSetting* setting, DirectSoundSetting* api_setting)
}
api_setting->secondary_buffer->Unlock(region1, region1_size, region2, region2_size);
setting->sample_index += (uint16) (setting->sample_buffer_size / setting->sample_size);
setting->sample_buffer_size = 0;
}

View File

@ -115,7 +115,6 @@ void audio_play(AudioSetting* setting, XAudio2Setting* api_setting) {
}
api_setting->source_voice->Start(0, XAUDIO2_COMMIT_NOW);
setting->sample_index = 0;
}
inline
@ -194,9 +193,6 @@ void audio_play_buffer(AudioSetting* setting, XAudio2Setting* api_setting) {
}
++setting->sample_output;
// @performance Why do I even need this?
//setting->sample_index += setting->sample_buffer_size / setting->sample_size;
setting->sample_buffer_size = 0;
}