mirror of
https://github.com/Karaka-Management/cOMS.git
synced 2026-01-10 19:08:39 +00:00
Sound kind of working again but choppy; probably an interval issue or a sample index issue
This commit is contained in:
parent
c1c028149f
commit
b916506f89
|
|
@ -44,8 +44,11 @@ int32 audio_data_size(const Audio* audio)
|
|||
);
|
||||
}
|
||||
|
||||
int32 audio_from_data(const byte* data, Audio* audio)
|
||||
inline
|
||||
uint32 audio_header_from_data(const byte* data, Audio* audio)
|
||||
{
|
||||
const byte* start = data;
|
||||
|
||||
audio->sample_rate = SWAP_ENDIAN_LITTLE(*((uint16 *) data));
|
||||
data += sizeof(audio->sample_rate);
|
||||
|
||||
|
|
@ -60,14 +63,14 @@ int32 audio_from_data(const byte* data, Audio* audio)
|
|||
audio->size = SWAP_ENDIAN_LITTLE(*((uint32 *) data));
|
||||
data += sizeof(audio->size);
|
||||
|
||||
memcpy(audio->data, data, audio->size);
|
||||
data += audio->size;
|
||||
|
||||
return audio_data_size(audio);
|
||||
return (int32) (data - start);
|
||||
}
|
||||
|
||||
int32 audio_to_data(const Audio* audio, byte* data)
|
||||
inline
|
||||
uint32 audio_header_to_data(const Audio* audio, byte* data)
|
||||
{
|
||||
byte* start = data;
|
||||
|
||||
*((uint16 *) data) = SWAP_ENDIAN_LITTLE(audio->sample_rate);
|
||||
data += sizeof(audio->sample_rate);
|
||||
|
||||
|
|
@ -78,6 +81,23 @@ int32 audio_to_data(const Audio* audio, byte* data)
|
|||
*((uint32 *) data) = SWAP_ENDIAN_LITTLE(audio->size);
|
||||
data += sizeof(audio->size);
|
||||
|
||||
return (int32) (data - start);
|
||||
}
|
||||
|
||||
/**
 * Deserializes an audio object (header followed by the sample payload).
 *
 * @param data  Serialized audio, starting at the header
 * @param audio Destination; audio->data receives audio->size payload bytes
 *              and is not allocated here
 *
 * @return The value reported by audio_data_size() for the filled object
 */
uint32 audio_from_data(const byte* data, Audio* audio)
{
    // Header first: this fills the meta fields, including audio->size
    const byte* payload = data + audio_header_from_data(data, audio);

    // The payload follows directly after the header
    memcpy(audio->data, payload, audio->size);

    return audio_data_size(audio);
}
|
||||
|
||||
uint32 audio_to_data(const Audio* audio, byte* data)
|
||||
{
|
||||
data += audio_header_to_data(audio, data);
|
||||
|
||||
memcpy(data, audio->data, audio->size);
|
||||
data += audio->size;
|
||||
|
||||
|
|
|
|||
|
|
@ -69,7 +69,6 @@ struct AudioInstance {
|
|||
enum AudioMixerState {
|
||||
AUDIO_MIXER_STATE_UNINITIALIZED,
|
||||
AUDIO_MIXER_STATE_INACTIVE,
|
||||
AUDIO_MIXER_STATE_SHOULD_PLAY,
|
||||
AUDIO_MIXER_STATE_ACTIVE,
|
||||
};
|
||||
|
||||
|
|
@ -100,7 +99,7 @@ struct AudioMixer {
|
|||
};
|
||||
|
||||
bool audio_mixer_is_active(AudioMixer* mixer) {
|
||||
if (mixer->state_new == AUDIO_MIXER_STATE_ACTIVE
|
||||
if (mixer->state_old == AUDIO_MIXER_STATE_ACTIVE
|
||||
&& atomic_get_relaxed((int32 *) &mixer->state_new) == AUDIO_MIXER_STATE_ACTIVE
|
||||
) {
|
||||
return true;
|
||||
|
|
@ -108,14 +107,14 @@ bool audio_mixer_is_active(AudioMixer* mixer) {
|
|||
|
||||
AudioMixerState mixer_state;
|
||||
if ((mixer_state = (AudioMixerState) atomic_get_relaxed((int32 *) &mixer->state_new)) != mixer->state_old) {
|
||||
if (mixer_state != AUDIO_MIXER_STATE_UNINITIALIZED) {
|
||||
if (mixer->state_old == AUDIO_MIXER_STATE_UNINITIALIZED) {
|
||||
audio_load(
|
||||
mixer->window,
|
||||
&mixer->settings,
|
||||
&mixer->api_setting
|
||||
);
|
||||
|
||||
mixer_state = AUDIO_MIXER_STATE_INACTIVE;
|
||||
mixer->state_old = AUDIO_MIXER_STATE_INACTIVE;
|
||||
}
|
||||
|
||||
if (mixer_state == AUDIO_MIXER_STATE_ACTIVE) {
|
||||
|
|
|
|||
|
|
@ -22,6 +22,9 @@ struct AudioSetting {
|
|||
// usually 48000 or 44100
|
||||
uint16 sample_rate;
|
||||
|
||||
// This sample index is used to calculate the position in a ring buffer
|
||||
uint16 sample_index;
|
||||
|
||||
// bytes per bloc
|
||||
// channel count * bit
|
||||
// usually 2 * 16 bit = 32 bit = 4 bytes
|
||||
|
|
|
|||
436
audio/Qoa.h
Normal file
436
audio/Qoa.h
Normal file
|
|
@ -0,0 +1,436 @@
|
|||
/**
|
||||
* Jingga
|
||||
*
|
||||
* @copyright 2023, Dominic Szablewski - https://phoboslab.org
|
||||
* @copyright Jingga
|
||||
* @license OMS License 2.0
|
||||
* @version 1.0.0
|
||||
* @link https://jingga.app
|
||||
*/
|
||||
#ifndef TOS_AUDIO_QOA_H
|
||||
#define TOS_AUDIO_QOA_H
|
||||
|
||||
#include "../stdlib/Types.h"
|
||||
#include "../utils/EndianUtils.h"
|
||||
#include "../audio/Audio.cpp"
|
||||
|
||||
#define QOA_SLICE_LEN 20
|
||||
#define QOA_SLICES_PER_FRAME 256
|
||||
#define QOA_FRAME_LEN (QOA_SLICES_PER_FRAME * QOA_SLICE_LEN)
|
||||
#define QOA_LMS_LEN 4
|
||||
#define QOA_MAX_CHANNELS 8
|
||||
|
||||
#define QOA_FRAME_SIZE(channels, slices) (4 + QOA_LMS_LEN * 4 * (channels) + 8 * (slices) * (channels))
|
||||
|
||||
struct alignas(16) QoaLms {
|
||||
int32 history[QOA_LMS_LEN];
|
||||
int32 weights[QOA_LMS_LEN];
|
||||
};
|
||||
|
||||
/*
|
||||
The quant_tab provides an index into the dequant_tab for residuals in the
|
||||
range of -8 .. 8. It maps this range to just 3bits and becomes less accurate at
|
||||
the higher end. Note that the residual zero is identical to the lowest positive
|
||||
value. This is mostly fine, since the qoa_div() function always rounds away
|
||||
from zero.
|
||||
*/
|
||||
static const int32 qoa_quant_tab[17] = {
|
||||
7, 7, 7, 5, 5, 3, 3, 1, /* -8..-1 */
|
||||
0, /* 0 */
|
||||
0, 2, 2, 4, 4, 6, 6, 6 /* 1.. 8 */
|
||||
};
|
||||
|
||||
/*
|
||||
We have 16 different scalefactors. Like the quantized residuals these become
|
||||
less accurate at the higher end. In theory, the highest scalefactor that we
|
||||
would need to encode the highest 16bit residual is (2**16)/8 = 8192. However we
|
||||
rely on the LMS filter to predict samples accurately enough that a maximum
|
||||
residual of one quarter of the 16 bit range is sufficient. I.e. with the
|
||||
scalefactor 2048 times the quant range of 8 we can encode residuals up to 2**14.
|
||||
|
||||
The scalefactor values are computed as:
|
||||
scalefactor_tab[s] <- round(pow(s + 1, 2.75))
|
||||
*/
|
||||
static const int32 qoa_scalefactor_tab[16] = {
|
||||
1, 7, 21, 45, 84, 138, 211, 304, 421, 562, 731, 928, 1157, 1419, 1715, 2048
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
The reciprocal_tab maps each of the 16 scalefactors to their rounded
|
||||
reciprocals 1/scalefactor. This allows us to calculate the scaled residuals in
|
||||
the encoder with just one multiplication instead of an expensive division. We
|
||||
do this in .16 fixed point with integers, instead of floats.
|
||||
|
||||
The reciprocal_tab is computed as:
|
||||
reciprocal_tab[s] <- ((1<<16) + scalefactor_tab[s] - 1) / scalefactor_tab[s]
|
||||
*/
|
||||
static const int32 qoa_reciprocal_tab[16] = {
|
||||
65536, 9363, 3121, 1457, 781, 475, 311, 216, 156, 117, 90, 71, 57, 47, 39, 32
|
||||
};
|
||||
|
||||
/*
|
||||
The dequant_tab maps each of the scalefactors and quantized residuals to
|
||||
their unscaled & dequantized version.
|
||||
|
||||
Since qoa_div rounds away from the zero, the smallest entries are mapped to 3/4
|
||||
instead of 1. The dequant_tab assumes the following dequantized values for each
|
||||
of the quant_tab indices and is computed as:
|
||||
float dqt[8] = {0.75, -0.75, 2.5, -2.5, 4.5, -4.5, 7, -7};
|
||||
dequant_tab[s][q] <- round_ties_away_from_zero(scalefactor_tab[s] * dqt[q])
|
||||
|
||||
The rounding employed here is "to nearest, ties away from zero", i.e. positive
|
||||
and negative values are treated symmetrically.
|
||||
*/
|
||||
static const int32 qoa_dequant_tab[16][8] = {
|
||||
{ 1, -1, 3, -3, 5, -5, 7, -7},
|
||||
{ 5, -5, 18, -18, 32, -32, 49, -49},
|
||||
{ 16, -16, 53, -53, 95, -95, 147, -147},
|
||||
{ 34, -34, 113, -113, 203, -203, 315, -315},
|
||||
{ 63, -63, 210, -210, 378, -378, 588, -588},
|
||||
{ 104, -104, 345, -345, 621, -621, 966, -966},
|
||||
{ 158, -158, 528, -528, 950, -950, 1477, -1477},
|
||||
{ 228, -228, 760, -760, 1368, -1368, 2128, -2128},
|
||||
{ 316, -316, 1053, -1053, 1895, -1895, 2947, -2947},
|
||||
{ 422, -422, 1405, -1405, 2529, -2529, 3934, -3934},
|
||||
{ 548, -548, 1828, -1828, 3290, -3290, 5117, -5117},
|
||||
{ 696, -696, 2320, -2320, 4176, -4176, 6496, -6496},
|
||||
{ 868, -868, 2893, -2893, 5207, -5207, 8099, -8099},
|
||||
{1064, -1064, 3548, -3548, 6386, -6386, 9933, -9933},
|
||||
{1286, -1286, 4288, -4288, 7718, -7718, 12005, -12005},
|
||||
{1536, -1536, 5120, -5120, 9216, -9216, 14336, -14336},
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
The Least Mean Squares Filter is the heart of QOA. It predicts the next
|
||||
sample based on the previous 4 reconstructed samples. It does so by continuously
|
||||
adjusting 4 weights based on the residual of the previous prediction.
|
||||
|
||||
The next sample is predicted as the sum of (weight[i] * history[i]).
|
||||
|
||||
The adjustment of the weights is done with a "Sign-Sign-LMS" that adds or
|
||||
subtracts the residual to each weight, based on the corresponding sample from
|
||||
the history. This, surprisingly, is sufficient to get worthwhile predictions.
|
||||
|
||||
This is all done with fixed point integers. Hence the right-shifts when updating
|
||||
the weights and calculating the prediction.
|
||||
*/
|
||||
// @performance Depending on context most likely SIMDable
|
||||
static inline
|
||||
int32 qoa_lms_predict(QoaLms* lms)
|
||||
{
|
||||
int32 prediction = 0;
|
||||
for (int32 i = 0; i < QOA_LMS_LEN; ++i) {
|
||||
prediction += lms->weights[i] * lms->history[i];
|
||||
}
|
||||
|
||||
return prediction >> 13;
|
||||
}
|
||||
|
||||
// @performance Depending on context most likely SIMDable
|
||||
static inline
|
||||
void qoa_lms_update(QoaLms* lms, int32 sample, int32 residual) {
|
||||
int32 delta = residual >> 4;
|
||||
|
||||
lms->weights[0] += lms->history[0] < 0 ? -delta : delta;
|
||||
for (int32 i = 0; i < QOA_LMS_LEN - 1; ++i) {
|
||||
lms->history[i] = lms->history[i + 1];
|
||||
lms->weights[i + 1] += lms->history[i + 1] < 0 ? -delta : delta;
|
||||
}
|
||||
lms->weights[QOA_LMS_LEN - 1] += lms->history[QOA_LMS_LEN - 1] < 0 ? -delta : delta;
|
||||
lms->history[QOA_LMS_LEN - 1] = sample;
|
||||
}
|
||||
|
||||
/*
|
||||
qoa_div() implements a rounding division, but avoids rounding to zero for
|
||||
small numbers. E.g. 0.1 will be rounded to 1. Note that 0 itself still
|
||||
returns as 0, which is handled in the qoa_quant_tab[].
|
||||
qoa_div() takes an index into the .16 fixed point qoa_reciprocal_tab as an
|
||||
argument, so it can do the division with a cheaper integer multiplication.
|
||||
*/
|
||||
static inline
|
||||
int32 qoa_div(int32 v, int32 scalefactor) {
|
||||
int32 reciprocal = qoa_reciprocal_tab[scalefactor];
|
||||
int32 n = (v * reciprocal + (1 << 15)) >> 16;
|
||||
|
||||
/* round away from 0 */
|
||||
n = n + ((v > 0) - (v < 0)) - ((n > 0) - (n < 0));
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
static inline
|
||||
int32 qoa_clamp(int32 v, int32 min, int32 max) {
|
||||
if (v < min) { return min; }
|
||||
if (v > max) { return max; }
|
||||
return v;
|
||||
}
|
||||
|
||||
/*
|
||||
This specialized clamp function for the signed 16 bit range improves decode
|
||||
performance quite a bit. The extra if() statement works nicely with the CPUs
|
||||
branch prediction as this branch is rarely taken.
|
||||
*/
|
||||
static inline
|
||||
int32 qoa_clamp_s16(int32 v) {
|
||||
if ((uint32) (v + 32768) > 65535) {
|
||||
if (v < -32768) { return -32768; }
|
||||
if (v > 32767) { return 32767; }
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
/**
 * Encodes one frame of interleaved 16 bit samples.
 *
 * Layout written to bytes: the sample count (4 bytes), then per channel the
 * LMS history and weights (8 bytes each), then one 8 byte slice per channel
 * for every QOA_SLICE_LEN samples.
 *
 * @param sample_data   Interleaved input samples of this frame
 * @param channels      Number of interleaved channels
 * @param frame_samples Samples per channel in this frame
 * @param lms           One LMS state per channel; updated while encoding
 * @param bytes         Output buffer
 *
 * @return Number of bytes written
 */
uint32 qoa_encode_frame(const int16* sample_data, int32 channels, uint32 frame_samples, QoaLms* lms, byte* bytes)
{
    byte* out = bytes;

    // Winning scalefactor of the previous slice, per channel
    int32 last_best_sf[QOA_MAX_CHANNELS] = {0};

    // Write the frame header
    *((uint32 *) out) = SWAP_ENDIAN_LITTLE(frame_samples);
    out += sizeof(frame_samples);

    // @performance SIMDable
    for (uint32 ch = 0; ch < channels; ++ch) {
        // Write the current LMS state, 16 bit per value, first value in the highest bits
        uint64 packed_history = 0;
        uint64 packed_weights = 0;

        for (int32 tap = 0; tap < QOA_LMS_LEN; ++tap) {
            packed_history = (packed_history << 16) | (lms[ch].history[tap] & 0xffff);
            packed_weights = (packed_weights << 16) | (lms[ch].weights[tap] & 0xffff);
        }

        *((uint64 *) out) = SWAP_ENDIAN_LITTLE(packed_history);
        out += sizeof(packed_history);

        *((uint64 *) out) = SWAP_ENDIAN_LITTLE(packed_weights);
        out += sizeof(packed_weights);
    }

    /*
    We encode all samples with the channels interleaved on a slice level.
    E.g. for stereo: (ch 0, slice 0), (ch 1, slice 0), (ch 0, slice 1), ...
    */
    for (uint32 first_sample = 0; first_sample < frame_samples; first_sample += QOA_SLICE_LEN) {
        // Identical for every channel; only the last slice of a frame can be shorter
        int32 slice_len = qoa_clamp(QOA_SLICE_LEN, 0, frame_samples - first_sample);

        // @performance SIMDable
        for (uint32 ch = 0; ch < channels; ++ch) {
            int32 slice_start = first_sample * channels + ch;
            int32 slice_end = (first_sample + slice_len) * channels + ch;

            /*
            Brute force search for the best scalefactor. Just go through all
            16 scalefactors, encode all samples for the current slice and
            measure the total squared error.
            */
            uint64 best_rank = -1;
            uint64 best_slice = 0;
            QoaLms best_lms;
            int32 best_scalefactor = 0;

            for (int32 attempt = 0; attempt < 16; ++attempt) {
                /*
                There is a strong correlation between the scalefactors of
                neighboring slices. As an optimization, start testing
                the best scalefactor of the previous slice first.
                */
                int32 scalefactor = (attempt + last_best_sf[ch]) % 16;

                /*
                Every attempt starts from the last known good LMS state,
                because encoding a slice modifies the state.
                */
                QoaLms trial_lms = lms[ch];
                uint64 trial_slice = scalefactor;
                uint64 trial_rank = 0;

                for (int32 si = slice_start; si < slice_end; si += channels) {
                    int32 sample = sample_data[si];
                    int32 predicted = qoa_lms_predict(&trial_lms);

                    int32 residual = sample - predicted;
                    int32 quantized = qoa_quant_tab[qoa_clamp(qoa_div(residual, scalefactor), -8, 8) + 8];
                    int32 dequantized = qoa_dequant_tab[scalefactor][quantized];
                    int32 reconstructed = qoa_clamp_s16(predicted + dequantized);

                    /*
                    If the weights have grown too large, we introduce a penalty
                    here. This prevents pops/clicks in certain problem cases
                    */
                    int32 weights_penalty = ((
                        trial_lms.weights[0] * trial_lms.weights[0]
                        + trial_lms.weights[1] * trial_lms.weights[1]
                        + trial_lms.weights[2] * trial_lms.weights[2]
                        + trial_lms.weights[3] * trial_lms.weights[3]
                    ) >> 18) - 0x8ff;

                    weights_penalty = weights_penalty < 0 ? 0 : weights_penalty;

                    int64 error = (sample - reconstructed);
                    uint64 error_sq = error * error;

                    trial_rank += error_sq + weights_penalty * weights_penalty;

                    // Already worse than the best attempt so far, no need to finish the slice
                    if (trial_rank > best_rank) {
                        break;
                    }

                    qoa_lms_update(&trial_lms, reconstructed, dequantized);
                    trial_slice = (trial_slice << 3) | quantized;
                }

                if (trial_rank < best_rank) {
                    best_rank = trial_rank;
                    best_slice = trial_slice;
                    best_lms = trial_lms;
                    best_scalefactor = scalefactor;
                }
            }

            last_best_sf[ch] = best_scalefactor;
            lms[ch] = best_lms;

            /*
            If this slice was shorter than QOA_SLICE_LEN, we have to left-
            shift all encoded data, to ensure the rightmost bits are the empty
            ones. This should only happen in the last frame of a file as all
            slices are completely filled otherwise.
            */
            best_slice <<= (QOA_SLICE_LEN - slice_len) * 3;

            *((uint64 *) out) = SWAP_ENDIAN_LITTLE(best_slice);
            out += sizeof(best_slice);
        }
    }

    return (uint32) (out - bytes);
}
|
||||
|
||||
/**
 * Encodes the samples of an audio object as a sequence of QOA frames.
 *
 * NOTE(review): qoa_decode() starts by reading an audio header, which this
 * function does not write - presumably the caller does; confirm.
 *
 * @param audio Source audio; audio->data is read as interleaved 16 bit samples
 * @param data  Output buffer for the encoded frames
 *
 * @return Number of bytes written to data
 */
uint32 qoa_encode(const Audio* audio, byte* data) {
    byte* start = data;

    // Number of samples per channel.
    // NOTE(review): if Audio::bloc_size already includes the channel count
    // (as the comment on AudioSetting's bloc size suggests), this divides by
    // the channel count twice - confirm against the Audio definition.
    uint32 sample_count = audio->size / (audio->channels * audio->bloc_size);

    // NOTE(review): audio->channels is not checked against QOA_MAX_CHANNELS
    QoaLms lms[QOA_MAX_CHANNELS];
    for (int32 i = 0; i < audio->channels; ++i) {
        /*
        Set the initial LMS weights to {0, 0, -1, 2}. This helps with the
        prediction of the first few ms of a file.
        */
        lms[i].weights[0] = 0;
        lms[i].weights[1] = 0;
        lms[i].weights[2] = -(1 << 13);
        lms[i].weights[3] = (1 << 14);

        // Explicitly set the history samples to 0, as we might have some garbage in there.
        memset(lms[i].history, 0, QOA_LMS_LEN * sizeof(int32));
    }

    // Go through all frames; only the last one can be shorter than QOA_FRAME_LEN
    int32 frame_samples = QOA_FRAME_LEN;

    for (uint32 sample_index = 0; sample_index < sample_count; sample_index += frame_samples) {
        frame_samples = qoa_clamp(QOA_FRAME_LEN, 0, sample_count - sample_index);
        data += qoa_encode_frame(
            (int16 *) (audio->data + sample_index * audio->channels * audio->bloc_size),
            audio->channels, frame_samples, lms, data
        );
    }

    return (uint32) (data - start);
}
|
||||
|
||||
/**
 * Decodes one frame into interleaved 16 bit samples.
 *
 * @param bytes       Encoded frame, starting at the frame header
 * @param channels    Number of interleaved channels
 * @param lms         One LMS state per channel; overwritten with the state
 *                    stored in the frame and then updated while decoding
 * @param sample_data Output buffer; receives frame_samples * channels samples
 *                    of 16 bit each (typed as bytes only in the signature)
 *
 * @return Number of encoded bytes consumed
 */
uint32 qoa_decode_frame(const byte* bytes, int32 channels, QoaLms* lms, byte* sample_data)
{
    const byte* start = bytes;

    // The encoder consumes int16 samples, so the decoder has to produce them.
    // Writing through the byte pointer would keep only the low 8 bits.
    int16* samples = (int16 *) sample_data;

    // Read the frame header
    uint32 frame_samples = SWAP_ENDIAN_LITTLE(*((uint32 *) bytes));
    bytes += sizeof(frame_samples);

    // Read the LMS state: 4 x 2 bytes history, 4 x 2 bytes weights per channel
    for (uint32 c = 0; c < channels; ++c) {
        uint64 history = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes));
        bytes += sizeof(history);

        uint64 weights = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes));
        bytes += sizeof(weights);

        // The first value sits in the highest 16 bits; the int16 cast restores the sign
        for (int32 i = 0; i < QOA_LMS_LEN; ++i) {
            lms[c].history[i] = ((int16) (history >> 48));
            history <<= 16;

            lms[c].weights[i] = ((int16) (weights >> 48));
            weights <<= 16;
        }
    }

    // Decode all slices for all channels in this frame
    for (uint32 sample_index = 0; sample_index < frame_samples; sample_index += QOA_SLICE_LEN) {
        for (uint32 c = 0; c < channels; c++) {
            uint64 slice = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes));
            bytes += sizeof(slice);

            // Top 4 bits: scalefactor, followed by 3 bits per sample
            int32 scalefactor = (slice >> 60) & 0xf;
            slice <<= 4;

            int32 slice_start = sample_index * channels + c;
            int32 slice_end = qoa_clamp(sample_index + QOA_SLICE_LEN, 0, frame_samples) * channels + c;

            for (int32 si = slice_start; si < slice_end; si += channels) {
                int32 predicted = qoa_lms_predict(&lms[c]);
                int32 quantized = (slice >> 61) & 0x7;
                int32 dequantized = qoa_dequant_tab[scalefactor][quantized];
                int32 reconstructed = qoa_clamp_s16(predicted + dequantized);

                samples[si] = (int16) reconstructed;
                slice <<= 3;

                qoa_lms_update(&lms[c], reconstructed, dequantized);
            }
        }
    }

    return (uint32) (bytes - start);
}

/**
 * Decodes an audio header followed by a sequence of QOA frames.
 *
 * @param data  Encoded data, starting at the audio header
 * @param audio Destination; the header fields are filled and the decoded
 *              samples are written to audio->data, which is not allocated here
 *
 * @return header_length + audio->size
 */
uint32 qoa_decode(const byte* data, Audio* audio)
{
    uint32 header_length = audio_header_from_data(data, audio);
    uint32 p = header_length;
    uint32 frame_size;
    byte* sample_ptr = audio->data;

    // No initialization required: every frame carries its own LMS state.
    // NOTE(review): audio->channels is not checked against QOA_MAX_CHANNELS
    QoaLms lms[QOA_MAX_CHANNELS];

    // Smallest possible frame: frame header + LMS state of every channel
    uint32 limit = 4 + QOA_LMS_LEN * 4 * audio->channels;

    do {
        // The frame header holds the samples per channel of this frame
        uint32 frame_samples = SWAP_ENDIAN_LITTLE(*((uint32 *) (data + p)));

        frame_size = qoa_decode_frame(data + p, audio->channels, lms, sample_ptr);

        // The output advances by the decoded size, the input by the encoded size.
        // Advancing the output by the (much smaller) encoded size makes every
        // frame overwrite most of the previous one.
        sample_ptr += frame_samples * audio->channels * sizeof(int16);
        p += frame_size;
    } while (frame_size && p < audio->size && audio->size - p >= limit);
    // @question do we really need the audio->size - p >= limit check?
    // NOTE(review): p is an offset from the start of data (header included) and is
    // compared against audio->size; whether audio->size is the encoded or the
    // decoded size cannot be told from this function - confirm.

    return header_length + audio->size;
}
|
||||
|
||||
#endif
|
||||
470
audio/QoaSimd.h
Normal file
470
audio/QoaSimd.h
Normal file
|
|
@ -0,0 +1,470 @@
|
|||
/**
|
||||
* Jingga
|
||||
*
|
||||
* @copyright 2023, Dominic Szablewski - https://phoboslab.org
|
||||
* @copyright Jingga
|
||||
* @license OMS License 2.0
|
||||
* @version 1.0.0
|
||||
* @link https://jingga.app
|
||||
*/
|
||||
#ifndef TOS_AUDIO_QOA_H
|
||||
#define TOS_AUDIO_QOA_H
|
||||
|
||||
#include "../stdlib/Types.h"
|
||||
#include "../utils/EndianUtils.h"
|
||||
#include "../audio/Audio.cpp"
|
||||
#include "../stdlib/simd/SIMD_I32.h"
|
||||
|
||||
#define QOA_SLICE_LEN 20
|
||||
#define QOA_SLICES_PER_FRAME 256
|
||||
#define QOA_FRAME_LEN (QOA_SLICES_PER_FRAME * QOA_SLICE_LEN)
|
||||
#define QOA_LMS_LEN 4
|
||||
#define QOA_MAX_CHANNELS 8
|
||||
|
||||
#define QOA_FRAME_SIZE(channels, slices) (4 + QOA_LMS_LEN * 4 * (channels) + 8 * (slices) * (channels))
|
||||
|
||||
struct QoaLms {
|
||||
int32_4 history; // automatically QOA_LMS_LEN size
|
||||
int32_4 weights; // automatically QOA_LMS_LEN size
|
||||
};
|
||||
|
||||
/*
|
||||
The quant_tab provides an index into the dequant_tab for residuals in the
|
||||
range of -8 .. 8. It maps this range to just 3bits and becomes less accurate at
|
||||
the higher end. Note that the residual zero is identical to the lowest positive
|
||||
value. This is mostly fine, since the qoa_div() function always rounds away
|
||||
from zero.
|
||||
*/
|
||||
static const int32 qoa_quant_tab[17] = {
|
||||
7, 7, 7, 5, 5, 3, 3, 1, /* -8..-1 */
|
||||
0, /* 0 */
|
||||
0, 2, 2, 4, 4, 6, 6, 6 /* 1.. 8 */
|
||||
};
|
||||
|
||||
/*
|
||||
We have 16 different scalefactors. Like the quantized residuals these become
|
||||
less accurate at the higher end. In theory, the highest scalefactor that we
|
||||
would need to encode the highest 16bit residual is (2**16)/8 = 8192. However we
|
||||
rely on the LMS filter to predict samples accurately enough that a maximum
|
||||
residual of one quarter of the 16 bit range is sufficient. I.e. with the
|
||||
scalefactor 2048 times the quant range of 8 we can encode residuals up to 2**14.
|
||||
|
||||
The scalefactor values are computed as:
|
||||
scalefactor_tab[s] <- round(pow(s + 1, 2.75))
|
||||
*/
|
||||
static const int32 qoa_scalefactor_tab[16] = {
|
||||
1, 7, 21, 45, 84, 138, 211, 304, 421, 562, 731, 928, 1157, 1419, 1715, 2048
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
The reciprocal_tab maps each of the 16 scalefactors to their rounded
|
||||
reciprocals 1/scalefactor. This allows us to calculate the scaled residuals in
|
||||
the encoder with just one multiplication instead of an expensive division. We
|
||||
do this in .16 fixed point with integers, instead of floats.
|
||||
|
||||
The reciprocal_tab is computed as:
|
||||
reciprocal_tab[s] <- ((1<<16) + scalefactor_tab[s] - 1) / scalefactor_tab[s]
|
||||
*/
|
||||
static const int32 qoa_reciprocal_tab[16] = {
|
||||
65536, 9363, 3121, 1457, 781, 475, 311, 216, 156, 117, 90, 71, 57, 47, 39, 32
|
||||
};
|
||||
|
||||
/*
|
||||
The dequant_tab maps each of the scalefactors and quantized residuals to
|
||||
their unscaled & dequantized version.
|
||||
|
||||
Since qoa_div rounds away from the zero, the smallest entries are mapped to 3/4
|
||||
instead of 1. The dequant_tab assumes the following dequantized values for each
|
||||
of the quant_tab indices and is computed as:
|
||||
float dqt[8] = {0.75, -0.75, 2.5, -2.5, 4.5, -4.5, 7, -7};
|
||||
dequant_tab[s][q] <- round_ties_away_from_zero(scalefactor_tab[s] * dqt[q])
|
||||
|
||||
The rounding employed here is "to nearest, ties away from zero", i.e. positive
|
||||
and negative values are treated symmetrically.
|
||||
*/
|
||||
static const int32 qoa_dequant_tab[16][8] = {
|
||||
{ 1, -1, 3, -3, 5, -5, 7, -7},
|
||||
{ 5, -5, 18, -18, 32, -32, 49, -49},
|
||||
{ 16, -16, 53, -53, 95, -95, 147, -147},
|
||||
{ 34, -34, 113, -113, 203, -203, 315, -315},
|
||||
{ 63, -63, 210, -210, 378, -378, 588, -588},
|
||||
{ 104, -104, 345, -345, 621, -621, 966, -966},
|
||||
{ 158, -158, 528, -528, 950, -950, 1477, -1477},
|
||||
{ 228, -228, 760, -760, 1368, -1368, 2128, -2128},
|
||||
{ 316, -316, 1053, -1053, 1895, -1895, 2947, -2947},
|
||||
{ 422, -422, 1405, -1405, 2529, -2529, 3934, -3934},
|
||||
{ 548, -548, 1828, -1828, 3290, -3290, 5117, -5117},
|
||||
{ 696, -696, 2320, -2320, 4176, -4176, 6496, -6496},
|
||||
{ 868, -868, 2893, -2893, 5207, -5207, 8099, -8099},
|
||||
{1064, -1064, 3548, -3548, 6386, -6386, 9933, -9933},
|
||||
{1286, -1286, 4288, -4288, 7718, -7718, 12005, -12005},
|
||||
{1536, -1536, 5120, -5120, 9216, -9216, 14336, -14336},
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
The Least Mean Squares Filter is the heart of QOA. It predicts the next
|
||||
sample based on the previous 4 reconstructed samples. It does so by continuously
|
||||
adjusting 4 weights based on the residual of the previous prediction.
|
||||
|
||||
The next sample is predicted as the sum of (weight[i] * history[i]).
|
||||
|
||||
The adjustment of the weights is done with a "Sign-Sign-LMS" that adds or
|
||||
subtracts the residual to each weight, based on the corresponding sample from
|
||||
the history. This, surprisingly, is sufficient to get worthwhile predictions.
|
||||
|
||||
This is all done with fixed point integers. Hence the right-shifts when updating
|
||||
the weights and calculating the prediction.
|
||||
*/
|
||||
static inline
|
||||
int32 qoa_lms_predict(QoaLms* lms)
|
||||
{
|
||||
__m128i products = _mm_mullo_epi32(lms->weights.s, lms->history.s);
|
||||
__m128i sum1 = _mm_hadd_epi32(products, products);
|
||||
__m128i sum2 = _mm_hadd_epi32(sum1, sum1);
|
||||
|
||||
int32 prediction = _mm_cvtsi128_si32(sum2);
|
||||
|
||||
return prediction >> 13;
|
||||
}
|
||||
|
||||
static inline
|
||||
void qoa_lms_update(QoaLms* lms, int32 sample, int32 residual) {
|
||||
int32 delta = residual >> 4;
|
||||
|
||||
__m128i delta_vec = _mm_set1_epi32(delta);
|
||||
__m128i zero_vec = _mm_setzero_si128();
|
||||
|
||||
// Calculate adjustments for weights based on the sign of history
|
||||
__m128i sign_mask = _mm_cmpgt_epi32(zero_vec, lms->history.s); // history < 0
|
||||
__m128i delta_adjust = _mm_blendv_epi8(delta_vec, _mm_sub_epi32(zero_vec, delta_vec), sign_mask);
|
||||
|
||||
// Update weights
|
||||
lms->weights.s = _mm_add_epi32(lms->weights.s, delta_adjust);
|
||||
|
||||
// Shift history left
|
||||
lms->history.s = _mm_alignr_epi8(lms->history.s, lms->history.s, 4); // Shift left by 1 int32 (4 bytes)
|
||||
|
||||
// Insert the new sample into the last position of history
|
||||
lms->history.s = _mm_insert_epi32(lms->history.s, sample, QOA_LMS_LEN - 1);
|
||||
|
||||
// Update the last weight based on the sign of the new sample
|
||||
int32 sample_sign_adjust = (sample < 0) ? -delta : delta;
|
||||
lms->weights.s = _mm_insert_epi32(
|
||||
lms->weights.s,
|
||||
_mm_extract_epi32(lms->weights.s, QOA_LMS_LEN - 1) + sample_sign_adjust,
|
||||
QOA_LMS_LEN - 1
|
||||
);
|
||||
}
|
||||
|
||||
/*
|
||||
qoa_div() implements a rounding division, but avoids rounding to zero for
|
||||
small numbers. E.g. 0.1 will be rounded to 1. Note that 0 itself still
|
||||
returns as 0, which is handled in the qoa_quant_tab[].
|
||||
qoa_div() takes an index into the .16 fixed point qoa_reciprocal_tab as an
|
||||
argument, so it can do the division with a cheaper integer multiplication.
|
||||
*/
|
||||
static inline
|
||||
int32 qoa_div(int32 v, int32 scalefactor) {
|
||||
int32 reciprocal = qoa_reciprocal_tab[scalefactor];
|
||||
int32 n = (v * reciprocal + (1 << 15)) >> 16;
|
||||
|
||||
/* round away from 0 */
|
||||
n = n + ((v > 0) - (v < 0)) - ((n > 0) - (n < 0));
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
static inline
|
||||
int32 qoa_clamp(int32 v, int32 min, int32 max) {
|
||||
if (v < min) { return min; }
|
||||
if (v > max) { return max; }
|
||||
return v;
|
||||
}
|
||||
|
||||
/*
|
||||
This specialized clamp function for the signed 16 bit range improves decode
|
||||
performance quite a bit. The extra if() statement works nicely with the CPUs
|
||||
branch prediction as this branch is rarely taken.
|
||||
*/
|
||||
static inline
|
||||
int32 qoa_clamp_s16(int32 v) {
|
||||
if ((uint32) (v + 32768) > 65535) {
|
||||
if (v < -32768) { return -32768; }
|
||||
if (v > 32767) { return 32767; }
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
uint32 qoa_encode_frame(const int16* sample_data, int32 channels, uint32 frame_samples, QoaLms* lms, byte* bytes)
|
||||
{
|
||||
byte* start = bytes;
|
||||
|
||||
int32 prev_scalefactor[QOA_MAX_CHANNELS] = {0};
|
||||
|
||||
// Write the frame header
|
||||
*((uint32 *) bytes) = SWAP_ENDIAN_LITTLE(frame_samples);
|
||||
bytes += sizeof(frame_samples);
|
||||
|
||||
for (uint32 c = 0; c < channels; ++c) {
|
||||
// Load the history and weights as 16-bit masked values
|
||||
__m128i history = _mm_and_si128(lms[c].history.s, _mm_set1_epi32(0xFFFF));
|
||||
__m128i weights = _mm_and_si128(lms[c].weights.s, _mm_set1_epi32(0xFFFF));
|
||||
|
||||
// Permute and pack 16-bit values into 64-bit results
|
||||
__m128i packed_history = _mm_packus_epi32(history, history); // Pack 16-bit values
|
||||
__m128i packed_weights = _mm_packus_epi32(weights, weights);
|
||||
|
||||
// Shuffle packed values into the correct order for 64-bit construction
|
||||
__m128i history_64 = _mm_packus_epi16(packed_history, packed_history);
|
||||
__m128i weights_64 = _mm_packus_epi16(packed_weights, packed_weights);
|
||||
|
||||
// Extract 64-bit results and swap endian if necessary
|
||||
uint64 final_history = _mm_extract_epi64(history_64, 0);
|
||||
uint64 final_weights = _mm_extract_epi64(weights_64, 0);
|
||||
|
||||
// Store results with endian swap
|
||||
*((uint64*) bytes) = SWAP_ENDIAN_LITTLE(final_history);
|
||||
bytes += sizeof(final_history);
|
||||
|
||||
*((uint64*) bytes) = SWAP_ENDIAN_LITTLE(final_weights);
|
||||
bytes += sizeof(final_weights);
|
||||
}
|
||||
|
||||
/*
|
||||
We encode all samples with the channels interleaved on a slice level.
|
||||
E.g. for stereo: (ch-0, slice 0), (ch 1, slice 0), (ch 0, slice 1), ...
|
||||
*/
|
||||
for (uint32 sample_index = 0; sample_index < frame_samples; sample_index += QOA_SLICE_LEN) {
|
||||
// @performance SIMDable
|
||||
for (uint32 c = 0; c < channels; ++c) {
|
||||
int32 slice_len = qoa_clamp(QOA_SLICE_LEN, 0, frame_samples - sample_index);
|
||||
int32 slice_start = sample_index * channels + c;
|
||||
int32 slice_end = (sample_index + slice_len) * channels + c;
|
||||
|
||||
/*
|
||||
Brute for search for the best scalefactor. Just go through all
|
||||
16 scalefactors, encode all samples for the current slice and
|
||||
meassure the total squared error.
|
||||
*/
|
||||
uint64 best_rank = -1;
|
||||
uint64 best_slice = 0;
|
||||
QoaLms best_lms;
|
||||
int32 best_scalefactor = 0;
|
||||
|
||||
for (int32 sfi = 0; sfi < 16; ++sfi) {
|
||||
/*
|
||||
There is a strong correlation between the scalefactors of
|
||||
neighboring slices. As an optimization, start testing
|
||||
the best scalefactor of the previous slice first.
|
||||
*/
|
||||
int32 scalefactor = (sfi + prev_scalefactor[c]) % 16;
|
||||
|
||||
/*
|
||||
We have to reset the LMS state to the last known good one
|
||||
before trying each scalefactor, as each pass updates the LMS
|
||||
state when encoding.
|
||||
*/
|
||||
QoaLms lms_temp = lms[c];
|
||||
uint64 slice = scalefactor;
|
||||
uint64 current_rank = 0;
|
||||
|
||||
for (int32 si = slice_start; si < slice_end; si += channels) {
|
||||
int32 sample = sample_data[si];
|
||||
int32 predicted = qoa_lms_predict(&lms_temp);
|
||||
|
||||
int32 residual = sample - predicted;
|
||||
int32 scaled = qoa_div(residual, scalefactor);
|
||||
int32 clamped = qoa_clamp(scaled, -8, 8);
|
||||
int32 quantized = qoa_quant_tab[clamped + 8];
|
||||
int32 dequantized = qoa_dequant_tab[scalefactor][quantized];
|
||||
int32 reconstructed = qoa_clamp_s16(predicted + dequantized);
|
||||
|
||||
/*
|
||||
If the weights have grown too large, we introduce a penalty
|
||||
here. This prevents pops/clicks in certain problem cases
|
||||
*/
|
||||
// Compute weights squared: w^2
|
||||
__m128i weights_squared = _mm_mullo_epi32(lms_temp.weights.s, lms_temp.weights.s);
|
||||
|
||||
// Perform horizontal addition to sum all squared weights
|
||||
__m128i sum1 = _mm_hadd_epi32(weights_squared, weights_squared);
|
||||
__m128i sum2 = _mm_hadd_epi32(sum1, sum1);
|
||||
|
||||
// Extract the final sum (scalar)
|
||||
int32 sum_of_squares = _mm_cvtsi128_si32(sum2);
|
||||
|
||||
// Apply the shift and subtraction
|
||||
int32 weights_penalty = (sum_of_squares >> 18) - 0x8FF;
|
||||
if (weights_penalty < 0) {
|
||||
weights_penalty = 0;
|
||||
}
|
||||
|
||||
int64 error = (sample - reconstructed);
|
||||
uint64 error_sq = error * error;
|
||||
|
||||
current_rank += error_sq + weights_penalty * weights_penalty;
|
||||
if (current_rank > best_rank) {
|
||||
break;
|
||||
}
|
||||
|
||||
qoa_lms_update(&lms_temp, reconstructed, dequantized);
|
||||
slice = (slice << 3) | quantized;
|
||||
}
|
||||
|
||||
if (current_rank < best_rank) {
|
||||
best_rank = current_rank;
|
||||
best_slice = slice;
|
||||
best_lms = lms_temp;
|
||||
best_scalefactor = scalefactor;
|
||||
}
|
||||
}
|
||||
|
||||
prev_scalefactor[c] = best_scalefactor;
|
||||
|
||||
lms[c] = best_lms;
|
||||
|
||||
/*
|
||||
If this slice was shorter than QOA_SLICE_LEN, we have to left-
|
||||
shift all encoded data, to ensure the rightmost bits are the empty
|
||||
ones. This should only happen in the last frame of a file as all
|
||||
slices are completely filled otherwise.
|
||||
*/
|
||||
best_slice <<= (QOA_SLICE_LEN - slice_len) * 3;
|
||||
|
||||
*((uint64 *) bytes) = SWAP_ENDIAN_LITTLE(best_slice);
|
||||
bytes += sizeof(best_slice);
|
||||
}
|
||||
}
|
||||
|
||||
return (uint32) (bytes - start);
|
||||
}
|
||||
|
||||
/**
 * Encodes the PCM data of an Audio object into consecutive QOA frames.
 *
 * The sample data is handed to qoa_encode_frame through an int16 pointer,
 * i.e. it is treated as 16-bit samples. This function itself writes no
 * audio header; every byte written to the output comes from qoa_encode_frame.
 *
 * @param audio Source audio (reads data, size, channels and bloc_size)
 * @param data  Output buffer for the encoded frames, must be large enough
 *
 * @return Number of bytes written to data, 0 if the audio cannot be encoded
 *         (zero channels / zero bloc_size / more than QOA_MAX_CHANNELS channels)
 */
uint32 qoa_encode(const Audio* audio, byte* data)
{
    byte* start = data;

    // Byte size of one sample frame (= one sample for every channel)
    // NOTE(review): assumes bloc_size is the byte size of a single-channel sample - confirm against the Audio definition
    const uint32 frame_stride = audio->channels * audio->bloc_size;

    // Guard the division below and the fixed-size lms array
    if (frame_stride == 0 || audio->channels > QOA_MAX_CHANNELS) {
        return 0;
    }

    const uint32 sample_count = audio->size / frame_stride;

    /*
        Set the initial LMS weights to {0, 0, -1, 2}. This helps with the
        prediction of the first few ms of a file.
        _mm_set_epi32 takes its arguments from the highest to the lowest lane.
    */
    const __m128i weights_init = _mm_set_epi32(1 << 14, -(1 << 13), 0, 0);
    const __m128i history_init = _mm_setzero_si128();

    QoaLms lms[QOA_MAX_CHANNELS];
    for (int32 i = 0; i < audio->channels; ++i) {
        lms[i].weights.s = weights_init;
        lms[i].history.s = history_init;
    }

    // Go through all frames, the last frame may contain fewer than QOA_FRAME_LEN samples
    int32 frame_samples = QOA_FRAME_LEN;

    for (uint32 sample_index = 0; sample_index < sample_count; sample_index += frame_samples) {
        frame_samples = qoa_clamp(QOA_FRAME_LEN, 0, sample_count - sample_index);

        data += qoa_encode_frame(
            (int16 *) (audio->data + sample_index * frame_stride),
            audio->channels, frame_samples, lms, data
        );
    }

    return (uint32) (data - start);
}
|
||||
|
||||
/**
 * Decodes a single QOA frame into interleaved 16-bit PCM samples.
 *
 * Frame layout as read by this function:
 *      4 bytes                 number of samples per channel in this frame
 *      16 bytes per channel    LMS state (4 x 2 bytes history, 4 x 2 bytes weights)
 *      8 bytes per slice       4 bit scalefactor followed by 3 bit residuals
 *
 * @param bytes       Start of the encoded frame
 * @param channels    Number of channels stored in the frame
 * @param lms         LMS state per channel, overwritten with the state stored in the frame
 *                    and updated while decoding
 * @param sample_data Output buffer; receives frame_samples * channels 16-bit samples,
 *                    sample s of channel c is stored at index s * channels + c.
 *                    Written through an int16 pointer, so it should be 2-byte aligned.
 *
 * @return Number of encoded bytes consumed from bytes
 */
uint32 qoa_decode_frame(const byte* bytes, int32 channels, QoaLms* lms, byte* sample_data)
{
    const byte* start = bytes;

    // Read the frame header
    uint32 frame_samples = SWAP_ENDIAN_LITTLE(*((const uint32 *) bytes));
    bytes += sizeof(frame_samples);

    // The reconstructed values are clamped to the int16 range and the encoder reads its
    // input through an int16 pointer -> the output has to be stored as 16-bit samples.
    // Storing through the byte pointer would truncate every sample to its lowest 8 bits.
    int16* samples = (int16 *) sample_data;

    // Read the LMS state: 4 x 2 bytes history, 4 x 2 bytes weights per channel
    for (int32 c = 0; c < channels; ++c) {
        uint64 history = SWAP_ENDIAN_LITTLE(*((const uint64 *) bytes));
        bytes += sizeof(history);

        uint64 weights = SWAP_ENDIAN_LITTLE(*((const uint64 *) bytes));
        bytes += sizeof(weights);

        int32 history_array[QOA_LMS_LEN];
        int32 weights_array[QOA_LMS_LEN];

        // The first value sits in the highest 16 bits, shift the next one up after every read
        for (int32 i = 0; i < QOA_LMS_LEN; ++i) {
            history_array[i] = ((int16) (history >> 48));
            history <<= 16;

            weights_array[i] = ((int16) (weights >> 48));
            weights <<= 16;
        }

        // _mm_set_epi32 takes its arguments from the highest to the lowest lane
        lms[c].history.s = _mm_set_epi32(history_array[3], history_array[2], history_array[1], history_array[0]);
        lms[c].weights.s = _mm_set_epi32(weights_array[3], weights_array[2], weights_array[1], weights_array[0]);
    }

    // Decode all slices for all channels in this frame
    for (uint32 sample_index = 0; sample_index < frame_samples; sample_index += QOA_SLICE_LEN) {
        for (int32 c = 0; c < channels; ++c) {
            uint64 slice = SWAP_ENDIAN_LITTLE(*((const uint64 *) bytes));
            bytes += sizeof(slice);

            // Highest 4 bits = scalefactor, afterwards 3 bits per quantized residual
            int32 scalefactor = (slice >> 60) & 0xf;
            slice <<= 4;

            int32 slice_start = sample_index * channels + c;
            int32 slice_end = qoa_clamp(sample_index + QOA_SLICE_LEN, 0, frame_samples) * channels + c;

            for (int32 si = slice_start; si < slice_end; si += channels) {
                int32 predicted = qoa_lms_predict(&lms[c]);
                int32 quantized = (slice >> 61) & 0x7;
                int32 dequantized = qoa_dequant_tab[scalefactor][quantized];
                int32 reconstructed = qoa_clamp_s16(predicted + dequantized);

                samples[si] = (int16) reconstructed;
                slice <<= 3;

                qoa_lms_update(&lms[c], reconstructed, dequantized);
            }
        }
    }

    return (uint32) (bytes - start);
}

/**
 * Decodes QOA encoded data (audio header followed by QOA frames) into audio->data.
 *
 * @param data  Encoded data, starts with the header read by audio_header_from_data
 * @param audio Receives the header values; audio->data has to be large enough for
 *              all decoded 16-bit samples
 *
 * @return header length + audio->size, 0 if the channel count exceeds QOA_MAX_CHANNELS
 */
uint32 qoa_decode(const byte* data, Audio* audio)
{
    uint32 header_length = audio_header_from_data(data, audio);

    // The channel count comes from the data stream, don't run past the lms array
    if (audio->channels > QOA_MAX_CHANNELS) {
        return 0;
    }

    uint32 p = header_length;
    uint32 frame_size;
    byte* sample_ptr = audio->data;

    QoaLms lms[QOA_MAX_CHANNELS];

    // Minimum size of a frame: sample count + LMS state of every channel
    const uint32 limit = 4 + QOA_LMS_LEN * 4 * audio->channels;

    do {
        // Peek at the frame header, the decoded size of the frame depends on its sample count
        const uint32 frame_samples = SWAP_ENDIAN_LITTLE(*((const uint32 *) (data + p)));

        frame_size = qoa_decode_frame(data + p, audio->channels, lms, sample_ptr);

        // The output advances by the decoded PCM bytes, the input by the encoded bytes.
        // These two sizes differ, using frame_size for both makes the frames overwrite each other.
        sample_ptr += frame_samples * audio->channels * sizeof(int16);
        p += frame_size;

        // p <= audio->size prevents the unsigned subtraction from wrapping around
    } while (frame_size && p <= audio->size && audio->size - p >= limit);
    // NOTE(review): this treats audio->size as the size of the encoded data while qoa_encode
    // treats it as the PCM size - confirm which value the header contains.
    // @question do we really need the audio->size - p >= limit check or would p < audio->size be sufficient?

    return header_length + audio->size;
}
|
||||
|
||||
#endif
|
||||
|
|
@ -81,25 +81,26 @@ int32 image_data_size(const Image* image)
|
|||
+ sizeof(image->image_settings);
|
||||
}
|
||||
|
||||
int32 image_header_from_data(const byte* data, Image* image)
|
||||
inline
|
||||
uint32 image_header_from_data(const byte* data, Image* image)
|
||||
{
|
||||
const byte* pos = data;
|
||||
const byte* start = data;
|
||||
|
||||
image->width = SWAP_ENDIAN_LITTLE(*((uint32 *) pos));
|
||||
pos += sizeof(image->width);
|
||||
image->width = SWAP_ENDIAN_LITTLE(*((uint32 *) data));
|
||||
data += sizeof(image->width);
|
||||
|
||||
image->height = SWAP_ENDIAN_LITTLE(*((uint32 *) pos));
|
||||
pos += sizeof(image->height);
|
||||
image->height = SWAP_ENDIAN_LITTLE(*((uint32 *) data));
|
||||
data += sizeof(image->height);
|
||||
|
||||
image->pixel_count = image->width * image->height;
|
||||
|
||||
image->image_settings = *pos;
|
||||
pos += sizeof(image->image_settings);
|
||||
image->image_settings = *data;
|
||||
data += sizeof(image->image_settings);
|
||||
|
||||
return (int32) (pos - data);
|
||||
return (int32) (data - start);
|
||||
}
|
||||
|
||||
int32 image_from_data(const byte* data, Image* image)
|
||||
uint32 image_from_data(const byte* data, Image* image)
|
||||
{
|
||||
const byte* pos = data;
|
||||
pos += image_header_from_data(data, image);
|
||||
|
|
@ -112,23 +113,23 @@ int32 image_from_data(const byte* data, Image* image)
|
|||
}
|
||||
|
||||
inline
|
||||
int32 image_header_to_data(const Image* image, byte* data)
|
||||
uint32 image_header_to_data(const Image* image, byte* data)
|
||||
{
|
||||
byte* pos = data;
|
||||
byte* start = data;
|
||||
|
||||
*((uint32 *) pos) = SWAP_ENDIAN_LITTLE(image->width);
|
||||
pos += sizeof(image->width);
|
||||
*((uint32 *) data) = SWAP_ENDIAN_LITTLE(image->width);
|
||||
data += sizeof(image->width);
|
||||
|
||||
*((uint32 *) pos) = SWAP_ENDIAN_LITTLE(image->height);
|
||||
pos += sizeof(image->height);
|
||||
*((uint32 *) data) = SWAP_ENDIAN_LITTLE(image->height);
|
||||
data += sizeof(image->height);
|
||||
|
||||
*pos = image->image_settings;
|
||||
pos += sizeof(image->image_settings);
|
||||
*data = image->image_settings;
|
||||
data += sizeof(image->image_settings);
|
||||
|
||||
return (int32) (pos - data);
|
||||
return (int32) (data - start);
|
||||
}
|
||||
|
||||
int32 image_to_data(const Image* image, byte* data)
|
||||
uint32 image_to_data(const Image* image, byte* data)
|
||||
{
|
||||
byte* pos = data;
|
||||
pos += image_header_to_data(image, data);
|
||||
|
|
|
|||
411
image/Qoi.h
411
image/Qoi.h
|
|
@ -1,6 +1,7 @@
|
|||
/**
|
||||
* Jingga
|
||||
*
|
||||
* @copyright 2021, Dominic Szablewski - https://phoboslab.org
|
||||
* @copyright Jingga
|
||||
* @license OMS License 2.0
|
||||
* @version 1.0.0
|
||||
|
|
@ -13,20 +14,36 @@
|
|||
#include <string.h>
|
||||
#include "Image.cpp"
|
||||
|
||||
#define QOI_OP_INDEX 0b00000000
|
||||
#define QOI_OP_DIFF 0b01000000
|
||||
#define QOI_OP_LUMA 0b10000000
|
||||
#define QOI_OP_RUN 0b11000000 // @todo There is a HUGE step from here to QOI_OP_RGB this leaves room for more cases or using this data
|
||||
#define QOI_OP_RGB 0b11111110
|
||||
#define QOI_OP_RGBA 0b11111111
|
||||
#define QOI_MASK_2 0b11000000
|
||||
#define QOI_OP_LUMA555 0b00000000
|
||||
#define QOI_OP_LUMA222 0b10000000
|
||||
#define QOI_OP_LUMA777 0b01000000
|
||||
|
||||
#define QOI_COLOR_HASH(color) (color.r * 3 + color.g * 5 + color.b * 7 + color.a * 11)
|
||||
#define QOI_COLOR_HASH_2(color) ((((uint32)(color)) * 0x9E3779B1U) >> 26)
|
||||
#define QOI_OP_RUN 0b11000000
|
||||
|
||||
// These definitions are important and impact how large our run can be:
|
||||
// Run has 6 free bits -> 2^6 = 64
|
||||
// However, the first bit is used to indicate RGB or RGBA -> 64 - 2^1 = 62
|
||||
#define QOI_OP_RGB 0b11111110
|
||||
#define QOI_OP_RGBA 0b11111111
|
||||
|
||||
#define QOI_MASK_1 0b10000000
|
||||
#define QOI_MASK_2 0b11000000
|
||||
#define QOI_MASK_3 0b11100000
|
||||
|
||||
// @performance I feel like there is some more optimization possible by handling fully transparent pixels in a special way
|
||||
// @todo We need to implement monochrome handling, which is very important for game assets that often use monochrome assets for all kinds of things (e.g. translucency)
|
||||
|
||||
const byte optable[128] = {
|
||||
0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
|
||||
};
|
||||
|
||||
int32 qoi_encode(const Image* image, byte* data)
|
||||
{
|
||||
int32 p = image_header_to_data(image, data);
|
||||
byte* start = data;
|
||||
data += image_header_to_data(image, data);
|
||||
|
||||
v4_byte index[64];
|
||||
memset(index, 0, sizeof(index));
|
||||
|
|
@ -34,173 +51,257 @@ int32 qoi_encode(const Image* image, byte* data)
|
|||
v4_byte px_prev = {0, 0, 0, 255};
|
||||
v4_byte px = px_prev;
|
||||
|
||||
int32 channels = (image->image_settings & IMAGE_SETTING_CHANNEL_COUNT);
|
||||
const int32 channels = (image->image_settings & IMAGE_SETTING_CHANNEL_COUNT);
|
||||
|
||||
// Only works with 1 byte channel size -> we don't have to multiply channel count with channel size
|
||||
int32 px_len = image->width * image->height * channels;
|
||||
int32 px_end = px_len - channels;
|
||||
|
||||
int32 run = 0;
|
||||
for (int32 px_pos = 0; px_pos < px_len; px_pos += channels) {
|
||||
// @performance could We just use int32 type cast? The problem would be the last pixel which would be out of bounds by 1 byte
|
||||
memcpy(&px, &image->pixels[px_pos], channels * sizeof(byte));
|
||||
if (channels == 4) {
|
||||
for (int32 px_pos = 0; px_pos < px_len; px_pos += 4) {
|
||||
px.val = SWAP_ENDIAN_LITTLE(*((uint32 *) (image->pixels + px_pos)));
|
||||
|
||||
if (px.val == px_prev.val) {
|
||||
++run;
|
||||
if (run == 62 || px_pos == px_end) {
|
||||
data[p++] = (byte) (QOI_OP_RUN | (run - 1));
|
||||
run = 0;
|
||||
}
|
||||
} else {
|
||||
if (run) {
|
||||
data[p++] = (byte) (QOI_OP_RUN | (run - 1));
|
||||
run = 0;
|
||||
}
|
||||
while(px.val == px_prev.val) {
|
||||
++run;
|
||||
if(px_pos == px_end) {
|
||||
*data++ = (byte) (QOI_OP_RUN | (run - 1));
|
||||
px_pos = px_len;
|
||||
|
||||
int32 index_pos = QOI_COLOR_HASH(px) % 64;
|
||||
//int32 index_pos = QOI_COLOR_HASH_2(px);
|
||||
|
||||
if (index[index_pos].val == px.val) {
|
||||
data[p++] = (byte) (QOI_OP_INDEX | index_pos);
|
||||
} else {
|
||||
index[index_pos] = px;
|
||||
|
||||
if (px.a == px_prev.a) {
|
||||
signed char vr = px.r - px_prev.r;
|
||||
signed char vg = px.g - px_prev.g;
|
||||
signed char vb = px.b - px_prev.b;
|
||||
|
||||
signed char vg_r = vr - vg;
|
||||
signed char vg_b = vb - vg;
|
||||
|
||||
if (vr > -3 && vr < 2
|
||||
&& vg > -3 && vg < 2
|
||||
&& vb > -3 && vb < 2
|
||||
) {
|
||||
data[p++] = QOI_OP_DIFF | (vr + 2) << 4 | (vg + 2) << 2 | (vb + 2);
|
||||
} else if (vg_r > -9 && vg_r < 8
|
||||
&& vg > -33 && vg < 32
|
||||
&& vg_b > -9 && vg_b < 8
|
||||
) {
|
||||
data[p++] = QOI_OP_LUMA | (vg + 32);
|
||||
data[p++] = (vg_r + 8) << 4 | (vg_b + 8);
|
||||
} else {
|
||||
data[p++] = QOI_OP_RGB;
|
||||
data[p++] = px.r;
|
||||
data[p++] = px.g;
|
||||
data[p++] = px.b;
|
||||
}
|
||||
} else {
|
||||
data[p++] = QOI_OP_RGBA;
|
||||
*((uint32 *) &data[p]) = SWAP_ENDIAN_LITTLE(px.val);
|
||||
p += 4;
|
||||
break;
|
||||
} else if (run == 62) {
|
||||
*data++ = (byte) (QOI_OP_RUN | (run - 1));
|
||||
run = 0;
|
||||
}
|
||||
|
||||
px_pos += 4;
|
||||
px.val = SWAP_ENDIAN_LITTLE(*((uint32 *) (image->pixels + px_pos)));
|
||||
}
|
||||
|
||||
if (run) {
|
||||
*data++ = (byte) (QOI_OP_RUN | (run - 1));
|
||||
run = 0;
|
||||
}
|
||||
|
||||
if(px.a != px_prev.a){
|
||||
*data++ = QOI_OP_RGBA;
|
||||
*data++ = px.a;
|
||||
}
|
||||
|
||||
signed char vr = px.r - px_prev.r;
|
||||
signed char vg = px.g - px_prev.g;
|
||||
signed char vb = px.b - px_prev.b;
|
||||
|
||||
signed char vg_r = vr - vg;
|
||||
signed char vg_b = vb - vg;
|
||||
|
||||
byte ar = vg_r < 0 ? -vg_r - 1 : vg_r;
|
||||
byte ag = vg < 0 ? -vg - 1 : vg;
|
||||
byte ab = vg_b < 0 ? -vg_b - 1 : vg_b;
|
||||
byte argb = ar | ag | ab;
|
||||
|
||||
switch(optable[argb]) {
|
||||
case 0:
|
||||
*data++ = QOI_OP_LUMA222 | ((vg_r + 2) << 4) | ((vg_b + 2) << 2) | (vg + 2);
|
||||
break;
|
||||
case 1:
|
||||
*data++ = QOI_OP_LUMA555 | ((vg_b + 16) << 2) | ((vg_r + 16) >> 3);
|
||||
*data++ = (((vg_r + 16) & 7) << 5) | (vg + 16);
|
||||
break;
|
||||
case 2:
|
||||
*data++ = QOI_OP_LUMA777 | ((vg_b + 64) >> 2);
|
||||
*data++ = (((vg_b + 64) & 3) << 6) | ((vg_r + 64) >> 1);
|
||||
*data++ = (((vg_r + 64) & 1) << 7) | (vg + 64);
|
||||
break;
|
||||
case 3:
|
||||
*data++ = QOI_OP_RGB;
|
||||
*data++ = px.r;
|
||||
*data++ = px.g;
|
||||
*data++ = px.b;
|
||||
break;
|
||||
}
|
||||
|
||||
px_prev = px;
|
||||
}
|
||||
} else {
|
||||
for (int32 px_pos = 0; px_pos < px_len; px_pos += 3) {
|
||||
px.r = image->pixels[px_pos];
|
||||
px.g = image->pixels[px_pos + 1];
|
||||
px.b = image->pixels[px_pos + 2];
|
||||
|
||||
while(px.val == px_prev.val) {
|
||||
++run;
|
||||
if(px_pos == px_end) {
|
||||
*data++ = (byte) (QOI_OP_RUN | (run - 1));
|
||||
px_pos = px_len;
|
||||
|
||||
break;
|
||||
} else if (run == 62) {
|
||||
*data++ = (byte) (QOI_OP_RUN | (run - 1));
|
||||
run = 0;
|
||||
}
|
||||
|
||||
px_pos += 3;
|
||||
px.r = image->pixels[px_pos];
|
||||
px.g = image->pixels[px_pos + 1];
|
||||
px.b = image->pixels[px_pos + 2];
|
||||
}
|
||||
|
||||
if (run) {
|
||||
*data++ = (byte) (QOI_OP_RUN | (run - 1));
|
||||
run = 0;
|
||||
}
|
||||
|
||||
signed char vr = px.r - px_prev.r;
|
||||
signed char vg = px.g - px_prev.g;
|
||||
signed char vb = px.b - px_prev.b;
|
||||
|
||||
signed char vg_r = vr - vg;
|
||||
signed char vg_b = vb - vg;
|
||||
|
||||
byte ar = vg_r < 0 ? -vg_r - 1 : vg_r;
|
||||
byte ag = vg < 0 ? -vg - 1 : vg;
|
||||
byte ab = vg_b < 0 ? -vg_b - 1 : vg_b;
|
||||
byte argb = ar | ag | ab;
|
||||
|
||||
switch(optable[argb]) {
|
||||
case 0:
|
||||
*data++ = QOI_OP_LUMA222 | ((vg_r + 2) << 4) | ((vg_b + 2) << 2) | (vg + 2);
|
||||
break;
|
||||
case 1:
|
||||
*data++ = QOI_OP_LUMA555 | ((vg_b + 16) << 2) | ((vg_r + 16) >> 3);
|
||||
*data++ = (((vg_r + 16) & 7) << 5) | (vg + 16);
|
||||
break;
|
||||
case 2:
|
||||
*data++ = QOI_OP_LUMA777 | ((vg_b + 64) >> 2);
|
||||
*data++ = (((vg_b + 64) & 3) << 6) | ((vg_r + 64) >> 1);
|
||||
*data++ = (((vg_r + 64) & 1) << 7) | (vg + 64);
|
||||
break;
|
||||
case 3:
|
||||
*data++ = QOI_OP_RGB;
|
||||
*data++ = px.r;
|
||||
*data++ = px.g;
|
||||
*data++ = px.b;
|
||||
break;
|
||||
}
|
||||
|
||||
px_prev = px;
|
||||
}
|
||||
}
|
||||
|
||||
return (int32) (data - start);
|
||||
}
|
||||
|
||||
int32 qoi_decode_4(const byte* data, Image* image)
|
||||
{
|
||||
uint32 px_len = image->width * image->height * 4;
|
||||
v4_byte px = {0, 0, 0, 255};
|
||||
v4_byte index[64] = {0};
|
||||
int32 run = 0;
|
||||
|
||||
for (int32 px_pos = 0; px_pos < px_len; px_pos += 4) {
|
||||
if (run > 0) {
|
||||
--run;
|
||||
} else {
|
||||
OP_RGBA_GOTO:
|
||||
byte b1 = *data++;
|
||||
|
||||
if (b1 == QOI_OP_RGB) {
|
||||
px.r = *data++;
|
||||
px.g = *data++;
|
||||
px.b = *data++;
|
||||
} else if (b1 == QOI_OP_RGBA) {
|
||||
px.a = *data++;
|
||||
goto OP_RGBA_GOTO;
|
||||
} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA222) {
|
||||
byte vg = (b1 & 3) - 2;
|
||||
px.r += vg - 2 + ((b1 >> 4) & 3);
|
||||
px.g += vg;
|
||||
px.b += vg - 2 + ((b1 >> 2) & 3);
|
||||
} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA555) {
|
||||
byte b2 = *data++;
|
||||
byte vg = (b2 & 31) - 16;
|
||||
px.r += vg - 16 + (((b1 & 3) << 3) | (b2 >> 5));
|
||||
px.g += vg;
|
||||
px.b += vg - 16 + ((b1 >> 2) & 31);
|
||||
} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA777) {
|
||||
byte b2 = *data++;
|
||||
byte b3 = *data++;
|
||||
byte vg = (b3 & 0x7f) - 64;
|
||||
px.r += vg - 64 + ((b2 & 0x3f) << 1) + (b3 >> 7);
|
||||
px.g += vg;
|
||||
px.b += vg - 64 + ((b1 & 0x1f) << 2) + (b2 >> 6);
|
||||
} else if ((b1 & QOI_MASK_2) == QOI_OP_RUN) {
|
||||
run = (b1 & 0x3f);
|
||||
}
|
||||
}
|
||||
|
||||
px_prev = px;
|
||||
*((uint32 *) &image->pixels[px_pos]) = SWAP_ENDIAN_LITTLE(px.val);
|
||||
}
|
||||
|
||||
return p;
|
||||
return px_len;
|
||||
}
|
||||
|
||||
int32 qoi_decode(const byte* data, Image* image, int32 steps = 8)
|
||||
int32 qoi_decode_3(const byte* data, Image* image)
|
||||
{
|
||||
int32 header_length = image_header_from_data(data, image);
|
||||
int32 p = header_length;
|
||||
|
||||
int32 channels = (image->image_settings & IMAGE_SETTING_CHANNEL_COUNT);
|
||||
uint32 px_len = image->width * image->height * channels;
|
||||
|
||||
v4_byte px = {0, 0, 0, 255};
|
||||
|
||||
v4_byte index[64];
|
||||
memset(index, 0, sizeof(index));
|
||||
|
||||
uint32 px_len = image->width * image->height * 3;
|
||||
v3_byte px = {0, 0, 0};
|
||||
int32 run = 0;
|
||||
|
||||
for (uint32 px_pos = 0; px_pos < px_len; px_pos += channels) {
|
||||
int32 b1 = data[p++];
|
||||
for (int32 px_pos = 0; px_pos < px_len; px_pos += 3) {
|
||||
if (run > 0) {
|
||||
--run;
|
||||
} else {
|
||||
byte b1 = *data++;
|
||||
|
||||
if (b1 == QOI_OP_RGB) {
|
||||
px.r = data[p++];
|
||||
px.g = data[p++];
|
||||
px.b = data[p++];
|
||||
} else if (b1 == QOI_OP_RGBA) {
|
||||
px.val = SWAP_ENDIAN_LITTLE(*((uint32 *) &data[p]));
|
||||
p += 4;
|
||||
} else if ((b1 & QOI_MASK_2) == QOI_OP_INDEX) {
|
||||
px = index[b1];
|
||||
} else if ((b1 & QOI_MASK_2) == QOI_OP_DIFF) {
|
||||
px.r += ((b1 >> 4) & 0x03) - 2;
|
||||
px.g += ((b1 >> 2) & 0x03) - 2;
|
||||
px.b += ( b1 & 0x03) - 2;
|
||||
} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA) {
|
||||
int32 b2 = data[p++];
|
||||
byte vg = (b1 & 0x3f) - 32;
|
||||
px.r += vg - 8 + ((b2 >> 4) & 0x0f);
|
||||
px.g += vg;
|
||||
px.b += vg - 8 + (b2 & 0x0f);
|
||||
} else if ((b1 & QOI_MASK_2) == QOI_OP_RUN) {
|
||||
run = (b1 & 0x3f);
|
||||
if (b1 == QOI_OP_RGB) {
|
||||
px.r = *data++;
|
||||
px.g = *data++;
|
||||
px.b = *data++;
|
||||
} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA222) {
|
||||
byte vg = (b1 & 3) - 2;
|
||||
px.r += vg - 2 + ((b1 >> 4) & 3);
|
||||
px.g += vg;
|
||||
px.b += vg - 2 + ((b1 >> 2) & 3);
|
||||
} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA555) {
|
||||
byte b2 = *data++;
|
||||
byte vg = (b2 & 31) - 16;
|
||||
px.r += vg - 16 + (((b1 & 3) << 3) | (b2 >> 5));
|
||||
px.g += vg;
|
||||
px.b += vg - 16 + ((b1 >> 2) & 31);
|
||||
} else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA777) {
|
||||
byte b2 = *data++;
|
||||
byte b3 = *data++;
|
||||
byte vg = (b3 & 0x7f) - 64;
|
||||
px.r += vg - 64 + ((b2 & 0x3f) << 1) + (b3 >> 7);
|
||||
px.g += vg;
|
||||
px.b += vg - 64 + ((b1 & 0x1f) << 2) + (b2 >> 6);
|
||||
} else if ((b1 & QOI_MASK_2) == QOI_OP_RUN) {
|
||||
run = (b1 & 0x3f);
|
||||
}
|
||||
}
|
||||
|
||||
if (channels == 4) {
|
||||
uint32 px_little_endian = SWAP_ENDIAN_LITTLE(px.val);
|
||||
int32 pixel_step_size = steps * 4;
|
||||
int32 i = 0;
|
||||
|
||||
// @performance Implement for ARM
|
||||
#if ARM
|
||||
#else
|
||||
if (steps == 16) {
|
||||
__m512i simd_value = _mm512_set1_epi32(px_little_endian);
|
||||
for(; i <= run - steps; i += steps, px_pos += pixel_step_size) {
|
||||
_mm512_storeu_si512((__m512i *) &image->pixels[px_pos], simd_value);
|
||||
}
|
||||
} else if (steps >= 8) {
|
||||
__m256i simd_value = _mm256_set1_epi32(px_little_endian);
|
||||
for (; i <= run - steps; i += steps, px_pos += pixel_step_size) {
|
||||
_mm256_storeu_si256((__m256i *) &image->pixels[px_pos], simd_value);
|
||||
}
|
||||
} else if (steps >= 4) {
|
||||
__m128i simd_value = _mm_set1_epi32(px_little_endian);
|
||||
for(; i <= run - steps; i += steps, px_pos += pixel_step_size) {
|
||||
_mm_storeu_si128((__m128i *) &image->pixels[px_pos], simd_value);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; i < run; ++i) {
|
||||
*((uint32 *) &image->pixels[px_pos]) = px_little_endian;
|
||||
px_pos += channels;
|
||||
}
|
||||
} else if (channels == 3) {
|
||||
for (int32 i = 0; i < run; ++i) {
|
||||
image->pixels[px_pos++] = px.r;
|
||||
image->pixels[px_pos++] = px.g;
|
||||
image->pixels[px_pos++] = px.b;
|
||||
}
|
||||
} else if (channels == 1) {
|
||||
memset(&image->pixels[px_pos], px.r, run * sizeof(byte));
|
||||
px_pos += run;
|
||||
}
|
||||
|
||||
// Correction, since the loop increments by channels count as well
|
||||
px_pos -= channels;
|
||||
|
||||
index[QOI_COLOR_HASH(px) % 64] = px;
|
||||
//index[QOI_COLOR_HASH_2(px)] = px;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
index[QOI_COLOR_HASH(px) % 64] = px;
|
||||
//index[QOI_COLOR_HASH_2(px)] = px;
|
||||
|
||||
memcpy(&image->pixels[px_pos], &px, channels * sizeof(byte));
|
||||
image->pixels[px_pos] = px.r;
|
||||
image->pixels[px_pos + 1] = px.g;
|
||||
image->pixels[px_pos + 2] = px.b;
|
||||
}
|
||||
|
||||
return header_length + px_len;
|
||||
return px_len;
|
||||
}
|
||||
|
||||
/**
 * Decodes QOI-style encoded image data into image->pixels.
 *
 * Reads the image header first and then forwards the remaining data to the
 * decoder matching the channel count stored in the image settings.
 * Channel counts other than 3 and 4 are not decoded.
 *
 * @param data  Encoded data, starts with the image header
 * @param image Receives the header values and the decoded pixels
 *
 * @return Header length + value returned by the channel specific decoder
 *         (only the header length if the channel count is not handled)
 */
int32 qoi_decode(const byte* data, Image* image)
{
    const int32 header_length = image_header_from_data(data, image);
    const byte* payload = data + header_length;

    int32 decoded = 0;

    switch (image->image_settings & IMAGE_SETTING_CHANNEL_COUNT) {
        case 4:
            decoded = qoi_decode_4(payload, image);
            break;
        case 3:
            decoded = qoi_decode_3(payload, image);
            break;
        default:
            // Unsupported channel count, nothing gets decoded
            break;
    }

    return header_length + decoded;
}
|
||||
|
||||
#endif
|
||||
|
|
@ -21,6 +21,7 @@
|
|||
#include "../../utils/Utils.h"
|
||||
#include "../../utils/TestUtils.h"
|
||||
#include "../../memory/RingMemory.h"
|
||||
#include "../../log/Log.h"
|
||||
|
||||
typedef HANDLE FileHandle;
|
||||
typedef HANDLE MMFHandle;
|
||||
|
|
|
|||
|
|
@ -148,7 +148,7 @@ uint32 audio_buffer_fillable(const AudioSetting* setting, const DirectSoundSetti
|
|||
return 0;
|
||||
}
|
||||
|
||||
DWORD bytes_to_lock = setting->sample_buffer_size;
|
||||
DWORD bytes_to_lock = (setting->sample_index * setting->sample_size) % setting->buffer_size;
|
||||
DWORD bytes_to_write = 0;
|
||||
|
||||
DWORD target_cursor = (player_cursor + (setting->latency * setting->sample_size)) % setting->buffer_size;
|
||||
|
|
@ -180,7 +180,7 @@ void audio_play_buffer(AudioSetting* setting, DirectSoundSetting* api_setting)
|
|||
void* region2;
|
||||
DWORD region2_size;
|
||||
|
||||
DWORD bytes_to_lock = setting->sample_buffer_size;
|
||||
DWORD bytes_to_lock = (setting->sample_index * setting->sample_size) % setting->buffer_size;
|
||||
|
||||
api_setting->secondary_buffer->Lock(
|
||||
bytes_to_lock, setting->sample_buffer_size,
|
||||
|
|
@ -204,7 +204,7 @@ void audio_play_buffer(AudioSetting* setting, DirectSoundSetting* api_setting)
|
|||
}
|
||||
|
||||
api_setting->secondary_buffer->Unlock(region1, region1_size, region2, region2_size);
|
||||
|
||||
setting->sample_index += (uint16) (setting->sample_buffer_size / setting->sample_size);
|
||||
setting->sample_buffer_size = 0;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -115,7 +115,6 @@ void audio_play(AudioSetting* setting, XAudio2Setting* api_setting) {
|
|||
}
|
||||
|
||||
api_setting->source_voice->Start(0, XAUDIO2_COMMIT_NOW);
|
||||
setting->sample_index = 0;
|
||||
}
|
||||
|
||||
inline
|
||||
|
|
@ -194,9 +193,6 @@ void audio_play_buffer(AudioSetting* setting, XAudio2Setting* api_setting) {
|
|||
}
|
||||
|
||||
++setting->sample_output;
|
||||
|
||||
// @performance Why do I even need this?
|
||||
//setting->sample_index += setting->sample_buffer_size / setting->sample_size;
|
||||
setting->sample_buffer_size = 0;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user