mirror of
https://github.com/Karaka-Management/cOMS.git
synced 2026-01-10 19:08:39 +00:00
470 lines
16 KiB
C
Executable File
470 lines
16 KiB
C
Executable File
/**
|
|
* Jingga
|
|
*
|
|
* @copyright 2023, Dominic Szablewski - https://phoboslab.org
|
|
* @copyright Jingga
|
|
* @license OMS License 2.0
|
|
* @version 1.0.0
|
|
* @link https://jingga.app
|
|
*/
|
|
#ifndef COMS_AUDIO_QOA_H
|
|
#define COMS_AUDIO_QOA_H
|
|
|
|
#include "../stdlib/Types.h"
|
|
#include "../utils/EndianUtils.h"
|
|
#include "../audio/Audio.cpp"
|
|
#include "../stdlib/Simd.h"
|
|
|
|
#define QOA_SLICE_LEN 20
|
|
#define QOA_SLICES_PER_FRAME 256
|
|
#define QOA_FRAME_LEN (QOA_SLICES_PER_FRAME * QOA_SLICE_LEN)
|
|
#define QOA_LMS_LEN 4
|
|
#define QOA_MAX_CHANNELS 8
|
|
|
|
#define QOA_FRAME_SIZE(channels, slices) (4 + QOA_LMS_LEN * 4 * (channels) + 8 * (slices) * (channels))
|
|
|
|
struct QoaLms {
|
|
int32_4 history; // automatically QOA_LMS_LEN size
|
|
int32_4 weights; // automatically QOA_LMS_LEN size
|
|
};
|
|
|
|
/*
|
|
The quant_tab provides an index into the dequant_tab for residuals in the
|
|
range of -8 .. 8. It maps this range to just 3bits and becomes less accurate at
|
|
the higher end. Note that the residual zero is identical to the lowest positive
|
|
value. This is mostly fine, since the qoa_div() function always rounds away
|
|
from zero.
|
|
*/
|
|
static const int32 qoa_quant_tab[17] = {
|
|
7, 7, 7, 5, 5, 3, 3, 1, /* -8..-1 */
|
|
0, /* 0 */
|
|
0, 2, 2, 4, 4, 6, 6, 6 /* 1.. 8 */
|
|
};
|
|
|
|
/*
|
|
We have 16 different scalefactors. Like the quantized residuals these become
|
|
less accurate at the higher end. In theory, the highest scalefactor that we
|
|
would need to encode the highest 16bit residual is (2**16)/8 = 8192. However we
|
|
rely on the LMS filter to predict samples accurately enough that a maximum
|
|
residual of one quarter of the 16 bit range is sufficient. I.e. with the
|
|
scalefactor 2048 times the quant range of 8 we can encode residuals up to 2**14.
|
|
|
|
The scalefactor values are computed as:
|
|
scalefactor_tab[s] <- round(pow(s + 1, 2.75))
|
|
*/
|
|
static const int32 qoa_scalefactor_tab[16] = {
|
|
1, 7, 21, 45, 84, 138, 211, 304, 421, 562, 731, 928, 1157, 1419, 1715, 2048
|
|
};
|
|
|
|
|
|
/*
|
|
The reciprocal_tab maps each of the 16 scalefactors to their rounded
|
|
reciprocals 1/scalefactor. This allows us to calculate the scaled residuals in
|
|
the encoder with just one multiplication instead of an expensive division. We
|
|
do this in .16 fixed point with integers, instead of floats.
|
|
|
|
The reciprocal_tab is computed as:
|
|
reciprocal_tab[s] <- ((1<<16) + scalefactor_tab[s] - 1) / scalefactor_tab[s]
|
|
*/
|
|
static const int32 qoa_reciprocal_tab[16] = {
|
|
65536, 9363, 3121, 1457, 781, 475, 311, 216, 156, 117, 90, 71, 57, 47, 39, 32
|
|
};
|
|
|
|
/*
|
|
The dequant_tab maps each of the scalefactors and quantized residuals to
|
|
their unscaled & dequantized version.
|
|
|
|
Since qoa_div rounds away from the zero, the smallest entries are mapped to 3/4
|
|
instead of 1. The dequant_tab assumes the following dequantized values for each
|
|
of the quant_tab indices and is computed as:
|
|
float dqt[8] = {0.75, -0.75, 2.5, -2.5, 4.5, -4.5, 7, -7};
|
|
dequant_tab[s][q] <- round_ties_away_from_zero(scalefactor_tab[s] * dqt[q])
|
|
|
|
The rounding employed here is "to nearest, ties away from zero", i.e. positive
|
|
and negative values are treated symmetrically.
|
|
*/
|
|
static const int32 qoa_dequant_tab[16][8] = {
|
|
{ 1, -1, 3, -3, 5, -5, 7, -7},
|
|
{ 5, -5, 18, -18, 32, -32, 49, -49},
|
|
{ 16, -16, 53, -53, 95, -95, 147, -147},
|
|
{ 34, -34, 113, -113, 203, -203, 315, -315},
|
|
{ 63, -63, 210, -210, 378, -378, 588, -588},
|
|
{ 104, -104, 345, -345, 621, -621, 966, -966},
|
|
{ 158, -158, 528, -528, 950, -950, 1477, -1477},
|
|
{ 228, -228, 760, -760, 1368, -1368, 2128, -2128},
|
|
{ 316, -316, 1053, -1053, 1895, -1895, 2947, -2947},
|
|
{ 422, -422, 1405, -1405, 2529, -2529, 3934, -3934},
|
|
{ 548, -548, 1828, -1828, 3290, -3290, 5117, -5117},
|
|
{ 696, -696, 2320, -2320, 4176, -4176, 6496, -6496},
|
|
{ 868, -868, 2893, -2893, 5207, -5207, 8099, -8099},
|
|
{1064, -1064, 3548, -3548, 6386, -6386, 9933, -9933},
|
|
{1286, -1286, 4288, -4288, 7718, -7718, 12005, -12005},
|
|
{1536, -1536, 5120, -5120, 9216, -9216, 14336, -14336},
|
|
};
|
|
|
|
|
|
/*
|
|
The Least Mean Squares Filter is the heart of QOA. It predicts the next
|
|
sample based on the previous 4 reconstructed samples. It does so by continuously
|
|
adjusting 4 weights based on the residual of the previous prediction.
|
|
|
|
The next sample is predicted as the sum of (weight[i] * history[i]).
|
|
|
|
The adjustment of the weights is done with a "Sign-Sign-LMS" that adds or
|
|
subtracts the residual to each weight, based on the corresponding sample from
|
|
the history. This, surprisingly, is sufficient to get worthwhile predictions.
|
|
|
|
This is all done with fixed point integers. Hence the right-shifts when updating
|
|
the weights and calculating the prediction.
|
|
*/
|
|
static inline
|
|
int32 qoa_lms_predict(QoaLms* lms)
|
|
{
|
|
__m128i products = _mm_mullo_epi32(lms->weights.s, lms->history.s);
|
|
__m128i sum1 = _mm_hadd_epi32(products, products);
|
|
__m128i sum2 = _mm_hadd_epi32(sum1, sum1);
|
|
|
|
int32 prediction = _mm_cvtsi128_si32(sum2);
|
|
|
|
return prediction >> 13;
|
|
}
|
|
|
|
static inline
|
|
void qoa_lms_update(QoaLms* lms, int32 sample, int32 residual) {
|
|
int32 delta = residual >> 4;
|
|
|
|
__m128i delta_vec = _mm_set1_epi32(delta);
|
|
__m128i zero_vec = _mm_setzero_si128();
|
|
|
|
// Calculate adjustments for weights based on the sign of history
|
|
__m128i sign_mask = _mm_cmpgt_epi32(zero_vec, lms->history.s); // history < 0
|
|
__m128i delta_adjust = _mm_blendv_epi8(delta_vec, _mm_sub_epi32(zero_vec, delta_vec), sign_mask);
|
|
|
|
// Update weights
|
|
lms->weights.s = _mm_add_epi32(lms->weights.s, delta_adjust);
|
|
|
|
// Shift history left
|
|
lms->history.s = _mm_alignr_epi8(lms->history.s, lms->history.s, 4); // Shift left by 1 int32 (4 bytes)
|
|
|
|
// Insert the new sample into the last position of history
|
|
lms->history.s = _mm_insert_epi32(lms->history.s, sample, QOA_LMS_LEN - 1);
|
|
|
|
// Update the last weight based on the sign of the new sample
|
|
int32 sample_sign_adjust = (sample < 0) ? -delta : delta;
|
|
lms->weights.s = _mm_insert_epi32(
|
|
lms->weights.s,
|
|
_mm_extract_epi32(lms->weights.s, QOA_LMS_LEN - 1) + sample_sign_adjust,
|
|
QOA_LMS_LEN - 1
|
|
);
|
|
}
|
|
|
|
/*
|
|
qoa_div() implements a rounding division, but avoids rounding to zero for
|
|
small numbers. E.g. 0.1 will be rounded to 1. Note that 0 itself still
|
|
returns as 0, which is handled in the qoa_quant_tab[].
|
|
qoa_div() takes an index into the .16 fixed point qoa_reciprocal_tab as an
|
|
argument, so it can do the division with a cheaper integer multiplication.
|
|
*/
|
|
static inline
|
|
int32 qoa_div(int32 v, int32 scalefactor) {
|
|
int32 reciprocal = qoa_reciprocal_tab[scalefactor];
|
|
int32 n = (v * reciprocal + (1 << 15)) >> 16;
|
|
|
|
/* round away from 0 */
|
|
n = n + ((v > 0) - (v < 0)) - ((n > 0) - (n < 0));
|
|
|
|
return n;
|
|
}
|
|
|
|
static inline
|
|
int32 qoa_clamp(int32 v, int32 min, int32 max) {
|
|
if (v < min) { return min; }
|
|
if (v > max) { return max; }
|
|
return v;
|
|
}
|
|
|
|
/*
|
|
This specialized clamp function for the signed 16 bit range improves decode
|
|
performance quite a bit. The extra if() statement works nicely with the CPUs
|
|
branch prediction as this branch is rarely taken.
|
|
*/
|
|
static inline
|
|
int32 qoa_clamp_s16(int32 v) {
|
|
if ((uint32) (v + 32768) > 65535) {
|
|
if (v < -32768) { return -32768; }
|
|
if (v > 32767) { return 32767; }
|
|
}
|
|
|
|
return v;
|
|
}
|
|
|
|
uint32 qoa_encode_frame(const int16* sample_data, int32 channels, uint32 frame_samples, QoaLms* lms, byte* bytes)
|
|
{
|
|
byte* start = bytes;
|
|
|
|
int32 prev_scalefactor[QOA_MAX_CHANNELS] = {0};
|
|
|
|
// Write the frame header
|
|
*((uint32 *) bytes) = SWAP_ENDIAN_LITTLE(frame_samples);
|
|
bytes += sizeof(frame_samples);
|
|
|
|
for (uint32 c = 0; c < channels; ++c) {
|
|
// Load the history and weights as 16-bit masked values
|
|
__m128i history = _mm_and_si128(lms[c].history.s, _mm_set1_epi32(0xFFFF));
|
|
__m128i weights = _mm_and_si128(lms[c].weights.s, _mm_set1_epi32(0xFFFF));
|
|
|
|
// Permute and pack 16-bit values into 64-bit results
|
|
__m128i packed_history = _mm_packus_epi32(history, history); // Pack 16-bit values
|
|
__m128i packed_weights = _mm_packus_epi32(weights, weights);
|
|
|
|
// Shuffle packed values into the correct order for 64-bit construction
|
|
__m128i history_64 = _mm_packus_epi16(packed_history, packed_history);
|
|
__m128i weights_64 = _mm_packus_epi16(packed_weights, packed_weights);
|
|
|
|
// Extract 64-bit results and swap endian if necessary
|
|
uint64 final_history = _mm_extract_epi64(history_64, 0);
|
|
uint64 final_weights = _mm_extract_epi64(weights_64, 0);
|
|
|
|
// Store results with endian swap
|
|
*((uint64*) bytes) = SWAP_ENDIAN_LITTLE(final_history);
|
|
bytes += sizeof(final_history);
|
|
|
|
*((uint64*) bytes) = SWAP_ENDIAN_LITTLE(final_weights);
|
|
bytes += sizeof(final_weights);
|
|
}
|
|
|
|
/*
|
|
We encode all samples with the channels interleaved on a slice level.
|
|
E.g. for stereo: (ch-0, slice 0), (ch 1, slice 0), (ch 0, slice 1), ...
|
|
*/
|
|
for (uint32 sample_index = 0; sample_index < frame_samples; sample_index += QOA_SLICE_LEN) {
|
|
// @performance SIMDable
|
|
for (uint32 c = 0; c < channels; ++c) {
|
|
int32 slice_len = qoa_clamp(QOA_SLICE_LEN, 0, frame_samples - sample_index);
|
|
int32 slice_start = sample_index * channels + c;
|
|
int32 slice_end = (sample_index + slice_len) * channels + c;
|
|
|
|
/*
|
|
Brute for search for the best scalefactor. Just go through all
|
|
16 scalefactors, encode all samples for the current slice and
|
|
meassure the total squared error.
|
|
*/
|
|
uint64 best_rank = -1;
|
|
uint64 best_slice = 0;
|
|
QoaLms best_lms;
|
|
int32 best_scalefactor = 0;
|
|
|
|
for (int32 sfi = 0; sfi < 16; ++sfi) {
|
|
/*
|
|
There is a strong correlation between the scalefactors of
|
|
neighboring slices. As an optimization, start testing
|
|
the best scalefactor of the previous slice first.
|
|
*/
|
|
int32 scalefactor = (sfi + prev_scalefactor[c]) % 16;
|
|
|
|
/*
|
|
We have to reset the LMS state to the last known good one
|
|
before trying each scalefactor, as each pass updates the LMS
|
|
state when encoding.
|
|
*/
|
|
QoaLms lms_temp = lms[c];
|
|
uint64 slice = scalefactor;
|
|
uint64 current_rank = 0;
|
|
|
|
for (int32 si = slice_start; si < slice_end; si += channels) {
|
|
int32 sample = sample_data[si];
|
|
int32 predicted = qoa_lms_predict(&lms_temp);
|
|
|
|
int32 residual = sample - predicted;
|
|
int32 scaled = qoa_div(residual, scalefactor);
|
|
int32 clamped = qoa_clamp(scaled, -8, 8);
|
|
int32 quantized = qoa_quant_tab[clamped + 8];
|
|
int32 dequantized = qoa_dequant_tab[scalefactor][quantized];
|
|
int32 reconstructed = qoa_clamp_s16(predicted + dequantized);
|
|
|
|
/*
|
|
If the weights have grown too large, we introduce a penalty
|
|
here. This prevents pops/clicks in certain problem cases
|
|
*/
|
|
// Compute weights squared: w^2
|
|
__m128i weights_squared = _mm_mullo_epi32(lms_temp.weights.s, lms_temp.weights.s);
|
|
|
|
// Perform horizontal addition to sum all squared weights
|
|
__m128i sum1 = _mm_hadd_epi32(weights_squared, weights_squared);
|
|
__m128i sum2 = _mm_hadd_epi32(sum1, sum1);
|
|
|
|
// Extract the final sum (scalar)
|
|
int32 sum_of_squares = _mm_cvtsi128_si32(sum2);
|
|
|
|
// Apply the shift and subtraction
|
|
int32 weights_penalty = (sum_of_squares >> 18) - 0x8FF;
|
|
if (weights_penalty < 0) {
|
|
weights_penalty = 0;
|
|
}
|
|
|
|
int64 error = (sample - reconstructed);
|
|
uint64 error_sq = error * error;
|
|
|
|
current_rank += error_sq + weights_penalty * weights_penalty;
|
|
if (current_rank > best_rank) {
|
|
break;
|
|
}
|
|
|
|
qoa_lms_update(&lms_temp, reconstructed, dequantized);
|
|
slice = (slice << 3) | quantized;
|
|
}
|
|
|
|
if (current_rank < best_rank) {
|
|
best_rank = current_rank;
|
|
best_slice = slice;
|
|
best_lms = lms_temp;
|
|
best_scalefactor = scalefactor;
|
|
}
|
|
}
|
|
|
|
prev_scalefactor[c] = best_scalefactor;
|
|
|
|
lms[c] = best_lms;
|
|
|
|
/*
|
|
If this slice was shorter than QOA_SLICE_LEN, we have to left-
|
|
shift all encoded data, to ensure the rightmost bits are the empty
|
|
ones. This should only happen in the last frame of a file as all
|
|
slices are completely filled otherwise.
|
|
*/
|
|
best_slice <<= (QOA_SLICE_LEN - slice_len) * 3;
|
|
|
|
*((uint64 *) bytes) = SWAP_ENDIAN_LITTLE(best_slice);
|
|
bytes += sizeof(best_slice);
|
|
}
|
|
}
|
|
|
|
return (uint32) (bytes - start);
|
|
}
|
|
|
|
uint32 qoa_encode(const Audio* audio, byte* data)
|
|
{
|
|
byte* start = data;
|
|
|
|
/* Calculate the encoded size and allocate */
|
|
uint32 sample_count = audio->size / (audio->channels * audio->bloc_size);
|
|
uint32 num_frames = (sample_count + QOA_FRAME_LEN - 1) / QOA_FRAME_LEN;
|
|
uint32 num_slices = (sample_count + QOA_SLICE_LEN - 1) / QOA_SLICE_LEN;
|
|
|
|
QoaLms lms[QOA_MAX_CHANNELS];
|
|
__m128i weights_init = _mm_set_epi32(1 << 14, -(1 << 13), 0, 0);
|
|
__m128i history_init = _mm_setzero_si128();
|
|
|
|
for (int32 i = 0; i < audio->channels; ++i) {
|
|
/*
|
|
Set the initial LMS weights to {0, 0, -1, 2}. This helps with the
|
|
prediction of the first few ms of a file.
|
|
*/
|
|
lms[i].weights.s = weights_init;
|
|
lms[i].history.s = history_init;
|
|
}
|
|
|
|
// Go through all frames
|
|
int32 frame_samples = QOA_FRAME_LEN;
|
|
int32 p = 0;
|
|
|
|
for (uint32 sample_index = 0; sample_index < sample_count; sample_index += frame_samples) {
|
|
frame_samples = qoa_clamp(QOA_FRAME_LEN, 0, sample_count - sample_index);
|
|
data += qoa_encode_frame(
|
|
(int16 *) (audio->data + sample_index * audio->channels * audio->bloc_size),
|
|
audio->channels, frame_samples, lms, data
|
|
);
|
|
}
|
|
|
|
return (uint32) (data - start);
|
|
}
|
|
|
|
uint32 qoa_decode_frame(const byte* bytes, int32 channels, QoaLms* lms, byte* sample_data)
|
|
{
|
|
const byte* start = bytes;
|
|
|
|
// Read and verify the frame header
|
|
uint32 frame_samples = SWAP_ENDIAN_LITTLE(*((uint32 *) bytes));
|
|
bytes += sizeof(frame_samples);
|
|
|
|
uint32 slices = (frame_samples + QOA_SLICE_LEN - 1) / QOA_SLICE_LEN;
|
|
uint32 frame_size = QOA_FRAME_SIZE(channels, slices);
|
|
uint32 data_size = frame_size - 4 - QOA_LMS_LEN * 4 * channels;
|
|
uint32 num_slices = data_size / 8;
|
|
uint32 max_total_samples = num_slices * QOA_SLICE_LEN;
|
|
|
|
// Read the LMS state: 4 x 2 bytes history, 4 x 2 bytes weights per channel
|
|
for (uint32 c = 0; c < channels; ++c) {
|
|
uint64 history = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes));
|
|
bytes += sizeof(history);
|
|
|
|
uint64 weights = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes));
|
|
bytes += sizeof(weights);
|
|
|
|
alignas(16) int32 history_array[4];
|
|
alignas(16) int32 weights_array[4];
|
|
|
|
for (int32 i = 0; i < QOA_LMS_LEN; ++i) {
|
|
history_array[i] = ((int16) (history >> 48));
|
|
history <<= 16;
|
|
|
|
weights_array[i] = ((int16) (weights >> 48));
|
|
weights <<= 16;
|
|
}
|
|
|
|
lms[c].history.s = _mm_set_epi32(history_array[3], history_array[2], history_array[1], history_array[0]);
|
|
lms[c].weights.s = _mm_set_epi32(weights_array[3], weights_array[2], weights_array[1], weights_array[0]);
|
|
}
|
|
|
|
// Decode all slices for all channels in this frame
|
|
for (uint32 sample_index = 0; sample_index < frame_samples; sample_index += QOA_SLICE_LEN) {
|
|
for (uint32 c = 0; c < channels; c++) {
|
|
uint64 slice = SWAP_ENDIAN_LITTLE(*((uint64 *) bytes));
|
|
bytes += sizeof(slice);
|
|
|
|
int32 scalefactor = (slice >> 60) & 0xf;
|
|
slice <<= 4;
|
|
|
|
int32 slice_start = sample_index * channels + c;
|
|
int32 slice_end = qoa_clamp(sample_index + QOA_SLICE_LEN, 0, frame_samples) * channels + c;
|
|
|
|
for (int32 si = slice_start; si < slice_end; si += channels) {
|
|
int32 predicted = qoa_lms_predict(&lms[c]);
|
|
int32 quantized = (slice >> 61) & 0x7;
|
|
int32 dequantized = qoa_dequant_tab[scalefactor][quantized];
|
|
int32 reconstructed = qoa_clamp_s16(predicted + dequantized);
|
|
|
|
sample_data[si] = reconstructed;
|
|
slice <<= 3;
|
|
|
|
qoa_lms_update(&lms[c], reconstructed, dequantized);
|
|
}
|
|
}
|
|
}
|
|
|
|
return (uint32) (bytes - start);
|
|
}
|
|
|
|
|
|
uint32 qoa_decode(const byte* data, Audio* audio)
|
|
{
|
|
uint32 header_length = audio_header_from_data(data, audio);
|
|
uint32 p = header_length;
|
|
uint32 frame_size;
|
|
byte* sample_ptr = audio->data;
|
|
|
|
QoaLms lms[QOA_MAX_CHANNELS];
|
|
|
|
uint32 limit = 4 + QOA_LMS_LEN * 4 * audio->channels;
|
|
|
|
do {
|
|
frame_size = qoa_decode_frame(data + p, audio->channels, lms, sample_ptr);
|
|
sample_ptr += frame_size;
|
|
p += frame_size;
|
|
} while (frame_size && audio->size - p >= limit);
|
|
// @question do we really need the audio->size - p >= limit check or would p < audio->size be sufficient?
|
|
|
|
return header_length + audio->size;
|
|
}
|
|
|
|
#endif |