implement png BUT not working since not debugged yet

2026-01-11 11:18:40 +00:00 · 2024-09-23 04:34:31 +02:00 · 2024-09-23 04:34:31 +02:00 · e88840f0fa
commit e88840f0fa
parent 44ebefd06a
11 changed files with 715 additions and 305 deletions
--- a/image/Bitmap.h
+++ b/image/Bitmap.h
@ -277,41 +277,65 @@ void image_bmp_generate(const FileBody* src_data, Image* image)
    image->width = src.dib_header.width;
    image->height = src.dib_header.height;
-    image->length = image->width * image->height;
+    image->pixel_count = image->width * image->height;
    // rows are 4 bytes multiples in length
    uint32 width = ROUND_TO_NEAREST(src.dib_header.width, 4);
    uint32 pixel_bytes = src.dib_header.bits_per_pixel / 8;
-    if (image->order_pixels == IMAGE_PIXEL_ORDER_BGRA) {
+    byte alpha_offset = pixel_bytes > 3;
-        memcpy((void *) image->pixels, src.pixels, image->length * pixel_bytes);
+
    image->has_alpha |= (bool) alpha_offset;
    if (image->order_pixels == IMAGE_PIXEL_ORDER_BGRA
        && image->order_rows == IMAGE_ROW_ORDER_BOTTOM_TO_TOP
    ) {
        // @bug This doesn't consider the situation where we want alpha as a setting but the img doesn't have it
        // @bug This also copies possible padding which will corrupt the image
        memcpy((void *) image->pixels, src.pixels, image->pixel_count * pixel_bytes);
        return;
    }
-    byte alpha_offset = pixel_bytes == 3 ? 0 : 1;
+    uint32 pixel_rgb_bytes = pixel_bytes - alpha_offset;
-    uint32 row_pos1 = 0;
+    uint32 row_pos1;
-    uint32 row_pos2 = 0;
+    uint32 row_pos2;
    uint32 width_pixel_bytes = width * pixel_bytes;
    for (uint32 y = 0; y < src.dib_header.height; ++y) {
        row_pos1 = y * width_pixel_bytes;
        if (image->order_rows == IMAGE_ROW_ORDER_TOP_TO_BOTTOM) {
            row_pos2 = (src.dib_header.height - y - 1) * width_pixel_bytes;
        } else {
            row_pos2 = y * width_pixel_bytes;
        }
        for (uint32 x = 0; x < width; ++x) {
            if (x >= image->width) {
-                // we don't care about the padding
+                // Bitmaps may have padding at the end of the row
                // We don't care about that
                continue;
            }
            row_pos1 = y * width * pixel_bytes;
            row_pos2 = (src.dib_header.height - y - 1) * width * pixel_bytes;
            // Invert byte order
-            for (uint32 i = 0; i < pixel_bytes - alpha_offset; ++i) {
+            if (image->order_pixels == IMAGE_PIXEL_ORDER_RGBA) {
-                image->pixels[row_pos1 + x * pixel_bytes + i] = src.pixels[row_pos2 + x * pixel_bytes + pixel_bytes - alpha_offset - i];
+                for (uint32 i = 0; i < pixel_rgb_bytes; ++i) {
                    image->pixels[row_pos1 + x * pixel_bytes + i] = src.pixels[row_pos2 + x * pixel_bytes + pixel_rgb_bytes - i];
                }
            } else {
                for (uint32 i = 0; i < pixel_rgb_bytes; ++i) {
                    image->pixels[row_pos1 + x * pixel_bytes + i] = src.pixels[row_pos2 + x * pixel_bytes + i];
                }
            }
-            // Add alpha channel at end
+            // Add alpha channel at end of every RGB value
            if (alpha_offset > 0) {
                image->pixels[row_pos1 + x * pixel_bytes + 3] = src.pixels[row_pos2 + x * pixel_bytes + pixel_bytes + 3];
            } else if (image->has_alpha) {
                image->pixels[row_pos1 + x * pixel_bytes + 3] = 0xFF;
            }
        }
    }
--- a/image/Image.cpp
+++ b/image/Image.cpp
@ -29,7 +29,7 @@ void image_from_file(RingMemory* ring, const char* path, Image* image)
    file_read(path, &file, ring);
    if (str_ends_with(path, ".png")) {
-        image_png_generate(&file, image);
+        image_png_generate(&file, image, ring);
    } else if (str_ends_with(path, ".tga")) {
        image_tga_generate(&file, image);
    } else if (str_ends_with(path, ".bmp")) {
--- a/image/Image.h
+++ b/image/Image.h
@ -17,11 +17,16 @@
 #define IMAGE_ROW_ORDER_TOP_TO_BOTTOM 0
 #define IMAGE_ROW_ORDER_BOTTOM_TO_TOP 1
 // This struct also functions as a setting on how to load the image data
 //      has_alpha: if set, an alpha channel is forced even for bitmaps
 //      order_pixels defines how the pixels should be ordered
 //      order_rows defines how the rows should be ordered
 struct Image {
    uint32 width;
    uint32 height;
-    uint32 length;
+    uint32 pixel_count;
    // Image settings
    bool has_alpha;
    byte order_pixels; // RGBA vs BGRA
    byte order_rows; // top-to-bottom vs bottom-to-top
--- a/image/Png.h
+++ b/image/Png.h
@ -7,6 +7,7 @@
 * @link      https://jingga.app
 *
 * png: https://www.w3.org/TR/2003/REC-PNG-20031110/
 * png: https://www.w3.org/TR/PNG-Chunks.html
 * zlib: https://www.ietf.org/rfc/rfc1950.txt
 * deflate: https://www.ietf.org/rfc/rfc1951.txt
 */
@ -15,7 +16,7 @@
 #include <string.h>
 #include "../stdlib/Types.h"
-#include "../utils/Utils.h"
+#include "../utils/BitUtils.h"
 #include "../utils/EndianUtils.h"
 #include "Image.h"
@ -23,31 +24,66 @@
 #define PNG_HEADER_SIZE 8
 struct PngHeader {
-    byte signature[8];
+    uint8 signature[8];
 };
 /*
 The following table describes the chunk layout.
 Please note that we do NOT support most of this
 Critical chunks (order is defined):
    Name  Multiple  Ordering constraints
    IHDR    No      Must be first
    PLTE    No      Before IDAT (optional)
    IDAT    Yes     Multiple IDATs must be consecutive
    IEND    No      Must be last
 Ancillary chunks (order is not defined):
    Name  Multiple  Ordering constraints
    cHRM    No      Before PLTE and IDAT
    gAMA    No      Before PLTE and IDAT
    iCCP    No      Before PLTE and IDAT
    sBIT    No      Before PLTE and IDAT
    sRGB    No      Before PLTE and IDAT
    bKGD    No      After PLTE, before IDAT
    hIST    No      After PLTE, before IDAT
    tRNS    No      After PLTE, before IDAT
    pHYs    No      Before IDAT
    sPLT    Yes     Before IDAT
    tIME    No      None
    iTXt    Yes     None
    tEXt    Yes     None
    zTXt    Yes     None
 */
 #define PNG_CHUNK_SIZE_MIN 12
 struct PngChunk {
    uint32 length;
    uint32 type;
    // +data here, can be 0
    uint32 crc;
 };
 // Special chunk
 #define PNG_IHDR_SIZE 25
 struct PngIHDR {
    uint32 length;
    uint32 type;
    uint32 width;
    uint32 height;
-    byte bit_depth;
+    uint8 bit_depth;
-    byte colory_type;
+    uint8 colory_type;
-    byte compression;
+    uint8 compression;
-    byte filter;
+    uint8 filter;
-    byte interlace;
+    uint8 interlace;
    uint32 crc;
 };
 struct PngIDATHeader {
-    byte zlib_method_flag;
+    uint8 zlib_method_flag;
-    byte add_flag;
+    uint8 add_flag;
 };
 struct Png {
@ -55,10 +91,10 @@ struct Png {
    PngIHDR ihdr;
    // Encoded pixel data
-    byte* pixels; // WARNING: This is not the owner of the data. The owner is the FileBody
+    uint8* pixels; // WARNING: This is not the owner of the data. The owner is the FileBody
    uint32 size;
-    byte* data; // WARNING: This is not the owner of the data. The owner is the FileBody
+    uint8* data; // WARNING: This is not the owner of the data. The owner is the FileBody
 };
 struct PngHuffmanEntry {
@ -72,7 +108,7 @@ struct PngHuffman {
    PngHuffmanEntry entries[32768]; // 2^15
 };
-static const byte PNG_SIGNATURE[] = {0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A};
+static const uint8 PNG_SIGNATURE[] = {0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A};
 static const uint32 HUFFMAN_BIT_COUNTS[][2] = {{143, 8}, {255, 9}, {279, 7}, {287, 8}, {319, 5}};
 static const uint32 HUFFMAN_CODE_LENGTH_ALPHA[] = {
    16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
@ -91,7 +127,7 @@ static const PngHuffmanEntry PNG_DIST_EXTRA[] = {
    {4097, 11}, {6145, 11}, {8193, 12}, {12289, 12}, {16385, 13}, {24577, 13}
 };
-void huffman_png_compute(uint32 symbol_count, uint32* symbol_code_length, PngHuffman* huff)
+void huffman_png_compute(uint32 symbol_count, const uint32* __restrict symbol_code_length, PngHuffman* huff)
 {
    uint32 code_length_hist[16] = {};
    for (uint32 i = 0; i < symbol_count; ++i) {
@ -118,7 +154,7 @@ void huffman_png_compute(uint32 symbol_count, uint32* symbol_code_length, PngHuf
        for (uint32 j = 0; j < entries; ++j) {
            uint32 base_index = (code << bits) | j;
-            uint32 index = reverse_bits(base_index, huff->max_code_length);
+            uint32 index = bits_reverse(base_index, huff->max_code_length);
            PngHuffmanEntry* entry = huff->entries + index;
@ -129,41 +165,140 @@ void huffman_png_compute(uint32 symbol_count, uint32* symbol_code_length, PngHuf
 }
 inline
-PngHuffmanEntry huffman_png_decode(PngHuffman* huff, const byte* data, int pos)
+uint16 huffman_png_decode(PngHuffman* __restrict huff, BitWalk* __restrict stream)
 {
-    uint32 index = (uint32) get_bits(data, huff->max_code_length, pos);
+    // huff->max_code_length is at most 15 bits -> a code can span at most 3 bytes
-    return huff->entries[index];
+    uint32 index = SWAP_ENDIAN_BIG(BITS_GET_32(BYTES_MERGE_4(stream->pos), stream->bit_pos, huff->max_code_length));
    bits_walk(stream, huff->entries[index].bits_used);
    return huff->entries[index].symbol;
 }
-void png_filter_reconstruct(uint32 width, uint32 height, const byte* decompressed, byte* finalized, int steps)
+inline
 uint8 png_filter_1_and_2(const uint8* __restrict x, const uint8* __restrict a, uint32 channel)
 {
-    uint32 zero = 0;
+    return x[channel] + a[channel];
-    byte* prev_row = NULL;
+}
-    byte prev_row_advance = 0;
+
 inline
 uint8 png_filter_3(const uint8* x, const uint8* a, const uint8* b, uint32 channel)
 {
    return x[channel] + (uint8) (((uint32) a[channel] + (uint32) b[channel]) / 2);
 }
 inline
 uint8 png_filter_4(const uint8* x, const uint8* a_full, const uint8* b_full, const uint8* c_full, uint32 channel)
 {
    int32 a = (int32) a_full[channel];
    int32 b = (int32) b_full[channel];
    int32 c = (int32) c_full[channel];
    int32 p = a + b - c;
    int32 pa = p >= a ? p - a : a - p;
    int32 pb = p >= b ? p - b : b - p;
    int32 pc = p >= c ? p - c : c - p;
    int32 paeth;
    if (pa < pb && pa <= pc) {
        paeth = a;
    } else if (pb <= pc) {
        paeth = b;
    } else {
        paeth = c;
    }
    return x[channel] + (uint8) paeth;
 }
 void png_filter_reconstruct(uint32 width, uint32 height, const uint8* decompressed, uint8* finalized, int steps = 8)
 {
    uint64 zero = 0;
    uint8* prev_row = (uint8 *) &zero;
    uint8 prev_row_advance = 0;
    const uint8* src = decompressed;
    uint8* dest = finalized;
    for (uint32 y = 0; y < height; ++y) {
-        byte filter = *decompressed;
+        uint8 filter = *decompressed;
-        byte* current_row = 0; // @todo need actual value
+        uint8* current_row = dest;
        switch (filter) {
            case 0: {
-                    memcpy(finalized + y * width, decompressed + y * width, width);
+                    memcpy(dest, src, width * sizeof(uint32));
                    dest += 4 * width;
                    src += 4 * width;
                } break;
            case 1: {
-                    // no simd possible, well 4 + 4 probably not worth it
+                    uint32 a_pixel = 0;
                    for (uint32 x = 0; x < width; ++x) {
                        // png_filter_1_and_2
                        dest[0] = src[0] + ((uint8 *) &a_pixel)[0];
                        dest[1] = src[1] + ((uint8 *) &a_pixel)[1];
                        dest[2] = src[2] + ((uint8 *) &a_pixel)[2];
                        dest[3] = src[3] + ((uint8 *) &a_pixel)[3];
                        a_pixel = *(uint32 *) dest;
                        dest += 4;
                        src += 4;
                    }
                } break;
            case 2: {
                    // @performance this is simd optimizable
                    // requires manual simd impl. since prev_row_advance can be 0 or 4
                    uint8* b_pixel = prev_row;
                    for (uint32 x = 0; x < width; ++x) {
                        // png_filter_1_and_2
                        dest[0] = src[0] + b_pixel[0];
                        dest[1] = src[1] + b_pixel[1];
                        dest[2] = src[2] + b_pixel[2];
                        dest[3] = src[3] + b_pixel[3];
                        b_pixel += prev_row_advance;
                        dest += 4;
                        src += 4;
                    }
                } break;
            case 3: {
-                    // no simd possible, well 4 + 4 probably not worth it
+                    uint32 a_pixel = 0;
                    uint8* b_pixel = prev_row;
                    for (uint32 x = 0; x < width; ++x) {
                        // png_filter_3
                        dest[0] = src[0] + (uint8) (((uint32) ((uint8 *) &a_pixel)[0] + (uint32) b_pixel[0]) / 2);
                        dest[1] = src[1] + (uint8) (((uint32) ((uint8 *) &a_pixel)[1] + (uint32) b_pixel[1]) / 2);
                        dest[2] = src[2] + (uint8) (((uint32) ((uint8 *) &a_pixel)[2] + (uint32) b_pixel[2]) / 2);
                        dest[3] = src[3] + (uint8) (((uint32) ((uint8 *) &a_pixel)[3] + (uint32) b_pixel[3]) / 2);
                        a_pixel = *(uint32 *) dest;
                        b_pixel += prev_row_advance;
                        dest += 4;
                        src += 4;
                    }
                } break;
            case 4: {
-                    // no simd possible, well 4 + 4 probably not worth it
+                    uint32 a_pixel = 0;
                    uint32 c_pixel = 0;
                    uint8* b_pixel = prev_row;
                    for (uint32 x = 0; x < width; ++x) {
                        // png_filter_4
                        dest[0] = png_filter_4(src, (uint8 *) &a_pixel, b_pixel, (uint8 *) &c_pixel, 0);
                        dest[1] = png_filter_4(src, (uint8 *) &a_pixel, b_pixel, (uint8 *) &c_pixel, 1);
                        dest[2] = png_filter_4(src, (uint8 *) &a_pixel, b_pixel, (uint8 *) &c_pixel, 2);
                        dest[3] = png_filter_4(src, (uint8 *) &a_pixel, b_pixel, (uint8 *) &c_pixel, 3);
                        a_pixel = *(uint32 *) dest;
                        c_pixel = *(uint32 *) b_pixel;
                        b_pixel += prev_row_advance;
                        dest += 4;
                        src += 4;
                    }
                } break;
            default: {
-
+                ASSERT_SIMPLE(false);
            }
        }
@ -177,22 +312,30 @@ void generate_default_png_references(const FileBody* file, Png* png)
    png->size = (uint32) file->size;
    png->data = file->content;
-    if (png->size < 33) {
+    if (png->size < PNG_IHDR_SIZE + PNG_HEADER_SIZE) {
        // This shouldn't happen
        ASSERT_SIMPLE(false);
        return;
    }
    // The first chunk MUST be IHDR -> we handle it here
-    memcpy(png, file->content, 29);
+    ASSERT_SIMPLE_CONST(PNG_HEADER_SIZE + PNG_IHDR_SIZE == 33);
-    png->ihdr.crc = SWAP_ENDIAN_BIG((uint32 *) (file->content + 30));
+    memcpy(png, file->content, PNG_HEADER_SIZE + PNG_IHDR_SIZE);
-    png->ihdr.length = SWAP_ENDIAN_BIG(&png->ihdr.length);
+    png->ihdr.length = SWAP_ENDIAN_BIG(png->ihdr.length);
-    png->ihdr.type = SWAP_ENDIAN_BIG(&png->ihdr.type);
+    png->ihdr.type = SWAP_ENDIAN_BIG(png->ihdr.type);
-    png->ihdr.width = SWAP_ENDIAN_BIG(&png->ihdr.width);
+    png->ihdr.width = SWAP_ENDIAN_BIG(png->ihdr.width);
-    png->ihdr.height = SWAP_ENDIAN_BIG(&png->ihdr.height);
+    png->ihdr.height = SWAP_ENDIAN_BIG(png->ihdr.height);
    png->ihdr.crc = SWAP_ENDIAN_BIG(png->ihdr.crc);
 }
-bool image_png_generate(const FileBody* src_data, Image* image, int steps = 8)
+// Below you will often see code like SWAP_ENDIAN_BIG(BITS_GET_16(BYTES_MERGE_2()))
 //      1. Merge two bytes together creating a "new" data structure from which we can easily read bits
 //          1.1. This is required to read bits that cross multiple bytes
 //          1.2. Only if you read more than 8 bits will you need to merge 4 bytes
 //      2. Now we can retrieve the bits from this data structure at a position with a length
 //      3. Whenever we use the result as an integer (16 or 32 bits) we need to consider the endianness
 bool image_png_generate(const FileBody* src_data, Image* image, RingMemory* ring, int32 steps = 8)
 {
    // @performance We are generating the struct and then filling the data.
    //      There is some assignment/copy overhead
@ -205,154 +348,169 @@ bool image_png_generate(const FileBody* src_data, Image* image, int steps = 8)
    //  3. temp pixel buffer (larger)
    //  4. final pixel buffer (already here)
    // @todo Consider to support (0, 2, 3, 4, and 6)
    //      A simple black and white image or an image without alpha should be supported
    if (src.ihdr.bit_depth != 8
        || src.ihdr.colory_type != 6
        || src.ihdr.compression != 0
        || src.ihdr.filter != 0
        || src.ihdr.interlace != 0
    ) {
-        // We don't support this type of png
+        // We don't support this type of png (see comment below)
        ASSERT_SIMPLE(false);
        /*
        Color   Allowed     Interpretation
        Type    Bit Depths
        0       1,2,4,8,16  Each pixel is a grayscale sample.
        2       8,16        Each pixel is an R,G,B triple.
        3       1,2,4,8     Each pixel is a palette index, a PLTE chunk must appear.
        4       8,16        Each pixel is a grayscale sample, followed by an alpha sample.
        6       8,16        Each pixel is an R,G,B triple, followed by an alpha sample.
        */
        return false;
    }
-    PngChunk chunk;
+    // @performance Could we probably avoid this? There is some overhead using this.
-    PngIDATHeader idat_header;
+    //      We are only using it because there might be situations where there is a bit overhang to another chunk
    BitWalk stream;
    // Note: If we would support more png formats this offset would be wrong
    stream.pos = src_data->content + PNG_IHDR_SIZE + PNG_HEADER_SIZE;
    stream.bit_pos = 0;
    bool is_first_idat = true;
    uint32 out_pos = 0;
    // @question the following is a lot of data, should this be moved to heap?
    uint32 literal_length_dist_table[512];
-    PngHuffman literal_length_huffman;
+    PngHuffman* literal_length_huffman = (PngHuffman *) ring_get_memory(ring, sizeof(PngHuffman));
-    literal_length_huffman.max_code_length = 15;
+    literal_length_huffman->max_code_length = 15;
-    literal_length_huffman.count = 1 << literal_length_huffman.max_code_length;
+    literal_length_huffman->count = 1 << literal_length_huffman->max_code_length;
-    PngHuffman distance_huffman;
+    PngHuffman* distance_huffman = (PngHuffman *) ring_get_memory(ring, sizeof(PngHuffman));
-    distance_huffman.max_code_length = 15;
+    distance_huffman->max_code_length = 15;
-    distance_huffman.count = 1 << distance_huffman.max_code_length;
+    distance_huffman->count = 1 << distance_huffman->max_code_length;
-    PngHuffman dictionary_huffman;
+    PngHuffman* dictionary_huffman = (PngHuffman *) ring_get_memory(ring, sizeof(PngHuffman));
-    dictionary_huffman.max_code_length = 7;
+    dictionary_huffman->max_code_length = 7;
-    dictionary_huffman.count = 1 << dictionary_huffman.max_code_length;
+    dictionary_huffman->count = 1 << dictionary_huffman->max_code_length;
-    // i is the current byte to read
+    // We need full width * height, since we don't know how much data this IDAT actually holds
-    int i = 33;
+    uint8* finalized = ring_get_memory(ring, src.ihdr.width * src.ihdr.height * 4);
-    // r is the re-shift value in case we need to go back
+    // Needs some extra space
-    // @todo r unused?
+    uint8* decompressed = ring_get_memory(ring, src.ihdr.width * src.ihdr.height * 4 + src.ihdr.height);
    int r = 0;
-    // b is the current bit to read
+    uint8* dest = decompressed;
    int b = 0;
-    while(i < src.size) {
+    // @bug We might not be able/allowed to simply iterate this loop below since data might be split across chunks
-        chunk.length = SWAP_ENDIAN_BIG((uint32 *) (src_data->content + i));
+    //      If that is the case we have to first create a linked list of all the actual data and then we perform the actions below on this linked list
-        chunk.type = SWAP_ENDIAN_BIG((uint32 *) (src_data->content + i + 4));
+    //      This of course poses the challenge of handling the border between two list elements
    //      Copying data would be slow so we ideally would like to iterate through that list and just handle the border
    //      since the border only becomes relevant at the beginning of every loop we should be fine, no?
    uint8 BFINAL = 0;
    while(stream.pos - src_data->content < src.size && BFINAL == 0) {
        PngChunk chunk;
        PngIDATHeader idat_header;
        // @bug the code below doesn't need bit walk on the first loop, what about the second loop?
        // For our png reader, we only care about IDAT
        //  @question consider PLTE, tRNS, gAMA, iCCP
        chunk.length = SWAP_ENDIAN_BIG(*((uint32 *) stream.pos));
        stream.pos += sizeof(chunk.length);
        chunk.type = SWAP_ENDIAN_BIG(*((uint32 *) stream.pos));
        stream.pos += sizeof(chunk.type);
        if (chunk.type == 'IEND') {
            // we arrived at the end of the file
            break;
        } else if (chunk.type != 'IDAT') {
-            // IDAT chunks are continuous and we don't care for anything else
+            // some other data?!
            if (!is_first_idat) {
                break;
            }
            i += chunk.length + 12;
            continue;
        }
-        if (is_first_idat) {
+        // @question Not sure if this below is actually the case
-            idat_header.zlib_method_flag = *(src_data->content + i + 8);
+        // @bug Is this even correct, we might have an overhang from the previous chunk
-            idat_header.add_flag = *(src_data->content + i + 9);
+        //  Then we need to:
        //      read n bits from the previous chunk
        //      move across the chunk header data
        //      read another x bits from the new chunk
        //
        //  This means we cannot jump here (or better we need to check if the bit position is != 0)
        // BUT WE MIGHT NOT CARE ABOUT MULTIPLE IDAT CHUNKS?
        idat_header.zlib_method_flag = *stream.pos;
        ++stream.pos;
-            byte CM = idat_header.zlib_method_flag & 0xF;
+        idat_header.add_flag = *stream.pos;
-            byte FDICT = (idat_header.add_flag >> 5) & 0x1;
+        ++stream.pos;
-            is_first_idat = false;
+        uint8 CM = idat_header.zlib_method_flag & 0xF;
        uint8 FDICT = (idat_header.add_flag >> 5) & 0x1;
-            if (CM != 8 || FDICT != 0) {
+        if (CM != 8 || FDICT != 0) {
-                return false;
+            // Not supported
-            }
+            return false;
            i += 10;
        }
-        // @bug The algorithm below works on "blocks".
+        // This data might be stored in the previous IDAT chunk?!
-        //      Could it be possible that a block is spread accross 2 IDAT chunks?
+        BFINAL = (uint8) SWAP_ENDIAN_BIG(BITS_GET_8(*stream.pos, stream.bit_pos, 1));
-        //      If so this would be bad and break the code below
+        bits_walk(&stream, 1);
        //      We could solve this by just having another counting variable and jump to the next block
-        // start: src_data->content + i + 8
+        uint32 BTYPE = SWAP_ENDIAN_BIG(BITS_GET_8(BYTES_MERGE_2(stream.pos), stream.bit_pos, 2));
-        // end: src_data->content + i + 8 + length - 1
+        bits_walk(&stream, 2);
        // DEFLATE Algorithm
        // @bug the following 3 lines are wrong, they don't have to start at a bit 0/1
        //      A block doesn't have to start at a byte boundary
        byte BFINAL = (byte) get_bits(src_data->content + i, 1, b);
        i += (b > 7 - 1);
        b = (b + 1) & 7;
        byte BTYPE = (byte) get_bits(src_data->content + i, 2, b);
        i += (b > 7 - 2);
        b = (b + 2) & 7;
        if (BTYPE == 0) {
-            // starts at byte boundary -> position = +1 of previous byte
+            // starts at a byte boundary -> position = +1 of the previous byte
-            if (b == 0) {
+            bits_flush(&stream);
                i -= 1;
            }
-            uint16 len = *((uint16 *) (src_data->content + i + 1));
+            uint16 len = *((uint16 *) stream.pos);
            stream.pos += 2;
-            // @todo nlen unused?
+            uint16 nlen = *((uint16 *) stream.pos);
-            uint16 nlen = *((uint16 *) (src_data->content + i + 3));
+            stream.pos += 2;
-            memcpy(image->pixels + out_pos, src_data->content + i + 5, len);
+            ASSERT_SIMPLE(len == ~nlen);
            out_pos += len;
-            i += 5 + len;
+            memcpy(dest, &stream.pos, len);
-            b = 0;
+            stream.pos += len;
        } else if (BTYPE == 3) {
            // Invalid BTYPE
            ASSERT_SIMPLE(false);
        } else {
            // @question is this even required or are we overwriting anyways?
-            memset(&literal_length_dist_table, 0, 512 * 4);
+            memset(&literal_length_dist_table, 0, sizeof(literal_length_dist_table));
-            memset(&literal_length_huffman.entries, 0, sizeof(PngHuffmanEntry) * 15);
+            memset(literal_length_huffman->entries, 0, sizeof(PngHuffmanEntry) * literal_length_huffman->max_code_length);
-            memset(&distance_huffman.entries, 0, sizeof(PngHuffmanEntry) * 15);
+            memset(distance_huffman->entries, 0, sizeof(PngHuffmanEntry) * distance_huffman->max_code_length);
-            memset(&dictionary_huffman.entries, 0, sizeof(PngHuffmanEntry) * 7);
+            memset(dictionary_huffman->entries, 0, sizeof(PngHuffmanEntry) * dictionary_huffman->max_code_length);
            uint32 huffman_literal = 0;
            uint32 huffman_dist = 0;
            if (BTYPE == 2) {
                // Compressed with dynamic Huffman code
-                huffman_literal = (uint32) get_bits(src_data->content + i, 5, b);
+                huffman_literal = SWAP_ENDIAN_BIG(BITS_GET_16(BYTES_MERGE_2(stream.pos), stream.bit_pos, 5));
-                i += (b > 7 - 5);
+                bits_walk(&stream, 5);
                b = (b + 5) & 7;
-                huffman_dist = (uint32) get_bits(src_data->content + i, 5, b);
+                huffman_dist = SWAP_ENDIAN_BIG(BITS_GET_16(BYTES_MERGE_2(stream.pos), stream.bit_pos, 5));
-                i += (b > 7 - 5);
+                bits_walk(&stream, 5);
                b = (b + 5) & 7;
-                uint32 huffman_code_length = (uint32) get_bits(src_data->content + i, 4, b);
+                uint32 huffman_code_length = SWAP_ENDIAN_BIG(BITS_GET_16(BYTES_MERGE_2(stream.pos), stream.bit_pos, 4));
-                i += (b > 7 - 4);
+                bits_walk(&stream, 5);
                b = (b + 4) & 7;
                huffman_literal += 257;
                huffman_dist += 1;
                huffman_code_length += 4;
-                uint32 huffman_code_length_table[19] = {};
+                uint32 huffman_code_length_table[ARRAY_COUNT(HUFFMAN_CODE_LENGTH_ALPHA)] = {};
                for (uint32 j = 0; j < huffman_code_length; ++j) {
-                    huffman_code_length_table[HUFFMAN_CODE_LENGTH_ALPHA[j]] = (uint32) get_bits(src_data->content + i, 3, b);
+                    huffman_code_length_table[HUFFMAN_CODE_LENGTH_ALPHA[j]] = SWAP_ENDIAN_BIG(BITS_GET_16(BYTES_MERGE_2(stream.pos), stream.bit_pos, 3));
-                    i += (b > 7 - 3);
+                    bits_walk(&stream, 3);
                    b = (b + 3) & 7;
                }
-                huffman_png_compute(19, huffman_code_length_table, &dictionary_huffman);
+                huffman_png_compute(ARRAY_COUNT(HUFFMAN_CODE_LENGTH_ALPHA), huffman_code_length_table, dictionary_huffman);
                uint32 literal_length_count = 0;
                uint32 length_count = huffman_literal + huffman_dist;
@ -362,31 +520,26 @@ bool image_png_generate(const FileBody* src_data, Image* image, int steps = 8)
                    uint32 rep_count = 1;
                    uint32 rep_val = 0;
-                    PngHuffmanEntry dict = huffman_png_decode(&dictionary_huffman, src_data->content + i, b);
+                    uint32 encoded_length = huffman_png_decode(dictionary_huffman, &stream);
                    i += (b + dict.bits_used) / 8;
                    b = (b + dict.bits_used) & 7;
                    uint32 encoded_length = dict.bits_used;
                    if (encoded_length <= 15) {
                        rep_val = encoded_length;
                    } else if (encoded_length == 16) {
-                        rep_count = 3 + (uint32) get_bits(src_data->content + i, 2, b);
+                        rep_count = 3 + SWAP_ENDIAN_BIG(BITS_GET_8(BYTES_MERGE_2(stream.pos), stream.bit_pos, 2));
-                        i += (b > 7 - 2);
+                        bits_walk(&stream, 2);
                        b = (b + 2) & 7;
                        rep_val = literal_length_dist_table[literal_length_count - 1];
                    } else if (encoded_length == 17) {
-                        rep_count = 3 + (uint32) get_bits(src_data->content + i, 3, b);
+                        rep_count = 3 + SWAP_ENDIAN_BIG(BITS_GET_8(BYTES_MERGE_2(stream.pos), stream.bit_pos, 3));
-                        i += (b > 7 - 3);
+                        bits_walk(&stream, 3);
                        b = (b + 3) & 7;
                    } else if (encoded_length == 18) {
-                        rep_count = 11 + (uint32) get_bits(src_data->content + i, 7, b);
+                        rep_count = 11 + SWAP_ENDIAN_BIG(BITS_GET_8(BYTES_MERGE_2(stream.pos), stream.bit_pos, 7));
-                        i += (b > 7 - 7);
+                        bits_walk(&stream, 7);
                        b = (b + 7) & 7;
                    }
-                    memset(literal_length_dist_table + literal_length_count, rep_val, rep_count);
+                    while (rep_count--) {
                        literal_length_dist_table[literal_length_count++] = rep_val;
                    }
                }
            } else if (BTYPE == 1) {
                // Compressed with fixed Huffman code
@ -394,7 +547,7 @@ bool image_png_generate(const FileBody* src_data, Image* image, int steps = 8)
                huffman_dist = 32;
                uint32 bit_index = 0;
-                for(uint32 range_index = 0; range_index < 5; ++range_index) {
+                for(uint32 range_index = 0; range_index < ARRAY_COUNT(HUFFMAN_BIT_COUNTS); ++range_index) {
                    uint32 bit_count = HUFFMAN_BIT_COUNTS[range_index][1];
                    uint32 last = HUFFMAN_BIT_COUNTS[range_index][0];
@ -404,68 +557,65 @@ bool image_png_generate(const FileBody* src_data, Image* image, int steps = 8)
                }
            }
-            huffman_png_compute(huffman_literal, literal_length_dist_table, &literal_length_huffman);
+            huffman_png_compute(huffman_literal, literal_length_dist_table, literal_length_huffman);
-            huffman_png_compute(huffman_dist, literal_length_dist_table + huffman_literal, &distance_huffman);
+            huffman_png_compute(huffman_dist, literal_length_dist_table + huffman_literal, distance_huffman);
            while (true) {
-                PngHuffmanEntry literal = huffman_png_decode(&literal_length_huffman, src_data->content + i, b);
+                uint32 literal_length = huffman_png_decode(literal_length_huffman, &stream);
                i += (b + literal.bits_used) / 8;
                b = (b + literal.bits_used) & 7;
                uint32 literal_length = literal.bits_used;
                if (literal_length == 256) {
                    break;
                }
                if (literal_length <= 255) {
-                    *(image->pixels + out_pos) = (byte) (literal_length & 0xFF);
+                    *dest++ = (literal_length & 0xFF);
                    ++out_pos;
                } else {
                    uint32 length_tab_index = literal_length - 257;
                    PngHuffmanEntry length_tab = PNG_LENGTH_EXTRA[length_tab_index];
                    uint32 length = length_tab.symbol;
                    if (length_tab.bits_used) {
-                        uint32 extra_bits = (uint32) get_bits(src_data->content + i, length_tab.bits_used, b);
+                        // @performance If we knew that bits_used is always <= 15 we could use more efficient MERGE/GET
-                        i += (b + length_tab.bits_used) / 8;
+                        uint32 extra_bits = SWAP_ENDIAN_BIG(BITS_GET_32(BYTES_MERGE_4(stream.pos), stream.bit_pos, length_tab.bits_used));
-                        b = (b + length_tab.bits_used) & 7;
+                        bits_walk(&stream, length_tab.bits_used);
                        length += extra_bits;
                    }
-                    PngHuffmanEntry tab = huffman_png_decode(&distance_huffman, src_data->content + i, b);
+                    uint32 dist_tab_index = huffman_png_decode(distance_huffman, &stream);
                    i += (b + tab.bits_used) / 8;
                    b = (b + tab.bits_used) & 7;
                    uint32 dist_tab_index = tab.bits_used;
                    PngHuffmanEntry dist_tab = PNG_DIST_EXTRA[dist_tab_index];
                    uint32 dist = dist_tab.symbol;
                    if (dist_tab.bits_used) {
-                        uint32 extra_bits = (uint32) get_bits(src_data->content + i, dist_tab.bits_used, b);
+                        // @performance If we knew that bits_used is always <= 15 we could use more efficient MERGE/GET
-                        i += (b + dist_tab.bits_used) / 8;
+                        uint32 extra_bits = SWAP_ENDIAN_BIG(BITS_GET_32(BYTES_MERGE_4(stream.pos), stream.bit_pos, dist_tab.bits_used));
-                        b = (b + dist_tab.bits_used) & 7;
+                        bits_walk(&stream, dist_tab.bits_used);
                        dist += extra_bits;
                    }
-                    memcpy(image->pixels + out_pos, image->pixels + out_pos - dist, length);
+                    // @performance Maybe we could use memcopy depending on length and dist
                    uint8* source = dest - dist;
                    while (length--) {
                        *dest++ = *source++;
                    }
                }
            }
        }
-        if (BFINAL == 0) {
+        // Skip the CRC
-            break;
+        stream.pos += sizeof(chunk.crc);
-        }
+        stream.bit_pos = 0;
    }
    image->width = src.ihdr.width;
    image->height = src.ihdr.height;
    image->pixel_count = image->width * image->height;
    image->has_alpha = true;
    image->order_pixels = IMAGE_PIXEL_ORDER_RGBA;
    image->order_rows = IMAGE_ROW_ORDER_TOP_TO_BOTTOM;
-    // @todo fix pixels parameter
+    png_filter_reconstruct(src.ihdr.width, src.ihdr.height, decompressed, finalized, steps);
    png_filter_reconstruct(image->width, image->height, (byte *) image->pixels, (byte *) image->pixels, steps);
    return true;
 }
--- a/image/Tga.h
+++ b/image/Tga.h
@ -85,36 +85,57 @@ void image_tga_generate(const FileBody* src_data, Image* image)
    image->width = src.header.width;
    image->height = src.header.height;
-    image->length = image->width * image->height;
+    image->pixel_count = image->width * image->height;
    // @todo also handle bottom-top/top-bottom order here
    uint32 pixel_bytes = src.header.bits_per_pixel / 8;
-    if (image->order_pixels == IMAGE_PIXEL_ORDER_BGRA) {
+    byte alpha_offset = pixel_bytes > 3;
-        memcpy((void *) image->pixels, src.pixels, image->length * pixel_bytes);
+
    image->has_alpha |= (bool) alpha_offset;
    // We can check same settings through equality since we use the same values
    if (image->order_rows == src.header.vertical_ordering
        && image->order_pixels == src.header.horizonal_ordering
    ) {
        // @bug This doesn't consider the situation where we want alpha as a setting but the img doesn't have it
        memcpy((void *) image->pixels, src.pixels, image->pixel_count * pixel_bytes);
        return;
    }
    byte alpha_offset = pixel_bytes == 3 ? 0 : 1;
    uint32 pixel_rgb_bytes = pixel_bytes - alpha_offset;
    uint32 row_pos1;
    uint32 row_pos2;
-    for (uint32 y = 0; y < src.header.height; ++y) {
+    uint32 width_pixel_bytes = src.header.width * pixel_bytes;
        for (uint32 x = 0; x < src.header.width; ++x) {
            row_pos1 = y * image->width * pixel_bytes;
            row_pos2 = src.header.vertical_ordering == 0
                ? y * image->width * pixel_bytes
                : (image->height - y - 1) * image->width * pixel_bytes;
-            for (uint32 i = 0; i < pixel_rgb_bytes; ++i) {
+    for (uint32 y = 0; y < src.header.height; ++y) {
-                image->pixels[row_pos1 + x * pixel_bytes + i] = src.pixels[row_pos2 + x * pixel_bytes + pixel_rgb_bytes - i];
+        row_pos1 = y * image->width * pixel_bytes;
        if ((image->order_rows == IMAGE_ROW_ORDER_TOP_TO_BOTTOM && src.header.vertical_ordering == 1)
            || (image->order_rows == IMAGE_ROW_ORDER_BOTTOM_TO_TOP && src.header.vertical_ordering == 0)
        ) {
            row_pos2 = (src.header.height - y - 1) * image->width * pixel_bytes;
        } else {
            row_pos2 = y * width_pixel_bytes;
        }
        for (uint32 x = 0; x < src.header.width; ++x) {
            if (image->order_pixels == src.header.horizonal_ordering) {
                for (uint32 i = 0; i < pixel_rgb_bytes; ++i) {
                    image->pixels[row_pos1 + x * pixel_bytes + i] = src.pixels[row_pos2 + x * pixel_bytes + i];
                }
            } else {
                for (uint32 i = 0; i < pixel_rgb_bytes; ++i) {
                    image->pixels[row_pos1 + x * pixel_bytes + i] = src.pixels[row_pos2 + x * pixel_bytes + pixel_rgb_bytes - i];
                }
            }
-            // Add alpha channel at end
+            // Add alpha channel at end of every RGB value
            if (alpha_offset > 0) {
                image->pixels[row_pos1 + x * pixel_bytes + 3] = src.pixels[row_pos2 + x * pixel_bytes + pixel_bytes + 3];
            } else if (image->has_alpha) {
                image->pixels[row_pos1 + x * pixel_bytes + 3] = 0xFF;
            }
        }
    }
--- a/object/Texture.h
+++ b/object/Texture.h
@ -43,7 +43,6 @@ struct Texture {
    //  If yes remember to update prepare_texture()
    byte texture_data_type;
    byte texture_wrap_type_s;
    byte texture_wrap_type_t;
    byte texture_wrap_type_r;
@ -51,8 +50,6 @@ struct Texture {
    byte texture_minification;
    Image image;
    int32 texture_ref;
 };
 #endif
--- a/platform/win32/audio/DirectSound.h
+++ b/platform/win32/audio/DirectSound.h
@ -108,6 +108,16 @@ void audio_play(AudioSetting* setting, DirectSoundSetting* api_setting)
    setting->is_playing = true;
 }
 inline
 void audio_stop(AudioSetting* setting, DirectSoundSetting* api_setting) {
    // Stop playback on the secondary buffer and record that nothing is playing.
    // When no secondary buffer exists the call does nothing.
    if (api_setting->secondary_buffer) {
        api_setting->secondary_buffer->Stop();
        setting->is_playing = false;
    }
 }
 inline
 void audio_free(AudioSetting*, DirectSoundSetting* api_setting)
 {
--- a/platform/win32/audio/XAudio2.h
+++ b/platform/win32/audio/XAudio2.h
@ -124,21 +124,19 @@ void audio_play(AudioSetting* setting, XAudio2Setting* api_setting) {
    }
 }
 inline
 void audio_stop(AudioSetting* setting, XAudio2Setting* api_setting) {
    // Stop the source voice immediately (XAUDIO2_COMMIT_NOW) and record that nothing is playing.
    // When no source voice exists the call does nothing.
    if (api_setting->source_voice) {
        api_setting->source_voice->Stop(0, XAUDIO2_COMMIT_NOW);
        setting->is_playing = false;
    }
 }
 inline
 void audio_free(AudioSetting* setting, XAudio2Setting* api_setting)
 {
    if (api_setting->internal_buffer[0].pAudioData) {
        free((void *) api_setting->internal_buffer[0].pAudioData);
    }
    if (api_setting->internal_buffer[1].pAudioData) {
        free((void *) api_setting->internal_buffer[1].pAudioData);
    }
    if (setting->buffer) {
        free((void *) setting->buffer);
    }
    if (api_setting->source_voice) {
        api_setting->source_voice->DestroyVoice();
    }
@ -150,6 +148,18 @@ void audio_free(AudioSetting* setting, XAudio2Setting* api_setting)
    if (api_setting->audio_handle) {
        api_setting->audio_handle->Release();
    }
    if (api_setting->internal_buffer[0].pAudioData) {
        free((void *) api_setting->internal_buffer[0].pAudioData);
    }
    if (api_setting->internal_buffer[1].pAudioData) {
        free((void *) api_setting->internal_buffer[1].pAudioData);
    }
    if (setting->buffer) {
        free((void *) setting->buffer);
    }
 }
 /**
--- a/utils/BitUtils.h
+++ b/utils/BitUtils.h
@ -17,6 +17,244 @@
 #define BIT_UNSET(num, pos) ((num) & ~((uint32) 1 << (pos)))
 #define BIT_FLIP(num, pos) ((num) ^ ((uint32) 1 << (pos)))
 #define BIT_SET_TO(num, pos, x) ((num) & ~((uint32) 1 << (pos)) | ((uint32) (x) << (pos)))
 // Extract `to_read` bits from `num`, treating `num` as an 8/16/32/64 bit wide value.
 // `pos` is the offset of the first wanted bit counted from the most significant bit of that width.
 // The caller must ensure pos + to_read <= width; the mask is built with 1U (1ULL for the 64 bit
 // version), so to_read must stay below the bit width of that mask type.
 #define BITS_GET_8(num, pos, to_read) (((num) >> (8 - (pos) - (to_read))) & ((1U << (to_read)) - 1))
 #define BITS_GET_16(num, pos, to_read) (((num) >> (16 - (pos) - (to_read))) & ((1U << (to_read)) - 1))
 #define BITS_GET_32(num, pos, to_read) (((num) >> (32 - (pos) - (to_read))) & ((1U << (to_read)) - 1))
 #define BITS_GET_64(num, pos, to_read) (((num) >> (64 - (pos) - (to_read))) & ((1ULL << (to_read)) - 1))
 // Merge the first 2/4/8 bytes of a byte array into one integer, first byte most significant.
 #define BYTES_MERGE_2(num) (((num)[0] << 8) | (num)[1])
 // The bytes are widened to uint32_t before shifting: a plain byte promotes to int and a first
 // byte >= 0x80 shifted by 24 would end up in the sign bit (negative result, sign extension when widened).
 #define BYTES_MERGE_4(num) (((uint32_t)(num)[0] << 24) | ((uint32_t)(num)[1] << 16) | ((uint32_t)(num)[2] << 8) | (uint32_t)(num)[3])
 #define BYTES_MERGE_8(num) (((uint64_t)(num)[0] << 56) | ((uint64_t)(num)[1] << 48) | ((uint64_t)(num)[2] << 40) | ((uint64_t)(num)[3] << 32) | ((uint64_t)(num)[4] << 24) | ((uint64_t)(num)[5] << 16) | ((uint64_t)(num)[6] << 8)  | ((uint64_t)(num)[7]))
 // Read cursor over a byte buffer with bit resolution.
 struct BitWalk {
    // Byte the cursor currently points at
    byte* pos;
    // Bit offset inside the current byte; bits_walk() normalizes this to the range 0-7
    uint32 bit_pos;
 };
 inline
 void bits_walk(BitWalk* stream, uint32 bits_to_walk)
 {
    // Advance the cursor by bits_to_walk bits: whole bytes move the byte pointer,
    // the remainder (0-7) stays as the bit offset inside the new current byte.
    uint32 total_bits = stream->bit_pos + bits_to_walk;
    stream->pos += total_bits >> 3;
    stream->bit_pos = total_bits & 7;
 }
 inline
 void bits_flush(BitWalk* stream)
 {
    // Already at a byte boundary, nothing to discard
    if (stream->bit_pos == 0) {
        return;
    }
    // Drop the remaining bits of the current byte and continue at the next byte
    ++stream->pos;
    stream->bit_pos = 0;
 }
 // inline
 // uint8 bits_consume_8(BitWalk* stream, uint32 bits_to_consume)
 // {
 //     uint8 result;
 //     uint32 remaining = 8 - stream->bit_pos;
 //     uint32 range_1 = bits_to_consume >= remaining
 //         ? remaining
 //         : bits_to_consume;
 //     result = (*stream->pos >> (remaining - range_1)) & ((1 << range_1) - 1);
 //     stream->bit_pos += range_1;
 //     if (bits_to_consume < remaining) {
 //         return result;
 //     }
 //     ++stream->pos;
 //     stream->bit_pos = 0;
 //     bits_to_consume -= range_1;
 //     /*
 //     uint32 full_bytes = bits_to_consume / 8;
 //     if (full_bytes > 0) {
 //         for (int i = 0; i < full_bytes; ++i) {
 //             result = (result << 8) | *stream->pos;
 //             ++stream->pos;
 //         }
 //     }
 //     */
 //     if (bits_to_consume == 0) {
 //         return result;
 //     }
 //     stream->bit_pos += bits_to_consume;
 //     return (result << bits_to_consume) | ((*stream->pos >> (8 - bits_to_consume)) & ((1 << bits_to_consume) - 1));
 // }
 // inline
 // uint16 bits_consume_16(BitWalk* stream, uint32 bits_to_consume)
 // {
 //     uint16 result;
 //     uint32 remaining = 8 - stream->bit_pos;
 //     uint32 range_1 = bits_to_consume >= remaining
 //         ? remaining
 //         : bits_to_consume;
 //     result = (*stream->pos >> (remaining - range_1)) & ((1 << range_1) - 1);
 //     stream->bit_pos += range_1;
 //     if (bits_to_consume < remaining) {
 //         return result;
 //     }
 //     ++stream->pos;
 //     stream->bit_pos = 0;
 //     bits_to_consume -= range_1;
 //     uint32 full_bytes = bits_to_consume / 8;
 //     if (full_bytes > 0) {
 //         for (int i = 0; i < full_bytes; ++i) {
 //             result = (result << 8) | *stream->pos;
 //             ++stream->pos;
 //         }
 //     }
 //     uint32 range_2 = bits_to_consume - full_bytes * 8;
 //     if (range_2 == 0) {
 //         return result;
 //     }
 //     stream->bit_pos += range_2;
 //     return (result << range_2) | ((*stream->pos >> (8 - range_2)) & ((1 << range_2) - 1));
 // }
 // inline
 // uint32 bits_consume_32(BitWalk* stream, uint32 bits_to_consume)
 // {
 //     uint32 result;
 //     uint32 remaining = 8 - stream->bit_pos;
 //     uint32 range_1 = bits_to_consume >= remaining
 //         ? remaining
 //         : bits_to_consume;
 //     result = (*stream->pos >> (remaining - range_1)) & ((1 << range_1) - 1);
 //     stream->bit_pos += range_1;
 //     if (bits_to_consume < remaining) {
 //         return result;
 //     }
 //     ++stream->pos;
 //     stream->bit_pos = 0;
 //     bits_to_consume -= range_1;
 //     uint32 full_bytes = bits_to_consume / 8;
 //     if (full_bytes > 0) {
 //         for (int i = 0; i < full_bytes; ++i) {
 //             result = (result << 8) | *stream->pos;
 //             ++stream->pos;
 //         }
 //     }
 //     uint32 range_2 = bits_to_consume - full_bytes * 8;
 //     if (range_2 == 0) {
 //         return result;
 //     }
 //     stream->bit_pos += range_2;
 //     return (result << range_2) | ((*stream->pos >> (8 - range_2)) & ((1 << range_2) - 1));
 // }
 // inline
 // uint64 bits_consume_64(BitWalk* stream, uint32 bits_to_consume)
 // {
 //     uint64 result;
 //     uint32 remaining = 8 - stream->bit_pos;
 //     uint32 range_1 = bits_to_consume >= remaining
 //         ? remaining
 //         : bits_to_consume;
 //     result = (*stream->pos >> (remaining - range_1)) & ((1 << range_1) - 1);
 //     stream->bit_pos += range_1;
 //     if (bits_to_consume < remaining) {
 //         return result;
 //     }
 //     ++stream->pos;
 //     stream->bit_pos = 0;
 //     bits_to_consume -= range_1;
 //     uint32 full_bytes = bits_to_consume / 8;
 //     if (full_bytes > 0) {
 //         for (int i = 0; i < full_bytes; ++i) {
 //             result = (result << 8) | *stream->pos;
 //             ++stream->pos;
 //         }
 //     }
 //     uint32 range_2 = bits_to_consume - full_bytes * 8;
 //     if (range_2 == 0) {
 //         return result;
 //     }
 //     stream->bit_pos += range_2;
 //     return (result << range_2) | ((*stream->pos >> (8 - range_2)) & ((1 << range_2) - 1));
 // }
 // uint8 bits_peek_8(BitWalk* stream, uint32 bits_to_consume) {
 //     byte* pos = stream->pos;
 //     byte bit_pos = stream->bit_pos;
 //     uint8 bits = bits_consume_8(stream, bits_to_consume);
 //     stream->pos = pos;
 //     stream->bit_pos = bit_pos;
 //     return bits;
 // }
 // uint16 bits_peek_16(BitWalk* stream, uint32 bits_to_consume) {
 //     byte* pos = stream->pos;
 //     byte bit_pos = stream->bit_pos;
 //     uint16 bits = bits_consume_16(stream, bits_to_consume);
 //     stream->pos = pos;
 //     stream->bit_pos = bit_pos;
 //     return bits;
 // }
 // uint32 bits_peek_32(BitWalk* stream, uint32 bits_to_consume) {
 //     byte* pos = stream->pos;
 //     byte bit_pos = stream->bit_pos;
 //     uint32 bits = bits_consume_32(stream, bits_to_consume);
 //     stream->pos = pos;
 //     stream->bit_pos = bit_pos;
 //     return bits;
 // }
 // uint64 bits_peek_64(BitWalk* stream, uint32 bits_to_consume) {
 //     byte* pos = stream->pos;
 //     byte bit_pos = stream->bit_pos;
 //     uint64 bits = bits_consume_64(stream, bits_to_consume);
 //     stream->pos = pos;
 //     stream->bit_pos = bit_pos;
 //     return bits;
 // }
 inline
 uint32 bytes_merge(byte b0, byte b1, byte b2, byte b3) {
@ -77,55 +315,8 @@ inline int find_first_set_bit(int value) {
    #endif
 }
 inline
-byte get_bits(byte data, int bits_to_read, int start_pos)
+uint32 bits_reverse(uint32 data, uint32 count)
 {
    byte mask = (1 << bits_to_read) - 1;
    return (data >> (8 - start_pos - bits_to_read)) & mask;
 }
 inline
 uint64 get_bits(const byte* data, int bits_to_read, int start_pos)
 {
    if (bits_to_read <= 0 || bits_to_read > sizeof(uint64)) {
        return 0;
    }
    int byte_index = start_pos / 8;
    int bit_offset = start_pos % 8;
    uint64_t mask = (1ULL << bits_to_read) - 1;
    uint64_t result = 0;
    int bits_read = 0;
    while (bits_read < bits_to_read) {
        int bits_in_current_byte = 8 - bit_offset;
        int bits_to_take = bits_to_read - bits_read;
        if (bits_to_take > bits_in_current_byte) {
            bits_to_take = bits_in_current_byte;
        }
        uint8_t current_byte = data[byte_index];
        current_byte >>= bit_offset;
        current_byte &= (1 << bits_to_take) - 1;
        result |= ((uint64_t)current_byte << bits_read);
        bits_read += bits_to_take;
        bit_offset = 0;
        byte_index++;
    }
    result &= mask;
    return result;
 }
 inline
 uint32 reverse_bits(uint32 data, uint32 count)
 {
    uint32 reversed = 0;
    for (uint32 i = 0; i <= (count / 2); ++i) {
--- a/utils/EndianUtils.h
+++ b/utils/EndianUtils.h
@ -11,6 +11,10 @@
 #include "../stdlib/Types.h"
 // Reverse the byte order of a 16/32/64 bit value.
 // Every byte lane is masked with an unsigned constant so that
 //  - integer promotion cannot leave bits above the intended width (e.g. 0x1234 -> 0x3412, not 0x123412)
 //  - signed inputs do not smear their sign bit into the result through >>
 // The argument is evaluated several times, do not pass expressions with side effects.
 #define SWAP_ENDIAN_16(val) ((((val) & 0x00FFU) << 8) | (((val) >> 8) & 0x00FFU))
 #define SWAP_ENDIAN_32(val) ((((val) & 0x000000FFU) << 24) | (((val) & 0x0000FF00U) << 8) | (((val) >> 8) & 0x0000FF00U) | (((val) >> 24) & 0x000000FFU))
 #define SWAP_ENDIAN_64(val) ((((val) & 0x00000000000000FFULL) << 56) | (((val) & 0x000000000000FF00ULL) << 40) | (((val) & 0x0000000000FF0000ULL) << 24) | (((val) & 0x00000000FF000000ULL) << 8) | (((val) & 0x000000FF00000000ULL) >> 8) | (((val) & 0x0000FF0000000000ULL) >> 24) | (((val) & 0x00FF000000000000ULL) >> 40) | (((val) >> 56) & 0x00000000000000FFULL))
 // Automatically perform endian swap if necessary
 // If we are on little endian (e.g. Win32) we swap big endian data but not little endian
 #if _WIN32 || __LITTLE_ENDIAN
@ -29,79 +33,71 @@ bool is_little_endian()
 }
 inline
-uint16 endian_swap(const uint16* val)
+uint16 endian_swap(uint16 val)
 {
-    uint16 v = *val;
+    return ((val << 8) | (val >> 8));
    return ((v << 8) | (v >> 8));
 }
 inline
-int16 endian_swap(const int16* val)
+int16 endian_swap(int16 val)
 {
-    uint16 v = (uint16) (*val);
+    return (int16) ((val << 8) | (val >> 8));
    return (int16) ((v << 8) | (v >> 8));
 }
 inline
-uint32 endian_swap(const uint32* val)
+uint32 endian_swap(uint32 val)
 {
-    uint32 v = *val;
+    return ((val << 24)
-    return ((v << 24)
+        | ((val & 0xFF00) << 8)
-        | ((v & 0xFF00) << 8)
+        | ((val >> 8) & 0xFF00)
-        | ((v >> 8) & 0xFF00)
+        | (val >> 24));
        | (v >> 24));
 }
 inline
-int32 endian_swap(const int32* val)
+int32 endian_swap(int32 val)
 {
-    uint32 v = (uint32) (*val);
+    return (int32) ((val << 24)
-    return (int32) ((v << 24)
+        | ((val & 0xFF00) << 8)
-        | ((v & 0xFF00) << 8)
+        | ((val >> 8) & 0xFF00)
-        | ((v >> 8) & 0xFF00)
+        | (val >> 24));
        | (v >> 24));
 }
 inline
-uint64 endian_swap(const uint64* val)
+uint64 endian_swap(uint64 val)
 {
-    uint64 v = *val;
+    return ((val << 56)
-    return ((v << 56)
+        | ((val & 0x000000000000FF00ULL) << 40)
-        | ((v & 0x000000000000FF00ULL) << 40)
+        | ((val & 0x0000000000FF0000ULL) << 24)
-        | ((v & 0x0000000000FF0000ULL) << 24)
+        | ((val & 0x00000000FF000000ULL) << 8)
-        | ((v & 0x00000000FF000000ULL) << 8)
+        | ((val & 0x000000FF00000000ULL) >> 8)
-        | ((v & 0x000000FF00000000ULL) >> 8)
+        | ((val & 0x0000FF0000000000ULL) >> 24)
-        | ((v & 0x0000FF0000000000ULL) >> 24)
+        | ((val & 0x00FF000000000000ULL) >> 40)
-        | ((v & 0x00FF000000000000ULL) >> 40)
+        | (val >> 56));
        | (v >> 56));
 }
 inline
-int64 endian_swap(const int64* val)
+int64 endian_swap(int64 val)
 {
-    uint64 v = (uint64) (*val);
+    return (int64) ((val << 56)
-    return (int64) ((v << 56)
+        | ((val & 0x000000000000FF00ULL) << 40)
-        | ((v & 0x000000000000FF00ULL) << 40)
+        | ((val & 0x0000000000FF0000ULL) << 24)
-        | ((v & 0x0000000000FF0000ULL) << 24)
+        | ((val & 0x00000000FF000000ULL) << 8)
-        | ((v & 0x00000000FF000000ULL) << 8)
+        | ((val & 0x000000FF00000000ULL) >> 8)
-        | ((v & 0x000000FF00000000ULL) >> 8)
+        | ((val & 0x0000FF0000000000ULL) >> 24)
-        | ((v & 0x0000FF0000000000ULL) >> 24)
+        | ((val & 0x00FF000000000000ULL) >> 40)
-        | ((v & 0x00FF000000000000ULL) >> 40)
+        | (val >> 56));
        | (v >> 56));
 }
 inline
-float endian_swap(const float* val)
+float endian_swap(float val)
 {
-    uint32* ival = (uint32 *) val;
+    return (float) endian_swap(val);
    return (float) endian_swap(ival);
 }
 inline
-double endian_swap(const double* val)
+double endian_swap(double val)
 {
-    uint64* ival = (uint64 *) val;
+    return (double) endian_swap(val);
    return (double) endian_swap(ival);
 }
 #endif
--- a/utils/TestUtils.h
+++ b/utils/TestUtils.h
@ -114,10 +114,16 @@ void update_timing_stat(TimingStat *stat)
 #if DEBUG
    #define ASSERT_SIMPLE(a)                             \
        if (!(a)) {                                      \
-            *(volatile int *)0 = 0;                      \
+            *(volatile int *) 0 = 0;                     \
        }
    #define ASSERT_SIMPLE_CONST(a)                       \
        if constexpr (!(a)) {                            \
            *(volatile int *) 0 = 0;                     \
        }
 #else
    #define ASSERT_SIMPLE(a) ((void) 0)
    #define ASSERT_SIMPLE_CONST(a) ((void) 0)
 #endif
 #define ASSERT_TRUE(a)                                   \