From e88840f0fa00e8d2eb23d4db23b1fb5dab3c5b0a Mon Sep 17 00:00:00 2001
From: Dennis Eichhorn <spl1nes.com@googlemail.com>
Date: Mon, 23 Sep 2024 04:34:31 +0200
Subject: [PATCH] implement png BUT not working since not debugged yet

---
 image/Bitmap.h                     |  50 ++-
 image/Image.cpp                    |   2 +-
 image/Image.h                      |   7 +-
 image/Png.h                        | 486 +++++++++++++++++++----------
 image/Tga.h                        |  49 ++-
 object/Texture.h                   |   3 -
 platform/win32/audio/DirectSound.h |  10 +
 platform/win32/audio/XAudio2.h     |  34 +-
 utils/BitUtils.h                   | 287 ++++++++++++++---
 utils/EndianUtils.h                |  84 +++--
 utils/TestUtils.h                  |   8 +-
 11 files changed, 715 insertions(+), 305 deletions(-)

diff --git a/image/Bitmap.h b/image/Bitmap.h
index ebef356..b1578dc 100644
--- a/image/Bitmap.h
+++ b/image/Bitmap.h
@@ -277,41 +277,65 @@ void image_bmp_generate(const FileBody* src_data, Image* image)
 
     image->width = src.dib_header.width;
     image->height = src.dib_header.height;
-    image->length = image->width * image->height;
+    image->pixel_count = image->width * image->height;
 
     // rows are 4 bytes multiples in length
     uint32 width = ROUND_TO_NEAREST(src.dib_header.width, 4);
 
     uint32 pixel_bytes = src.dib_header.bits_per_pixel / 8;
-    if (image->order_pixels == IMAGE_PIXEL_ORDER_BGRA) {
-        memcpy((void *) image->pixels, src.pixels, image->length * pixel_bytes);
+    byte alpha_offset = pixel_bytes > 3;
+
+    image->has_alpha |= (bool) alpha_offset;
+
+    if (image->order_pixels == IMAGE_PIXEL_ORDER_BGRA
+        && image->order_rows == IMAGE_ROW_ORDER_BOTTOM_TO_TOP
+    ) {
+        // @bug This doesn't consider the situation where we want alpha as a setting but the img doesn't have it
+        // @bug This also copies possible padding which will corrupt the image
+        memcpy((void *) image->pixels, src.pixels, image->pixel_count * pixel_bytes);
 
         return;
     }
 
-    byte alpha_offset = pixel_bytes == 3 ? 0 : 1;
+    uint32 pixel_rgb_bytes = pixel_bytes - alpha_offset;
 
-    uint32 row_pos1 = 0;
-    uint32 row_pos2 = 0;
+    uint32 row_pos1;
+    uint32 row_pos2;
+
+    uint32 width_pixel_bytes = width * pixel_bytes;
 
     for (uint32 y = 0; y < src.dib_header.height; ++y) {
+        row_pos1 = y * width_pixel_bytes;
+
+        if (image->order_rows == IMAGE_ROW_ORDER_TOP_TO_BOTTOM) {
+            row_pos2 = (src.dib_header.height - y - 1) * width_pixel_bytes;
+        } else {
+            row_pos2 = y * width_pixel_bytes;
+        }
+
         for (uint32 x = 0; x < width; ++x) {
             if (x >= image->width) {
-                // we don't care about the padding
+                // Bitmaps may have padding at the end of the row
+                // We don't care about that
                 continue;
             }
 
-            row_pos1 = y * width * pixel_bytes;
-            row_pos2 = (src.dib_header.height - y - 1) * width * pixel_bytes;
-
             // Invert byte order
-            for (uint32 i = 0; i < pixel_bytes - alpha_offset; ++i) {
-                image->pixels[row_pos1 + x * pixel_bytes + i] = src.pixels[row_pos2 + x * pixel_bytes + pixel_bytes - alpha_offset - i];
+            if (image->order_pixels == IMAGE_PIXEL_ORDER_RGBA) {
+                for (uint32 i = 0; i < pixel_rgb_bytes; ++i) { // mirror the color bytes: dest byte i comes from source byte (pixel_rgb_bytes - 1 - i)
+                    image->pixels[row_pos1 + x * pixel_bytes + i] = src.pixels[row_pos2 + x * pixel_bytes + pixel_rgb_bytes - 1 - i];
+                }
+            } else {
+                for (uint32 i = 0; i < pixel_rgb_bytes; ++i) {
+                    image->pixels[row_pos1 + x * pixel_bytes + i] = src.pixels[row_pos2 + x * pixel_bytes + i];
+                }
             }
 
-            // Add alpha channel at end
+            // Add alpha channel at end of every RGB value
             if (alpha_offset > 0) {
                 image->pixels[row_pos1 + x * pixel_bytes + 3] = src.pixels[row_pos2 + x * pixel_bytes + pixel_bytes + 3];
+            } else if (image->has_alpha) {
+                image->pixels[row_pos1 + x * pixel_bytes + 3] = 0xFF;
             }
         }
     }
diff --git a/image/Image.cpp b/image/Image.cpp
index a6e8282..6810718 100644
--- a/image/Image.cpp
+++ b/image/Image.cpp
@@ -29,7 +29,7 @@ void image_from_file(RingMemory* ring, const char* path, Image* image)
     file_read(path, &file, ring);
 
     if (str_ends_with(path, ".png")) {
-        image_png_generate(&file, image);
+        image_png_generate(&file, image, ring);
     } else if (str_ends_with(path, ".tga")) {
         image_tga_generate(&file, image);
     } else if (str_ends_with(path, ".bmp")) {
diff --git a/image/Image.h b/image/Image.h
index 7e4bfd8..fef1705 100644
--- a/image/Image.h
+++ b/image/Image.h
@@ -17,11 +17,16 @@
 #define IMAGE_ROW_ORDER_TOP_TO_BOTTOM 0
 #define IMAGE_ROW_ORDER_BOTTOM_TO_TOP 1
 
+// This struct also functions as a setting on how to load the image data
+//      has_alpha, if set, forces an alpha channel even for bitmaps
+//      order_pixels defines how the pixels should be ordered
+//      order_rows defines how the rows should be ordered
 struct Image {
     uint32 width;
     uint32 height;
-    uint32 length;
+    uint32 pixel_count;
 
+    // Image settings
     bool has_alpha;
     byte order_pixels; // RGBA vs BGRA
     byte order_rows; // top-to-bottom vs bottom-to-top
diff --git a/image/Png.h b/image/Png.h
index a710f77..c0569a7 100644
--- a/image/Png.h
+++ b/image/Png.h
@@ -7,6 +7,7 @@
  * @link      https://jingga.app
  *
  * png: https://www.w3.org/TR/2003/REC-PNG-20031110/
+ * png: https://www.w3.org/TR/PNG-Chunks.html
  * zlib: https://www.ietf.org/rfc/rfc1950.txt
  * deflate: https://www.ietf.org/rfc/rfc1951.txt
  */
@@ -15,7 +16,7 @@
 
 #include <string.h>
 #include "../stdlib/Types.h"
-#include "../utils/Utils.h"
+#include "../utils/BitUtils.h"
 #include "../utils/EndianUtils.h"
 #include "Image.h"
 
@@ -23,31 +24,66 @@
 #define PNG_HEADER_SIZE 8
 
 struct PngHeader {
-    byte signature[8];
+    uint8 signature[8];
 };
 
+/*
+The following table describes the chunk layout.
+Please note that we do NOT support most of this
+
+Critical chunks (order is defined):
+
+    Name  Multiple  Ordering constraints
+    IHDR    No      Must be first
+    PLTE    No      Before IDAT (optional)
+    IDAT    Yes     Multiple IDATs must be consecutive
+    IEND    No      Must be last
+
+Ancillary chunks (order is not defined):
+
+    Name  Multiple  Ordering constraints
+    cHRM    No      Before PLTE and IDAT
+    gAMA    No      Before PLTE and IDAT
+    iCCP    No      Before PLTE and IDAT
+    sBIT    No      Before PLTE and IDAT
+    sRGB    No      Before PLTE and IDAT
+    bKGD    No      After PLTE, before IDAT
+    hIST    No      After PLTE, before IDAT
+    tRNS    No      After PLTE, before IDAT
+    pHYs    No      Before IDAT
+    sPLT    Yes     Before IDAT
+    tIME    No      None
+    iTXt    Yes     None
+    tEXt    Yes     None
+    zTXt    Yes     None
+*/
+#define PNG_CHUNK_SIZE_MIN 12
+
 struct PngChunk {
     uint32 length;
     uint32 type;
+    // +data here, can be 0
     uint32 crc;
 };
 
+// Special chunk
+#define PNG_IHDR_SIZE 25
 struct PngIHDR {
     uint32 length;
     uint32 type;
     uint32 width;
     uint32 height;
-    byte bit_depth;
-    byte colory_type;
-    byte compression;
-    byte filter;
-    byte interlace;
+    uint8 bit_depth;
+    uint8 colory_type;
+    uint8 compression;
+    uint8 filter;
+    uint8 interlace;
     uint32 crc;
 };
 
 struct PngIDATHeader {
-    byte zlib_method_flag;
-    byte add_flag;
+    uint8 zlib_method_flag;
+    uint8 add_flag;
 };
 
 struct Png {
@@ -55,10 +91,10 @@ struct Png {
     PngIHDR ihdr;
 
     // Encoded pixel data
-    byte* pixels; // WARNING: This is not the owner of the data. The owner is the FileBody
+    uint8* pixels; // WARNING: This is not the owner of the data. The owner is the FileBody
 
     uint32 size;
-    byte* data; // WARNING: This is not the owner of the data. The owner is the FileBody
+    uint8* data; // WARNING: This is not the owner of the data. The owner is the FileBody
 };
 
 struct PngHuffmanEntry {
@@ -72,7 +108,7 @@ struct PngHuffman {
     PngHuffmanEntry entries[32768]; // 2^15
 };
 
-static const byte PNG_SIGNATURE[] = {0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A};
+static const uint8 PNG_SIGNATURE[] = {0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A};
 static const uint32 HUFFMAN_BIT_COUNTS[][2] = {{143, 8}, {255, 9}, {279, 7}, {287, 8}, {319, 5}};
 static const uint32 HUFFMAN_CODE_LENGTH_ALPHA[] = {
     16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
@@ -91,7 +127,7 @@ static const PngHuffmanEntry PNG_DIST_EXTRA[] = {
     {4097, 11}, {6145, 11}, {8193, 12}, {12289, 12}, {16385, 13}, {24577, 13}
 };
 
-void huffman_png_compute(uint32 symbol_count, uint32* symbol_code_length, PngHuffman* huff)
+void huffman_png_compute(uint32 symbol_count, const uint32* __restrict symbol_code_length, PngHuffman* huff)
 {
     uint32 code_length_hist[16] = {};
     for (uint32 i = 0; i < symbol_count; ++i) {
@@ -118,7 +154,7 @@ void huffman_png_compute(uint32 symbol_count, uint32* symbol_code_length, PngHuf
 
         for (uint32 j = 0; j < entries; ++j) {
             uint32 base_index = (code << bits) | j;
-            uint32 index = reverse_bits(base_index, huff->max_code_length);
+            uint32 index = bits_reverse(base_index, huff->max_code_length);
 
             PngHuffmanEntry* entry = huff->entries + index;
 
@@ -129,41 +165,140 @@ void huffman_png_compute(uint32 symbol_count, uint32* symbol_code_length, PngHuf
 }
 
 inline
-PngHuffmanEntry huffman_png_decode(PngHuffman* huff, const byte* data, int pos)
+uint16 huffman_png_decode(PngHuffman* __restrict huff, BitWalk* __restrict stream)
 {
-    uint32 index = (uint32) get_bits(data, huff->max_code_length, pos);
-    return huff->entries[index];
+    // huff->max_code_length has a length of a maximum of 15 -> span a maximum of 3 bytes
+    uint32 index = SWAP_ENDIAN_BIG(BITS_GET_32(BYTES_MERGE_4(stream->pos), stream->bit_pos, huff->max_code_length));
+
+    bits_walk(stream, huff->entries[index].bits_used);
+
+    return huff->entries[index].symbol;
 }
 
-void png_filter_reconstruct(uint32 width, uint32 height, const byte* decompressed, byte* finalized, int steps)
+inline
+uint8 png_filter_1_and_2(const uint8* __restrict x, const uint8* __restrict a, uint32 channel)
 {
-    uint32 zero = 0;
-    byte* prev_row = NULL;
-    byte prev_row_advance = 0;
+    return x[channel] + a[channel];
+}
+
+inline
+uint8 png_filter_3(const uint8* x, const uint8* a, const uint8* b, uint32 channel)
+{
+    return x[channel] + (uint8) (((uint32) a[channel] + (uint32) b[channel]) / 2); // a and b are widened before adding so their sum cannot wrap at 8 bits; the result truncates to uint8
+}
+
+inline
+uint8 png_filter_4(const uint8* x, const uint8* a_full, const uint8* b_full, const uint8* c_full, uint32 channel)
+{
+    int32 a = (int32) a_full[channel]; // left neighbour
+    int32 b = (int32) b_full[channel]; // neighbour above
+    int32 c = (int32) c_full[channel]; // upper-left neighbour
+    int32 p = a + b - c;
+    int32 pa = p >= a ? p - a : a - p;
+    int32 pb = p >= b ? p - b : b - p;
+    int32 pc = p >= c ? p - c : c - p;
+    // Paeth predictor: pick whichever of a, b, c is closest to p; ties must resolve in the order a, b, c
+    int32 paeth;
+    if (pa <= pb && pa <= pc) {
+        paeth = a;
+    } else if (pb <= pc) {
+        paeth = b;
+    } else {
+        paeth = c;
+    }
+
+    return x[channel] + (uint8) paeth;
+}
+
+void png_filter_reconstruct(uint32 width, uint32 height, const uint8* decompressed, uint8* finalized, int steps = 8)
+{
+    uint64 zero = 0;
+    uint8* prev_row = (uint8 *) &zero;
+    uint8 prev_row_advance = 0;
+
+    const uint8* src = decompressed;
+    uint8* dest = finalized;
 
     for (uint32 y = 0; y < height; ++y) {
-        byte filter = *decompressed;
-        byte* current_row = 0; // @todo need actual value
+        uint8 filter = *decompressed;
+        uint8* current_row = dest;
 
         switch (filter) {
             case 0: {
-                    memcpy(finalized + y * width, decompressed + y * width, width);
+                    memcpy(dest, src, width * sizeof(uint32));
+                    dest += 4 * width;
+                    src += 4 * width;
                 } break;
             case 1: {
-                    // no simd possible, well 4 + 4 probably not worth it
+                    uint32 a_pixel = 0;
+                    for (uint32 x = 0; x < width; ++x) {
+                        // png_filter_1_and_2
+                        dest[0] = src[0] + ((uint8 *) &a_pixel)[0];
+                        dest[1] = src[1] + ((uint8 *) &a_pixel)[1];
+                        dest[2] = src[2] + ((uint8 *) &a_pixel)[2];
+                        dest[3] = src[3] + ((uint8 *) &a_pixel)[3];
 
+                        a_pixel = *(uint32 *) dest;
+
+                        dest += 4;
+                        src += 4;
+                    }
                 } break;
             case 2: {
+                    // @performance this is simd optimizable
                     // requires manual simd impl. since prev_row_advance can be 0 or 4
+                    uint8* b_pixel = prev_row;
+                    for (uint32 x = 0; x < width; ++x) {
+                        // png_filter_1_and_2
+                        dest[0] = src[0] + b_pixel[0];
+                        dest[1] = src[1] + b_pixel[1];
+                        dest[2] = src[2] + b_pixel[2];
+                        dest[3] = src[3] + b_pixel[3];
+
+                        b_pixel += prev_row_advance;
+
+                        dest += 4;
+                        src += 4;
+                    }
                 } break;
             case 3: {
-                    // no simd possible, well 4 + 4 probably not worth it
+                    uint32 a_pixel = 0;
+                    uint8* b_pixel = prev_row;
+                    for (uint32 x = 0; x < width; ++x) {
+                        // png_filter_3
+                        dest[0] = src[0] + (uint8) (((uint32) ((uint8 *) &a_pixel)[0] + (uint32) b_pixel[0]) / 2);
+                        dest[1] = src[1] + (uint8) (((uint32) ((uint8 *) &a_pixel)[1] + (uint32) b_pixel[1]) / 2);
+                        dest[2] = src[2] + (uint8) (((uint32) ((uint8 *) &a_pixel)[2] + (uint32) b_pixel[2]) / 2);
+                        dest[3] = src[3] + (uint8) (((uint32) ((uint8 *) &a_pixel)[3] + (uint32) b_pixel[3]) / 2);
+
+                        a_pixel = *(uint32 *) dest;
+                        b_pixel += prev_row_advance;
+
+                        dest += 4;
+                        src += 4;
+                    }
                 } break;
             case 4: {
-                    // no simd possible, well 4 + 4 probably not worth it
+                    uint32 a_pixel = 0;
+                    uint32 c_pixel = 0;
+                    uint8* b_pixel = prev_row;
+                    for (uint32 x = 0; x < width; ++x) {
+                        // png_filter_4
+                        dest[0] = png_filter_4(src, (uint8 *) &a_pixel, b_pixel, (uint8 *) &c_pixel, 0);
+                        dest[1] = png_filter_4(src, (uint8 *) &a_pixel, b_pixel, (uint8 *) &c_pixel, 1);
+                        dest[2] = png_filter_4(src, (uint8 *) &a_pixel, b_pixel, (uint8 *) &c_pixel, 2);
+                        dest[3] = png_filter_4(src, (uint8 *) &a_pixel, b_pixel, (uint8 *) &c_pixel, 3);
+
+                        a_pixel = *(uint32 *) dest;
+                        c_pixel = *(uint32 *) b_pixel;
+                        b_pixel += prev_row_advance;
+
+                        dest += 4;
+                        src += 4;
+                    }
                 } break;
             default: {
-
+                ASSERT_SIMPLE(false);
             }
         }
 
@@ -177,22 +312,30 @@ void generate_default_png_references(const FileBody* file, Png* png)
     png->size = (uint32) file->size;
     png->data = file->content;
 
-    if (png->size < 33) {
+    if (png->size < PNG_IHDR_SIZE + PNG_HEADER_SIZE) {
         // This shouldn't happen
+        ASSERT_SIMPLE(false);
         return;
     }
 
     // The first chunk MUST be IHDR -> we handle it here
-    memcpy(png, file->content, 29);
-    png->ihdr.crc = SWAP_ENDIAN_BIG((uint32 *) (file->content + 30));
+    ASSERT_SIMPLE_CONST(PNG_HEADER_SIZE + PNG_IHDR_SIZE == 33);
+    memcpy(png, file->content, PNG_HEADER_SIZE + PNG_IHDR_SIZE);
 
-    png->ihdr.length = SWAP_ENDIAN_BIG(&png->ihdr.length);
-    png->ihdr.type = SWAP_ENDIAN_BIG(&png->ihdr.type);
-    png->ihdr.width = SWAP_ENDIAN_BIG(&png->ihdr.width);
-    png->ihdr.height = SWAP_ENDIAN_BIG(&png->ihdr.height);
+    png->ihdr.length = SWAP_ENDIAN_BIG(png->ihdr.length);
+    png->ihdr.type = SWAP_ENDIAN_BIG(png->ihdr.type);
+    png->ihdr.width = SWAP_ENDIAN_BIG(png->ihdr.width);
+    png->ihdr.height = SWAP_ENDIAN_BIG(png->ihdr.height);
+    png->ihdr.crc = SWAP_ENDIAN_BIG(png->ihdr.crc);
 }
 
-bool image_png_generate(const FileBody* src_data, Image* image, int steps = 8)
+// Below you will often see code like SWAP_ENDIAN_BIG(BITS_GET_16(BYTES_MERGE_2()))
+//      1. Merge two bytes together creating a "new" data structure from which we can easily read bits
+//          1.1. This is required to read bits that cross multiple bytes
+//          1.2. Only if you read more than 8 bits will you need to merge 4 bytes
+//      2. Now we can retrieve the bits from this data structure at a position with a length
+//      3. Whenever we use the result as an integer (16 or 32 bits) we need to consider the endianness
+bool image_png_generate(const FileBody* src_data, Image* image, RingMemory* ring, int32 steps = 8)
 {
     // @performance We are generating the struct and then filling the data.
     //      There is some asignment/copy overhead
@@ -205,154 +348,169 @@ bool image_png_generate(const FileBody* src_data, Image* image, int steps = 8)
     //  3. temp pixel buffer (larger)
     //  4. final pixel buffer (already here)
 
+    // @todo Consider supporting all color types (0, 2, 3, 4, and 6)
+    //      A simple black and white image or an image without alpha should be supported
     if (src.ihdr.bit_depth != 8
         || src.ihdr.colory_type != 6
         || src.ihdr.compression != 0
         || src.ihdr.filter != 0
         || src.ihdr.interlace != 0
     ) {
-        // We don't support this type of png
+        // We don't support this type of png (see comment below)
+        ASSERT_SIMPLE(false);
+
+        /*
+        Color   Allowed     Interpretation
+        Type    Bit Depths
+
+        0       1,2,4,8,16  Each pixel is a grayscale sample.
+        2       8,16        Each pixel is an R,G,B triple.
+        3       1,2,4,8     Each pixel is a palette index, a PLTE chunk must appear.
+        4       8,16        Each pixel is a grayscale sample, followed by an alpha sample.
+        6       8,16        Each pixel is an R,G,B triple, followed by an alpha sample.
+        */
+
         return false;
     }
 
-    PngChunk chunk;
-    PngIDATHeader idat_header;
+    // @performance Could we probably avoid this? There is some overhead using this.
+    //      We are only using it because there might be situations where there is a bit overhang to another chunk
+    BitWalk stream;
+    // Note: If we would support more png formats this offset would be wrong
+    stream.pos = src_data->content + PNG_IHDR_SIZE + PNG_HEADER_SIZE;
+    stream.bit_pos = 0;
 
-    bool is_first_idat = true;
-
-    uint32 out_pos = 0;
-
-    // @question the following is a lot of data, should this be moved to heap?
     uint32 literal_length_dist_table[512];
 
-    PngHuffman literal_length_huffman;
-    literal_length_huffman.max_code_length = 15;
-    literal_length_huffman.count = 1 << literal_length_huffman.max_code_length;
+    PngHuffman* literal_length_huffman = (PngHuffman *) ring_get_memory(ring, sizeof(PngHuffman));
+    literal_length_huffman->max_code_length = 15;
+    literal_length_huffman->count = 1 << literal_length_huffman->max_code_length;
 
-    PngHuffman distance_huffman;
-    distance_huffman.max_code_length = 15;
-    distance_huffman.count = 1 << distance_huffman.max_code_length;
+    PngHuffman* distance_huffman = (PngHuffman *) ring_get_memory(ring, sizeof(PngHuffman));
+    distance_huffman->max_code_length = 15;
+    distance_huffman->count = 1 << distance_huffman->max_code_length;
 
-    PngHuffman dictionary_huffman;
-    dictionary_huffman.max_code_length = 7;
-    dictionary_huffman.count = 1 << dictionary_huffman.max_code_length;
+    PngHuffman* dictionary_huffman = (PngHuffman *) ring_get_memory(ring, sizeof(PngHuffman));
+    dictionary_huffman->max_code_length = 7;
+    dictionary_huffman->count = 1 << dictionary_huffman->max_code_length;
 
-    // i is the current byte to read
-    int i = 33;
+    // We need full width * height, since we don't know how much data this IDAT actually holds
+    uint8* finalized = ring_get_memory(ring, src.ihdr.width * src.ihdr.height * 4);
 
-    // r is the re-shift value in case we need to go back
-    // @todo r unused?
-    int r = 0;
+    // Needs some extra space
+    uint8* decompressed = ring_get_memory(ring, src.ihdr.width * src.ihdr.height * 4 + src.ihdr.height);
 
-    // b is the current bit to read
-    int b = 0;
+    uint8* dest = decompressed;
 
-    while(i < src.size) {
-        chunk.length = SWAP_ENDIAN_BIG((uint32 *) (src_data->content + i));
-        chunk.type = SWAP_ENDIAN_BIG((uint32 *) (src_data->content + i + 4));
+    // @bug We might not be able/allowed to simply iterate this loop below since data might be split across chunks
+    //      If that is the case we have to first create a linked list of all the actual data and then we perform the actions below on this linked list
+    //      This of course poses the challenge of handling the border between two list elements
+    //      Copying data would be slow so we ideally would like to iterate through that list and just handle the border
+    //      Since the border only becomes relevant at the beginning of every loop we should be fine, no?
 
+    uint8 BFINAL = 0;
+    while(stream.pos - src_data->content < src.size && BFINAL == 0) {
+        PngChunk chunk;
+        PngIDATHeader idat_header;
+
+        // @bug the code below doesn't need bit walk on the first loop, what about the second loop?
         // For our png reader, we only care about IDAT
         //  @question consider PLTE, tRNS, gAMA, iCCP
+        chunk.length = SWAP_ENDIAN_BIG(*((uint32 *) stream.pos));
+        stream.pos += sizeof(chunk.length);
+
+        chunk.type = SWAP_ENDIAN_BIG(*((uint32 *) stream.pos));
+        stream.pos += sizeof(chunk.type);
+
         if (chunk.type == 'IEND') {
+            // we arrived at the end of the file
             break;
         } else if (chunk.type != 'IDAT') {
-            // IDAT chunks are continuous and we don't care for anything else
-            if (!is_first_idat) {
-                break;
-            }
+            stream.pos += chunk.length + sizeof(chunk.crc); // some other chunk: skip its payload and CRC so the next iteration starts on a chunk header
 
-            i += chunk.length + 12;
             continue;
         }
 
-        if (is_first_idat) {
-            idat_header.zlib_method_flag = *(src_data->content + i + 8);
-            idat_header.add_flag = *(src_data->content + i + 9);
+        // @question Not sure if this below is actually the case
+        // @bug Is this even correct, we might have an overhang from the previous chunk
+        //  Then we need to:
+        //      read n bits from the previous chunk
+        //      move across the chunk header data
+        //      read another x bits from the new chunk
+        //
+        //  This means we cannot jump here (or better we need to check if the bit position is != 0)
+        // BUT WE MIGHT NOT CARE ABOUT MULTIPLE IDAT CHUNKS?
+        idat_header.zlib_method_flag = *stream.pos;
+        ++stream.pos;
 
-            byte CM = idat_header.zlib_method_flag & 0xF;
-            byte FDICT = (idat_header.add_flag >> 5) & 0x1;
+        idat_header.add_flag = *stream.pos;
+        ++stream.pos;
 
-            is_first_idat = false;
+        uint8 CM = idat_header.zlib_method_flag & 0xF;
+        uint8 FDICT = (idat_header.add_flag >> 5) & 0x1;
 
-            if (CM != 8 || FDICT != 0) {
-                return false;
-            }
-
-            i += 10;
+        if (CM != 8 || FDICT != 0) {
+            // Not supported
+            return false;
         }
 
-        // @bug The algorithm below works on "blocks".
-        //      Could it be possible that a block is spread accross 2 IDAT chunks?
-        //      If so this would be bad and break the code below
-        //      We could solve this by just having another counting variable and jump to the next block
+        // This data might be stored in the previous IDAT chunk?!
+        BFINAL = (uint8) SWAP_ENDIAN_BIG(BITS_GET_8(*stream.pos, stream.bit_pos, 1));
+        bits_walk(&stream, 1);
 
-        // start: src_data->content + i + 8
-        // end: src_data->content + i + 8 + length - 1
-
-        // DEFLATE Algorithm
-        // @bug the following 3 lines are wrong, they don't have to start at a bit 0/1
-        //      A block doesn't have to start at an byte boundary
-        byte BFINAL = (byte) get_bits(src_data->content + i, 1, b);
-        i += (b > 7 - 1);
-        b = (b + 1) & 7;
-
-        byte BTYPE = (byte) get_bits(src_data->content + i, 2, b);
-        i += (b > 7 - 2);
-        b = (b + 2) & 7;
+        uint32 BTYPE = SWAP_ENDIAN_BIG(BITS_GET_8(BYTES_MERGE_2(stream.pos), stream.bit_pos, 2));
+        bits_walk(&stream, 2);
 
         if (BTYPE == 0) {
-            // starts at byte boundary -> position = +1 of previous byte
-            if (b == 0) {
-                i -= 1;
-            }
+            // starts at byte boundary -> position = +1 of previous byte
+            bits_flush(&stream);
 
-            uint16 len = *((uint16 *) (src_data->content + i + 1));
+            uint16 len = *((uint16 *) stream.pos);
+            stream.pos += 2;
 
-            // @todo nlen unused?
-            uint16 nlen = *((uint16 *) (src_data->content + i + 3));
+            uint16 nlen = *((uint16 *) stream.pos);
+            stream.pos += 2;
 
-            memcpy(image->pixels + out_pos, src_data->content + i + 5, len);
-            out_pos += len;
+            ASSERT_SIMPLE(len == (uint16) ~nlen); // ~ promotes nlen to int, so truncate back to 16 bits before comparing
 
-            i += 5 + len;
-            b = 0;
+            memcpy(dest, stream.pos, len); // copy the stored bytes the stream points at, not the bytes of the pointer variable itself
+            dest += len; stream.pos += len; // both cursors move past the stored block
+        } else if (BTYPE == 3) {
+            // Invalid BTYPE
+            ASSERT_SIMPLE(false);
         } else {
             // @question is this even required or are we overwriting anyways?
-            memset(&literal_length_dist_table, 0, 512 * 4);
-            memset(&literal_length_huffman.entries, 0, sizeof(PngHuffmanEntry) * 15);
-            memset(&distance_huffman.entries, 0, sizeof(PngHuffmanEntry) * 15);
-            memset(&dictionary_huffman.entries, 0, sizeof(PngHuffmanEntry) * 7);
+            memset(&literal_length_dist_table, 0, sizeof(literal_length_dist_table));
+            memset(literal_length_huffman->entries, 0, sizeof(PngHuffmanEntry) * literal_length_huffman->max_code_length);
+            memset(distance_huffman->entries, 0, sizeof(PngHuffmanEntry) * distance_huffman->max_code_length);
+            memset(dictionary_huffman->entries, 0, sizeof(PngHuffmanEntry) * dictionary_huffman->max_code_length);
 
             uint32 huffman_literal = 0;
             uint32 huffman_dist = 0;
 
             if (BTYPE == 2) {
                 // Compressed with dynamic Huffman code
-                huffman_literal = (uint32) get_bits(src_data->content + i, 5, b);
-                i += (b > 7 - 5);
-                b = (b + 5) & 7;
+                huffman_literal = SWAP_ENDIAN_BIG(BITS_GET_16(BYTES_MERGE_2(stream.pos), stream.bit_pos, 5));
+                bits_walk(&stream, 5);
 
-                huffman_dist = (uint32) get_bits(src_data->content + i, 5, b);
-                i += (b > 7 - 5);
-                b = (b + 5) & 7;
+                huffman_dist = SWAP_ENDIAN_BIG(BITS_GET_16(BYTES_MERGE_2(stream.pos), stream.bit_pos, 5));
+                bits_walk(&stream, 5);
 
-                uint32 huffman_code_length = (uint32) get_bits(src_data->content + i, 4, b);
-                i += (b > 7 - 4);
-                b = (b + 4) & 7;
+                uint32 huffman_code_length = SWAP_ENDIAN_BIG(BITS_GET_16(BYTES_MERGE_2(stream.pos), stream.bit_pos, 4));
+                bits_walk(&stream, 4); // HCLEN is a 4 bit field, advance by exactly what was read
 
                 huffman_literal += 257;
                 huffman_dist += 1;
                 huffman_code_length += 4;
 
-                uint32 huffman_code_length_table[19] = {};
+                uint32 huffman_code_length_table[ARRAY_COUNT(HUFFMAN_CODE_LENGTH_ALPHA)] = {};
 
                 for (uint32 j = 0; j < huffman_code_length; ++j) {
-                    huffman_code_length_table[HUFFMAN_CODE_LENGTH_ALPHA[j]] = (uint32) get_bits(src_data->content + i, 3, b);
-                    i += (b > 7 - 3);
-                    b = (b + 3) & 7;
+                    huffman_code_length_table[HUFFMAN_CODE_LENGTH_ALPHA[j]] = SWAP_ENDIAN_BIG(BITS_GET_16(BYTES_MERGE_2(stream.pos), stream.bit_pos, 3));
+                    bits_walk(&stream, 3);
                 }
 
-                huffman_png_compute(19, huffman_code_length_table, &dictionary_huffman);
+                huffman_png_compute(ARRAY_COUNT(HUFFMAN_CODE_LENGTH_ALPHA), huffman_code_length_table, dictionary_huffman);
 
                 uint32 literal_length_count = 0;
                 uint32 length_count = huffman_literal + huffman_dist;
@@ -362,31 +520,26 @@ bool image_png_generate(const FileBody* src_data, Image* image, int steps = 8)
                     uint32 rep_count = 1;
                     uint32 rep_val = 0;
 
-                    PngHuffmanEntry dict = huffman_png_decode(&dictionary_huffman, src_data->content + i, b);
-                    i += (b + dict.bits_used) / 8;
-                    b = (b + dict.bits_used) & 7;
-
-                    uint32 encoded_length = dict.bits_used;
+                    uint32 encoded_length = huffman_png_decode(dictionary_huffman, &stream);
 
                     if (encoded_length <= 15) {
                         rep_val = encoded_length;
                     } else if (encoded_length == 16) {
-                        rep_count = 3 + (uint32) get_bits(src_data->content + i, 2, b);
-                        i += (b > 7 - 2);
-                        b = (b + 2) & 7;
+                        rep_count = 3 + SWAP_ENDIAN_BIG(BITS_GET_8(BYTES_MERGE_2(stream.pos), stream.bit_pos, 2));
+                        bits_walk(&stream, 2);
 
                         rep_val = literal_length_dist_table[literal_length_count - 1];
                     } else if (encoded_length == 17) {
-                        rep_count = 3 + (uint32) get_bits(src_data->content + i, 3, b);
-                        i += (b > 7 - 3);
-                        b = (b + 3) & 7;
+                        rep_count = 3 + SWAP_ENDIAN_BIG(BITS_GET_8(BYTES_MERGE_2(stream.pos), stream.bit_pos, 3));
+                        bits_walk(&stream, 3);
                     } else if (encoded_length == 18) {
-                        rep_count = 11 + (uint32) get_bits(src_data->content + i, 7, b);
-                        i += (b > 7 - 7);
-                        b = (b + 7) & 7;
+                        rep_count = 11 + SWAP_ENDIAN_BIG(BITS_GET_8(BYTES_MERGE_2(stream.pos), stream.bit_pos, 7));
+                        bits_walk(&stream, 7);
                     }
 
-                    memset(literal_length_dist_table + literal_length_count, rep_val, rep_count);
+                    while (rep_count--) {
+                        literal_length_dist_table[literal_length_count++] = rep_val;
+                    }
                 }
             } else if (BTYPE == 1) {
                 // Compressed with fixed Huffman code
@@ -394,7 +547,7 @@ bool image_png_generate(const FileBody* src_data, Image* image, int steps = 8)
                 huffman_dist = 32;
 
                 uint32 bit_index = 0;
-                for(uint32 range_index = 0; range_index < 5; ++range_index) {
+                for(uint32 range_index = 0; range_index < ARRAY_COUNT(HUFFMAN_BIT_COUNTS); ++range_index) {
                     uint32 bit_count = HUFFMAN_BIT_COUNTS[range_index][1];
                     uint32 last = HUFFMAN_BIT_COUNTS[range_index][0];
 
@@ -404,68 +557,65 @@ bool image_png_generate(const FileBody* src_data, Image* image, int steps = 8)
                 }
             }
 
-            huffman_png_compute(huffman_literal, literal_length_dist_table, &literal_length_huffman);
-            huffman_png_compute(huffman_dist, literal_length_dist_table + huffman_literal, &distance_huffman);
+            huffman_png_compute(huffman_literal, literal_length_dist_table, literal_length_huffman);
+            huffman_png_compute(huffman_dist, literal_length_dist_table + huffman_literal, distance_huffman);
 
             while (true) {
-                PngHuffmanEntry literal = huffman_png_decode(&literal_length_huffman, src_data->content + i, b);
-                i += (b + literal.bits_used) / 8;
-                b = (b + literal.bits_used) & 7;
-
-                uint32 literal_length = literal.bits_used;
-
+                uint32 literal_length = huffman_png_decode(literal_length_huffman, &stream);
                 if (literal_length == 256) {
                     break;
                 }
 
                 if (literal_length <= 255) {
-                    *(image->pixels + out_pos) = (byte) (literal_length & 0xFF);
-                    ++out_pos;
+                    *dest++ = (literal_length & 0xFF);
                 } else {
                     uint32 length_tab_index = literal_length - 257;
                     PngHuffmanEntry length_tab = PNG_LENGTH_EXTRA[length_tab_index];
                     uint32 length = length_tab.symbol;
 
                     if (length_tab.bits_used) {
-                        uint32 extra_bits = (uint32) get_bits(src_data->content + i, length_tab.bits_used, b);
-                        i += (b + length_tab.bits_used) / 8;
-                        b = (b + length_tab.bits_used) & 7;
+                        // @performance If we knew that bits_used is always <= 15 we could use more efficient MERGE/GET
+                        uint32 extra_bits = SWAP_ENDIAN_BIG(BITS_GET_32(BYTES_MERGE_4(stream.pos), stream.bit_pos, length_tab.bits_used));
+                        bits_walk(&stream, length_tab.bits_used);
 
                         length += extra_bits;
                     }
 
-                    PngHuffmanEntry tab = huffman_png_decode(&distance_huffman, src_data->content + i, b);
-                    i += (b + tab.bits_used) / 8;
-                    b = (b + tab.bits_used) & 7;
-
-                    uint32 dist_tab_index = tab.bits_used;
+                    uint32 dist_tab_index = huffman_png_decode(distance_huffman, &stream);
 
                     PngHuffmanEntry dist_tab = PNG_DIST_EXTRA[dist_tab_index];
                     uint32 dist = dist_tab.symbol;
 
                     if (dist_tab.bits_used) {
-                        uint32 extra_bits = (uint32) get_bits(src_data->content + i, dist_tab.bits_used, b);
-                        i += (b + dist_tab.bits_used) / 8;
-                        b = (b + dist_tab.bits_used) & 7;
+                        // @performance If we knew that bits_used is always <= 15 we could use more efficient MERGE/GET
+                        uint32 extra_bits = SWAP_ENDIAN_BIG(BITS_GET_32(BYTES_MERGE_4(stream.pos), stream.bit_pos, dist_tab.bits_used));
+                        bits_walk(&stream, dist_tab.bits_used);
 
                         dist += extra_bits;
                     }
 
-                    memcpy(image->pixels + out_pos, image->pixels + out_pos - dist, length);
+                    // @performance If dist >= length the ranges don't overlap and memcpy could replace this loop
+                    uint8* source = dest - dist;
+                    while (length--) {
+                        *dest++ = *source++;
+                    }
                 }
             }
         }
 
-        if (BFINAL == 0) {
-            break;
-        }
+        // Skip the CRC
+        stream.pos += sizeof(chunk.crc);
+        stream.bit_pos = 0;
     }
 
     image->width = src.ihdr.width;
     image->height = src.ihdr.height;
+    image->pixel_count = image->width * image->height;
+    image->has_alpha = true;
+    image->order_pixels = IMAGE_PIXEL_ORDER_RGBA;
+    image->order_rows = IMAGE_ROW_ORDER_TOP_TO_BOTTOM;
 
-    // @todo fix pixels parameter
-    png_filter_reconstruct(image->width, image->height, (byte *) image->pixels, (byte *) image->pixels, steps);
+    png_filter_reconstruct(src.ihdr.width, src.ihdr.height, decompressed, finalized, steps);
 
     return true;
 }
diff --git a/image/Tga.h b/image/Tga.h
index f4b38b9..857e9bb 100644
--- a/image/Tga.h
+++ b/image/Tga.h
@@ -85,36 +85,57 @@ void image_tga_generate(const FileBody* src_data, Image* image)
 
     image->width = src.header.width;
     image->height = src.header.height;
-    image->length = image->width * image->height;
+    image->pixel_count = image->width * image->height;
 
-    // @todo also handle bottom-top/top-bottom order here
     uint32 pixel_bytes = src.header.bits_per_pixel / 8;
-    if (image->order_pixels == IMAGE_PIXEL_ORDER_BGRA) {
-        memcpy((void *) image->pixels, src.pixels, image->length * pixel_bytes);
+    byte alpha_offset = pixel_bytes > 3;
+
+    image->has_alpha |= (bool) alpha_offset;
+
+    // The image order settings use the same values as the TGA header fields, so we can compare them directly
+    if (image->order_rows == src.header.vertical_ordering
+        && image->order_pixels == src.header.horizonal_ordering
+    ) {
+        // @bug This doesn't consider the situation where we want alpha as a setting but the img doesn't have it
+        memcpy((void *) image->pixels, src.pixels, image->pixel_count * pixel_bytes);
 
         return;
     }
 
-    byte alpha_offset = pixel_bytes == 3 ? 0 : 1;
     uint32 pixel_rgb_bytes = pixel_bytes - alpha_offset;
 
     uint32 row_pos1;
     uint32 row_pos2;
 
-    for (uint32 y = 0; y < src.header.height; ++y) {
-        for (uint32 x = 0; x < src.header.width; ++x) {
-            row_pos1 = y * image->width * pixel_bytes;
-            row_pos2 = src.header.vertical_ordering == 0
-                ? y * image->width * pixel_bytes
-                : (image->height - y - 1) * image->width * pixel_bytes;
+    uint32 width_pixel_bytes = src.header.width * pixel_bytes;
 
-            for (uint32 i = 0; i < pixel_rgb_bytes; ++i) {
-                image->pixels[row_pos1 + x * pixel_bytes + i] = src.pixels[row_pos2 + x * pixel_bytes + pixel_rgb_bytes - i];
+    for (uint32 y = 0; y < src.header.height; ++y) {
+        row_pos1 = y * image->width * pixel_bytes;
+
+        if ((image->order_rows == IMAGE_ROW_ORDER_TOP_TO_BOTTOM && src.header.vertical_ordering == 1)
+            || (image->order_rows == IMAGE_ROW_ORDER_BOTTOM_TO_TOP && src.header.vertical_ordering == 0)
+        ) {
+            row_pos2 = (src.header.height - y - 1) * image->width * pixel_bytes;
+        } else {
+            row_pos2 = y * width_pixel_bytes;
+        }
+
+        for (uint32 x = 0; x < src.header.width; ++x) {
+            if (image->order_pixels == src.header.horizonal_ordering) {
+                for (uint32 i = 0; i < pixel_rgb_bytes; ++i) {
+                    image->pixels[row_pos1 + x * pixel_bytes + i] = src.pixels[row_pos2 + x * pixel_bytes + i];
+                }
+            } else {
+                for (uint32 i = 0; i < pixel_rgb_bytes; ++i) {
+                    image->pixels[row_pos1 + x * pixel_bytes + i] = src.pixels[row_pos2 + x * pixel_bytes + pixel_rgb_bytes - i];
+                }
             }
 
-            // Add alpha channel at end
+            // Add alpha channel at end of every RGB value
             if (alpha_offset > 0) {
                 image->pixels[row_pos1 + x * pixel_bytes + 3] = src.pixels[row_pos2 + x * pixel_bytes + pixel_bytes + 3];
+            } else if (image->has_alpha) {
+                image->pixels[row_pos1 + x * pixel_bytes + 3] = 0xFF;
             }
         }
     }
diff --git a/object/Texture.h b/object/Texture.h
index b4ba7f3..11a2c4a 100644
--- a/object/Texture.h
+++ b/object/Texture.h
@@ -43,7 +43,6 @@ struct Texture {
     //  If yes remember to update prepare_texture()
 
     byte texture_data_type;
-
     byte texture_wrap_type_s;
     byte texture_wrap_type_t;
     byte texture_wrap_type_r;
@@ -51,8 +50,6 @@ struct Texture {
     byte texture_minification;
 
     Image image;
-
-    int32 texture_ref;
 };
 
 #endif
\ No newline at end of file
diff --git a/platform/win32/audio/DirectSound.h b/platform/win32/audio/DirectSound.h
index 9bca3d6..2a2feb3 100644
--- a/platform/win32/audio/DirectSound.h
+++ b/platform/win32/audio/DirectSound.h
@@ -108,6 +108,16 @@ void audio_play(AudioSetting* setting, DirectSoundSetting* api_setting)
     setting->is_playing = true;
 }
 
+inline
+void audio_stop(AudioSetting* setting, DirectSoundSetting* api_setting) {
+    if (!api_setting->secondary_buffer) {
+        return;
+    }
+
+    api_setting->secondary_buffer->Stop();
+    setting->is_playing = false;
+}
+
 inline
 void audio_free(AudioSetting*, DirectSoundSetting* api_setting)
 {
diff --git a/platform/win32/audio/XAudio2.h b/platform/win32/audio/XAudio2.h
index eda81c0..f2f569f 100644
--- a/platform/win32/audio/XAudio2.h
+++ b/platform/win32/audio/XAudio2.h
@@ -124,21 +124,19 @@ void audio_play(AudioSetting* setting, XAudio2Setting* api_setting) {
     }
 }
 
+inline
+void audio_stop(AudioSetting* setting, XAudio2Setting* api_setting) {
+    if (!api_setting->source_voice) {
+        return;
+    }
+
+    api_setting->source_voice->Stop(0, XAUDIO2_COMMIT_NOW);
+    setting->is_playing = false;
+}
+
 inline
 void audio_free(AudioSetting* setting, XAudio2Setting* api_setting)
 {
-    if (api_setting->internal_buffer[0].pAudioData) {
-        free((void *) api_setting->internal_buffer[0].pAudioData);
-    }
-
-    if (api_setting->internal_buffer[1].pAudioData) {
-        free((void *) api_setting->internal_buffer[1].pAudioData);
-    }
-
-    if (setting->buffer) {
-        free((void *) setting->buffer);
-    }
-
     if (api_setting->source_voice) {
         api_setting->source_voice->DestroyVoice();
     }
@@ -150,6 +148,18 @@ void audio_free(AudioSetting* setting, XAudio2Setting* api_setting)
     if (api_setting->audio_handle) {
         api_setting->audio_handle->Release();
     }
+
+    if (api_setting->internal_buffer[0].pAudioData) {
+        free((void *) api_setting->internal_buffer[0].pAudioData);
+    }
+
+    if (api_setting->internal_buffer[1].pAudioData) {
+        free((void *) api_setting->internal_buffer[1].pAudioData);
+    }
+
+    if (setting->buffer) {
+        free((void *) setting->buffer);
+    }
 }
 
 /**
diff --git a/utils/BitUtils.h b/utils/BitUtils.h
index f76e059..96416e3 100644
--- a/utils/BitUtils.h
+++ b/utils/BitUtils.h
@@ -17,6 +17,244 @@
 #define BIT_UNSET(num, pos) ((num) & ~((uint32) 1 << (pos)))
 #define BIT_FLIP(num, pos) ((num) ^ ((uint32) 1 << (pos)))
 #define BIT_SET_TO(num, pos, x) ((num) & ~((uint32) 1 << (pos)) | ((uint32) (x) << (pos)))
+#define BITS_GET_8(num, pos, to_read) (((num) >> (8 - (pos) - (to_read))) & ((1U << (to_read)) - 1))
+#define BITS_GET_16(num, pos, to_read) (((num) >> (16 - (pos) - (to_read))) & ((1U << (to_read)) - 1))
+#define BITS_GET_32(num, pos, to_read) (((num) >> (32 - (pos) - (to_read))) & ((1U << (to_read)) - 1))
+#define BITS_GET_64(num, pos, to_read) (((num) >> (64 - (pos) - (to_read))) & ((1ULL << (to_read)) - 1))
+#define BYTES_MERGE_2(num) (((num)[0] << 8) | (num)[1])
+#define BYTES_MERGE_4(num) (((uint32) (num)[0] << 24) | ((uint32) (num)[1] << 16) | ((uint32) (num)[2] << 8) | (uint32) (num)[3])
+#define BYTES_MERGE_8(num) (((uint64_t)(num)[0] << 56) | ((uint64_t)(num)[1] << 48) | ((uint64_t)(num)[2] << 40) | ((uint64_t)(num)[3] << 32) | ((uint64_t)(num)[4] << 24) | ((uint64_t)(num)[5] << 16) | ((uint64_t)(num)[6] << 8)  | ((uint64_t)(num)[7]))
+
+struct BitWalk {
+    byte* pos;
+    uint32 bit_pos;
+};
+
+inline
+void bits_walk(BitWalk* stream, uint32 bits_to_walk)
+{
+    stream->bit_pos += bits_to_walk;
+    stream->pos += stream->bit_pos / 8;
+    stream->bit_pos %= 8;
+}
+
+inline
+void bits_flush(BitWalk* stream)
+{
+    if (stream->bit_pos > 0) {
+        stream->bit_pos = 0;
+        ++stream->pos;
+    }
+}
+
+// inline
+// uint8 bits_consume_8(BitWalk* stream, uint32 bits_to_consume)
+// {
+//     uint8 result;
+
+//     uint32 remaining = 8 - stream->bit_pos;
+//     uint32 range_1 = bits_to_consume >= remaining
+//         ? remaining
+//         : bits_to_consume;
+
+//     result = (*stream->pos >> (remaining - range_1)) & ((1 << range_1) - 1);
+//     stream->bit_pos += range_1;
+
+//     if (bits_to_consume < remaining) {
+//         return result;
+//     }
+
+//     ++stream->pos;
+//     stream->bit_pos = 0;
+//     bits_to_consume -= range_1;
+
+//     /*
+//     uint32 full_bytes = bits_to_consume / 8;
+//     if (full_bytes > 0) {
+//         for (int i = 0; i < full_bytes; ++i) {
+//             result = (result << 8) | *stream->pos;
+
+//             ++stream->pos;
+//         }
+//     }
+//     */
+
+//     if (bits_to_consume == 0) {
+//         return result;
+//     }
+
+//     stream->bit_pos += bits_to_consume;
+
+//     return (result << bits_to_consume) | ((*stream->pos >> (8 - bits_to_consume)) & ((1 << bits_to_consume) - 1));
+// }
+
+// inline
+// uint16 bits_consume_16(BitWalk* stream, uint32 bits_to_consume)
+// {
+//     uint16 result;
+
+//     uint32 remaining = 8 - stream->bit_pos;
+//     uint32 range_1 = bits_to_consume >= remaining
+//         ? remaining
+//         : bits_to_consume;
+
+//     result = (*stream->pos >> (remaining - range_1)) & ((1 << range_1) - 1);
+//     stream->bit_pos += range_1;
+
+//     if (bits_to_consume < remaining) {
+//         return result;
+//     }
+
+//     ++stream->pos;
+//     stream->bit_pos = 0;
+//     bits_to_consume -= range_1;
+
+//     uint32 full_bytes = bits_to_consume / 8;
+//     if (full_bytes > 0) {
+//         for (int i = 0; i < full_bytes; ++i) {
+//             result = (result << 8) | *stream->pos;
+
+//             ++stream->pos;
+//         }
+//     }
+
+//     uint32 range_2 = bits_to_consume - full_bytes * 8;
+//     if (range_2 == 0) {
+//         return result;
+//     }
+
+//     stream->bit_pos += range_2;
+
+//     return (result << range_2) | ((*stream->pos >> (8 - range_2)) & ((1 << range_2) - 1));
+// }
+
+// inline
+// uint32 bits_consume_32(BitWalk* stream, uint32 bits_to_consume)
+// {
+//     uint32 result;
+
+//     uint32 remaining = 8 - stream->bit_pos;
+//     uint32 range_1 = bits_to_consume >= remaining
+//         ? remaining
+//         : bits_to_consume;
+
+//     result = (*stream->pos >> (remaining - range_1)) & ((1 << range_1) - 1);
+//     stream->bit_pos += range_1;
+
+//     if (bits_to_consume < remaining) {
+//         return result;
+//     }
+
+//     ++stream->pos;
+//     stream->bit_pos = 0;
+//     bits_to_consume -= range_1;
+
+//     uint32 full_bytes = bits_to_consume / 8;
+//     if (full_bytes > 0) {
+//         for (int i = 0; i < full_bytes; ++i) {
+//             result = (result << 8) | *stream->pos;
+
+//             ++stream->pos;
+//         }
+//     }
+
+//     uint32 range_2 = bits_to_consume - full_bytes * 8;
+//     if (range_2 == 0) {
+//         return result;
+//     }
+
+//     stream->bit_pos += range_2;
+
+//     return (result << range_2) | ((*stream->pos >> (8 - range_2)) & ((1 << range_2) - 1));
+// }
+
+// inline
+// uint64 bits_consume_64(BitWalk* stream, uint32 bits_to_consume)
+// {
+//     uint64 result;
+
+//     uint32 remaining = 8 - stream->bit_pos;
+//     uint32 range_1 = bits_to_consume >= remaining
+//         ? remaining
+//         : bits_to_consume;
+
+//     result = (*stream->pos >> (remaining - range_1)) & ((1 << range_1) - 1);
+//     stream->bit_pos += range_1;
+
+//     if (bits_to_consume < remaining) {
+//         return result;
+//     }
+
+//     ++stream->pos;
+//     stream->bit_pos = 0;
+//     bits_to_consume -= range_1;
+
+//     uint32 full_bytes = bits_to_consume / 8;
+//     if (full_bytes > 0) {
+//         for (int i = 0; i < full_bytes; ++i) {
+//             result = (result << 8) | *stream->pos;
+
+//             ++stream->pos;
+//         }
+//     }
+
+//     uint32 range_2 = bits_to_consume - full_bytes * 8;
+//     if (range_2 == 0) {
+//         return result;
+//     }
+
+//     stream->bit_pos += range_2;
+
+//     return (result << range_2) | ((*stream->pos >> (8 - range_2)) & ((1 << range_2) - 1));
+// }
+
+// uint8 bits_peek_8(BitWalk* stream, uint32 bits_to_consume) {
+//     byte* pos = stream->pos;
+//     byte bit_pos = stream->bit_pos;
+
+//     uint8 bits = bits_consume_8(stream, bits_to_consume);
+
+//     stream->pos = pos;
+//     stream->bit_pos = bit_pos;
+
+//     return bits;
+// }
+
+// uint16 bits_peek_16(BitWalk* stream, uint32 bits_to_consume) {
+//     byte* pos = stream->pos;
+//     byte bit_pos = stream->bit_pos;
+
+//     uint16 bits = bits_consume_16(stream, bits_to_consume);
+
+//     stream->pos = pos;
+//     stream->bit_pos = bit_pos;
+
+//     return bits;
+// }
+
+// uint32 bits_peek_32(BitWalk* stream, uint32 bits_to_consume) {
+//     byte* pos = stream->pos;
+//     byte bit_pos = stream->bit_pos;
+
+//     uint32 bits = bits_consume_32(stream, bits_to_consume);
+
+//     stream->pos = pos;
+//     stream->bit_pos = bit_pos;
+
+//     return bits;
+// }
+
+// uint64 bits_peek_64(BitWalk* stream, uint32 bits_to_consume) {
+//     byte* pos = stream->pos;
+//     byte bit_pos = stream->bit_pos;
+
+//     uint64 bits = bits_consume_64(stream, bits_to_consume);
+
+//     stream->pos = pos;
+//     stream->bit_pos = bit_pos;
+
+//     return bits;
+// }
 
 inline
 uint32 bytes_merge(byte b0, byte b1, byte b2, byte b3) {
@@ -77,55 +315,8 @@ inline int find_first_set_bit(int value) {
     #endif
 }
 
-
 inline
-byte get_bits(byte data, int bits_to_read, int start_pos)
-{
-    byte mask = (1 << bits_to_read) - 1;
-    return (data >> (8 - start_pos - bits_to_read)) & mask;
-}
-
-inline
-uint64 get_bits(const byte* data, int bits_to_read, int start_pos)
-{
-    if (bits_to_read <= 0 || bits_to_read > sizeof(uint64)) {
-        return 0;
-    }
-
-    int byte_index = start_pos / 8;
-    int bit_offset = start_pos % 8;
-
-    uint64_t mask = (1ULL << bits_to_read) - 1;
-    uint64_t result = 0;
-
-    int bits_read = 0;
-
-    while (bits_read < bits_to_read) {
-        int bits_in_current_byte = 8 - bit_offset;
-        int bits_to_take = bits_to_read - bits_read;
-
-        if (bits_to_take > bits_in_current_byte) {
-            bits_to_take = bits_in_current_byte;
-        }
-
-        uint8_t current_byte = data[byte_index];
-        current_byte >>= bit_offset;
-        current_byte &= (1 << bits_to_take) - 1;
-
-        result |= ((uint64_t)current_byte << bits_read);
-
-        bits_read += bits_to_take;
-        bit_offset = 0;
-        byte_index++;
-    }
-
-    result &= mask;
-
-    return result;
-}
-
-inline
-uint32 reverse_bits(uint32 data, uint32 count)
+uint32 bits_reverse(uint32 data, uint32 count)
 {
     uint32 reversed = 0;
     for (uint32 i = 0; i <= (count / 2); ++i) {
diff --git a/utils/EndianUtils.h b/utils/EndianUtils.h
index a27c9b7..e01e250 100644
--- a/utils/EndianUtils.h
+++ b/utils/EndianUtils.h
@@ -11,6 +11,10 @@
 
 #include "../stdlib/Types.h"
 
+#define SWAP_ENDIAN_16(val) ((((val) << 8) | ((val) >> 8)))
+#define SWAP_ENDIAN_32(val) (((val) << 24) | (((val) & 0xFF00) << 8) | (((val) >> 8) & 0xFF00) | ((val) >> 24))
+#define SWAP_ENDIAN_64(val) (((val) << 56) | (((val) & 0x000000000000FF00ULL) << 40) | (((val) & 0x0000000000FF0000ULL) << 24) | (((val) & 0x00000000FF000000ULL) << 8) | (((val) & 0x000000FF00000000ULL) >> 8) | (((val) & 0x0000FF0000000000ULL) >> 24) | (((val) & 0x00FF000000000000ULL) >> 40) | ((val) >> 56))
+
 // Automatically perform endian swap if necessary
 // If we are on little endian (e.g. Win32) we swap big endian data but not little endian
 #if _WIN32 || __LITTLE_ENDIAN
@@ -29,79 +33,71 @@ bool is_little_endian()
 }
 
 inline
-uint16 endian_swap(const uint16* val)
+uint16 endian_swap(uint16 val)
 {
-    uint16 v = *val;
-    return ((v << 8) | (v >> 8));
+    return ((val << 8) | (val >> 8));
 }
 
 inline
-int16 endian_swap(const int16* val)
+int16 endian_swap(int16 val)
 {
-    uint16 v = (uint16) (*val);
-    return (int16) ((v << 8) | (v >> 8));
+    return (int16) ((((uint16) val) << 8) | (((uint16) val) >> 8)); // swap on the unsigned bits so a negative val is not sign-extended
 }
 
 inline
-uint32 endian_swap(const uint32* val)
+uint32 endian_swap(uint32 val)
 {
-    uint32 v = *val;
-    return ((v << 24)
-        | ((v & 0xFF00) << 8)
-        | ((v >> 8) & 0xFF00)
-        | (v >> 24));
+    return ((val << 24)
+        | ((val & 0xFF00) << 8)
+        | ((val >> 8) & 0xFF00)
+        | (val >> 24));
 }
 
 inline
-int32 endian_swap(const int32* val)
+int32 endian_swap(int32 val)
 {
-    uint32 v = (uint32) (*val);
-    return (int32) ((v << 24)
-        | ((v & 0xFF00) << 8)
-        | ((v >> 8) & 0xFF00)
-        | (v >> 24));
+    return (int32) (((uint32) val << 24) // shift the unsigned bits: >> on a negative val would sign-extend
+        | (((uint32) val & 0xFF00) << 8)
+        | (((uint32) val >> 8) & 0xFF00)
+        | ((uint32) val >> 24));
 }
 
 inline
-uint64 endian_swap(const uint64* val)
+uint64 endian_swap(uint64 val)
 {
-    uint64 v = *val;
-    return ((v << 56)
-        | ((v & 0x000000000000FF00ULL) << 40)
-        | ((v & 0x0000000000FF0000ULL) << 24)
-        | ((v & 0x00000000FF000000ULL) << 8)
-        | ((v & 0x000000FF00000000ULL) >> 8)
-        | ((v & 0x0000FF0000000000ULL) >> 24)
-        | ((v & 0x00FF000000000000ULL) >> 40)
-        | (v >> 56));
+    return ((val << 56)
+        | ((val & 0x000000000000FF00ULL) << 40)
+        | ((val & 0x0000000000FF0000ULL) << 24)
+        | ((val & 0x00000000FF000000ULL) << 8)
+        | ((val & 0x000000FF00000000ULL) >> 8)
+        | ((val & 0x0000FF0000000000ULL) >> 24)
+        | ((val & 0x00FF000000000000ULL) >> 40)
+        | (val >> 56));
 }
 
 inline
-int64 endian_swap(const int64* val)
+int64 endian_swap(int64 val)
 {
-    uint64 v = (uint64) (*val);
-    return (int64) ((v << 56)
-        | ((v & 0x000000000000FF00ULL) << 40)
-        | ((v & 0x0000000000FF0000ULL) << 24)
-        | ((v & 0x00000000FF000000ULL) << 8)
-        | ((v & 0x000000FF00000000ULL) >> 8)
-        | ((v & 0x0000FF0000000000ULL) >> 24)
-        | ((v & 0x00FF000000000000ULL) >> 40)
-        | (v >> 56));
+    return (int64) (((uint64) val << 56) // shift the unsigned bits: >> on a negative val would sign-extend
+        | (((uint64) val & 0x000000000000FF00ULL) << 40)
+        | (((uint64) val & 0x0000000000FF0000ULL) << 24)
+        | (((uint64) val & 0x00000000FF000000ULL) << 8)
+        | (((uint64) val & 0x000000FF00000000ULL) >> 8)
+        | (((uint64) val & 0x0000FF0000000000ULL) >> 24)
+        | (((uint64) val & 0x00FF000000000000ULL) >> 40)
+        | ((uint64) val >> 56));
 }
 
 inline
-float endian_swap(const float* val)
+float endian_swap(float val)
 {
-    uint32* ival = (uint32 *) val;
-    return (float) endian_swap(ival);
+    union { float f; uint32 u; } raw = { val }; raw.u = endian_swap(raw.u); return raw.f; // swap the raw bits; endian_swap(val) would call itself forever
 }
 
 inline
-double endian_swap(const double* val)
+double endian_swap(double val)
 {
-    uint64* ival = (uint64 *) val;
-    return (double) endian_swap(ival);
+    union { double f; uint64 u; } raw = { val }; raw.u = endian_swap(raw.u); return raw.f; // swap the raw bits; endian_swap(val) would call itself forever
 }
 
 #endif
\ No newline at end of file
diff --git a/utils/TestUtils.h b/utils/TestUtils.h
index db61876..f60bf1e 100644
--- a/utils/TestUtils.h
+++ b/utils/TestUtils.h
@@ -114,10 +114,16 @@ void update_timing_stat(TimingStat *stat)
 #if DEBUG
     #define ASSERT_SIMPLE(a)                             \
         if (!(a)) {                                      \
-            *(volatile int *)0 = 0;                      \
+            *(volatile int *) 0 = 0;                     \
+        }
+
+    #define ASSERT_SIMPLE_CONST(a)                       \
+        if constexpr (!(a)) {                            \
+            *(volatile int *) 0 = 0;                     \
         }
 #else
     #define ASSERT_SIMPLE(a) ((void) 0)
+    #define ASSERT_SIMPLE_CONST(a) ((void) 0)
 #endif
 
 #define ASSERT_TRUE(a)                                   \