/**
 * Jingga
 *
 * @copyright Jingga
 * @license   OMS License 2.0
 * @version   1.0.0
 * @link      https://jingga.app
 */
#ifndef TOS_MEMORY_CHUNK_MEMORY_H
#define TOS_MEMORY_CHUNK_MEMORY_H

#include <string.h>
#include "../stdlib/Types.h"
#include "../utils/TestUtils.h"
#include "../utils/EndianUtils.h"
#include "../utils/BitUtils.h"
#include "../compiler/CompilerUtils.h"
#include "../log/Log.h"
#include "../log/Stats.h"
#include "../log/PerformanceProfiler.h"
#include "../log/DebugMemory.h"
#include "BufferMemory.h"
#include "../system/Allocator.h"
#include "../thread/Thread.h"

struct ChunkMemory {
    byte* memory;

    // @question Why are we making the count 64 bit? is this really realistically possible?
    uint64 size;
    int32 last_pos;
    uint32 count;
    uint32 chunk_size;
    uint32 alignment;

    // length = count
    // free describes which locations are used and which are free
    uint64* free;
};

// INFO: A chunk count of 2^n is recommended for maximum performance
inline
void chunk_alloc(ChunkMemory* buf, uint32 count, uint32 chunk_size, int32 alignment = 64)
{
    ASSERT_SIMPLE(chunk_size);
    ASSERT_SIMPLE(count);
    PROFILE(PROFILE_CHUNK_ALLOC, NULL, false, true);
    LOG_1("Allocating ChunkMemory");

    chunk_size = ROUND_TO_NEAREST(chunk_size, alignment);

    buf->memory = alignment < 2
        ? (byte *) platform_alloc(count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64))
        : (byte *) platform_alloc_aligned(count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64), alignment);

    buf->count = count;
    buf->size = count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64);
    buf->chunk_size = chunk_size;
    buf->last_pos = -1;
    buf->alignment = alignment;

    // @question Could it be beneficial to have this before the element data?
    buf->free = (uint64 *) (buf->memory + count * chunk_size);

    memset(buf->memory, 0, buf->size);

    LOG_1("Allocated ChunkMemory: %n B", {{LOG_DATA_UINT64, &buf->size}});
}

inline
void chunk_init(ChunkMemory* buf, BufferMemory* data, uint32 count, uint32 chunk_size, int32 alignment = 64)
{
    ASSERT_SIMPLE(chunk_size);
    ASSERT_SIMPLE(count);

    chunk_size = ROUND_TO_NEAREST(chunk_size, alignment);

    buf->memory = buffer_get_memory(data, count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64));

    buf->count = count;
    buf->size = count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64);
    buf->chunk_size = chunk_size;
    buf->last_pos = -1;
    buf->alignment = alignment;

    // @question Could it be beneficial to have this before the element data?
    //  On the other hand the way we do it right now we never have to move past the free array since it is at the end
    //  On another hand we could by accident overwrite the values in free if we are not careful
    buf->free = (uint64 *) (buf->memory + count * chunk_size);

    DEBUG_MEMORY_SUBREGION((uintptr_t) buf->memory, buf->size);
}

inline
void chunk_init(ChunkMemory* buf, byte* data, uint32 count, uint32 chunk_size, int32 alignment = 64)
{
    ASSERT_SIMPLE(chunk_size);
    ASSERT_SIMPLE(count);

    chunk_size = ROUND_TO_NEAREST(chunk_size, alignment);

    // @bug what if an alignment is defined?
    buf->memory = data;

    buf->count = count;
    buf->size = count * chunk_size + sizeof(uint64) * CEIL_DIV(count, 64);
    buf->chunk_size = chunk_size;
    buf->last_pos = -1;
    buf->alignment = alignment;

    // @question Could it be beneficial to have this before the element data?
    //  On the other hand the way we do it right now we never have to move past the free array since it is at the end
    //  On another hand we could by accident overwrite the values in free if we are not careful
    buf->free = (uint64 *) (buf->memory + count * chunk_size);

    DEBUG_MEMORY_SUBREGION((uintptr_t) buf->memory, buf->size);
}

inline
void chunk_free(ChunkMemory* buf)
{
    DEBUG_MEMORY_DELETE((uintptr_t) buf->memory, buf->size);

    if (buf->alignment < 2) {
        platform_free((void **) &buf->memory);
    } else {
        platform_aligned_free((void **) &buf->memory);
    }

    buf->size = 0;
    buf->memory = NULL;
}

inline
uint32 chunk_id_from_memory(const ChunkMemory* buf, const byte* pos) noexcept {
    return (uint32) ((uintptr_t) pos - (uintptr_t) buf->memory) / buf->chunk_size;
}

inline
byte* chunk_get_element(ChunkMemory* buf, uint64 element, bool zeroed = false) noexcept
{
    if (element >= buf->count) {
        return NULL;
    }

    byte* offset = buf->memory + element * buf->chunk_size;
    ASSERT_SIMPLE(offset);

    if (zeroed) {
        memset((void *) offset, 0, buf->chunk_size);
    }

    DEBUG_MEMORY_READ((uintptr_t) offset, buf->chunk_size);

    return offset;
}

int32 chunk_reserve(ChunkMemory* buf, uint32 elements = 1) noexcept
{
    if ((uint32) (buf->last_pos + 1) >= buf->count) {
        buf->last_pos = -1;
    }

    uint32 free_index = (buf->last_pos + 1) / 64;
    uint32 bit_index = (buf->last_pos + 1) & 63;

    // Check standard simple solution
    if (elements == 1 && !IS_BIT_SET_64_R2L(buf->free[free_index], bit_index)) {
        buf->free[free_index] |= (1ULL << bit_index);
        ++buf->last_pos;

        return free_index * 64 + bit_index;
    }

    int32 free_element = -1;
    uint32 i = 0;
    uint32 consecutive_free_bits = 0;

    while (free_element < 0 && i++ <= buf->count) {
        if (free_index * 64 + bit_index + elements - consecutive_free_bits > buf->count) {
            // Go to beginning after overflow
            i += buf->count - (free_index * 64 + bit_index);
            consecutive_free_bits = 0;
            free_index = 0;
            bit_index = 0;

            continue;
        } else if (buf->free[free_index] == 0xFFFFFFFFFFFFFFFF) {
            // Skip fully filled ranges
            ++free_index;
            bit_index = 0;
            i += 64;
            consecutive_free_bits = 0;

            continue;
        }

        // Find first free element
        // This MUST find a free element, otherwise we wouldn't have gotten here
        bit_index = compiler_find_first_bit_r2l(~buf->free[free_index]);

        // Let's check if we have enough free space, we need more than just one free bit
        do {
            ++i;
            ++consecutive_free_bits;
            ++bit_index;

            if (bit_index > 63) {
                bit_index = 0;
                ++free_index;

                break;
            }
        } while (!IS_BIT_SET_64_R2L(buf->free[free_index], bit_index)
            && consecutive_free_bits != elements
            && free_index * 64 + bit_index + elements - consecutive_free_bits <= buf->count
            && i <= buf->count
        );

        // Do we have enough free bits?
        if (consecutive_free_bits == elements) {
            free_element = free_index * 64 + bit_index - elements;
            uint32 possible_free_index = free_element / 64;
            uint32 possible_bit_index = free_element & 63;

            // Mark as used
            if (elements == 1) {
                buf->free[possible_free_index] |= (1ULL << possible_bit_index);
            } else {
                uint32 elements_temp = elements;
                uint64 current_free_index = possible_free_index;
                uint32 current_bit_index = possible_bit_index;

                while (elements_temp > 0) {
                    // Calculate the number of bits we can set in the current 64-bit block
                    uint32 bits_in_current_block = OMS_MIN(64 - current_bit_index, elements_temp);

                    // Create a mask to set the bits
                    uint64 mask = ((1ULL << (bits_in_current_block & 63)) - 1) << current_bit_index | ((bits_in_current_block >> 6) * ((uint64_t)-1));
                    buf->free[current_free_index] |= mask;

                    // Update the counters and indices
                    elements_temp -= bits_in_current_block;
                    ++current_free_index;
                    current_bit_index = 0;
                }
            }

            break;
        }
    }

    if (free_element < 0) {
        ASSERT_SIMPLE(false);
        return -1;
    }

    DEBUG_MEMORY_WRITE((uintptr_t) (buf->memory + free_element * buf->chunk_size), elements * buf->chunk_size);

    buf->last_pos = free_element;

    return (int32) free_element;
}

inline
void chunk_free_element(ChunkMemory* buf, uint64 free_index, int32 bit_index) noexcept
{
    DEBUG_MEMORY_DELETE((uintptr_t) (buf->memory + (free_index * 64 + bit_index) * buf->chunk_size), buf->chunk_size);
    buf->free[free_index] &= ~(1ULL << bit_index);
}

inline
void chunk_free_elements(ChunkMemory* buf, uint64 element, uint32 element_count = 1) noexcept
{
    DEBUG_MEMORY_DELETE((uintptr_t) (buf->memory + element * buf->chunk_size), buf->chunk_size);

    uint64 free_index = element / 64;
    uint32 bit_index = element & 63;

    if (element == 1) {
        chunk_free_element(buf, free_index, bit_index);
        return;
    }

    while (element_count > 0) {
        // Calculate the number of bits we can clear in the current 64-bit block
        uint32 bits_in_current_block = OMS_MIN(64 - bit_index, element_count);

        // Create a mask to clear the bits
        uint64 mask = ((1ULL << bits_in_current_block) - 1) << bit_index;
        buf->free[free_index] &= ~mask;

        // Update the counters and indices
        element_count -= bits_in_current_block;
        ++free_index;
        bit_index = 0;
    }
}

inline
int64 chunk_dump(const ChunkMemory* buf, byte* data)
{
    LOG_1("Dump ChunkMemory");
    byte* start = data;

    // Count
    *((uint32 *) data) = SWAP_ENDIAN_LITTLE(buf->count);
    data += sizeof(buf->count);

    // Size
    *((uint64 *) data) = SWAP_ENDIAN_LITTLE(buf->size);
    data += sizeof(buf->size);

    // Chunk Size
    *((uint32 *) data) = SWAP_ENDIAN_LITTLE(buf->chunk_size);
    data += sizeof(buf->chunk_size);

    // Last pos
    *((int32 *) data) = SWAP_ENDIAN_LITTLE(buf->last_pos);
    data += sizeof(buf->last_pos);

    // Alignment
    *((uint32 *) data) = SWAP_ENDIAN_LITTLE(buf->alignment);
    data += sizeof(buf->alignment);

    // All memory is handled in the buffer -> simply copy the buffer
    // This also includes the free array
    memcpy(data, buf->memory, buf->size);
    data += buf->size;

    LOG_1("Dumped ChunkMemory: %n B", {{LOG_DATA_UINT64, (void *) &buf->size}});

    return data - start;
}

inline
int64 chunk_load(ChunkMemory* buf, const byte* data)
{
    LOG_1("Loading ChunkMemory");

    // Count
    buf->count = SWAP_ENDIAN_LITTLE(*((uint32 *) data));
    data += sizeof(buf->count);

    // Size
    buf->size = SWAP_ENDIAN_LITTLE(*((uint64 *) data));
    data += sizeof(buf->size);

    // Chunk Size
    buf->chunk_size = SWAP_ENDIAN_LITTLE(*((uint32 *) data));
    data += sizeof(buf->chunk_size);

    // Last pos
    buf->last_pos = SWAP_ENDIAN_LITTLE(*((int32 *) data));
    data += sizeof(buf->last_pos);

    // Alignment
    buf->alignment = SWAP_ENDIAN_LITTLE(*((uint32 *) data));
    data += sizeof(buf->alignment);

    memcpy(buf->memory, data, buf->size);
    data += buf->size;

    buf->free = (uint64 *) (buf->memory + buf->count * buf->chunk_size);

    LOG_1("Loaded ChunkMemory: %n B", {{LOG_DATA_UINT64, &buf->size}});

    return buf->size;
}

// @performance Is _BitScanForward faster?
// @performance We could probably even reduce the number of iterations by only iterating until popcount is reached?
#define chunk_iterate_start(buf, chunk_id) {                                                    \
    uint32 free_index = 0;                                                                      \
    uint32 bit_index = 0;                                                                       \
                                                                                                \
    /* Iterate the chunk memory */                                                              \
    for (; chunk_id < (buf)->count; ++chunk_id) {                                               \
        /* Check if asset is defined */                                                         \
        if (!(buf)->free[free_index]) {                                                         \
            /* Skip various elements */                                                         \
            /* @performance Consider to only check 1 byte instead of 8 */                       \
            /* There are probably even better ways by using compiler intrinsics if available */ \
            bit_index += 63; /* +64 - 1 since the loop also increases by 1 */                   \
        } else if ((buf)->free[free_index] & (1ULL << bit_index))

#define chunk_iterate_end       \
        ++bit_index;            \
        if (bit_index > 63) {   \
            bit_index = 0;      \
            ++free_index;       \
        }                       \
    }}

#endif