/**
 * Jingga
 *
 * @copyright Jingga
 * @license OMS License 2.0
 * @version 1.0.0
 * @link https://jingga.app
 */
#ifndef COMS_MEMORY_THREADED_CHUNK_MEMORY_H
#define COMS_MEMORY_THREADED_CHUNK_MEMORY_H

#include "../stdlib/Types.h"
#include "../thread/Thread.h"
#include "ChunkMemory.h"

struct ThreadedChunkMemory {
    byte* memory;

    uint64 size;
    uint32 last_pos;
    uint32 count;
    uint32 chunk_size;
    int32 alignment;

    // length = count
    // free describes which locations are used and which are free
    alignas(8) atomic_64 uint64* free;

    // Chunk implementation ends here

    // The completeness indicates if the data is completely written to
    alignas(8) atomic_64 uint64* completeness;

    mutex lock;
};

// INFO: A chunk count of 2^n is recommended for maximum performance
inline void thrd_chunk_alloc(ThreadedChunkMemory* buf, uint32 count, uint32 chunk_size, int32 alignment = 64) {
    ASSERT_SIMPLE(chunk_size);
    ASSERT_SIMPLE(count);
    PROFILE(PROFILE_CHUNK_ALLOC, NULL, false, true);
    LOG_1("Allocating ChunkMemory");

    chunk_size = ROUND_TO_NEAREST(chunk_size, alignment);

    uint64 size = count * chunk_size
        + sizeof(uint64) * CEIL_DIV(count, alignment) // free
        + sizeof(uint64) * CEIL_DIV(count, alignment) // completeness
        + alignment * 3; // overhead for alignment

    buf->memory = alignment < 2
        ? (byte *) platform_alloc(size)
        : (byte *) platform_alloc_aligned(size, alignment);

    buf->count = count;
    buf->size = size;
    buf->chunk_size = chunk_size;
    buf->last_pos = -1;
    buf->alignment = alignment;

    // @question Could it be beneficial to have this before the element data?
    buf->free = (uint64 *) ROUND_TO_NEAREST((uintptr_t) (buf->memory + count * chunk_size), alignment);
    buf->completeness = (uint64 *) ROUND_TO_NEAREST((uintptr_t) (buf->free + count), alignment);

    memset(buf->memory, 0, buf->size);
    mutex_init(&buf->lock, NULL);

    LOG_1("Allocated ChunkMemory: %n B", {{LOG_DATA_UINT64, &buf->size}});
}

inline void thrd_chunk_init(ThreadedChunkMemory* buf, BufferMemory* data, uint32 count, uint32 chunk_size, int32 alignment = 64) {
    ASSERT_SIMPLE(chunk_size);
    ASSERT_SIMPLE(count);

    chunk_size = ROUND_TO_NEAREST(chunk_size, alignment);

    uint64 size = count * chunk_size
        + sizeof(uint64) * CEIL_DIV(count, alignment) // free
        + sizeof(uint64) * CEIL_DIV(count, alignment) // completeness
        + alignment * 3; // overhead for alignment

    buf->memory = buffer_get_memory(data, size);

    buf->count = count;
    buf->size = size;
    buf->chunk_size = chunk_size;
    buf->last_pos = -1;
    buf->alignment = alignment;

    // @question Could it be beneficial to have this before the element data?
    // On the other hand the way we do it right now we never have to move past the free array since it is at the end
    // On another hand we could by accident overwrite the values in free if we are not careful
    buf->free = (uint64 *) ROUND_TO_NEAREST((uintptr_t) (buf->memory + count * chunk_size), alignment);
    buf->completeness = (uint64 *) ROUND_TO_NEAREST((uintptr_t) (buf->free + count), alignment);

    mutex_init(&buf->lock, NULL);

    DEBUG_MEMORY_SUBREGION((uintptr_t) buf->memory, buf->size);
}

inline void thrd_chunk_init(ThreadedChunkMemory* buf, byte* data, uint32 count, uint32 chunk_size, int32 alignment = 64) {
    ASSERT_SIMPLE(chunk_size);
    ASSERT_SIMPLE(count);

    chunk_size = ROUND_TO_NEAREST(chunk_size, alignment);

    uint64 size = count * chunk_size
        + sizeof(uint64) * CEIL_DIV(count, alignment) // free
        + sizeof(uint64) * CEIL_DIV(count, alignment) // completeness
        + alignment * 3; // overhead for alignment

    // @bug what if an alignment is defined?
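    // NOTE (assumption): unlike thrd_chunk_alloc, this overload uses the caller-provided pointer
    // as-is; nothing here re-aligns it, so `data` is expected to already satisfy `alignment`
    // and to provide at least `size` bytes.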
    buf->memory = data;

    buf->count = count;
    buf->size = size;
    buf->chunk_size = chunk_size;
    buf->last_pos = -1;
    buf->alignment = alignment;

    // @question Could it be beneficial to have this before the element data?
    // On the other hand the way we do it right now we never have to move past the free array since it is at the end
    // On another hand we could by accident overwrite the values in free if we are not careful
    buf->free = (uint64 *) ROUND_TO_NEAREST((uintptr_t) (buf->memory + count * chunk_size), alignment);
    buf->completeness = (uint64 *) ROUND_TO_NEAREST((uintptr_t) (buf->free + count), alignment);

    mutex_init(&buf->lock, NULL);

    DEBUG_MEMORY_SUBREGION((uintptr_t) buf->memory, buf->size);
}

FORCE_INLINE void thrd_chunk_free(ThreadedChunkMemory* buf) noexcept {
    chunk_free((ChunkMemory *) buf);
    mutex_destroy(&buf->lock);
}

FORCE_INLINE uint32 thrd_chunk_id_from_memory(const ThreadedChunkMemory* buf, const byte* pos) noexcept {
    return chunk_id_from_memory((ChunkMemory *) buf, pos);
}

FORCE_INLINE byte* thrd_chunk_get_element(ThreadedChunkMemory* buf, uint32 element, bool zeroed = false) noexcept {
    return chunk_get_element((ChunkMemory *) buf, element, zeroed);
}

inline void thrd_chunk_set_unset(uint32 element, atomic_64 uint64* state) noexcept {
    int32 free_index = element / 64;
    int32 bit_index = element & 63;

    uint64 mask = ~(1ULL << bit_index);
    atomic_fetch_and_release(&state[free_index], mask);
}

inline int32 thrd_chunk_get_unset(atomic_64 uint64* state, uint32 state_count, int32 start_index = 0) noexcept {
    if ((uint32) start_index >= state_count) {
        start_index = 0;
    }

    uint32 free_index = start_index / 64;
    uint32 bit_index = start_index & 63;

    // Check standard simple solution
    uint64 current = atomic_get_acquire(&state[free_index]);
    if (!(current & (1ULL << bit_index))) {
        uint64 desired = current | (1ULL << bit_index);
        if (atomic_compare_exchange_strong_acquire_release(&state[free_index], current, desired) == current) {
            return free_index * 64 + bit_index;
        }
    }

    for (uint32 i = 0; i < state_count; i += 64) {
        if (state[free_index] != 0xFFFFFFFFFFFFFFFF) {
            uint64 current_free = atomic_get_acquire(&state[free_index]);
            uint64 inverted = ~current_free;

            int32 bit_index;
            int32 j = 0;

            // We will only try 3 times to avoid infinite or long loops
            while (j < 3 && (bit_index = compiler_find_first_bit_r2l(inverted)) >= 0) {
                uint32 id = free_index * 64 + bit_index;
                if (id >= state_count) {
                    break;
                }

                uint64 new_free = current_free | (1ULL << bit_index);
                if ((new_free = atomic_compare_exchange_strong_acquire_release(&state[free_index], current_free, new_free)) == current_free) {
                    return id;
                }

                // The CAS failed and new_free now holds the value another thread stored,
                // so refresh the snapshot before retrying
                current_free = new_free;
                inverted = ~new_free;
                ++j;
            }
        }

        ++free_index;
        if (free_index * 64 >= state_count) {
            free_index = 0;
        }
    }

    return -1;
}

inline int32 thrd_chunk_reserve(ThreadedChunkMemory* buf, uint32 elements = 1) noexcept {
    mutex_lock(&buf->lock);
    int32 free_element = chunk_reserve((ChunkMemory *) buf, elements);
    mutex_unlock(&buf->lock);

    return free_element;
}

inline void thrd_chunk_free_element(ThreadedChunkMemory* buf, uint64 free_index, int32 bit_index) noexcept {
    alignas(8) atomic_64 uint64* target = &buf->free[free_index];
    uint64 old_value, new_value;

    do {
        old_value = atomic_get_relaxed(target);

        // Clearing the bit marks the chunk as free again, matching the convention used by
        // thrd_chunk_set_unset and thrd_chunk_free_elements (a set bit means "in use")
        new_value = old_value & ~(1ULL << bit_index);
        if (old_value == new_value) {
            return;
        }

        // Assumes the CAS returns the previously stored value, like
        // atomic_compare_exchange_strong_acquire_release above
    } while (atomic_compare_exchange_strong_release(target, old_value, new_value) != old_value);

    DEBUG_MEMORY_DELETE((uintptr_t) (buf->memory + (free_index * 64 + bit_index) * buf->chunk_size), buf->chunk_size);
}
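
// Example (sketch): a typical allocate/reserve/use/free cycle with the functions in this header.
// The concrete counts and sizes are only illustrative.
//
//     ThreadedChunkMemory buf = {};
//     thrd_chunk_alloc(&buf, 256, 128);                 // 256 chunks of 128 B (rounded to alignment)
//
//     int32 id = thrd_chunk_reserve(&buf);              // reserve one chunk (locks internally)
//     if (id >= 0) {
//         byte* chunk = thrd_chunk_get_element(&buf, (uint32) id, true);
//         // ... fill chunk ...
//         thrd_chunk_free_elements(&buf, (uint64) id);  // release the chunk again
//     }
//
//     thrd_chunk_free(&buf);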
inline void thrd_chunk_free_elements(ThreadedChunkMemory* buf, uint64 element, uint32 element_count = 1) noexcept {
    uint64 free_index = element / 64;
    uint32 bit_index = element & 63;

    if (element_count == 1) {
        thrd_chunk_free_element(buf, free_index, bit_index);

        return;
    }

    while (element_count > 0) {
        // Calculate the number of bits we can clear in the current 64-bit block
        uint32 bits_in_current_block = OMS_MIN(64 - bit_index, element_count);

        // Create a mask to clear the bits
        // A full block needs a special case since shifting by 64 is undefined behavior
        uint64 mask = bits_in_current_block == 64
            ? ~0ULL
            : ((1ULL << bits_in_current_block) - 1) << bit_index;

        uint64 old_value, new_value;
        alignas(8) atomic_64 uint64* target = &buf->free[free_index];

        do {
            old_value = atomic_get_relaxed(target);
            new_value = old_value & ~mask;
            if (old_value == new_value) {
                break;
            }

            // Assumes the CAS returns the previously stored value, like
            // atomic_compare_exchange_strong_acquire_release above
        } while (atomic_compare_exchange_strong_release(target, old_value, new_value) != old_value);

        // Update the counters and indices
        element_count -= bits_in_current_block;
        ++free_index;
        bit_index = 0;
    }

    DEBUG_MEMORY_DELETE((uintptr_t) (buf->memory + element * buf->chunk_size), buf->chunk_size);
}

inline int32 thrd_chunk_resize(ThreadedChunkMemory* buf, int32 element_id, uint32 elements_old, uint32 elements_new) noexcept {
    const byte* data = thrd_chunk_get_element(buf, element_id);
    int32 chunk_id = thrd_chunk_reserve(buf, elements_new);
    byte* data_new = thrd_chunk_get_element(buf, chunk_id);

    memcpy(data_new, data, buf->chunk_size * elements_old);

    return chunk_id;
}

#endif