/**
 * Jingga
 *
 * @copyright Jingga
 * @license OMS License 2.0
 * @version 1.0.0
 * @link https://jingga.app
 */
#ifndef COMS_MEMORY_THREADED_CHUNK_MEMORY_H
#define COMS_MEMORY_THREADED_CHUNK_MEMORY_H

#include "../stdlib/Types.h"
#include "../thread/Thread.h"
#include "ChunkMemory.h"

struct ThreadedChunkMemory {
    byte* memory;

    uint64 size;
    uint32 last_pos;
    uint32 count;
    uint32 chunk_size;
    int32 alignment;

    // length = count
    // free describes which locations are used and which are free
    alignas(8) atomic_64 uint64* free;

    // Chunk implementation ends here
    // completeness indicates whether a chunk's data has been completely written
    alignas(8) atomic_64 uint64* completeness;

    mutex lock;
};

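// Memory layout (as set up by thrd_chunk_alloc / thrd_chunk_init below):
//
//     [count * chunk_size]            chunk data
//     [CEIL_DIV(count, 64) * uint64]  free bitmap (1 bit per chunk)
//     [CEIL_DIV(count, 64) * uint64]  completeness bitmap (1 bit per chunk)
//
// The bitmap regions are rounded up to the requested alignment.
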
// INFO: A chunk count of 2^n is recommended for maximum performance
inline
void thrd_chunk_alloc(ThreadedChunkMemory* buf, uint32 count, uint32 chunk_size, int32 alignment = 64)
{
    ASSERT_SIMPLE(chunk_size);
    ASSERT_SIMPLE(count);
    PROFILE(PROFILE_CHUNK_ALLOC, NULL, false, true);
    LOG_1("Allocating ChunkMemory");

    chunk_size = ROUND_TO_NEAREST(chunk_size, alignment);

    // Cast before multiplying to avoid 32-bit overflow for large pools;
    // each bitmap needs CEIL_DIV(count, 64) uint64 words (one bit per chunk)
    uint64 size = (uint64) count * chunk_size
        + sizeof(uint64) * CEIL_DIV(count, 64) // free
        + sizeof(uint64) * CEIL_DIV(count, 64) // completeness
        + alignment * 3; // overhead for alignment

    buf->memory = alignment < 2
        ? (byte *) platform_alloc(size)
        : (byte *) platform_alloc_aligned(size, alignment);

    buf->count = count;
    buf->size = size;
    buf->chunk_size = chunk_size;
    buf->last_pos = -1;
    buf->alignment = alignment;

    // @question Could it be beneficial to have this before the element data?
    buf->free = (uint64 *) ROUND_TO_NEAREST((uintptr_t) (buf->memory + (uint64) count * chunk_size), alignment);
    buf->completeness = (uint64 *) ROUND_TO_NEAREST((uintptr_t) (buf->free + CEIL_DIV(count, 64)), alignment);

    memset(buf->memory, 0, buf->size);
    mutex_init(&buf->lock, NULL);

    LOG_1("Allocated ChunkMemory: %n B", {{LOG_DATA_UINT64, &buf->size}});
}

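// Minimal usage sketch (illustrative comment only; Job is a hypothetical payload
// type, every function shown is defined in this header):
//
//     ThreadedChunkMemory pool = {};
//     thrd_chunk_alloc(&pool, 1024, sizeof(Job), 64);
//
//     int32 id = thrd_chunk_reserve(&pool);                        // claim one chunk
//     Job* job = (Job *) thrd_chunk_get_element(&pool, id, true);  // zeroed access
//
//     thrd_chunk_free_elements(&pool, id);                         // release the chunk
//     thrd_chunk_free(&pool);
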
inline
void thrd_chunk_init(ThreadedChunkMemory* buf, BufferMemory* data, uint32 count, uint32 chunk_size, int32 alignment = 64)
{
    ASSERT_SIMPLE(chunk_size);
    ASSERT_SIMPLE(count);

    chunk_size = ROUND_TO_NEAREST(chunk_size, alignment);

    uint64 size = (uint64) count * chunk_size
        + sizeof(uint64) * CEIL_DIV(count, 64) // free
        + sizeof(uint64) * CEIL_DIV(count, 64) // completeness
        + alignment * 3; // overhead for alignment

    buf->memory = buffer_get_memory(data, size);

    buf->count = count;
    buf->size = size;
    buf->chunk_size = chunk_size;
    buf->last_pos = -1;
    buf->alignment = alignment;

    // @question Could it be beneficial to have this before the element data?
    // On the one hand, with the current layout we never have to move past the free array since it is at the end.
    // On the other hand, we could accidentally overwrite the values in free if we are not careful.
    buf->free = (uint64 *) ROUND_TO_NEAREST((uintptr_t) (buf->memory + (uint64) count * chunk_size), alignment);
    buf->completeness = (uint64 *) ROUND_TO_NEAREST((uintptr_t) (buf->free + CEIL_DIV(count, 64)), alignment);

    mutex_init(&buf->lock, NULL);

    DEBUG_MEMORY_SUBREGION((uintptr_t) buf->memory, buf->size);
}

inline
void thrd_chunk_init(ThreadedChunkMemory* buf, byte* data, uint32 count, uint32 chunk_size, int32 alignment = 64)
{
    ASSERT_SIMPLE(chunk_size);
    ASSERT_SIMPLE(count);

    chunk_size = ROUND_TO_NEAREST(chunk_size, alignment);

    uint64 size = (uint64) count * chunk_size
        + sizeof(uint64) * CEIL_DIV(count, 64) // free
        + sizeof(uint64) * CEIL_DIV(count, 64) // completeness
        + alignment * 3; // overhead for alignment

    // @bug what if an alignment is defined?
    buf->memory = data;

    buf->count = count;
    buf->size = size;
    buf->chunk_size = chunk_size;
    buf->last_pos = -1;
    buf->alignment = alignment;

    // @question Could it be beneficial to have this before the element data?
    // On the one hand, with the current layout we never have to move past the free array since it is at the end.
    // On the other hand, we could accidentally overwrite the values in free if we are not careful.
    buf->free = (uint64 *) ROUND_TO_NEAREST((uintptr_t) (buf->memory + (uint64) count * chunk_size), alignment);
    buf->completeness = (uint64 *) ROUND_TO_NEAREST((uintptr_t) (buf->free + CEIL_DIV(count, 64)), alignment);

    mutex_init(&buf->lock, NULL);

    DEBUG_MEMORY_SUBREGION((uintptr_t) buf->memory, buf->size);
}

FORCE_INLINE
void thrd_chunk_free(ThreadedChunkMemory* buf) noexcept
{
    chunk_free((ChunkMemory *) buf);
    mutex_destroy(&buf->lock);
}

FORCE_INLINE
uint32 thrd_chunk_id_from_memory(const ThreadedChunkMemory* buf, const byte* pos) noexcept
{
    return chunk_id_from_memory((ChunkMemory *) buf, pos);
}

FORCE_INLINE
byte* thrd_chunk_get_element(ThreadedChunkMemory* buf, uint32 element, bool zeroed = false) noexcept
{
    return chunk_get_element((ChunkMemory *) buf, element, zeroed);
}

// Atomically sets the bit for element in the given state bitmap (free or completeness)
inline
void thrd_chunk_set_unset(uint32 element, atomic_64 uint64* state) {
    uint32 free_index = element / 64;
    uint32 bit_index = element & 63;

    alignas(8) atomic_64 uint64* target = &state[free_index];
    uint64 old_value, new_value;

    do {
        old_value = atomic_get_relaxed(target);
        new_value = old_value | (1ULL << bit_index);

        // The bit is already set, nothing to do
        if (old_value == new_value) {
            return;
        }
    } while (!atomic_compare_exchange_strong_release(target, &old_value, new_value));
}

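// One plausible producer-side pattern (a sketch, not prescribed by this header):
// claim a chunk, write the payload, then flip the completeness bit last, so that
// readers can distinguish fully written chunks from ones still in flight:
//
//     int32 id = thrd_chunk_reserve(&pool);
//     byte* chunk = thrd_chunk_get_element(&pool, id);
//     // ... write payload into chunk ...
//     thrd_chunk_set_unset((uint32) id, pool.completeness); // release-publish
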
// Finds an unset bit in the state bitmap, atomically sets it, and returns its index.
// Returns -1 if no unset bit could be claimed.
inline
int32 thrd_chunk_get_unset(const ThreadedChunkMemory* buf, atomic_64 uint64* state, int32 start_index = 0) {
    if ((uint32) start_index >= buf->count) {
        start_index = 0;
    }

    uint32 free_index = start_index / 64;
    uint32 bit_index = start_index & 63;

    // Fast path: try to claim the bit at start_index directly
    if (!IS_BIT_SET_64_R2L(state[free_index], bit_index)) {
        uint64 expected = atomic_get_relaxed(&state[free_index]);
        expected &= ~(1ULL << bit_index);
        uint64 desired = expected | (1ULL << bit_index);

        if (atomic_compare_exchange_strong_release(&state[free_index], &expected, desired)) {
            return free_index * 64 + bit_index;
        }
    }

    for (uint32 i = 0; i < buf->count; ++i) {
        if (state[free_index] != 0xFFFFFFFFFFFFFFFF) {
            // We try 3 times. Usually this would be a while loop, but since the
            // bit scan below works on a snapshot we might get the same index over and over again.
            for (uint32 j = 0; j < 3; ++j) {
                uint64 current = atomic_get_relaxed(&state[free_index]);
                if (current == 0xFFFFFFFFFFFFFFFF) {
                    // The word filled up in the meantime, scanning ~current would be undefined
                    break;
                }

                bit_index = compiler_find_first_bit_r2l(~current);

                uint64 expected = current & ~(1ULL << bit_index);
                uint64 desired = expected | (1ULL << bit_index);

                if (atomic_compare_exchange_strong_release(&state[free_index], &expected, desired)) {
                    return free_index * 64 + bit_index;
                }
            }
        }

        ++free_index;
        if (free_index * 64 >= buf->count) {
            free_index = 0;
        }
    }

    return -1;
}

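// A single chunk can also be claimed lock-free by scanning the free bitmap
// directly instead of going through the mutex in thrd_chunk_reserve below
// (a sketch; the start index 0 is an arbitrary choice):
//
//     int32 id = thrd_chunk_get_unset(&pool, pool.free, 0);
//     if (id >= 0) {
//         byte* chunk = thrd_chunk_get_element(&pool, id);
//     }
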
inline
int32 thrd_chunk_reserve(ThreadedChunkMemory* buf, uint32 elements = 1) noexcept
{
    mutex_lock(&buf->lock);
    int32 free_element = chunk_reserve((ChunkMemory *) buf, elements);
    mutex_unlock(&buf->lock);

    return free_element;
}

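// Multi-chunk reservations take the mutex because a contiguous run of bits may
// span several 64-bit words, which cannot be claimed with a single CAS.
// Sketch of reserving and releasing a contiguous range:
//
//     int32 first = thrd_chunk_reserve(&pool, 16);    // 16 contiguous chunks
//     if (first >= 0) {
//         byte* block = thrd_chunk_get_element(&pool, first);
//         // ... use 16 * pool.chunk_size bytes ...
//         thrd_chunk_free_elements(&pool, first, 16); // clears all 16 bits
//     }
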
inline
void thrd_chunk_free_element(ThreadedChunkMemory* buf, uint64 free_index, int32 bit_index) noexcept
{
    alignas(8) atomic_64 uint64* target = &buf->free[free_index];
    uint64 old_value, new_value;

    do {
        old_value = atomic_get_relaxed(target);
        // Freeing clears the used bit; setting it would leave the chunk marked as used
        new_value = old_value & ~(1ULL << bit_index);

        // The bit is already cleared, nothing to do
        if (old_value == new_value) {
            return;
        }
    } while (!atomic_compare_exchange_strong_release(target, &old_value, new_value));

    DEBUG_MEMORY_DELETE((uintptr_t) (buf->memory + (free_index * 64 + bit_index) * buf->chunk_size), buf->chunk_size);
}

inline
void thrd_chunk_free_elements(ThreadedChunkMemory* buf, uint64 element, uint32 element_count = 1) noexcept
{
    uint64 free_index = element / 64;
    uint32 bit_index = element & 63;

    if (element_count == 1) {
        thrd_chunk_free_element(buf, free_index, bit_index);
        return;
    }

    while (element_count > 0) {
        // Calculate the number of bits we can clear in the current 64-bit block
        uint32 bits_in_current_block = OMS_MIN(64 - bit_index, element_count);

        // Create a mask to clear the bits (avoid the undefined 1ULL << 64 for full blocks)
        uint64 mask = bits_in_current_block == 64
            ? 0xFFFFFFFFFFFFFFFF
            : ((1ULL << bits_in_current_block) - 1) << bit_index;

        uint64 old_value, new_value;
        alignas(8) atomic_64 uint64* target = &buf->free[free_index];

        do {
            old_value = atomic_get_relaxed(target);
            new_value = old_value & ~mask;

            if (old_value == new_value) {
                break;
            }
        } while (!atomic_compare_exchange_strong_release(target, &old_value, new_value));

        // Update the counters and indices
        element_count -= bits_in_current_block;
        ++free_index;
        bit_index = 0;
    }

    DEBUG_MEMORY_DELETE((uintptr_t) (buf->memory + element * buf->chunk_size), buf->chunk_size);
}

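// Worked example of the mask logic above: freeing 70 chunks starting at element 60
// clears bits 60..63 of word 0 (mask 0xF000000000000000), then all of word 1
// (mask 0xFFFFFFFFFFFFFFFF), then bits 0..1 of word 2 (mask 0x3).
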
inline
int32 thrd_chunk_resize(ThreadedChunkMemory* buf, int32 element_id, uint32 elements_old, uint32 elements_new) noexcept
{
    const byte* data = thrd_chunk_get_element(buf, element_id);

    int32 chunk_id = thrd_chunk_reserve(buf, elements_new);
    if (chunk_id < 0) {
        // Reservation failed; the old range remains untouched
        return -1;
    }

    byte* data_new = thrd_chunk_get_element(buf, chunk_id);

    // Copy only as many chunks as both ranges can hold
    memcpy(data_new, data, buf->chunk_size * OMS_MIN(elements_old, elements_new));

    // NOTE: the old range is not freed here
    return chunk_id;
}

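// Growing a reservation (a sketch): the old chunks are copied into the larger
// range and the new first element is returned; the old range must still be released:
//
//     int32 new_id = thrd_chunk_resize(&pool, id, 4, 8); // grow from 4 to 8 chunks
//     if (new_id >= 0) {
//         thrd_chunk_free_elements(&pool, id, 4);        // release the old range
//     }
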
#endif