diff --git a/memory/Queue.h b/memory/Queue.h new file mode 100644 index 0000000..b09926d --- /dev/null +++ b/memory/Queue.h @@ -0,0 +1,118 @@ +/** + * Jingga + * + * @copyright Jingga + * @license OMS License 2.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef TOS_MEMORY_QUEUE_H +#define TOS_MEMORY_QUEUE_H + +#include "RingMemory.h" + +typedef RingMemory Queue; + +inline +void queue_alloc(Queue* ring, uint64 size, int32 alignment = 64) +{ + ring_alloc(ring, size, alignment); + + pthread_mutex_init(&ring->mutex, NULL); + pthread_cond_init(&ring->cond, NULL); +} + +inline +void queue_init(Queue* ring, BufferMemory* buf, uint64 size, int32 alignment = 64) +{ + ring_init(ring, buf, size, alignment); + + pthread_mutex_init(&ring->mutex, NULL); + pthread_cond_init(&ring->cond, NULL); +} + +inline +void queue_free(Queue* buf) +{ + ring_free(buf); +} + +inline +void queue_init(Queue* ring, byte* buf, uint64 size, int32 alignment = 64) +{ + ring_init(ring, buf, size, alignment); +} + +inline +void ring_enqueue(Queue* ring, byte* data, uint64 size) +{ + pthread_mutex_lock(&ring->mutex); + + while (!ring_commit_safe(ring, size)) { + pthread_cond_wait(&ring->cond, &ring->mutex); + } + + byte* mem = ring_get_memory(ring, size); + memcpy(mem, data, size); + + pthread_cond_signal(&ring->cond); + pthread_mutex_unlock(&ring->mutex); +} + +inline +byte* ring_enqueue_start(Queue* ring, uint64 size, byte aligned = 0) +{ + pthread_mutex_lock(&ring->mutex); + + while (!ring_commit_safe(ring, size, aligned)) { + pthread_cond_wait(&ring->cond, &ring->mutex); + } + + return ring_get_memory(ring, size, aligned); +} + +inline +void ring_enqueue_end(Queue* ring) +{ + pthread_cond_signal(&ring->cond); + pthread_mutex_unlock(&ring->mutex); +} + +inline +byte* ring_dequeue(Queue* ring, byte* data, uint64 size, byte aligned = 0) +{ + pthread_mutex_lock(&ring->mutex); + + while (ring->head == ring->tail) { + pthread_cond_wait(&ring->cond, &ring->mutex); + } + + memcpy(data, ring->tail, size); + ring_move_pointer(ring, &ring->tail, size, aligned); + + pthread_cond_signal(&ring->cond); + pthread_mutex_unlock(&ring->mutex); +} + +inline +byte* ring_dequeue_start(Queue* ring) +{ + pthread_mutex_lock(&ring->mutex); + + while (ring->head == ring->tail) { + pthread_cond_wait(&ring->cond, &ring->mutex); + } + + return ring->tail; +} + +inline +void ring_dequeue_end(Queue* ring, uint64 size, byte aligned = 0) +{ + ring_move_pointer(ring, &ring->tail, size, aligned); + + pthread_cond_signal(&ring->cond); + pthread_mutex_unlock(&ring->mutex); +} + +#endif \ No newline at end of file diff --git a/memory/RingMemory.h b/memory/RingMemory.h index 243ce62..2d95d31 100644 --- a/memory/RingMemory.h +++ b/memory/RingMemory.h @@ -22,8 +22,10 @@ #if _WIN32 #include "../platform/win32/Allocation.h" + #include "../platform/win32/Thread.h" #elif __linux__ #include "../platform/linux/Allocation.h" + #include "../platform/linux/Thread.h" #endif struct RingMemory { @@ -41,8 +43,13 @@ struct RingMemory { uint64 size; int32 alignment; int32 element_alignment; + + pthread_mutex_t mutex; + pthread_cond_t cond; }; +// @bug alignment should also include the end point, not just the start + inline void ring_alloc(RingMemory* ring, uint64 size, int32 alignment = 64) { @@ -147,6 +154,33 @@ void ring_reset(RingMemory* ring) ring->head = ring->memory; } +// Moves a pointer based on the size you want to consume (new position = after consuming size) +void ring_move_pointer(RingMemory* ring, byte** pos, uint64 size, byte aligned = 0) +{ + ASSERT_SIMPLE(size <= ring->size); + + if (aligned == 0) { + aligned = (byte) OMS_MAX(ring->element_alignment, 1); + } + + if (aligned > 1) { + uintptr_t address = (uintptr_t) *pos; + *pos += (aligned - (address& (aligned - 1))) % aligned; + } + + size = ROUND_TO_NEAREST(size, aligned); + if (*pos + size > ring->end) { + *pos = ring->memory; + + if (aligned > 1) { + uintptr_t address = (uintptr_t) *pos; + *pos += (aligned - (address & (aligned - 1))) % aligned; + } + } + + *pos += size; +} + byte* ring_get_memory(RingMemory* ring, uint64 size, byte aligned = 0, bool zeroed = false) { ASSERT_SIMPLE(size <= ring->size); @@ -203,6 +237,8 @@ inline bool ring_commit_safe(const RingMemory* ring, uint64 size, byte aligned = 0) { // aligned * 2 since that should be the maximum overhead for an element + // @bug could this result in a case where the ring is considered empty/full (false positive/negative)? + // The "correct" version would probably to use ring_move_pointer in some form uint64 max_mem_required = size + aligned * 2; if (ring->tail < ring->head) { @@ -211,7 +247,6 @@ bool ring_commit_safe(const RingMemory* ring, uint64 size, byte aligned = 0) } else if (ring->tail > ring->head) { return ((uint64) (ring->tail - ring->head)) > max_mem_required; } else { - // @question Is this really the case? What if it is completely filled? return true; } } @@ -231,7 +266,7 @@ void ring_force_tail_update(const RingMemory* ring) inline int64 ring_dump(const RingMemory* ring, byte* data) { - byte* tail = data; + byte* start = data; // Size *((uint64 *) data) = SWAP_ENDIAN_LITTLE(ring->size); @@ -259,7 +294,7 @@ int64 ring_dump(const RingMemory* ring, byte* data) memcpy(data, ring->memory, ring->size); data += ring->size; - return data - tail; + return data - start; } #endif \ No newline at end of file diff --git a/platform/SystemInfo.h b/platform/SystemInfo.h index 3459bdc..939726d 100644 --- a/platform/SystemInfo.h +++ b/platform/SystemInfo.h @@ -36,6 +36,9 @@ struct SIMDInfo { f32 sse; int32 avx256; int32 avx512; + int32 sve; + int32 neon; + bool abm; }; struct CpuInfo { @@ -43,6 +46,7 @@ struct CpuInfo { char brand[49]; int32 model; int32 family; + int32 thread_count; int32 mhz; CpuCacheInfo cache[4]; int32 page_size; diff --git a/platform/linux/ThreadDefines.h b/platform/linux/ThreadDefines.h index 3419b3c..023d168 100644 --- a/platform/linux/ThreadDefines.h +++ b/platform/linux/ThreadDefines.h @@ -11,8 +11,43 @@ #include #include + +#include "../../stdlib/Types.h" + typedef void* (*ThreadJobFunc)(void*); #define THREAD_RETURN void* +inline +void atomic_set(volatile int32* value, int32 new_value) +{ + __atomic_store_n(value, new_value, __ATOMIC_SEQ_CST); +} + +inline +int32 atomic_get(volatile int32* value) +{ + return __atomic_load_n((int32 *) value, __ATOMIC_SEQ_CST); +} + +inline +void atomic_increment(volatile int32* value) { + __atomic_fetch_add(value, 1, __ATOMIC_SEQ_CST); +} + +inline +void atomic_decrement(volatile int32* value) { + __atomic_fetch_sub(value, 1, __ATOMIC_SEQ_CST); +} + +inline +int32 atomic_add(volatile int32* value, int32 increment) { + return __atomic_fetch_add(value, increment, __ATOMIC_SEQ_CST); +} + +inline +int32 atomic_subtract(volatile int32* value, int32 decrement) { + return __atomic_fetch_sub(value, decrement, __ATOMIC_SEQ_CST); +} + #endif \ No newline at end of file diff --git a/platform/win32/SystemInfo.cpp b/platform/win32/SystemInfo.cpp index 1b2558a..438f393 100644 --- a/platform/win32/SystemInfo.cpp +++ b/platform/win32/SystemInfo.cpp @@ -27,16 +27,14 @@ #include #include -#ifdef _MSC_VER - // @performance Do we really need all these libs, can't we simplify that?! - #include - #pragma comment(lib, "Advapi32.lib") - #pragma comment(lib, "wbemuuid.lib") - #pragma comment(lib, "iphlpapi.lib") - #pragma comment(lib, "d3d12.lib") - #pragma comment(lib, "dxgi.lib") - #pragma comment(lib, "Ws2_32.lib") -#endif +// @performance Do we really need all these libs, can't we simplify that?! +#include +#pragma comment(lib, "Advapi32.lib") +#pragma comment(lib, "wbemuuid.lib") +#pragma comment(lib, "iphlpapi.lib") +#pragma comment(lib, "d3d12.lib") +#pragma comment(lib, "dxgi.lib") +#pragma comment(lib, "Ws2_32.lib") // @todo implement for arm? @@ -318,6 +316,9 @@ void cpu_info_get(CpuInfo* info) { info->simd.sse = (temp = max_sse_supported()) > 9 ? temp / 10.0f : temp; info->simd.avx256 = max_avx256_supported(); info->simd.avx512 = max_avx512_supported(); + info->simd.sve = max_sve_supported(); + info->simd.neon = max_neon_supported(); + info->simd.abm = supports_abm(); cache_info_get(1, &info->cache[0]); cache_info_get(2, &info->cache[1]); @@ -326,15 +327,16 @@ void cpu_info_get(CpuInfo* info) { SYSTEM_INFO sys_info; GetSystemInfo(&sys_info); + info->thread_count = sys_info.dwNumberOfProcessors; info->page_size = sys_info.dwPageSize; int32 cpuInfo[4] = { 0 }; __cpuid(cpuInfo, 0); memset(info->vendor, 0, sizeof(info->vendor)); - *((int*)info->vendor) = cpuInfo[1]; - *((int*)(info->vendor + 4)) = cpuInfo[3]; - *((int*)(info->vendor + 8)) = cpuInfo[2]; + *((int32 *) info->vendor) = cpuInfo[1]; + *((int32 *) (info->vendor + 4)) = cpuInfo[3]; + *((int32 *) (info->vendor + 8)) = cpuInfo[2]; info->vendor[12] = '\0'; __cpuid(cpuInfo, 0x80000002); @@ -513,7 +515,7 @@ void system_info_render(char* buf, const SystemInfo* info) { "\n" "CPU:\n" "==============\n" - "Hardware\n" "Vendor: %s\n" "Brand: %s\n" "Model: %d\n" "Family: %d\n" "Mhz: %d\n" "Page Size: %d\n" + "Hardware\n" "Vendor: %s\n" "Brand: %s\n" "Model: %d\n" "Family: %d\n" "Mhz: %d\n" "Thread Count: %d\n" "Page Size: %d\n" "\n" "Cache:\n" "L1: Size %d Line %d\n" @@ -521,7 +523,7 @@ void system_info_render(char* buf, const SystemInfo* info) { "L3: Size %d Line %d\n" "L4: Size %d Line %d\n" "\n" - "SIMD:\n" "SSE: %.1f\n" "AVX256: %d\n" "AVX512: %s\n" + "SIMD:\n" "SSE: %.1f\n" "AVX256: %d\n" "AVX512: %s\n" "SVE: %d\n" "NEON: %d\n" "ABM: %d\n" "\n" "GPU:\n" "==============\n" @@ -546,12 +548,12 @@ void system_info_render(char* buf, const SystemInfo* info) { info->network_count < 2 ? "" : info->network[1].slot, info->network_count < 2 ? 0 : info->network[1].mac[0], info->network_count < 2 ? 0 : info->network[1].mac[1], info->network_count < 2 ? 0 : info->network[1].mac[2], info->network_count < 2 ? 0 : info->network[1].mac[3], info->network_count < 2 ? 0 : info->network[1].mac[4], info->network_count < 2 ? 0 : info->network[1].mac[5], info->network_count < 2 ? 0 : info->network[1].mac[6], info->network_count < 2 ? 0 : info->network[1].mac[7], info->network_count < 3 ? "" : info->network[2].slot, info->network_count < 3 ? 0 : info->network[2].mac[0], info->network_count < 3 ? 0 : info->network[2].mac[1], info->network_count < 3 ? 0 : info->network[2].mac[2], info->network_count < 3 ? 0 : info->network[2].mac[3], info->network_count < 3 ? 0 : info->network[2].mac[4], info->network_count < 3 ? 0 : info->network[2].mac[5], info->network_count < 3 ? 0 : info->network[2].mac[6], info->network_count < 3 ? 0 : info->network[2].mac[7], info->network_count < 4 ? "" : info->network[3].slot, info->network_count < 4 ? 0 : info->network[3].mac[0], info->network_count < 4 ? 0 : info->network[3].mac[1], info->network_count < 4 ? 0 : info->network[3].mac[2], info->network_count < 4 ? 0 : info->network[3].mac[3], info->network_count < 4 ? 0 : info->network[3].mac[4], info->network_count < 4 ? 0 : info->network[3].mac[5], info->network_count < 4 ? 0 : info->network[3].mac[6], info->network_count < 4 ? 0 : info->network[3].mac[7], - info->cpu.vendor, info->cpu.brand, info->cpu.model, info->cpu.family, info->cpu.mhz, info->cpu.page_size, + info->cpu.vendor, info->cpu.brand, info->cpu.model, info->cpu.family, info->cpu.mhz, info->cpu.thread_count, info->cpu.page_size, info->cpu.cache[0].size, info->cpu.cache[0].line_size, info->cpu.cache[1].size, info->cpu.cache[1].line_size, info->cpu.cache[2].size, info->cpu.cache[2].line_size, info->cpu.cache[3].size, info->cpu.cache[3].line_size, - info->cpu.simd.sse, info->cpu.simd.avx256, info->cpu.simd.avx512 > 0 ? avx512[info->cpu.simd.avx512 - 1] : "0", + info->cpu.simd.sse, info->cpu.simd.avx256, info->cpu.simd.avx512 > 0 ? avx512[info->cpu.simd.avx512 - 1] : "0", info->cpu.simd.sve, info->cpu.simd.neon, (int32) info->cpu.simd.abm, info->gpu[0].name, info->gpu[0].vram, info->gpu_count < 2 ? "" : info->gpu[1].name, info->gpu_count < 2 ? 0 : info->gpu[1].vram, info->display[0].name, info->display[0].width, info->display[0].height, info->display[0].hz, diff --git a/platform/win32/ThreadDefines.h b/platform/win32/ThreadDefines.h index 6b00030..0ab7270 100644 --- a/platform/win32/ThreadDefines.h +++ b/platform/win32/ThreadDefines.h @@ -9,8 +9,11 @@ #ifndef TOS_PLATFORM_WIN32_THREAD_DEFINES_H #define TOS_PLATFORM_WIN32_THREAD_DEFINES_H +#include #include +#include "../../stdlib/Types.h" + typedef DWORD (WINAPI *ThreadJobFunc)(void*); typedef CRITICAL_SECTION pthread_mutex_t; typedef void pthread_mutexattr_t; @@ -26,4 +29,36 @@ struct pthread_rwlock_t { #define THREAD_RETURN DWORD WINAPI +inline +void atomic_set(volatile int32* value, int32 new_value) +{ + InterlockedExchange((long *) value, new_value); +} + +inline +int32 atomic_get(volatile int32* value) +{ + return (int32) InterlockedCompareExchange((long *) value, 0, 0); +} + +inline +void atomic_increment(volatile int32* value) { + InterlockedIncrement((long *) value); +} + +inline +void atomic_decrement(volatile int32* value) { + InterlockedDecrement((long *) value); +} + +inline +int32 atomic_add(volatile int32* value, int32 increment) { + return InterlockedExchangeAdd((long *) value, increment); +} + +inline +int32 atomic_subtract(volatile int32* value, int32 decrement) { + return InterlockedExchangeAdd((long *) value, -decrement); +} + #endif \ No newline at end of file diff --git a/thread/Thread.h b/thread/Thread.h index 3597c41..7e4ea69 100644 --- a/thread/Thread.h +++ b/thread/Thread.h @@ -12,6 +12,8 @@ #include #include +#include "../stdlib/Types.h" + #if _WIN32 #include "../platform/win32/ThreadDefines.h" #include "../platform/win32/Thread.h" @@ -25,7 +27,7 @@ void thread_create(Worker* worker, ThreadJobFunc routine, void* arg) { - for (int i = 0; i < worker->mutex_size; ++i) { + for (int32 i = 0; i < worker->mutex_size; ++i) { pthread_mutex_init(&worker->mutex[i], NULL); } @@ -35,10 +37,11 @@ void thread_create(Worker* worker, ThreadJobFunc routine, void* arg) void thread_stop(Worker* worker) { + atomic_set(&worker->state, 0); pthread_join(worker->thread, NULL); pthread_cond_destroy(&worker->condition); - for (int i = 0; i < worker->mutex_size; ++i) { + for (int32 i = 0; i < worker->mutex_size; ++i) { pthread_mutex_destroy(&worker->mutex[i]); } } diff --git a/thread/ThreadJob.h b/thread/ThreadJob.h index 2be89f2..bce5949 100644 --- a/thread/ThreadJob.h +++ b/thread/ThreadJob.h @@ -12,28 +12,29 @@ #include #include +#include "../stdlib/Types.h" + #if _WIN32 #include "../platform/win32/ThreadDefines.h" #elif __linux__ #include "../platform/linux/ThreadDefines.h" #endif -struct job_t { +struct PoolWorker { ThreadJobFunc func; void *arg; - int state; - job_t *next; + volatile int32 state; + PoolWorker *next; }; -typedef job_t ThreadJob; +typedef PoolWorker ThreadJob; struct Worker { - int index; // @todo When are we using this? - int state; + volatile int32 state; pthread_t thread; pthread_cond_t condition; - int mutex_size; + int32 mutex_size; pthread_mutex_t* mutex; }; diff --git a/thread/ThreadPool.h b/thread/ThreadPool.h index ff5bcdc..90c15dd 100644 --- a/thread/ThreadPool.h +++ b/thread/ThreadPool.h @@ -12,21 +12,16 @@ #include #include +#include "../stdlib/Types.h" + #ifdef _WIN32 #include "../platform/win32/Thread.h" -#else - #include "../platform/linux/Thread.h" -#endif - -#include "../stdlib/Types.h" -#include "ThreadJob.h" - -#if _WIN32 - #elif __linux__ #include "../platform/linux/Thread.h" #endif +#include "ThreadJob.h" + struct ThreadPool { ThreadJob *work_first; ThreadJob *work_last; @@ -63,11 +58,7 @@ ThreadJob *thread_pool_work_poll(ThreadPool *pool) return work; } -#ifdef _WIN32 -static DWORD WINAPI thread_pool_worker(void* arg) -#else -static void* thread_pool_worker(void *arg) -#endif +static THREAD_RETURN thread_pool_worker(void* arg) { ThreadPool *pool = (ThreadPool *) arg; ThreadJob *work; @@ -128,7 +119,7 @@ ThreadPool *thread_pool_create(size_t num, ThreadPool* pool) pool->work_first = NULL; pool->work_last = NULL; - for (i = 0; i < num; i++) { + for (i = 0; i < num; ++i) { pthread_create(&thread, NULL, thread_pool_worker, pool); ++(pool->size); @@ -183,11 +174,7 @@ void thread_pool_destroy(ThreadPool *pool) ThreadJob* thread_pool_add_work(ThreadPool *pool, ThreadJob* job) { - if (pool == NULL) { - return NULL; - } - - if (job == NULL) { + if (pool == NULL || job == NULL) { return NULL; }