From 39fbcf43002f753cdccffb8966e8b35b7bd1e01a Mon Sep 17 00:00:00 2001
From: Dennis Eichhorn <spl1nes.com@googlemail.com>
Date: Sat, 22 Mar 2025 01:10:19 +0000
Subject: [PATCH] linux bug fixes

---
 .github/workflows/codeql.yml                  |    0
 .github/workflows/msvc.yml                    |    0
 .gitignore                                    |    0
 Guidelines.md                                 |    0
 README.md                                     |    0
 animation/Animation.h                         |    0
 animation/AnimationEaseType.h                 |    0
 architecture/CpuInfo.cpp                      |    0
 architecture/CpuInfo.h                        |    0
 architecture/Intrinsics.h                     |    0
 architecture/arm/CpuInfo.cpp                  |    0
 architecture/arm/Intrinsics.h                 |    4 +
 architecture/arm/neon/utils/Utils.h           |    0
 architecture/arm/sve/utils/Utils.h            |    0
 architecture/x86/CpuInfo.cpp                  |    0
 architecture/x86/Intrinsics.h                 |    3 +
 architecture/x86/simd/SIMD_F32.h              | 1373 ++---------
 architecture/x86/simd/SIMD_F32_AVX2.h         |  426 ++++
 architecture/x86/simd/SIMD_F32_AVX512.h       |  385 +++
 architecture/x86/simd/SIMD_F32_SSE.h          |  381 +++
 architecture/x86/simd/SIMD_F64.h              |   42 +-
 architecture/x86/simd/SIMD_F64_AVX2.h         |   30 +
 architecture/x86/simd/SIMD_F64_AVX512.h       |   29 +
 architecture/x86/simd/SIMD_F64_SSE.h          |   29 +
 architecture/x86/simd/SIMD_I16.h              |  860 +------
 architecture/x86/simd/SIMD_I16_AVX2.h         |  262 ++
 architecture/x86/simd/SIMD_I16_AVX512.h       |  265 ++
 architecture/x86/simd/SIMD_I16_SSE.h          |  261 ++
 architecture/x86/simd/SIMD_I32.h              | 2125 +++++------------
 architecture/x86/simd/SIMD_I32_AVX2.h         |  288 +++
 architecture/x86/simd/SIMD_I32_AVX512.h       |  309 +++
 architecture/x86/simd/SIMD_I32_SSE.h          |  286 +++
 architecture/x86/simd/SIMD_I64.h              |   43 +-
 architecture/x86/simd/SIMD_I64_AVX2.h         |   29 +
 architecture/x86/simd/SIMD_I64_AVX512.h       |   29 +
 architecture/x86/simd/SIMD_I64_SSE.h          |   29 +
 architecture/x86/simd/SIMD_I8.h               |  946 +-------
 architecture/x86/simd/SIMD_I8_AVX2.h          |  265 ++
 architecture/x86/simd/SIMD_I8_AVX512.h        |  270 +++
 architecture/x86/simd/SIMD_I8_SSE.h           |  265 ++
 architecture/x86/simd/SIMD_SVML.h             |  160 +-
 architecture/x86/simd/SIMD_SVML_AVX2.h        |   69 +
 architecture/x86/simd/SIMD_SVML_AVX512.h      |   70 +
 architecture/x86/simd/SIMD_SVML_SSE.h         |   70 +
 architecture/x86/simd/utils/Utils.h           |   63 +-
 asset/Asset.h                                 |    0
 asset/AssetArchive.h                          |    0
 asset/AssetManagementSystem.h                 |    0
 asset/AssetType.h                             |    0
 audio/Audio.cpp                               |    0
 audio/Audio.h                                 |    0
 audio/AudioMixer.h                            |    0
 audio/AudioSetting.h                          |    0
 audio/Qoa.h                                   |    0
 audio/QoaSimd.h                               |    0
 audio/Wav.h                                   |    0
 auth/Auth.h                                   |    0
 camera/Camera.h                               |    0
 camera/CameraMovement.h                       |    0
 color/ColorVisionDeficiency.h                 |    0
 command/AppCmdBuffer.cpp                      |    0
 command/AppCmdBuffer.h                        |    0
 command/Command.h                             |    0
 compiler/CompilerUtils.h                      |    0
 compiler/gcc/Atomic.h                         |    0
 compiler/gcc/CompilerUtils.h                  |    0
 compiler/msvc/CompilerUtils.h                 |    0
 compression/Huffman.h                         |    0
 compression/LZP.h                             |    0
 compression/RLE.h                             |    0
 database/Database.h                           |    0
 database/DatabaseConnection.h                 |    0
 database/DatabaseType.h                       |    0
 encryption/CeasarEncryption.h                 |    0
 encryption/XorEncryption.h                    |    0
 entity/AnimationEntityComponent.h             |    0
 entity/CursorEntity.h                         |    0
 entity/Entity.h                               |    0
 entity/EntityComponentSystem.h                |    0
 entity/EntitySize.h                           |    0
 environment/Globe.h                           |    0
 environment/Universe.h                        |    0
 error/HammingCodes.h                          |    0
 font/Font.h                                   |    0
 font/font_characters.txt                      |    0
 gpuapi/AntiAliasing.h                         |    0
 gpuapi/GpuApiType.h                           |    0
 gpuapi/GpuAttributeType.h                     |    0
 gpuapi/RenderUtils.h                          |    0
 gpuapi/ShaderType.h                           |    0
 gpuapi/direct3d/AppCmdBuffer.h                |    0
 gpuapi/direct3d/DirectXUtils.h                |    0
 gpuapi/direct3d/FramesInFlightContainer.h     |    0
 gpuapi/direct3d/Shader.h                      |    0
 gpuapi/direct3d/ShaderUtils.h                 |    0
 gpuapi/opengl/AppCmdBuffer.h                  |    0
 gpuapi/opengl/FramesInFlightContainer.h       |    0
 gpuapi/opengl/Opengl.h                        |    0
 gpuapi/opengl/OpenglDefines.h                 |    0
 .../opengl/OpenglDescriptorSetLayoutBinding.h |    0
 gpuapi/opengl/OpenglLinux.h                   |    0
 gpuapi/opengl/OpenglUtils.h                   |    0
 gpuapi/opengl/OpenglWin32.h                   |    0
 gpuapi/opengl/Shader.h                        |    0
 gpuapi/opengl/ShaderUtils.h                   |    0
 gpuapi/vulkan/AppCmdBuffer.h                  |    0
 gpuapi/vulkan/FramesInFlightContainer.h       |    0
 gpuapi/vulkan/Shader.h                        |    0
 gpuapi/vulkan/ShaderUtils.h                   |    0
 gpuapi/vulkan/VulkanUtils.h                   |    0
 hash/Crc.h                                    |    0
 hash/GeneralHash.h                            |    4 +-
 html/template/HtmlTemplate.h                  |    0
 html/template/HtmlTemplateCache.h             |    0
 html/template/HtmlTemplateContext.h           |    0
 html/template/HtmlTemplateInterpreter.h       |    0
 html/template/HtmlTemplateLexer.h             |    0
 html/template/HtmlTemplateParser.h            |    0
 image/Bitmap.h                                |    0
 image/Image.cpp                               |    0
 image/Image.h                                 |    0
 image/Png.h                                   |    0
 image/Qoi.h                                   |    0
 image/Tga.h                                   |    0
 image/default_colors.h                        |    0
 image/default_colors.htm                      |    0
 image/stb_image.h                             |    0
 input/ControllerInput.h                       |    0
 input/ControllerType.h                        |    0
 input/Input.h                                 |    0
 input/InputConnectionType.h                   |    0
 light/Material.h                              |    0
 localization/Dialog.h                         |    0
 localization/Language.h                       |    0
 log/DebugContainer.h                          |    0
 log/DebugMemory.h                             |    0
 log/Log.h                                     |   14 +-
 log/PerformanceProfiler.h                     |    0
 log/Stats.h                                   |    0
 math/Evaluator.h                              |    0
 math/PerlinNoise.h                            |    0
 math/matrix/MatrixFloat32.h                   |   12 +-
 math/matrix/MatrixInt32.h                     |    0
 math/matrix/MatrixInt64.h                     |    0
 math/matrix/QuaternionFloat32.h               |    0
 math/matrix/VectorFloat32.h                   |    0
 math/matrix/VectorFloat64.h                   |    0
 math/matrix/VectorInt32.h                     |    0
 math/matrix/VectorInt64.h                     |    0
 math/random/BlueNoise.h                       |    0
 memory/BufferMemory.h                         |    0
 memory/ChunkMemory.h                          |    0
 memory/Heap.h                                 |    0
 memory/Queue.h                                |    0
 memory/RingMemory.h                           |    0
 memory/ThreadedChunkMemory.h                  |    0
 memory/ThreadedQueue.h                        |    0
 memory/ThreadedRingMemory.h                   |    0
 models/Colors.h                               |    0
 models/Location.h                             |    0
 models/Map/map_chunks.h                       |    0
 models/Obj.h                                  |    0
 models/Sound.h                                |    0
 models/account/Account.h                      |    0
 models/bracket/Bracket.h                      |    0
 models/bracket/BracketMatch.h                 |    0
 models/bracket/BracketSeeding.h               |    0
 models/bracket/BracketTeam.h                  |    0
 models/chat/Chat.h                            |    0
 models/chat/ChatLevel.h                       |    0
 models/chat/ChatStatus.h                      |    0
 models/chat/ChatType.h                        |    0
 models/event/Event.h                          |    0
 models/event/EventTaskType.h                  |    0
 models/event/tmp                              |    0
 models/extension/ExtensionType.h              |    0
 models/guild/GuildBanner.h                    |    0
 models/item/Consumable.h                      |    0
 models/item/ConsumableType.h                  |    0
 models/item/Equipment.cpp                     |    0
 models/item/Equipment.h                       |    0
 models/item/EquipmentType.h                   |    0
 models/item/Item.h                            |    0
 models/item/ItemAffixDistribution.h           |    0
 models/item/ItemLevelStats.h                  |    0
 models/item/ItemRarityDefinition.h            |    0
 models/item/ItemRarityStats.h                 |    0
 models/item/ItemStatsDistribution.h           |    0
 models/item/MobLevelStats.h                   |    0
 models/item/_equipment_slots.h                |    0
 models/item/_equipment_types.h                |    0
 models/item/_item_rarity.h                    |    0
 models/map.h                                  |    0
 models/mob/ActivityStats.h                    |    0
 models/mob/FixedStats.h                       |    0
 models/mob/Mob.cpp                            |    0
 models/mob/Mob.h                              |    0
 models/mob/MobAction.h                        |    0
 models/mob/MobCategory.h                      |    0
 models/mob/MobState.h                         |    0
 models/mob/MobStats.cpp                       |    0
 models/mob/MobStats.h                         |    0
 models/mob/MobStatsType.h                     |    0
 models/mob/PrimaryStatsPoints.cpp             |    0
 models/mob/PrimaryStatsPoints.h               |    0
 models/mob/SecondaryStatsPoints.cpp           |    0
 models/mob/SecondaryStatsPoints.h             |    0
 models/mob/_mob_category.h                    |    0
 models/mob/_mob_list.h                        |    0
 models/mob/monster/Drop.h                     |    0
 models/mob/monster/LootTable.h                |    0
 models/mob/monster/Monster.h                  |    0
 models/mob/monster/MonsterStats.h             |    0
 models/mob/player/Backpack.h                  |    0
 models/mob/player/Guild.h                     |    0
 models/mob/player/LootFilter.h                |    0
 models/mob/player/Player.cpp                  |    0
 models/mob/player/Player.h                    |    0
 models/mob/player/PlayerStats.h               |    0
 models/mob/player/PlayerXPRequirement.h       |    0
 models/mob/player/Reputation.h                |    0
 models/mob/player/_player_class.h             |    0
 models/mob/skill/AoeDistribution.h            |    0
 models/mob/skill/AoeShape.h                   |    0
 models/mob/skill/ProjectileDistribution.h     |    0
 models/mob/skill/Skill.h                      |    0
 models/mob/skill/SkillLocation.h              |    0
 models/mob/skill/StatsTarget.h                |    0
 models/mob/skill/definitions/arcane_bolt.cfg  |    0
 models/mob/skill/definitions/arise.cfg        |    0
 models/mob/skill/definitions/back_fist.cfg    |    0
 models/mob/skill/definitions/beam.cfg         |    0
 models/mob/skill/definitions/black_fist.cfg   |    0
 models/mob/skill/definitions/chain.cfg        |    0
 .../mob/skill/definitions/chain_lightning.cfg |    0
 .../mob/skill/definitions/corruption_bolt.cfg |    0
 models/mob/skill/definitions/cyclone.cfg      |    0
 models/mob/skill/definitions/dodge.cfg        |    0
 models/mob/skill/definitions/earth_bolt.cfg   |    0
 .../mob/skill/definitions/elemental_pilar.cfg |    0
 models/mob/skill/definitions/fear.cfg         |    0
 models/mob/skill/definitions/fire_bolt.cfg    |    0
 models/mob/skill/definitions/frost_bolt.cfg   |    0
 models/mob/skill/definitions/ghost_walk.cfg   |    0
 .../skill/definitions/health_inverse_dmg.cfg  |    0
 .../mob/skill/definitions/health_to_dmg.cfg   |    0
 models/mob/skill/definitions/holy_bolt.cfg    |    0
 models/mob/skill/definitions/hook.cfg         |    0
 models/mob/skill/definitions/kick.cfg         |    0
 .../mob/skill/definitions/launch_strike.cfg   |    0
 .../mob/skill/definitions/lightning_bolt.cfg  |    0
 .../mob/skill/definitions/meteor_strike.cfg   |    0
 models/mob/skill/definitions/mirage.cfg       |    0
 models/mob/skill/definitions/net.cfg          |    0
 models/mob/skill/definitions/palm_strike.cfg  |    0
 .../mob/skill/definitions/poison_strike.cfg   |    0
 models/mob/skill/definitions/pull.cfg         |    0
 models/mob/skill/definitions/punch.cfg        |    0
 models/mob/skill/definitions/push.cfg         |    0
 models/mob/skill/definitions/reflect.cfg      |    0
 models/mob/skill/definitions/revive.cfg       |    0
 models/mob/skill/definitions/root.cfg         |    0
 models/mob/skill/definitions/sacrafice.cfg    |    0
 models/mob/skill/definitions/shield.cfg       |    0
 models/mob/skill/definitions/side_kick.cfg    |    0
 models/mob/skill/definitions/spikes.cfg       |    0
 models/mob/skill/definitions/sprint.cfg       |    0
 models/mob/skill/definitions/stomp.cfg        |    0
 models/mob/skill/definitions/summon.cfg       |    0
 models/mob/skill/definitions/sword_dance.cfg  |    0
 models/mob/skill/definitions/teleport.cfg     |    0
 models/mob/skill/definitions/totem.cfg        |    0
 models/mob/skill/definitions/uppercut.cfg     |    0
 models/mob/skill/definitions/whirlwind.cfg    |    0
 models/mob/skill/definitions/wind_slashes.cfg |    0
 models/mob/skill/modifiers/split_shot.cfg     |    0
 models/object/Block.cpp                       |    0
 models/object/Block.h                         |    0
 models/object/Chunk.h                         |    0
 models/object/Cube.h                          |    0
 models/object/Object.h                        |    0
 models/object/ObjectType.h                    |    0
 models/object/_object_list.h                  |    0
 models/object/_object_types.h                 |    0
 models/settings/DungeonSettings.h             |    0
 models/settings/ItemDistributionType.h        |    0
 models/settings/Settings.h                    |    0
 models/settings/setting_types.h               |    0
 module/Module.h                               |    0
 module/ModuleManager.h                        |    0
 network/Server.h                              |    0
 network/Socket.h                              |    0
 network/SocketConnection.h                    |    0
 network/packet/OMSPacket.h                    |    0
 network/packet/PacketCache.h                  |    0
 network/packet/PacketHeader.h                 |    0
 network/packet/UDPPacket.h                    |    0
 network/packet/chat/ChatMessagePacket.h       |    0
 network/packet/general/AckPacket.h            |    0
 network/packet/general/PingPacket.h           |    0
 network/packet/mob/MobInfoPacket.h            |    0
 network/packet/mob/MobStatePacket.h           |    0
 network/packet/mob/player/PlayerInfoPacket.h  |    0
 network/packet/mob/player/PlayerState.h       |    0
 network/packet/packet_types.h                 |    0
 noise/FractalNoise.h                          |    0
 noise/PerlinNoise.h                           |    0
 noise/SimplexNoise.h                          |    0
 noise/ValueNoise.h                            |    0
 noise/WorleyNoise.h                           |    0
 object/Animation.h                            |    0
 object/Hitbox.h                               |    0
 object/Material.h                             |    0
 object/Materials.md                           |    0
 object/Mesh.h                                 |    0
 object/Model.h                                |    0
 object/Texture.h                              |    0
 object/Vertex.h                               |    0
 particle/Particle.h                           |    0
 pathfinding/Jpsp.h                            |    0
 pathfinding/Metric2d.h                        |    0
 pathfinding/Metric3d.h                        |    0
 pathfinding/Path.h                            |    0
 pathfinding/jps/Jps.h                         |    0
 pathfinding/jps/JpsGrid.h                     |    0
 pathfinding/jps/JpsNode.h                     |    0
 platform/linux/Allocator.h                    |    0
 platform/linux/ExceptionHandler.h             |    0
 platform/linux/FileUtils.cpp                  |    0
 platform/linux/Library.cpp                    |    0
 platform/linux/Library.h                      |    0
 platform/linux/SystemInfo.cpp                 |    0
 platform/linux/TimeUtils.h                    |    6 +-
 platform/linux/UtilsLinux.h                   |    0
 platform/linux/network/Server.h               |    0
 platform/linux/network/Socket.h               |    0
 platform/linux/threading/Atomic.h             |    0
 platform/linux/threading/Semaphore.h          |    0
 platform/linux/threading/Spinlock.cpp         |    0
 platform/linux/threading/Spinlock.h           |    0
 platform/linux/threading/Thread.h             |    0
 platform/linux/threading/ThreadDefines.h      |    0
 platform/win32/Allocator.h                    |    0
 platform/win32/Clipboard.h                    |    0
 platform/win32/ExceptionHandler.h             |    0
 platform/win32/FastPipes.h                    |    0
 platform/win32/FileUtils.cpp                  |    0
 platform/win32/LeanWin32.h                    |    0
 platform/win32/Library.cpp                    |    0
 platform/win32/Library.h                      |    0
 platform/win32/SystemInfo.cpp                 |    0
 platform/win32/TimeUtils.h                    |    0
 platform/win32/UtilsWin32.h                   |    0
 platform/win32/UtilsWindows.h                 |    0
 platform/win32/Window.h                       |    0
 platform/win32/audio/DirectSound.h            |    0
 platform/win32/audio/Wasapi.h                 |    0
 platform/win32/audio/XAudio2.h                |    0
 platform/win32/input/DirectInput.h            |    0
 platform/win32/input/HidInput.h               |    0
 platform/win32/input/RawInput.h               |    0
 platform/win32/input/XInput.h                 |    0
 .../input/controller/ControllerHandler.h      |    0
 platform/win32/input/controller/DualSense.h   |    0
 platform/win32/input/controller/DualShock4.h  |    0
 platform/win32/input/controller/XBoxS.h       |    0
 platform/win32/network/Client.h               |    0
 platform/win32/network/Server.h               |    0
 platform/win32/network/Socket.h               |    0
 platform/win32/threading/Atomic.h             |    0
 platform/win32/threading/Semaphore.h          |    0
 platform/win32/threading/Spinlock.cpp         |    0
 platform/win32/threading/Spinlock.h           |    0
 platform/win32/threading/Thread.h             |    0
 platform/win32/threading/ThreadDefines.h      |    0
 render/liquid.cpp                             |    0
 render/mob.cpp                                |    0
 render/object.cpp                             |    0
 render/sky.cpp                                |    0
 render/text.cpp                               |    0
 scene/SceneInfo.h                             |    0
 shaders/liquids/lava.hlsl                     |    0
 shaders/liquids/water/cube_fragment.hlsl      |    0
 shaders/liquids/water/cube_vertex.hlsl        |    0
 shaders/liquids/water/helper.hlsli            |    0
 shaders/liquids/water/sphere_fragment.hlsl    |    0
 shaders/liquids/water/sphere_vertex.hlsl      |    0
 .../liquids/water/water_above_fragment.hlsl   |    0
 .../liquids/water/water_below_fragment.hlsl   |    0
 .../water/water_caustics_fragment.hlsl        |    0
 .../liquids/water/water_caustics_vertex.hlsl  |    0
 shaders/liquids/water/water_vertex.hlsl       |    0
 shaders/nature/cloud.hlsl                     |    0
 shaders/nature/fire.hlsl                      |    0
 shaders/nature/fog.hlsl                       |    0
 shaders/nature/godray.hlsl                    |    0
 shaders/nature/lightning.hlsl                 |    0
 shaders/nature/rain.hlsl                      |    0
 shaders/nature/smoke.hlsl                     |    0
 shaders/nature/snow.hlsl                      |    0
 shaders/shaders.hlsl                          |    0
 sort/BinarySearch.h                           |    0
 sort/EytzingerSearch.h                        |    0
 sort/HeapSort.h                               |    0
 sort/InsertionSort.h                          |    0
 sort/IntroSort.h                              |    0
 sort/QuickSort.h                              |    0
 sort/Sort.h                                   |    0
 stdlib/HashMap.h                              |    0
 stdlib/PerfectHashMap.h                       |    0
 stdlib/Simd.h                                 |    8 +-
 stdlib/ThreadedHashMap.h                      |    0
 stdlib/Types.h                                |    0
 system/Allocator.h                            |    0
 system/FileUtils.cpp                          |    0
 system/Library.cpp                            |    0
 system/Library.h                              |    0
 system/SystemInfo.cpp                         |    0
 system/SystemInfo.h                           |    0
 system/Window.h                               |    0
 tests.bat                                     |    0
 tests/.vscode/c_cpp_properties.json           |    0
 tests/.vscode/launch.json                     |    0
 tests/.vscode/settings.json                   |    0
 tests/.vscode/tasks.json                      |    0
 tests/MainTest.cpp                            |    0
 tests/TestFramework.h                         |    0
 tests/math/EvaluatorTest.cpp                  |    0
 tests/memory/ChunkMemoryTest.cpp              |    0
 tests/memory/RingMemoryTest.cpp               |    0
 tests/stdlib/HashMapTest.cpp                  |    0
 tests/ui/UILayoutTest.cpp                     |    0
 tests/ui/UIThemeTest.cpp                      |    0
 tests/utils/BitUtilsTest.cpp                  |    0
 tests/utils/EndianUtilsTest.cpp               |    0
 tests/utils/MathUtilsTest.cpp                 |    0
 tests/utils/StringUtilsTest.cpp               |    0
 tests/utils/UtilsTest.cpp                     |    0
 tests_iter.bat                                |    0
 thread/Atomic.h                               |    0
 thread/Semaphore.h                            |    0
 thread/Spinlock.cpp                           |    0
 thread/Spinlock.h                             |    0
 thread/Thread.h                               |    0
 thread/ThreadDefines.h                        |    0
 thread/ThreadJob.h                            |    0
 thread/ThreadPool.h                           |    0
 ui/UIAlignment.h                              |    0
 ui/UIAnimation.h                              |    0
 ui/UIButton.h                                 |    0
 ui/UICursor.h                                 |    0
 ui/UICustom.h                                 |    0
 ui/UIElement.h                                |    0
 ui/UIElementType.h                            |    0
 ui/UIImage.h                                  |    0
 ui/UIInput.h                                  |    0
 ui/UILabel.h                                  |    0
 ui/UILayout.cpp                               |    0
 ui/UILayout.h                                 |    0
 ui/UILink.h                                   |    0
 ui/UIPanel.h                                  |    0
 ui/UISelect.h                                 |    0
 ui/UIStyleType.h                              |    0
 ui/UITab.h                                    |    0
 ui/UITable.h                                  |    0
 ui/UIText.h                                   |    0
 ui/UITextarea.h                               |    0
 ui/UITheme.h                                  |    0
 ui/UIWindow.h                                 |    0
 ui/attribute/UIAttribute.h                    |    0
 ui/attribute/UIAttributeBackground.h          |    0
 ui/attribute/UIAttributeBorder.h              |    0
 ui/attribute/UIAttributeDimension.h           |    0
 ui/attribute/UIAttributeFont.h                |    0
 ui/attribute/UIAttributeShadow.h              |    0
 ui/attribute/UIAttributeType.h                |    0
 utils/BitUtils.h                              |    0
 utils/EndianUtils.h                           |    0
 utils/MathUtils.h                             |    0
 utils/PerformanceProfiler.h                   |    0
 utils/RandomUtils.h                           |    0
 utils/StringUtils.h                           |   26 +-
 utils/TestUtils.h                             |    0
 utils/TimeUtils.h                             |    0
 utils/Utils.h                                 |    0
 485 files changed, 5108 insertions(+), 4628 deletions(-)
 mode change 100644 => 100755 .github/workflows/codeql.yml
 mode change 100644 => 100755 .github/workflows/msvc.yml
 mode change 100644 => 100755 .gitignore
 mode change 100644 => 100755 Guidelines.md
 mode change 100644 => 100755 README.md
 mode change 100644 => 100755 animation/Animation.h
 mode change 100644 => 100755 animation/AnimationEaseType.h
 mode change 100644 => 100755 architecture/CpuInfo.cpp
 mode change 100644 => 100755 architecture/CpuInfo.h
 mode change 100644 => 100755 architecture/Intrinsics.h
 mode change 100644 => 100755 architecture/arm/CpuInfo.cpp
 mode change 100644 => 100755 architecture/arm/Intrinsics.h
 mode change 100644 => 100755 architecture/arm/neon/utils/Utils.h
 mode change 100644 => 100755 architecture/arm/sve/utils/Utils.h
 mode change 100644 => 100755 architecture/x86/CpuInfo.cpp
 mode change 100644 => 100755 architecture/x86/Intrinsics.h
 mode change 100644 => 100755 architecture/x86/simd/SIMD_F32.h
 create mode 100644 architecture/x86/simd/SIMD_F32_AVX2.h
 create mode 100644 architecture/x86/simd/SIMD_F32_AVX512.h
 create mode 100644 architecture/x86/simd/SIMD_F32_SSE.h
 mode change 100644 => 100755 architecture/x86/simd/SIMD_F64.h
 create mode 100644 architecture/x86/simd/SIMD_F64_AVX2.h
 create mode 100644 architecture/x86/simd/SIMD_F64_AVX512.h
 create mode 100644 architecture/x86/simd/SIMD_F64_SSE.h
 mode change 100644 => 100755 architecture/x86/simd/SIMD_I16.h
 create mode 100644 architecture/x86/simd/SIMD_I16_AVX2.h
 create mode 100644 architecture/x86/simd/SIMD_I16_AVX512.h
 create mode 100644 architecture/x86/simd/SIMD_I16_SSE.h
 mode change 100644 => 100755 architecture/x86/simd/SIMD_I32.h
 create mode 100644 architecture/x86/simd/SIMD_I32_AVX2.h
 create mode 100644 architecture/x86/simd/SIMD_I32_AVX512.h
 create mode 100644 architecture/x86/simd/SIMD_I32_SSE.h
 mode change 100644 => 100755 architecture/x86/simd/SIMD_I64.h
 create mode 100644 architecture/x86/simd/SIMD_I64_AVX2.h
 create mode 100644 architecture/x86/simd/SIMD_I64_AVX512.h
 create mode 100644 architecture/x86/simd/SIMD_I64_SSE.h
 mode change 100644 => 100755 architecture/x86/simd/SIMD_I8.h
 create mode 100644 architecture/x86/simd/SIMD_I8_AVX2.h
 create mode 100644 architecture/x86/simd/SIMD_I8_AVX512.h
 create mode 100644 architecture/x86/simd/SIMD_I8_SSE.h
 create mode 100644 architecture/x86/simd/SIMD_SVML_AVX2.h
 create mode 100755 architecture/x86/simd/SIMD_SVML_AVX512.h
 create mode 100644 architecture/x86/simd/SIMD_SVML_SSE.h
 mode change 100644 => 100755 architecture/x86/simd/utils/Utils.h
 mode change 100644 => 100755 asset/Asset.h
 mode change 100644 => 100755 asset/AssetArchive.h
 mode change 100644 => 100755 asset/AssetManagementSystem.h
 mode change 100644 => 100755 asset/AssetType.h
 mode change 100644 => 100755 audio/Audio.cpp
 mode change 100644 => 100755 audio/Audio.h
 mode change 100644 => 100755 audio/AudioMixer.h
 mode change 100644 => 100755 audio/AudioSetting.h
 mode change 100644 => 100755 audio/Qoa.h
 mode change 100644 => 100755 audio/QoaSimd.h
 mode change 100644 => 100755 audio/Wav.h
 mode change 100644 => 100755 auth/Auth.h
 mode change 100644 => 100755 camera/Camera.h
 mode change 100644 => 100755 camera/CameraMovement.h
 mode change 100644 => 100755 color/ColorVisionDeficiency.h
 mode change 100644 => 100755 command/AppCmdBuffer.cpp
 mode change 100644 => 100755 command/AppCmdBuffer.h
 mode change 100644 => 100755 command/Command.h
 mode change 100644 => 100755 compiler/CompilerUtils.h
 mode change 100644 => 100755 compiler/gcc/Atomic.h
 mode change 100644 => 100755 compiler/gcc/CompilerUtils.h
 mode change 100644 => 100755 compiler/msvc/CompilerUtils.h
 mode change 100644 => 100755 compression/Huffman.h
 mode change 100644 => 100755 compression/LZP.h
 mode change 100644 => 100755 compression/RLE.h
 mode change 100644 => 100755 database/Database.h
 mode change 100644 => 100755 database/DatabaseConnection.h
 mode change 100644 => 100755 database/DatabaseType.h
 mode change 100644 => 100755 encryption/CeasarEncryption.h
 mode change 100644 => 100755 encryption/XorEncryption.h
 mode change 100644 => 100755 entity/AnimationEntityComponent.h
 mode change 100644 => 100755 entity/CursorEntity.h
 mode change 100644 => 100755 entity/Entity.h
 mode change 100644 => 100755 entity/EntityComponentSystem.h
 mode change 100644 => 100755 entity/EntitySize.h
 mode change 100644 => 100755 environment/Globe.h
 mode change 100644 => 100755 environment/Universe.h
 mode change 100644 => 100755 error/HammingCodes.h
 mode change 100644 => 100755 font/Font.h
 mode change 100644 => 100755 font/font_characters.txt
 mode change 100644 => 100755 gpuapi/AntiAliasing.h
 mode change 100644 => 100755 gpuapi/GpuApiType.h
 mode change 100644 => 100755 gpuapi/GpuAttributeType.h
 mode change 100644 => 100755 gpuapi/RenderUtils.h
 mode change 100644 => 100755 gpuapi/ShaderType.h
 mode change 100644 => 100755 gpuapi/direct3d/AppCmdBuffer.h
 mode change 100644 => 100755 gpuapi/direct3d/DirectXUtils.h
 mode change 100644 => 100755 gpuapi/direct3d/FramesInFlightContainer.h
 mode change 100644 => 100755 gpuapi/direct3d/Shader.h
 mode change 100644 => 100755 gpuapi/direct3d/ShaderUtils.h
 mode change 100644 => 100755 gpuapi/opengl/AppCmdBuffer.h
 mode change 100644 => 100755 gpuapi/opengl/FramesInFlightContainer.h
 mode change 100644 => 100755 gpuapi/opengl/Opengl.h
 mode change 100644 => 100755 gpuapi/opengl/OpenglDefines.h
 mode change 100644 => 100755 gpuapi/opengl/OpenglDescriptorSetLayoutBinding.h
 mode change 100644 => 100755 gpuapi/opengl/OpenglLinux.h
 mode change 100644 => 100755 gpuapi/opengl/OpenglUtils.h
 mode change 100644 => 100755 gpuapi/opengl/OpenglWin32.h
 mode change 100644 => 100755 gpuapi/opengl/Shader.h
 mode change 100644 => 100755 gpuapi/opengl/ShaderUtils.h
 mode change 100644 => 100755 gpuapi/vulkan/AppCmdBuffer.h
 mode change 100644 => 100755 gpuapi/vulkan/FramesInFlightContainer.h
 mode change 100644 => 100755 gpuapi/vulkan/Shader.h
 mode change 100644 => 100755 gpuapi/vulkan/ShaderUtils.h
 mode change 100644 => 100755 gpuapi/vulkan/VulkanUtils.h
 mode change 100644 => 100755 hash/Crc.h
 mode change 100644 => 100755 hash/GeneralHash.h
 mode change 100644 => 100755 html/template/HtmlTemplate.h
 mode change 100644 => 100755 html/template/HtmlTemplateCache.h
 mode change 100644 => 100755 html/template/HtmlTemplateContext.h
 mode change 100644 => 100755 html/template/HtmlTemplateInterpreter.h
 mode change 100644 => 100755 html/template/HtmlTemplateLexer.h
 mode change 100644 => 100755 html/template/HtmlTemplateParser.h
 mode change 100644 => 100755 image/Bitmap.h
 mode change 100644 => 100755 image/Image.cpp
 mode change 100644 => 100755 image/Image.h
 mode change 100644 => 100755 image/Png.h
 mode change 100644 => 100755 image/Qoi.h
 mode change 100644 => 100755 image/Tga.h
 mode change 100644 => 100755 image/default_colors.h
 mode change 100644 => 100755 image/default_colors.htm
 mode change 100644 => 100755 image/stb_image.h
 mode change 100644 => 100755 input/ControllerInput.h
 mode change 100644 => 100755 input/ControllerType.h
 mode change 100644 => 100755 input/Input.h
 mode change 100644 => 100755 input/InputConnectionType.h
 mode change 100644 => 100755 light/Material.h
 mode change 100644 => 100755 localization/Dialog.h
 mode change 100644 => 100755 localization/Language.h
 mode change 100644 => 100755 log/DebugContainer.h
 mode change 100644 => 100755 log/DebugMemory.h
 mode change 100644 => 100755 log/Log.h
 mode change 100644 => 100755 log/PerformanceProfiler.h
 mode change 100644 => 100755 log/Stats.h
 mode change 100644 => 100755 math/Evaluator.h
 mode change 100644 => 100755 math/PerlinNoise.h
 mode change 100644 => 100755 math/matrix/MatrixFloat32.h
 mode change 100644 => 100755 math/matrix/MatrixInt32.h
 mode change 100644 => 100755 math/matrix/MatrixInt64.h
 mode change 100644 => 100755 math/matrix/QuaternionFloat32.h
 mode change 100644 => 100755 math/matrix/VectorFloat32.h
 mode change 100644 => 100755 math/matrix/VectorFloat64.h
 mode change 100644 => 100755 math/matrix/VectorInt32.h
 mode change 100644 => 100755 math/matrix/VectorInt64.h
 mode change 100644 => 100755 math/random/BlueNoise.h
 mode change 100644 => 100755 memory/BufferMemory.h
 mode change 100644 => 100755 memory/ChunkMemory.h
 mode change 100644 => 100755 memory/Heap.h
 mode change 100644 => 100755 memory/Queue.h
 mode change 100644 => 100755 memory/RingMemory.h
 mode change 100644 => 100755 memory/ThreadedChunkMemory.h
 mode change 100644 => 100755 memory/ThreadedQueue.h
 mode change 100644 => 100755 memory/ThreadedRingMemory.h
 mode change 100644 => 100755 models/Colors.h
 mode change 100644 => 100755 models/Location.h
 mode change 100644 => 100755 models/Map/map_chunks.h
 mode change 100644 => 100755 models/Obj.h
 mode change 100644 => 100755 models/Sound.h
 mode change 100644 => 100755 models/account/Account.h
 mode change 100644 => 100755 models/bracket/Bracket.h
 mode change 100644 => 100755 models/bracket/BracketMatch.h
 mode change 100644 => 100755 models/bracket/BracketSeeding.h
 mode change 100644 => 100755 models/bracket/BracketTeam.h
 mode change 100644 => 100755 models/chat/Chat.h
 mode change 100644 => 100755 models/chat/ChatLevel.h
 mode change 100644 => 100755 models/chat/ChatStatus.h
 mode change 100644 => 100755 models/chat/ChatType.h
 mode change 100644 => 100755 models/event/Event.h
 mode change 100644 => 100755 models/event/EventTaskType.h
 mode change 100644 => 100755 models/event/tmp
 mode change 100644 => 100755 models/extension/ExtensionType.h
 mode change 100644 => 100755 models/guild/GuildBanner.h
 mode change 100644 => 100755 models/item/Consumable.h
 mode change 100644 => 100755 models/item/ConsumableType.h
 mode change 100644 => 100755 models/item/Equipment.cpp
 mode change 100644 => 100755 models/item/Equipment.h
 mode change 100644 => 100755 models/item/EquipmentType.h
 mode change 100644 => 100755 models/item/Item.h
 mode change 100644 => 100755 models/item/ItemAffixDistribution.h
 mode change 100644 => 100755 models/item/ItemLevelStats.h
 mode change 100644 => 100755 models/item/ItemRarityDefinition.h
 mode change 100644 => 100755 models/item/ItemRarityStats.h
 mode change 100644 => 100755 models/item/ItemStatsDistribution.h
 mode change 100644 => 100755 models/item/MobLevelStats.h
 mode change 100644 => 100755 models/item/_equipment_slots.h
 mode change 100644 => 100755 models/item/_equipment_types.h
 mode change 100644 => 100755 models/item/_item_rarity.h
 mode change 100644 => 100755 models/map.h
 mode change 100644 => 100755 models/mob/ActivityStats.h
 mode change 100644 => 100755 models/mob/FixedStats.h
 mode change 100644 => 100755 models/mob/Mob.cpp
 mode change 100644 => 100755 models/mob/Mob.h
 mode change 100644 => 100755 models/mob/MobAction.h
 mode change 100644 => 100755 models/mob/MobCategory.h
 mode change 100644 => 100755 models/mob/MobState.h
 mode change 100644 => 100755 models/mob/MobStats.cpp
 mode change 100644 => 100755 models/mob/MobStats.h
 mode change 100644 => 100755 models/mob/MobStatsType.h
 mode change 100644 => 100755 models/mob/PrimaryStatsPoints.cpp
 mode change 100644 => 100755 models/mob/PrimaryStatsPoints.h
 mode change 100644 => 100755 models/mob/SecondaryStatsPoints.cpp
 mode change 100644 => 100755 models/mob/SecondaryStatsPoints.h
 mode change 100644 => 100755 models/mob/_mob_category.h
 mode change 100644 => 100755 models/mob/_mob_list.h
 mode change 100644 => 100755 models/mob/monster/Drop.h
 mode change 100644 => 100755 models/mob/monster/LootTable.h
 mode change 100644 => 100755 models/mob/monster/Monster.h
 mode change 100644 => 100755 models/mob/monster/MonsterStats.h
 mode change 100644 => 100755 models/mob/player/Backpack.h
 mode change 100644 => 100755 models/mob/player/Guild.h
 mode change 100644 => 100755 models/mob/player/LootFilter.h
 mode change 100644 => 100755 models/mob/player/Player.cpp
 mode change 100644 => 100755 models/mob/player/Player.h
 mode change 100644 => 100755 models/mob/player/PlayerStats.h
 mode change 100644 => 100755 models/mob/player/PlayerXPRequirement.h
 mode change 100644 => 100755 models/mob/player/Reputation.h
 mode change 100644 => 100755 models/mob/player/_player_class.h
 mode change 100644 => 100755 models/mob/skill/AoeDistribution.h
 mode change 100644 => 100755 models/mob/skill/AoeShape.h
 mode change 100644 => 100755 models/mob/skill/ProjectileDistribution.h
 mode change 100644 => 100755 models/mob/skill/Skill.h
 mode change 100644 => 100755 models/mob/skill/SkillLocation.h
 mode change 100644 => 100755 models/mob/skill/StatsTarget.h
 mode change 100644 => 100755 models/mob/skill/definitions/arcane_bolt.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/arise.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/back_fist.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/beam.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/black_fist.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/chain.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/chain_lightning.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/corruption_bolt.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/cyclone.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/dodge.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/earth_bolt.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/elemental_pilar.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/fear.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/fire_bolt.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/frost_bolt.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/ghost_walk.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/health_inverse_dmg.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/health_to_dmg.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/holy_bolt.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/hook.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/kick.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/launch_strike.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/lightning_bolt.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/meteor_strike.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/mirage.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/net.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/palm_strike.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/poison_strike.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/pull.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/punch.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/push.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/reflect.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/revive.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/root.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/sacrafice.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/shield.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/side_kick.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/spikes.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/sprint.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/stomp.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/summon.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/sword_dance.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/teleport.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/totem.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/uppercut.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/whirlwind.cfg
 mode change 100644 => 100755 models/mob/skill/definitions/wind_slashes.cfg
 mode change 100644 => 100755 models/mob/skill/modifiers/split_shot.cfg
 mode change 100644 => 100755 models/object/Block.cpp
 mode change 100644 => 100755 models/object/Block.h
 mode change 100644 => 100755 models/object/Chunk.h
 mode change 100644 => 100755 models/object/Cube.h
 mode change 100644 => 100755 models/object/Object.h
 mode change 100644 => 100755 models/object/ObjectType.h
 mode change 100644 => 100755 models/object/_object_list.h
 mode change 100644 => 100755 models/object/_object_types.h
 mode change 100644 => 100755 models/settings/DungeonSettings.h
 mode change 100644 => 100755 models/settings/ItemDistributionType.h
 mode change 100644 => 100755 models/settings/Settings.h
 mode change 100644 => 100755 models/settings/setting_types.h
 mode change 100644 => 100755 module/Module.h
 mode change 100644 => 100755 module/ModuleManager.h
 mode change 100644 => 100755 network/Server.h
 mode change 100644 => 100755 network/Socket.h
 mode change 100644 => 100755 network/SocketConnection.h
 mode change 100644 => 100755 network/packet/OMSPacket.h
 mode change 100644 => 100755 network/packet/PacketCache.h
 mode change 100644 => 100755 network/packet/PacketHeader.h
 mode change 100644 => 100755 network/packet/UDPPacket.h
 mode change 100644 => 100755 network/packet/chat/ChatMessagePacket.h
 mode change 100644 => 100755 network/packet/general/AckPacket.h
 mode change 100644 => 100755 network/packet/general/PingPacket.h
 mode change 100644 => 100755 network/packet/mob/MobInfoPacket.h
 mode change 100644 => 100755 network/packet/mob/MobStatePacket.h
 mode change 100644 => 100755 network/packet/mob/player/PlayerInfoPacket.h
 mode change 100644 => 100755 network/packet/mob/player/PlayerState.h
 mode change 100644 => 100755 network/packet/packet_types.h
 mode change 100644 => 100755 noise/FractalNoise.h
 mode change 100644 => 100755 noise/PerlinNoise.h
 mode change 100644 => 100755 noise/SimplexNoise.h
 mode change 100644 => 100755 noise/ValueNoise.h
 mode change 100644 => 100755 noise/WorleyNoise.h
 mode change 100644 => 100755 object/Animation.h
 mode change 100644 => 100755 object/Hitbox.h
 mode change 100644 => 100755 object/Material.h
 mode change 100644 => 100755 object/Materials.md
 mode change 100644 => 100755 object/Mesh.h
 mode change 100644 => 100755 object/Model.h
 mode change 100644 => 100755 object/Texture.h
 mode change 100644 => 100755 object/Vertex.h
 mode change 100644 => 100755 particle/Particle.h
 mode change 100644 => 100755 pathfinding/Jpsp.h
 mode change 100644 => 100755 pathfinding/Metric2d.h
 mode change 100644 => 100755 pathfinding/Metric3d.h
 mode change 100644 => 100755 pathfinding/Path.h
 mode change 100644 => 100755 pathfinding/jps/Jps.h
 mode change 100644 => 100755 pathfinding/jps/JpsGrid.h
 mode change 100644 => 100755 pathfinding/jps/JpsNode.h
 mode change 100644 => 100755 platform/linux/Allocator.h
 mode change 100644 => 100755 platform/linux/ExceptionHandler.h
 mode change 100644 => 100755 platform/linux/FileUtils.cpp
 mode change 100644 => 100755 platform/linux/Library.cpp
 mode change 100644 => 100755 platform/linux/Library.h
 mode change 100644 => 100755 platform/linux/SystemInfo.cpp
 mode change 100644 => 100755 platform/linux/TimeUtils.h
 mode change 100644 => 100755 platform/linux/UtilsLinux.h
 mode change 100644 => 100755 platform/linux/network/Server.h
 mode change 100644 => 100755 platform/linux/network/Socket.h
 mode change 100644 => 100755 platform/linux/threading/Atomic.h
 mode change 100644 => 100755 platform/linux/threading/Semaphore.h
 mode change 100644 => 100755 platform/linux/threading/Spinlock.cpp
 mode change 100644 => 100755 platform/linux/threading/Spinlock.h
 mode change 100644 => 100755 platform/linux/threading/Thread.h
 mode change 100644 => 100755 platform/linux/threading/ThreadDefines.h
 mode change 100644 => 100755 platform/win32/Allocator.h
 mode change 100644 => 100755 platform/win32/Clipboard.h
 mode change 100644 => 100755 platform/win32/ExceptionHandler.h
 mode change 100644 => 100755 platform/win32/FastPipes.h
 mode change 100644 => 100755 platform/win32/FileUtils.cpp
 mode change 100644 => 100755 platform/win32/LeanWin32.h
 mode change 100644 => 100755 platform/win32/Library.cpp
 mode change 100644 => 100755 platform/win32/Library.h
 mode change 100644 => 100755 platform/win32/SystemInfo.cpp
 mode change 100644 => 100755 platform/win32/TimeUtils.h
 mode change 100644 => 100755 platform/win32/UtilsWin32.h
 mode change 100644 => 100755 platform/win32/UtilsWindows.h
 mode change 100644 => 100755 platform/win32/Window.h
 mode change 100644 => 100755 platform/win32/audio/DirectSound.h
 mode change 100644 => 100755 platform/win32/audio/Wasapi.h
 mode change 100644 => 100755 platform/win32/audio/XAudio2.h
 mode change 100644 => 100755 platform/win32/input/DirectInput.h
 mode change 100644 => 100755 platform/win32/input/HidInput.h
 mode change 100644 => 100755 platform/win32/input/RawInput.h
 mode change 100644 => 100755 platform/win32/input/XInput.h
 mode change 100644 => 100755 platform/win32/input/controller/ControllerHandler.h
 mode change 100644 => 100755 platform/win32/input/controller/DualSense.h
 mode change 100644 => 100755 platform/win32/input/controller/DualShock4.h
 mode change 100644 => 100755 platform/win32/input/controller/XBoxS.h
 mode change 100644 => 100755 platform/win32/network/Client.h
 mode change 100644 => 100755 platform/win32/network/Server.h
 mode change 100644 => 100755 platform/win32/network/Socket.h
 mode change 100644 => 100755 platform/win32/threading/Atomic.h
 mode change 100644 => 100755 platform/win32/threading/Semaphore.h
 mode change 100644 => 100755 platform/win32/threading/Spinlock.cpp
 mode change 100644 => 100755 platform/win32/threading/Spinlock.h
 mode change 100644 => 100755 platform/win32/threading/Thread.h
 mode change 100644 => 100755 platform/win32/threading/ThreadDefines.h
 mode change 100644 => 100755 render/liquid.cpp
 mode change 100644 => 100755 render/mob.cpp
 mode change 100644 => 100755 render/object.cpp
 mode change 100644 => 100755 render/sky.cpp
 mode change 100644 => 100755 render/text.cpp
 mode change 100644 => 100755 scene/SceneInfo.h
 mode change 100644 => 100755 shaders/liquids/lava.hlsl
 mode change 100644 => 100755 shaders/liquids/water/cube_fragment.hlsl
 mode change 100644 => 100755 shaders/liquids/water/cube_vertex.hlsl
 mode change 100644 => 100755 shaders/liquids/water/helper.hlsli
 mode change 100644 => 100755 shaders/liquids/water/sphere_fragment.hlsl
 mode change 100644 => 100755 shaders/liquids/water/sphere_vertex.hlsl
 mode change 100644 => 100755 shaders/liquids/water/water_above_fragment.hlsl
 mode change 100644 => 100755 shaders/liquids/water/water_below_fragment.hlsl
 mode change 100644 => 100755 shaders/liquids/water/water_caustics_fragment.hlsl
 mode change 100644 => 100755 shaders/liquids/water/water_caustics_vertex.hlsl
 mode change 100644 => 100755 shaders/liquids/water/water_vertex.hlsl
 mode change 100644 => 100755 shaders/nature/cloud.hlsl
 mode change 100644 => 100755 shaders/nature/fire.hlsl
 mode change 100644 => 100755 shaders/nature/fog.hlsl
 mode change 100644 => 100755 shaders/nature/godray.hlsl
 mode change 100644 => 100755 shaders/nature/lightning.hlsl
 mode change 100644 => 100755 shaders/nature/rain.hlsl
 mode change 100644 => 100755 shaders/nature/smoke.hlsl
 mode change 100644 => 100755 shaders/nature/snow.hlsl
 mode change 100644 => 100755 shaders/shaders.hlsl
 mode change 100644 => 100755 sort/BinarySearch.h
 mode change 100644 => 100755 sort/EytzingerSearch.h
 mode change 100644 => 100755 sort/HeapSort.h
 mode change 100644 => 100755 sort/InsertionSort.h
 mode change 100644 => 100755 sort/IntroSort.h
 mode change 100644 => 100755 sort/QuickSort.h
 mode change 100644 => 100755 sort/Sort.h
 mode change 100644 => 100755 stdlib/HashMap.h
 mode change 100644 => 100755 stdlib/PerfectHashMap.h
 mode change 100644 => 100755 stdlib/Simd.h
 mode change 100644 => 100755 stdlib/ThreadedHashMap.h
 mode change 100644 => 100755 stdlib/Types.h
 mode change 100644 => 100755 system/Allocator.h
 mode change 100644 => 100755 system/FileUtils.cpp
 mode change 100644 => 100755 system/Library.cpp
 mode change 100644 => 100755 system/Library.h
 mode change 100644 => 100755 system/SystemInfo.cpp
 mode change 100644 => 100755 system/SystemInfo.h
 mode change 100644 => 100755 system/Window.h
 mode change 100644 => 100755 tests.bat
 mode change 100644 => 100755 tests/.vscode/c_cpp_properties.json
 mode change 100644 => 100755 tests/.vscode/launch.json
 mode change 100644 => 100755 tests/.vscode/settings.json
 mode change 100644 => 100755 tests/.vscode/tasks.json
 mode change 100644 => 100755 tests/MainTest.cpp
 mode change 100644 => 100755 tests/TestFramework.h
 mode change 100644 => 100755 tests/math/EvaluatorTest.cpp
 mode change 100644 => 100755 tests/memory/ChunkMemoryTest.cpp
 mode change 100644 => 100755 tests/memory/RingMemoryTest.cpp
 mode change 100644 => 100755 tests/stdlib/HashMapTest.cpp
 mode change 100644 => 100755 tests/ui/UILayoutTest.cpp
 mode change 100644 => 100755 tests/ui/UIThemeTest.cpp
 mode change 100644 => 100755 tests/utils/BitUtilsTest.cpp
 mode change 100644 => 100755 tests/utils/EndianUtilsTest.cpp
 mode change 100644 => 100755 tests/utils/MathUtilsTest.cpp
 mode change 100644 => 100755 tests/utils/StringUtilsTest.cpp
 mode change 100644 => 100755 tests/utils/UtilsTest.cpp
 mode change 100644 => 100755 tests_iter.bat
 mode change 100644 => 100755 thread/Atomic.h
 mode change 100644 => 100755 thread/Semaphore.h
 mode change 100644 => 100755 thread/Spinlock.cpp
 mode change 100644 => 100755 thread/Spinlock.h
 mode change 100644 => 100755 thread/Thread.h
 mode change 100644 => 100755 thread/ThreadDefines.h
 mode change 100644 => 100755 thread/ThreadJob.h
 mode change 100644 => 100755 thread/ThreadPool.h
 mode change 100644 => 100755 ui/UIAlignment.h
 mode change 100644 => 100755 ui/UIAnimation.h
 mode change 100644 => 100755 ui/UIButton.h
 mode change 100644 => 100755 ui/UICursor.h
 mode change 100644 => 100755 ui/UICustom.h
 mode change 100644 => 100755 ui/UIElement.h
 mode change 100644 => 100755 ui/UIElementType.h
 mode change 100644 => 100755 ui/UIImage.h
 mode change 100644 => 100755 ui/UIInput.h
 mode change 100644 => 100755 ui/UILabel.h
 mode change 100644 => 100755 ui/UILayout.cpp
 mode change 100644 => 100755 ui/UILayout.h
 mode change 100644 => 100755 ui/UILink.h
 mode change 100644 => 100755 ui/UIPanel.h
 mode change 100644 => 100755 ui/UISelect.h
 mode change 100644 => 100755 ui/UIStyleType.h
 mode change 100644 => 100755 ui/UITab.h
 mode change 100644 => 100755 ui/UITable.h
 mode change 100644 => 100755 ui/UIText.h
 mode change 100644 => 100755 ui/UITextarea.h
 mode change 100644 => 100755 ui/UITheme.h
 mode change 100644 => 100755 ui/UIWindow.h
 mode change 100644 => 100755 ui/attribute/UIAttribute.h
 mode change 100644 => 100755 ui/attribute/UIAttributeBackground.h
 mode change 100644 => 100755 ui/attribute/UIAttributeBorder.h
 mode change 100644 => 100755 ui/attribute/UIAttributeDimension.h
 mode change 100644 => 100755 ui/attribute/UIAttributeFont.h
 mode change 100644 => 100755 ui/attribute/UIAttributeShadow.h
 mode change 100644 => 100755 ui/attribute/UIAttributeType.h
 mode change 100644 => 100755 utils/BitUtils.h
 mode change 100644 => 100755 utils/EndianUtils.h
 mode change 100644 => 100755 utils/MathUtils.h
 mode change 100644 => 100755 utils/PerformanceProfiler.h
 mode change 100644 => 100755 utils/RandomUtils.h
 mode change 100644 => 100755 utils/StringUtils.h
 mode change 100644 => 100755 utils/TestUtils.h
 mode change 100644 => 100755 utils/TimeUtils.h
 mode change 100644 => 100755 utils/Utils.h

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
old mode 100644
new mode 100755
diff --git a/.github/workflows/msvc.yml b/.github/workflows/msvc.yml
old mode 100644
new mode 100755
diff --git a/.gitignore b/.gitignore
old mode 100644
new mode 100755
diff --git a/Guidelines.md b/Guidelines.md
old mode 100644
new mode 100755
diff --git a/README.md b/README.md
old mode 100644
new mode 100755
diff --git a/animation/Animation.h b/animation/Animation.h
old mode 100644
new mode 100755
diff --git a/animation/AnimationEaseType.h b/animation/AnimationEaseType.h
old mode 100644
new mode 100755
diff --git a/architecture/CpuInfo.cpp b/architecture/CpuInfo.cpp
old mode 100644
new mode 100755
diff --git a/architecture/CpuInfo.h b/architecture/CpuInfo.h
old mode 100644
new mode 100755
diff --git a/architecture/Intrinsics.h b/architecture/Intrinsics.h
old mode 100644
new mode 100755
diff --git a/architecture/arm/CpuInfo.cpp b/architecture/arm/CpuInfo.cpp
old mode 100644
new mode 100755
diff --git a/architecture/arm/Intrinsics.h b/architecture/arm/Intrinsics.h
old mode 100644
new mode 100755
index 08bfb69..b727536
--- a/architecture/arm/Intrinsics.h
+++ b/architecture/arm/Intrinsics.h
@@ -11,6 +11,7 @@
 
 #include <arm_sve.h>
 #include <arm_acle.h>
+#include <arm_neon.h>
 
 #include "../../stdlib/Types.h"
 #include "../../compiler/CompilerUtils.h"
@@ -50,4 +51,7 @@
     #define intrin_timestamp_counter() __builtin_readcyclecounter()
 #endif
 
+// a * b + c
+#define intrin_fmadd(a, b, c) vgetq_lane_f32(vmlaq_f32(vdupq_n_f32(c), vdupq_n_f32(a), vdupq_n_f32(b)), 0)
+
 #endif
\ No newline at end of file
diff --git a/architecture/arm/neon/utils/Utils.h b/architecture/arm/neon/utils/Utils.h
old mode 100644
new mode 100755
diff --git a/architecture/arm/sve/utils/Utils.h b/architecture/arm/sve/utils/Utils.h
old mode 100644
new mode 100755
diff --git a/architecture/x86/CpuInfo.cpp b/architecture/x86/CpuInfo.cpp
old mode 100644
new mode 100755
diff --git a/architecture/x86/Intrinsics.h b/architecture/x86/Intrinsics.h
old mode 100644
new mode 100755
index 20319e9..53f08bb
--- a/architecture/x86/Intrinsics.h
+++ b/architecture/x86/Intrinsics.h
@@ -58,6 +58,9 @@
 #define intrin_prefetch_l2(mem) _mm_prefetch((const char *) (mem), _MM_HINT_T1)
 #define intrin_prefetch_l3(mem) _mm_prefetch((const char *) (mem), _MM_HINT_T2)
 
+// a * b + c
+#define intrin_fmadd(a, b, c) _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a), _mm_set_ss(b), _mm_set_ss(c)))
+
 inline
 uint64 intrin_timestamp_counter() noexcept {
     _mm_mfence();
diff --git a/architecture/x86/simd/SIMD_F32.h b/architecture/x86/simd/SIMD_F32.h
old mode 100644
new mode 100755
index edf374d..2051b8c
--- a/architecture/x86/simd/SIMD_F32.h
+++ b/architecture/x86/simd/SIMD_F32.h
@@ -13,989 +13,18 @@
 #include <xmmintrin.h>
 
 #include "../../../stdlib/Types.h"
-#include "SIMD_SVML.h"
 
-struct f32_4 {
-    union {
-        #if ARM
-            svfloat32_t s;
-        #else
-            __m128 s;
-        #endif
+#ifdef MACRO_CPU_FEATURE_SSE42
+    #include "SIMD_F32_SSE.h"
+#endif
 
-        f32 v[4];
-    };
-};
+#ifdef MACRO_CPU_FEATURE_AVX2
+    #include "SIMD_F32_AVX2.h"
+#endif
 
-struct f32_8 {
-    union {
-        #if ARM
-            svfloat32_t s;
-        #else
-            __m256 s;
-        #endif
-
-        f32 v[8];
-    };
-};
-
-struct f32_16 {
-    union {
-        #if ARM
-            svfloat32_t s;
-        #else
-            __m512 s;
-        #endif
-
-        f32 v[16];
-    };
-};
-
-inline f32_4 load_f32_4(const f32* mem)
-{
-    f32_4 simd;
-    simd.s = _mm_load_ps(mem);
-
-    return simd;
-}
-
-inline f32_4 init_f32_4(const f32* mem)
-{
-    f32_4 simd;
-    simd.s = _mm_set_ps(mem[0], mem[1], mem[2], mem[3]);
-
-    return simd;
-}
-
-inline void unload_f32_4(f32_4 a, f32 *array) { _mm_store_ps(array, a.s); }
-
-inline f32_8 load_f32_8(const f32* mem)
-{
-    f32_8 simd;
-    simd.s = _mm256_load_ps(mem);
-
-    return simd;
-}
-
-inline f32_8 init_f32_8(const f32* mem)
-{
-    f32_8 simd;
-    simd.s = _mm256_set_ps(mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7]);
-
-    return simd;
-}
-
-inline void unload_f32_8(f32_8 a, f32 *array) { _mm256_store_ps(array, a.s); }
-
-inline f32_16 load_f32_16(const f32* mem)
-{
-    f32_16 simd;
-    simd.s = _mm512_load_ps(mem);
-
-    return simd;
-}
-
-inline f32_16 init_f32_16(const f32* mem)
-{
-    f32_16 simd;
-    simd.s = _mm512_set_ps(mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7], mem[8], mem[9], mem[10],
-                            mem[11], mem[12], mem[13], mem[14], mem[15]);
-
-    return simd;
-}
-
-inline void unload_f32_16(f32_16 a, f32 *array) { _mm512_store_ps(array, a.s); }
-
-inline f32_4 init_zero_f32_4()
-{
-    f32_4 simd;
-    simd.s = _mm_setzero_ps();
-
-    return simd;
-}
-
-inline f32_8 init_zero_f32_8()
-{
-    f32_8 simd;
-    simd.s = _mm256_setzero_ps();
-
-    return simd;
-}
-
-inline f32_16 init_zero_f32_16()
-{
-    f32_16 simd;
-    simd.s = _mm512_setzero_ps();
-
-    return simd;
-}
-
-inline f32_4 init_value_f32_4(f32 value)
-{
-    f32_4 simd;
-    simd.s = _mm_set1_ps(value);
-
-    return simd;
-}
-
-inline f32_8 init_value_f32_8(f32 value)
-{
-    f32_8 simd;
-    simd.s = _mm256_set1_ps(value);
-
-    return simd;
-}
-
-inline f32_16 init_value_f32_16(f32 value)
-{
-    f32_16 simd;
-    simd.s = _mm512_set1_ps(value);
-
-    return simd;
-}
-
-inline f32_4 init_values_f32_4(f32 a, f32 b, f32 c, f32 d)
-{
-    f32_4 simd;
-    simd.s = _mm_set_ps(a, b, c, d);
-
-    return simd;
-}
-
-inline f32_8 init_values_f32_8(
-    f32 a, f32 b, f32 c, f32 d,
-    f32 e, f32 f, f32 g, f32 h
-)
-{
-    f32_8 simd;
-    simd.s = _mm256_set_ps(a, b, c, d, e, f, g, h);
-
-    return simd;
-}
-
-inline f32_16 init_values_f32_16(
-    f32 a, f32 b, f32 c, f32 d,
-    f32 e, f32 f, f32 g, f32 h,
-    f32 i, f32 j, f32 k, f32 l,
-    f32 m, f32 n, f32 o, f32 p
-)
-{
-    f32_16 simd;
-    simd.s = _mm512_set_ps(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
-
-    return simd;
-}
-
-inline f32_4 operator+(f32_4 a, f32_4 b)
-{
-    f32_4 simd;
-    simd.s = _mm_add_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_8 operator+(f32_8 a, f32_8 b)
-{
-    f32_8 simd;
-    simd.s = _mm256_add_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_16 operator+(f32_16 a, f32_16 b)
-{
-    f32_16 simd;
-    simd.s = _mm512_add_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_4 operator-(f32_4 a, f32_4 b)
-{
-    f32_4 simd;
-    simd.s = _mm_sub_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_4 operator-(f32_4 a) { return init_zero_f32_4() - a; }
-
-inline f32_8 operator-(f32_8 a, f32_8 b)
-{
-    f32_8 simd;
-    simd.s = _mm256_sub_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_8 operator-(f32_8 a) { return init_zero_f32_8() - a; }
-
-inline f32_16 operator-(f32_16 a, f32_16 b)
-{
-    f32_16 simd;
-    simd.s = _mm512_sub_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_16 operator-(f32_16 a) { return init_zero_f32_16() - a; }
-
-inline f32_4 operator*(f32_4 a, f32_4 b)
-{
-    f32_4 simd;
-    simd.s = _mm_mul_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_8 operator*(f32_8 a, f32_8 b)
-{
-    f32_8 simd;
-    simd.s = _mm256_mul_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_16 operator*(f32_16 a, f32_16 b)
-{
-    f32_16 simd;
-    simd.s = _mm512_mul_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_4 operator/(f32_4 a, f32_4 b)
-{
-    f32_4 simd;
-    simd.s = _mm_div_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_8 operator/(f32_8 a, f32_8 b)
-{
-    f32_8 simd;
-    simd.s = _mm256_div_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_16 operator/(f32_16 a, f32_16 b)
-{
-    f32_16 simd;
-    simd.s = _mm512_div_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_4 operator^(f32_4 a, f32_4 b)
-{
-    f32_4 simd;
-    simd.s = _mm_xor_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_8 operator^(f32_8 a, f32_8 b)
-{
-    f32_8 simd;
-    simd.s = _mm256_xor_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_16 operator^(f32_16 a, f32_16 b)
-{
-    f32_16 simd;
-    simd.s = _mm512_xor_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_4 &operator-=(f32_4 &a, f32_4 b)
-{
-    a = a - b;
-
-    return a;
-}
-
-inline f32_8 &operator-=(f32_8 &a, f32_8 b)
-{
-    a = a - b;
-
-    return a;
-}
-
-inline f32_16 &operator-=(f32_16 &a, f32_16 b)
-{
-    a = a - b;
-
-    return a;
-}
-
-inline f32_4 &operator+=(f32_4 &a, f32_4 b)
-{
-    a = a + b;
-
-    return a;
-}
-
-inline f32_8 &operator+=(f32_8 &a, f32_8 b)
-{
-    a = a + b;
-
-    return a;
-}
-
-inline f32_16 &operator+=(f32_16 &a, f32_16 b)
-{
-    a = a + b;
-
-    return a;
-}
-
-inline f32_4 &operator*=(f32_4 &a, f32_4 b)
-{
-    a = a * b;
-
-    return a;
-}
-
-inline f32_8 &operator*=(f32_8 &a, f32_8 b)
-{
-    a = a * b;
-
-    return a;
-}
-
-inline f32_16 &operator*=(f32_16 &a, f32_16 b)
-{
-    a = a * b;
-
-    return a;
-}
-
-inline f32_4 &operator/=(f32_4 &a, f32_4 b)
-{
-    a = a / b;
-
-    return a;
-}
-
-inline f32_8 &operator/=(f32_8 &a, f32_8 b)
-{
-    a = a / b;
-
-    return a;
-}
-
-inline f32_16 &operator/=(f32_16 &a, f32_16 b)
-{
-    a = a / b;
-
-    return a;
-}
-
-inline f32_4 &operator^=(f32_4 &a, f32_4 b)
-{
-    a = a ^ b;
-
-    return a;
-}
-
-inline f32_8 &operator^=(f32_8 &a, f32_8 b)
-{
-    a = a ^ b;
-
-    return a;
-}
-
-inline f32_16 &operator^=(f32_16 &a, f32_16 b)
-{
-    a = a ^ b;
-
-    return a;
-}
-
-inline f32_4 operator<(f32_4 a, f32_4 b)
-{
-    f32_4 simd;
-    simd.s = _mm_cmplt_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_8 operator<(f32_8 a, f32_8 b)
-{
-    f32_8 simd;
-    simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_LT_OQ);
-
-    return simd;
-}
-
-inline f32_16 operator<(f32_16 a, f32_16 b)
-{
-    f32_16 simd;
-    simd.s = _mm512_mask_blend_ps(_mm512_cmplt_ps_mask(a.s, b.s), a.s, b.s);
-
-    return simd;
-}
-
-inline f32_4 operator<=(f32_4 a, f32_4 b)
-{
-    f32_4 simd;
-    simd.s = _mm_cmple_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_8 operator<=(f32_8 a, f32_8 b)
-{
-    f32_8 simd;
-    simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_LE_OQ);
-
-    return simd;
-}
-
-inline f32_16 operator<=(f32_16 a, f32_16 b)
-{
-    f32_16 simd;
-    simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_LE_OQ), a.s, b.s);
-
-    return simd;
-}
-
-inline f32_4 operator>(f32_4 a, f32_4 b)
-{
-    f32_4 simd;
-    simd.s = _mm_cmpgt_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_8 operator>(f32_8 a, f32_8 b)
-{
-    f32_8 simd;
-    simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_GT_OQ);
-
-    return simd;
-}
-
-inline f32_16 operator>(f32_16 a, f32_16 b)
-{
-    f32_16 simd;
-    simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_GT_OQ), a.s, b.s);
-
-    return simd;
-}
-
-inline f32_4 operator>=(f32_4 a, f32_4 b)
-{
-    f32_4 simd;
-    simd.s = _mm_cmpge_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_8 operator>=(f32_8 a, f32_8 b)
-{
-    f32_8 simd;
-    simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_GE_OQ);
-
-    return simd;
-}
-
-inline f32_16 operator>=(f32_16 a, f32_16 b)
-{
-    f32_16 simd;
-    simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_GE_OQ), a.s, b.s);
-
-    return simd;
-}
-
-inline f32_4 operator==(f32_4 a, f32_4 b)
-{
-    f32_4 simd;
-    simd.s = _mm_cmpeq_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_8 operator==(f32_8 a, f32_8 b)
-{
-    f32_8 simd;
-    simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_EQ_OQ);
-
-    return simd;
-}
-
-inline f32_16 operator==(f32_16 a, f32_16 b)
-{
-    f32_16 simd;
-    simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_EQ_OQ), a.s, b.s);
-
-    return simd;
-}
-
-inline f32_4 operator!=(f32_4 a, f32_4 b)
-{
-    f32_4 simd;
-    simd.s = _mm_cmpneq_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_8 operator!=(f32_8 a, f32_8 b)
-{
-    f32_8 simd;
-    simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_NEQ_OQ);
-
-    return simd;
-}
-
-inline f32_16 operator!=(f32_16 a, f32_16 b)
-{
-    f32_16 simd;
-    simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_NEQ_OQ), a.s, b.s);
-
-    return simd;
-}
-
-inline f32_4 operator&(f32_4 a, f32_4 b)
-{
-    f32_4 simd;
-    simd.s = _mm_and_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_8 operator&(f32_8 a, f32_8 b)
-{
-    f32_8 simd;
-    simd.s = _mm256_and_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_16 operator&(f32_16 a, f32_16 b)
-{
-    f32_16 simd;
-    simd.s = _mm512_and_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_4 operator|(f32_4 a, f32_4 b)
-{
-    f32_4 simd;
-    simd.s = _mm_or_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_8 operator|(f32_8 a, f32_8 b)
-{
-    f32_8 simd;
-    simd.s = _mm256_or_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_16 operator|(f32_16 a, f32_16 b)
-{
-    f32_16 simd;
-    simd.s = _mm512_or_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_4 &operator&=(f32_4 &a, f32_4 b)
-{
-    a = a & b;
-
-    return a;
-}
-
-inline f32_8 &operator&=(f32_8 &a, f32_8 b)
-{
-    a = a & b;
-
-    return a;
-}
-
-inline f32_16 &operator&=(f32_16 &a, f32_16 b)
-{
-    a = a & b;
-
-    return a;
-}
-
-inline f32_4 &operator|=(f32_4 &a, f32_4 b)
-{
-    a = a | b;
-
-    return a;
-}
-
-inline f32_8 &operator|=(f32_8 &a, f32_8 b)
-{
-    a = a | b;
-
-    return a;
-}
-
-inline f32_16 &operator|=(f32_16 &a, f32_16 b)
-{
-    a = a | b;
-
-    return a;
-}
-
-inline f32_4 abs(f32_4 a)
-{
-    uint32 unsigned_mask = (uint32) (1U << 31);
-    __m128 mask                = _mm_set1_ps(*(f32 *) &unsigned_mask);
-
-    f32_4 simd;
-    simd.s = _mm_and_ps(a.s, mask);
-
-    return simd;
-}
-
-inline f32_8 abs(f32_8 a)
-{
-    uint32 unsigned_mask = (uint32) (1U << 31);
-    __m256 mask                = _mm256_set1_ps(*(f32 *) &unsigned_mask);
-
-    f32_8 simd;
-    simd.s = _mm256_and_ps(a.s, mask);
-
-    return simd;
-}
-
-inline f32_16 abs(f32_16 a)
-{
-    f32_16 simd;
-    simd.s = _mm512_abs_ps(a.s);
-
-    return simd;
-}
-
-inline f32_4 simd_min(f32_4 a, f32_4 b)
-{
-    f32_4 simd;
-    simd.s = _mm_min_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_8 simd_min(f32_8 a, f32_8 b)
-{
-    f32_8 simd;
-    simd.s = _mm256_min_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_16 simd_min(f32_16 a, f32_16 b)
-{
-    f32_16 simd;
-    simd.s = _mm512_min_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_4 simd_max(f32_4 a, f32_4 b)
-{
-    f32_4 simd;
-    simd.s = _mm_max_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_8 simd_max(f32_8 a, f32_8 b)
-{
-    f32_8 simd;
-    simd.s = _mm256_max_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_16 simd_max(f32_16 a, f32_16 b)
-{
-    f32_16 simd;
-    simd.s = _mm512_max_ps(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_4 sign(f32_4 a)
-{
-    uint32 umask = (uint32) (1U << 31);
-    __m128 mask        = _mm_set1_ps(*(f32 *) &umask);
-
-    f32_4 signBit;
-    signBit.s = _mm_and_ps(a.s, mask);
-
-    f32_4 b;
-    b.s = _mm_set1_ps(1.0f);
-
-    f32_4 simd = b | signBit;
-
-    return simd;
-}
-
-inline f32_8 sign(f32_8 a)
-{
-    uint32 umask = (uint32) (1U << 31);
-    __m256 mask        = _mm256_set1_ps(*(f32 *) &umask);
-
-    f32_8 signBit;
-    signBit.s = _mm256_and_ps(a.s, mask);
-
-    f32_8 b;
-    b.s = _mm256_set1_ps(1.0f);
-
-    f32_8 simd = b | signBit;
-
-    return simd;
-}
-
-inline f32_16 sign(f32_16 a)
-{
-    uint32 umask = (uint32) (1U << 31);
-    __m512 mask        = _mm512_set1_ps(*(f32 *) &umask);
-
-    f32_16 signBit;
-    signBit.s = _mm512_and_ps(a.s, mask);
-
-    f32_16 b;
-    b.s = _mm512_set1_ps(1.0f);
-
-    f32_16 simd = b | signBit;
-
-    return simd;
-}
-
-inline f32_4 floor(f32_4 a)
-{
-    f32_4 simd;
-    simd.s = _mm_floor_ps(a.s);
-
-    return simd;
-}
-
-inline f32_8 floor(f32_8 a)
-{
-    f32_8 simd;
-    simd.s = _mm256_floor_ps(a.s);
-
-    return simd;
-}
-
-inline f32_16 floor(f32_16 a)
-{
-    f32_16 simd;
-    simd.s = _mm512_floor_ps(a.s);
-
-    return simd;
-}
-
-inline f32_4 ceil(f32_4 a)
-{
-    f32_4 simd;
-    simd.s = _mm_ceil_ps(a.s);
-
-    return simd;
-}
-
-inline f32_8 ceil(f32_8 a)
-{
-    f32_8 simd;
-    simd.s = _mm256_ceil_ps(a.s);
-
-    return simd;
-}
-
-inline f32_16 ceil(f32_16 a)
-{
-    f32_16 simd;
-    simd.s = _mm512_ceil_ps(a.s);
-
-    return simd;
-}
-
-inline f32_4 sqrt(f32_4 a)
-{
-    f32_4 simd;
-    simd.s = _mm_sqrt_ps(a.s);
-
-    return simd;
-}
-
-inline f32_8 sqrt(f32_8 a)
-{
-    f32_8 simd;
-    simd.s = _mm256_sqrt_ps(a.s);
-
-    return simd;
-}
-
-inline f32_16 sqrt(f32_16 a)
-{
-    f32_16 simd;
-    simd.s = _mm512_sqrt_ps(a.s);
-
-    return simd;
-}
-
-inline f32_4 sqrt_inv_approx(f32_4 a)
-{
-    f32_4 simd;
-    simd.s = _mm_rsqrt_ps(a.s);
-
-    return simd;
-}
-
-inline f32_8 sqrt_inv_approx(f32_8 a)
-{
-    f32_8 simd;
-    simd.s = _mm256_rsqrt_ps(a.s);
-
-    return simd;
-}
-
-inline f32_16 sqrt_inv_approx(f32_16 a)
-{
-    f32_16 simd;
-    simd.s = _mm512_rsqrt14_ps(a.s);
-
-    return simd;
-}
-
-inline f32_4 one_over_approx(f32_4 a)
-{
-    f32_4 simd;
-    simd.s = _mm_rcp_ps(a.s);
-
-    return simd;
-}
-
-inline f32_8 one_over_approx(f32_8 a)
-{
-    f32_8 simd;
-    simd.s = _mm256_rcp_ps(a.s);
-
-    return simd;
-}
-
-inline f32_16 one_over_approx(f32_16 a)
-{
-    f32_16 simd;
-    simd.s = _mm512_rcp14_ps(a.s);
-
-    return simd;
-}
-
-inline f32_4 clamp(f32_4 min_value, f32_4 a, f32_4 max_value)
-{
-    return simd_min(simd_max(a, min_value), max_value);
-}
-
-inline f32_8 clamp(f32_8 min_value, f32_8 a, f32_8 max_value)
-{
-    return simd_min(simd_max(a, min_value), max_value);
-}
-
-inline f32_16 clamp(f32_16 min_value, f32_16 a, f32_16 max_value)
-{
-    return simd_min(simd_max(a, min_value), max_value);
-}
-
-inline int32 which_true(f32_4 a)
-{
-    int32 which_true = _mm_movemask_ps(a.s);
-
-    return which_true;
-}
-
-inline int32 which_true(f32_8 a)
-{
-    int32 which_true = _mm256_movemask_ps(a.s);
-
-    return which_true;
-}
-
-inline int32 which_true(f32_16 a)
-{
-    int32 which_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s));
-
-    return which_true;
-}
-
-inline bool any_true(f32_4 a)
-{
-    bool is_any_true = _mm_movemask_ps(a.s) > 0;
-
-    return is_any_true;
-}
-
-inline bool any_true(f32_8 a)
-{
-    bool is_any_true = _mm256_movemask_ps(a.s) > 0;
-
-    return is_any_true;
-}
-
-inline bool any_true(f32_16 a)
-{
-    bool is_any_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) > 0;
-
-    return is_any_true;
-}
-
-inline bool all_true(f32_4 a)
-{
-    bool is_true = _mm_movemask_ps(a.s) == 15;
-
-    return is_true;
-}
-
-inline bool all_true(f32_8 a)
-{
-    bool is_true = _mm256_movemask_ps(a.s) == 255;
-
-    return is_true;
-}
-
-inline bool all_true(f32_16 a)
-{
-    bool is_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 65535;
-
-    return is_true;
-}
-
-inline bool all_false(f32_4 a)
-{
-    bool is_false = _mm_movemask_ps(a.s) == 0;
-
-    return is_false;
-}
-
-inline bool all_false(f32_8 a)
-{
-    bool is_false = _mm256_movemask_ps(a.s) == 0;
-
-    return is_false;
-}
-
-inline bool all_false(f32_16 a)
-{
-    // @todo This can be optimized (requires also changes in the comparison functions return)
-    bool is_false = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 0;
-
-    return is_false;
-}
+#ifdef MACRO_CPU_FEATURE_AVX512
+    #include "SIMD_F32_AVX512.h"
+#endif
 
 // @todo from down here we can optimize some of the code by NOT using the wrappers
 //      the code is self contained and we could use te intrinsic functions directly
@@ -1004,53 +33,73 @@ inline
 void simd_mult(const f32* a, const f32* b, f32* result, int32 size, int32 steps)
 {
     int32 i = 0;
+    steps = intrin_validate_steps((const byte*) a, steps);
+    steps = intrin_validate_steps((const byte*) b, steps);
+    steps = intrin_validate_steps((const byte*) result, steps);
 
-    if (steps == 16) {
-        __m512 a_16;
-        __m512 b_16;
-        __m512 result_16;
+    #ifdef MACRO_CPU_FEATURE_AVX512
+        if (steps >= 16) {
+            steps = 16;
+            __m512 a_16;
+            __m512 b_16;
+            __m512 result_16;
 
-        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_load_ps(a);
-            b_16 = _mm512_load_ps(b);
-            result_16 = _mm512_mul_ps(a_16, b_16);
-            _mm512_store_ps(result, result_16);
+            for (; i <= size - steps; i += steps) {
+                a_16 = _mm512_load_ps(a);
+                b_16 = _mm512_load_ps(b);
+                result_16 = _mm512_mul_ps(a_16, b_16);
+                _mm512_store_ps(result, result_16);
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    } else if (steps == 8) {
-        __m256 a_8;
-        __m256 b_8;
-        __m256 result_8;
+                a += steps;
+                b += steps;
+                result += steps;
+            }
 
-        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_load_ps(a);
-            b_8 = _mm256_load_ps(b);
-            result_8 = _mm256_mul_ps(a_8, b_8);
-            _mm256_store_ps(result, result_8);
+            steps = 1;
+        }
+    #endif
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    } else if (steps == 4) {
-        __m128 a_4;
-        __m128 b_4;
-        __m128 result_4;
+    #ifdef MACRO_CPU_FEATURE_AVX2
+        if (steps >= 8) {
+            steps = 8;
+            __m256 a_8;
+            __m256 b_8;
+            __m256 result_8;
 
-        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_load_ps(a);
-            b_4 = _mm_load_ps(b);
-            result_4 = _mm_mul_ps(a_4, b_4);
-            _mm_store_ps(result, result_4);
+            for (; i <= size - steps; i += steps) {
+                a_8 = _mm256_load_ps(a);
+                b_8 = _mm256_load_ps(b);
+                result_8 = _mm256_mul_ps(a_8, b_8);
+                _mm256_store_ps(result, result_8);
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    }
+                a += steps;
+                b += steps;
+                result += steps;
+            }
+
+            steps = 1;
+        }
+    #endif
+
+    #ifdef MACRO_CPU_FEATURE_SSE42
+        if (steps >= 4) {
+            steps = 4;
+            __m128 a_4;
+            __m128 b_4;
+            __m128 result_4;
+
+            for (; i <= size - steps; i += steps) {
+                a_4 = _mm_load_ps(a);
+                b_4 = _mm_load_ps(b);
+                result_4 = _mm_mul_ps(a_4, b_4);
+                _mm_store_ps(result, result_4);
+
+                a += steps;
+                b += steps;
+                result += steps;
+            }
+        }
+    #endif
 
     for (; i < size; ++i) {
         *result = *a * *b;
@@ -1065,21 +114,30 @@ inline
 void simd_mult(const f32* a, f32 b, f32* result, int32 size, int32 steps)
 {
     int32 i = 0;
+    steps = intrin_validate_steps((const byte*) a, steps);
+    steps = intrin_validate_steps((const byte*) result, steps);
 
-    if (steps == 16) {
-        __m512 a_16;
-        __m512 b_16 = _mm512_set1_ps(b);
-        __m512 result_16;
+    #ifdef MACRO_CPU_FEATURE_AVX512
+        if (steps >= 16) {
+            steps = 16;
+            __m512 a_16;
+            __m512 b_16 = _mm512_set1_ps(b);
+            __m512 result_16;
 
-        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_load_ps(a);
-            result_16 = _mm512_mul_ps(a_16, b_16);
-            _mm512_store_ps(result, result_16);
+            for (; i <= size - steps; i += steps) {
+                a_16 = _mm512_load_ps(a);
+                result_16 = _mm512_mul_ps(a_16, b_16);
+                _mm512_store_ps(result, result_16);
 
-            a += steps;
-            result += steps;
-       }
-    } else if (steps == 8) {
+                a += steps;
+                result += steps;
+            }
+        }
+    #endif
+
+    #ifdef MACRO_CPU_FEATURE_AVX2
+    if (steps >= 8) {
+        steps = 8;
         __m256 a_8;
         __m256 b_8 = _mm256_set1_ps(b);
         __m256 result_8;
@@ -1091,8 +149,13 @@ void simd_mult(const f32* a, f32 b, f32* result, int32 size, int32 steps)
 
             a += steps;
             result += steps;
-       }
-    } else if (steps == 4) {
+        }
+    }
+    #endif
+
+    #ifdef MACRO_CPU_FEATURE_SSE42
+    if (steps >= 4) {
+        steps = 4;
         __m128 a_4;
         __m128 b_4 = _mm_set1_ps(b);
         __m128 result_4;
@@ -1104,8 +167,9 @@ void simd_mult(const f32* a, f32 b, f32* result, int32 size, int32 steps)
 
             a += steps;
             result += steps;
-       }
+        }
     }
+    #endif
 
     for (; i < size; ++i) {
         *result = *a * b;
@@ -1119,48 +183,64 @@ inline
 void simd_div(const f32* a, f32 b, f32* result, int32 size, int32 steps)
 {
     int32 i = 0;
+    steps = intrin_validate_steps((const byte*) a, steps);
+    steps = intrin_validate_steps((const byte*) result, steps);
 
-    if (steps == 16) {
-        __m512 a_16;
-        __m512 b_16 = _mm512_set1_ps(b);
-        __m512 result_16;
+    #ifdef MACRO_CPU_FEATURE_AVX512
+        if (steps >= 16) {
+            steps = 16;
+            __m512 a_16;
+            __m512 b_16 = _mm512_set1_ps(b);
+            __m512 result_16;
 
-        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_load_ps(a);
-            result_16 = _mm512_div_ps(a_16, b_16);
-            _mm512_store_ps(result, result_16);
+            for (; i <= size - steps; i += steps) {
+                a_16 = _mm512_load_ps(a);
+                result_16 = _mm512_div_ps(a_16, b_16);
+                _mm512_store_ps(result, result_16);
 
-            a += steps;
-            result += steps;
-       }
-    } else if (steps == 8) {
-        __m256 a_8;
-        __m256 b_8 = _mm256_set1_ps(b);
-        __m256 result_8;
+                a += steps;
+                result += steps;
+            }
+        }
+    #endif
 
-        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_load_ps(a);
-            result_8 = _mm256_div_ps(a_8, b_8);
-            _mm256_store_ps(result, result_8);
+    #ifdef MACRO_CPU_FEATURE_AVX2
+        if (steps >= 8) {
+            steps = 8;
+            __m256 a_8;
+            __m256 b_8 = _mm256_set1_ps(b);
+            __m256 result_8;
 
-            a += steps;
-            result += steps;
-       }
-    } else if (steps == 4) {
-        __m128 a_4;
-        __m128 b_4 = _mm_set1_ps(b);
-        __m128 result_4;
+            for (; i <= size - steps; i += steps) {
+                a_8 = _mm256_load_ps(a);
+                result_8 = _mm256_div_ps(a_8, b_8);
+                _mm256_store_ps(result, result_8);
 
-        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_load_ps(a);
-            result_4 = _mm_div_ps(a_4, b_4);
-            _mm_store_ps(result, result_4);
+                a += steps;
+                result += steps;
+            }
+        }
+    #endif
 
-            a += steps;
-            result += steps;
-       }
-    }
+    #ifdef MACRO_CPU_FEATURE_SSE42
+        if (steps >= 4) {
+            steps = 4;
+            __m128 a_4;
+            __m128 b_4 = _mm_set1_ps(b);
+            __m128 result_4;
 
+            for (; i <= size - steps; i += steps) {
+                a_4 = _mm_load_ps(a);
+                result_4 = _mm_div_ps(a_4, b_4);
+                _mm_store_ps(result, result_4);
+
+                a += steps;
+                result += steps;
+            }
+        }
+    #endif
+
+    // Scalar fallback
     for (; i < size; ++i) {
         *result = *a / b;
 
@@ -1169,159 +249,4 @@ void simd_div(const f32* a, f32 b, f32* result, int32 size, int32 steps)
     }
 }
 
-inline
-void simd_div(const f32* a, f32 b, __m256* result, int32 size)
-{
-    int32 i = 0;
-    int32 j = 0;
-
-    // @todo this his how all the functions should be implemented that take in baseic types and output basic types
-    __m256 a_8;
-    __m256 b_8 = _mm256_set1_ps(b);
-    __m256 result_8;
-
-    for (; i <= size - 8; i += 8) {
-        a_8 = _mm256_load_ps(a);
-        result_8 = _mm256_div_ps(a_8, b_8);
-        result[j] = result_8;
-
-        a += 8;
-        ++j;
-    }
-
-    int32 diff = size - i;
-    alignas(32) f32 temp[8];
-
-    for (int32 k = 0; k < diff; k++) {
-        temp[k] = a[i + k] / b;
-    }
-
-    result[j] = _mm256_load_ps(temp);
-}
-
-inline
-void simd_cmp_le(const __m256* a, f32 b, bool* result, int32 size)
-{
-    __m256 b_8 = _mm256_set1_ps(b);
-
-    for (int32 i = 0; i < size; ++i) {
-        int32 mask = _mm256_movemask_ps(_mm256_cmp_ps(a[i], b_8, _CMP_LE_OQ));
-
-        for (int32 j = 0; j < 8; ++j) {
-            result[i * 8 + j] = (mask & (1 << j)) != 0;
-        }
-    }
-}
-
-// @todo But a guard or warning on the trigonometric functions since they are only implemented for msvc/intel compiler
-inline
-f32_4 simd_sin(f32_4 a)
-{
-    f32_4 simd;
-    simd.s = _mm_sin_ps(a.s);
-
-    return simd;
-}
-
-inline
-f32_8 simd_sin(f32_8 a)
-{
-    f32_8 simd;
-    simd.s = _mm256_sin_ps(a.s);
-
-    return simd;
-}
-
-inline
-f32_16 simd_sin(f32_16 a)
-{
-    f32_16 simd;
-    simd.s = _mm512_sin_ps(a.s);
-
-    return simd;
-}
-
-inline
-f32_4 simd_cos(f32_4 a)
-{
-    f32_4 simd;
-    simd.s = _mm_cos_ps(a.s);
-
-    return simd;
-}
-
-inline
-f32_8 simd_cos(f32_8 a)
-{
-    f32_8 simd;
-    simd.s = _mm256_cos_ps(a.s);
-
-    return simd;
-}
-
-inline
-f32_16 simd_cos(f32_16 a)
-{
-    f32_16 simd;
-    simd.s = _mm512_cos_ps(a.s);
-
-    return simd;
-}
-
-inline
-f32_4 simd_asin(f32_4 a)
-{
-    f32_4 simd;
-    simd.s = _mm_asin_ps(a.s);
-
-    return simd;
-}
-
-inline
-f32_8 simd_asin(f32_8 a)
-{
-    f32_8 simd;
-    simd.s = _mm256_asin_ps(a.s);
-
-    return simd;
-}
-
-inline
-f32_16 simd_asin(f32_16 a)
-{
-    f32_16 simd;
-    simd.s = _mm512_asin_ps(a.s);
-
-    return simd;
-}
-
-inline
-f32_4 simd_acos(f32_4 a)
-{
-    f32_4 simd;
-    simd.s = _mm_acos_ps(a.s);
-
-    return simd;
-}
-
-inline
-f32_8 simd_acos(f32_8 a)
-{
-    f32_8 simd;
-    simd.s = _mm256_acos_ps(a.s);
-
-    return simd;
-}
-
-inline
-f32_16 simd_acos(f32_16 a)
-{
-    f32_16 simd;
-    simd.s = _mm512_acos_ps(a.s);
-
-    return simd;
-}
-
-// @todo implement more trigonometry function
-
 #endif
diff --git a/architecture/x86/simd/SIMD_F32_AVX2.h b/architecture/x86/simd/SIMD_F32_AVX2.h
new file mode 100644
index 0000000..896dedf
--- /dev/null
+++ b/architecture/x86/simd/SIMD_F32_AVX2.h
@@ -0,0 +1,426 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_STDLIB_SIMD_F32_AVX2_H // guard name has to be unique per width-specific header
+#define COMS_STDLIB_SIMD_F32_AVX2_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+#include "../../../stdlib/Types.h"
+#include "SIMD_SVML_AVX2.h"
+
+struct f32_8 {
+    union {
+        #if ARM
+            svfloat32_t s;
+        #else
+            __m256 s;
+        #endif
+
+        f32 v[8];
+    };
+};
+
+inline f32_8 load_f32_8(const f32* mem)
+{
+    f32_8 simd;
+    simd.s = _mm256_load_ps(mem);
+
+    return simd;
+}
+
+inline f32_8 init_f32_8(const f32* mem)
+{
+    f32_8 simd;
+    simd.s = _mm256_set_ps(mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7]);
+    // NOTE(review): _mm256_set_ps lists lanes highest-first, so mem[0] lands in v[7] (reverse of load_f32_8) - confirm intended
+    return simd;
+}
+
+inline void unload_f32_8(f32_8 a, f32 *array) { _mm256_store_ps(array, a.s); }
+
+inline f32_8 init_zero_f32_8()
+{
+    f32_8 simd;
+    simd.s = _mm256_setzero_ps();
+
+    return simd;
+}
+
+inline f32_8 init_value_f32_8(f32 value)
+{
+    f32_8 simd;
+    simd.s = _mm256_set1_ps(value);
+
+    return simd;
+}
+
+inline f32_8 init_values_f32_8(
+    f32 a, f32 b, f32 c, f32 d,
+    f32 e, f32 f, f32 g, f32 h
+)
+{
+    f32_8 simd;
+    simd.s = _mm256_set_ps(a, b, c, d, e, f, g, h);
+
+    return simd;
+}
+
+inline f32_8 operator+(f32_8 a, f32_8 b)
+{
+    f32_8 simd;
+    simd.s = _mm256_add_ps(a.s, b.s);
+
+    return simd;
+}
+
+inline f32_8 operator-(f32_8 a, f32_8 b)
+{
+    f32_8 simd;
+    simd.s = _mm256_sub_ps(a.s, b.s);
+
+    return simd;
+}
+
+inline f32_8 operator-(f32_8 a) { return init_zero_f32_8() - a; }
+
+inline f32_8 operator*(f32_8 a, f32_8 b)
+{
+    f32_8 simd;
+    simd.s = _mm256_mul_ps(a.s, b.s);
+
+    return simd;
+}
+
+inline f32_8 operator/(f32_8 a, f32_8 b)
+{
+    f32_8 simd;
+    simd.s = _mm256_div_ps(a.s, b.s);
+
+    return simd;
+}
+
+inline f32_8 operator^(f32_8 a, f32_8 b)
+{
+    f32_8 simd;
+    simd.s = _mm256_xor_ps(a.s, b.s);
+
+    return simd;
+}
+
+inline f32_8 &operator-=(f32_8 &a, f32_8 b)
+{
+    a = a - b;
+
+    return a;
+}
+
+inline f32_8 &operator+=(f32_8 &a, f32_8 b)
+{
+    a = a + b;
+
+    return a;
+}
+
+inline f32_8 &operator*=(f32_8 &a, f32_8 b)
+{
+    a = a * b;
+
+    return a;
+}
+
+inline f32_8 &operator/=(f32_8 &a, f32_8 b)
+{
+    a = a / b;
+
+    return a;
+}
+
+inline f32_8 &operator^=(f32_8 &a, f32_8 b)
+{
+    a = a ^ b;
+
+    return a;
+}
+
+inline f32_8 operator<(f32_8 a, f32_8 b)
+{
+    f32_8 simd;
+    simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_LT_OQ);
+
+    return simd;
+}
+
+inline f32_8 operator<=(f32_8 a, f32_8 b)
+{
+    f32_8 simd;
+    simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_LE_OQ);
+
+    return simd;
+}
+
+inline f32_8 operator>(f32_8 a, f32_8 b)
+{
+    f32_8 simd;
+    simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_GT_OQ);
+
+    return simd;
+}
+
+inline f32_8 operator>=(f32_8 a, f32_8 b)
+{
+    f32_8 simd;
+    simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_GE_OQ);
+
+    return simd;
+}
+
+inline f32_8 operator==(f32_8 a, f32_8 b)
+{
+    f32_8 simd;
+    simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_EQ_OQ);
+
+    return simd;
+}
+
+inline f32_8 operator!=(f32_8 a, f32_8 b)
+{
+    f32_8 simd;
+    simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_NEQ_OQ);
+
+    return simd;
+}
+
+inline f32_8 operator&(f32_8 a, f32_8 b)
+{
+    f32_8 simd;
+    simd.s = _mm256_and_ps(a.s, b.s);
+
+    return simd;
+}
+
+inline f32_8 operator|(f32_8 a, f32_8 b)
+{
+    f32_8 simd;
+    simd.s = _mm256_or_ps(a.s, b.s);
+
+    return simd;
+}
+
+inline f32_8 &operator&=(f32_8 &a, f32_8 b)
+{
+    a = a & b;
+
+    return a;
+}
+
+inline f32_8 &operator|=(f32_8 &a, f32_8 b)
+{
+    a = a | b;
+
+    return a;
+}
+
+// Lane-wise absolute value of the eight packed floats in `a`.
+inline f32_8 abs(f32_8 a)
+{
+    // 0x7FFFFFFF keeps exponent and mantissa and clears only the IEEE-754 sign bit (bit 31).
+    const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
+
+    f32_8 simd;
+    simd.s = _mm256_and_ps(a.s, mask);
+    return simd;
+}
+
+inline f32_8 simd_min(f32_8 a, f32_8 b)
+{
+    f32_8 simd;
+    simd.s = _mm256_min_ps(a.s, b.s);
+
+    return simd;
+}
+
+inline f32_8 simd_max(f32_8 a, f32_8 b)
+{
+    f32_8 simd;
+    simd.s = _mm256_max_ps(a.s, b.s);
+
+    return simd;
+}
+
+inline f32_8 sign(f32_8 a)
+{
+    uint32 umask = (uint32) (1U << 31);
+    __m256 mask        = _mm256_set1_ps(*(f32 *) &umask);
+
+    f32_8 signBit;
+    signBit.s = _mm256_and_ps(a.s, mask);
+
+    f32_8 b;
+    b.s = _mm256_set1_ps(1.0f);
+
+    f32_8 simd = b | signBit;
+
+    return simd;
+}
+
+inline f32_8 floor(f32_8 a)
+{
+    f32_8 simd;
+    simd.s = _mm256_floor_ps(a.s);
+
+    return simd;
+}
+
+inline f32_8 ceil(f32_8 a)
+{
+    f32_8 simd;
+    simd.s = _mm256_ceil_ps(a.s);
+
+    return simd;
+}
+
+inline f32_8 sqrt(f32_8 a)
+{
+    f32_8 simd;
+    simd.s = _mm256_sqrt_ps(a.s);
+
+    return simd;
+}
+
+inline f32_8 sqrt_inv_approx(f32_8 a)
+{
+    f32_8 simd;
+    simd.s = _mm256_rsqrt_ps(a.s);
+
+    return simd;
+}
+
+inline f32_8 one_over_approx(f32_8 a)
+{
+    f32_8 simd;
+    simd.s = _mm256_rcp_ps(a.s);
+
+    return simd;
+}
+
+inline f32_8 clamp(f32_8 min_value, f32_8 a, f32_8 max_value)
+{
+    return simd_min(simd_max(a, min_value), max_value);
+}
+
+inline int32 which_true(f32_8 a)
+{
+    int32 which_true = _mm256_movemask_ps(a.s);
+
+    return which_true;
+}
+
+inline bool any_true(f32_8 a)
+{
+    bool is_any_true = _mm256_movemask_ps(a.s) > 0;
+
+    return is_any_true;
+}
+
+inline bool all_true(f32_8 a)
+{
+    bool is_true = _mm256_movemask_ps(a.s) == 255;
+
+    return is_true;
+}
+
+inline bool all_false(f32_8 a)
+{
+    bool is_false = _mm256_movemask_ps(a.s) == 0;
+
+    return is_false;
+}
+
+inline
+void simd_cmp_le(const __m256* a, f32 b, bool* result, int32 size)
+{
+    __m256 b_8 = _mm256_set1_ps(b);
+
+    for (int32 i = 0; i < size; ++i) {
+        int32 mask = _mm256_movemask_ps(_mm256_cmp_ps(a[i], b_8, _CMP_LE_OQ));
+
+        for (int32 j = 0; j < 8; ++j) {
+            result[i * 8 + j] = (mask & (1 << j)) != 0;
+        }
+    }
+}
+
+inline
+f32_8 simd_sin(f32_8 a)
+{
+    f32_8 simd;
+    simd.s = _mm256_sin_ps(a.s);
+
+    return simd;
+}
+
+inline
+f32_8 simd_cos(f32_8 a)
+{
+    f32_8 simd;
+    simd.s = _mm256_cos_ps(a.s);
+
+    return simd;
+}
+
+inline
+f32_8 simd_asin(f32_8 a)
+{
+    f32_8 simd;
+    simd.s = _mm256_asin_ps(a.s);
+
+    return simd;
+}
+
+inline
+f32_8 simd_acos(f32_8 a)
+{
+    f32_8 simd;
+    simd.s = _mm256_acos_ps(a.s);
+
+    return simd;
+}
+
+// Divides `size` floats from `a` by `b`, storing the quotients as packed vectors in `result`.
+// `a` is read with _mm256_load_ps and therefore has to be 32-byte aligned.
+inline
+void simd_div(const f32* a, f32 b, __m256* result, int32 size)
+{
+    int32 i = 0;
+    int32 j = 0;
+
+    // @todo this is how all the functions should be implemented that take in basic types and output basic types
+    __m256 b_8 = _mm256_set1_ps(b);
+
+    for (; i <= size - 8; i += 8) {
+        result[j] = _mm256_div_ps(_mm256_load_ps(a), b_8);
+        a += 8;
+        ++j;
+    }
+
+    // Tail of fewer than 8 elements; skipped when size is a multiple of 8 so result[j] stays untouched.
+    int32 diff = size - i;
+    if (diff > 0) {
+        // Zero-filled so the unused upper lanes hold 0.0f instead of indeterminate stack data.
+        alignas(32) f32 temp[8] = {};
+
+        for (int32 k = 0; k < diff; k++) {
+            temp[k] = a[k] / b; // `a` was advanced in the loop above, so it already points at the tail
+        }
+        result[j] = _mm256_load_ps(temp);
+    }
+}
+
+#endif
diff --git a/architecture/x86/simd/SIMD_F32_AVX512.h b/architecture/x86/simd/SIMD_F32_AVX512.h
new file mode 100644
index 0000000..ffa69fc
--- /dev/null
+++ b/architecture/x86/simd/SIMD_F32_AVX512.h
@@ -0,0 +1,385 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_STDLIB_SIMD_F32_AVX512_H // guard name has to be unique per width-specific header
+#define COMS_STDLIB_SIMD_F32_AVX512_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+#include "../../../stdlib/Types.h"
+#include "SIMD_SVML_AVX512.h"
+
+struct f32_16 {
+    union {
+        #if ARM
+            svfloat32_t s;
+        #else
+            __m512 s;
+        #endif
+
+        f32 v[16];
+    };
+};
+
+inline f32_16 load_f32_16(const f32* mem)
+{
+    f32_16 simd;
+    simd.s = _mm512_load_ps(mem);
+
+    return simd;
+}
+
+inline f32_16 init_f32_16(const f32* mem)
+{
+    f32_16 simd;
+    simd.s = _mm512_set_ps(mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7], mem[8], mem[9], mem[10],
+                            mem[11], mem[12], mem[13], mem[14], mem[15]);
+    // NOTE(review): _mm512_set_ps lists lanes highest-first, so mem[0] lands in v[15] (reverse of load_f32_16) - confirm intended
+    return simd;
+}
+
+inline void unload_f32_16(f32_16 a, f32 *array) { _mm512_store_ps(array, a.s); }
+
+inline f32_16 init_zero_f32_16()
+{
+    f32_16 simd;
+    simd.s = _mm512_setzero_ps();
+
+    return simd;
+}
+
+inline f32_16 init_value_f32_16(f32 value)
+{
+    f32_16 simd;
+    simd.s = _mm512_set1_ps(value);
+
+    return simd;
+}
+
+inline f32_16 init_values_f32_16(
+    f32 a, f32 b, f32 c, f32 d,
+    f32 e, f32 f, f32 g, f32 h,
+    f32 i, f32 j, f32 k, f32 l,
+    f32 m, f32 n, f32 o, f32 p
+)
+{
+    f32_16 simd;
+    simd.s = _mm512_set_ps(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
+
+    return simd;
+}
+
+inline f32_16 operator+(f32_16 a, f32_16 b)
+{
+    f32_16 simd;
+    simd.s = _mm512_add_ps(a.s, b.s);
+
+    return simd;
+}
+
+inline f32_16 operator-(f32_16 a, f32_16 b)
+{
+    f32_16 simd;
+    simd.s = _mm512_sub_ps(a.s, b.s);
+
+    return simd;
+}
+
+inline f32_16 operator-(f32_16 a) { return init_zero_f32_16() - a; }
+
+inline f32_16 operator*(f32_16 a, f32_16 b)
+{
+    f32_16 simd;
+    simd.s = _mm512_mul_ps(a.s, b.s);
+
+    return simd;
+}
+
+inline f32_16 operator/(f32_16 a, f32_16 b)
+{
+    f32_16 simd;
+    simd.s = _mm512_div_ps(a.s, b.s);
+
+    return simd;
+}
+
+inline f32_16 operator^(f32_16 a, f32_16 b)
+{
+    f32_16 simd;
+    simd.s = _mm512_xor_ps(a.s, b.s);
+
+    return simd;
+}
+
+inline f32_16 &operator-=(f32_16 &a, f32_16 b)
+{
+    a = a - b;
+
+    return a;
+}
+
+inline f32_16 &operator+=(f32_16 &a, f32_16 b)
+{
+    a = a + b;
+
+    return a;
+}
+
+inline f32_16 &operator*=(f32_16 &a, f32_16 b)
+{
+    a = a * b;
+
+    return a;
+}
+
+inline f32_16 &operator/=(f32_16 &a, f32_16 b)
+{
+    a = a / b;
+
+    return a;
+}
+
+inline f32_16 &operator^=(f32_16 &a, f32_16 b)
+{
+    a = a ^ b;
+
+    return a;
+}
+
+// Lane-wise a < b: true lanes become all-ones bits, false lanes zero, so which_true()/any_true() can read the sign bits.
+inline f32_16 operator<(f32_16 a, f32_16 b)
+{
+    f32_16 simd;
+    simd.s = _mm512_castsi512_ps(_mm512_maskz_set1_epi32(_mm512_cmplt_ps_mask(a.s, b.s), -1));
+    return simd;
+}
+
+// Lane-wise a <= b: true lanes become all-ones bits, false lanes zero, so which_true()/any_true() can read the sign bits.
+inline f32_16 operator<=(f32_16 a, f32_16 b)
+{
+    f32_16 simd;
+    simd.s = _mm512_castsi512_ps(_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a.s, b.s, _CMP_LE_OQ), -1));
+    return simd;
+}
+
+// Lane-wise a > b: true lanes become all-ones bits, false lanes zero, so which_true()/any_true() can read the sign bits.
+inline f32_16 operator>(f32_16 a, f32_16 b)
+{
+    f32_16 simd;
+    simd.s = _mm512_castsi512_ps(_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a.s, b.s, _CMP_GT_OQ), -1));
+    return simd;
+}
+
+// Lane-wise a >= b: true lanes become all-ones bits, false lanes zero, so which_true()/any_true() can read the sign bits.
+inline f32_16 operator>=(f32_16 a, f32_16 b)
+{
+    f32_16 simd;
+    simd.s = _mm512_castsi512_ps(_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a.s, b.s, _CMP_GE_OQ), -1));
+    return simd;
+}
+
+// Lane-wise a == b: true lanes become all-ones bits, false lanes zero, so which_true()/any_true() can read the sign bits.
+inline f32_16 operator==(f32_16 a, f32_16 b)
+{
+    f32_16 simd;
+    simd.s = _mm512_castsi512_ps(_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a.s, b.s, _CMP_EQ_OQ), -1));
+    return simd;
+}
+
+// Lane-wise a != b: true lanes become all-ones bits, false lanes zero, so which_true()/any_true() can read the sign bits.
+inline f32_16 operator!=(f32_16 a, f32_16 b)
+{
+    f32_16 simd;
+    simd.s = _mm512_castsi512_ps(_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a.s, b.s, _CMP_NEQ_OQ), -1));
+    return simd;
+}
+
+inline f32_16 operator&(f32_16 a, f32_16 b)
+{
+    f32_16 simd;
+    simd.s = _mm512_and_ps(a.s, b.s);
+
+    return simd;
+}
+
+inline f32_16 operator|(f32_16 a, f32_16 b)
+{
+    f32_16 simd;
+    simd.s = _mm512_or_ps(a.s, b.s);
+
+    return simd;
+}
+
+inline f32_16 &operator&=(f32_16 &a, f32_16 b)
+{
+    a = a & b;
+
+    return a;
+}
+
+inline f32_16 &operator|=(f32_16 &a, f32_16 b)
+{
+    a = a | b;
+
+    return a;
+}
+
+inline f32_16 abs(f32_16 a)
+{
+    f32_16 simd;
+    simd.s = _mm512_abs_ps(a.s);
+
+    return simd;
+}
+
+inline f32_16 simd_min(f32_16 a, f32_16 b)
+{
+    f32_16 simd;
+    simd.s = _mm512_min_ps(a.s, b.s);
+
+    return simd;
+}
+
+inline f32_16 simd_max(f32_16 a, f32_16 b)
+{
+    f32_16 simd;
+    simd.s = _mm512_max_ps(a.s, b.s);
+
+    return simd;
+}
+
+inline f32_16 sign(f32_16 a)
+{
+    uint32 umask = (uint32) (1U << 31);
+    __m512 mask        = _mm512_set1_ps(*(f32 *) &umask);
+
+    f32_16 signBit;
+    signBit.s = _mm512_and_ps(a.s, mask);
+
+    f32_16 b;
+    b.s = _mm512_set1_ps(1.0f);
+
+    f32_16 simd = b | signBit;
+
+    return simd;
+}
+
+inline f32_16 floor(f32_16 a)
+{
+    f32_16 simd;
+    simd.s = _mm512_floor_ps(a.s);
+
+    return simd;
+}
+
+inline f32_16 ceil(f32_16 a)
+{
+    f32_16 simd;
+    simd.s = _mm512_ceil_ps(a.s);
+
+    return simd;
+}
+
+inline f32_16 sqrt(f32_16 a)
+{
+    f32_16 simd;
+    simd.s = _mm512_sqrt_ps(a.s);
+
+    return simd;
+}
+
+inline f32_16 sqrt_inv_approx(f32_16 a)
+{
+    f32_16 simd;
+    simd.s = _mm512_rsqrt14_ps(a.s);
+
+    return simd;
+}
+
+inline f32_16 one_over_approx(f32_16 a)
+{
+    f32_16 simd;
+    simd.s = _mm512_rcp14_ps(a.s);
+
+    return simd;
+}
+
+inline f32_16 clamp(f32_16 min_value, f32_16 a, f32_16 max_value)
+{
+    return simd_min(simd_max(a, min_value), max_value);
+}
+
+inline int32 which_true(f32_16 a)
+{
+    int32 which_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s));
+
+    return which_true;
+}
+
+inline bool any_true(f32_16 a)
+{
+    bool is_any_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) > 0;
+
+    return is_any_true;
+}
+
+inline bool all_true(f32_16 a)
+{
+    bool is_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 65535;
+
+    return is_true;
+}
+
+inline bool all_false(f32_16 a)
+{
+    // @todo This can be optimized (requires also changes in the comparison functions return)
+    bool is_false = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 0;
+
+    return is_false;
+}
+
+inline
+f32_16 simd_sin(f32_16 a)
+{
+    f32_16 simd;
+    simd.s = _mm512_sin_ps(a.s);
+
+    return simd;
+}
+
+inline
+f32_16 simd_cos(f32_16 a)
+{
+    f32_16 simd;
+    simd.s = _mm512_cos_ps(a.s);
+
+    return simd;
+}
+
+inline
+f32_16 simd_asin(f32_16 a)
+{
+    f32_16 simd;
+    simd.s = _mm512_asin_ps(a.s);
+
+    return simd;
+}
+
+inline
+f32_16 simd_acos(f32_16 a)
+{
+    f32_16 simd;
+    simd.s = _mm512_acos_ps(a.s);
+
+    return simd;
+}
+
+// @todo implement more trigonometry functions
+
+#endif
diff --git a/architecture/x86/simd/SIMD_F32_SSE.h b/architecture/x86/simd/SIMD_F32_SSE.h
new file mode 100644
index 0000000..f1dfa79
--- /dev/null
+++ b/architecture/x86/simd/SIMD_F32_SSE.h
@@ -0,0 +1,381 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_STDLIB_SIMD_F32_SSE_H
+#define COMS_STDLIB_SIMD_F32_SSE_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+#include "../../../stdlib/Types.h"
+#include "SIMD_SVML_SSE.h"
+
+struct f32_4 {
+    union {
+        #if ARM
+            svfloat32_t s;
+        #else
+            __m128 s;
+        #endif
+        // Scalar view overlaying s (same union storage): four f32 elements
+        f32 v[4];
+    };
+};
+
+inline f32_4 load_f32_4(const f32* mem)
+{
+    // Aligned load (_mm_load_ps): mem must be 16-byte aligned; mem[0] goes to lane 0.
+    f32_4 loaded;
+    loaded.s = _mm_load_ps(mem);
+    return loaded;
+}
+
+inline f32_4 init_f32_4(const f32* mem)
+{
+    // _mm_set_ps lists lanes highest first: mem[0] lands in lane 3, mem[3] in lane 0.
+    f32_4 packed;
+    packed.s = _mm_set_ps(mem[0], mem[1], mem[2], mem[3]);
+    return packed;
+}
+
+inline void unload_f32_4(f32_4 a, f32 *array) { _mm_store_ps(array, a.s); } // aligned store: array must be 16-byte aligned
+
+inline f32_4 init_zero_f32_4()
+{
+    // All four lanes set to 0.0f.
+    f32_4 zeros;
+    zeros.s = _mm_setzero_ps();
+    return zeros;
+}
+
+inline f32_4 init_value_f32_4(f32 value)
+{
+    // Broadcast value into all four lanes.
+    f32_4 splat;
+    splat.s = _mm_set1_ps(value);
+    return splat;
+}
+
+inline f32_4 init_values_f32_4(f32 a, f32 b, f32 c, f32 d)
+{
+    // _mm_set_ps lists lanes highest first: a lands in lane 3, d in lane 0.
+    f32_4 packed;
+    packed.s = _mm_set_ps(a, b, c, d);
+    return packed;
+}
+
+inline f32_4 operator+(f32_4 a, f32_4 b)
+{
+    // Lane-wise a + b.
+    f32_4 sum;
+    sum.s = _mm_add_ps(a.s, b.s);
+    return sum;
+}
+
+inline f32_4 operator-(f32_4 a, f32_4 b)
+{
+    // Lane-wise a - b.
+    f32_4 difference;
+    difference.s = _mm_sub_ps(a.s, b.s);
+    return difference;
+}
+
+inline f32_4 operator-(f32_4 a) { return init_zero_f32_4() - a; } // negation computed as 0 - a
+
+inline f32_4 operator*(f32_4 a, f32_4 b)
+{
+    // Lane-wise a * b.
+    f32_4 product;
+    product.s = _mm_mul_ps(a.s, b.s);
+    return product;
+}
+
+inline f32_4 operator/(f32_4 a, f32_4 b)
+{
+    // Lane-wise a / b.
+    f32_4 quotient;
+    quotient.s = _mm_div_ps(a.s, b.s);
+    return quotient;
+}
+
+inline f32_4 operator^(f32_4 a, f32_4 b)
+{
+    // Bitwise XOR of the raw lane bits of a and b.
+    f32_4 bits;
+    bits.s = _mm_xor_ps(a.s, b.s);
+    return bits;
+}
+
+inline f32_4 &operator-=(f32_4 &a, f32_4 b)
+{
+    // Compound form: evaluate a - b with the binary operator and
+    // write the result back into the left operand.
+    return a = a - b;
+}
+
+inline f32_4 &operator+=(f32_4 &a, f32_4 b)
+{
+    // Compound form: evaluate a + b with the binary operator and
+    // write the result back into the left operand.
+    return a = a + b;
+}
+
+inline f32_4 &operator*=(f32_4 &a, f32_4 b)
+{
+    // Compound form: evaluate a * b with the binary operator and
+    // write the result back into the left operand.
+    return a = a * b;
+}
+
+inline f32_4 &operator/=(f32_4 &a, f32_4 b)
+{
+    // Compound form: evaluate a / b with the binary operator and
+    // write the result back into the left operand.
+    return a = a / b;
+}
+
+inline f32_4 &operator^=(f32_4 &a, f32_4 b)
+{
+    // Compound form: evaluate a ^ b with the binary operator and
+    // write the result back into the left operand.
+    return a = a ^ b;
+}
+
+inline f32_4 operator<(f32_4 a, f32_4 b)
+{
+    // Lane mask: all ones where a < b, all zeros otherwise.
+    f32_4 mask;
+    mask.s = _mm_cmplt_ps(a.s, b.s);
+    return mask;
+}
+
+inline f32_4 operator<=(f32_4 a, f32_4 b)
+{
+    // Lane mask: all ones where a <= b, all zeros otherwise.
+    f32_4 mask;
+    mask.s = _mm_cmple_ps(a.s, b.s);
+    return mask;
+}
+
+inline f32_4 operator>(f32_4 a, f32_4 b)
+{
+    // Lane mask: all ones where a > b, all zeros otherwise.
+    f32_4 mask;
+    mask.s = _mm_cmpgt_ps(a.s, b.s);
+    return mask;
+}
+
+inline f32_4 operator>=(f32_4 a, f32_4 b)
+{
+    // Lane mask: all ones where a >= b, all zeros otherwise.
+    f32_4 mask;
+    mask.s = _mm_cmpge_ps(a.s, b.s);
+    return mask;
+}
+
+inline f32_4 operator==(f32_4 a, f32_4 b)
+{
+    // Lane mask: all ones where a == b, all zeros otherwise.
+    f32_4 mask;
+    mask.s = _mm_cmpeq_ps(a.s, b.s);
+    return mask;
+}
+
+inline f32_4 operator!=(f32_4 a, f32_4 b)
+{
+    // Lane mask: all ones where the lanes of a and b differ, all zeros otherwise.
+    f32_4 mask;
+    mask.s = _mm_cmpneq_ps(a.s, b.s);
+    return mask;
+}
+
+inline f32_4 operator&(f32_4 a, f32_4 b)
+{
+    // Bitwise AND of the raw lane bits of a and b.
+    f32_4 bits;
+    bits.s = _mm_and_ps(a.s, b.s);
+    return bits;
+}
+
+inline f32_4 operator|(f32_4 a, f32_4 b)
+{
+    // Bitwise OR of the raw lane bits of a and b.
+    f32_4 bits;
+    bits.s = _mm_or_ps(a.s, b.s);
+    return bits;
+}
+
+inline f32_4 &operator&=(f32_4 &a, f32_4 b)
+{
+    // Compound form: evaluate a & b with the binary operator and
+    // write the result back into the left operand.
+    return a = a & b;
+}
+
+inline f32_4 &operator|=(f32_4 &a, f32_4 b)
+{
+    // Compound form: evaluate a | b with the binary operator and
+    // write the result back into the left operand.
+    return a = a | b;
+}
+
+inline f32_4 abs(f32_4 a)
+{
+    // Sign-bit mask (0x80000000 per lane), built from an integer vector so no
+    // uint32 is punned through an f32 pointer.
+    const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32((int32) 0x80000000));
+
+    f32_4 simd;
+    simd.s = _mm_andnot_ps(sign_mask, a.s); // (~mask) & a clears the sign bit -> |a|
+    return simd;
+}
+
+inline f32_4 simd_min(f32_4 a, f32_4 b)
+{
+    // Lane-wise minimum of a and b.
+    f32_4 smaller;
+    smaller.s = _mm_min_ps(a.s, b.s);
+    return smaller;
+}
+
+inline f32_4 simd_max(f32_4 a, f32_4 b)
+{
+    // Lane-wise maximum of a and b.
+    f32_4 larger;
+    larger.s = _mm_max_ps(a.s, b.s);
+    return larger;
+}
+
+inline f32_4 sign(f32_4 a)
+{
+    // Sign-bit mask built from an integer vector and reinterpreted, rather
+    // than punning a uint32 through an f32 pointer (strict-aliasing UB).
+    const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32((int32) 0x80000000));
+
+    // Isolate the sign bit of every lane.
+    f32_4 sign_bits;
+    sign_bits.s = _mm_and_ps(a.s, sign_mask);
+
+    // OR it onto 1.0f: sign bit set (incl. -0.0f) gives -1.0f, otherwise +1.0f.
+    f32_4 simd;
+    simd.s = _mm_or_ps(_mm_set1_ps(1.0f), sign_bits.s);
+    return simd;
+}
+
+inline f32_4 floor(f32_4 a)
+{
+    // Round each of the four lanes down to the nearest integral value.
+    f32_4 rounded;
+    rounded.s = _mm_floor_ps(a.s);
+    return rounded;
+}
+
+inline f32_4 ceil(f32_4 a)
+{
+    // Round each of the four lanes up to the nearest integral value.
+    f32_4 rounded;
+    rounded.s = _mm_ceil_ps(a.s);
+    return rounded;
+}
+
+inline f32_4 sqrt(f32_4 a)
+{
+    // Lane-wise square root of the four floats in a.
+    f32_4 root;
+    root.s = _mm_sqrt_ps(a.s);
+    return root;
+}
+
+inline f32_4 sqrt_inv_approx(f32_4 a)
+{
+    // Approximate 1/sqrt(a) per lane (_mm_rsqrt_ps is a low-precision estimate).
+    f32_4 inv_root;
+    inv_root.s = _mm_rsqrt_ps(a.s);
+    return inv_root;
+}
+
+inline f32_4 one_over_approx(f32_4 a)
+{
+    // Approximate 1/a per lane (_mm_rcp_ps is a low-precision estimate).
+    f32_4 reciprocal;
+    reciprocal.s = _mm_rcp_ps(a.s);
+    return reciprocal;
+}
+
+inline f32_4 clamp(f32_4 min_value, f32_4 a, f32_4 max_value)
+{
+    return simd_min(simd_max(a, min_value), max_value); // raise to min_value first, then cap at max_value
+}
+
+inline int32 which_true(f32_4 a)
+{
+    // Bit i of the result is the most significant (sign) bit of lane i.
+    const int32 lane_bits = _mm_movemask_ps(a.s);
+    return lane_bits;
+}
+
+inline bool any_true(f32_4 a)
+{
+    // True when at least one lane has its most significant (sign) bit set.
+    const bool has_set_lane = _mm_movemask_ps(a.s) > 0;
+    return has_set_lane;
+}
+
+inline bool all_true(f32_4 a)
+{
+    // 15 == 0b1111: the mask bit of every one of the four lanes is set.
+    const bool every_lane_set = _mm_movemask_ps(a.s) == 15;
+    return every_lane_set;
+}
+
+inline bool all_false(f32_4 a)
+{
+    // True when no lane has its most significant (sign) bit set.
+    const bool no_lane_set = _mm_movemask_ps(a.s) == 0;
+    return no_lane_set;
+}
+
+inline
+f32_4 simd_sin(f32_4 a)
+{
+    // Lane-wise sine of the four floats in a.
+    f32_4 sines;
+    sines.s = _mm_sin_ps(a.s);
+    return sines;
+}
+
+inline
+f32_4 simd_cos(f32_4 a)
+{
+    // Lane-wise cosine of the four floats in a.
+    f32_4 cosines;
+    cosines.s = _mm_cos_ps(a.s);
+    return cosines;
+}
+
+inline
+f32_4 simd_asin(f32_4 a)
+{
+    // Lane-wise arcsine of the four floats in a.
+    f32_4 arcsines;
+    arcsines.s = _mm_asin_ps(a.s);
+    return arcsines;
+}
+
+inline
+f32_4 simd_acos(f32_4 a)
+{
+    // Lane-wise arccosine of the four floats in a.
+    f32_4 arccosines;
+    arccosines.s = _mm_acos_ps(a.s);
+    return arccosines;
+}
+
+// @todo implement more trigonometry functions
+
+#endif
diff --git a/architecture/x86/simd/SIMD_F64.h b/architecture/x86/simd/SIMD_F64.h
old mode 100644
new mode 100755
index 9166cb2..57101fc
--- a/architecture/x86/simd/SIMD_F64.h
+++ b/architecture/x86/simd/SIMD_F64.h
@@ -14,40 +14,16 @@
 
 #include "../../../stdlib/Types.h"
 
-struct f64_2 {
-    union {
-        #if ARM
-            svfloat64_t s;
-        #else
-            __m128 s;
-        #endif
+#ifdef MACRO_CPU_FEATURE_SSE42
+    #include "SIMD_F64_SSE.h"
+#endif
 
-        f64 v[2];
-    };
-};
+#ifdef MACRO_CPU_FEATURE_AVX2
+    #include "SIMD_F64_AVX2.h"
+#endif
 
-struct f64_4 {
-    union {
-        #if ARM
-            svfloat64_t s;
-        #else
-            __m256 s;
-        #endif
-
-        f64 v[4];
-    };
-};
-
-struct f64_8 {
-    union {
-        #if ARM
-            svfloat64_t s;
-        #else
-            __m512 s;
-        #endif
-
-        f64 v[8];
-    };
-};
+#ifdef MACRO_CPU_FEATURE_AVX512
+    #include "SIMD_F64_AVX512.h"
+#endif
 
 #endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_F64_AVX2.h b/architecture/x86/simd/SIMD_F64_AVX2.h
new file mode 100644
index 0000000..278c508
--- /dev/null
+++ b/architecture/x86/simd/SIMD_F64_AVX2.h
@@ -0,0 +1,30 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_STDLIB_SIMD_F64_AVX2_H
+#define COMS_STDLIB_SIMD_F64_AVX2_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+#include "../../../stdlib/Types.h"
+
+struct f64_4 {
+    union {
+        #if ARM
+            svfloat64_t s;
+        #else
+            __m256d s; // packed-double register; __m256 is the packed-float type and is rejected by *_pd intrinsics
+        #endif
+        // Scalar view overlaying s (same union storage): four f64 elements
+        f64 v[4];
+    };
+};
+
+
+#endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_F64_AVX512.h b/architecture/x86/simd/SIMD_F64_AVX512.h
new file mode 100644
index 0000000..d3aa225
--- /dev/null
+++ b/architecture/x86/simd/SIMD_F64_AVX512.h
@@ -0,0 +1,29 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_STDLIB_SIMD_F64_AVX512_H
+#define COMS_STDLIB_SIMD_F64_AVX512_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+#include "../../../stdlib/Types.h"
+
+struct f64_8 {
+    union {
+        #if ARM
+            svfloat64_t s;
+        #else
+            __m512d s; // packed-double register; __m512 is the packed-float type and is rejected by *_pd intrinsics
+        #endif
+        // Scalar view overlaying s (same union storage): eight f64 elements
+        f64 v[8];
+    };
+};
+
+#endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_F64_SSE.h b/architecture/x86/simd/SIMD_F64_SSE.h
new file mode 100644
index 0000000..368b93c
--- /dev/null
+++ b/architecture/x86/simd/SIMD_F64_SSE.h
@@ -0,0 +1,29 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_STDLIB_SIMD_F64_SSE_H
+#define COMS_STDLIB_SIMD_F64_SSE_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+#include "../../../stdlib/Types.h"
+
+struct f64_2 {
+    union {
+        #if ARM
+            svfloat64_t s;
+        #else
+            __m128d s; // packed-double register; __m128 is the packed-float type and is rejected by *_pd intrinsics
+        #endif
+        // Scalar view overlaying s (same union storage): two f64 elements
+        f64 v[2];
+    };
+};
+
+#endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_I16.h b/architecture/x86/simd/SIMD_I16.h
old mode 100644
new mode 100755
index 190b7f8..09ff02e
--- a/architecture/x86/simd/SIMD_I16.h
+++ b/architecture/x86/simd/SIMD_I16.h
@@ -14,742 +14,17 @@
 
 #include "../../../stdlib/Types.h"
 
-struct int16_8 {
-    union {
-        #if ARM
-            svint16_t s;
-        #else
-            __m128i s;
-        #endif
+#ifdef MACRO_CPU_FEATURE_SSE42
+    #include "SIMD_I16_SSE.h"
+#endif
 
-        int16 v[8];
-    };
-};
+#ifdef MACRO_CPU_FEATURE_AVX2
+    #include "SIMD_I16_AVX2.h"
+#endif
 
-struct int16_16 {
-    union {
-        #if ARM
-            svint16_t s;
-        #else
-            __m256i s;
-        #endif
-
-        int16 v[16];
-    };
-};
-
-struct int16_32 {
-    union {
-        #if ARM
-            svint16_t s;
-        #else
-            __m512i s;
-        #endif
-
-        int16 v[32];
-    };
-};
-
-
-inline int16_8 load_int16_8(const int16* mem)
-{
-    int16_8 simd;
-    simd.s = _mm_load_si128((__m128i *) mem);
-
-    return simd;
-}
-
-inline int16_8 init_int16_8(const int16* mem)
-{
-    int16_8 simd;
-    simd.s = _mm_set_epi16(
-        mem[0], mem[1], mem[2], mem[3],
-        mem[4], mem[5], mem[6], mem[7]
-    );
-
-    return simd;
-}
-
-inline void unload_int16_8(int16_8 a, int16 *array) { _mm_store_si128((__m128i *) array, a.s); }
-
-inline int16_16 load_int16_16(const int16* mem)
-{
-    int16_16 simd;
-    simd.s = _mm256_load_si256((__m256i *) mem);
-
-    return simd;
-}
-
-inline int16_16 init_int16_16(const int16* mem)
-{
-    int16_16 simd;
-    simd.s = _mm256_set_epi16(
-        mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7],
-        mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15]
-    );
-
-    return simd;
-}
-
-inline void unload_int16_16(int16_16 a, int16 *array) { _mm256_store_si256((__m256i *) array, a.s); }
-
-inline int16_32 load_int16_32(const int16* mem)
-{
-    int16_32 simd;
-    simd.s = _mm512_load_si512((__m512i *) mem);
-
-    return simd;
-}
-
-inline int16_32 init_int16_32(const int16* mem)
-{
-    int16_32 simd;
-    simd.s = _mm512_set_epi16(
-        mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7],
-        mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15],
-        mem[16], mem[17], mem[18], mem[19], mem[20], mem[21], mem[22], mem[23],
-        mem[24], mem[25], mem[26], mem[27], mem[28], mem[29], mem[30], mem[31]
-    );
-
-    return simd;
-}
-
-inline void unload_int16_32(int16_32 a, int16 *array) { _mm512_storeu_epi16(array, a.s); }
-
-inline int16_8 init_zero_int16_8()
-{
-    int16_8 simd;
-    simd.s = _mm_setzero_si128();
-
-    return simd;
-}
-
-inline int16_16 init_zero_int16_16()
-{
-    int16_16 simd;
-    simd.s = _mm256_setzero_si256();
-
-    return simd;
-}
-
-inline int16_32 init_zero_int16_32()
-{
-    int16_32 simd;
-    simd.s = _mm512_setzero_si512();
-
-    return simd;
-}
-
-inline int16_8 init_value_int16_8(int16 value)
-{
-    int16_8 simd;
-    simd.s = _mm_set1_epi16(value);
-
-    return simd;
-}
-
-inline int16_16 init_value_int16_16(int16 value)
-{
-    int16_16 simd;
-    simd.s = _mm256_set1_epi16(value);
-
-    return simd;
-}
-
-inline int16_32 init_value_int16_32(int16 value)
-{
-    int16_32 simd;
-    simd.s = _mm512_set1_epi16(value);
-
-    return simd;
-}
-
-inline int16_8 operator+(int16_8 a, int16_8 b)
-{
-    int16_8 simd;
-    simd.s = _mm_add_epi16(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_16 operator+(int16_16 a, int16_16 b)
-{
-    int16_16 simd;
-    simd.s = _mm256_add_epi16(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_32 operator+(int16_32 a, int16_32 b)
-{
-    int16_32 simd;
-    simd.s = _mm512_add_epi16(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_8 operator-(int16_8 a, int16_8 b)
-{
-    int16_8 simd;
-    simd.s = _mm_sub_epi16(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_8 operator-(int16_8 a) { return init_zero_int16_8() - a; }
-
-inline int16_16 operator-(int16_16 a, int16_16 b)
-{
-    int16_16 simd;
-    simd.s = _mm256_sub_epi16(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_16 operator-(int16_16 a) { return init_zero_int16_16() - a; }
-
-inline int16_32 operator-(int16_32 a, int16_32 b)
-{
-    int16_32 simd;
-    simd.s = _mm512_sub_epi16(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_32 operator-(int16_32 a) { return init_zero_int16_32() - a; }
-
-inline int16_8 operator*(int16_8 a, int16_8 b)
-{
-    int16_8 simd;
-    simd.s = _mm_mul_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_16 operator*(int16_16 a, int16_16 b)
-{
-    int16_16 simd;
-    simd.s = _mm256_mul_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_32 operator*(int16_32 a, int16_32 b)
-{
-    int16_32 simd;
-    simd.s = _mm512_mul_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_8 operator^(int16_8 a, int16_8 b)
-{
-    int16_8 simd;
-    simd.s = _mm_xor_si128(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_16 operator^(int16_16 a, int16_16 b)
-{
-    int16_16 simd;
-    simd.s = _mm256_xor_si256(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_32 operator^(int16_32 a, int16_32 b)
-{
-    int16_32 simd;
-    simd.s = _mm512_xor_si512(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_8 &operator-=(int16_8 &a, int16_8 b)
-{
-    a = a - b;
-
-    return a;
-}
-
-inline int16_16 &operator-=(int16_16 &a, int16_16 b)
-{
-    a = a - b;
-
-    return a;
-}
-
-inline int16_32 &operator-=(int16_32 &a, int16_32 b)
-{
-    a = a - b;
-
-    return a;
-}
-
-inline int16_8 &operator+=(int16_8 &a, int16_8 b)
-{
-    a = a + b;
-
-    return a;
-}
-
-inline int16_16 &operator+=(int16_16 &a, int16_16 b)
-{
-    a = a + b;
-
-    return a;
-}
-
-inline int16_32 &operator+=(int16_32 &a, int16_32 b)
-{
-    a = a + b;
-
-    return a;
-}
-
-inline int16_8 &operator*=(int16_8 &a, int16_8 b)
-{
-    a = a * b;
-
-    return a;
-}
-
-inline int16_16 &operator*=(int16_16 &a, int16_16 b)
-{
-    a = a * b;
-
-    return a;
-}
-
-inline int16_32 &operator*=(int16_32 &a, int16_32 b)
-{
-    a = a * b;
-
-    return a;
-}
-
-inline int16_8 &operator^=(int16_8 &a, int16_8 b)
-{
-    a = a ^ b;
-
-    return a;
-}
-
-inline int16_16 &operator^=(int16_16 &a, int16_16 b)
-{
-    a = a ^ b;
-
-    return a;
-}
-
-inline int16_32 &operator^=(int16_32 &a, int16_32 b)
-{
-    a = a ^ b;
-
-    return a;
-}
-
-inline int16_8 operator<(int16_8 a, int16_8 b)
-{
-    int16_8 simd;
-    simd.s = _mm_cmplt_epi16(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_16 operator<(int16_16 a, int16_16 b)
-{
-    int16_16 simd;
-    simd.s = _mm256_xor_si256(_mm256_cmpgt_epi16(a.s, b.s), _mm256_set1_epi16(-1));
-
-    return simd;
-}
-
-inline int16_32 operator<(int16_32 a, int16_32 b)
-{
-    int16_32 simd;
-    simd.s = _mm512_mask_blend_epi16(_mm512_cmplt_epi16_mask(a.s, b.s), a.s, b.s);
-
-    return simd;
-}
-
-inline int16_8 operator<=(int16_8 a, int16_8 b)
-{
-    int16_8 simd;
-    simd.s = _mm_andnot_si128(_mm_cmplt_epi16(b.s, a.s), _mm_set1_epi16(-1));
-
-    return simd;
-}
-
-inline int16_16 operator<=(int16_16 a, int16_16 b)
-{
-    int16_16 simd;
-    simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi16(a.s, b.s), _mm256_set1_epi16(-1));
-
-    return simd;
-}
-
-inline int16_32 operator<=(int16_32 a, int16_32 b)
-{
-    int16_32 simd;
-    __mmask32 mask = _mm512_cmp_epi16_mask(a.s, b.s, _MM_CMPINT_LE);
-    simd.s = _mm512_mask_blend_epi16(mask, b.s, a.s);
-
-    return simd;
-}
-
-inline int16_8 operator>(int16_8 a, int16_8 b)
-{
-    int16_8 simd;
-    simd.s = _mm_cmpgt_epi16(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_16 operator>(int16_16 a, int16_16 b)
-{
-    int16_16 simd;
-    simd.s = _mm256_cmpgt_epi16(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_32 operator>(int16_32 a, int16_32 b)
-{
-    int16_32 simd;
-    simd.s = _mm512_mask_blend_epi16(_mm512_cmpgt_epi16_mask(a.s, b.s), a.s, b.s);
-
-    return simd;
-}
-
-inline int16_8 operator>=(int16_8 a, int16_8 b)
-{
-    int16_8 simd;
-    simd.s = _mm_andnot_si128(_mm_cmplt_epi16(a.s, b.s), _mm_set1_epi16(-1));
-
-    return simd;
-}
-
-inline int16_16 operator>=(int16_16 a, int16_16 b)
-{
-    int16_16 simd;
-    simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi16(b.s, a.s), _mm256_set1_epi16(-1));
-
-    return simd;
-}
-
-inline int16_32 operator>=(int16_32 a, int16_32 b)
-{
-    int16_32 simd;
-    simd.s = _mm512_mask_blend_epi16(_mm512_cmpge_epi16_mask(a.s, b.s), a.s, b.s);
-
-    return simd;
-}
-
-inline int16_8 operator==(int16_8 a, int16_8 b)
-{
-    int16_8 simd;
-    simd.s = _mm_cmpeq_epi16(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_16 operator==(int16_16 a, int16_16 b)
-{
-    int16_16 simd;
-    simd.s = _mm256_cmpeq_epi16(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_32 operator==(int16_32 a, int16_32 b)
-{
-    int16_32 simd;
-    simd.s = _mm512_mask_blend_epi16(_mm512_cmpeq_epi16_mask(a.s, b.s), a.s, b.s);
-
-    return simd;
-}
-
-inline int16_8 operator!=(int16_8 a, int16_8 b)
-{
-    int16_8 simd;
-    simd.s = _mm_andnot_si128(_mm_cmpeq_epi16(a.s, b.s), _mm_set1_epi16(-1));
-
-    return simd;
-}
-
-inline int16_16 operator!=(int16_16 a, int16_16 b)
-{
-    int16_16 simd;
-    simd.s = _mm256_mask_blend_epi16(_mm256_cmp_epi16_mask(a.s, b.s, _MM_CMPINT_NE), a.s, b.s);
-
-    return simd;
-}
-
-inline int16_32 operator!=(int16_32 a, int16_32 b)
-{
-    int16_32 simd;
-    simd.s = _mm512_mask_blend_epi16(_mm512_cmp_epi16_mask(a.s, b.s, _MM_CMPINT_NE), a.s, b.s);
-
-    return simd;
-}
-
-inline int16_8 operator&(int16_8 a, int16_8 b)
-{
-    int16_8 simd;
-    simd.s = _mm_and_si128(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_16 operator&(int16_16 a, int16_16 b)
-{
-    int16_16 simd;
-    simd.s = _mm256_and_si256(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_32 operator&(int16_32 a, int16_32 b)
-{
-    int16_32 simd;
-    simd.s = _mm512_and_si512(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_8 operator|(int16_8 a, int16_8 b)
-{
-    int16_8 simd;
-    simd.s = _mm_or_si128(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_16 operator|(int16_16 a, int16_16 b)
-{
-    int16_16 simd;
-    simd.s = _mm256_or_si256(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_32 operator|(int16_32 a, int16_32 b)
-{
-    int16_32 simd;
-    simd.s = _mm512_or_si512(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_8 &operator&=(int16_8 &a, int16_8 b)
-{
-    a = a & b;
-
-    return a;
-}
-
-inline int16_16 &operator&=(int16_16 &a, int16_16 b)
-{
-    a = a & b;
-
-    return a;
-}
-
-inline int16_32 &operator&=(int16_32 &a, int16_32 b)
-{
-    a = a & b;
-
-    return a;
-}
-
-inline int16_8 &operator|=(int16_8 &a, int16_8 b)
-{
-    a = a | b;
-
-    return a;
-}
-
-inline int16_16 &operator|=(int16_16 &a, int16_16 b)
-{
-    a = a | b;
-
-    return a;
-}
-
-inline int16_32 &operator|=(int16_32 &a, int16_32 b)
-{
-    a = a | b;
-
-    return a;
-}
-
-inline int16_8 abs(int16_8 a)
-{
-    int16_8 simd;
-    simd.s = _mm_abs_epi16(a.s);
-
-    return simd;
-}
-
-inline int16_16 abs(int16_16 a)
-{
-    int16_16 simd;
-    simd.s = _mm256_abs_epi16(a.s);
-
-    return simd;
-}
-
-inline int16_32 abs(int16_32 a)
-{
-    int16_32 simd;
-    simd.s = _mm512_abs_epi16(a.s);
-
-    return simd;
-}
-
-inline int16_8 simd_min(int16_8 a, int16_8 b)
-{
-    int16_8 simd;
-    simd.s = _mm_min_epi16(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_16 simd_min(int16_16 a, int16_16 b)
-{
-    int16_16 simd;
-    simd.s = _mm256_min_epi16(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_32 simd_min(int16_32 a, int16_32 b)
-{
-    int16_32 simd;
-    simd.s = _mm512_min_epi16(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_8 simd_max(int16_8 a, int16_8 b)
-{
-    int16_8 simd;
-    simd.s = _mm_max_epi16(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_16 simd_max(int16_16 a, int16_16 b)
-{
-    int16_16 simd;
-    simd.s = _mm256_max_epi16(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_32 simd_max(int16_32 a, int16_32 b)
-{
-    int16_32 simd;
-    simd.s = _mm512_max_epi16(a.s, b.s);
-
-    return simd;
-}
-
-inline int16_8 clamp(int16_8 min_value, int16_8 a, int16_8 max_value)
-{
-    return simd_min(simd_max(a, min_value), max_value);
-}
-
-inline int16_16 clamp(int16_16 min_value, int16_16 a, int16_16 max_value)
-{
-    return simd_min(simd_max(a, min_value), max_value);
-}
-
-inline int16_32 clamp(int16_32 min_value, int16_32 a, int16_32 max_value)
-{
-    return simd_min(simd_max(a, min_value), max_value);
-}
-
-inline int32 which_true(int16_8 a)
-{
-    return _mm_movemask_epi8(a.s);
-}
-
-inline int32 which_true(int16_16 a)
-{
-    return _mm256_movemask_epi8(a.s);
-}
-
-inline int32 which_true(int16_32 a)
-{
-    return _mm512_movepi16_mask(a.s);
-}
-
-inline bool any_true(int16_8 a)
-{
-    bool is_any_true = _mm_movemask_epi8(a.s) > 0;
-
-    return is_any_true;
-}
-
-inline bool any_true(int16_16 a)
-{
-    bool is_any_true = _mm256_movemask_epi8(a.s) > 0;
-
-    return is_any_true;
-}
-
-inline bool any_true(int16_32 a)
-{
-    bool is_any_true = _mm512_movepi16_mask(a.s) > 0;
-
-    return is_any_true;
-}
-
-inline bool all_true(int16_8 a)
-{
-    bool is_true = _mm_movemask_epi8(a.s) == 15;
-
-    return is_true;
-}
-
-inline bool all_true(int16_16 a)
-{
-    bool is_true = _mm256_movemask_epi8(a.s) == 255;
-
-    return is_true;
-}
-
-inline bool all_true(int16_32 a)
-{
-    bool is_true = _mm512_movepi16_mask(a.s) == 65535;
-
-    return is_true;
-}
-
-inline bool all_false(int16_8 a)
-{
-    bool is_false = _mm_movemask_epi8(a.s) == 0;
-
-    return is_false;
-}
-
-inline bool all_false(int16_16 a)
-{
-    bool is_false = _mm256_movemask_epi8(a.s) == 0;
-
-    return is_false;
-}
-
-inline bool all_false(int16_32 a)
-{
-    // @todo This can be optimized (requires also changes in the comparison functions return)
-    bool is_false = _mm512_movepi16_mask(a.s) == 0;
-
-    return is_false;
-}
+#ifdef MACRO_CPU_FEATURE_AVX512
+    #include "SIMD_I16_AVX512.h"
+#endif
 
 // @todo from down here we can optimize some of the code by NOT using the wrappers
 //      the code is self contained and we could use te intrinsic functions directly
@@ -758,74 +33,93 @@ inline
 void simd_mult(const int16* a, f32 b, int16* result, int32 size, int32 steps)
 {
     int32 i = 0;
+    steps = intrin_validate_steps((const byte*) a, steps);
+    steps = intrin_validate_steps((const byte*) result, steps);
 
-    if (steps == 16) {
-        __m512i a_16;
-        __m512 af_lo, af_hi;
-        __m512 b_16 = _mm512_set1_ps(b);
-        __m512 result_lo, result_hi;
-        __m512i result_16;
+    #ifdef MACRO_CPU_FEATURE_AVX512
+        if (steps >= 16) {
+            steps = 16;
+            __m512i a_16;
+            __m512 af_lo, af_hi;
+            __m512 b_16 = _mm512_set1_ps(b);
+            __m512 result_lo, result_hi;
+            __m512i result_16;
 
-        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_loadu_si512((__m512i*) a);
+            for (; i <= size - steps; i += steps) {
+                a_16 = _mm512_load_si512((__m512i*) a);
 
-            af_lo = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(a_16, 0)));
-            af_hi = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(a_16, 1)));
+                af_lo = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(a_16, 0)));
+                af_hi = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(a_16, 1)));
 
-            result_lo = _mm512_mul_ps(af_lo, b_16);
-            result_hi = _mm512_mul_ps(af_hi, b_16);
+                result_lo = _mm512_mul_ps(af_lo, b_16);
+                result_hi = _mm512_mul_ps(af_hi, b_16);
 
-            result_16 = _mm512_packs_epi32(_mm512_cvtps_epi32(result_lo), _mm512_cvtps_epi32(result_hi));
-            _mm512_storeu_si512((__m512i*) result, result_16);
+                result_16 = _mm512_packs_epi32(_mm512_cvtps_epi32(result_lo), _mm512_cvtps_epi32(result_hi));
+                _mm512_store_si512((__m512i*) result, result_16);
 
-            a += steps;
-            result += steps;
+                a += steps;
+                result += steps;
+            }
+
+            steps = 1;
         }
-    } else if (steps == 8) {
-        __m256i a_8;
-        __m256 af_lo, af_hi;
-        __m256 b_8 = _mm256_set1_ps(b);
-        __m256 result_lo, result_hi;
-        __m256i result_8;
+    #endif
 
-        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_loadu_si256((__m256i*) a);
+    #ifdef MACRO_CPU_FEATURE_AVX2
+        if (steps >= 8) {
+            steps = 8;
+            __m256i a_8;
+            __m256 af_lo, af_hi;
+            __m256 b_8 = _mm256_set1_ps(b);
+            __m256 result_lo, result_hi;
+            __m256i result_8;
 
-            af_lo = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(a_8, 0)));
-            af_hi = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(a_8, 1)));
+            for (; i <= size - steps; i += steps) {
+                a_8 = _mm256_load_si256((__m256i*) a);
 
-            result_lo = _mm256_mul_ps(af_lo, b_8);
-            result_hi = _mm256_mul_ps(af_hi, b_8);
+                af_lo = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(a_8, 0)));
+                af_hi = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(a_8, 1)));
 
-            result_8 = _mm256_packs_epi32(_mm256_cvtps_epi32(result_lo), _mm256_cvtps_epi32(result_hi));
-            _mm256_storeu_si256((__m256i*) result, result_8);
+                result_lo = _mm256_mul_ps(af_lo, b_8);
+                result_hi = _mm256_mul_ps(af_hi, b_8);
 
-            a += steps;
-            result += steps;
+                result_8 = _mm256_packs_epi32(_mm256_cvtps_epi32(result_lo), _mm256_cvtps_epi32(result_hi));
+                _mm256_store_si256((__m256i*) result, result_8);
+
+                a += steps;
+                result += steps;
+            }
+
+            steps = 1;
         }
-    } else if (steps == 4) {
-        __m128i a_4;
-        __m128 af_lo, af_hi;
-        __m128 b_4 = _mm_set1_ps(b);
-        __m128 result_lo, result_hi;
-        __m128i result_4;
+    #endif
 
-        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_loadu_si128((__m128i*) a);
+    #ifdef MACRO_CPU_FEATURE_SSE42
+        if (steps >= 4) {
+            steps = 4;
+            __m128i a_4;
+            __m128 af_lo, af_hi;
+            __m128 b_4 = _mm_set1_ps(b);
+            __m128 result_lo, result_hi;
+            __m128i result_4;
 
-            af_lo = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_4));
-            af_hi = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128(a_4, 8)));
+            for (; i <= size - steps; i += steps) {
+                a_4 = _mm_load_si128((__m128i*) a);
 
-            result_lo = _mm_mul_ps(af_lo, b_4);
-            result_hi = _mm_mul_ps(af_hi, b_4);
+                af_lo = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_4));
+                af_hi = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128(a_4, 8)));
 
-            result_4 = _mm_packs_epi32(_mm_cvtps_epi32(result_lo), _mm_cvtps_epi32(result_hi));
-            _mm_storeu_si128((__m128i*) result, result_4);
+                result_lo = _mm_mul_ps(af_lo, b_4);
+                result_hi = _mm_mul_ps(af_hi, b_4);
 
-            a += steps;
-            result += steps;
+                result_4 = _mm_packs_epi32(_mm_cvtps_epi32(result_lo), _mm_cvtps_epi32(result_hi));
+                _mm_store_si128((__m128i*) result, result_4);
+
+                a += steps;
+                result += steps;
+            }
         }
-    }
+    #endif
 
     // Handle any remaining elements
     for (; i < size; ++i) {
diff --git a/architecture/x86/simd/SIMD_I16_AVX2.h b/architecture/x86/simd/SIMD_I16_AVX2.h
new file mode 100644
index 0000000..90b2cd7
--- /dev/null
+++ b/architecture/x86/simd/SIMD_I16_AVX2.h
@@ -0,0 +1,262 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_TOS_STDLIB_SIMD_I16_AVX2_H
+#define COMS_TOS_STDLIB_SIMD_I16_AVX2_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+#include "../../../stdlib/Types.h"
+
+struct int16_16 {
+    union {
+        #if ARM
+            svint16_t s;
+        #else
+            __m256i s;
+        #endif
+        // Scalar view overlaying s (same union storage): sixteen int16 elements
+        int16 v[16];
+    };
+};
+
+inline int16_16 load_int16_16(const int16* mem)
+{
+    // Aligned load (_mm256_load_si256): mem must be 32-byte aligned.
+    int16_16 loaded;
+    loaded.s = _mm256_load_si256((__m256i *) mem);
+    return loaded;
+}
+
+inline int16_16 init_int16_16(const int16* mem)
+{
+    // _mm256_set_epi16 lists lanes highest first: mem[0] lands in lane 15, mem[15] in lane 0.
+    int16_16 packed;
+    packed.s = _mm256_set_epi16(
+        mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7],
+        mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15]
+    );
+    return packed;
+}
+
+inline void unload_int16_16(int16_16 a, int16 *array) { _mm256_store_si256((__m256i *) array, a.s); } // aligned store: array must be 32-byte aligned
+
+inline int16_16 init_zero_int16_16()
+{
+    // All sixteen lanes set to 0.
+    int16_16 zeros;
+    zeros.s = _mm256_setzero_si256();
+    return zeros;
+}
+
+inline int16_16 init_value_int16_16(int16 value)
+{
+    // Broadcast value into all sixteen lanes.
+    int16_16 splat;
+    splat.s = _mm256_set1_epi16(value);
+    return splat;
+}
+
+inline int16_16 operator+(int16_16 a, int16_16 b)
+{
+    // Lane-wise 16-bit a + b (wraps on overflow).
+    int16_16 sum;
+    sum.s = _mm256_add_epi16(a.s, b.s);
+    return sum;
+}
+
+inline int16_16 operator-(int16_16 a, int16_16 b)
+{
+    // Lane-wise 16-bit a - b (wraps on overflow).
+    int16_16 difference;
+    difference.s = _mm256_sub_epi16(a.s, b.s);
+    return difference;
+}
+
+inline int16_16 operator-(int16_16 a) { return init_zero_int16_16() - a; } // negation computed as 0 - a
+
+inline int16_16 operator*(int16_16 a, int16_16 b)
+{
+    // Lane-wise 16-bit product, keeping the low 16 bits of each result.
+    // (_mm256_mul_epi32 would multiply 32-bit lanes into 64-bit results.)
+    int16_16 simd;
+    simd.s = _mm256_mullo_epi16(a.s, b.s);
+    return simd;
+}
+
+inline int16_16 operator^(int16_16 a, int16_16 b)
+{
+    // Bitwise XOR of all 256 bits of a and b.
+    int16_16 bits;
+    bits.s = _mm256_xor_si256(a.s, b.s);
+    return bits;
+}
+
+inline int16_16 &operator-=(int16_16 &a, int16_16 b)
+{
+    // Compound form: evaluate a - b with the binary operator and
+    // write the result back into the left operand.
+    return a = a - b;
+}
+
+inline int16_16 &operator+=(int16_16 &a, int16_16 b)
+{
+    // Compound form: evaluate a + b with the binary operator and
+    // write the result back into the left operand.
+    return a = a + b;
+}
+
+inline int16_16 &operator*=(int16_16 &a, int16_16 b)
+{
+    // Compound form: evaluate a * b with the binary operator and
+    // write the result back into the left operand.
+    return a = a * b;
+}
+
+inline int16_16 &operator^=(int16_16 &a, int16_16 b)
+{
+    // Compound form: evaluate a ^ b with the binary operator and
+    // write the result back into the left operand.
+    return a = a ^ b;
+}
+
+inline int16_16 operator<(int16_16 a, int16_16 b)
+{
+    // a < b is b > a. Lanes are all ones where it holds, all zeros otherwise.
+    // (Inverting a > b, as before, yields a <= b and reports equal lanes as true.)
+    int16_16 simd;
+    simd.s = _mm256_cmpgt_epi16(b.s, a.s);
+    return simd;
+}
+
+inline int16_16 operator<=(int16_16 a, int16_16 b)
+{
+    // a <= b is computed as NOT (a > b): andnot against an all-ones vector.
+    int16_16 mask;
+    mask.s = _mm256_andnot_si256(_mm256_cmpgt_epi16(a.s, b.s), _mm256_set1_epi16(-1));
+    return mask;
+}
+
+inline int16_16 operator>(int16_16 a, int16_16 b)
+{
+    // Lane mask: all ones where a > b (signed), all zeros otherwise.
+    int16_16 mask;
+    mask.s = _mm256_cmpgt_epi16(a.s, b.s);
+    return mask;
+}
+
+inline int16_16 operator>=(int16_16 a, int16_16 b)
+{
+    // a >= b is computed as NOT (b > a): andnot against an all-ones vector.
+    int16_16 mask;
+    mask.s = _mm256_andnot_si256(_mm256_cmpgt_epi16(b.s, a.s), _mm256_set1_epi16(-1));
+    return mask;
+}
+
+inline int16_16 operator==(int16_16 a, int16_16 b)
+{
+    // Lane mask: all ones where a == b, all zeros otherwise.
+    int16_16 mask;
+    mask.s = _mm256_cmpeq_epi16(a.s, b.s);
+    return mask;
+}
+
+inline int16_16 operator!=(int16_16 a, int16_16 b)
+{
+    // a != b is NOT (a == b), using AVX2 only. The previous AVX-512 mask/blend
+    // form returned a mix of a and b instead of an all-ones/all-zeros lane mask.
+    int16_16 simd;
+    simd.s = _mm256_andnot_si256(_mm256_cmpeq_epi16(a.s, b.s), _mm256_set1_epi16(-1));
+    return simd;
+}
+
+
+inline int16_16 operator&(int16_16 a, int16_16 b)
+{
+    // Bitwise AND of all 256 bits of a and b.
+    int16_16 bits;
+    bits.s = _mm256_and_si256(a.s, b.s);
+    return bits;
+}
+
+inline int16_16 operator|(int16_16 a, int16_16 b)
+{
+    // Bitwise OR of all 256 bits of a and b.
+    int16_16 bits;
+    bits.s = _mm256_or_si256(a.s, b.s);
+    return bits;
+}
+
+inline int16_16 &operator&=(int16_16 &a, int16_16 b)
+{
+    // Compound form: evaluate a & b with the binary operator and
+    // write the result back into the left operand.
+    return a = a & b;
+}
+
+inline int16_16 &operator|=(int16_16 &a, int16_16 b)
+{
+    // Compound form: evaluate a | b with the binary operator and
+    // write the result back into the left operand.
+    return a = a | b;
+}
+
+inline int16_16 abs(int16_16 a)
+{
+    // Lane-wise absolute value of the sixteen int16 lanes.
+    int16_16 magnitude;
+    magnitude.s = _mm256_abs_epi16(a.s);
+    return magnitude;
+}
+
+inline int16_16 simd_min(int16_16 a, int16_16 b)
+{
+    // Lane-wise signed minimum of a and b.
+    int16_16 smaller;
+    smaller.s = _mm256_min_epi16(a.s, b.s);
+    return smaller;
+}
+
+inline int16_16 simd_max(int16_16 a, int16_16 b)
+{
+    // Lane-wise signed maximum of a and b.
+    int16_16 larger;
+    larger.s = _mm256_max_epi16(a.s, b.s);
+    return larger;
+}
+
+inline int16_16 clamp(int16_16 min_value, int16_16 a, int16_16 max_value)
+{
+    return simd_min(simd_max(a, min_value), max_value); // raise to min_value first, then cap at max_value
+}
+
+inline int32 which_true(int16_16 a)
+{
+    return _mm256_movemask_epi8(a.s); // one bit per byte (MSB of each byte), i.e. two bits per int16 lane
+}
+
+inline bool any_true(int16_16 a)
+{
+    // movemask returns a signed int whose bit 31 comes from the top byte, so a
+    // set lane 15 makes it negative: test against zero instead of "> 0".
+    return _mm256_movemask_epi8(a.s) != 0;
+}
+
+inline bool all_true(int16_16 a)
+{
+    // The byte mask has 32 bits (two per int16 lane); all lanes true means
+    // 0xFFFFFFFF, which is -1 as the int that movemask returns (not 255).
+    return _mm256_movemask_epi8(a.s) == -1;
+}
+
+inline bool all_false(int16_16 a)
+{
+    // True when no byte of a has its most significant bit set.
+    const bool no_lane_set = _mm256_movemask_epi8(a.s) == 0;
+    return no_lane_set;
+}
+
+#endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_I16_AVX512.h b/architecture/x86/simd/SIMD_I16_AVX512.h
new file mode 100644
index 0000000..26c7a9d
--- /dev/null
+++ b/architecture/x86/simd/SIMD_I16_AVX512.h
@@ -0,0 +1,265 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_TOS_STDLIB_SIMD_I16_AVX512_H
+#define COMS_TOS_STDLIB_SIMD_I16_AVX512_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+#include "../../../stdlib/Types.h"
+
+struct int16_32 { // 32 x int16 SIMD vector with a raw-register view and a per-lane view
+    union {
+        #if ARM
+            svint16_t s;
+        #else
+            __m512i s; // 512-bit register holding the 32 lanes
+        #endif
+        // Scalar view of the same bytes for per-lane access
+        int16 v[32];
+    };
+};
+
+// Loads 32 int16 values with an aligned 512-bit load (mem must be 64-byte aligned).
+inline int16_32 load_int16_32(const int16* mem)
+{
+    int16_32 loaded;
+    loaded.s = _mm512_load_si512((__m512i *) mem);
+    return loaded;
+}
+
+// Builds a vector from 32 scalars via _mm512_set_epi16, whose first argument is the
+// highest lane: mem[0] lands in lane 31 and mem[31] in lane 0 (reverse of load_int16_32).
+inline int16_32 init_int16_32(const int16* mem)
+{
+    int16_32 packed;
+    packed.s = _mm512_set_epi16(
+        mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7],
+        mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15],
+        mem[16], mem[17], mem[18], mem[19], mem[20], mem[21], mem[22], mem[23],
+        mem[24], mem[25], mem[26], mem[27], mem[28], mem[29], mem[30], mem[31]);
+    return packed;
+}
+
+inline void unload_int16_32(int16_32 a, int16 *array) { _mm512_storeu_epi16(array, a.s); } // unaligned store of all 32 lanes to array
+
+// Returns a vector with all 512 bits cleared.
+inline int16_32 init_zero_int16_32()
+{
+    int16_32 zeroed;
+    zeroed.s = _mm512_setzero_si512();
+    return zeroed;
+}
+
+// Broadcasts value into all 32 lanes.
+inline int16_32 init_value_int16_32(int16 value)
+{
+    int16_32 splat;
+    splat.s = _mm512_set1_epi16(value);
+    return splat;
+}
+
+// Lane-wise 16-bit addition.
+inline int16_32 operator+(int16_32 a, int16_32 b)
+{
+    int16_32 sum;
+    sum.s = _mm512_add_epi16(a.s, b.s);
+    return sum;
+}
+
+// Lane-wise 16-bit subtraction (a - b).
+inline int16_32 operator-(int16_32 a, int16_32 b)
+{
+    int16_32 diff;
+    diff.s = _mm512_sub_epi16(a.s, b.s);
+    return diff;
+}
+
+inline int16_32 operator-(int16_32 a) { return init_zero_int16_32() - a; } // unary minus: negates each lane as 0 - a
+
+// Lane-wise 16-bit multiply keeping the low 16 bits of each product.
+inline int16_32 operator*(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    simd.s = _mm512_mullo_epi16(a.s, b.s); // _mm512_mul_epi32 would multiply 32-bit lanes into 64-bit results
+    return simd;
+}
+
+// Bitwise XOR of the two 512-bit vectors.
+inline int16_32 operator^(int16_32 a, int16_32 b)
+{
+    int16_32 toggled;
+    toggled.s = _mm512_xor_si512(a.s, b.s);
+    return toggled;
+}
+
+// In-place lane-wise subtraction; yields the updated left operand.
+inline int16_32 &operator-=(int16_32 &a, int16_32 b)
+{
+    a.s = (a - b).s;
+    return a;
+}
+
+// In-place lane-wise addition; yields the updated left operand.
+inline int16_32 &operator+=(int16_32 &a, int16_32 b)
+{
+    a.s = (a + b).s;
+    return a;
+}
+
+// In-place multiplication, delegating to operator*; yields the updated left operand.
+inline int16_32 &operator*=(int16_32 &a, int16_32 b)
+{
+    a.s = (a * b).s;
+    return a;
+}
+
+// In-place bitwise XOR; yields the updated left operand.
+inline int16_32 &operator^=(int16_32 &a, int16_32 b)
+{
+    a.s = (a ^ b).s;
+    return a;
+}
+
+// Lane-wise a < b as a vector mask: 0xFFFF in true lanes, 0 elsewhere (not a blend of a/b).
+inline int16_32 operator<(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    simd.s = _mm512_movm_epi16(_mm512_cmplt_epi16_mask(a.s, b.s));
+    return simd;
+}
+
+// Lane-wise a <= b as a vector mask: 0xFFFF in true lanes, 0 elsewhere (not a blend of a/b).
+inline int16_32 operator<=(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    __mmask32 mask = _mm512_cmp_epi16_mask(a.s, b.s, _MM_CMPINT_LE);
+    simd.s = _mm512_movm_epi16(mask); // expand the k-mask to one all-ones/zero word per lane
+    return simd;
+}
+
+// Lane-wise a > b as a vector mask: 0xFFFF in true lanes, 0 elsewhere (not a blend of a/b).
+inline int16_32 operator>(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    simd.s = _mm512_movm_epi16(_mm512_cmpgt_epi16_mask(a.s, b.s));
+    return simd;
+}
+
+// Lane-wise a >= b as a vector mask: 0xFFFF in true lanes, 0 elsewhere (not a blend of a/b).
+inline int16_32 operator>=(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    simd.s = _mm512_movm_epi16(_mm512_cmpge_epi16_mask(a.s, b.s));
+    return simd;
+}
+
+// Lane-wise a == b as a vector mask: 0xFFFF in true lanes, 0 elsewhere (not a blend of a/b).
+inline int16_32 operator==(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    simd.s = _mm512_movm_epi16(_mm512_cmpeq_epi16_mask(a.s, b.s));
+    return simd;
+}
+
+// Lane-wise a != b as a vector mask: 0xFFFF in true lanes, 0 elsewhere (not a blend of a/b).
+inline int16_32 operator!=(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    simd.s = _mm512_movm_epi16(_mm512_cmp_epi16_mask(a.s, b.s, _MM_CMPINT_NE));
+    return simd;
+}
+
+// Bitwise AND of the two 512-bit vectors.
+inline int16_32 operator&(int16_32 a, int16_32 b)
+{
+    int16_32 conj;
+    conj.s = _mm512_and_si512(a.s, b.s);
+    return conj;
+}
+
+// Bitwise OR of the two 512-bit vectors.
+inline int16_32 operator|(int16_32 a, int16_32 b)
+{
+    int16_32 disj;
+    disj.s = _mm512_or_si512(a.s, b.s);
+    return disj;
+}
+
+// In-place bitwise AND; yields the updated left operand.
+inline int16_32 &operator&=(int16_32 &a, int16_32 b)
+{
+    a.s = (a & b).s;
+    return a;
+}
+
+// In-place bitwise OR; yields the updated left operand.
+inline int16_32 &operator|=(int16_32 &a, int16_32 b)
+{
+    a.s = (a | b).s;
+    return a;
+}
+
+// Lane-wise absolute value of the signed 16-bit lanes.
+inline int16_32 abs(int16_32 a)
+{
+    int16_32 magnitude;
+    magnitude.s = _mm512_abs_epi16(a.s);
+    return magnitude;
+}
+
+// Lane-wise signed 16-bit minimum of a and b.
+inline int16_32 simd_min(int16_32 a, int16_32 b)
+{
+    int16_32 lowest;
+    lowest.s = _mm512_min_epi16(a.s, b.s);
+    return lowest;
+}
+
+// Lane-wise signed 16-bit maximum of a and b.
+inline int16_32 simd_max(int16_32 a, int16_32 b)
+{
+    int16_32 highest;
+    highest.s = _mm512_max_epi16(a.s, b.s);
+    return highest;
+}
+
+// Lane-wise clamp of a: the lower bound (min_value) is applied first,
+// then the upper bound (max_value).
+inline int16_32 clamp(int16_32 min_value, int16_32 a, int16_32 max_value)
+{ return simd_min(simd_max(a, min_value), max_value); }
+
+// One bit per 16-bit lane: bit i is the sign bit of lane i, converted from
+// the 32-bit k-mask to the int32 return value.
+inline int32 which_true(int16_32 a)
+{ return _mm512_movepi16_mask(a.s); }
+
+// True when at least one lane of a has its sign bit set.
+inline bool any_true(int16_32 a)
+{
+    const __mmask32 lane_mask = _mm512_movepi16_mask(a.s);
+    return lane_mask > 0;
+}
+
+// True when the sign bit of all 32 lanes is set.
+inline bool all_true(int16_32 a)
+{
+    // 32 lanes give a 32-bit mask, so "all set" is 0xFFFFFFFF (65535 only covers 16 lanes).
+    return _mm512_movepi16_mask(a.s) == 0xFFFFFFFFu;
+}
+
+// True when no lane of a has its sign bit set.
+inline bool all_false(int16_32 a)
+{
+    // @todo This can be optimized (requires also changes in the comparison functions return)
+    const __mmask32 lane_mask = _mm512_movepi16_mask(a.s);
+    return lane_mask == 0;
+}
+
+#endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_I16_SSE.h b/architecture/x86/simd/SIMD_I16_SSE.h
new file mode 100644
index 0000000..ea5d295
--- /dev/null
+++ b/architecture/x86/simd/SIMD_I16_SSE.h
@@ -0,0 +1,261 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_TOS_STDLIB_SIMD_I16_SSE_H
+#define COMS_TOS_STDLIB_SIMD_I16_SSE_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+#include "../../../stdlib/Types.h"
+
+struct int16_8 { // 8 x int16 SIMD vector with a raw-register view and a per-lane view
+    union {
+        #if ARM
+            svint16_t s;
+        #else
+            __m128i s; // 128-bit register holding the 8 lanes
+        #endif
+        // Scalar view of the same bytes for per-lane access
+        int16 v[8];
+    };
+};
+
+// Loads 8 int16 values with an aligned 128-bit load (mem must be 16-byte aligned).
+inline int16_8 load_int16_8(const int16* mem)
+{
+    int16_8 loaded;
+    loaded.s = _mm_load_si128((__m128i *) mem);
+    return loaded;
+}
+
+// Builds a vector from 8 scalars via _mm_set_epi16, whose first argument is the
+// highest lane: mem[0] lands in lane 7 and mem[7] in lane 0 (reverse of load_int16_8).
+inline int16_8 init_int16_8(const int16* mem)
+{
+    int16_8 packed;
+    packed.s = _mm_set_epi16(
+        mem[0], mem[1], mem[2], mem[3],
+        mem[4], mem[5], mem[6], mem[7]);
+    return packed;
+}
+
+inline void unload_int16_8(int16_8 a, int16 *array) { _mm_store_si128((__m128i *) array, a.s); } // aligned store: array must be 16-byte aligned
+
+// Returns a vector with all 128 bits cleared.
+inline int16_8 init_zero_int16_8()
+{
+    int16_8 zeroed;
+    zeroed.s = _mm_setzero_si128();
+    return zeroed;
+}
+
+// Broadcasts value into all 8 lanes.
+inline int16_8 init_value_int16_8(int16 value)
+{
+    int16_8 splat;
+    splat.s = _mm_set1_epi16(value);
+    return splat;
+}
+
+// Lane-wise 16-bit addition.
+inline int16_8 operator+(int16_8 a, int16_8 b)
+{
+    int16_8 sum;
+    sum.s = _mm_add_epi16(a.s, b.s);
+    return sum;
+}
+
+// Lane-wise 16-bit subtraction (a - b).
+inline int16_8 operator-(int16_8 a, int16_8 b)
+{
+    int16_8 diff;
+    diff.s = _mm_sub_epi16(a.s, b.s);
+    return diff;
+}
+
+inline int16_8 operator-(int16_8 a) { return init_zero_int16_8() - a; } // unary minus: negates each lane as 0 - a
+
+// Lane-wise 16-bit multiply keeping the low 16 bits of each product.
+inline int16_8 operator*(int16_8 a, int16_8 b)
+{
+    int16_8 simd;
+    simd.s = _mm_mullo_epi16(a.s, b.s); // _mm_mul_epi32 would multiply 32-bit lanes into 64-bit results
+    return simd;
+}
+
+// Bitwise XOR of the two 128-bit vectors.
+inline int16_8 operator^(int16_8 a, int16_8 b)
+{
+    int16_8 toggled;
+    toggled.s = _mm_xor_si128(a.s, b.s);
+    return toggled;
+}
+
+// In-place lane-wise subtraction; yields the updated left operand.
+inline int16_8 &operator-=(int16_8 &a, int16_8 b)
+{
+    a.s = (a - b).s;
+    return a;
+}
+
+// In-place lane-wise addition; yields the updated left operand.
+inline int16_8 &operator+=(int16_8 &a, int16_8 b)
+{
+    a.s = (a + b).s;
+    return a;
+}
+
+// In-place multiplication, delegating to operator*; yields the updated left operand.
+inline int16_8 &operator*=(int16_8 &a, int16_8 b)
+{
+    a.s = (a * b).s;
+    return a;
+}
+
+// In-place bitwise XOR; yields the updated left operand.
+inline int16_8 &operator^=(int16_8 &a, int16_8 b)
+{
+    a.s = (a ^ b).s;
+    return a;
+}
+
+// Lane-wise signed a < b; true lanes are 0xFFFF, false lanes 0.
+inline int16_8 operator<(int16_8 a, int16_8 b)
+{
+    int16_8 less_mask;
+    less_mask.s = _mm_cmplt_epi16(a.s, b.s);
+    return less_mask;
+}
+
+// Lane-wise signed a <= b, computed as NOT(b < a); true lanes are 0xFFFF.
+inline int16_8 operator<=(int16_8 a, int16_8 b)
+{
+    int16_8 not_greater;
+    not_greater.s = _mm_andnot_si128(_mm_cmplt_epi16(b.s, a.s), _mm_set1_epi16(-1));
+    return not_greater;
+}
+
+// Lane-wise signed a > b; true lanes are 0xFFFF, false lanes 0.
+inline int16_8 operator>(int16_8 a, int16_8 b)
+{
+    int16_8 greater_mask;
+    greater_mask.s = _mm_cmpgt_epi16(a.s, b.s);
+    return greater_mask;
+}
+
+// Lane-wise signed a >= b, computed as NOT(a < b); true lanes are 0xFFFF.
+inline int16_8 operator>=(int16_8 a, int16_8 b)
+{
+    int16_8 not_less;
+    not_less.s = _mm_andnot_si128(_mm_cmplt_epi16(a.s, b.s), _mm_set1_epi16(-1));
+    return not_less;
+}
+
+// Lane-wise equality: each 16-bit lane becomes 0xFFFF where a == b, otherwise 0.
+inline int16_8 operator==(int16_8 a, int16_8 b)
+{
+    int16_8 equal_mask;
+    equal_mask.s = _mm_cmpeq_epi16(a.s, b.s);
+    return equal_mask;
+}
+
+// Lane-wise inequality, computed as NOT(a == b); true lanes are 0xFFFF.
+inline int16_8 operator!=(int16_8 a, int16_8 b)
+{
+    int16_8 differs;
+    differs.s = _mm_andnot_si128(_mm_cmpeq_epi16(a.s, b.s), _mm_set1_epi16(-1));
+    return differs;
+}
+
+// Bitwise AND of the two 128-bit vectors.
+inline int16_8 operator&(int16_8 a, int16_8 b)
+{
+    int16_8 conj;
+    conj.s = _mm_and_si128(a.s, b.s);
+    return conj;
+}
+
+// Bitwise OR of the two 128-bit vectors.
+inline int16_8 operator|(int16_8 a, int16_8 b)
+{
+    int16_8 disj;
+    disj.s = _mm_or_si128(a.s, b.s);
+    return disj;
+}
+
+// In-place bitwise AND; yields the updated left operand.
+inline int16_8 &operator&=(int16_8 &a, int16_8 b)
+{
+    a.s = (a & b).s;
+    return a;
+}
+
+// In-place bitwise OR; yields the updated left operand.
+inline int16_8 &operator|=(int16_8 &a, int16_8 b)
+{
+    a.s = (a | b).s;
+    return a;
+}
+
+// Lane-wise absolute value of the signed 16-bit lanes.
+inline int16_8 abs(int16_8 a)
+{
+    int16_8 magnitude;
+    magnitude.s = _mm_abs_epi16(a.s);
+    return magnitude;
+}
+
+// Lane-wise signed 16-bit minimum of a and b.
+inline int16_8 simd_min(int16_8 a, int16_8 b)
+{
+    int16_8 lowest;
+    lowest.s = _mm_min_epi16(a.s, b.s);
+    return lowest;
+}
+
+// Lane-wise signed 16-bit maximum of a and b.
+inline int16_8 simd_max(int16_8 a, int16_8 b)
+{
+    int16_8 highest;
+    highest.s = _mm_max_epi16(a.s, b.s);
+    return highest;
+}
+
+// Lane-wise clamp of a: the lower bound (min_value) is applied first,
+// then the upper bound (max_value).
+inline int16_8 clamp(int16_8 min_value, int16_8 a, int16_8 max_value)
+{ return simd_min(simd_max(a, min_value), max_value); }
+
+// Byte-granular truth mask: bit i is the top bit of byte i of a, so each
+// 16-bit lane contributes two adjacent bits (16 bits in total).
+inline int32 which_true(int16_8 a)
+{ return _mm_movemask_epi8(a.s); }
+
+// True when at least one byte of a has its top bit set.
+inline bool any_true(int16_8 a)
+{
+    const int byte_mask = _mm_movemask_epi8(a.s);
+    return byte_mask > 0;
+}
+
+// True when every byte of a has its top bit set (all 8 lanes are true).
+inline bool all_true(int16_8 a)
+{
+    // 16 bytes produce 16 mask bits, so "all set" is 0xFFFF (15 only covers 4 bytes).
+    return _mm_movemask_epi8(a.s) == 0xFFFF;
+}
+
+// True when no byte of a has its top bit set.
+inline bool all_false(int16_8 a)
+{
+    const int byte_mask = _mm_movemask_epi8(a.s);
+    return byte_mask == 0;
+}
+
+#endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_I32.h b/architecture/x86/simd/SIMD_I32.h
old mode 100644
new mode 100755
index 3633319..86f23f1
--- a/architecture/x86/simd/SIMD_I32.h
+++ b/architecture/x86/simd/SIMD_I32.h
@@ -15,1086 +15,90 @@
 
 #include "../../../stdlib/Types.h"
 #include "../../../utils/BitUtils.h"
-#include "SIMD_F32.h"
 
-// @todo a lot of sse functions require high level (e.g. sse4.1) this needs to be changed to be more general
-//      or better create alternative functions for the available sse version.
+#ifdef MACRO_CPU_FEATURE_SSE42
+    #include "SIMD_I32_SSE.h"
+#endif
 
-// @question why are we passing structs by value?
+#ifdef MACRO_CPU_FEATURE_AVX2
+    #include "SIMD_I32_AVX2.h"
+#endif
 
-struct int32_4 {
-    union {
-        #if ARM
-            svint32_t s;
-        #else
-            __m128i s;
-        #endif
-
-        int32 v[4];
-    };
-};
-
-struct int32_8 {
-    union {
-        #if ARM
-            svint32_t s;
-        #else
-            __m256i s;
-        #endif
-
-        int32 v[8];
-    };
-};
-
-struct int32_16 {
-    union {
-        #if ARM
-            svint32_t s;
-        #else
-            __m512i s;
-        #endif
-
-        int32 v[16];
-    };
-};
-
-inline int32_4 load_int32_4(const int32* mem)
-{
-    int32_4 simd;
-    simd.s = _mm_load_si128((__m128i *) mem);
-
-    return simd;
-}
-
-inline int32_4 init_int32_4(const int32* mem)
-{
-    int32_4 simd;
-    simd.s = _mm_set_epi32(mem[0], mem[1], mem[2], mem[3]);
-
-    return simd;
-}
-
-inline void unload_int32_4(int32_4 a, int32 *array) { _mm_store_si128((__m128i *) array, a.s); }
-
-inline int32_8 load_int32_8(const int32* mem)
-{
-    int32_8 simd;
-    simd.s = _mm256_load_si256((__m256i *) mem);
-
-    return simd;
-}
-
-inline int32_8 init_int32_8(const int32* mem)
-{
-    int32_8 simd;
-    simd.s = _mm256_set_epi32(mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7]);
-
-    return simd;
-}
-
-inline void unload_int32_8(int32_8 a, int32 *array) { _mm256_store_si256((__m256i *) array, a.s); }
-
-inline int32_16 load_int32_16(const int32* mem)
-{
-    int32_16 simd;
-    simd.s = _mm512_load_epi32(mem);
-
-    return simd;
-}
-
-inline int32_16 init_int32_16(const int32* mem)
-{
-    int32_16 simd;
-    simd.s = _mm512_set_epi32(
-        mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7],
-        mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15]);
-
-    return simd;
-}
-
-inline void unload_int32_16(int32_16 a, int32 *array) { _mm512_store_epi32(array, a.s); }
-
-inline int32_4 init_zero_int32_4()
-{
-    int32_4 simd;
-    simd.s = _mm_setzero_si128();
-
-    return simd;
-}
-
-inline int32_8 init_zero_int32_8()
-{
-    int32_8 simd;
-    simd.s = _mm256_setzero_si256();
-
-    return simd;
-}
-
-inline int32_16 init_zero_int32_16()
-{
-    int32_16 simd;
-    simd.s = _mm512_setzero_epi32();
-
-    return simd;
-}
-
-inline int32_4 init_value_int32_4(int32 value)
-{
-    int32_4 simd;
-    simd.s = _mm_set1_epi32(value);
-
-    return simd;
-}
-
-inline int32_8 init_value_int32_8(int32 value)
-{
-    int32_8 simd;
-    simd.s = _mm256_set1_epi32(value);
-
-    return simd;
-}
-
-inline int32_16 init_value_int32_16(int32 value)
-{
-    int32_16 simd;
-    simd.s = _mm512_set1_epi32(value);
-
-    return simd;
-}
-
-inline int32_4 init_values_int32_4(int32 a, int32 b, int32 c, int32 d)
-{
-    int32_4 simd;
-    simd.s = _mm_set_epi32(a, b, c, d);
-
-    return simd;
-}
-
-inline int32_8 init_values_int32_8(
-    int32 a, int32 b, int32 c, int32 d,
-    int32 e, int32 f, int32 g, int32 h
-)
-{
-    int32_8 simd;
-    simd.s = _mm256_set_epi32(a, b, c, d, e, f, g, h);
-
-    return simd;
-}
-
-inline int32_16 init_values_int32_16(
-    int32 a, int32 b, int32 c, int32 d,
-    int32 e, int32 f, int32 g, int32 h,
-    int32 i, int32 j, int32 k, int32 l,
-    int32 m, int32 n, int32 o, int32 p
-)
-{
-    int32_16 simd;
-    simd.s = _mm512_set_epi32(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
-
-    return simd;
-}
-
-inline
-int32_4 f32_4_to_int32_4(f32_4 a)
-{
-    int32_4 result;
-    result.s = _mm_cvtps_epi32(a.s);
-
-    return result;
-}
-
-inline
-f32_4 int32_4_to_f32_4(int32_4 a)
-{
-    f32_4 result;
-    result.s = _mm_cvtepi32_ps(a.s);
-
-    return result;
-}
-
-inline
-int32_8 f32_8_to_int32_8(f32_8 a)
-{
-    int32_8 result;
-    result.s = _mm256_cvtps_epi32(a.s);
-
-    return result;
-}
-
-inline
-f32_8 int32_8_to_f32_8(int32_8 a)
-{
-    f32_8 result;
-    result.s = _mm256_cvtepi32_ps(a.s);
-
-    return result;
-}
-
-inline
-int32_16 f32_16_to_int32_16(f32_16 a)
-{
-    int32_16 result;
-    result.s = _mm512_cvtps_epi32(a.s);
-
-    return result;
-}
-
-inline
-f32_16 int32_16_to_f32_16(int32_16 a)
-{
-    f32_16 result;
-    result.s = _mm512_cvtepi32_ps(a.s);
-
-    return result;
-}
-
-inline int32_4 operator+(int32_4 a, int32_4 b)
-{
-    int32_4 simd;
-    simd.s = _mm_add_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_8 operator+(int32_8 a, int32_8 b)
-{
-    int32_8 simd;
-    simd.s = _mm256_add_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_16 operator+(int32_16 a, int32_16 b)
-{
-    int32_16 simd;
-    simd.s = _mm512_add_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_4 operator-(int32_4 a, int32_4 b)
-{
-    int32_4 simd;
-    simd.s = _mm_sub_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_4 operator-(int32_4 a) { return init_zero_int32_4() - a; }
-
-inline int32_8 operator-(int32_8 a, int32_8 b)
-{
-    int32_8 simd;
-    simd.s = _mm256_sub_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_8 operator-(int32_8 a) { return init_zero_int32_8() - a; }
-
-inline int32_16 operator-(int32_16 a, int32_16 b)
-{
-    int32_16 simd;
-    simd.s = _mm512_sub_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_16 operator-(int32_16 a) { return init_zero_int32_16() - a; }
-
-inline int32_4 operator*(int32_4 a, int32_4 b)
-{
-    int32_4 simd;
-    simd.s = _mm_mul_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_8 operator*(int32_8 a, int32_8 b)
-{
-    int32_8 simd;
-    simd.s = _mm256_mul_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_16 operator*(int32_16 a, int32_16 b)
-{
-    int32_16 simd;
-    simd.s = _mm512_mul_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_4 operator/(int32_4 a, int32_4 b)
-{
-    int32_4 simd;
-    simd.s = _mm_div_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_8 operator/(int32_8 a, int32_8 b)
-{
-    int32_8 simd;
-    simd.s = _mm256_div_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_16 operator/(int32_16 a, int32_16 b)
-{
-    int32_16 simd;
-    simd.s = _mm512_div_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline f32_4 operator/(f32_4 a, int32_4 b)
-{
-    f32_4 simd;
-    simd.s = _mm_div_ps(a.s, _mm_cvtepi32_ps(b.s));
-
-    return simd;
-}
-
-inline f32_8 operator/(f32_8 a, int32_8 b)
-{
-    f32_8 simd;
-    simd.s = _mm256_div_ps(a.s, _mm256_cvtepi32_ps(b.s));
-
-    return simd;
-}
-
-inline f32_16 operator/(f32_16 a, int32_16 b)
-{
-    f32_16 simd;
-    simd.s = _mm512_div_ps(a.s, _mm512_cvtepi32_ps(b.s));
-
-    return simd;
-}
-
-inline f32_4 operator/(int32_4 a, f32_4 b)
-{
-    f32_4 simd;
-    simd.s = _mm_div_ps(_mm_cvtepi32_ps(a.s), b.s);
-
-    return simd;
-}
-
-inline f32_8 operator/(int32_8 a, f32_8 b)
-{
-    f32_8 simd;
-    simd.s = _mm256_div_ps(_mm256_cvtepi32_ps(a.s), b.s);
-
-    return simd;
-}
-
-inline f32_16 operator/(int32_16 a, f32_16 b)
-{
-    f32_16 simd;
-    simd.s = _mm512_div_ps(_mm512_cvtepi32_ps(a.s), b.s);
-
-    return simd;
-}
-
-inline int32_4 operator^(int32_4 a, int32_4 b)
-{
-    int32_4 simd;
-    simd.s = _mm_xor_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_8 operator^(int32_8 a, int32_8 b)
-{
-    int32_8 simd;
-    simd.s = _mm256_xor_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_16 operator^(int32_16 a, int32_16 b)
-{
-    int32_16 simd;
-    simd.s = _mm512_xor_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_4 &operator-=(int32_4 &a, int32_4 b)
-{
-    a = a - b;
-
-    return a;
-}
-
-inline int32_8 &operator-=(int32_8 &a, int32_8 b)
-{
-    a = a - b;
-
-    return a;
-}
-
-inline int32_16 &operator-=(int32_16 &a, int32_16 b)
-{
-    a = a - b;
-
-    return a;
-}
-
-inline int32_4 &operator+=(int32_4 &a, int32_4 b)
-{
-    a = a + b;
-
-    return a;
-}
-
-inline int32_8 &operator+=(int32_8 &a, int32_8 b)
-{
-    a = a + b;
-
-    return a;
-}
-
-inline int32_16 &operator+=(int32_16 &a, int32_16 b)
-{
-    a = a + b;
-
-    return a;
-}
-
-inline int32_4 &operator*=(int32_4 &a, int32_4 b)
-{
-    a = a * b;
-
-    return a;
-}
-
-inline int32_8 &operator*=(int32_8 &a, int32_8 b)
-{
-    a = a * b;
-
-    return a;
-}
-
-inline int32_16 &operator*=(int32_16 &a, int32_16 b)
-{
-    a = a * b;
-
-    return a;
-}
-
-inline int32_4 &operator/=(int32_4 &a, int32_4 b)
-{
-    a.s = (a / b).s;
-
-    return a;
-}
-
-inline int32_8 &operator/=(int32_8 &a, int32_8 b)
-{
-    a.s = (a / b).s;
-
-    return a;
-}
-
-inline int32_16 &operator/=(int32_16 &a, int32_16 b)
-{
-    a.s = (a / b).s;
-
-    return a;
-}
-
-inline int32_4 &operator^=(int32_4 &a, int32_4 b)
-{
-    a = a ^ b;
-
-    return a;
-}
-
-inline int32_8 &operator^=(int32_8 &a, int32_8 b)
-{
-    a = a ^ b;
-
-    return a;
-}
-
-inline int32_16 &operator^=(int32_16 &a, int32_16 b)
-{
-    a = a ^ b;
-
-    return a;
-}
-
-inline int32_4 operator<(int32_4 a, int32_4 b)
-{
-    int32_4 simd;
-    simd.s = _mm_cmplt_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_8 operator<(int32_8 a, int32_8 b)
-{
-    int32_8 simd;
-    simd.s = _mm256_xor_si256(_mm256_cmpgt_epi32(a.s, b.s), _mm256_set1_epi32(-1));
-
-    return simd;
-}
-
-inline int32_16 operator<(int32_16 a, int32_16 b)
-{
-    int32_16 simd;
-    simd.s = _mm512_mask_blend_epi32(_mm512_cmplt_epi32_mask(a.s, b.s), a.s, b.s);
-
-    return simd;
-}
-
-inline int32_4 operator<=(int32_4 a, int32_4 b)
-{
-    int32_4 simd;
-    simd.s = _mm_andnot_si128(_mm_cmplt_epi32(b.s, a.s), _mm_set1_epi32(-1));
-
-    return simd;
-}
-
-inline int32_8 operator<=(int32_8 a, int32_8 b)
-{
-    int32_8 simd;
-    simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi32(a.s, b.s), _mm256_set1_epi32(-1));
-
-    return simd;
-}
-
-inline int32_16 operator<=(int32_16 a, int32_16 b)
-{
-    int32_16 simd;
-    simd.s = _mm512_mask_blend_epi32(_mm512_knot(_mm512_cmpgt_epi32_mask(b.s, a.s)), b.s, a.s);
-
-    return simd;
-}
-
-inline int32_4 operator>(int32_4 a, int32_4 b)
-{
-    int32_4 simd;
-    simd.s = _mm_cmpgt_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_8 operator>(int32_8 a, int32_8 b)
-{
-    int32_8 simd;
-    simd.s = _mm256_cmpgt_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_16 operator>(int32_16 a, int32_16 b)
-{
-    int32_16 simd;
-    simd.s = _mm512_mask_blend_epi32(_mm512_cmpgt_epi32_mask(a.s, b.s), a.s, b.s);
-
-    return simd;
-}
-
-inline int32_4 operator>=(int32_4 a, int32_4 b)
-{
-    int32_4 simd;
-    simd.s = _mm_andnot_si128(_mm_cmplt_epi32(a.s, b.s), _mm_set1_epi32(-1));
-
-    return simd;
-}
-
-inline int32_8 operator>=(int32_8 a, int32_8 b)
-{
-    int32_8 simd;
-    simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi32(b.s, a.s), _mm256_set1_epi32(-1));
-
-    return simd;
-}
-
-inline int32_16 operator>=(int32_16 a, int32_16 b)
-{
-    int32_16 simd;
-    simd.s = _mm512_mask_blend_epi32(_mm512_cmpge_epi32_mask(a.s, b.s), a.s, b.s);
-
-    return simd;
-}
-
-inline int32_4 operator==(int32_4 a, int32_4 b)
-{
-    int32_4 simd;
-    simd.s = _mm_cmpeq_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_8 operator==(int32_8 a, int32_8 b)
-{
-    int32_8 simd;
-    simd.s = _mm256_cmpeq_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_16 operator==(int32_16 a, int32_16 b)
-{
-    int32_16 simd;
-    simd.s = _mm512_mask_blend_epi32(_mm512_cmpeq_epi32_mask(a.s, b.s), a.s, b.s);
-
-    return simd;
-}
-
-inline int32_4 operator!=(int32_4 a, int32_4 b)
-{
-    int32_4 simd;
-    simd.s = _mm_andnot_si128(_mm_cmpeq_epi32(a.s, b.s), _mm_set1_epi32(-1));
-
-    return simd;
-}
-
-inline int32_8 operator!=(int32_8 a, int32_8 b)
-{
-    int32_8 simd;
-    simd.s = _mm256_mask_blend_epi32(_mm256_cmp_epi32_mask(a.s, b.s, _MM_CMPINT_NE), a.s, b.s);
-
-    return simd;
-}
-
-inline int32_16 operator!=(int32_16 a, int32_16 b)
-{
-    int32_16 simd;
-    simd.s = _mm512_mask_blend_epi32(_mm512_cmp_epi32_mask(a.s, b.s, _MM_CMPINT_NE), a.s, b.s);
-
-    return simd;
-}
-
-inline int32_4 operator&(int32_4 a, int32_4 b)
-{
-    int32_4 simd;
-    simd.s = _mm_and_si128(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_8 operator&(int32_8 a, int32_8 b)
-{
-    int32_8 simd;
-    simd.s = _mm256_and_si256(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_16 operator&(int32_16 a, int32_16 b)
-{
-    int32_16 simd;
-    simd.s = _mm512_and_si512(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_4 operator|(int32_4 a, int32_4 b)
-{
-    int32_4 simd;
-    simd.s = _mm_or_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_8 operator|(int32_8 a, int32_8 b)
-{
-    int32_8 simd;
-    simd.s = _mm256_or_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_16 operator|(int32_16 a, int32_16 b)
-{
-    int32_16 simd;
-    simd.s = _mm512_or_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_4 &operator&=(int32_4 &a, int32_4 b)
-{
-    a = a & b;
-
-    return a;
-}
-
-inline int32_8 &operator&=(int32_8 &a, int32_8 b)
-{
-    a = a & b;
-
-    return a;
-}
-
-inline int32_16 &operator&=(int32_16 &a, int32_16 b)
-{
-    a = a & b;
-
-    return a;
-}
-
-inline int32_4 &operator|=(int32_4 &a, int32_4 b)
-{
-    a = a | b;
-
-    return a;
-}
-
-inline int32_8 &operator|=(int32_8 &a, int32_8 b)
-{
-    a = a | b;
-
-    return a;
-}
-
-inline int32_16 &operator|=(int32_16 &a, int32_16 b)
-{
-    a = a | b;
-
-    return a;
-}
-
-inline int32_4 abs(int32_4 a)
-{
-    int32_4 simd;
-    simd.s = _mm_abs_epi32(a.s);
-
-    return simd;
-}
-
-inline int32_8 abs(int32_8 a)
-{
-    int32_8 simd;
-    simd.s = _mm256_abs_epi32(a.s);
-
-    return simd;
-}
-
-inline int32_16 abs(int32_16 a)
-{
-    int32_16 simd;
-    simd.s = _mm512_abs_epi64(a.s);
-
-    return simd;
-}
-
-inline int32_4 simd_min(int32_4 a, int32_4 b)
-{
-    int32_4 simd;
-    simd.s = _mm_min_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_8 simd_min(int32_8 a, int32_8 b)
-{
-    int32_8 simd;
-    simd.s = _mm256_min_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_16 simd_min(int32_16 a, int32_16 b)
-{
-    int32_16 simd;
-    simd.s = _mm512_min_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_4 simd_max(int32_4 a, int32_4 b)
-{
-    int32_4 simd;
-    simd.s = _mm_max_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_8 simd_max(int32_8 a, int32_8 b)
-{
-    int32_8 simd;
-    simd.s = _mm256_max_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_16 simd_max(int32_16 a, int32_16 b)
-{
-    int32_16 simd;
-    simd.s = _mm512_max_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int32_4 sign(int32_4 a)
-{
-    __m128i mask = _mm_set1_epi32(0x80000000);
-    __m128i signBit = _mm_and_si128(a.s, mask);
-    __m128i b = _mm_set1_epi32(1);
-
-    int32_4 simd;
-    simd.s = _mm_or_si128(b, signBit);
-
-    return simd;
-}
-
-inline int32_8 sign(int32_8 a)
-{
-    __m256i mask = _mm256_set1_epi32(0x80000000);
-    __m256i signBit = _mm256_and_si256(a.s, mask);
-    __m256i b = _mm256_set1_epi32(1);
-
-    int32_8 simd;
-    simd.s = _mm256_or_si256(b, signBit);
-
-    return simd;
-}
-
-inline int32_16 sign(int32_16 a)
-{
-    __m512i mask = _mm512_set1_epi32(0x80000000);
-    __m512i signBit = _mm512_and_si512(a.s, mask);
-    __m512i b = _mm512_set1_epi32(1);
-    int32_16 simd;
-
-    simd.s = _mm512_or_si512(b, signBit);
-
-    return simd;
-}
-
-inline f32_4 sqrt(int32_4 a)
-{
-    f32_4 simd;
-    simd.s = _mm_sqrt_ps(_mm_cvtepi32_ps(a.s));
-
-    return simd;
-}
-
-inline f32_8 sqrt(int32_8 a)
-{
-    f32_8 simd;
-    simd.s = _mm256_sqrt_ps(_mm256_cvtepi32_ps(a.s));
-
-    return simd;
-}
-
-inline f32_16 sqrt(int32_16 a)
-{
-    f32_16 simd;
-    simd.s = _mm512_sqrt_ps(_mm512_cvtepi32_ps(a.s));
-
-    return simd;
-}
-
-inline f32_4 sqrt_inv_approx(int32_4 a)
-{
-    f32_4 simd;
-    simd.s = _mm_rsqrt_ps(_mm_cvtepi32_ps(a.s));
-
-    return simd;
-}
-
-inline f32_8 sqrt_inv_approx(int32_8 a)
-{
-    f32_8 simd;
-    simd.s = _mm256_rsqrt_ps(_mm256_cvtepi32_ps(a.s));
-
-    return simd;
-}
-
-inline f32_16 sqrt_inv_approx(int32_16 a)
-{
-    f32_16 simd;
-    simd.s = _mm512_rsqrt14_ps(_mm512_cvtepi32_ps(a.s));
-
-    return simd;
-}
-
-inline f32_4 one_over_approx(int32_4 a)
-{
-    f32_4 simd;
-    simd.s = _mm_rcp_ps(_mm_cvtepi32_ps(a.s));
-
-    return simd;
-}
-
-inline f32_8 one_over_approx(int32_8 a)
-{
-    f32_8 simd;
-    simd.s = _mm256_rcp_ps(_mm256_cvtepi32_ps(a.s));
-
-    return simd;
-}
-
-inline f32_16 one_over_approx(int32_16 a)
-{
-    f32_16 simd;
-    simd.s = _mm512_rcp14_ps(_mm512_cvtepi32_ps(a.s));
-
-    return simd;
-}
-
-inline int32_4 clamp(int32_4 min_value, int32_4 a, int32_4 max_value)
-{
-    return simd_min(simd_max(a, min_value), max_value);
-}
-
-inline int32_8 clamp(int32_8 min_value, int32_8 a, int32_8 max_value)
-{
-    return simd_min(simd_max(a, min_value), max_value);
-}
-
-inline int32_16 clamp(int32_16 min_value, int32_16 a, int32_16 max_value)
-{
-    return simd_min(simd_max(a, min_value), max_value);
-}
-
-inline int32 which_true(int32_4 a)
-{
-    int32 which_true = _mm_movemask_epi8(a.s);
-
-    return which_true;
-}
-
-inline int32 which_true(int32_8 a)
-{
-    int32 which_true = _mm256_movemask_epi8(a.s);
-
-    return which_true;
-}
-
-inline int32 which_true(int32_16 a)
-{
-    int32 which_true = _mm512_movepi32_mask(a.s);
-
-    return which_true;
-}
-
-inline bool any_true(int32_4 a)
-{
-    bool is_any_true = _mm_movemask_epi8(a.s) > 0;
-
-    return is_any_true;
-}
-
-inline bool any_true(int32_8 a)
-{
-    bool is_any_true = _mm256_movemask_epi8(a.s) > 0;
-
-    return is_any_true;
-}
-
-inline bool any_true(int32_16 a)
-{
-    bool is_any_true = _mm512_movepi32_mask(a.s) > 0;
-
-    return is_any_true;
-}
-
-inline bool all_true(int32_4 a)
-{
-    bool is_true = _mm_movemask_epi8(a.s) == 15;
-
-    return is_true;
-}
-
-inline bool all_true(int32_8 a)
-{
-    bool is_true = _mm256_movemask_epi8(a.s) == 255;
-
-    return is_true;
-}
-
-inline bool all_true(int32_16 a)
-{
-    bool is_true = _mm512_movepi32_mask(a.s) == 65535;
-
-    return is_true;
-}
-
-inline bool all_false(int32_4 a)
-{
-    bool is_false = _mm_movemask_epi8(a.s) == 0;
-
-    return is_false;
-}
-
-inline bool all_false(int32_8 a)
-{
-    bool is_false = _mm256_movemask_epi8(a.s) == 0;
-
-    return is_false;
-}
-
-inline bool all_false(int32_16 a)
-{
-    // @todo This can be optimized (requires also changes in the comparison functions return)
-    bool is_false = _mm512_movepi32_mask(a.s) == 0;
-
-    return is_false;
-}
-
-// @todo from down here we can optimize some of the code by NOT using the wrappers
-//      the code is self contained and we could use te intrinsic functions directly
+#ifdef MACRO_CPU_FEATURE_AVX512
+    #include "SIMD_I32_AVX512.h"
+#endif
 
 inline
 void simd_mult(const int32* a, const int32* b, int32* result, int32 size, int32 steps)
 {
     int32 i = 0;
+    steps = intrin_validate_steps((const byte*) a, steps);
+    steps = intrin_validate_steps((const byte*) b, steps);
+    steps = intrin_validate_steps((const byte*) result, steps);
 
-    if (steps == 16) {
-        __m512i a_16;
-        __m512i b_16;
-        __m512i result_16;
+    #ifdef MACRO_CPU_FEATURE_AVX512
+        if (steps >= 16) {
+            steps = 16;
+            __m512i a_16;
+            __m512i b_16;
+            __m512i result_16;
 
-        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_load_epi32(a);
-            b_16 = _mm512_load_epi32(b);
-            result_16 = _mm512_mul_epi32(a_16, b_16);
-            _mm512_store_epi32(result, result_16);
+            for (; i <= size - steps; i += steps) {
+                a_16 = _mm512_load_epi32(a);
+                b_16 = _mm512_load_epi32(b);
+                result_16 = _mm512_mul_epi32(a_16, b_16);
+                _mm512_store_epi32(result, result_16);
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    } else if (steps == 8) {
-        __m256i a_8;
-        __m256i b_8;
-        __m256i result_8;
+                a += steps;
+                b += steps;
+                result += steps;
+            }
 
-        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_load_si256((__m256i *) a);
-            b_8 = _mm256_load_si256((__m256i *) b);
-            result_8 = _mm256_mul_epi32(a_8, b_8);
-            _mm256_store_si256((__m256i *) result, result_8);
+            steps = 1;
+        }
+    #endif
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    } else if (steps == 4) {
-        __m128i a_4;
-        __m128i b_4;
-        __m128i result_4;
+    #ifdef MACRO_CPU_FEATURE_AVX2
+        if (steps >= 8) {
+            steps = 8;
+            __m256i a_8;
+            __m256i b_8;
+            __m256i result_8;
 
-        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_load_si128((__m128i *) a);
-            b_4 = _mm_load_si128((__m128i *) b);
-            result_4 = _mm_mul_epi32(a_4, b_4);
-            _mm_store_si128((__m128i *) result, result_4);
+            for (; i <= size - steps; i += steps) {
+                a_8 = _mm256_load_si256((__m256i *) a);
+                b_8 = _mm256_load_si256((__m256i *) b);
+                result_8 = _mm256_mul_epi32(a_8, b_8);
+                _mm256_store_si256((__m256i *) result, result_8);
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    }
+                a += steps;
+                b += steps;
+                result += steps;
+            }
+
+            steps = 1;
+        }
+    #endif
+
+    #ifdef MACRO_CPU_FEATURE_SSE42
+        if (steps >= 4) {
+            steps = 4;
+            __m128i a_4;
+            __m128i b_4;
+            __m128i result_4;
+
+            for (; i <= size - steps; i += steps) {
+                a_4 = _mm_load_si128((__m128i *) a);
+                b_4 = _mm_load_si128((__m128i *) b);
+                result_4 = _mm_mul_epi32(a_4, b_4);
+                _mm_store_si128((__m128i *) result, result_4);
+
+                a += steps;
+                b += steps;
+                result += steps;
+            }
+        }
+    #endif
 
     for (; i < size; ++i) {
         ++a;
@@ -1109,59 +113,79 @@ inline
 void simd_mult(const int32* a, const f32* b, f32* result, int32 size, int32 steps)
 {
     int32 i = 0;
+    steps = intrin_validate_steps((const byte*) a, steps);
+    steps = intrin_validate_steps((const byte*) b, steps);
+    steps = intrin_validate_steps((const byte*) result, steps);
 
-    if (steps == 16) {
-        __m512i a_16;
-        __m512 af_16;
-        __m512 b_16;
-        __m512 result_16;
+    #ifdef MACRO_CPU_FEATURE_AVX512
+        if (steps >= 16) {
+            steps = 16;
+            __m512i a_16;
+            __m512 af_16;
+            __m512 b_16;
+            __m512 result_16;
 
-        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_load_epi32(a);
-            af_16 = _mm512_cvtepi32_ps(a_16);
-            b_16 = _mm512_load_ps(b);
-            result_16 = _mm512_mul_ps(af_16, b_16);
-            _mm512_store_ps(result, result_16);
+            for (; i <= size - steps; i += steps) {
+                a_16 = _mm512_load_epi32(a);
+                af_16 = _mm512_cvtepi32_ps(a_16);
+                b_16 = _mm512_load_ps(b);
+                result_16 = _mm512_mul_ps(af_16, b_16);
+                _mm512_store_ps(result, result_16);
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    } else if (steps == 8) {
-        __m256i a_8;
-        __m256 af_8;
-        __m256 b_8;
-        __m256 result_8;
+                a += steps;
+                b += steps;
+                result += steps;
+            }
 
-        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_load_si256((__m256i *) a);
-            af_8 = _mm256_cvtepi32_ps(a_8);
-            b_8 = _mm256_load_ps(b);
-            result_8 = _mm256_mul_ps(af_8, b_8);
-            _mm256_store_ps(result, result_8);
+            steps = 1;
+        }
+    #endif
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    } else if (steps == 4) {
-        __m128i a_4;
-        __m128 af_4;
-        __m128 b_4;
-        __m128 result_4;
+    #ifdef MACRO_CPU_FEATURE_AVX2
+        if (steps >= 8) {
+            steps = 8;
+            __m256i a_8;
+            __m256 af_8;
+            __m256 b_8;
+            __m256 result_8;
 
-        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_load_si128((__m128i *) a);
-            af_4 = _mm_cvtepi32_ps(a_4);
-            b_4 = _mm_load_ps(b);
-            result_4 = _mm_mul_ps(af_4, b_4);
-            _mm_store_ps(result, result_4);
+            for (; i <= size - steps; i += steps) {
+                a_8 = _mm256_load_si256((__m256i *) a);
+                af_8 = _mm256_cvtepi32_ps(a_8);
+                b_8 = _mm256_load_ps(b);
+                result_8 = _mm256_mul_ps(af_8, b_8);
+                _mm256_store_ps(result, result_8);
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    }
+                a += steps;
+                b += steps;
+                result += steps;
+            }
+
+            steps = 1;
+        }
+    #endif
+
+    #ifdef MACRO_CPU_FEATURE_SSE42
+        if (steps >= 4) {
+            steps = 4;
+            __m128i a_4;
+            __m128 af_4;
+            __m128 b_4;
+            __m128 result_4;
+
+            for (; i <= size - steps; i += steps) {
+                a_4 = _mm_load_si128((__m128i *) a);
+                af_4 = _mm_cvtepi32_ps(a_4);
+                b_4 = _mm_load_ps(b);
+                result_4 = _mm_mul_ps(af_4, b_4);
+                _mm_store_ps(result, result_4);
+
+                a += steps;
+                b += steps;
+                result += steps;
+            }
+        }
+    #endif
 
     for (; i < size; ++i) {
         *result = *a * *b;
@@ -1176,65 +200,85 @@ inline
 void simd_mult(const int32* a, const f32* b, int32* result, int32 size, int32 steps)
 {
     int32 i = 0;
+    steps = intrin_validate_steps((const byte*) a, steps);
+    steps = intrin_validate_steps((const byte*) b, steps);
+    steps = intrin_validate_steps((const byte*) result, steps);
 
-    if (steps == 16) {
-        __m512i a_16;
-        __m512 af_16;
-        __m512 b_16;
-        __m512 result_16;
-        __m512i resulti_16;
+    #ifdef MACRO_CPU_FEATURE_AVX512
+        if (steps >= 16) {
+            steps = 16;
+            __m512i a_16;
+            __m512 af_16;
+            __m512 b_16;
+            __m512 result_16;
+            __m512i resulti_16;
 
-        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_load_epi32(a);
-            af_16 = _mm512_cvtepi32_ps(a_16);
-            b_16 = _mm512_load_ps(b);
-            result_16 = _mm512_mul_ps(af_16, b_16);
-            resulti_16 = _mm512_cvtps_epi32(result_16);
-            _mm512_store_epi32(result, resulti_16);
+            for (; i <= size - steps; i += steps) {
+                a_16 = _mm512_load_epi32(a);
+                af_16 = _mm512_cvtepi32_ps(a_16);
+                b_16 = _mm512_load_ps(b);
+                result_16 = _mm512_mul_ps(af_16, b_16);
+                resulti_16 = _mm512_cvtps_epi32(result_16);
+                _mm512_store_epi32(result, resulti_16);
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    } else if (steps == 8) {
-        __m256i a_8;
-        __m256 af_8;
-        __m256 b_8;
-        __m256 result_8;
-        __m256i resulti_8;
+                a += steps;
+                b += steps;
+                result += steps;
+            }
 
-        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_load_si256((__m256i *) a);
-            af_8 = _mm256_cvtepi32_ps(a_8);
-            b_8 = _mm256_load_ps(b);
-            result_8 = _mm256_mul_ps(af_8, b_8);
-            resulti_8 = _mm256_cvtps_epi32(result_8);
-            _mm256_store_si256((__m256i *) result, resulti_8);
+            steps = 1;
+        }
+    #endif
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    } else if (steps == 4) {
-        __m128i a_4;
-        __m128 af_4;
-        __m128 b_4;
-        __m128 result_4;
-        __m128i resulti_4;
+    #ifdef MACRO_CPU_FEATURE_AVX2
+        if (steps >= 8) {
+            steps = 8;
+            __m256i a_8;
+            __m256 af_8;
+            __m256 b_8;
+            __m256 result_8;
+            __m256i resulti_8;
 
-        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_load_si128((__m128i *) a);
-            af_4 = _mm_cvtepi32_ps(a_4);
-            b_4 = _mm_load_ps(b);
-            result_4 = _mm_mul_ps(af_4, b_4);
-            resulti_4 = _mm_cvtps_epi32(result_4);
-            _mm_store_si128((__m128i *) result, resulti_4);
+            for (; i <= size - steps; i += steps) {
+                a_8 = _mm256_load_si256((__m256i *) a);
+                af_8 = _mm256_cvtepi32_ps(a_8);
+                b_8 = _mm256_load_ps(b);
+                result_8 = _mm256_mul_ps(af_8, b_8);
+                resulti_8 = _mm256_cvtps_epi32(result_8);
+                _mm256_store_si256((__m256i *) result, resulti_8);
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    }
+                a += steps;
+                b += steps;
+                result += steps;
+            }
+
+            steps = 1;
+        }
+    #endif
+
+    #ifdef MACRO_CPU_FEATURE_SSE42
+        if (steps >= 4) {
+            steps = 4;
+            __m128i a_4;
+            __m128 af_4;
+            __m128 b_4;
+            __m128 result_4;
+            __m128i resulti_4;
+
+            for (; i <= size - steps; i += steps) {
+                a_4 = _mm_load_si128((__m128i *) a);
+                af_4 = _mm_cvtepi32_ps(a_4);
+                b_4 = _mm_load_ps(b);
+                result_4 = _mm_mul_ps(af_4, b_4);
+                resulti_4 = _mm_cvtps_epi32(result_4);
+                _mm_store_si128((__m128i *) result, resulti_4);
+
+                a += steps;
+                b += steps;
+                result += steps;
+            }
+        }
+    #endif
 
     for (; i < size; ++i) {
         *result = (int32) (*a * *b);
@@ -1249,59 +293,78 @@ inline
 void simd_mult(const int32* a, f32 b, int32* result, int32 size, int32 steps)
 {
     int32 i = 0;
+    steps = intrin_validate_steps((const byte*) a, steps);
+    steps = intrin_validate_steps((const byte*) result, steps);
 
-    if (steps == 16) {
-        __m512i a_16;
-        __m512 af_16;
-        __m512 b_16 = _mm512_set1_ps(b);
-        __m512 result_16;
-        __m512i resulti_16;
+    #ifdef MACRO_CPU_FEATURE_AVX512
+        if (steps >= 16) {
+            steps = 16;
+            __m512i a_16;
+            __m512 af_16;
+            __m512 b_16 = _mm512_set1_ps(b);
+            __m512 result_16;
+            __m512i resulti_16;
 
-        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_load_epi32(a);
-            af_16 = _mm512_cvtepi32_ps(a_16);
-            result_16 = _mm512_mul_ps(af_16, b_16);
-            resulti_16 = _mm512_cvtps_epi32(result_16);
-            _mm512_store_epi32(result, resulti_16);
+            for (; i <= size - steps; i += steps) {
+                a_16 = _mm512_load_epi32(a);
+                af_16 = _mm512_cvtepi32_ps(a_16);
+                result_16 = _mm512_mul_ps(af_16, b_16);
+                resulti_16 = _mm512_cvtps_epi32(result_16);
+                _mm512_store_epi32(result, resulti_16);
 
-            a += steps;
-            result += steps;
-       }
-    } else if (steps == 8) {
-        __m256i a_8;
-        __m256 af_8;
-        __m256 b_8 = _mm256_set1_ps(b);
-        __m256 result_8;
-        __m256i resulti_8;
+                a += steps;
+                result += steps;
+            }
 
-        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_load_si256((__m256i *) a);
-            af_8 = _mm256_cvtepi32_ps(a_8);
-            result_8 = _mm256_mul_ps(af_8, b_8);
-            resulti_8 = _mm256_cvtps_epi32(result_8);
-            _mm256_store_si256((__m256i *) result, resulti_8);
+            steps = 1;
+        }
+    #endif
 
-            a += steps;
-            result += steps;
-       }
-    } else if (steps == 4) {
-        __m128i a_4;
-        __m128 af_4;
-        __m128 b_4 = _mm_set1_ps(b);
-        __m128 result_4;
-        __m128i resulti_4;
+    #ifdef MACRO_CPU_FEATURE_AVX2
+        if (steps >= 8) {
+            steps = 8;
+            __m256i a_8;
+            __m256 af_8;
+            __m256 b_8 = _mm256_set1_ps(b);
+            __m256 result_8;
+            __m256i resulti_8;
 
-        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_load_si128((__m128i *) a);
-            af_4 = _mm_cvtepi32_ps(a_4);
-            result_4 = _mm_mul_ps(af_4, b_4);
-            resulti_4 = _mm_cvtps_epi32(result_4);
-            _mm_store_si128((__m128i *) result, resulti_4);
+            for (; i <= size - steps; i += steps) {
+                a_8 = _mm256_load_si256((__m256i *) a);
+                af_8 = _mm256_cvtepi32_ps(a_8);
+                result_8 = _mm256_mul_ps(af_8, b_8);
+                resulti_8 = _mm256_cvtps_epi32(result_8);
+                _mm256_store_si256((__m256i *) result, resulti_8);
 
-            a += steps;
-            result += steps;
-       }
-    }
+                a += steps;
+                result += steps;
+            }
+
+            steps = 1;
+        }
+    #endif
+
+    #ifdef MACRO_CPU_FEATURE_SSE42
+        if (steps >= 4) {
+            steps = 4;
+            __m128i a_4;
+            __m128 af_4;
+            __m128 b_4 = _mm_set1_ps(b);
+            __m128 result_4;
+            __m128i resulti_4;
+
+            for (; i <= size - steps; i += steps) {
+                a_4 = _mm_load_si128((__m128i *) a);
+                af_4 = _mm_cvtepi32_ps(a_4);
+                result_4 = _mm_mul_ps(af_4, b_4);
+                resulti_4 = _mm_cvtps_epi32(result_4);
+                _mm_store_si128((__m128i *) result, resulti_4);
+
+                a += steps;
+                result += steps;
+            }
+        }
+    #endif
 
     for (; i < size; ++i) {
         *result = (int32) (*a * b);
@@ -1315,53 +378,72 @@ inline
 void simd_div(const int32* a, f32 b, f32* result, int32 size, int32 steps)
 {
     int32 i = 0;
+    steps = intrin_validate_steps((const byte*) a, steps);
+    steps = intrin_validate_steps((const byte*) result, steps);
 
-    if (steps == 16) {
-        __m512i a_16;
-        __m512 af_16;
-        __m512 b_16 = _mm512_set1_ps(b);
-        __m512 result_16;
+    #ifdef MACRO_CPU_FEATURE_AVX512
+        if (steps >= 16) {
+            steps = 16;
+            __m512i a_16;
+            __m512 af_16;
+            __m512 b_16 = _mm512_set1_ps(b);
+            __m512 result_16;
 
-        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_load_epi32(a);
-            af_16 = _mm512_cvtepi32_ps(a_16);
-            result_16 = _mm512_div_ps(af_16, b_16);
-            _mm512_store_ps(result, result_16);
+            for (; i <= size - steps; i += steps) {
+                a_16 = _mm512_load_epi32(a);
+                af_16 = _mm512_cvtepi32_ps(a_16);
+                result_16 = _mm512_div_ps(af_16, b_16);
+                _mm512_store_ps(result, result_16);
 
-            a += steps;
-            result += steps;
-       }
-    } else if (steps == 8) {
-        __m256i a_8;
-        __m256 af_8;
-        __m256 b_8 = _mm256_set1_ps(b);
-        __m256 result_8;
+                a += steps;
+                result += steps;
+            }
 
-        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_load_si256((__m256i *) a);
-            af_8 = _mm256_cvtepi32_ps(a_8);
-            result_8 = _mm256_div_ps(af_8, b_8);
-            _mm256_store_ps(result, result_8);
+            steps = 1;
+        }
+    #endif
 
-            a += steps;
-            result += steps;
-       }
-    } else if (steps == 4) {
-        __m128i a_4;
-        __m128 af_4;
-        __m128 b_4 = _mm_set1_ps(b);
-        __m128 result_4;
+    #ifdef MACRO_CPU_FEATURE_AVX2
+        if (steps >= 8) {
+            steps = 8;
+            __m256i a_8;
+            __m256 af_8;
+            __m256 b_8 = _mm256_set1_ps(b);
+            __m256 result_8;
 
-        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_load_si128((__m128i *) a);
-            af_4 = _mm_cvtepi32_ps(a_4);
-            result_4 = _mm_div_ps(af_4, b_4);
-            _mm_store_ps(result, result_4);
+            for (; i <= size - steps; i += steps) {
+                a_8 = _mm256_load_si256((__m256i *) a);
+                af_8 = _mm256_cvtepi32_ps(a_8);
+                result_8 = _mm256_div_ps(af_8, b_8);
+                _mm256_store_ps(result, result_8);
 
-            a += steps;
-            result += steps;
-       }
-    }
+                a += steps;
+                result += steps;
+            }
+
+            steps = 1;
+        }
+    #endif
+
+    #ifdef MACRO_CPU_FEATURE_SSE42
+        if (steps >= 4) {
+            steps = 4;
+            __m128i a_4;
+            __m128 af_4;
+            __m128 b_4 = _mm_set1_ps(b);
+            __m128 result_4;
+
+            for (; i <= size - steps; i += steps) {
+                a_4 = _mm_load_si128((__m128i *) a);
+                af_4 = _mm_cvtepi32_ps(a_4);
+                result_4 = _mm_div_ps(af_4, b_4);
+                _mm_store_ps(result, result_4);
+
+                a += steps;
+                result += steps;
+            }
+        }
+    #endif
 
     for (; i < size; ++i) {
         *result = *a / b;
@@ -1375,53 +457,73 @@ inline
 void simd_add(const int32* a, const int32* b, int32* result, int32 size, int32 steps)
 {
     int32 i = 0;
+    steps = intrin_validate_steps((const byte*) a, steps);
+    steps = intrin_validate_steps((const byte*) b, steps);
+    steps = intrin_validate_steps((const byte*) result, steps);
 
-    if (steps == 16) {
-        __m512i a_16;
-        __m512i b_16;
-        __m512i result_16;
+    #ifdef MACRO_CPU_FEATURE_AVX512
+        if (steps >= 16) {
+            steps = 16;
+            __m512i a_16;
+            __m512i b_16;
+            __m512i result_16;
 
-        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_load_epi32(a);
-            b_16 = _mm512_load_epi32(b);
-            result_16 = _mm512_add_epi32(a_16, b_16);
-            _mm512_store_epi32(result, result_16);
+            for (; i <= size - steps; i += steps) {
+                a_16 = _mm512_load_epi32(a);
+                b_16 = _mm512_load_epi32(b);
+                result_16 = _mm512_add_epi32(a_16, b_16);
+                _mm512_store_epi32(result, result_16);
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    } else if (steps == 8) {
-        __m256i a_8;
-        __m256i b_8;
-        __m256i result_8;
+                a += steps;
+                b += steps;
+                result += steps;
+            }
 
-        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_load_si256((__m256i *) a);
-            b_8 = _mm256_load_si256((__m256i *) b);
-            result_8 = _mm256_add_epi32(a_8, b_8);
-            _mm256_store_si256((__m256i *) result, result_8);
+            steps = 1;
+        }
+    #endif
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    } else if (steps == 4) {
-        __m128i a_4;
-        __m128i b_4;
-        __m128i result_4;
+    #ifdef MACRO_CPU_FEATURE_AVX2
+        if (steps >= 8) {
+            steps = 8;
+            __m256i a_8;
+            __m256i b_8;
+            __m256i result_8;
 
-        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_load_si128((__m128i *) a);
-            b_4 = _mm_load_si128((__m128i *) b);
-            result_4 = _mm_add_epi32(a_4, b_4);
-            _mm_store_si128((__m128i *) result, result_4);
+            for (; i <= size - steps; i += steps) {
+                a_8 = _mm256_load_si256((__m256i *) a);
+                b_8 = _mm256_load_si256((__m256i *) b);
+                result_8 = _mm256_add_epi32(a_8, b_8);
+                _mm256_store_si256((__m256i *) result, result_8);
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    }
+                a += steps;
+                b += steps;
+                result += steps;
+            }
+
+            steps = 1;
+        }
+    #endif
+
+    #ifdef MACRO_CPU_FEATURE_SSE42
+        if (steps >= 4) {
+            steps = 4;
+            __m128i a_4;
+            __m128i b_4;
+            __m128i result_4;
+
+            for (; i <= size - steps; i += steps) {
+                a_4 = _mm_load_si128((__m128i *) a);
+                b_4 = _mm_load_si128((__m128i *) b);
+                result_4 = _mm_add_epi32(a_4, b_4);
+                _mm_store_si128((__m128i *) result, result_4);
+
+                a += steps;
+                b += steps;
+                result += steps;
+            }
+        }
+    #endif
 
     for (; i < size; ++i) {
         *result = *a + *b;
@@ -1436,59 +538,79 @@ inline
 void simd_add(const int32* a, const f32* b, f32* result, int32 size, int32 steps)
 {
     int32 i = 0;
+    steps = intrin_validate_steps((const byte*) a, steps);
+    steps = intrin_validate_steps((const byte*) b, steps);
+    steps = intrin_validate_steps((const byte*) result, steps);
 
-    if (steps == 16) {
-        __m512i a_16;
-        __m512 af_16;
-        __m512 b_16;
-        __m512 result_16;
+    #ifdef MACRO_CPU_FEATURE_AVX512
+        if (steps >= 16) {
+            steps = 16;
+            __m512i a_16;
+            __m512 af_16;
+            __m512 b_16;
+            __m512 result_16;
 
-        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_load_epi32(a);
-            af_16 = _mm512_cvtepi32_ps(a_16);
-            b_16 = _mm512_load_ps(b);
-            result_16 = _mm512_add_ps(af_16, b_16);
-            _mm512_store_ps(result, result_16);
+            for (; i <= size - steps; i += steps) {
+                a_16 = _mm512_load_epi32(a);
+                af_16 = _mm512_cvtepi32_ps(a_16);
+                b_16 = _mm512_load_ps(b);
+                result_16 = _mm512_add_ps(af_16, b_16);
+                _mm512_store_ps(result, result_16);
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    } else if (steps == 8) {
-        __m256i a_8;
-        __m256 af_8;
-        __m256 b_8;
-        __m256 result_8;
+                a += steps;
+                b += steps;
+                result += steps;
+            }
 
-        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_load_si256((__m256i *) a);
-            af_8 = _mm256_cvtepi32_ps(a_8);
-            b_8 = _mm256_load_ps(b);
-            result_8 = _mm256_add_ps(af_8, b_8);
-            _mm256_store_ps(result, result_8);
+            steps = 1;
+        }
+    #endif
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    } else if (steps == 4) {
-        __m128i a_4;
-        __m128 af_4;
-        __m128 b_4;
-        __m128 result_4;
+    #ifdef MACRO_CPU_FEATURE_AVX2
+        if (steps >= 8) {
+            steps = 8;
+            __m256i a_8;
+            __m256 af_8;
+            __m256 b_8;
+            __m256 result_8;
 
-        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_load_si128((__m128i *) a);
-            af_4 = _mm_cvtepi32_ps(a_4);
-            b_4 = _mm_load_ps(b);
-            result_4 = _mm_add_ps(af_4, b_4);
-            _mm_store_ps(result, result_4);
+            for (; i <= size - steps; i += steps) {
+                a_8 = _mm256_load_si256((__m256i *) a);
+                af_8 = _mm256_cvtepi32_ps(a_8);
+                b_8 = _mm256_load_ps(b);
+                result_8 = _mm256_add_ps(af_8, b_8);
+                _mm256_store_ps(result, result_8);
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    }
+                a += steps;
+                b += steps;
+                result += steps;
+            }
+
+            steps = 1;
+        }
+    #endif
+
+    #ifdef MACRO_CPU_FEATURE_SSE42
+        if (steps >= 4) {
+            steps = 4;
+            __m128i a_4;
+            __m128 af_4;
+            __m128 b_4;
+            __m128 result_4;
+
+            for (; i <= size - steps; i += steps) {
+                a_4 = _mm_load_si128((__m128i *) a);
+                af_4 = _mm_cvtepi32_ps(a_4);
+                b_4 = _mm_load_ps(b);
+                result_4 = _mm_add_ps(af_4, b_4);
+                _mm_store_ps(result, result_4);
+
+                a += steps;
+                b += steps;
+                result += steps;
+            }
+        }
+    #endif
 
     for (; i < size; ++i) {
         *result = *a + *b;
@@ -1503,65 +625,85 @@ inline
 void simd_add(const int32* a, const f32* b, int32* result, int32 size, int32 steps)
 {
     int32 i = 0;
+    steps = intrin_validate_steps((const byte*) a, steps);
+    steps = intrin_validate_steps((const byte*) b, steps);
+    steps = intrin_validate_steps((const byte*) result, steps);
 
-    if (steps == 16) {
-        __m512i a_16;
-        __m512 af_16;
-        __m512 b_16;
-        __m512 result_16;
-        __m512i resulti_16;
+    #ifdef MACRO_CPU_FEATURE_AVX512
+        if (steps >= 16) {
+            steps = 16;
+            __m512i a_16;
+            __m512 af_16;
+            __m512 b_16;
+            __m512 result_16;
+            __m512i resulti_16;
 
-        for (; i <= size - steps; i += steps) {
-            a_16 = _mm512_load_epi32(a);
-            af_16 = _mm512_cvtepi32_ps(a_16);
-            b_16 = _mm512_load_ps(b);
-            result_16 = _mm512_add_ps(af_16, b_16);
-            resulti_16 = _mm512_cvtps_epi32(result_16);
-            _mm512_store_epi32(result, resulti_16);
+            for (; i <= size - steps; i += steps) {
+                a_16 = _mm512_load_epi32(a);
+                af_16 = _mm512_cvtepi32_ps(a_16);
+                b_16 = _mm512_load_ps(b);
+                result_16 = _mm512_add_ps(af_16, b_16);
+                resulti_16 = _mm512_cvtps_epi32(result_16);
+                _mm512_store_epi32(result, resulti_16);
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    } else if (steps == 8) {
-        __m256i a_8;
-        __m256 af_8;
-        __m256 b_8;
-        __m256 result_8;
-        __m256i resulti_8;
+                a += steps;
+                b += steps;
+                result += steps;
+            }
 
-        for (; i <= size - steps; i += steps) {
-            a_8 = _mm256_load_si256((__m256i *) a);
-            af_8 = _mm256_cvtepi32_ps(a_8);
-            b_8 = _mm256_load_ps(b);
-            result_8 = _mm256_add_ps(af_8, b_8);
-            resulti_8 = _mm256_cvtps_epi32(result_8);
-            _mm256_store_si256((__m256i *) result, resulti_8);
+            steps = 1;
+        }
+    #endif
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    } else if (steps == 4) {
-        __m128i a_4;
-        __m128 af_4;
-        __m128 b_4;
-        __m128 result_4;
-        __m128i resulti_4;
+    #ifdef MACRO_CPU_FEATURE_AVX2
+        if (steps >= 8) {
+            steps = 8;
+            __m256i a_8;
+            __m256 af_8;
+            __m256 b_8;
+            __m256 result_8;
+            __m256i resulti_8;
 
-        for (; i <= size - steps; i += steps) {
-            a_4 = _mm_load_si128((__m128i *) a);
-            af_4 = _mm_cvtepi32_ps(a_4);
-            b_4 = _mm_load_ps(b);
-            result_4 = _mm_add_ps(af_4, b_4);
-            resulti_4 = _mm_cvtps_epi32(result_4);
-            _mm_store_si128((__m128i *) result, resulti_4);
+            for (; i <= size - steps; i += steps) {
+                a_8 = _mm256_load_si256((__m256i *) a);
+                af_8 = _mm256_cvtepi32_ps(a_8);
+                b_8 = _mm256_load_ps(b);
+                result_8 = _mm256_add_ps(af_8, b_8);
+                resulti_8 = _mm256_cvtps_epi32(result_8);
+                _mm256_store_si256((__m256i *) result, resulti_8);
 
-            a += steps;
-            b += steps;
-            result += steps;
-       }
-    }
+                a += steps;
+                b += steps;
+                result += steps;
+            }
+
+            steps = 1;
+        }
+    #endif
+
+    #ifdef MACRO_CPU_FEATURE_SSE42
+        if (steps >= 4) {
+            steps = 4;
+            __m128i a_4;
+            __m128 af_4;
+            __m128 b_4;
+            __m128 result_4;
+            __m128i resulti_4;
+
+            for (; i <= size - steps; i += steps) {
+                a_4 = _mm_load_si128((__m128i *) a);
+                af_4 = _mm_cvtepi32_ps(a_4);
+                b_4 = _mm_load_ps(b);
+                result_4 = _mm_add_ps(af_4, b_4);
+                resulti_4 = _mm_cvtps_epi32(result_4);
+                _mm_store_si128((__m128i *) result, resulti_4);
+
+                a += steps;
+                b += steps;
+                result += steps;
+            }
+        }
+    #endif
 
     for (; i < size; ++i) {
         *result = (int32) (*a + *b);
@@ -1572,49 +714,52 @@ void simd_add(const int32* a, const f32* b, int32* result, int32 size, int32 ste
     }
 }
 
-// WARNING: only works with SSE4.2
-// WARNING: incl. \0 both strings must be <= 16 length
-bool str_compare_avx512(const char* str1, const char* str2) {
-    __m128i s1 = _mm_load_si128((__m128i *) (const __m128i *)  str1);
-    __m128i s2 = _mm_load_si128((__m128i *) (const __m128i *)  str2);
-
-    return _mm_cmpistrc(s1, s2, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH) == 0;
-}
-
 void
 endian_swap(const int32* val, int32* result, int32 size, int32 steps)
 {
     int32 i = 0;
+    steps = intrin_validate_steps((const byte*) val, steps);
+    steps = intrin_validate_steps((const byte*) result, steps);
 
-    if (steps >= 8) {
-        const __m256i mask_256 = _mm256_setr_epi8(
-            3, 2, 1, 0,  7, 6, 5, 4,
-            11, 10, 9, 8,  15, 14, 13, 12,
-            19, 18, 17, 16,  23, 22, 21, 20,
-            27, 26, 25, 24,  31, 30, 29, 28
-        );
+    #ifdef MACRO_CPU_FEATURE_AVX2
+        if (steps >= 8) {
+            steps = 8;
+            const __m256i mask_256 = _mm256_setr_epi8(
+                3, 2, 1, 0,  7, 6, 5, 4,
+                11, 10, 9, 8,  15, 14, 13, 12,
+                19, 18, 17, 16,  23, 22, 21, 20,
+                27, 26, 25, 24,  31, 30, 29, 28
+            );
 
-        for (i = 0; i <= size - steps; i += steps) {
-            __m256i vec = _mm256_load_si256((const __m256i *) (val + i));
-            vec = _mm256_shuffle_epi8(vec, mask_256);
+            for (i = 0; i <= size - steps; i += steps) {
+                __m256i vec = _mm256_load_si256((const __m256i *) (val + i));
+                vec = _mm256_shuffle_epi8(vec, mask_256);
 
-            _mm256_storeu_si256((__m256i *) (result + i), vec);
+                _mm256_store_si256((__m256i *) (result + i), vec);
+            }
+
+            steps = 1;
         }
-    } else if (steps == 4) {
-        const __m128i mask_128 = _mm_setr_epi8(
-            3, 2, 1, 0,
-            7, 6, 5, 4,
-            11, 10, 9, 8,
-            15, 14, 13, 12
-        );
+    #endif
 
-        for (i = 0; i <= size - steps; i += steps) {
-             __m128i vec = _mm_load_si128((__m128i *) (const __m128i *) (val + i));
-            vec = _mm_shuffle_epi8(vec, mask_128);
+    #ifdef MACRO_CPU_FEATURE_SSE42
+        if (steps >= 4) {
+            steps = 4;
+            const __m128i mask_128 = _mm_setr_epi8(
+                3, 2, 1, 0,
+                7, 6, 5, 4,
+                11, 10, 9, 8,
+                15, 14, 13, 12
+            );
 
-            _mm_storeu_si128((__m128i *) (result + i), vec);
+            for (i = 0; i <= size - steps; i += steps) {
+                __m128i vec = _mm_load_si128((__m128i *) (const __m128i *) (val + i));
+                vec = _mm_shuffle_epi8(vec, mask_128);
+
+                _mm_store_si128((__m128i *) (result + i), vec);
+            }
         }
-    }
+    #endif
 
     for (; i < size; ++i) {
         uint32 v = ((uint32 *) val)[i];
@@ -1629,36 +774,48 @@ void
 endian_swap(const uint32* val, uint32* result, int32 size, int32 steps)
 {
     int32 i = 0;
+    steps = intrin_validate_steps((const byte*) val, steps);
+    steps = intrin_validate_steps((const byte*) result, steps);
 
-    if (steps >= 8) {
-        const __m256i mask_256 = _mm256_setr_epi8(
-            3, 2, 1, 0,  7, 6, 5, 4,
-            11, 10, 9, 8,  15, 14, 13, 12,
-            19, 18, 17, 16,  23, 22, 21, 20,
-            27, 26, 25, 24,  31, 30, 29, 28
-        );
+    #ifdef MACRO_CPU_FEATURE_AVX2
+        if (steps >= 8) {
+            steps = 8;
+            const __m256i mask_256 = _mm256_setr_epi8(
+                3, 2, 1, 0,  7, 6, 5, 4,
+                11, 10, 9, 8,  15, 14, 13, 12,
+                19, 18, 17, 16,  23, 22, 21, 20,
+                27, 26, 25, 24,  31, 30, 29, 28
+            );
 
-        for (i = 0; i <= size - steps; i += steps) {
-            __m256i vec = _mm256_load_si256((const __m256i *) (val + i));
-            vec = _mm256_shuffle_epi8(vec, mask_256);
+            for (i = 0; i <= size - steps; i += steps) {
+                __m256i vec = _mm256_load_si256((const __m256i *) (val + i));
+                vec = _mm256_shuffle_epi8(vec, mask_256);
 
-            _mm256_storeu_si256((__m256i *) (result + i), vec);
+                _mm256_store_si256((__m256i *) (result + i), vec);
+            }
+
+            steps = 1;
         }
-    } else if (steps == 4) {
-        const __m128i mask_128 = _mm_setr_epi8(
-            3, 2, 1, 0,
-            7, 6, 5, 4,
-            11, 10, 9, 8,
-            15, 14, 13, 12
-        );
+    #endif
 
-        for (i = 0; i <= size - steps; i += steps) {
-             __m128i vec = _mm_load_si128((__m128i *) (const __m128i *) (val + i));
-            vec = _mm_shuffle_epi8(vec, mask_128);
+    #ifdef MACRO_CPU_FEATURE_SSE42
+        if (steps >= 4) {
+            steps = 4;
+            const __m128i mask_128 = _mm_setr_epi8(
+                3, 2, 1, 0,
+                7, 6, 5, 4,
+                11, 10, 9, 8,
+                15, 14, 13, 12
+            );
 
-            _mm_storeu_si128((__m128i *) (result + i), vec);
+            for (i = 0; i <= size - steps; i += steps) {
+                __m128i vec = _mm_load_si128((__m128i *) (const __m128i *) (val + i));
+                vec = _mm_shuffle_epi8(vec, mask_128);
+
+                _mm_store_si128((__m128i *) (result + i), vec);
+            }
         }
-    }
+    #endif
 
     for (; i < size; ++i) {
         uint32 v = ((uint32 *) val)[i];
@@ -1672,8 +829,12 @@ endian_swap(const uint32* val, uint32* result, int32 size, int32 steps)
 void endian_swap(const int16* val, int16* result, int32 size, int32 steps)
 {
     int32 i = 0;
+    steps = intrin_validate_steps((const byte*) val, steps);
+    steps = intrin_validate_steps((const byte*) result, steps);
 
+    #ifdef MACRO_CPU_FEATURE_AVX2
     if (steps >= 8) {
+        steps = 8;
         const __m256i mask_256 = _mm256_setr_epi8(
             1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
             17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30
@@ -1683,22 +844,29 @@ void endian_swap(const int16* val, int16* result, int32 size, int32 steps)
             __m256i vec = _mm256_load_si256((const __m256i *) (val + i));
             vec = _mm256_shuffle_epi8(vec, mask_256);
 
-            _mm256_storeu_si256((__m256i *) (result + i), vec);
+            _mm256_store_si256((__m256i *) (result + i), vec);
         }
-    } else if (steps == 4) {
-        const __m128i mask_128 = _mm_setr_epi8(
-            1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
-        );
 
-        for (i = 0; i <= size - steps; i += steps) {
-            __m128i vec = _mm_load_si128((const __m128i *) (val + i));
-            vec = _mm_shuffle_epi8(vec, mask_128);
-
-            _mm_storeu_si128((__m128i *) (result + i), vec);
-        }
+        steps = 1;
     }
+    #endif
+
+    #ifdef MACRO_CPU_FEATURE_SSE42
+        if (steps >= 4) {
+            steps = 4;
+            const __m128i mask_128 = _mm_setr_epi8(
+                1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
+            );
+
+            for (i = 0; i <= size - steps; i += steps) {
+                __m128i vec = _mm_load_si128((const __m128i *) (val + i));
+                vec = _mm_shuffle_epi8(vec, mask_128);
+
+                _mm_store_si128((__m128i *) (result + i), vec);
+            }
+        }
+    #endif
 
-    // Handle remaining elements
     for (; i < size; ++i) {
         uint16 v = ((uint16 *) val)[i];
         ((int16 *) result)[i] = ((v << 8) | (v >> 8));
@@ -1708,33 +876,44 @@ void endian_swap(const int16* val, int16* result, int32 size, int32 steps)
 void endian_swap(const uint16* val, uint16* result, int32 size, int32 steps)
 {
     int32 i = 0;
+    steps = intrin_validate_steps((const byte*) val, steps);
+    steps = intrin_validate_steps((const byte*) result, steps);
 
-    if (steps >= 8) {
-        const __m256i mask_256 = _mm256_setr_epi8(
-            1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-            17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30
-        );
+    #ifdef MACRO_CPU_FEATURE_AVX2
+        if (steps >= 8) {
+            steps = 8;
+            const __m256i mask_256 = _mm256_setr_epi8(
+                1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+                17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30
+            );
 
-        for (i = 0; i <= size - steps; i += steps) {
-            __m256i vec = _mm256_load_si256((const __m256i *) (val + i));
-            vec = _mm256_shuffle_epi8(vec, mask_256);
+            for (i = 0; i <= size - steps; i += steps) {
+                __m256i vec = _mm256_load_si256((const __m256i *) (val + i));
+                vec = _mm256_shuffle_epi8(vec, mask_256);
 
-            _mm256_storeu_si256((__m256i *) (result + i), vec);
+                _mm256_store_si256((__m256i *) (result + i), vec);
+            }
+
+            steps = 1;
         }
-    } else if (steps == 4) {
-        const __m128i mask_128 = _mm_setr_epi8(
-            1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
-        );
+    #endif
 
-        for (i = 0; i <= size - steps; i += steps) {
-            __m128i vec = _mm_load_si128((const __m128i *) (val + i));
-            vec = _mm_shuffle_epi8(vec, mask_128);
+    #ifdef MACRO_CPU_FEATURE_SSE42
+        if (steps >= 4) {
+            steps = 4;
+            const __m128i mask_128 = _mm_setr_epi8(
+                1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
+            );
 
-            _mm_storeu_si128((__m128i *) (result + i), vec);
+            for (i = 0; i <= size - steps; i += steps) {
+                __m128i vec = _mm_load_si128((const __m128i *) (val + i));
+                vec = _mm_shuffle_epi8(vec, mask_128);
+
+                _mm_store_si128((__m128i *) (result + i), vec);
+            }
         }
-    }
+    #endif
 
-    // Handle remaining elements
     for (; i < size; ++i) {
         uint16 v = ((uint16 *) val)[i];
         ((uint16 *) result)[i] = ((v << 8) | (v >> 8));
diff --git a/architecture/x86/simd/SIMD_I32_AVX2.h b/architecture/x86/simd/SIMD_I32_AVX2.h
new file mode 100644
index 0000000..6cb7b0e
--- /dev/null
+++ b/architecture/x86/simd/SIMD_I32_AVX2.h
@@ -0,0 +1,288 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_TOS_STDLIB_SIMD_I32_AVX2_H
+#define COMS_TOS_STDLIB_SIMD_I32_AVX2_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#include "../../../stdlib/Types.h"
+
+// @todo a lot of sse functions require high level (e.g. sse4.1) this needs to be changed to be more general
+//      or better create alternative functions for the available sse version.
+
+// @question why are we passing structs by value?
+struct int32_8 {
+    union {
+        #if ARM
+            svint32_t s;
+        #else
+            __m256i s;
+        #endif
+
+        int32 v[8];
+    };
+};
+
+inline int32_8 load_int32_8(const int32* mem)
+{
+    int32_8 simd;
+    simd.s = _mm256_load_si256((__m256i *) mem);
+
+    return simd;
+}
+
+inline int32_8 init_int32_8(const int32* mem)
+{
+    int32_8 simd;
+    simd.s = _mm256_set_epi32(mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7]);
+
+    return simd;
+}
+
+inline void unload_int32_8(int32_8 a, int32 *array) { _mm256_store_si256((__m256i *) array, a.s); }
+
+inline int32_8 init_zero_int32_8()
+{
+    int32_8 simd;
+    simd.s = _mm256_setzero_si256();
+
+    return simd;
+}
+
+inline int32_8 init_value_int32_8(int32 value)
+{
+    int32_8 simd;
+    simd.s = _mm256_set1_epi32(value);
+
+    return simd;
+}
+
+inline int32_8 init_values_int32_8(
+    int32 a, int32 b, int32 c, int32 d,
+    int32 e, int32 f, int32 g, int32 h
+)
+{
+    int32_8 simd;
+    simd.s = _mm256_set_epi32(a, b, c, d, e, f, g, h);
+
+    return simd;
+}
+
+inline int32_8 operator+(int32_8 a, int32_8 b)
+{
+    int32_8 simd;
+    simd.s = _mm256_add_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_8 operator-(int32_8 a, int32_8 b)
+{
+    int32_8 simd;
+    simd.s = _mm256_sub_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_8 operator-(int32_8 a) { return init_zero_int32_8() - a; }
+
+inline int32_8 operator*(int32_8 a, int32_8 b)
+{
+    // Lane-wise product keeping the low 32 bits; _mm256_mul_epi32 only multiplies the even lanes into 64-bit results
+    int32_8 simd;
+    simd.s = _mm256_mullo_epi32(a.s, b.s);
+    return simd;
+}
+
+inline int32_8 operator^(int32_8 a, int32_8 b)
+{
+    // Bitwise XOR via the AVX2 intrinsic; _mm256_xor_epi32 is an AVX-512F/VL instruction
+    int32_8 simd;
+    simd.s = _mm256_xor_si256(a.s, b.s);
+    return simd;
+}
+
+inline int32_8 &operator-=(int32_8 &a, int32_8 b)
+{
+    a = a - b;
+
+    return a;
+}
+
+inline int32_8 &operator+=(int32_8 &a, int32_8 b)
+{
+    a = a + b;
+
+    return a;
+}
+
+inline int32_8 &operator*=(int32_8 &a, int32_8 b)
+{
+    a = a * b;
+
+    return a;
+}
+
+inline int32_8 &operator^=(int32_8 &a, int32_8 b)
+{
+    a = a ^ b;
+
+    return a;
+}
+
+inline int32_8 operator<(int32_8 a, int32_8 b)
+{
+    // a < b is b > a; negating cmpgt(a, b) would also flag equal lanes (that is <=)
+    int32_8 simd;
+    simd.s = _mm256_cmpgt_epi32(b.s, a.s);
+    return simd;
+}
+
+inline int32_8 operator<=(int32_8 a, int32_8 b)
+{
+    int32_8 simd;
+    simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi32(a.s, b.s), _mm256_set1_epi32(-1));
+
+    return simd;
+}
+
+inline int32_8 operator>(int32_8 a, int32_8 b)
+{
+    int32_8 simd;
+    simd.s = _mm256_cmpgt_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_8 operator>=(int32_8 a, int32_8 b)
+{
+    int32_8 simd;
+    simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi32(b.s, a.s), _mm256_set1_epi32(-1));
+
+    return simd;
+}
+
+inline int32_8 operator==(int32_8 a, int32_8 b)
+{
+    int32_8 simd;
+    simd.s = _mm256_cmpeq_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_8 operator!=(int32_8 a, int32_8 b)
+{
+    // All-ones lanes where a != b, zero elsewhere: invert the AVX2 equality mask (no AVX-512 mask ops)
+    int32_8 simd;
+    simd.s = _mm256_andnot_si256(_mm256_cmpeq_epi32(a.s, b.s), _mm256_set1_epi32(-1));
+    return simd;
+}
+
+inline int32_8 operator&(int32_8 a, int32_8 b)
+{
+    int32_8 simd;
+    simd.s = _mm256_and_si256(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_8 operator|(int32_8 a, int32_8 b)
+{
+    // Bitwise OR via the AVX2 intrinsic; _mm256_or_epi32 is an AVX-512F/VL instruction
+    int32_8 simd;
+    simd.s = _mm256_or_si256(a.s, b.s);
+    return simd;
+}
+
+inline int32_8 &operator&=(int32_8 &a, int32_8 b)
+{
+    a = a & b;
+
+    return a;
+}
+
+inline int32_8 &operator|=(int32_8 &a, int32_8 b)
+{
+    a = a | b;
+
+    return a;
+}
+
+inline int32_8 abs(int32_8 a)
+{
+    int32_8 simd;
+    simd.s = _mm256_abs_epi32(a.s);
+
+    return simd;
+}
+
+inline int32_8 simd_min(int32_8 a, int32_8 b)
+{
+    int32_8 simd;
+    simd.s = _mm256_min_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_8 simd_max(int32_8 a, int32_8 b)
+{
+    int32_8 simd;
+    simd.s = _mm256_max_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_8 sign(int32_8 a)
+{
+    // Returns -1 for negative lanes and 1 for all other lanes (zero included).
+    // The arithmetic shift smears the sign bit: -1 if negative, 0 otherwise; OR-ing 1 keeps -1 and turns 0 into 1.
+    // (OR-ing only the raw sign bit with 1 would give 0x80000001, which is not -1 in two's complement.)
+    __m256i negative = _mm256_srai_epi32(a.s, 31);
+    int32_8 simd;
+    simd.s = _mm256_or_si256(negative, _mm256_set1_epi32(1));
+
+    return simd;
+}
+
+inline int32_8 clamp(int32_8 min_value, int32_8 a, int32_8 max_value)
+{
+    return simd_min(simd_max(a, min_value), max_value);
+}
+
+inline int32 which_true(int32_8 a)
+{
+    int32 which_true = _mm256_movemask_epi8(a.s);
+
+    return which_true;
+}
+
+inline bool any_true(int32_8 a)
+{
+    // The 32-bit byte mask is negative when bit 31 is set, so test against 0 rather than "> 0"
+    bool is_any_true = _mm256_movemask_epi8(a.s) != 0;
+    return is_any_true;
+}
+
+inline bool all_true(int32_8 a)
+{
+    // movemask_epi8 yields one bit per byte (32 bits); all of them set reads as -1, never 255
+    bool is_true = _mm256_movemask_epi8(a.s) == -1;
+    return is_true;
+}
+
+inline bool all_false(int32_8 a)
+{
+    bool is_false = _mm256_movemask_epi8(a.s) == 0;
+
+    return is_false;
+}
+
+#endif
diff --git a/architecture/x86/simd/SIMD_I32_AVX512.h b/architecture/x86/simd/SIMD_I32_AVX512.h
new file mode 100644
index 0000000..cd56539
--- /dev/null
+++ b/architecture/x86/simd/SIMD_I32_AVX512.h
@@ -0,0 +1,309 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_TOS_STDLIB_SIMD_I32_AVX512_H
+#define COMS_TOS_STDLIB_SIMD_I32_AVX512_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#include "../../../stdlib/Types.h"
+#include "SIMD_SVML_AVX512.h"
+
+// @todo a lot of sse functions require high level (e.g. sse4.1) this needs to be changed to be more general
+//      or better create alternative functions for the available sse version.
+
+// @question why are we passing structs by value?
+struct int32_16 {
+    union {
+        #if ARM
+            svint32_t s;
+        #else
+            __m512i s;
+        #endif
+
+        int32 v[16];
+    };
+};
+
+inline int32_16 load_int32_16(const int32* mem)
+{
+    int32_16 simd;
+    simd.s = _mm512_load_epi32(mem);
+
+    return simd;
+}
+
+inline int32_16 init_int32_16(const int32* mem)
+{
+    int32_16 simd;
+    simd.s = _mm512_set_epi32(
+        mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7],
+        mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15]);
+
+    return simd;
+}
+
+inline void unload_int32_16(int32_16 a, int32 *array) { _mm512_store_epi32(array, a.s); }
+
+inline int32_16 init_zero_int32_16()
+{
+    int32_16 simd;
+    simd.s = _mm512_setzero_epi32();
+
+    return simd;
+}
+
+inline int32_16 init_value_int32_16(int32 value)
+{
+    int32_16 simd;
+    simd.s = _mm512_set1_epi32(value);
+
+    return simd;
+}
+
+inline int32_16 init_values_int32_16(
+    int32 a, int32 b, int32 c, int32 d,
+    int32 e, int32 f, int32 g, int32 h,
+    int32 i, int32 j, int32 k, int32 l,
+    int32 m, int32 n, int32 o, int32 p
+)
+{
+    int32_16 simd;
+    simd.s = _mm512_set_epi32(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
+
+    return simd;
+}
+
+inline int32_16 operator+(int32_16 a, int32_16 b)
+{
+    int32_16 simd;
+    simd.s = _mm512_add_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_16 operator-(int32_16 a, int32_16 b)
+{
+    int32_16 simd;
+    simd.s = _mm512_sub_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_16 operator-(int32_16 a) { return init_zero_int32_16() - a; }
+
+inline int32_16 operator*(int32_16 a, int32_16 b)
+{
+    // Lane-wise product keeping the low 32 bits; _mm512_mul_epi32 only multiplies the even lanes into 64-bit results
+    int32_16 simd;
+    simd.s = _mm512_mullo_epi32(a.s, b.s);
+    return simd;
+}
+
+inline int32_16 operator/(int32_16 a, int32_16 b)
+{
+    int32_16 simd;
+    simd.s = _mm512_div_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_16 operator^(int32_16 a, int32_16 b)
+{
+    int32_16 simd;
+    simd.s = _mm512_xor_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_16 &operator-=(int32_16 &a, int32_16 b)
+{
+    a = a - b;
+
+    return a;
+}
+
+inline int32_16 &operator+=(int32_16 &a, int32_16 b)
+{
+    a = a + b;
+
+    return a;
+}
+
+inline int32_16 &operator*=(int32_16 &a, int32_16 b)
+{
+    a = a * b;
+
+    return a;
+}
+
+inline int32_16 &operator/=(int32_16 &a, int32_16 b)
+{
+    a.s = (a / b).s;
+
+    return a;
+}
+
+inline int32_16 &operator^=(int32_16 &a, int32_16 b)
+{
+    a = a ^ b;
+
+    return a;
+}
+
+inline int32_16 operator<(int32_16 a, int32_16 b)
+{
+    // Expand the k-mask to all-ones lanes where a < b (zero elsewhere) instead of blending operand values
+    int32_16 simd;
+    simd.s = _mm512_maskz_set1_epi32(_mm512_cmplt_epi32_mask(a.s, b.s), -1);
+    return simd;
+}
+
+inline int32_16 operator<=(int32_16 a, int32_16 b)
+{
+    // All-ones lanes where a <= b, zero elsewhere; NOT(b > a) would be a >= b and the blend returned operand values
+    int32_16 simd;
+    simd.s = _mm512_maskz_set1_epi32(_mm512_cmple_epi32_mask(a.s, b.s), -1);
+    return simd;
+}
+
+inline int32_16 operator>(int32_16 a, int32_16 b)
+{
+    // Expand the k-mask to all-ones lanes where a > b (zero elsewhere) instead of blending operand values
+    int32_16 simd;
+    simd.s = _mm512_maskz_set1_epi32(_mm512_cmpgt_epi32_mask(a.s, b.s), -1);
+    return simd;
+}
+
+inline int32_16 operator>=(int32_16 a, int32_16 b)
+{
+    // Expand the k-mask to all-ones lanes where a >= b (zero elsewhere) instead of blending operand values
+    int32_16 simd;
+    simd.s = _mm512_maskz_set1_epi32(_mm512_cmpge_epi32_mask(a.s, b.s), -1);
+    return simd;
+}
+
+inline int32_16 operator==(int32_16 a, int32_16 b)
+{
+    // Expand the k-mask to all-ones lanes where a == b (zero elsewhere) instead of blending operand values
+    int32_16 simd;
+    simd.s = _mm512_maskz_set1_epi32(_mm512_cmpeq_epi32_mask(a.s, b.s), -1);
+    return simd;
+}
+
+inline int32_16 operator!=(int32_16 a, int32_16 b)
+{
+    // Expand the k-mask to all-ones lanes where a != b (zero elsewhere) instead of blending operand values
+    int32_16 simd;
+    simd.s = _mm512_maskz_set1_epi32(_mm512_cmp_epi32_mask(a.s, b.s, _MM_CMPINT_NE), -1);
+    return simd;
+}
+
+inline int32_16 operator&(int32_16 a, int32_16 b)
+{
+    int32_16 simd;
+    simd.s = _mm512_and_si512(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_16 operator|(int32_16 a, int32_16 b)
+{
+    int32_16 simd;
+    simd.s = _mm512_or_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_16 &operator&=(int32_16 &a, int32_16 b)
+{
+    a = a & b;
+
+    return a;
+}
+
+inline int32_16 &operator|=(int32_16 &a, int32_16 b)
+{
+    a = a | b;
+
+    return a;
+}
+
+inline int32_16 abs(int32_16 a)
+{
+    // Absolute value per 32-bit lane; the epi64 variant would treat lane pairs as one 64-bit integer
+    int32_16 simd;
+    simd.s = _mm512_abs_epi32(a.s);
+    return simd;
+}
+
+inline int32_16 simd_min(int32_16 a, int32_16 b)
+{
+    int32_16 simd;
+    simd.s = _mm512_min_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_16 simd_max(int32_16 a, int32_16 b)
+{
+    int32_16 simd;
+    simd.s = _mm512_max_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_16 sign(int32_16 a)
+{
+    // Returns -1 for negative lanes and 1 for all other lanes (zero included).
+    // The arithmetic shift smears the sign bit: -1 if negative, 0 otherwise; OR-ing 1 keeps -1 and turns 0 into 1.
+    // (OR-ing only the raw sign bit with 1 would give 0x80000001, which is not -1 in two's complement.)
+    __m512i negative = _mm512_srai_epi32(a.s, 31);
+    int32_16 simd;
+    simd.s = _mm512_or_si512(negative, _mm512_set1_epi32(1));
+
+    return simd;
+}
+
+inline int32_16 clamp(int32_16 min_value, int32_16 a, int32_16 max_value)
+{
+    return simd_min(simd_max(a, min_value), max_value);
+}
+
+inline int32 which_true(int32_16 a)
+{
+    int32 which_true = _mm512_movepi32_mask(a.s);
+
+    return which_true;
+}
+
+inline bool any_true(int32_16 a)
+{
+    bool is_any_true = _mm512_movepi32_mask(a.s) > 0;
+
+    return is_any_true;
+}
+
+inline bool all_true(int32_16 a)
+{
+    bool is_true = _mm512_movepi32_mask(a.s) == 65535;
+
+    return is_true;
+}
+
+inline bool all_false(int32_16 a)
+{
+    // @todo This can be optimized (requires also changes in the comparison functions return)
+    bool is_false = _mm512_movepi32_mask(a.s) == 0;
+
+    return is_false;
+}
+
+#endif
diff --git a/architecture/x86/simd/SIMD_I32_SSE.h b/architecture/x86/simd/SIMD_I32_SSE.h
new file mode 100644
index 0000000..399c49f
--- /dev/null
+++ b/architecture/x86/simd/SIMD_I32_SSE.h
@@ -0,0 +1,286 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_TOS_STDLIB_SIMD_I32_SSE_H
+#define COMS_TOS_STDLIB_SIMD_I32_SSE_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#include "../../../stdlib/Types.h"
+
+// @todo a lot of sse functions require high level (e.g. sse4.1) this needs to be changed to be more general
+//      or better create alternative functions for the available sse version.
+
+// @question why are we passing structs by value?
+
+struct int32_4 {
+    union {
+        #if ARM
+            svint32_t s;
+        #else
+            __m128i s;
+        #endif
+
+        int32 v[4];
+    };
+};
+
+inline int32_4 load_int32_4(const int32* mem)
+{
+    int32_4 simd;
+    simd.s = _mm_load_si128((__m128i *) mem);
+
+    return simd;
+}
+
+inline int32_4 init_int32_4(const int32* mem)
+{
+    int32_4 simd;
+    simd.s = _mm_set_epi32(mem[0], mem[1], mem[2], mem[3]);
+
+    return simd;
+}
+
+inline void unload_int32_4(int32_4 a, int32 *array) { _mm_store_si128((__m128i *) array, a.s); }
+
+inline int32_4 init_zero_int32_4()
+{
+    int32_4 simd;
+    simd.s = _mm_setzero_si128();
+
+    return simd;
+}
+
+inline int32_4 init_value_int32_4(int32 value)
+{
+    int32_4 simd;
+    simd.s = _mm_set1_epi32(value);
+
+    return simd;
+}
+
+inline int32_4 init_values_int32_4(int32 a, int32 b, int32 c, int32 d)
+{
+    int32_4 simd;
+    simd.s = _mm_set_epi32(a, b, c, d);
+
+    return simd;
+}
+
+inline int32_4 operator+(int32_4 a, int32_4 b)
+{
+    int32_4 simd;
+    simd.s = _mm_add_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_4 operator-(int32_4 a, int32_4 b)
+{
+    int32_4 simd;
+    simd.s = _mm_sub_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_4 operator-(int32_4 a) { return init_zero_int32_4() - a; }
+
+inline int32_4 operator*(int32_4 a, int32_4 b)
+{
+    // Lane-wise product keeping the low 32 bits (SSE4.1); _mm_mul_epi32 only multiplies lanes 0 and 2 into 64-bit results
+    int32_4 simd;
+    simd.s = _mm_mullo_epi32(a.s, b.s);
+    return simd;
+}
+
+inline int32_4 operator^(int32_4 a, int32_4 b)
+{
+    // Bitwise XOR via the SSE2 intrinsic; _mm_xor_epi32 is an AVX-512F/VL instruction
+    int32_4 simd;
+    simd.s = _mm_xor_si128(a.s, b.s);
+    return simd;
+}
+
+inline int32_4 &operator-=(int32_4 &a, int32_4 b)
+{
+    a = a - b;
+
+    return a;
+}
+
+inline int32_4 &operator+=(int32_4 &a, int32_4 b)
+{
+    a = a + b;
+
+    return a;
+}
+
+inline int32_4 &operator*=(int32_4 &a, int32_4 b)
+{
+    a = a * b;
+
+    return a;
+}
+
+inline int32_4 &operator^=(int32_4 &a, int32_4 b)
+{
+    a = a ^ b;
+
+    return a;
+}
+
+inline int32_4 operator<(int32_4 a, int32_4 b)
+{
+    int32_4 simd;
+    simd.s = _mm_cmplt_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_4 operator<=(int32_4 a, int32_4 b)
+{
+    int32_4 simd;
+    simd.s = _mm_andnot_si128(_mm_cmplt_epi32(b.s, a.s), _mm_set1_epi32(-1));
+
+    return simd;
+}
+
+inline int32_4 operator>(int32_4 a, int32_4 b)
+{
+    int32_4 simd;
+    simd.s = _mm_cmpgt_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_4 operator>=(int32_4 a, int32_4 b)
+{
+    int32_4 simd;
+    simd.s = _mm_andnot_si128(_mm_cmplt_epi32(a.s, b.s), _mm_set1_epi32(-1));
+
+    return simd;
+}
+
+inline int32_4 operator==(int32_4 a, int32_4 b)
+{
+    int32_4 simd;
+    simd.s = _mm_cmpeq_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_4 operator!=(int32_4 a, int32_4 b)
+{
+    int32_4 simd;
+    simd.s = _mm_andnot_si128(_mm_cmpeq_epi32(a.s, b.s), _mm_set1_epi32(-1));
+
+    return simd;
+}
+
+inline int32_4 operator&(int32_4 a, int32_4 b)
+{
+    int32_4 simd;
+    simd.s = _mm_and_si128(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_4 operator|(int32_4 a, int32_4 b)
+{
+    // Bitwise OR via the SSE2 intrinsic; _mm_or_epi32 is an AVX-512F/VL instruction
+    int32_4 simd;
+    simd.s = _mm_or_si128(a.s, b.s);
+    return simd;
+}
+
+inline int32_4 &operator&=(int32_4 &a, int32_4 b)
+{
+    a = a & b;
+
+    return a;
+}
+
+inline int32_4 &operator|=(int32_4 &a, int32_4 b)
+{
+    a = a | b;
+
+    return a;
+}
+
+inline int32_4 abs(int32_4 a)
+{
+    int32_4 simd;
+    simd.s = _mm_abs_epi32(a.s);
+
+    return simd;
+}
+
+inline int32_4 simd_min(int32_4 a, int32_4 b)
+{
+    int32_4 simd;
+    simd.s = _mm_min_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_4 simd_max(int32_4 a, int32_4 b)
+{
+    int32_4 simd;
+    simd.s = _mm_max_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_4 sign(int32_4 a)
+{
+    // Returns -1 for negative lanes and 1 for all other lanes (zero included).
+    // The arithmetic shift smears the sign bit: -1 if negative, 0 otherwise; OR-ing 1 keeps -1 and turns 0 into 1.
+    // (OR-ing only the raw sign bit with 1 would give 0x80000001, which is not -1 in two's complement.)
+    __m128i negative = _mm_srai_epi32(a.s, 31);
+    int32_4 simd;
+    simd.s = _mm_or_si128(negative, _mm_set1_epi32(1));
+
+    return simd;
+}
+
+inline int32_4 clamp(int32_4 min_value, int32_4 a, int32_4 max_value)
+{
+    return simd_min(simd_max(a, min_value), max_value);
+}
+
+inline int32 which_true(int32_4 a)
+{
+    int32 which_true = _mm_movemask_epi8(a.s);
+
+    return which_true;
+}
+
+inline bool any_true(int32_4 a)
+{
+    bool is_any_true = _mm_movemask_epi8(a.s) > 0;
+
+    return is_any_true;
+}
+
+inline bool all_true(int32_4 a)
+{
+    // movemask_epi8 yields one bit per byte (16 bits); an all-true vector reads as 0xFFFF, never 15
+    bool is_true = _mm_movemask_epi8(a.s) == 0xFFFF;
+    return is_true;
+}
+
+inline bool all_false(int32_4 a)
+{
+    bool is_false = _mm_movemask_epi8(a.s) == 0;
+
+    return is_false;
+}
+
+#endif
diff --git a/architecture/x86/simd/SIMD_I64.h b/architecture/x86/simd/SIMD_I64.h
old mode 100644
new mode 100755
index 33bbc41..151c222
--- a/architecture/x86/simd/SIMD_I64.h
+++ b/architecture/x86/simd/SIMD_I64.h
@@ -13,42 +13,17 @@
 #include <xmmintrin.h>
 
 #include "../../../stdlib/Types.h"
-#include "SIMD_F64.h"
 
-struct int64_2 {
-    union {
-        #if ARM
-            svint64_t s;
-        #else
-            __m128i s;
-        #endif
+#ifdef MACRO_CPU_FEATURE_SSE42
+    #include "SIMD_I64_SSE.h"
+#endif
 
-        int64 v[2];
-    };
-};
+#ifdef MACRO_CPU_FEATURE_AVX2
+    #include "SIMD_I64_AVX2.h"
+#endif
 
-struct int64_4 {
-    union {
-        #if ARM
-            svint64_t s;
-        #else
-            __m256i s;
-        #endif
-
-        int64 v[4];
-    };
-};
-
-struct int64_8 {
-    union {
-        #if ARM
-            svint64_t s;
-        #else
-            __m512i s;
-        #endif
-
-        int64 v[8];
-    };
-};
+#ifdef MACRO_CPU_FEATURE_AVX512
+    #include "SIMD_I64_AVX512.h"
+#endif
 
 #endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_I64_AVX2.h b/architecture/x86/simd/SIMD_I64_AVX2.h
new file mode 100644
index 0000000..aa61750
--- /dev/null
+++ b/architecture/x86/simd/SIMD_I64_AVX2.h
@@ -0,0 +1,29 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_TOS_STDLIB_SIMD_I64_AVX2_H
+#define COMS_TOS_STDLIB_SIMD_I64_AVX2_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+#include "../../../stdlib/Types.h"
+
+struct int64_4 {
+    union {
+        #if ARM
+            svint64_t s;
+        #else
+            __m256i s;
+        #endif
+
+        int64 v[4];
+    };
+};
+
+#endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_I64_AVX512.h b/architecture/x86/simd/SIMD_I64_AVX512.h
new file mode 100644
index 0000000..8c49d05
--- /dev/null
+++ b/architecture/x86/simd/SIMD_I64_AVX512.h
@@ -0,0 +1,29 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_TOS_STDLIB_SIMD_I64_AVX512_H
+#define COMS_TOS_STDLIB_SIMD_I64_AVX512_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+#include "../../../stdlib/Types.h"
+
+struct int64_8 {
+    union {
+        #if ARM
+            svint64_t s;
+        #else
+            __m512i s;
+        #endif
+
+        int64 v[8];
+    };
+};
+
+#endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_I64_SSE.h b/architecture/x86/simd/SIMD_I64_SSE.h
new file mode 100644
index 0000000..82ea226
--- /dev/null
+++ b/architecture/x86/simd/SIMD_I64_SSE.h
@@ -0,0 +1,29 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_TOS_STDLIB_SIMD_I64_SSE_H
+#define COMS_TOS_STDLIB_SIMD_I64_SSE_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+#include "../../../stdlib/Types.h"
+
+struct int64_2 {
+    union {
+        #if ARM
+            svint64_t s;
+        #else
+            __m128i s;
+        #endif
+
+        int64 v[2];
+    };
+};
+
+#endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_I8.h b/architecture/x86/simd/SIMD_I8.h
old mode 100644
new mode 100755
index 6885808..4cb5205
--- a/architecture/x86/simd/SIMD_I8.h
+++ b/architecture/x86/simd/SIMD_I8.h
@@ -13,906 +13,108 @@
 #include <xmmintrin.h>
 
 #include "../../../stdlib/Types.h"
-#include "SIMD_F32.h"
-#include "SIMD_I32.h"
 
-struct int8_16 {
-    union {
-        #if ARM
-            svint8_t s;
-        #else
-            __m128i s;
-        #endif
+#ifdef MACRO_CPU_FEATURE_SSE42
+    #include "SIMD_I8_SSE.h"
+#endif
 
-        int8 v[16];
-    };
-};
+#ifdef MACRO_CPU_FEATURE_AVX2
+    #include "SIMD_I8_AVX2.h"
+#endif
 
-struct int8_32 {
-    union {
-        #if ARM
-            svint8_t s;
-        #else
-            __m256i s;
-        #endif
+#ifdef MACRO_CPU_FEATURE_AVX512
+    #include "SIMD_I8_AVX512.h"
+#endif
 
-        int8 v[32];
-    };
-};
-
-struct int8_64 {
-    union {
-        #if ARM
-            svint8_t s;
-        #else
-            __m512i s;
-        #endif
-
-        int8 v[64];
-    };
-};
-
-inline int8_16 load_int8_16(const int8* mem)
-{
-    int8_16 simd;
-    simd.s = _mm_load_si128((__m128i *) mem);
-
-    return simd;
-}
-
-inline int8_16 init_int8_16(const int8* mem)
-{
-    int8_16 simd;
-    simd.s = _mm_set_epi8(
-        mem[0], mem[1], mem[2], mem[3],
-        mem[4], mem[5], mem[6], mem[7],
-        mem[8], mem[9], mem[10], mem[11],
-        mem[12], mem[13], mem[14], mem[15]
-    );
-
-    return simd;
-}
-
-inline void unload_int8_16(int8_16 a, int8 *array) { _mm_store_si128((__m128i *) array, a.s); }
-
-inline int8_32 load_int8_32(const int8* mem)
-{
-    int8_32 simd;
-    simd.s = _mm256_load_si256((__m256i *) mem);
-
-    return simd;
-}
-
-inline int8_32 init_int8_32(const int8* mem)
-{
-    int8_32 simd;
-    simd.s = _mm256_set_epi8(
-        mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7],
-        mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15],
-        mem[16], mem[17], mem[18], mem[19], mem[20], mem[21], mem[22], mem[23],
-        mem[24], mem[25], mem[26], mem[27], mem[28], mem[29], mem[30], mem[31]
-    );
-
-    return simd;
-}
-
-inline void unload_int8_32(int8_32 a, int8 *array) { _mm256_store_si256((__m256i *) array, a.s); }
-
-inline int8_64 load_int8_64(const int8* mem)
-{
-    int8_64 simd;
-    simd.s = _mm512_load_si512((__m512i *) mem);
-
-    return simd;
-}
-
-inline int8_64 init_int8_64(const int8* mem)
-{
-    int8_64 simd;
-    simd.s = _mm512_set_epi8(
-        mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7],
-        mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15],
-        mem[16], mem[17], mem[18], mem[19], mem[20], mem[21], mem[22], mem[23],
-        mem[24], mem[25], mem[26], mem[27], mem[28], mem[29], mem[30], mem[31],
-        mem[32], mem[33], mem[34], mem[35], mem[36], mem[37], mem[38], mem[39],
-        mem[40], mem[41], mem[42], mem[43], mem[44], mem[45], mem[46], mem[47],
-        mem[48], mem[49], mem[50], mem[51], mem[52], mem[53], mem[54], mem[55],
-        mem[56], mem[57], mem[58], mem[59], mem[60], mem[61], mem[62], mem[63]
-    );
-
-    return simd;
-}
-
-inline void unload_int8_64(int8_64 a, int8 *array) { _mm512_storeu_epi8(array, a.s); }
-
-inline int8_16 init_zero_int8_16()
-{
-    int8_16 simd;
-    simd.s = _mm_setzero_si128();
-
-    return simd;
-}
-
-inline int8_32 init_zero_int8_32()
-{
-    int8_32 simd;
-    simd.s = _mm256_setzero_si256();
-
-    return simd;
-}
-
-inline int8_64 init_zero_int8_64()
-{
-    int8_64 simd;
-    simd.s = _mm512_setzero_si512();
-
-    return simd;
-}
-
-inline int8_16 init_value_int8_16(int8 value)
-{
-    int8_16 simd;
-    simd.s = _mm_set1_epi8(value);
-
-    return simd;
-}
-
-inline int8_32 init_value_int8_32(int8 value)
-{
-    int8_32 simd;
-    simd.s = _mm256_set1_epi8(value);
-
-    return simd;
-}
-
-inline int8_64 init_value_int8_64(int8 value)
-{
-    int8_64 simd;
-    simd.s = _mm512_set1_epi8(value);
-
-    return simd;
-}
-
-inline
-f32_4 int8_16_to_f32_4(int8_16 a)
-{
-    f32_4 result;
-    result.s = _mm_cvtepi32_ps(a.s);
-
-    return result;
-}
-
-inline
-f32_8 int8_16_to_f32_8(int8_16 a)
-{
-    f32_8 result;
-    result.s = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(a.s));
-
-    return result;
-}
-
-inline
-f32_16 int8_16_to_f32_16(int8_16 a)
-{
-    f32_16 result;
-    result.s = _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(a.s));
-
-    return result;
-}
-
-inline int8_16 operator+(int8_16 a, int8_16 b)
-{
-    int8_16 simd;
-    simd.s = _mm_add_epi8(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_32 operator+(int8_32 a, int8_32 b)
-{
-    int8_32 simd;
-    simd.s = _mm256_add_epi8(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_64 operator+(int8_64 a, int8_64 b)
-{
-    int8_64 simd;
-    simd.s = _mm512_add_epi8(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_16 operator-(int8_16 a, int8_16 b)
-{
-    int8_16 simd;
-    simd.s = _mm_sub_epi8(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_16 operator-(int8_16 a) { return init_zero_int8_16() - a; }
-
-inline int8_32 operator-(int8_32 a, int8_32 b)
-{
-    int8_32 simd;
-    simd.s = _mm256_sub_epi8(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_32 operator-(int8_32 a) { return init_zero_int8_32() - a; }
-
-inline int8_64 operator-(int8_64 a, int8_64 b)
-{
-    int8_64 simd;
-    simd.s = _mm512_sub_epi8(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_64 operator-(int8_64 a) { return init_zero_int8_64() - a; }
-
-inline int8_16 operator*(int8_16 a, int8_16 b)
-{
-    int8_16 simd;
-    simd.s = _mm_mul_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_32 operator*(int8_32 a, int8_32 b)
-{
-    int8_32 simd;
-    simd.s = _mm256_mul_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_64 operator*(int8_64 a, int8_64 b)
-{
-    int8_64 simd;
-    simd.s = _mm512_mul_epi32(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_16 operator^(int8_16 a, int8_16 b)
-{
-    int8_16 simd;
-    simd.s = _mm_xor_si128(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_32 operator^(int8_32 a, int8_32 b)
-{
-    int8_32 simd;
-    simd.s = _mm256_xor_si256(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_64 operator^(int8_64 a, int8_64 b)
-{
-    int8_64 simd;
-    simd.s = _mm512_xor_si512(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_16 &operator-=(int8_16 &a, int8_16 b)
-{
-    a = a - b;
-
-    return a;
-}
-
-inline int8_32 &operator-=(int8_32 &a, int8_32 b)
-{
-    a = a - b;
-
-    return a;
-}
-
-inline int8_64 &operator-=(int8_64 &a, int8_64 b)
-{
-    a = a - b;
-
-    return a;
-}
-
-inline int8_16 &operator+=(int8_16 &a, int8_16 b)
-{
-    a = a + b;
-
-    return a;
-}
-
-inline int8_32 &operator+=(int8_32 &a, int8_32 b)
-{
-    a = a + b;
-
-    return a;
-}
-
-inline int8_64 &operator+=(int8_64 &a, int8_64 b)
-{
-    a = a + b;
-
-    return a;
-}
-
-inline int8_16 &operator*=(int8_16 &a, int8_16 b)
-{
-    a = a * b;
-
-    return a;
-}
-
-inline int8_32 &operator*=(int8_32 &a, int8_32 b)
-{
-    a = a * b;
-
-    return a;
-}
-
-inline int8_64 &operator*=(int8_64 &a, int8_64 b)
-{
-    a = a * b;
-
-    return a;
-}
-
-inline int8_16 &operator^=(int8_16 &a, int8_16 b)
-{
-    a = a ^ b;
-
-    return a;
-}
-
-inline int8_32 &operator^=(int8_32 &a, int8_32 b)
-{
-    a = a ^ b;
-
-    return a;
-}
-
-inline int8_64 &operator^=(int8_64 &a, int8_64 b)
-{
-    a = a ^ b;
-
-    return a;
-}
-
-inline int8_16 operator<(int8_16 a, int8_16 b)
-{
-    int8_16 simd;
-    simd.s = _mm_cmplt_epi8(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_32 operator<(int8_32 a, int8_32 b)
-{
-    int8_32 simd;
-    simd.s = _mm256_xor_si256(_mm256_cmpgt_epi8(a.s, b.s), _mm256_set1_epi8(-1));
-
-    return simd;
-}
-
-inline int8_64 operator<(int8_64 a, int8_64 b)
-{
-    int8_64 simd;
-    simd.s = _mm512_mask_blend_epi8(_mm512_cmplt_epi8_mask(a.s, b.s), a.s, b.s);
-
-    return simd;
-}
-
-inline int8_16 operator<=(int8_16 a, int8_16 b)
-{
-    int8_16 simd;
-    simd.s = _mm_andnot_si128(_mm_cmplt_epi8(b.s, a.s), _mm_set1_epi8(-1));
-
-    return simd;
-}
-
-inline int8_32 operator<=(int8_32 a, int8_32 b)
-{
-    int8_32 simd;
-    simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi8(a.s, b.s), _mm256_set1_epi8(-1));
-
-    return simd;
-}
-
-inline int8_64 operator<=(int8_64 a, int8_64 b)
-{
-    int8_64 simd;
-    simd.s = _mm512_mask_blend_epi8(_mm512_cmple_epi8_mask(a.s, b.s), b.s, a.s);
-
-    return simd;
-}
-
-inline int8_16 operator>(int8_16 a, int8_16 b)
-{
-    int8_16 simd;
-    simd.s = _mm_cmpgt_epi8(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_32 operator>(int8_32 a, int8_32 b)
-{
-    int8_32 simd;
-    simd.s = _mm256_cmpgt_epi8(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_64 operator>(int8_64 a, int8_64 b)
-{
-    int8_64 simd;
-    simd.s = _mm512_mask_blend_epi8(_mm512_cmpgt_epi8_mask(a.s, b.s), a.s, b.s);
-
-    return simd;
-}
-
-inline int8_16 operator>=(int8_16 a, int8_16 b)
-{
-    int8_16 simd;
-    simd.s = _mm_andnot_si128(_mm_cmplt_epi8(a.s, b.s), _mm_set1_epi8(-1));
-
-    return simd;
-}
-
-inline int8_32 operator>=(int8_32 a, int8_32 b)
-{
-    int8_32 simd;
-    simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi8(b.s, a.s), _mm256_set1_epi8(-1));
-
-    return simd;
-}
-
-inline int8_64 operator>=(int8_64 a, int8_64 b)
-{
-    int8_64 simd;
-    simd.s = _mm512_mask_blend_epi8(_mm512_cmpge_epi8_mask(a.s, b.s), a.s, b.s);
-
-    return simd;
-}
-
-inline int8_16 operator==(int8_16 a, int8_16 b)
-{
-    int8_16 simd;
-    simd.s = _mm_cmpeq_epi8(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_32 operator==(int8_32 a, int8_32 b)
-{
-    int8_32 simd;
-    simd.s = _mm256_cmpeq_epi8(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_64 operator==(int8_64 a, int8_64 b)
-{
-    int8_64 simd;
-    simd.s = _mm512_mask_blend_epi8(_mm512_cmpeq_epi8_mask(a.s, b.s), a.s, b.s);
-
-    return simd;
-}
-
-inline int8_16 operator!=(int8_16 a, int8_16 b)
-{
-    int8_16 simd;
-    simd.s = _mm_andnot_si128(_mm_cmpeq_epi8(a.s, b.s), _mm_set1_epi8(-1));
-
-    return simd;
-}
-
-inline int8_32 operator!=(int8_32 a, int8_32 b)
-{
-    int8_32 simd;
-    simd.s = _mm256_mask_blend_epi8(_mm256_cmp_epi8_mask(a.s, b.s, _MM_CMPINT_NE), a.s, b.s);
-
-    return simd;
-}
-
-inline int8_64 operator!=(int8_64 a, int8_64 b)
-{
-    int8_64 simd;
-    simd.s = _mm512_mask_blend_epi8(_mm512_cmp_epi8_mask(a.s, b.s, _MM_CMPINT_NE), a.s, b.s);
-
-    return simd;
-}
-
-inline int8_16 operator&(int8_16 a, int8_16 b)
-{
-    int8_16 simd;
-    simd.s = _mm_and_si128(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_32 operator&(int8_32 a, int8_32 b)
-{
-    int8_32 simd;
-    simd.s = _mm256_and_si256(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_64 operator&(int8_64 a, int8_64 b)
-{
-    int8_64 simd;
-    simd.s = _mm512_and_si512(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_16 operator|(int8_16 a, int8_16 b)
-{
-    int8_16 simd;
-    simd.s = _mm_or_si128(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_32 operator|(int8_32 a, int8_32 b)
-{
-    int8_32 simd;
-    simd.s = _mm256_or_si256(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_64 operator|(int8_64 a, int8_64 b)
-{
-    int8_64 simd;
-    simd.s = _mm512_or_si512(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_16 &operator&=(int8_16 &a, int8_16 b)
-{
-    a = a & b;
-
-    return a;
-}
-
-inline int8_32 &operator&=(int8_32 &a, int8_32 b)
-{
-    a = a & b;
-
-    return a;
-}
-
-inline int8_64 &operator&=(int8_64 &a, int8_64 b)
-{
-    a = a & b;
-
-    return a;
-}
-
-inline int8_16 &operator|=(int8_16 &a, int8_16 b)
-{
-    a = a | b;
-
-    return a;
-}
-
-inline int8_32 &operator|=(int8_32 &a, int8_32 b)
-{
-    a = a | b;
-
-    return a;
-}
-
-inline int8_64 &operator|=(int8_64 &a, int8_64 b)
-{
-    a = a | b;
-
-    return a;
-}
-
-inline int8_16 abs(int8_16 a)
-{
-    int8_16 simd;
-    simd.s = _mm_abs_epi8(a.s);
-
-    return simd;
-}
-
-inline int8_32 abs(int8_32 a)
-{
-    int8_32 simd;
-    simd.s = _mm256_abs_epi16(a.s);
-
-    return simd;
-}
-
-inline int8_64 abs(int8_64 a)
-{
-    int8_64 simd;
-    simd.s = _mm512_abs_epi16(a.s);
-
-    return simd;
-}
-
-inline int8_16 simd_min(int8_16 a, int8_16 b)
-{
-    int8_16 simd;
-    simd.s = _mm_min_epi8(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_32 simd_min(int8_32 a, int8_32 b)
-{
-    int8_32 simd;
-    simd.s = _mm256_min_epi8(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_64 simd_min(int8_64 a, int8_64 b)
-{
-    int8_64 simd;
-    simd.s = _mm512_min_epi8(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_16 simd_max(int8_16 a, int8_16 b)
-{
-    int8_16 simd;
-    simd.s = _mm_max_epi8(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_32 simd_max(int8_32 a, int8_32 b)
-{
-    int8_32 simd;
-    simd.s = _mm256_max_epi8(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_64 simd_max(int8_64 a, int8_64 b)
-{
-    int8_64 simd;
-    simd.s = _mm512_max_epi8(a.s, b.s);
-
-    return simd;
-}
-
-inline int8_16 clamp(int8_16 min_value, int8_16 a, int8_16 max_value)
-{
-    return simd_min(simd_max(a, min_value), max_value);
-}
-
-inline int8_32 clamp(int8_32 min_value, int8_32 a, int8_32 max_value)
-{
-    return simd_min(simd_max(a, min_value), max_value);
-}
-
-inline int8_64 clamp(int8_64 min_value, int8_64 a, int8_64 max_value)
-{
-    return simd_min(simd_max(a, min_value), max_value);
-}
-
-inline int32 which_true(int8_16 a)
-{
-    int32 which_true = _mm_movemask_epi8(a.s);
-
-    return which_true;
-}
-
-inline int32 which_true(int8_32 a)
-{
-    int32 which_true = _mm256_movemask_epi8(a.s);
-
-    return which_true;
-}
-
-inline int64 which_true(int8_64 a)
-{
-    int64 which_true = _mm512_movepi8_mask(a.s);
-
-    return which_true;
-}
-
-inline bool any_true(int8_16 a)
-{
-    bool is_any_true = _mm_movemask_epi8(a.s) > 0;
-
-    return is_any_true;
-}
-
-inline bool any_true(int8_32 a)
-{
-    bool is_any_true = _mm256_movemask_epi8(a.s) > 0;
-
-    return is_any_true;
-}
-
-inline bool any_true(int8_64 a)
-{
-    bool is_any_true = _mm512_movepi8_mask(a.s) > 0;
-
-    return is_any_true;
-}
-
-inline bool all_true(int8_16 a)
-{
-    bool is_true = _mm_movemask_epi8(a.s) == 15;
-
-    return is_true;
-}
-
-inline bool all_true(int8_32 a)
-{
-    bool is_true = _mm256_movemask_epi8(a.s) == 255;
-
-    return is_true;
-}
-
-inline bool all_true(int8_64 a)
-{
-    bool is_true = _mm512_movepi8_mask(a.s) == 65535;
-
-    return is_true;
-}
-
-inline bool all_false(int8_16 a)
-{
-    bool is_false = _mm_movemask_epi8(a.s) == 0;
-
-    return is_false;
-}
-
-inline bool all_false(int8_32 a)
-{
-    bool is_false = _mm256_movemask_epi8(a.s) == 0;
-
-    return is_false;
-}
-
-inline bool all_false(int8_64 a)
-{
-    // @todo This can be optimized (requires also changes in the comparison functions return)
-    bool is_false = _mm512_movepi8_mask(a.s) == 0;
-
-    return is_false;
-}
-
-// @todo from down here we can optimize some of the code by NOT using the wrappers
-//      the code is self contained and we could use te intrinsic functions directly
-
-/*
-inline
-f32 simd_mult(const int8* a, f32 b, int32 size, int32 steps)
-{
-    if (steps == 16) {
-        __m512i a_16 = _mm512_load_si512((__m512i *) a);
-        __m512 af_16 = _mm512_cvtepi32_ps(a_16);
-        __m512 b_16 = _mm512_set1_ps(b);
-
-        __m512 result = _mm512_mul_ps(af_16, b_16);
-    } else if (steps == 8) {
-        __m256i a_8 = _mm256_load_si256((__m256i *) a);
-        __m256 af_8 = _mm256_cvtepi32_ps(a_8);
-        __m256 b_8 = _mm256_set1_ps(b);
-
-        __m256 result = _mm256_mul_ps(af_8, b_8);
-    } else if (steps == 4) {
-        __m128i a_4 = _mm_load_si128((__m128i *) a);
-        __m128 af_4 = _mm_cvtepi32_ps(a_4);
-        __m128 b_4 = _mm_set1_ps(b);
-
-        __m128 result = _mm_mul_ps(af_4, b_4);
-    } else {
-
-    }
-}
-*/
-
-bool simd_compare_64(const byte* a, const byte* b)
-{
-    __m256i chunk1_a = _mm256_load_si256((__m256i*) a);
-    __m256i chunk1_b = _mm256_load_si256((__m256i*) b);
-
-    __m256i chunk2_a = _mm256_load_si256((__m256i*) (a + 32));
-    __m256i chunk2_b = _mm256_load_si256((__m256i*) (b + 32));
-
-    __m256i result1 = _mm256_cmpeq_epi8(chunk1_a, chunk1_b);
-    __m256i result2 = _mm256_cmpeq_epi8(chunk2_a, chunk2_b);
-
-    __m256i combined = _mm256_and_si256(result1, result2);
-
-    return _mm256_testc_si256(combined, _mm256_set1_epi8(-1)) != 1;
-}
-
-int simd_compare(const byte* a, const byte* b, uint32 size, uint32 steps = 8) {
+int simd_equal(const byte* a, const byte* b, uint32 size, uint32 steps = 8) {
     uint32 i = 0;
+    steps = intrin_validate_steps((const byte*) a, steps);
+    steps = intrin_validate_steps((const byte*) b, steps);
 
-    if (steps == 16) {
-        if (size >= 128) {
-            __m512i a_16;
-            __m512i b_16;
-            __mmask64 result_mask;
+    #ifdef MACRO_CPU_FEATURE_AVX512
+        if (steps >= 16) {
+            steps = 16;
+            if (size >= 128) {
+                __m512i a_16;
+                __m512i b_16;
+                __mmask64 result_mask;
 
-            for (; i <= size - 64; i += 64) {  // 64 bytes per iteration
-                a_16 = _mm512_load_si512((__m512i*) a);
-                b_16 = _mm512_load_si512((__m512i*) b);
+                for (; i <= size - 64; i += 64) {  // 64 bytes per iteration
+                    a_16 = _mm512_load_si512((__m512i*) a);
+                    b_16 = _mm512_load_si512((__m512i*) b);
 
-                result_mask = _mm512_cmpeq_epi8_mask(a_16, b_16);
+                    result_mask = _mm512_cmpeq_epi8_mask(a_16, b_16);
 
-                if (result_mask != 0xFFFFFFFFFFFFFFFF) {
-                    return false;
+                    if (result_mask != 0xFFFFFFFFFFFFFFFF) {
+                        return false;
+                    }
+
+                    a += 64;
+                    b += 64;
                 }
+            }
 
-                a += 64;
-                b += 64;
+            if (size - i >= 64) {
+                return simd_equal(a, b, size - i, 8);
+            } else if (size - i >= 32) {
+                return simd_equal(a, b, size - i, 4);
             }
         }
+    #endif
 
-        if (size - i >= 64) {
-            return simd_compare(a, b, size - i, 8);
-        } else if (size - i >= 32) {
-            return simd_compare(a, b, size - i, 4);
-        }
-    } else if (steps == 8) {
-        if (size >= 64) {
-            __m256i a_8;
-            __m256i b_8;
-            __m256i result_8;
+    #ifdef MACRO_CPU_FEATURE_AVX2
+        if (steps >= 8) {
+            steps = 8;
+            if (size >= 64) {
+                __m256i a_8;
+                __m256i b_8;
+                __m256i result_8;
 
-            for (; i <= size - steps; i += steps) {
-                a_8 = _mm256_load_si256((__m256i*) a);
-                b_8 = _mm256_load_si256((__m256i*) b);
+                for (; i <= size - steps; i += steps) {
+                    a_8 = _mm256_load_si256((__m256i*) a);
+                    b_8 = _mm256_load_si256((__m256i*) b);
 
-                result_8 = _mm256_cmpeq_epi8(a_8, b_8);
+                    result_8 = _mm256_cmpeq_epi8(a_8, b_8);
 
-                if (_mm256_testc_si256(result_8, _mm256_set1_epi8(-1)) != 1) {
-                    return false;
+                    if (_mm256_testc_si256(result_8, _mm256_set1_epi8(-1)) != 1) {
+                        return false;
+                    }
+
+                    a += steps;
+                    b += steps;
                 }
+            }
 
-                a += steps;
-                b += steps;
+            if (size - i >= 32) {
+                return simd_equal(a, b, size - i, 4);
             }
         }
+    #endif
 
-        if (size - i >= 32) {
-            return simd_compare(a, b, size - i, 4);
-        }
-    } else if (steps == 4) {
-        if (size >= 16) {
-            __m128i a_4;
-            __m128i b_4;
-            __m128i result_4;
+    #ifdef MACRO_CPU_FEATURE_SSE42
+        if (steps >= 4) {
+            steps = 4;
+            if (size >= 16) {
+                __m128i a_4;
+                __m128i b_4;
+                __m128i result_4;
 
-            for (; i <= size - steps; i += steps) {
-                a_4 = _mm_load_si128((__m128i*) a);
-                b_4 = _mm_load_si128((__m128i*) b);
+                for (; i <= size - steps; i += steps) {
+                    a_4 = _mm_load_si128((__m128i*) a);
+                    b_4 = _mm_load_si128((__m128i*) b);
 
-                result_4 = _mm_cmpeq_epi8(a_4, b_4);
+                    result_4 = _mm_cmpeq_epi8(a_4, b_4);
 
-                if (_mm_movemask_epi8(result_4) != 0xFFFF) {
-                    return false;
+                    if (_mm_movemask_epi8(result_4) != 0xFFFF) {
+                        return false;
+                    }
+
+                    a += steps;
+                    b += steps;
                 }
-
-                a += steps;
-                b += steps;
             }
         }
-    }
+    #endif
 
     for (; i < size; ++i) {
         if (*a++ != *b++) {
diff --git a/architecture/x86/simd/SIMD_I8_AVX2.h b/architecture/x86/simd/SIMD_I8_AVX2.h
new file mode 100644
index 0000000..462beaa
--- /dev/null
+++ b/architecture/x86/simd/SIMD_I8_AVX2.h
@@ -0,0 +1,265 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_TOS_STDLIB_SIMD_I8_AVX2_H
+#define COMS_TOS_STDLIB_SIMD_I8_AVX2_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+#include "../../../stdlib/Types.h"
+
+struct int8_32 {
+    union {
+        #if ARM
+            svint8_t s;
+        #else
+            __m256i s;
+        #endif
+
+        int8 v[32];
+    };
+};
+
+inline int8_32 load_int8_32(const int8* mem)
+{
+    int8_32 simd;
+    simd.s = _mm256_load_si256((__m256i *) mem);
+
+    return simd;
+}
+
+inline int8_32 init_int8_32(const int8* mem)
+{
+    int8_32 simd;
+    simd.s = _mm256_set_epi8(
+        mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7],
+        mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15],
+        mem[16], mem[17], mem[18], mem[19], mem[20], mem[21], mem[22], mem[23],
+        mem[24], mem[25], mem[26], mem[27], mem[28], mem[29], mem[30], mem[31]
+    );
+
+    return simd;
+}
+
+inline void unload_int8_32(int8_32 a, int8 *array) { _mm256_store_si256((__m256i *) array, a.s); }
+
+inline int8_32 init_zero_int8_32()
+{
+    int8_32 simd;
+    simd.s = _mm256_setzero_si256();
+
+    return simd;
+}
+
+inline int8_32 init_value_int8_32(int8 value)
+{
+    int8_32 simd;
+    simd.s = _mm256_set1_epi8(value);
+
+    return simd;
+}
+
+inline int8_32 operator+(int8_32 a, int8_32 b)
+{
+    int8_32 simd;
+    simd.s = _mm256_add_epi8(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_32 operator-(int8_32 a, int8_32 b)
+{
+    int8_32 simd;
+    simd.s = _mm256_sub_epi8(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_32 operator-(int8_32 a) { return init_zero_int8_32() - a; }
+
+inline int8_32 operator*(int8_32 a, int8_32 b)
+{
+    // x86 has no 8-bit multiply: multiply even/odd bytes as 16-bit lanes and keep the low byte of each product (wraps)
+    const __m256i odd = _mm256_mullo_epi16(_mm256_srli_epi16(a.s, 8), _mm256_srli_epi16(b.s, 8));
+    a.s = _mm256_or_si256(_mm256_slli_epi16(odd, 8), _mm256_and_si256(_mm256_mullo_epi16(a.s, b.s), _mm256_set1_epi16(0x00FF)));
+    return a;
+}
+
+inline int8_32 operator^(int8_32 a, int8_32 b)
+{
+    int8_32 simd;
+    simd.s = _mm256_xor_si256(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_32 &operator-=(int8_32 &a, int8_32 b)
+{
+    a = a - b;
+
+    return a;
+}
+
+inline int8_32 &operator+=(int8_32 &a, int8_32 b)
+{
+    a = a + b;
+
+    return a;
+}
+
+inline int8_32 &operator*=(int8_32 &a, int8_32 b)
+{
+    a = a * b;
+
+    return a;
+}
+
+inline int8_32 &operator^=(int8_32 &a, int8_32 b)
+{
+    a = a ^ b;
+
+    return a;
+}
+
+inline int8_32 operator<(int8_32 a, int8_32 b)
+{
+    int8_32 simd;
+    // AVX2 only offers a signed "greater than": a < b is evaluated as b > a; true lanes are 0xFF, false lanes 0x00
+    simd.s = _mm256_cmpgt_epi8(b.s, a.s);
+    return simd;
+}
+
+inline int8_32 operator<=(int8_32 a, int8_32 b)
+{
+    int8_32 simd;
+    simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi8(a.s, b.s), _mm256_set1_epi8(-1));
+
+    return simd;
+}
+
+inline int8_32 operator>(int8_32 a, int8_32 b)
+{
+    int8_32 simd;
+    simd.s = _mm256_cmpgt_epi8(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_32 operator>=(int8_32 a, int8_32 b)
+{
+    int8_32 simd;
+    simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi8(b.s, a.s), _mm256_set1_epi8(-1));
+
+    return simd;
+}
+
+inline int8_32 operator==(int8_32 a, int8_32 b)
+{
+    int8_32 simd;
+    simd.s = _mm256_cmpeq_epi8(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_32 operator!=(int8_32 a, int8_32 b)
+{
+    int8_32 simd;
+    // Invert the equality mask with plain AVX2 (mask/blend intrinsics need AVX-512BW/VL); true lanes are 0xFF
+    simd.s = _mm256_andnot_si256(_mm256_cmpeq_epi8(a.s, b.s), _mm256_set1_epi8(-1));
+    return simd;
+}
+
+inline int8_32 operator&(int8_32 a, int8_32 b)
+{
+    int8_32 simd;
+    simd.s = _mm256_and_si256(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_32 operator|(int8_32 a, int8_32 b)
+{
+    int8_32 simd;
+    simd.s = _mm256_or_si256(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_32 &operator&=(int8_32 &a, int8_32 b)
+{
+    a = a & b;
+
+    return a;
+}
+
+inline int8_32 &operator|=(int8_32 &a, int8_32 b)
+{
+    a = a | b;
+
+    return a;
+}
+
+inline int8_32 abs(int8_32 a)
+{
+    int8_32 simd;
+    // Per-byte absolute value; the 16-bit variant would treat each byte pair as a single lane
+    simd.s = _mm256_abs_epi8(a.s);
+    return simd;
+}
+
+inline int8_32 simd_min(int8_32 a, int8_32 b)
+{
+    int8_32 simd;
+    simd.s = _mm256_min_epi8(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_32 simd_max(int8_32 a, int8_32 b)
+{
+    int8_32 simd;
+    simd.s = _mm256_max_epi8(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_32 clamp(int8_32 min_value, int8_32 a, int8_32 max_value)
+{
+    return simd_min(simd_max(a, min_value), max_value);
+}
+
+inline int32 which_true(int8_32 a)
+{
+    int32 which_true = _mm256_movemask_epi8(a.s);
+
+    return which_true;
+}
+
+inline bool any_true(int8_32 a)
+{
+    // The movemask holds one bit per byte; bit 31 makes the int negative, so compare against zero instead of "> 0"
+    bool is_any_true = _mm256_movemask_epi8(a.s) != 0;
+    return is_any_true;
+}
+
+inline bool all_true(int8_32 a)
+{
+    // One mask bit per byte: all 32 lanes true means all 32 bits are set, which is -1 as a signed int
+    bool is_true = _mm256_movemask_epi8(a.s) == -1;
+    return is_true;
+}
+
+inline bool all_false(int8_32 a)
+{
+    bool is_false = _mm256_movemask_epi8(a.s) == 0;
+
+    return is_false;
+}
+
+#endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_I8_AVX512.h b/architecture/x86/simd/SIMD_I8_AVX512.h
new file mode 100644
index 0000000..a14047d
--- /dev/null
+++ b/architecture/x86/simd/SIMD_I8_AVX512.h
@@ -0,0 +1,270 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_TOS_STDLIB_SIMD_I8_AVX512_H
+#define COMS_TOS_STDLIB_SIMD_I8_AVX512_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+#include "../../../stdlib/Types.h"
+
+struct int8_64 {
+    union {
+        #if ARM
+            svint8_t s;
+        #else
+            __m512i s;
+        #endif
+
+        int8 v[64];
+    };
+};
+
+inline int8_64 load_int8_64(const int8* mem)
+{
+    int8_64 simd;
+    simd.s = _mm512_load_si512((__m512i *) mem);
+
+    return simd;
+}
+
+inline int8_64 init_int8_64(const int8* mem)
+{
+    int8_64 simd;
+    simd.s = _mm512_set_epi8(
+        mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7],
+        mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15],
+        mem[16], mem[17], mem[18], mem[19], mem[20], mem[21], mem[22], mem[23],
+        mem[24], mem[25], mem[26], mem[27], mem[28], mem[29], mem[30], mem[31],
+        mem[32], mem[33], mem[34], mem[35], mem[36], mem[37], mem[38], mem[39],
+        mem[40], mem[41], mem[42], mem[43], mem[44], mem[45], mem[46], mem[47],
+        mem[48], mem[49], mem[50], mem[51], mem[52], mem[53], mem[54], mem[55],
+        mem[56], mem[57], mem[58], mem[59], mem[60], mem[61], mem[62], mem[63]
+    );
+
+    return simd;
+}
+
+inline void unload_int8_64(int8_64 a, int8 *array) { _mm512_storeu_epi8(array, a.s); }
+
+inline int8_64 init_zero_int8_64()
+{
+    int8_64 simd;
+    simd.s = _mm512_setzero_si512();
+
+    return simd;
+}
+
+inline int8_64 init_value_int8_64(int8 value)
+{
+    int8_64 simd;
+    simd.s = _mm512_set1_epi8(value);
+
+    return simd;
+}
+
+inline int8_64 operator+(int8_64 a, int8_64 b)
+{
+    int8_64 simd;
+    simd.s = _mm512_add_epi8(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_64 operator-(int8_64 a, int8_64 b)
+{
+    int8_64 simd;
+    simd.s = _mm512_sub_epi8(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_64 operator-(int8_64 a) { return init_zero_int8_64() - a; }
+
+inline int8_64 operator*(int8_64 a, int8_64 b)
+{
+    // x86 has no 8-bit multiply: multiply even/odd bytes as 16-bit lanes and keep the low byte of each product (wraps)
+    const __m512i odd = _mm512_mullo_epi16(_mm512_srli_epi16(a.s, 8), _mm512_srli_epi16(b.s, 8));
+    a.s = _mm512_or_si512(_mm512_slli_epi16(odd, 8), _mm512_and_si512(_mm512_mullo_epi16(a.s, b.s), _mm512_set1_epi16(0x00FF)));
+    return a;
+}
+
+inline int8_64 operator^(int8_64 a, int8_64 b)
+{
+    int8_64 simd;
+    simd.s = _mm512_xor_si512(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_64 &operator-=(int8_64 &a, int8_64 b)
+{
+    a = a - b;
+
+    return a;
+}
+
+inline int8_64 &operator+=(int8_64 &a, int8_64 b)
+{
+    a = a + b;
+
+    return a;
+}
+
+inline int8_64 &operator*=(int8_64 &a, int8_64 b)
+{
+    a = a * b;
+
+    return a;
+}
+
+inline int8_64 &operator^=(int8_64 &a, int8_64 b)
+{
+    a = a ^ b;
+
+    return a;
+}
+
+inline int8_64 operator<(int8_64 a, int8_64 b)
+{
+    int8_64 simd;
+    // Expand the k-mask to 0xFF/0x00 bytes (blending a/b would return operand data, not a truth mask)
+    simd.s = _mm512_movm_epi8(_mm512_cmplt_epi8_mask(a.s, b.s));
+    return simd;
+}
+
+inline int8_64 operator<=(int8_64 a, int8_64 b)
+{
+    int8_64 simd;
+    // Expand the k-mask to 0xFF/0x00 bytes (blending a/b would return operand data, not a truth mask)
+    simd.s = _mm512_movm_epi8(_mm512_cmple_epi8_mask(a.s, b.s));
+    return simd;
+}
+
+inline int8_64 operator>(int8_64 a, int8_64 b)
+{
+    int8_64 simd;
+    // Expand the k-mask to 0xFF/0x00 bytes (blending a/b would return operand data, not a truth mask)
+    simd.s = _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(a.s, b.s));
+    return simd;
+}
+
+inline int8_64 operator>=(int8_64 a, int8_64 b)
+{
+    int8_64 simd;
+    // Expand the k-mask to 0xFF/0x00 bytes (blending a/b would return operand data, not a truth mask)
+    simd.s = _mm512_movm_epi8(_mm512_cmpge_epi8_mask(a.s, b.s));
+    return simd;
+}
+
+inline int8_64 operator==(int8_64 a, int8_64 b)
+{
+    int8_64 simd;
+    // Expand the k-mask to 0xFF/0x00 bytes (blending a/b would return operand data, not a truth mask)
+    simd.s = _mm512_movm_epi8(_mm512_cmpeq_epi8_mask(a.s, b.s));
+    return simd;
+}
+
+inline int8_64 operator!=(int8_64 a, int8_64 b)
+{
+    int8_64 simd;
+    // Expand the k-mask to 0xFF/0x00 bytes (blending a/b would return operand data, not a truth mask)
+    simd.s = _mm512_movm_epi8(_mm512_cmp_epi8_mask(a.s, b.s, _MM_CMPINT_NE));
+    return simd;
+}
+
+inline int8_64 operator&(int8_64 a, int8_64 b)
+{
+    int8_64 simd;
+    simd.s = _mm512_and_si512(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_64 operator|(int8_64 a, int8_64 b)
+{
+    int8_64 simd;
+    simd.s = _mm512_or_si512(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_64 &operator&=(int8_64 &a, int8_64 b)
+{
+    a = a & b;
+
+    return a;
+}
+
+inline int8_64 &operator|=(int8_64 &a, int8_64 b)
+{
+    a = a | b;
+
+    return a;
+}
+
+inline int8_64 abs(int8_64 a)
+{
+    int8_64 simd;
+    // Per-byte absolute value; the 16-bit variant would treat each byte pair as a single lane
+    simd.s = _mm512_abs_epi8(a.s);
+    return simd;
+}
+
+inline int8_64 simd_min(int8_64 a, int8_64 b)
+{
+    int8_64 simd;
+    simd.s = _mm512_min_epi8(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_64 simd_max(int8_64 a, int8_64 b)
+{
+    int8_64 simd;
+    simd.s = _mm512_max_epi8(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_64 clamp(int8_64 min_value, int8_64 a, int8_64 max_value)
+{
+    return simd_min(simd_max(a, min_value), max_value);
+}
+
+inline int64 which_true(int8_64 a)
+{
+    int64 which_true = _mm512_movepi8_mask(a.s);
+
+    return which_true;
+}
+
+inline bool any_true(int8_64 a)
+{
+    bool is_any_true = _mm512_movepi8_mask(a.s) > 0;
+
+    return is_any_true;
+}
+
+inline bool all_true(int8_64 a)
+{
+    // One mask bit per byte: all 64 lanes true means every bit of the 64-bit mask is set
+    bool is_true = _mm512_movepi8_mask(a.s) == 0xFFFFFFFFFFFFFFFFULL;
+    return is_true;
+}
+
+inline bool all_false(int8_64 a)
+{
+    // @todo This can be optimized (requires also changes in the comparison functions return)
+    bool is_false = _mm512_movepi8_mask(a.s) == 0;
+
+    return is_false;
+}
+
+#endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_I8_SSE.h b/architecture/x86/simd/SIMD_I8_SSE.h
new file mode 100644
index 0000000..e676bc6
--- /dev/null
+++ b/architecture/x86/simd/SIMD_I8_SSE.h
@@ -0,0 +1,265 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_TOS_STDLIB_SIMD_I8_SSE_H
+#define COMS_TOS_STDLIB_SIMD_I8_SSE_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+#include "../../../stdlib/Types.h"
+
+struct int8_16 {
+    union {
+        #if ARM
+            svint8_t s;
+        #else
+            __m128i s;
+        #endif
+
+        int8 v[16];
+    };
+};
+
+inline int8_16 load_int8_16(const int8* mem)
+{
+    int8_16 simd;
+    simd.s = _mm_load_si128((__m128i *) mem);
+
+    return simd;
+}
+
+inline int8_16 init_int8_16(const int8* mem)
+{
+    int8_16 simd;
+    simd.s = _mm_set_epi8(
+        mem[0], mem[1], mem[2], mem[3],
+        mem[4], mem[5], mem[6], mem[7],
+        mem[8], mem[9], mem[10], mem[11],
+        mem[12], mem[13], mem[14], mem[15]
+    );
+
+    return simd;
+}
+
+inline void unload_int8_16(int8_16 a, int8 *array) { _mm_store_si128((__m128i *) array, a.s); }
+
+inline int8_16 init_zero_int8_16()
+{
+    int8_16 simd;
+    simd.s = _mm_setzero_si128();
+
+    return simd;
+}
+
+inline int8_16 init_value_int8_16(int8 value)
+{
+    int8_16 simd;
+    simd.s = _mm_set1_epi8(value);
+
+    return simd;
+}
+
+inline int8_16 operator+(int8_16 a, int8_16 b)
+{
+    int8_16 simd;
+    simd.s = _mm_add_epi8(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_16 operator-(int8_16 a, int8_16 b)
+{
+    int8_16 simd;
+    simd.s = _mm_sub_epi8(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_16 operator-(int8_16 a) { return init_zero_int8_16() - a; }
+
+inline int8_16 operator*(int8_16 a, int8_16 b)
+{
+    // x86 has no 8-bit multiply: multiply even/odd bytes as 16-bit lanes and keep the low byte of each product (wraps)
+    const __m128i odd = _mm_mullo_epi16(_mm_srli_epi16(a.s, 8), _mm_srli_epi16(b.s, 8));
+    a.s = _mm_or_si128(_mm_slli_epi16(odd, 8), _mm_and_si128(_mm_mullo_epi16(a.s, b.s), _mm_set1_epi16(0x00FF)));
+    return a;
+}
+
+inline int8_16 operator^(int8_16 a, int8_16 b)
+{
+    int8_16 simd;
+    simd.s = _mm_xor_si128(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_16 &operator-=(int8_16 &a, int8_16 b)
+{
+    a = a - b;
+
+    return a;
+}
+
+inline int8_16 &operator+=(int8_16 &a, int8_16 b)
+{
+    a = a + b;
+
+    return a;
+}
+
+inline int8_16 &operator*=(int8_16 &a, int8_16 b)
+{
+    a = a * b;
+
+    return a;
+}
+
+inline int8_16 &operator^=(int8_16 &a, int8_16 b)
+{
+    a = a ^ b;
+
+    return a;
+}
+
+inline int8_16 operator<(int8_16 a, int8_16 b)
+{
+    int8_16 simd;
+    simd.s = _mm_cmplt_epi8(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_16 operator<=(int8_16 a, int8_16 b)
+{
+    int8_16 simd;
+    simd.s = _mm_andnot_si128(_mm_cmplt_epi8(b.s, a.s), _mm_set1_epi8(-1));
+
+    return simd;
+}
+
+inline int8_16 operator>(int8_16 a, int8_16 b)
+{
+    int8_16 simd;
+    simd.s = _mm_cmpgt_epi8(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_16 operator>=(int8_16 a, int8_16 b)
+{
+    int8_16 simd;
+    simd.s = _mm_andnot_si128(_mm_cmplt_epi8(a.s, b.s), _mm_set1_epi8(-1));
+
+    return simd;
+}
+
+inline int8_16 operator==(int8_16 a, int8_16 b)
+{
+    int8_16 simd;
+    simd.s = _mm_cmpeq_epi8(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_16 operator!=(int8_16 a, int8_16 b)
+{
+    int8_16 simd;
+    simd.s = _mm_andnot_si128(_mm_cmpeq_epi8(a.s, b.s), _mm_set1_epi8(-1));
+
+    return simd;
+}
+
+inline int8_16 operator&(int8_16 a, int8_16 b)
+{
+    int8_16 simd;
+    simd.s = _mm_and_si128(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_16 operator|(int8_16 a, int8_16 b)
+{
+    int8_16 simd;
+    simd.s = _mm_or_si128(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_16 &operator&=(int8_16 &a, int8_16 b)
+{
+    a = a & b;
+
+    return a;
+}
+
+inline int8_16 &operator|=(int8_16 &a, int8_16 b)
+{
+    a = a | b;
+
+    return a;
+}
+
+inline int8_16 abs(int8_16 a)
+{
+    int8_16 simd;
+    simd.s = _mm_abs_epi8(a.s);
+
+    return simd;
+}
+
+inline int8_16 simd_min(int8_16 a, int8_16 b)
+{
+    int8_16 simd;
+    simd.s = _mm_min_epi8(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_16 simd_max(int8_16 a, int8_16 b)
+{
+    int8_16 simd;
+    simd.s = _mm_max_epi8(a.s, b.s);
+
+    return simd;
+}
+
+inline int8_16 clamp(int8_16 min_value, int8_16 a, int8_16 max_value)
+{
+    return simd_min(simd_max(a, min_value), max_value);
+}
+
+inline int32 which_true(int8_16 a)
+{
+    int32 which_true = _mm_movemask_epi8(a.s);
+
+    return which_true;
+}
+
+inline bool any_true(int8_16 a)
+{
+    bool is_any_true = _mm_movemask_epi8(a.s) > 0;
+
+    return is_any_true;
+}
+
+inline bool all_true(int8_16 a)
+{
+    // One mask bit per byte: all 16 lanes true means the low 16 bits of the movemask are set
+    bool is_true = _mm_movemask_epi8(a.s) == 0xFFFF;
+    return is_true;
+}
+
+inline bool all_false(int8_16 a)
+{
+    bool is_false = _mm_movemask_epi8(a.s) == 0;
+
+    return is_false;
+}
+
+#endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_SVML.h b/architecture/x86/simd/SIMD_SVML.h
index 022e658..83a5c10 100644
--- a/architecture/x86/simd/SIMD_SVML.h
+++ b/architecture/x86/simd/SIMD_SVML.h
@@ -9,160 +9,16 @@
 #ifndef COMS_STDLIB_SIMD_SVML_H
 #define COMS_STDLIB_SIMD_SVML_H
 
-#include <immintrin.h>
-#include <xmmintrin.h>
+#ifdef MACRO_CPU_FEATURE_SSE42
+    #include "SIMD_SVML_SSE.h"
+#endif
 
-#include "../../../stdlib/Types.h"
+#ifdef MACRO_CPU_FEATURE_AVX2
+    #include "SIMD_SVML_AVX2.h"
+#endif
 
-#if __linux__
-    #include "math.h"
-
-    inline __m128i _mm_div_epi32(__m128i a, __m128i b) {
-        alignas(16) int32 a_array[4], b_array[4], result[4];
-
-        _mm_storeu_si128((__m128i*) a_array, a);
-        _mm_storeu_si128((__m128i*) b_array, b);
-
-        for (int32 i = 0; i < 4; ++i) {
-            result[i] = a_array[i] / b_array[i];
-        }
-
-        return _mm_load_si128((__m128i*) result);
-    }
-
-    inline __m256i _mm256_div_epi32(__m256i a, __m256i b) {
-        alignas(32) int32 a_array[8], b_array[8], result[8];
-
-        _mm256_storeu_si256((__m256i*) a_array, a);
-        _mm256_storeu_si256((__m256i*) b_array, b);
-
-        for (int32 i = 0; i < 8; ++i) {
-            result[i] = a_array[i] / b_array[i];
-        }
-
-        return _mm256_load_si256((__m256i*) result);
-    }
-
-    inline __m512i _mm512_div_epi32(__m512i a, __m512i b) {
-        alignas(64) int32 a_array[16], b_array[16], result[16];
-
-        _mm512_storeu_si512((__m512i*) a_array, a);
-        _mm512_storeu_si512((__m512i*) b_array, b);
-
-        for (int32 i = 0; i < 16; ++i) {
-            result[i] = a_array[i] / b_array[i];
-        }
-
-        return _mm512_load_si512((__m512i*) result);
-    }
-
-    inline __m128 _mm_sin_ps(__m128 a) {
-        alignas(16) f32 a_array[4], result[4];
-        _mm_storeu_ps(a_array, a);
-        for (int32 i = 0; i < 4; ++i) {
-            result[i] = sinf(a_array[i]);
-        }
-        return _mm_load_ps(result);
-    }
-
-    inline __m128 _mm_cos_ps(__m128 a) {
-        alignas(16) f32 a_array[4], result[4];
-        _mm_storeu_ps(a_array, a);
-        for (int32 i = 0; i < 4; ++i) {
-            result[i] = cosf(a_array[i]);
-        }
-        return _mm_load_ps(result);
-    }
-
-    inline __m128 _mm_asin_ps(__m128 a) {
-        alignas(16) f32 a_array[4], result[4];
-        _mm_storeu_ps(a_array, a);
-        for (int32 i = 0; i < 4; ++i) {
-            result[i] = asinf(a_array[i]);
-        }
-        return _mm_load_ps(result);
-    }
-
-    inline __m128 _mm_acos_ps(__m128 a) {
-        alignas(16) f32 a_array[4], result[4];
-        _mm_storeu_ps(a_array, a);
-        for (int32 i = 0; i < 4; ++i) {
-            result[i] = acosf(a_array[i]);
-        }
-        return _mm_load_ps(result);
-    }
-
-    inline __m256 _mm256_sin_ps(__m256 a) {
-        alignas(32) f32 a_array[8], result[8];
-        _mm256_storeu_ps(a_array, a);
-        for (int32 i = 0; i < 8; ++i) {
-            result[i] = sinf(a_array[i]);
-        }
-        return _mm256_load_ps(result);
-    }
-
-    inline __m256 _mm256_cos_ps(__m256 a) {
-        alignas(32) f32 a_array[8], result[8];
-        _mm256_storeu_ps(a_array, a);
-        for (int32 i = 0; i < 8; ++i) {
-            result[i] = cosf(a_array[i]);
-        }
-        return _mm256_load_ps(result);
-    }
-
-    inline __m256 _mm256_asin_ps(__m256 a) {
-        alignas(32) f32 a_array[8], result[8];
-        _mm256_storeu_ps(a_array, a);
-        for (int32 i = 0; i < 8; ++i) {
-            result[i] = asinf(a_array[i]);
-        }
-        return _mm256_load_ps(result);
-    }
-
-    inline __m256 _mm256_acos_ps(__m256 a) {
-        alignas(32) f32 a_array[8], result[8];
-        _mm256_storeu_ps(a_array, a);
-        for (int32 i = 0; i < 16; ++i) {
-            result[i] = acosf(a_array[i]);
-        }
-        return _mm256_load_ps(result);
-    }
-
-    inline __m512 _mm512_sin_ps(__m512 a) {
-        alignas(64) f32 a_array[8], result[8];
-        _mm512_storeu_ps(a_array, a);
-        for (int32 i = 0; i < 16; ++i) {
-            result[i] = sinf(a_array[i]);
-        }
-        return _mm512_load_ps(result);
-    }
-
-    inline __m512 _mm512_cos_ps(__m512 a) {
-        alignas(64) f32 a_array[8], result[8];
-        _mm512_storeu_ps(a_array, a);
-        for (int32 i = 0; i < 16; ++i) {
-            result[i] = cosf(a_array[i]);
-        }
-        return _mm512_load_ps(result);
-    }
-
-    inline __m512 _mm512_asin_ps(__m512 a) {
-        alignas(64) f32 a_array[8], result[8];
-        _mm512_storeu_ps(a_array, a);
-        for (int32 i = 0; i < 16; ++i) {
-            result[i] = asinf(a_array[i]);
-        }
-        return _mm512_load_ps(result);
-    }
-
-    inline __m512 _mm512_acos_ps(__m512 a) {
-        alignas(64) f32 a_array[16], result[16];
-        _mm512_storeu_ps(a_array, a);
-        for (int32 i = 0; i < 16; ++i) {
-            result[i] = acosf(a_array[i]);
-        }
-        return _mm512_load_ps(result);
-    }
+#ifdef MACRO_CPU_FEATURE_AVX512
+    #include "SIMD_SVML_AVX512.h"
 #endif
 
 #endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_SVML_AVX2.h b/architecture/x86/simd/SIMD_SVML_AVX2.h
new file mode 100644
index 0000000..2365c12
--- /dev/null
+++ b/architecture/x86/simd/SIMD_SVML_AVX2.h
@@ -0,0 +1,69 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_STDLIB_SIMD_SVML_AVX2_H
+#define COMS_STDLIB_SIMD_SVML_AVX2_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+#include "../../../stdlib/Types.h"
+
+#if __linux__
+    #include <math.h>
+    inline __m256i _mm256_div_epi32(__m256i a, __m256i b) {
+        // Linux-only scalar implementation: truncating int32 division on each of the 8 lanes
+        alignas(32) int32 quotient[8];
+        alignas(32) int32 divisor[8];
+        _mm256_store_si256((__m256i*) quotient, a);
+        _mm256_store_si256((__m256i*) divisor, b);
+
+        for (int32 lane = 0; lane < 8; ++lane) {
+            quotient[lane] /= divisor[lane];
+        }
+        return _mm256_load_si256((__m256i*) quotient);
+    }
+
+    inline __m256 _mm256_sin_ps(__m256 a) { // Linux-only scalar implementation: sinf() on each of the 8 lanes
+        alignas(32) f32 lanes[8];
+        _mm256_store_ps(lanes, a);
+        for (int32 lane = 0; lane < 8; ++lane) {
+            lanes[lane] = sinf(lanes[lane]);
+        }
+        return _mm256_load_ps(lanes);
+    }
+
+    inline __m256 _mm256_cos_ps(__m256 a) { // Linux-only scalar implementation: cosf() on each of the 8 lanes
+        alignas(32) f32 lanes[8];
+        _mm256_store_ps(lanes, a);
+        for (int32 lane = 0; lane < 8; ++lane) {
+            lanes[lane] = cosf(lanes[lane]);
+        }
+        return _mm256_load_ps(lanes);
+    }
+
+    inline __m256 _mm256_asin_ps(__m256 a) { // Linux-only scalar implementation: asinf() on each of the 8 lanes
+        alignas(32) f32 lanes[8];
+        _mm256_store_ps(lanes, a);
+        for (int32 lane = 0; lane < 8; ++lane) {
+            lanes[lane] = asinf(lanes[lane]);
+        }
+        return _mm256_load_ps(lanes);
+    }
+
+    inline __m256 _mm256_acos_ps(__m256 a) { // Linux-only scalar implementation: acosf() on each of the 8 lanes
+        alignas(32) f32 a_array[8], result[8];
+        _mm256_store_ps(a_array, a);
+        for (int32 i = 0; i < 8; ++i) { // bound must equal the array length: a __m256 holds 8 floats
+            result[i] = acosf(a_array[i]);
+        }
+        return _mm256_load_ps(result);
+    }
+#endif
+
+#endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_SVML_AVX512.h b/architecture/x86/simd/SIMD_SVML_AVX512.h
new file mode 100755
index 0000000..d8b15e5
--- /dev/null
+++ b/architecture/x86/simd/SIMD_SVML_AVX512.h
@@ -0,0 +1,70 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_STDLIB_SIMD_SVML_AVX512_H
+#define COMS_STDLIB_SIMD_SVML_AVX512_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+#include "../../../stdlib/Types.h"
+
+#if __linux__
+    #include <math.h>
+
+    inline __m512i _mm512_div_epi32(__m512i a, __m512i b) {
+        // Linux-only scalar implementation: truncating int32 division on each of the 16 lanes
+        alignas(64) int32 quotient[16];
+        alignas(64) int32 divisor[16];
+        _mm512_store_si512((__m512i*) quotient, a);
+        _mm512_store_si512((__m512i*) divisor, b);
+
+        for (int32 lane = 0; lane < 16; ++lane) {
+            quotient[lane] /= divisor[lane];
+        }
+        return _mm512_load_si512((__m512i*) quotient);
+    }
+
+    inline __m512 _mm512_sin_ps(__m512 a) { // Linux-only scalar implementation: sinf() on each of the 16 lanes
+        alignas(64) f32 a_array[16], result[16]; // a __m512 is 16 floats; 8-element buffers would be overrun
+        _mm512_store_ps(a_array, a);
+        for (int32 i = 0; i < 16; ++i) {
+            result[i] = sinf(a_array[i]);
+        }
+        return _mm512_load_ps(result);
+    }
+
+    inline __m512 _mm512_cos_ps(__m512 a) { // Linux-only scalar implementation: cosf() on each of the 16 lanes
+        alignas(64) f32 a_array[16], result[16]; // a __m512 is 16 floats; 8-element buffers would be overrun
+        _mm512_store_ps(a_array, a);
+        for (int32 i = 0; i < 16; ++i) {
+            result[i] = cosf(a_array[i]);
+        }
+        return _mm512_load_ps(result);
+    }
+
+    inline __m512 _mm512_asin_ps(__m512 a) { // Linux-only scalar implementation: asinf() on each of the 16 lanes
+        alignas(64) f32 a_array[16], result[16]; // a __m512 is 16 floats; 8-element buffers would be overrun
+        _mm512_store_ps(a_array, a);
+        for (int32 i = 0; i < 16; ++i) {
+            result[i] = asinf(a_array[i]);
+        }
+        return _mm512_load_ps(result);
+    }
+
+    inline __m512 _mm512_acos_ps(__m512 a) { // Linux-only scalar implementation: acosf() on each of the 16 lanes
+        alignas(64) f32 lanes[16];
+        _mm512_store_ps(lanes, a);
+        for (int32 lane = 0; lane < 16; ++lane) {
+            lanes[lane] = acosf(lanes[lane]);
+        }
+        return _mm512_load_ps(lanes);
+    }
+#endif
+
+#endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_SVML_SSE.h b/architecture/x86/simd/SIMD_SVML_SSE.h
new file mode 100644
index 0000000..683f554
--- /dev/null
+++ b/architecture/x86/simd/SIMD_SVML_SSE.h
@@ -0,0 +1,70 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+#ifndef COMS_STDLIB_SIMD_SVML_SSE_H
+#define COMS_STDLIB_SIMD_SVML_SSE_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+#include "../../../stdlib/Types.h"
+
+#if __linux__
+    #include <math.h>
+
+    inline __m128i _mm_div_epi32(__m128i a, __m128i b) {
+        // Linux-only scalar implementation: truncating int32 division on each of the 4 lanes
+        alignas(16) int32 quotient[4];
+        alignas(16) int32 divisor[4];
+        _mm_store_si128((__m128i*) quotient, a);
+        _mm_store_si128((__m128i*) divisor, b);
+
+        for (int32 lane = 0; lane < 4; ++lane) {
+            quotient[lane] /= divisor[lane];
+        }
+        return _mm_load_si128((__m128i*) quotient);
+    }
+
+    inline __m128 _mm_sin_ps(__m128 a) { // Linux-only scalar implementation: sinf() on each of the 4 lanes
+        alignas(16) f32 lanes[4];
+        _mm_store_ps(lanes, a);
+        for (int32 lane = 0; lane < 4; ++lane) {
+            lanes[lane] = sinf(lanes[lane]);
+        }
+        return _mm_load_ps(lanes);
+    }
+
+    inline __m128 _mm_cos_ps(__m128 a) { // Linux-only scalar implementation: cosf() on each of the 4 lanes
+        alignas(16) f32 lanes[4];
+        _mm_store_ps(lanes, a);
+        for (int32 lane = 0; lane < 4; ++lane) {
+            lanes[lane] = cosf(lanes[lane]);
+        }
+        return _mm_load_ps(lanes);
+    }
+
+    inline __m128 _mm_asin_ps(__m128 a) { // Linux-only scalar implementation: asinf() on each of the 4 lanes
+        alignas(16) f32 lanes[4];
+        _mm_store_ps(lanes, a);
+        for (int32 lane = 0; lane < 4; ++lane) {
+            lanes[lane] = asinf(lanes[lane]);
+        }
+        return _mm_load_ps(lanes);
+    }
+
+    inline __m128 _mm_acos_ps(__m128 a) { // Linux-only scalar implementation: acosf() on each of the 4 lanes
+        alignas(16) f32 lanes[4];
+        _mm_store_ps(lanes, a);
+        for (int32 lane = 0; lane < 4; ++lane) {
+            lanes[lane] = acosf(lanes[lane]);
+        }
+        return _mm_load_ps(lanes);
+    }
+#endif
+
+#endif
\ No newline at end of file
diff --git a/architecture/x86/simd/utils/Utils.h b/architecture/x86/simd/utils/Utils.h
old mode 100644
new mode 100755
index 9554392..b5633c6
--- a/architecture/x86/simd/utils/Utils.h
+++ b/architecture/x86/simd/utils/Utils.h
@@ -17,6 +17,7 @@
 // Only allowed for data >= 64 bits
 bool is_empty(const byte* region, uint64 size, int32 steps = 8)
 {
+    // Quick check of first 8 bytes
     if (*((uint64 *) region) != 0) {
         return false;
     }
@@ -25,40 +26,52 @@ bool is_empty(const byte* region, uint64 size, int32 steps = 8)
     steps = intrin_validate_steps(region, steps);
 
     switch (steps) {
-        case 16: {
-                while (region + 64 <= end) {
-                    __m512i chunk = _mm512_loadu_si512((const __m512i *) region);
-                    __mmask64 mask = _mm512_cmpeq_epi8_mask(chunk, _mm512_setzero_si512());
-                    if (mask != 0xFFFFFFFFFFFFFFFF) {
+        #ifdef MACRO_CPU_FEATURE_AVX512
+            case 16: {
+                    while (region + 64 <= end) {
+                        __m512i chunk = _mm512_load_si512((const __m512i *) region);
+                        __mmask64 mask = _mm512_cmpeq_epi8_mask(chunk, _mm512_setzero_si512());
+                        if (mask != 0xFFFFFFFFFFFFFFFF) {
+                            return false;
+                        }
+
+                        region += 64;
+                    }
+                };
+                [[fallthrough]];
+        #else
+            case 16: [[fallthrough]];
+        #endif
+        #ifdef MACRO_CPU_FEATURE_AVX2
+            case 8: {
+                while (region + 32 <= end) {
+                    __m256i chunk = _mm256_load_si256((const __m256i *) region);
+                    if (!_mm256_testz_si256(chunk, chunk)) {
                         return false;
                     }
 
-                    region += 64;
+                    region += 32;
                 }
             };
             [[fallthrough]];
-        case 8: {
-            while (region + 32 <= end) {
-                __m256i chunk = _mm256_loadu_si256((const __m256i *) region);
-                if (!_mm256_testz_si256(chunk, chunk)) {
-                    return false;
-                }
+        #else
+            case 8: [[fallthrough]];
+        #endif
+        #ifdef MACRO_CPU_FEATURE_SSE42
+            case 4: {
+                while (region + 16 <= end) {
+                    __m128i chunk = _mm_load_si128((const __m128i *) region);
+                    if (!_mm_testz_si128(chunk, chunk)) {
+                        return false;
+                    }
 
-                region += 32;
-            }
-        };
-        [[fallthrough]];
-        case 4: {
-            while (region + 16 <= end) {
-                __m128i chunk = _mm_loadu_si128((const __m128i *) region);
-                if (!_mm_testz_si128(chunk, chunk)) {
-                    return false;
+                    region += 16;
                 }
-
-                region += 16;
             }
-        }
-        [[fallthrough]];
+            [[fallthrough]];
+        #else
+            case 4: [[fallthrough]];
+        #endif
         case 1: {
             while (region + 4 <= end) {
                 if (*((const uint32_t *) region) != 0) {
diff --git a/asset/Asset.h b/asset/Asset.h
old mode 100644
new mode 100755
diff --git a/asset/AssetArchive.h b/asset/AssetArchive.h
old mode 100644
new mode 100755
diff --git a/asset/AssetManagementSystem.h b/asset/AssetManagementSystem.h
old mode 100644
new mode 100755
diff --git a/asset/AssetType.h b/asset/AssetType.h
old mode 100644
new mode 100755
diff --git a/audio/Audio.cpp b/audio/Audio.cpp
old mode 100644
new mode 100755
diff --git a/audio/Audio.h b/audio/Audio.h
old mode 100644
new mode 100755
diff --git a/audio/AudioMixer.h b/audio/AudioMixer.h
old mode 100644
new mode 100755
diff --git a/audio/AudioSetting.h b/audio/AudioSetting.h
old mode 100644
new mode 100755
diff --git a/audio/Qoa.h b/audio/Qoa.h
old mode 100644
new mode 100755
diff --git a/audio/QoaSimd.h b/audio/QoaSimd.h
old mode 100644
new mode 100755
diff --git a/audio/Wav.h b/audio/Wav.h
old mode 100644
new mode 100755
diff --git a/auth/Auth.h b/auth/Auth.h
old mode 100644
new mode 100755
diff --git a/camera/Camera.h b/camera/Camera.h
old mode 100644
new mode 100755
diff --git a/camera/CameraMovement.h b/camera/CameraMovement.h
old mode 100644
new mode 100755
diff --git a/color/ColorVisionDeficiency.h b/color/ColorVisionDeficiency.h
old mode 100644
new mode 100755
diff --git a/command/AppCmdBuffer.cpp b/command/AppCmdBuffer.cpp
old mode 100644
new mode 100755
diff --git a/command/AppCmdBuffer.h b/command/AppCmdBuffer.h
old mode 100644
new mode 100755
diff --git a/command/Command.h b/command/Command.h
old mode 100644
new mode 100755
diff --git a/compiler/CompilerUtils.h b/compiler/CompilerUtils.h
old mode 100644
new mode 100755
diff --git a/compiler/gcc/Atomic.h b/compiler/gcc/Atomic.h
old mode 100644
new mode 100755
diff --git a/compiler/gcc/CompilerUtils.h b/compiler/gcc/CompilerUtils.h
old mode 100644
new mode 100755
diff --git a/compiler/msvc/CompilerUtils.h b/compiler/msvc/CompilerUtils.h
old mode 100644
new mode 100755
diff --git a/compression/Huffman.h b/compression/Huffman.h
old mode 100644
new mode 100755
diff --git a/compression/LZP.h b/compression/LZP.h
old mode 100644
new mode 100755
diff --git a/compression/RLE.h b/compression/RLE.h
old mode 100644
new mode 100755
diff --git a/database/Database.h b/database/Database.h
old mode 100644
new mode 100755
diff --git a/database/DatabaseConnection.h b/database/DatabaseConnection.h
old mode 100644
new mode 100755
diff --git a/database/DatabaseType.h b/database/DatabaseType.h
old mode 100644
new mode 100755
diff --git a/encryption/CeasarEncryption.h b/encryption/CeasarEncryption.h
old mode 100644
new mode 100755
diff --git a/encryption/XorEncryption.h b/encryption/XorEncryption.h
old mode 100644
new mode 100755
diff --git a/entity/AnimationEntityComponent.h b/entity/AnimationEntityComponent.h
old mode 100644
new mode 100755
diff --git a/entity/CursorEntity.h b/entity/CursorEntity.h
old mode 100644
new mode 100755
diff --git a/entity/Entity.h b/entity/Entity.h
old mode 100644
new mode 100755
diff --git a/entity/EntityComponentSystem.h b/entity/EntityComponentSystem.h
old mode 100644
new mode 100755
diff --git a/entity/EntitySize.h b/entity/EntitySize.h
old mode 100644
new mode 100755
diff --git a/environment/Globe.h b/environment/Globe.h
old mode 100644
new mode 100755
diff --git a/environment/Universe.h b/environment/Universe.h
old mode 100644
new mode 100755
diff --git a/error/HammingCodes.h b/error/HammingCodes.h
old mode 100644
new mode 100755
diff --git a/font/Font.h b/font/Font.h
old mode 100644
new mode 100755
diff --git a/font/font_characters.txt b/font/font_characters.txt
old mode 100644
new mode 100755
diff --git a/gpuapi/AntiAliasing.h b/gpuapi/AntiAliasing.h
old mode 100644
new mode 100755
diff --git a/gpuapi/GpuApiType.h b/gpuapi/GpuApiType.h
old mode 100644
new mode 100755
diff --git a/gpuapi/GpuAttributeType.h b/gpuapi/GpuAttributeType.h
old mode 100644
new mode 100755
diff --git a/gpuapi/RenderUtils.h b/gpuapi/RenderUtils.h
old mode 100644
new mode 100755
diff --git a/gpuapi/ShaderType.h b/gpuapi/ShaderType.h
old mode 100644
new mode 100755
diff --git a/gpuapi/direct3d/AppCmdBuffer.h b/gpuapi/direct3d/AppCmdBuffer.h
old mode 100644
new mode 100755
diff --git a/gpuapi/direct3d/DirectXUtils.h b/gpuapi/direct3d/DirectXUtils.h
old mode 100644
new mode 100755
diff --git a/gpuapi/direct3d/FramesInFlightContainer.h b/gpuapi/direct3d/FramesInFlightContainer.h
old mode 100644
new mode 100755
diff --git a/gpuapi/direct3d/Shader.h b/gpuapi/direct3d/Shader.h
old mode 100644
new mode 100755
diff --git a/gpuapi/direct3d/ShaderUtils.h b/gpuapi/direct3d/ShaderUtils.h
old mode 100644
new mode 100755
diff --git a/gpuapi/opengl/AppCmdBuffer.h b/gpuapi/opengl/AppCmdBuffer.h
old mode 100644
new mode 100755
diff --git a/gpuapi/opengl/FramesInFlightContainer.h b/gpuapi/opengl/FramesInFlightContainer.h
old mode 100644
new mode 100755
diff --git a/gpuapi/opengl/Opengl.h b/gpuapi/opengl/Opengl.h
old mode 100644
new mode 100755
diff --git a/gpuapi/opengl/OpenglDefines.h b/gpuapi/opengl/OpenglDefines.h
old mode 100644
new mode 100755
diff --git a/gpuapi/opengl/OpenglDescriptorSetLayoutBinding.h b/gpuapi/opengl/OpenglDescriptorSetLayoutBinding.h
old mode 100644
new mode 100755
diff --git a/gpuapi/opengl/OpenglLinux.h b/gpuapi/opengl/OpenglLinux.h
old mode 100644
new mode 100755
diff --git a/gpuapi/opengl/OpenglUtils.h b/gpuapi/opengl/OpenglUtils.h
old mode 100644
new mode 100755
diff --git a/gpuapi/opengl/OpenglWin32.h b/gpuapi/opengl/OpenglWin32.h
old mode 100644
new mode 100755
diff --git a/gpuapi/opengl/Shader.h b/gpuapi/opengl/Shader.h
old mode 100644
new mode 100755
diff --git a/gpuapi/opengl/ShaderUtils.h b/gpuapi/opengl/ShaderUtils.h
old mode 100644
new mode 100755
diff --git a/gpuapi/vulkan/AppCmdBuffer.h b/gpuapi/vulkan/AppCmdBuffer.h
old mode 100644
new mode 100755
diff --git a/gpuapi/vulkan/FramesInFlightContainer.h b/gpuapi/vulkan/FramesInFlightContainer.h
old mode 100644
new mode 100755
diff --git a/gpuapi/vulkan/Shader.h b/gpuapi/vulkan/Shader.h
old mode 100644
new mode 100755
diff --git a/gpuapi/vulkan/ShaderUtils.h b/gpuapi/vulkan/ShaderUtils.h
old mode 100644
new mode 100755
diff --git a/gpuapi/vulkan/VulkanUtils.h b/gpuapi/vulkan/VulkanUtils.h
old mode 100644
new mode 100755
diff --git a/hash/Crc.h b/hash/Crc.h
old mode 100644
new mode 100755
diff --git a/hash/GeneralHash.h b/hash/GeneralHash.h
old mode 100644
new mode 100755
index 9a5901f..bc4cd02
--- a/hash/GeneralHash.h
+++ b/hash/GeneralHash.h
@@ -402,8 +402,8 @@ uint32 intrin_hash(uint64 a, uint64 b = 0) noexcept
     };
 
     __m128i hash = _mm_set_epi64x(a, b);
-    hash = _mm_aesdec_si128(hash, _mm_loadu_si128((__m128i *) seed));
-    hash = _mm_aesdec_si128(hash, _mm_loadu_si128((__m128i *) seed));
+    hash = _mm_aesdec_si128(hash, _mm_load_si128((__m128i *) seed));
+    hash = _mm_aesdec_si128(hash, _mm_load_si128((__m128i *) seed));
 
     return _mm_extract_epi32(hash, 0);
 }
diff --git a/html/template/HtmlTemplate.h b/html/template/HtmlTemplate.h
old mode 100644
new mode 100755
diff --git a/html/template/HtmlTemplateCache.h b/html/template/HtmlTemplateCache.h
old mode 100644
new mode 100755
diff --git a/html/template/HtmlTemplateContext.h b/html/template/HtmlTemplateContext.h
old mode 100644
new mode 100755
diff --git a/html/template/HtmlTemplateInterpreter.h b/html/template/HtmlTemplateInterpreter.h
old mode 100644
new mode 100755
diff --git a/html/template/HtmlTemplateLexer.h b/html/template/HtmlTemplateLexer.h
old mode 100644
new mode 100755
diff --git a/html/template/HtmlTemplateParser.h b/html/template/HtmlTemplateParser.h
old mode 100644
new mode 100755
diff --git a/image/Bitmap.h b/image/Bitmap.h
old mode 100644
new mode 100755
diff --git a/image/Image.cpp b/image/Image.cpp
old mode 100644
new mode 100755
diff --git a/image/Image.h b/image/Image.h
old mode 100644
new mode 100755
diff --git a/image/Png.h b/image/Png.h
old mode 100644
new mode 100755
diff --git a/image/Qoi.h b/image/Qoi.h
old mode 100644
new mode 100755
diff --git a/image/Tga.h b/image/Tga.h
old mode 100644
new mode 100755
diff --git a/image/default_colors.h b/image/default_colors.h
old mode 100644
new mode 100755
diff --git a/image/default_colors.htm b/image/default_colors.htm
old mode 100644
new mode 100755
diff --git a/image/stb_image.h b/image/stb_image.h
old mode 100644
new mode 100755
diff --git a/input/ControllerInput.h b/input/ControllerInput.h
old mode 100644
new mode 100755
diff --git a/input/ControllerType.h b/input/ControllerType.h
old mode 100644
new mode 100755
diff --git a/input/Input.h b/input/Input.h
old mode 100644
new mode 100755
diff --git a/input/InputConnectionType.h b/input/InputConnectionType.h
old mode 100644
new mode 100755
diff --git a/light/Material.h b/light/Material.h
old mode 100644
new mode 100755
diff --git a/localization/Dialog.h b/localization/Dialog.h
old mode 100644
new mode 100755
diff --git a/localization/Language.h b/localization/Language.h
old mode 100644
new mode 100755
diff --git a/log/DebugContainer.h b/log/DebugContainer.h
old mode 100644
new mode 100755
diff --git a/log/DebugMemory.h b/log/DebugMemory.h
old mode 100644
new mode 100755
diff --git a/log/Log.h b/log/Log.h
old mode 100644
new mode 100755
index 9fc45ee..0d6a085
--- a/log/Log.h
+++ b/log/Log.h
@@ -20,8 +20,6 @@
  * Debug builds also log to the debug console, or alternative standard output if no dedicated debug console is available
  */
 
-#define LOG_DATA_ARRAY 5
-
 #ifndef LOG_LEVEL
     // 0 = no logging at all
     // 1 = release logging
@@ -93,6 +91,7 @@ struct LogData {
     void* value;
 };
 
+#define LOG_DATA_ARRAY 5
 struct LogDataArray{
     LogData data[LOG_DATA_ARRAY];
 };
@@ -143,6 +142,7 @@ void log_to_file()
 }
 
 // Same as log_to_file with the exception that reset the log pos to avoid repeated output
+inline
 void log_flush()
 {
     if (!_log_memory || _log_memory->pos == 0 || !_log_fp) {
@@ -178,8 +178,12 @@ void log(const char* str, const char* file, const char* function, int32 line)
         str += message_length;
         len -= MAX_LOG_LENGTH - sizeof(LogMessage);
 
-        #if DEBUG
+        #if DEBUG || VERBOSE
             // In debug mode we always output the log message to the debug console
+            char time_str[9];
+            format_time_hh_mm_ss(time_str, msg->time / 1000000ULL);
+            compiler_debug_print(time_str);
+            compiler_debug_print(" ");
             compiler_debug_print(msg->message);
             compiler_debug_print("\n");
         #endif
@@ -260,6 +264,10 @@ void log(const char* format, LogDataArray data, const char* file, const char* fu
 
     #if DEBUG || VERBOSE
         // In debug mode we always output the log message to the debug console
+        char time_str[9];
+        format_time_hh_mm_ss(time_str, msg->time / 1000000ULL);
+        compiler_debug_print(time_str);
+        compiler_debug_print(" ");
         compiler_debug_print(msg->message);
         compiler_debug_print("\n");
     #endif
diff --git a/log/PerformanceProfiler.h b/log/PerformanceProfiler.h
old mode 100644
new mode 100755
diff --git a/log/Stats.h b/log/Stats.h
old mode 100644
new mode 100755
diff --git a/math/Evaluator.h b/math/Evaluator.h
old mode 100644
new mode 100755
diff --git a/math/PerlinNoise.h b/math/PerlinNoise.h
old mode 100644
new mode 100755
diff --git a/math/matrix/MatrixFloat32.h b/math/matrix/MatrixFloat32.h
old mode 100644
new mode 100755
index faca090..ce2ab1a
--- a/math/matrix/MatrixFloat32.h
+++ b/math/matrix/MatrixFloat32.h
@@ -518,13 +518,13 @@ void mat4mat4_mult(const f32* __restrict a, const f32* __restrict b, f32* __rest
 inline
 void mat4mat4_mult_simd(const f32* __restrict a, const f32* __restrict b, f32* __restrict result) noexcept
 {
-    __m128 row1 = _mm_loadu_ps(&b[0]);
-    __m128 row2 = _mm_loadu_ps(&b[4]);
-    __m128 row3 = _mm_loadu_ps(&b[8]);
-    __m128 row4 = _mm_loadu_ps(&b[12]);
+    __m128 row1 = _mm_load_ps(&b[0]);
+    __m128 row2 = _mm_load_ps(&b[4]);
+    __m128 row3 = _mm_load_ps(&b[8]);
+    __m128 row4 = _mm_load_ps(&b[12]);
 
     for (int32 i = 3; i >= 0; --i) {
-        __m128 vW = _mm_loadu_ps(&a[i * 4]);
+        __m128 vW = _mm_load_ps(&a[i * 4]);
 
         __m128 vX = _mm_shuffle_ps(vW, vW, _MM_SHUFFLE(0, 0, 0, 0));
         __m128 vY = _mm_shuffle_ps(vW, vW, _MM_SHUFFLE(1, 1, 1, 1));
@@ -540,7 +540,7 @@ void mat4mat4_mult_simd(const f32* __restrict a, const f32* __restrict b, f32* _
         vY = _mm_add_ps(vY, vW);
         vX = _mm_add_ps(vX, vY);
 
-        _mm_storeu_ps(&result[i * 4], vX);
+        _mm_store_ps(&result[i * 4], vX);
     }
 }
 
diff --git a/math/matrix/MatrixInt32.h b/math/matrix/MatrixInt32.h
old mode 100644
new mode 100755
diff --git a/math/matrix/MatrixInt64.h b/math/matrix/MatrixInt64.h
old mode 100644
new mode 100755
diff --git a/math/matrix/QuaternionFloat32.h b/math/matrix/QuaternionFloat32.h
old mode 100644
new mode 100755
diff --git a/math/matrix/VectorFloat32.h b/math/matrix/VectorFloat32.h
old mode 100644
new mode 100755
diff --git a/math/matrix/VectorFloat64.h b/math/matrix/VectorFloat64.h
old mode 100644
new mode 100755
diff --git a/math/matrix/VectorInt32.h b/math/matrix/VectorInt32.h
old mode 100644
new mode 100755
diff --git a/math/matrix/VectorInt64.h b/math/matrix/VectorInt64.h
old mode 100644
new mode 100755
diff --git a/math/random/BlueNoise.h b/math/random/BlueNoise.h
old mode 100644
new mode 100755
diff --git a/memory/BufferMemory.h b/memory/BufferMemory.h
old mode 100644
new mode 100755
diff --git a/memory/ChunkMemory.h b/memory/ChunkMemory.h
old mode 100644
new mode 100755
diff --git a/memory/Heap.h b/memory/Heap.h
old mode 100644
new mode 100755
diff --git a/memory/Queue.h b/memory/Queue.h
old mode 100644
new mode 100755
diff --git a/memory/RingMemory.h b/memory/RingMemory.h
old mode 100644
new mode 100755
diff --git a/memory/ThreadedChunkMemory.h b/memory/ThreadedChunkMemory.h
old mode 100644
new mode 100755
diff --git a/memory/ThreadedQueue.h b/memory/ThreadedQueue.h
old mode 100644
new mode 100755
diff --git a/memory/ThreadedRingMemory.h b/memory/ThreadedRingMemory.h
old mode 100644
new mode 100755
diff --git a/models/Colors.h b/models/Colors.h
old mode 100644
new mode 100755
diff --git a/models/Location.h b/models/Location.h
old mode 100644
new mode 100755
diff --git a/models/Map/map_chunks.h b/models/Map/map_chunks.h
old mode 100644
new mode 100755
diff --git a/models/Obj.h b/models/Obj.h
old mode 100644
new mode 100755
diff --git a/models/Sound.h b/models/Sound.h
old mode 100644
new mode 100755
diff --git a/models/account/Account.h b/models/account/Account.h
old mode 100644
new mode 100755
diff --git a/models/bracket/Bracket.h b/models/bracket/Bracket.h
old mode 100644
new mode 100755
diff --git a/models/bracket/BracketMatch.h b/models/bracket/BracketMatch.h
old mode 100644
new mode 100755
diff --git a/models/bracket/BracketSeeding.h b/models/bracket/BracketSeeding.h
old mode 100644
new mode 100755
diff --git a/models/bracket/BracketTeam.h b/models/bracket/BracketTeam.h
old mode 100644
new mode 100755
diff --git a/models/chat/Chat.h b/models/chat/Chat.h
old mode 100644
new mode 100755
diff --git a/models/chat/ChatLevel.h b/models/chat/ChatLevel.h
old mode 100644
new mode 100755
diff --git a/models/chat/ChatStatus.h b/models/chat/ChatStatus.h
old mode 100644
new mode 100755
diff --git a/models/chat/ChatType.h b/models/chat/ChatType.h
old mode 100644
new mode 100755
diff --git a/models/event/Event.h b/models/event/Event.h
old mode 100644
new mode 100755
diff --git a/models/event/EventTaskType.h b/models/event/EventTaskType.h
old mode 100644
new mode 100755
diff --git a/models/event/tmp b/models/event/tmp
old mode 100644
new mode 100755
diff --git a/models/extension/ExtensionType.h b/models/extension/ExtensionType.h
old mode 100644
new mode 100755
diff --git a/models/guild/GuildBanner.h b/models/guild/GuildBanner.h
old mode 100644
new mode 100755
diff --git a/models/item/Consumable.h b/models/item/Consumable.h
old mode 100644
new mode 100755
diff --git a/models/item/ConsumableType.h b/models/item/ConsumableType.h
old mode 100644
new mode 100755
diff --git a/models/item/Equipment.cpp b/models/item/Equipment.cpp
old mode 100644
new mode 100755
diff --git a/models/item/Equipment.h b/models/item/Equipment.h
old mode 100644
new mode 100755
diff --git a/models/item/EquipmentType.h b/models/item/EquipmentType.h
old mode 100644
new mode 100755
diff --git a/models/item/Item.h b/models/item/Item.h
old mode 100644
new mode 100755
diff --git a/models/item/ItemAffixDistribution.h b/models/item/ItemAffixDistribution.h
old mode 100644
new mode 100755
diff --git a/models/item/ItemLevelStats.h b/models/item/ItemLevelStats.h
old mode 100644
new mode 100755
diff --git a/models/item/ItemRarityDefinition.h b/models/item/ItemRarityDefinition.h
old mode 100644
new mode 100755
diff --git a/models/item/ItemRarityStats.h b/models/item/ItemRarityStats.h
old mode 100644
new mode 100755
diff --git a/models/item/ItemStatsDistribution.h b/models/item/ItemStatsDistribution.h
old mode 100644
new mode 100755
diff --git a/models/item/MobLevelStats.h b/models/item/MobLevelStats.h
old mode 100644
new mode 100755
diff --git a/models/item/_equipment_slots.h b/models/item/_equipment_slots.h
old mode 100644
new mode 100755
diff --git a/models/item/_equipment_types.h b/models/item/_equipment_types.h
old mode 100644
new mode 100755
diff --git a/models/item/_item_rarity.h b/models/item/_item_rarity.h
old mode 100644
new mode 100755
diff --git a/models/map.h b/models/map.h
old mode 100644
new mode 100755
diff --git a/models/mob/ActivityStats.h b/models/mob/ActivityStats.h
old mode 100644
new mode 100755
diff --git a/models/mob/FixedStats.h b/models/mob/FixedStats.h
old mode 100644
new mode 100755
diff --git a/models/mob/Mob.cpp b/models/mob/Mob.cpp
old mode 100644
new mode 100755
diff --git a/models/mob/Mob.h b/models/mob/Mob.h
old mode 100644
new mode 100755
diff --git a/models/mob/MobAction.h b/models/mob/MobAction.h
old mode 100644
new mode 100755
diff --git a/models/mob/MobCategory.h b/models/mob/MobCategory.h
old mode 100644
new mode 100755
diff --git a/models/mob/MobState.h b/models/mob/MobState.h
old mode 100644
new mode 100755
diff --git a/models/mob/MobStats.cpp b/models/mob/MobStats.cpp
old mode 100644
new mode 100755
diff --git a/models/mob/MobStats.h b/models/mob/MobStats.h
old mode 100644
new mode 100755
diff --git a/models/mob/MobStatsType.h b/models/mob/MobStatsType.h
old mode 100644
new mode 100755
diff --git a/models/mob/PrimaryStatsPoints.cpp b/models/mob/PrimaryStatsPoints.cpp
old mode 100644
new mode 100755
diff --git a/models/mob/PrimaryStatsPoints.h b/models/mob/PrimaryStatsPoints.h
old mode 100644
new mode 100755
diff --git a/models/mob/SecondaryStatsPoints.cpp b/models/mob/SecondaryStatsPoints.cpp
old mode 100644
new mode 100755
diff --git a/models/mob/SecondaryStatsPoints.h b/models/mob/SecondaryStatsPoints.h
old mode 100644
new mode 100755
diff --git a/models/mob/_mob_category.h b/models/mob/_mob_category.h
old mode 100644
new mode 100755
diff --git a/models/mob/_mob_list.h b/models/mob/_mob_list.h
old mode 100644
new mode 100755
diff --git a/models/mob/monster/Drop.h b/models/mob/monster/Drop.h
old mode 100644
new mode 100755
diff --git a/models/mob/monster/LootTable.h b/models/mob/monster/LootTable.h
old mode 100644
new mode 100755
diff --git a/models/mob/monster/Monster.h b/models/mob/monster/Monster.h
old mode 100644
new mode 100755
diff --git a/models/mob/monster/MonsterStats.h b/models/mob/monster/MonsterStats.h
old mode 100644
new mode 100755
diff --git a/models/mob/player/Backpack.h b/models/mob/player/Backpack.h
old mode 100644
new mode 100755
diff --git a/models/mob/player/Guild.h b/models/mob/player/Guild.h
old mode 100644
new mode 100755
diff --git a/models/mob/player/LootFilter.h b/models/mob/player/LootFilter.h
old mode 100644
new mode 100755
diff --git a/models/mob/player/Player.cpp b/models/mob/player/Player.cpp
old mode 100644
new mode 100755
diff --git a/models/mob/player/Player.h b/models/mob/player/Player.h
old mode 100644
new mode 100755
diff --git a/models/mob/player/PlayerStats.h b/models/mob/player/PlayerStats.h
old mode 100644
new mode 100755
diff --git a/models/mob/player/PlayerXPRequirement.h b/models/mob/player/PlayerXPRequirement.h
old mode 100644
new mode 100755
diff --git a/models/mob/player/Reputation.h b/models/mob/player/Reputation.h
old mode 100644
new mode 100755
diff --git a/models/mob/player/_player_class.h b/models/mob/player/_player_class.h
old mode 100644
new mode 100755
diff --git a/models/mob/skill/AoeDistribution.h b/models/mob/skill/AoeDistribution.h
old mode 100644
new mode 100755
diff --git a/models/mob/skill/AoeShape.h b/models/mob/skill/AoeShape.h
old mode 100644
new mode 100755
diff --git a/models/mob/skill/ProjectileDistribution.h b/models/mob/skill/ProjectileDistribution.h
old mode 100644
new mode 100755
diff --git a/models/mob/skill/Skill.h b/models/mob/skill/Skill.h
old mode 100644
new mode 100755
diff --git a/models/mob/skill/SkillLocation.h b/models/mob/skill/SkillLocation.h
old mode 100644
new mode 100755
diff --git a/models/mob/skill/StatsTarget.h b/models/mob/skill/StatsTarget.h
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/arcane_bolt.cfg b/models/mob/skill/definitions/arcane_bolt.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/arise.cfg b/models/mob/skill/definitions/arise.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/back_fist.cfg b/models/mob/skill/definitions/back_fist.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/beam.cfg b/models/mob/skill/definitions/beam.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/black_fist.cfg b/models/mob/skill/definitions/black_fist.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/chain.cfg b/models/mob/skill/definitions/chain.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/chain_lightning.cfg b/models/mob/skill/definitions/chain_lightning.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/corruption_bolt.cfg b/models/mob/skill/definitions/corruption_bolt.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/cyclone.cfg b/models/mob/skill/definitions/cyclone.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/dodge.cfg b/models/mob/skill/definitions/dodge.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/earth_bolt.cfg b/models/mob/skill/definitions/earth_bolt.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/elemental_pilar.cfg b/models/mob/skill/definitions/elemental_pilar.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/fear.cfg b/models/mob/skill/definitions/fear.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/fire_bolt.cfg b/models/mob/skill/definitions/fire_bolt.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/frost_bolt.cfg b/models/mob/skill/definitions/frost_bolt.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/ghost_walk.cfg b/models/mob/skill/definitions/ghost_walk.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/health_inverse_dmg.cfg b/models/mob/skill/definitions/health_inverse_dmg.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/health_to_dmg.cfg b/models/mob/skill/definitions/health_to_dmg.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/holy_bolt.cfg b/models/mob/skill/definitions/holy_bolt.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/hook.cfg b/models/mob/skill/definitions/hook.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/kick.cfg b/models/mob/skill/definitions/kick.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/launch_strike.cfg b/models/mob/skill/definitions/launch_strike.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/lightning_bolt.cfg b/models/mob/skill/definitions/lightning_bolt.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/meteor_strike.cfg b/models/mob/skill/definitions/meteor_strike.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/mirage.cfg b/models/mob/skill/definitions/mirage.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/net.cfg b/models/mob/skill/definitions/net.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/palm_strike.cfg b/models/mob/skill/definitions/palm_strike.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/poison_strike.cfg b/models/mob/skill/definitions/poison_strike.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/pull.cfg b/models/mob/skill/definitions/pull.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/punch.cfg b/models/mob/skill/definitions/punch.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/push.cfg b/models/mob/skill/definitions/push.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/reflect.cfg b/models/mob/skill/definitions/reflect.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/revive.cfg b/models/mob/skill/definitions/revive.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/root.cfg b/models/mob/skill/definitions/root.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/sacrafice.cfg b/models/mob/skill/definitions/sacrafice.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/shield.cfg b/models/mob/skill/definitions/shield.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/side_kick.cfg b/models/mob/skill/definitions/side_kick.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/spikes.cfg b/models/mob/skill/definitions/spikes.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/sprint.cfg b/models/mob/skill/definitions/sprint.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/stomp.cfg b/models/mob/skill/definitions/stomp.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/summon.cfg b/models/mob/skill/definitions/summon.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/sword_dance.cfg b/models/mob/skill/definitions/sword_dance.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/teleport.cfg b/models/mob/skill/definitions/teleport.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/totem.cfg b/models/mob/skill/definitions/totem.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/uppercut.cfg b/models/mob/skill/definitions/uppercut.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/whirlwind.cfg b/models/mob/skill/definitions/whirlwind.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/definitions/wind_slashes.cfg b/models/mob/skill/definitions/wind_slashes.cfg
old mode 100644
new mode 100755
diff --git a/models/mob/skill/modifiers/split_shot.cfg b/models/mob/skill/modifiers/split_shot.cfg
old mode 100644
new mode 100755
diff --git a/models/object/Block.cpp b/models/object/Block.cpp
old mode 100644
new mode 100755
diff --git a/models/object/Block.h b/models/object/Block.h
old mode 100644
new mode 100755
diff --git a/models/object/Chunk.h b/models/object/Chunk.h
old mode 100644
new mode 100755
diff --git a/models/object/Cube.h b/models/object/Cube.h
old mode 100644
new mode 100755
diff --git a/models/object/Object.h b/models/object/Object.h
old mode 100644
new mode 100755
diff --git a/models/object/ObjectType.h b/models/object/ObjectType.h
old mode 100644
new mode 100755
diff --git a/models/object/_object_list.h b/models/object/_object_list.h
old mode 100644
new mode 100755
diff --git a/models/object/_object_types.h b/models/object/_object_types.h
old mode 100644
new mode 100755
diff --git a/models/settings/DungeonSettings.h b/models/settings/DungeonSettings.h
old mode 100644
new mode 100755
diff --git a/models/settings/ItemDistributionType.h b/models/settings/ItemDistributionType.h
old mode 100644
new mode 100755
diff --git a/models/settings/Settings.h b/models/settings/Settings.h
old mode 100644
new mode 100755
diff --git a/models/settings/setting_types.h b/models/settings/setting_types.h
old mode 100644
new mode 100755
diff --git a/module/Module.h b/module/Module.h
old mode 100644
new mode 100755
diff --git a/module/ModuleManager.h b/module/ModuleManager.h
old mode 100644
new mode 100755
diff --git a/network/Server.h b/network/Server.h
old mode 100644
new mode 100755
diff --git a/network/Socket.h b/network/Socket.h
old mode 100644
new mode 100755
diff --git a/network/SocketConnection.h b/network/SocketConnection.h
old mode 100644
new mode 100755
diff --git a/network/packet/OMSPacket.h b/network/packet/OMSPacket.h
old mode 100644
new mode 100755
diff --git a/network/packet/PacketCache.h b/network/packet/PacketCache.h
old mode 100644
new mode 100755
diff --git a/network/packet/PacketHeader.h b/network/packet/PacketHeader.h
old mode 100644
new mode 100755
diff --git a/network/packet/UDPPacket.h b/network/packet/UDPPacket.h
old mode 100644
new mode 100755
diff --git a/network/packet/chat/ChatMessagePacket.h b/network/packet/chat/ChatMessagePacket.h
old mode 100644
new mode 100755
diff --git a/network/packet/general/AckPacket.h b/network/packet/general/AckPacket.h
old mode 100644
new mode 100755
diff --git a/network/packet/general/PingPacket.h b/network/packet/general/PingPacket.h
old mode 100644
new mode 100755
diff --git a/network/packet/mob/MobInfoPacket.h b/network/packet/mob/MobInfoPacket.h
old mode 100644
new mode 100755
diff --git a/network/packet/mob/MobStatePacket.h b/network/packet/mob/MobStatePacket.h
old mode 100644
new mode 100755
diff --git a/network/packet/mob/player/PlayerInfoPacket.h b/network/packet/mob/player/PlayerInfoPacket.h
old mode 100644
new mode 100755
diff --git a/network/packet/mob/player/PlayerState.h b/network/packet/mob/player/PlayerState.h
old mode 100644
new mode 100755
diff --git a/network/packet/packet_types.h b/network/packet/packet_types.h
old mode 100644
new mode 100755
diff --git a/noise/FractalNoise.h b/noise/FractalNoise.h
old mode 100644
new mode 100755
diff --git a/noise/PerlinNoise.h b/noise/PerlinNoise.h
old mode 100644
new mode 100755
diff --git a/noise/SimplexNoise.h b/noise/SimplexNoise.h
old mode 100644
new mode 100755
diff --git a/noise/ValueNoise.h b/noise/ValueNoise.h
old mode 100644
new mode 100755
diff --git a/noise/WorleyNoise.h b/noise/WorleyNoise.h
old mode 100644
new mode 100755
diff --git a/object/Animation.h b/object/Animation.h
old mode 100644
new mode 100755
diff --git a/object/Hitbox.h b/object/Hitbox.h
old mode 100644
new mode 100755
diff --git a/object/Material.h b/object/Material.h
old mode 100644
new mode 100755
diff --git a/object/Materials.md b/object/Materials.md
old mode 100644
new mode 100755
diff --git a/object/Mesh.h b/object/Mesh.h
old mode 100644
new mode 100755
diff --git a/object/Model.h b/object/Model.h
old mode 100644
new mode 100755
diff --git a/object/Texture.h b/object/Texture.h
old mode 100644
new mode 100755
diff --git a/object/Vertex.h b/object/Vertex.h
old mode 100644
new mode 100755
diff --git a/particle/Particle.h b/particle/Particle.h
old mode 100644
new mode 100755
diff --git a/pathfinding/Jpsp.h b/pathfinding/Jpsp.h
old mode 100644
new mode 100755
diff --git a/pathfinding/Metric2d.h b/pathfinding/Metric2d.h
old mode 100644
new mode 100755
diff --git a/pathfinding/Metric3d.h b/pathfinding/Metric3d.h
old mode 100644
new mode 100755
diff --git a/pathfinding/Path.h b/pathfinding/Path.h
old mode 100644
new mode 100755
diff --git a/pathfinding/jps/Jps.h b/pathfinding/jps/Jps.h
old mode 100644
new mode 100755
diff --git a/pathfinding/jps/JpsGrid.h b/pathfinding/jps/JpsGrid.h
old mode 100644
new mode 100755
diff --git a/pathfinding/jps/JpsNode.h b/pathfinding/jps/JpsNode.h
old mode 100644
new mode 100755
diff --git a/platform/linux/Allocator.h b/platform/linux/Allocator.h
old mode 100644
new mode 100755
diff --git a/platform/linux/ExceptionHandler.h b/platform/linux/ExceptionHandler.h
old mode 100644
new mode 100755
diff --git a/platform/linux/FileUtils.cpp b/platform/linux/FileUtils.cpp
old mode 100644
new mode 100755
diff --git a/platform/linux/Library.cpp b/platform/linux/Library.cpp
old mode 100644
new mode 100755
diff --git a/platform/linux/Library.h b/platform/linux/Library.h
old mode 100644
new mode 100755
diff --git a/platform/linux/SystemInfo.cpp b/platform/linux/SystemInfo.cpp
old mode 100644
new mode 100755
diff --git a/platform/linux/TimeUtils.h b/platform/linux/TimeUtils.h
old mode 100644
new mode 100755
index c05a0d0..0ea6c0a
--- a/platform/linux/TimeUtils.h
+++ b/platform/linux/TimeUtils.h
@@ -14,10 +14,10 @@
 #include "../../stdlib/Types.h"
 
 uint64 system_time() {
-    struct timeval tv;
-    gettimeofday(&tv, NULL);
+    struct timespec ts;
+    clock_gettime(CLOCK_REALTIME, &ts);
 
-    return (uint64) tv.tv_sec * 1000000ULL + (uint64) tv.tv_usec;
+    return (uint64_t) ts.tv_sec * 1000000ULL + (uint64_t) ts.tv_nsec / 1000ULL;
 }
 
 uint64 time_mu() {
diff --git a/platform/linux/UtilsLinux.h b/platform/linux/UtilsLinux.h
old mode 100644
new mode 100755
diff --git a/platform/linux/network/Server.h b/platform/linux/network/Server.h
old mode 100644
new mode 100755
diff --git a/platform/linux/network/Socket.h b/platform/linux/network/Socket.h
old mode 100644
new mode 100755
diff --git a/platform/linux/threading/Atomic.h b/platform/linux/threading/Atomic.h
old mode 100644
new mode 100755
diff --git a/platform/linux/threading/Semaphore.h b/platform/linux/threading/Semaphore.h
old mode 100644
new mode 100755
diff --git a/platform/linux/threading/Spinlock.cpp b/platform/linux/threading/Spinlock.cpp
old mode 100644
new mode 100755
diff --git a/platform/linux/threading/Spinlock.h b/platform/linux/threading/Spinlock.h
old mode 100644
new mode 100755
diff --git a/platform/linux/threading/Thread.h b/platform/linux/threading/Thread.h
old mode 100644
new mode 100755
diff --git a/platform/linux/threading/ThreadDefines.h b/platform/linux/threading/ThreadDefines.h
old mode 100644
new mode 100755
diff --git a/platform/win32/Allocator.h b/platform/win32/Allocator.h
old mode 100644
new mode 100755
diff --git a/platform/win32/Clipboard.h b/platform/win32/Clipboard.h
old mode 100644
new mode 100755
diff --git a/platform/win32/ExceptionHandler.h b/platform/win32/ExceptionHandler.h
old mode 100644
new mode 100755
diff --git a/platform/win32/FastPipes.h b/platform/win32/FastPipes.h
old mode 100644
new mode 100755
diff --git a/platform/win32/FileUtils.cpp b/platform/win32/FileUtils.cpp
old mode 100644
new mode 100755
diff --git a/platform/win32/LeanWin32.h b/platform/win32/LeanWin32.h
old mode 100644
new mode 100755
diff --git a/platform/win32/Library.cpp b/platform/win32/Library.cpp
old mode 100644
new mode 100755
diff --git a/platform/win32/Library.h b/platform/win32/Library.h
old mode 100644
new mode 100755
diff --git a/platform/win32/SystemInfo.cpp b/platform/win32/SystemInfo.cpp
old mode 100644
new mode 100755
diff --git a/platform/win32/TimeUtils.h b/platform/win32/TimeUtils.h
old mode 100644
new mode 100755
diff --git a/platform/win32/UtilsWin32.h b/platform/win32/UtilsWin32.h
old mode 100644
new mode 100755
diff --git a/platform/win32/UtilsWindows.h b/platform/win32/UtilsWindows.h
old mode 100644
new mode 100755
diff --git a/platform/win32/Window.h b/platform/win32/Window.h
old mode 100644
new mode 100755
diff --git a/platform/win32/audio/DirectSound.h b/platform/win32/audio/DirectSound.h
old mode 100644
new mode 100755
diff --git a/platform/win32/audio/Wasapi.h b/platform/win32/audio/Wasapi.h
old mode 100644
new mode 100755
diff --git a/platform/win32/audio/XAudio2.h b/platform/win32/audio/XAudio2.h
old mode 100644
new mode 100755
diff --git a/platform/win32/input/DirectInput.h b/platform/win32/input/DirectInput.h
old mode 100644
new mode 100755
diff --git a/platform/win32/input/HidInput.h b/platform/win32/input/HidInput.h
old mode 100644
new mode 100755
diff --git a/platform/win32/input/RawInput.h b/platform/win32/input/RawInput.h
old mode 100644
new mode 100755
diff --git a/platform/win32/input/XInput.h b/platform/win32/input/XInput.h
old mode 100644
new mode 100755
diff --git a/platform/win32/input/controller/ControllerHandler.h b/platform/win32/input/controller/ControllerHandler.h
old mode 100644
new mode 100755
diff --git a/platform/win32/input/controller/DualSense.h b/platform/win32/input/controller/DualSense.h
old mode 100644
new mode 100755
diff --git a/platform/win32/input/controller/DualShock4.h b/platform/win32/input/controller/DualShock4.h
old mode 100644
new mode 100755
diff --git a/platform/win32/input/controller/XBoxS.h b/platform/win32/input/controller/XBoxS.h
old mode 100644
new mode 100755
diff --git a/platform/win32/network/Client.h b/platform/win32/network/Client.h
old mode 100644
new mode 100755
diff --git a/platform/win32/network/Server.h b/platform/win32/network/Server.h
old mode 100644
new mode 100755
diff --git a/platform/win32/network/Socket.h b/platform/win32/network/Socket.h
old mode 100644
new mode 100755
diff --git a/platform/win32/threading/Atomic.h b/platform/win32/threading/Atomic.h
old mode 100644
new mode 100755
diff --git a/platform/win32/threading/Semaphore.h b/platform/win32/threading/Semaphore.h
old mode 100644
new mode 100755
diff --git a/platform/win32/threading/Spinlock.cpp b/platform/win32/threading/Spinlock.cpp
old mode 100644
new mode 100755
diff --git a/platform/win32/threading/Spinlock.h b/platform/win32/threading/Spinlock.h
old mode 100644
new mode 100755
diff --git a/platform/win32/threading/Thread.h b/platform/win32/threading/Thread.h
old mode 100644
new mode 100755
diff --git a/platform/win32/threading/ThreadDefines.h b/platform/win32/threading/ThreadDefines.h
old mode 100644
new mode 100755
diff --git a/render/liquid.cpp b/render/liquid.cpp
old mode 100644
new mode 100755
diff --git a/render/mob.cpp b/render/mob.cpp
old mode 100644
new mode 100755
diff --git a/render/object.cpp b/render/object.cpp
old mode 100644
new mode 100755
diff --git a/render/sky.cpp b/render/sky.cpp
old mode 100644
new mode 100755
diff --git a/render/text.cpp b/render/text.cpp
old mode 100644
new mode 100755
diff --git a/scene/SceneInfo.h b/scene/SceneInfo.h
old mode 100644
new mode 100755
diff --git a/shaders/liquids/lava.hlsl b/shaders/liquids/lava.hlsl
old mode 100644
new mode 100755
diff --git a/shaders/liquids/water/cube_fragment.hlsl b/shaders/liquids/water/cube_fragment.hlsl
old mode 100644
new mode 100755
diff --git a/shaders/liquids/water/cube_vertex.hlsl b/shaders/liquids/water/cube_vertex.hlsl
old mode 100644
new mode 100755
diff --git a/shaders/liquids/water/helper.hlsli b/shaders/liquids/water/helper.hlsli
old mode 100644
new mode 100755
diff --git a/shaders/liquids/water/sphere_fragment.hlsl b/shaders/liquids/water/sphere_fragment.hlsl
old mode 100644
new mode 100755
diff --git a/shaders/liquids/water/sphere_vertex.hlsl b/shaders/liquids/water/sphere_vertex.hlsl
old mode 100644
new mode 100755
diff --git a/shaders/liquids/water/water_above_fragment.hlsl b/shaders/liquids/water/water_above_fragment.hlsl
old mode 100644
new mode 100755
diff --git a/shaders/liquids/water/water_below_fragment.hlsl b/shaders/liquids/water/water_below_fragment.hlsl
old mode 100644
new mode 100755
diff --git a/shaders/liquids/water/water_caustics_fragment.hlsl b/shaders/liquids/water/water_caustics_fragment.hlsl
old mode 100644
new mode 100755
diff --git a/shaders/liquids/water/water_caustics_vertex.hlsl b/shaders/liquids/water/water_caustics_vertex.hlsl
old mode 100644
new mode 100755
diff --git a/shaders/liquids/water/water_vertex.hlsl b/shaders/liquids/water/water_vertex.hlsl
old mode 100644
new mode 100755
diff --git a/shaders/nature/cloud.hlsl b/shaders/nature/cloud.hlsl
old mode 100644
new mode 100755
diff --git a/shaders/nature/fire.hlsl b/shaders/nature/fire.hlsl
old mode 100644
new mode 100755
diff --git a/shaders/nature/fog.hlsl b/shaders/nature/fog.hlsl
old mode 100644
new mode 100755
diff --git a/shaders/nature/godray.hlsl b/shaders/nature/godray.hlsl
old mode 100644
new mode 100755
diff --git a/shaders/nature/lightning.hlsl b/shaders/nature/lightning.hlsl
old mode 100644
new mode 100755
diff --git a/shaders/nature/rain.hlsl b/shaders/nature/rain.hlsl
old mode 100644
new mode 100755
diff --git a/shaders/nature/smoke.hlsl b/shaders/nature/smoke.hlsl
old mode 100644
new mode 100755
diff --git a/shaders/nature/snow.hlsl b/shaders/nature/snow.hlsl
old mode 100644
new mode 100755
diff --git a/shaders/shaders.hlsl b/shaders/shaders.hlsl
old mode 100644
new mode 100755
diff --git a/sort/BinarySearch.h b/sort/BinarySearch.h
old mode 100644
new mode 100755
diff --git a/sort/EytzingerSearch.h b/sort/EytzingerSearch.h
old mode 100644
new mode 100755
diff --git a/sort/HeapSort.h b/sort/HeapSort.h
old mode 100644
new mode 100755
diff --git a/sort/InsertionSort.h b/sort/InsertionSort.h
old mode 100644
new mode 100755
diff --git a/sort/IntroSort.h b/sort/IntroSort.h
old mode 100644
new mode 100755
diff --git a/sort/QuickSort.h b/sort/QuickSort.h
old mode 100644
new mode 100755
diff --git a/sort/Sort.h b/sort/Sort.h
old mode 100644
new mode 100755
diff --git a/stdlib/HashMap.h b/stdlib/HashMap.h
old mode 100644
new mode 100755
diff --git a/stdlib/PerfectHashMap.h b/stdlib/PerfectHashMap.h
old mode 100644
new mode 100755
diff --git a/stdlib/Simd.h b/stdlib/Simd.h
old mode 100644
new mode 100755
index 44d87e4..1e8e85e
--- a/stdlib/Simd.h
+++ b/stdlib/Simd.h
@@ -9,9 +9,16 @@
 #ifndef COMS_STDLIB_SIMD_H
 #define COMS_STDLIB_SIMD_H
 
+#include "../utils/TestUtils.h"
+
 // Adjusts the step size based on the memory alignment
 inline
 int32 intrin_validate_steps(const byte* mem, int32 steps) {
+    // During development we want to spot invalid alignment
+    ASSERT_SIMPLE(steps < 16 || (steps >= 16 && ((uintptr_t) mem & 63) == 0));
+    ASSERT_SIMPLE(steps < 8 || (steps >= 8 && ((uintptr_t) mem & 31) == 0));
+    ASSERT_SIMPLE(steps < 4 || (steps >= 4 && ((uintptr_t) mem & 15) == 0));
+
     if (steps >= 16 && ((uintptr_t) mem & 63) == 0) {
         return 16;
     } else if (steps >= 8 && ((uintptr_t) mem & 31) == 0) {
@@ -35,5 +42,4 @@ int32 intrin_validate_steps(const byte* mem, int32 steps) {
     #include "../architecture/x86/simd/SIMD_SVML.h"
 #endif
 
-
 #endif
\ No newline at end of file
diff --git a/stdlib/ThreadedHashMap.h b/stdlib/ThreadedHashMap.h
old mode 100644
new mode 100755
diff --git a/stdlib/Types.h b/stdlib/Types.h
old mode 100644
new mode 100755
diff --git a/system/Allocator.h b/system/Allocator.h
old mode 100644
new mode 100755
diff --git a/system/FileUtils.cpp b/system/FileUtils.cpp
old mode 100644
new mode 100755
diff --git a/system/Library.cpp b/system/Library.cpp
old mode 100644
new mode 100755
diff --git a/system/Library.h b/system/Library.h
old mode 100644
new mode 100755
diff --git a/system/SystemInfo.cpp b/system/SystemInfo.cpp
old mode 100644
new mode 100755
diff --git a/system/SystemInfo.h b/system/SystemInfo.h
old mode 100644
new mode 100755
diff --git a/system/Window.h b/system/Window.h
old mode 100644
new mode 100755
diff --git a/tests.bat b/tests.bat
old mode 100644
new mode 100755
diff --git a/tests/.vscode/c_cpp_properties.json b/tests/.vscode/c_cpp_properties.json
old mode 100644
new mode 100755
diff --git a/tests/.vscode/launch.json b/tests/.vscode/launch.json
old mode 100644
new mode 100755
diff --git a/tests/.vscode/settings.json b/tests/.vscode/settings.json
old mode 100644
new mode 100755
diff --git a/tests/.vscode/tasks.json b/tests/.vscode/tasks.json
old mode 100644
new mode 100755
diff --git a/tests/MainTest.cpp b/tests/MainTest.cpp
old mode 100644
new mode 100755
diff --git a/tests/TestFramework.h b/tests/TestFramework.h
old mode 100644
new mode 100755
diff --git a/tests/math/EvaluatorTest.cpp b/tests/math/EvaluatorTest.cpp
old mode 100644
new mode 100755
diff --git a/tests/memory/ChunkMemoryTest.cpp b/tests/memory/ChunkMemoryTest.cpp
old mode 100644
new mode 100755
diff --git a/tests/memory/RingMemoryTest.cpp b/tests/memory/RingMemoryTest.cpp
old mode 100644
new mode 100755
diff --git a/tests/stdlib/HashMapTest.cpp b/tests/stdlib/HashMapTest.cpp
old mode 100644
new mode 100755
diff --git a/tests/ui/UILayoutTest.cpp b/tests/ui/UILayoutTest.cpp
old mode 100644
new mode 100755
diff --git a/tests/ui/UIThemeTest.cpp b/tests/ui/UIThemeTest.cpp
old mode 100644
new mode 100755
diff --git a/tests/utils/BitUtilsTest.cpp b/tests/utils/BitUtilsTest.cpp
old mode 100644
new mode 100755
diff --git a/tests/utils/EndianUtilsTest.cpp b/tests/utils/EndianUtilsTest.cpp
old mode 100644
new mode 100755
diff --git a/tests/utils/MathUtilsTest.cpp b/tests/utils/MathUtilsTest.cpp
old mode 100644
new mode 100755
diff --git a/tests/utils/StringUtilsTest.cpp b/tests/utils/StringUtilsTest.cpp
old mode 100644
new mode 100755
diff --git a/tests/utils/UtilsTest.cpp b/tests/utils/UtilsTest.cpp
old mode 100644
new mode 100755
diff --git a/tests_iter.bat b/tests_iter.bat
old mode 100644
new mode 100755
diff --git a/thread/Atomic.h b/thread/Atomic.h
old mode 100644
new mode 100755
diff --git a/thread/Semaphore.h b/thread/Semaphore.h
old mode 100644
new mode 100755
diff --git a/thread/Spinlock.cpp b/thread/Spinlock.cpp
old mode 100644
new mode 100755
diff --git a/thread/Spinlock.h b/thread/Spinlock.h
old mode 100644
new mode 100755
diff --git a/thread/Thread.h b/thread/Thread.h
old mode 100644
new mode 100755
diff --git a/thread/ThreadDefines.h b/thread/ThreadDefines.h
old mode 100644
new mode 100755
diff --git a/thread/ThreadJob.h b/thread/ThreadJob.h
old mode 100644
new mode 100755
diff --git a/thread/ThreadPool.h b/thread/ThreadPool.h
old mode 100644
new mode 100755
diff --git a/ui/UIAlignment.h b/ui/UIAlignment.h
old mode 100644
new mode 100755
diff --git a/ui/UIAnimation.h b/ui/UIAnimation.h
old mode 100644
new mode 100755
diff --git a/ui/UIButton.h b/ui/UIButton.h
old mode 100644
new mode 100755
diff --git a/ui/UICursor.h b/ui/UICursor.h
old mode 100644
new mode 100755
diff --git a/ui/UICustom.h b/ui/UICustom.h
old mode 100644
new mode 100755
diff --git a/ui/UIElement.h b/ui/UIElement.h
old mode 100644
new mode 100755
diff --git a/ui/UIElementType.h b/ui/UIElementType.h
old mode 100644
new mode 100755
diff --git a/ui/UIImage.h b/ui/UIImage.h
old mode 100644
new mode 100755
diff --git a/ui/UIInput.h b/ui/UIInput.h
old mode 100644
new mode 100755
diff --git a/ui/UILabel.h b/ui/UILabel.h
old mode 100644
new mode 100755
diff --git a/ui/UILayout.cpp b/ui/UILayout.cpp
old mode 100644
new mode 100755
diff --git a/ui/UILayout.h b/ui/UILayout.h
old mode 100644
new mode 100755
diff --git a/ui/UILink.h b/ui/UILink.h
old mode 100644
new mode 100755
diff --git a/ui/UIPanel.h b/ui/UIPanel.h
old mode 100644
new mode 100755
diff --git a/ui/UISelect.h b/ui/UISelect.h
old mode 100644
new mode 100755
diff --git a/ui/UIStyleType.h b/ui/UIStyleType.h
old mode 100644
new mode 100755
diff --git a/ui/UITab.h b/ui/UITab.h
old mode 100644
new mode 100755
diff --git a/ui/UITable.h b/ui/UITable.h
old mode 100644
new mode 100755
diff --git a/ui/UIText.h b/ui/UIText.h
old mode 100644
new mode 100755
diff --git a/ui/UITextarea.h b/ui/UITextarea.h
old mode 100644
new mode 100755
diff --git a/ui/UITheme.h b/ui/UITheme.h
old mode 100644
new mode 100755
diff --git a/ui/UIWindow.h b/ui/UIWindow.h
old mode 100644
new mode 100755
diff --git a/ui/attribute/UIAttribute.h b/ui/attribute/UIAttribute.h
old mode 100644
new mode 100755
diff --git a/ui/attribute/UIAttributeBackground.h b/ui/attribute/UIAttributeBackground.h
old mode 100644
new mode 100755
diff --git a/ui/attribute/UIAttributeBorder.h b/ui/attribute/UIAttributeBorder.h
old mode 100644
new mode 100755
diff --git a/ui/attribute/UIAttributeDimension.h b/ui/attribute/UIAttributeDimension.h
old mode 100644
new mode 100755
diff --git a/ui/attribute/UIAttributeFont.h b/ui/attribute/UIAttributeFont.h
old mode 100644
new mode 100755
diff --git a/ui/attribute/UIAttributeShadow.h b/ui/attribute/UIAttributeShadow.h
old mode 100644
new mode 100755
diff --git a/ui/attribute/UIAttributeType.h b/ui/attribute/UIAttributeType.h
old mode 100644
new mode 100755
diff --git a/utils/BitUtils.h b/utils/BitUtils.h
old mode 100644
new mode 100755
diff --git a/utils/EndianUtils.h b/utils/EndianUtils.h
old mode 100644
new mode 100755
diff --git a/utils/MathUtils.h b/utils/MathUtils.h
old mode 100644
new mode 100755
diff --git a/utils/PerformanceProfiler.h b/utils/PerformanceProfiler.h
old mode 100644
new mode 100755
diff --git a/utils/RandomUtils.h b/utils/RandomUtils.h
old mode 100644
new mode 100755
diff --git a/utils/StringUtils.h b/utils/StringUtils.h
old mode 100644
new mode 100755
index 4029ac1..59acd69
--- a/utils/StringUtils.h
+++ b/utils/StringUtils.h
@@ -18,8 +18,7 @@
 #define HAS_CHAR(x, c) (HAS_ZERO((x) ^ (((size_t)-1 / 0xFF) * (c))))
 
 inline constexpr
-size_t str_length(const char* str) noexcept
-{
+size_t str_length(const char* str) noexcept {
     const char* ptr = str;
 
     // Align the pointer to the size of size_t
@@ -32,22 +31,15 @@ size_t str_length(const char* str) noexcept
     // Check one longword (size_t) at a time
     const size_t* longword_ptr = (const size_t *) ptr;
     while (true) {
-        size_t longword = *longword_ptr++;
-        if (HAS_ZERO(longword)) {
-            const char* cp = (const char *) (longword_ptr - 1);
-            if (cp[0] == '\0') return cp - str;
-            if (cp[1] == '\0') return cp + 1 - str;
-            if (cp[2] == '\0') return cp + 2 - str;
-            if (cp[3] == '\0') return cp + 3 - str;
-
-            // Are we using 8bytes for size_t?
-            #if SIZE_MAX > 0xFFFFFFFF
-                if (cp[4] == '\0') return cp + 4 - str;
-                if (cp[5] == '\0') return cp + 5 - str;
-                if (cp[6] == '\0') return cp + 6 - str;
-                if (cp[7] == '\0') return cp + 7 - str;
-            #endif
+        // Scan the current word one byte at a time and stop at the first NUL, so nothing past the terminator is read
+        const char* end_ptr = (const char *) longword_ptr + sizeof(size_t);
+        for (const char* cp = (const char *) longword_ptr; cp < end_ptr; ++cp) {
+            if (*cp == '\0') {
+                return cp - str;
+            }
         }
+
+        ++longword_ptr;
     }
 }
 
diff --git a/utils/TestUtils.h b/utils/TestUtils.h
old mode 100644
new mode 100755
diff --git a/utils/TimeUtils.h b/utils/TimeUtils.h
old mode 100644
new mode 100755
diff --git a/utils/Utils.h b/utils/Utils.h
old mode 100644
new mode 100755