From 39fbcf43002f753cdccffb8966e8b35b7bd1e01a Mon Sep 17 00:00:00 2001 From: Dennis Eichhorn Date: Sat, 22 Mar 2025 01:10:19 +0000 Subject: [PATCH] linux bug fixes --- .github/workflows/codeql.yml | 0 .github/workflows/msvc.yml | 0 .gitignore | 0 Guidelines.md | 0 README.md | 0 animation/Animation.h | 0 animation/AnimationEaseType.h | 0 architecture/CpuInfo.cpp | 0 architecture/CpuInfo.h | 0 architecture/Intrinsics.h | 0 architecture/arm/CpuInfo.cpp | 0 architecture/arm/Intrinsics.h | 4 + architecture/arm/neon/utils/Utils.h | 0 architecture/arm/sve/utils/Utils.h | 0 architecture/x86/CpuInfo.cpp | 0 architecture/x86/Intrinsics.h | 3 + architecture/x86/simd/SIMD_F32.h | 1373 ++--------- architecture/x86/simd/SIMD_F32_AVX2.h | 426 ++++ architecture/x86/simd/SIMD_F32_AVX512.h | 385 +++ architecture/x86/simd/SIMD_F32_SSE.h | 381 +++ architecture/x86/simd/SIMD_F64.h | 42 +- architecture/x86/simd/SIMD_F64_AVX2.h | 30 + architecture/x86/simd/SIMD_F64_AVX512.h | 29 + architecture/x86/simd/SIMD_F64_SSE.h | 29 + architecture/x86/simd/SIMD_I16.h | 860 +------ architecture/x86/simd/SIMD_I16_AVX2.h | 262 ++ architecture/x86/simd/SIMD_I16_AVX512.h | 265 ++ architecture/x86/simd/SIMD_I16_SSE.h | 261 ++ architecture/x86/simd/SIMD_I32.h | 2125 +++++------------ architecture/x86/simd/SIMD_I32_AVX2.h | 288 +++ architecture/x86/simd/SIMD_I32_AVX512.h | 309 +++ architecture/x86/simd/SIMD_I32_SSE.h | 286 +++ architecture/x86/simd/SIMD_I64.h | 43 +- architecture/x86/simd/SIMD_I64_AVX2.h | 29 + architecture/x86/simd/SIMD_I64_AVX512.h | 29 + architecture/x86/simd/SIMD_I64_SSE.h | 29 + architecture/x86/simd/SIMD_I8.h | 946 +------- architecture/x86/simd/SIMD_I8_AVX2.h | 265 ++ architecture/x86/simd/SIMD_I8_AVX512.h | 270 +++ architecture/x86/simd/SIMD_I8_SSE.h | 265 ++ architecture/x86/simd/SIMD_SVML.h | 160 +- architecture/x86/simd/SIMD_SVML_AVX2.h | 69 + architecture/x86/simd/SIMD_SVML_AVX512.h | 70 + architecture/x86/simd/SIMD_SVML_SSE.h | 70 + architecture/x86/simd/utils/Utils.h | 63 +- 
asset/Asset.h | 0 asset/AssetArchive.h | 0 asset/AssetManagementSystem.h | 0 asset/AssetType.h | 0 audio/Audio.cpp | 0 audio/Audio.h | 0 audio/AudioMixer.h | 0 audio/AudioSetting.h | 0 audio/Qoa.h | 0 audio/QoaSimd.h | 0 audio/Wav.h | 0 auth/Auth.h | 0 camera/Camera.h | 0 camera/CameraMovement.h | 0 color/ColorVisionDeficiency.h | 0 command/AppCmdBuffer.cpp | 0 command/AppCmdBuffer.h | 0 command/Command.h | 0 compiler/CompilerUtils.h | 0 compiler/gcc/Atomic.h | 0 compiler/gcc/CompilerUtils.h | 0 compiler/msvc/CompilerUtils.h | 0 compression/Huffman.h | 0 compression/LZP.h | 0 compression/RLE.h | 0 database/Database.h | 0 database/DatabaseConnection.h | 0 database/DatabaseType.h | 0 encryption/CeasarEncryption.h | 0 encryption/XorEncryption.h | 0 entity/AnimationEntityComponent.h | 0 entity/CursorEntity.h | 0 entity/Entity.h | 0 entity/EntityComponentSystem.h | 0 entity/EntitySize.h | 0 environment/Globe.h | 0 environment/Universe.h | 0 error/HammingCodes.h | 0 font/Font.h | 0 font/font_characters.txt | 0 gpuapi/AntiAliasing.h | 0 gpuapi/GpuApiType.h | 0 gpuapi/GpuAttributeType.h | 0 gpuapi/RenderUtils.h | 0 gpuapi/ShaderType.h | 0 gpuapi/direct3d/AppCmdBuffer.h | 0 gpuapi/direct3d/DirectXUtils.h | 0 gpuapi/direct3d/FramesInFlightContainer.h | 0 gpuapi/direct3d/Shader.h | 0 gpuapi/direct3d/ShaderUtils.h | 0 gpuapi/opengl/AppCmdBuffer.h | 0 gpuapi/opengl/FramesInFlightContainer.h | 0 gpuapi/opengl/Opengl.h | 0 gpuapi/opengl/OpenglDefines.h | 0 .../opengl/OpenglDescriptorSetLayoutBinding.h | 0 gpuapi/opengl/OpenglLinux.h | 0 gpuapi/opengl/OpenglUtils.h | 0 gpuapi/opengl/OpenglWin32.h | 0 gpuapi/opengl/Shader.h | 0 gpuapi/opengl/ShaderUtils.h | 0 gpuapi/vulkan/AppCmdBuffer.h | 0 gpuapi/vulkan/FramesInFlightContainer.h | 0 gpuapi/vulkan/Shader.h | 0 gpuapi/vulkan/ShaderUtils.h | 0 gpuapi/vulkan/VulkanUtils.h | 0 hash/Crc.h | 0 hash/GeneralHash.h | 4 +- html/template/HtmlTemplate.h | 0 html/template/HtmlTemplateCache.h | 0 html/template/HtmlTemplateContext.h | 0 
html/template/HtmlTemplateInterpreter.h | 0 html/template/HtmlTemplateLexer.h | 0 html/template/HtmlTemplateParser.h | 0 image/Bitmap.h | 0 image/Image.cpp | 0 image/Image.h | 0 image/Png.h | 0 image/Qoi.h | 0 image/Tga.h | 0 image/default_colors.h | 0 image/default_colors.htm | 0 image/stb_image.h | 0 input/ControllerInput.h | 0 input/ControllerType.h | 0 input/Input.h | 0 input/InputConnectionType.h | 0 light/Material.h | 0 localization/Dialog.h | 0 localization/Language.h | 0 log/DebugContainer.h | 0 log/DebugMemory.h | 0 log/Log.h | 14 +- log/PerformanceProfiler.h | 0 log/Stats.h | 0 math/Evaluator.h | 0 math/PerlinNoise.h | 0 math/matrix/MatrixFloat32.h | 12 +- math/matrix/MatrixInt32.h | 0 math/matrix/MatrixInt64.h | 0 math/matrix/QuaternionFloat32.h | 0 math/matrix/VectorFloat32.h | 0 math/matrix/VectorFloat64.h | 0 math/matrix/VectorInt32.h | 0 math/matrix/VectorInt64.h | 0 math/random/BlueNoise.h | 0 memory/BufferMemory.h | 0 memory/ChunkMemory.h | 0 memory/Heap.h | 0 memory/Queue.h | 0 memory/RingMemory.h | 0 memory/ThreadedChunkMemory.h | 0 memory/ThreadedQueue.h | 0 memory/ThreadedRingMemory.h | 0 models/Colors.h | 0 models/Location.h | 0 models/Map/map_chunks.h | 0 models/Obj.h | 0 models/Sound.h | 0 models/account/Account.h | 0 models/bracket/Bracket.h | 0 models/bracket/BracketMatch.h | 0 models/bracket/BracketSeeding.h | 0 models/bracket/BracketTeam.h | 0 models/chat/Chat.h | 0 models/chat/ChatLevel.h | 0 models/chat/ChatStatus.h | 0 models/chat/ChatType.h | 0 models/event/Event.h | 0 models/event/EventTaskType.h | 0 models/event/tmp | 0 models/extension/ExtensionType.h | 0 models/guild/GuildBanner.h | 0 models/item/Consumable.h | 0 models/item/ConsumableType.h | 0 models/item/Equipment.cpp | 0 models/item/Equipment.h | 0 models/item/EquipmentType.h | 0 models/item/Item.h | 0 models/item/ItemAffixDistribution.h | 0 models/item/ItemLevelStats.h | 0 models/item/ItemRarityDefinition.h | 0 models/item/ItemRarityStats.h | 0 
models/item/ItemStatsDistribution.h | 0 models/item/MobLevelStats.h | 0 models/item/_equipment_slots.h | 0 models/item/_equipment_types.h | 0 models/item/_item_rarity.h | 0 models/map.h | 0 models/mob/ActivityStats.h | 0 models/mob/FixedStats.h | 0 models/mob/Mob.cpp | 0 models/mob/Mob.h | 0 models/mob/MobAction.h | 0 models/mob/MobCategory.h | 0 models/mob/MobState.h | 0 models/mob/MobStats.cpp | 0 models/mob/MobStats.h | 0 models/mob/MobStatsType.h | 0 models/mob/PrimaryStatsPoints.cpp | 0 models/mob/PrimaryStatsPoints.h | 0 models/mob/SecondaryStatsPoints.cpp | 0 models/mob/SecondaryStatsPoints.h | 0 models/mob/_mob_category.h | 0 models/mob/_mob_list.h | 0 models/mob/monster/Drop.h | 0 models/mob/monster/LootTable.h | 0 models/mob/monster/Monster.h | 0 models/mob/monster/MonsterStats.h | 0 models/mob/player/Backpack.h | 0 models/mob/player/Guild.h | 0 models/mob/player/LootFilter.h | 0 models/mob/player/Player.cpp | 0 models/mob/player/Player.h | 0 models/mob/player/PlayerStats.h | 0 models/mob/player/PlayerXPRequirement.h | 0 models/mob/player/Reputation.h | 0 models/mob/player/_player_class.h | 0 models/mob/skill/AoeDistribution.h | 0 models/mob/skill/AoeShape.h | 0 models/mob/skill/ProjectileDistribution.h | 0 models/mob/skill/Skill.h | 0 models/mob/skill/SkillLocation.h | 0 models/mob/skill/StatsTarget.h | 0 models/mob/skill/definitions/arcane_bolt.cfg | 0 models/mob/skill/definitions/arise.cfg | 0 models/mob/skill/definitions/back_fist.cfg | 0 models/mob/skill/definitions/beam.cfg | 0 models/mob/skill/definitions/black_fist.cfg | 0 models/mob/skill/definitions/chain.cfg | 0 .../mob/skill/definitions/chain_lightning.cfg | 0 .../mob/skill/definitions/corruption_bolt.cfg | 0 models/mob/skill/definitions/cyclone.cfg | 0 models/mob/skill/definitions/dodge.cfg | 0 models/mob/skill/definitions/earth_bolt.cfg | 0 .../mob/skill/definitions/elemental_pilar.cfg | 0 models/mob/skill/definitions/fear.cfg | 0 models/mob/skill/definitions/fire_bolt.cfg | 0 
models/mob/skill/definitions/frost_bolt.cfg | 0 models/mob/skill/definitions/ghost_walk.cfg | 0 .../skill/definitions/health_inverse_dmg.cfg | 0 .../mob/skill/definitions/health_to_dmg.cfg | 0 models/mob/skill/definitions/holy_bolt.cfg | 0 models/mob/skill/definitions/hook.cfg | 0 models/mob/skill/definitions/kick.cfg | 0 .../mob/skill/definitions/launch_strike.cfg | 0 .../mob/skill/definitions/lightning_bolt.cfg | 0 .../mob/skill/definitions/meteor_strike.cfg | 0 models/mob/skill/definitions/mirage.cfg | 0 models/mob/skill/definitions/net.cfg | 0 models/mob/skill/definitions/palm_strike.cfg | 0 .../mob/skill/definitions/poison_strike.cfg | 0 models/mob/skill/definitions/pull.cfg | 0 models/mob/skill/definitions/punch.cfg | 0 models/mob/skill/definitions/push.cfg | 0 models/mob/skill/definitions/reflect.cfg | 0 models/mob/skill/definitions/revive.cfg | 0 models/mob/skill/definitions/root.cfg | 0 models/mob/skill/definitions/sacrafice.cfg | 0 models/mob/skill/definitions/shield.cfg | 0 models/mob/skill/definitions/side_kick.cfg | 0 models/mob/skill/definitions/spikes.cfg | 0 models/mob/skill/definitions/sprint.cfg | 0 models/mob/skill/definitions/stomp.cfg | 0 models/mob/skill/definitions/summon.cfg | 0 models/mob/skill/definitions/sword_dance.cfg | 0 models/mob/skill/definitions/teleport.cfg | 0 models/mob/skill/definitions/totem.cfg | 0 models/mob/skill/definitions/uppercut.cfg | 0 models/mob/skill/definitions/whirlwind.cfg | 0 models/mob/skill/definitions/wind_slashes.cfg | 0 models/mob/skill/modifiers/split_shot.cfg | 0 models/object/Block.cpp | 0 models/object/Block.h | 0 models/object/Chunk.h | 0 models/object/Cube.h | 0 models/object/Object.h | 0 models/object/ObjectType.h | 0 models/object/_object_list.h | 0 models/object/_object_types.h | 0 models/settings/DungeonSettings.h | 0 models/settings/ItemDistributionType.h | 0 models/settings/Settings.h | 0 models/settings/setting_types.h | 0 module/Module.h | 0 module/ModuleManager.h | 0 network/Server.h | 0 
network/Socket.h | 0 network/SocketConnection.h | 0 network/packet/OMSPacket.h | 0 network/packet/PacketCache.h | 0 network/packet/PacketHeader.h | 0 network/packet/UDPPacket.h | 0 network/packet/chat/ChatMessagePacket.h | 0 network/packet/general/AckPacket.h | 0 network/packet/general/PingPacket.h | 0 network/packet/mob/MobInfoPacket.h | 0 network/packet/mob/MobStatePacket.h | 0 network/packet/mob/player/PlayerInfoPacket.h | 0 network/packet/mob/player/PlayerState.h | 0 network/packet/packet_types.h | 0 noise/FractalNoise.h | 0 noise/PerlinNoise.h | 0 noise/SimplexNoise.h | 0 noise/ValueNoise.h | 0 noise/WorleyNoise.h | 0 object/Animation.h | 0 object/Hitbox.h | 0 object/Material.h | 0 object/Materials.md | 0 object/Mesh.h | 0 object/Model.h | 0 object/Texture.h | 0 object/Vertex.h | 0 particle/Particle.h | 0 pathfinding/Jpsp.h | 0 pathfinding/Metric2d.h | 0 pathfinding/Metric3d.h | 0 pathfinding/Path.h | 0 pathfinding/jps/Jps.h | 0 pathfinding/jps/JpsGrid.h | 0 pathfinding/jps/JpsNode.h | 0 platform/linux/Allocator.h | 0 platform/linux/ExceptionHandler.h | 0 platform/linux/FileUtils.cpp | 0 platform/linux/Library.cpp | 0 platform/linux/Library.h | 0 platform/linux/SystemInfo.cpp | 0 platform/linux/TimeUtils.h | 6 +- platform/linux/UtilsLinux.h | 0 platform/linux/network/Server.h | 0 platform/linux/network/Socket.h | 0 platform/linux/threading/Atomic.h | 0 platform/linux/threading/Semaphore.h | 0 platform/linux/threading/Spinlock.cpp | 0 platform/linux/threading/Spinlock.h | 0 platform/linux/threading/Thread.h | 0 platform/linux/threading/ThreadDefines.h | 0 platform/win32/Allocator.h | 0 platform/win32/Clipboard.h | 0 platform/win32/ExceptionHandler.h | 0 platform/win32/FastPipes.h | 0 platform/win32/FileUtils.cpp | 0 platform/win32/LeanWin32.h | 0 platform/win32/Library.cpp | 0 platform/win32/Library.h | 0 platform/win32/SystemInfo.cpp | 0 platform/win32/TimeUtils.h | 0 platform/win32/UtilsWin32.h | 0 platform/win32/UtilsWindows.h | 0 platform/win32/Window.h | 0 
platform/win32/audio/DirectSound.h | 0 platform/win32/audio/Wasapi.h | 0 platform/win32/audio/XAudio2.h | 0 platform/win32/input/DirectInput.h | 0 platform/win32/input/HidInput.h | 0 platform/win32/input/RawInput.h | 0 platform/win32/input/XInput.h | 0 .../input/controller/ControllerHandler.h | 0 platform/win32/input/controller/DualSense.h | 0 platform/win32/input/controller/DualShock4.h | 0 platform/win32/input/controller/XBoxS.h | 0 platform/win32/network/Client.h | 0 platform/win32/network/Server.h | 0 platform/win32/network/Socket.h | 0 platform/win32/threading/Atomic.h | 0 platform/win32/threading/Semaphore.h | 0 platform/win32/threading/Spinlock.cpp | 0 platform/win32/threading/Spinlock.h | 0 platform/win32/threading/Thread.h | 0 platform/win32/threading/ThreadDefines.h | 0 render/liquid.cpp | 0 render/mob.cpp | 0 render/object.cpp | 0 render/sky.cpp | 0 render/text.cpp | 0 scene/SceneInfo.h | 0 shaders/liquids/lava.hlsl | 0 shaders/liquids/water/cube_fragment.hlsl | 0 shaders/liquids/water/cube_vertex.hlsl | 0 shaders/liquids/water/helper.hlsli | 0 shaders/liquids/water/sphere_fragment.hlsl | 0 shaders/liquids/water/sphere_vertex.hlsl | 0 .../liquids/water/water_above_fragment.hlsl | 0 .../liquids/water/water_below_fragment.hlsl | 0 .../water/water_caustics_fragment.hlsl | 0 .../liquids/water/water_caustics_vertex.hlsl | 0 shaders/liquids/water/water_vertex.hlsl | 0 shaders/nature/cloud.hlsl | 0 shaders/nature/fire.hlsl | 0 shaders/nature/fog.hlsl | 0 shaders/nature/godray.hlsl | 0 shaders/nature/lightning.hlsl | 0 shaders/nature/rain.hlsl | 0 shaders/nature/smoke.hlsl | 0 shaders/nature/snow.hlsl | 0 shaders/shaders.hlsl | 0 sort/BinarySearch.h | 0 sort/EytzingerSearch.h | 0 sort/HeapSort.h | 0 sort/InsertionSort.h | 0 sort/IntroSort.h | 0 sort/QuickSort.h | 0 sort/Sort.h | 0 stdlib/HashMap.h | 0 stdlib/PerfectHashMap.h | 0 stdlib/Simd.h | 8 +- stdlib/ThreadedHashMap.h | 0 stdlib/Types.h | 0 system/Allocator.h | 0 system/FileUtils.cpp | 0 system/Library.cpp 
| 0 system/Library.h | 0 system/SystemInfo.cpp | 0 system/SystemInfo.h | 0 system/Window.h | 0 tests.bat | 0 tests/.vscode/c_cpp_properties.json | 0 tests/.vscode/launch.json | 0 tests/.vscode/settings.json | 0 tests/.vscode/tasks.json | 0 tests/MainTest.cpp | 0 tests/TestFramework.h | 0 tests/math/EvaluatorTest.cpp | 0 tests/memory/ChunkMemoryTest.cpp | 0 tests/memory/RingMemoryTest.cpp | 0 tests/stdlib/HashMapTest.cpp | 0 tests/ui/UILayoutTest.cpp | 0 tests/ui/UIThemeTest.cpp | 0 tests/utils/BitUtilsTest.cpp | 0 tests/utils/EndianUtilsTest.cpp | 0 tests/utils/MathUtilsTest.cpp | 0 tests/utils/StringUtilsTest.cpp | 0 tests/utils/UtilsTest.cpp | 0 tests_iter.bat | 0 thread/Atomic.h | 0 thread/Semaphore.h | 0 thread/Spinlock.cpp | 0 thread/Spinlock.h | 0 thread/Thread.h | 0 thread/ThreadDefines.h | 0 thread/ThreadJob.h | 0 thread/ThreadPool.h | 0 ui/UIAlignment.h | 0 ui/UIAnimation.h | 0 ui/UIButton.h | 0 ui/UICursor.h | 0 ui/UICustom.h | 0 ui/UIElement.h | 0 ui/UIElementType.h | 0 ui/UIImage.h | 0 ui/UIInput.h | 0 ui/UILabel.h | 0 ui/UILayout.cpp | 0 ui/UILayout.h | 0 ui/UILink.h | 0 ui/UIPanel.h | 0 ui/UISelect.h | 0 ui/UIStyleType.h | 0 ui/UITab.h | 0 ui/UITable.h | 0 ui/UIText.h | 0 ui/UITextarea.h | 0 ui/UITheme.h | 0 ui/UIWindow.h | 0 ui/attribute/UIAttribute.h | 0 ui/attribute/UIAttributeBackground.h | 0 ui/attribute/UIAttributeBorder.h | 0 ui/attribute/UIAttributeDimension.h | 0 ui/attribute/UIAttributeFont.h | 0 ui/attribute/UIAttributeShadow.h | 0 ui/attribute/UIAttributeType.h | 0 utils/BitUtils.h | 0 utils/EndianUtils.h | 0 utils/MathUtils.h | 0 utils/PerformanceProfiler.h | 0 utils/RandomUtils.h | 0 utils/StringUtils.h | 26 +- utils/TestUtils.h | 0 utils/TimeUtils.h | 0 utils/Utils.h | 0 485 files changed, 5108 insertions(+), 4628 deletions(-) mode change 100644 => 100755 .github/workflows/codeql.yml mode change 100644 => 100755 .github/workflows/msvc.yml mode change 100644 => 100755 .gitignore mode change 100644 => 100755 Guidelines.md mode change 
100644 => 100755 README.md mode change 100644 => 100755 animation/Animation.h mode change 100644 => 100755 animation/AnimationEaseType.h mode change 100644 => 100755 architecture/CpuInfo.cpp mode change 100644 => 100755 architecture/CpuInfo.h mode change 100644 => 100755 architecture/Intrinsics.h mode change 100644 => 100755 architecture/arm/CpuInfo.cpp mode change 100644 => 100755 architecture/arm/Intrinsics.h mode change 100644 => 100755 architecture/arm/neon/utils/Utils.h mode change 100644 => 100755 architecture/arm/sve/utils/Utils.h mode change 100644 => 100755 architecture/x86/CpuInfo.cpp mode change 100644 => 100755 architecture/x86/Intrinsics.h mode change 100644 => 100755 architecture/x86/simd/SIMD_F32.h create mode 100644 architecture/x86/simd/SIMD_F32_AVX2.h create mode 100644 architecture/x86/simd/SIMD_F32_AVX512.h create mode 100644 architecture/x86/simd/SIMD_F32_SSE.h mode change 100644 => 100755 architecture/x86/simd/SIMD_F64.h create mode 100644 architecture/x86/simd/SIMD_F64_AVX2.h create mode 100644 architecture/x86/simd/SIMD_F64_AVX512.h create mode 100644 architecture/x86/simd/SIMD_F64_SSE.h mode change 100644 => 100755 architecture/x86/simd/SIMD_I16.h create mode 100644 architecture/x86/simd/SIMD_I16_AVX2.h create mode 100644 architecture/x86/simd/SIMD_I16_AVX512.h create mode 100644 architecture/x86/simd/SIMD_I16_SSE.h mode change 100644 => 100755 architecture/x86/simd/SIMD_I32.h create mode 100644 architecture/x86/simd/SIMD_I32_AVX2.h create mode 100644 architecture/x86/simd/SIMD_I32_AVX512.h create mode 100644 architecture/x86/simd/SIMD_I32_SSE.h mode change 100644 => 100755 architecture/x86/simd/SIMD_I64.h create mode 100644 architecture/x86/simd/SIMD_I64_AVX2.h create mode 100644 architecture/x86/simd/SIMD_I64_AVX512.h create mode 100644 architecture/x86/simd/SIMD_I64_SSE.h mode change 100644 => 100755 architecture/x86/simd/SIMD_I8.h create mode 100644 architecture/x86/simd/SIMD_I8_AVX2.h create mode 100644 
architecture/x86/simd/SIMD_I8_AVX512.h create mode 100644 architecture/x86/simd/SIMD_I8_SSE.h create mode 100644 architecture/x86/simd/SIMD_SVML_AVX2.h create mode 100755 architecture/x86/simd/SIMD_SVML_AVX512.h create mode 100644 architecture/x86/simd/SIMD_SVML_SSE.h mode change 100644 => 100755 architecture/x86/simd/utils/Utils.h mode change 100644 => 100755 asset/Asset.h mode change 100644 => 100755 asset/AssetArchive.h mode change 100644 => 100755 asset/AssetManagementSystem.h mode change 100644 => 100755 asset/AssetType.h mode change 100644 => 100755 audio/Audio.cpp mode change 100644 => 100755 audio/Audio.h mode change 100644 => 100755 audio/AudioMixer.h mode change 100644 => 100755 audio/AudioSetting.h mode change 100644 => 100755 audio/Qoa.h mode change 100644 => 100755 audio/QoaSimd.h mode change 100644 => 100755 audio/Wav.h mode change 100644 => 100755 auth/Auth.h mode change 100644 => 100755 camera/Camera.h mode change 100644 => 100755 camera/CameraMovement.h mode change 100644 => 100755 color/ColorVisionDeficiency.h mode change 100644 => 100755 command/AppCmdBuffer.cpp mode change 100644 => 100755 command/AppCmdBuffer.h mode change 100644 => 100755 command/Command.h mode change 100644 => 100755 compiler/CompilerUtils.h mode change 100644 => 100755 compiler/gcc/Atomic.h mode change 100644 => 100755 compiler/gcc/CompilerUtils.h mode change 100644 => 100755 compiler/msvc/CompilerUtils.h mode change 100644 => 100755 compression/Huffman.h mode change 100644 => 100755 compression/LZP.h mode change 100644 => 100755 compression/RLE.h mode change 100644 => 100755 database/Database.h mode change 100644 => 100755 database/DatabaseConnection.h mode change 100644 => 100755 database/DatabaseType.h mode change 100644 => 100755 encryption/CeasarEncryption.h mode change 100644 => 100755 encryption/XorEncryption.h mode change 100644 => 100755 entity/AnimationEntityComponent.h mode change 100644 => 100755 entity/CursorEntity.h mode change 100644 => 100755 entity/Entity.h 
mode change 100644 => 100755 entity/EntityComponentSystem.h mode change 100644 => 100755 entity/EntitySize.h mode change 100644 => 100755 environment/Globe.h mode change 100644 => 100755 environment/Universe.h mode change 100644 => 100755 error/HammingCodes.h mode change 100644 => 100755 font/Font.h mode change 100644 => 100755 font/font_characters.txt mode change 100644 => 100755 gpuapi/AntiAliasing.h mode change 100644 => 100755 gpuapi/GpuApiType.h mode change 100644 => 100755 gpuapi/GpuAttributeType.h mode change 100644 => 100755 gpuapi/RenderUtils.h mode change 100644 => 100755 gpuapi/ShaderType.h mode change 100644 => 100755 gpuapi/direct3d/AppCmdBuffer.h mode change 100644 => 100755 gpuapi/direct3d/DirectXUtils.h mode change 100644 => 100755 gpuapi/direct3d/FramesInFlightContainer.h mode change 100644 => 100755 gpuapi/direct3d/Shader.h mode change 100644 => 100755 gpuapi/direct3d/ShaderUtils.h mode change 100644 => 100755 gpuapi/opengl/AppCmdBuffer.h mode change 100644 => 100755 gpuapi/opengl/FramesInFlightContainer.h mode change 100644 => 100755 gpuapi/opengl/Opengl.h mode change 100644 => 100755 gpuapi/opengl/OpenglDefines.h mode change 100644 => 100755 gpuapi/opengl/OpenglDescriptorSetLayoutBinding.h mode change 100644 => 100755 gpuapi/opengl/OpenglLinux.h mode change 100644 => 100755 gpuapi/opengl/OpenglUtils.h mode change 100644 => 100755 gpuapi/opengl/OpenglWin32.h mode change 100644 => 100755 gpuapi/opengl/Shader.h mode change 100644 => 100755 gpuapi/opengl/ShaderUtils.h mode change 100644 => 100755 gpuapi/vulkan/AppCmdBuffer.h mode change 100644 => 100755 gpuapi/vulkan/FramesInFlightContainer.h mode change 100644 => 100755 gpuapi/vulkan/Shader.h mode change 100644 => 100755 gpuapi/vulkan/ShaderUtils.h mode change 100644 => 100755 gpuapi/vulkan/VulkanUtils.h mode change 100644 => 100755 hash/Crc.h mode change 100644 => 100755 hash/GeneralHash.h mode change 100644 => 100755 html/template/HtmlTemplate.h mode change 100644 => 100755 
html/template/HtmlTemplateCache.h mode change 100644 => 100755 html/template/HtmlTemplateContext.h mode change 100644 => 100755 html/template/HtmlTemplateInterpreter.h mode change 100644 => 100755 html/template/HtmlTemplateLexer.h mode change 100644 => 100755 html/template/HtmlTemplateParser.h mode change 100644 => 100755 image/Bitmap.h mode change 100644 => 100755 image/Image.cpp mode change 100644 => 100755 image/Image.h mode change 100644 => 100755 image/Png.h mode change 100644 => 100755 image/Qoi.h mode change 100644 => 100755 image/Tga.h mode change 100644 => 100755 image/default_colors.h mode change 100644 => 100755 image/default_colors.htm mode change 100644 => 100755 image/stb_image.h mode change 100644 => 100755 input/ControllerInput.h mode change 100644 => 100755 input/ControllerType.h mode change 100644 => 100755 input/Input.h mode change 100644 => 100755 input/InputConnectionType.h mode change 100644 => 100755 light/Material.h mode change 100644 => 100755 localization/Dialog.h mode change 100644 => 100755 localization/Language.h mode change 100644 => 100755 log/DebugContainer.h mode change 100644 => 100755 log/DebugMemory.h mode change 100644 => 100755 log/Log.h mode change 100644 => 100755 log/PerformanceProfiler.h mode change 100644 => 100755 log/Stats.h mode change 100644 => 100755 math/Evaluator.h mode change 100644 => 100755 math/PerlinNoise.h mode change 100644 => 100755 math/matrix/MatrixFloat32.h mode change 100644 => 100755 math/matrix/MatrixInt32.h mode change 100644 => 100755 math/matrix/MatrixInt64.h mode change 100644 => 100755 math/matrix/QuaternionFloat32.h mode change 100644 => 100755 math/matrix/VectorFloat32.h mode change 100644 => 100755 math/matrix/VectorFloat64.h mode change 100644 => 100755 math/matrix/VectorInt32.h mode change 100644 => 100755 math/matrix/VectorInt64.h mode change 100644 => 100755 math/random/BlueNoise.h mode change 100644 => 100755 memory/BufferMemory.h mode change 100644 => 100755 memory/ChunkMemory.h mode 
change 100644 => 100755 memory/Heap.h mode change 100644 => 100755 memory/Queue.h mode change 100644 => 100755 memory/RingMemory.h mode change 100644 => 100755 memory/ThreadedChunkMemory.h mode change 100644 => 100755 memory/ThreadedQueue.h mode change 100644 => 100755 memory/ThreadedRingMemory.h mode change 100644 => 100755 models/Colors.h mode change 100644 => 100755 models/Location.h mode change 100644 => 100755 models/Map/map_chunks.h mode change 100644 => 100755 models/Obj.h mode change 100644 => 100755 models/Sound.h mode change 100644 => 100755 models/account/Account.h mode change 100644 => 100755 models/bracket/Bracket.h mode change 100644 => 100755 models/bracket/BracketMatch.h mode change 100644 => 100755 models/bracket/BracketSeeding.h mode change 100644 => 100755 models/bracket/BracketTeam.h mode change 100644 => 100755 models/chat/Chat.h mode change 100644 => 100755 models/chat/ChatLevel.h mode change 100644 => 100755 models/chat/ChatStatus.h mode change 100644 => 100755 models/chat/ChatType.h mode change 100644 => 100755 models/event/Event.h mode change 100644 => 100755 models/event/EventTaskType.h mode change 100644 => 100755 models/event/tmp mode change 100644 => 100755 models/extension/ExtensionType.h mode change 100644 => 100755 models/guild/GuildBanner.h mode change 100644 => 100755 models/item/Consumable.h mode change 100644 => 100755 models/item/ConsumableType.h mode change 100644 => 100755 models/item/Equipment.cpp mode change 100644 => 100755 models/item/Equipment.h mode change 100644 => 100755 models/item/EquipmentType.h mode change 100644 => 100755 models/item/Item.h mode change 100644 => 100755 models/item/ItemAffixDistribution.h mode change 100644 => 100755 models/item/ItemLevelStats.h mode change 100644 => 100755 models/item/ItemRarityDefinition.h mode change 100644 => 100755 models/item/ItemRarityStats.h mode change 100644 => 100755 models/item/ItemStatsDistribution.h mode change 100644 => 100755 models/item/MobLevelStats.h mode change 
100644 => 100755 models/item/_equipment_slots.h mode change 100644 => 100755 models/item/_equipment_types.h mode change 100644 => 100755 models/item/_item_rarity.h mode change 100644 => 100755 models/map.h mode change 100644 => 100755 models/mob/ActivityStats.h mode change 100644 => 100755 models/mob/FixedStats.h mode change 100644 => 100755 models/mob/Mob.cpp mode change 100644 => 100755 models/mob/Mob.h mode change 100644 => 100755 models/mob/MobAction.h mode change 100644 => 100755 models/mob/MobCategory.h mode change 100644 => 100755 models/mob/MobState.h mode change 100644 => 100755 models/mob/MobStats.cpp mode change 100644 => 100755 models/mob/MobStats.h mode change 100644 => 100755 models/mob/MobStatsType.h mode change 100644 => 100755 models/mob/PrimaryStatsPoints.cpp mode change 100644 => 100755 models/mob/PrimaryStatsPoints.h mode change 100644 => 100755 models/mob/SecondaryStatsPoints.cpp mode change 100644 => 100755 models/mob/SecondaryStatsPoints.h mode change 100644 => 100755 models/mob/_mob_category.h mode change 100644 => 100755 models/mob/_mob_list.h mode change 100644 => 100755 models/mob/monster/Drop.h mode change 100644 => 100755 models/mob/monster/LootTable.h mode change 100644 => 100755 models/mob/monster/Monster.h mode change 100644 => 100755 models/mob/monster/MonsterStats.h mode change 100644 => 100755 models/mob/player/Backpack.h mode change 100644 => 100755 models/mob/player/Guild.h mode change 100644 => 100755 models/mob/player/LootFilter.h mode change 100644 => 100755 models/mob/player/Player.cpp mode change 100644 => 100755 models/mob/player/Player.h mode change 100644 => 100755 models/mob/player/PlayerStats.h mode change 100644 => 100755 models/mob/player/PlayerXPRequirement.h mode change 100644 => 100755 models/mob/player/Reputation.h mode change 100644 => 100755 models/mob/player/_player_class.h mode change 100644 => 100755 models/mob/skill/AoeDistribution.h mode change 100644 => 100755 models/mob/skill/AoeShape.h mode change 
100644 => 100755 models/mob/skill/ProjectileDistribution.h mode change 100644 => 100755 models/mob/skill/Skill.h mode change 100644 => 100755 models/mob/skill/SkillLocation.h mode change 100644 => 100755 models/mob/skill/StatsTarget.h mode change 100644 => 100755 models/mob/skill/definitions/arcane_bolt.cfg mode change 100644 => 100755 models/mob/skill/definitions/arise.cfg mode change 100644 => 100755 models/mob/skill/definitions/back_fist.cfg mode change 100644 => 100755 models/mob/skill/definitions/beam.cfg mode change 100644 => 100755 models/mob/skill/definitions/black_fist.cfg mode change 100644 => 100755 models/mob/skill/definitions/chain.cfg mode change 100644 => 100755 models/mob/skill/definitions/chain_lightning.cfg mode change 100644 => 100755 models/mob/skill/definitions/corruption_bolt.cfg mode change 100644 => 100755 models/mob/skill/definitions/cyclone.cfg mode change 100644 => 100755 models/mob/skill/definitions/dodge.cfg mode change 100644 => 100755 models/mob/skill/definitions/earth_bolt.cfg mode change 100644 => 100755 models/mob/skill/definitions/elemental_pilar.cfg mode change 100644 => 100755 models/mob/skill/definitions/fear.cfg mode change 100644 => 100755 models/mob/skill/definitions/fire_bolt.cfg mode change 100644 => 100755 models/mob/skill/definitions/frost_bolt.cfg mode change 100644 => 100755 models/mob/skill/definitions/ghost_walk.cfg mode change 100644 => 100755 models/mob/skill/definitions/health_inverse_dmg.cfg mode change 100644 => 100755 models/mob/skill/definitions/health_to_dmg.cfg mode change 100644 => 100755 models/mob/skill/definitions/holy_bolt.cfg mode change 100644 => 100755 models/mob/skill/definitions/hook.cfg mode change 100644 => 100755 models/mob/skill/definitions/kick.cfg mode change 100644 => 100755 models/mob/skill/definitions/launch_strike.cfg mode change 100644 => 100755 models/mob/skill/definitions/lightning_bolt.cfg mode change 100644 => 100755 models/mob/skill/definitions/meteor_strike.cfg mode change 100644 
=> 100755 models/mob/skill/definitions/mirage.cfg mode change 100644 => 100755 models/mob/skill/definitions/net.cfg mode change 100644 => 100755 models/mob/skill/definitions/palm_strike.cfg mode change 100644 => 100755 models/mob/skill/definitions/poison_strike.cfg mode change 100644 => 100755 models/mob/skill/definitions/pull.cfg mode change 100644 => 100755 models/mob/skill/definitions/punch.cfg mode change 100644 => 100755 models/mob/skill/definitions/push.cfg mode change 100644 => 100755 models/mob/skill/definitions/reflect.cfg mode change 100644 => 100755 models/mob/skill/definitions/revive.cfg mode change 100644 => 100755 models/mob/skill/definitions/root.cfg mode change 100644 => 100755 models/mob/skill/definitions/sacrafice.cfg mode change 100644 => 100755 models/mob/skill/definitions/shield.cfg mode change 100644 => 100755 models/mob/skill/definitions/side_kick.cfg mode change 100644 => 100755 models/mob/skill/definitions/spikes.cfg mode change 100644 => 100755 models/mob/skill/definitions/sprint.cfg mode change 100644 => 100755 models/mob/skill/definitions/stomp.cfg mode change 100644 => 100755 models/mob/skill/definitions/summon.cfg mode change 100644 => 100755 models/mob/skill/definitions/sword_dance.cfg mode change 100644 => 100755 models/mob/skill/definitions/teleport.cfg mode change 100644 => 100755 models/mob/skill/definitions/totem.cfg mode change 100644 => 100755 models/mob/skill/definitions/uppercut.cfg mode change 100644 => 100755 models/mob/skill/definitions/whirlwind.cfg mode change 100644 => 100755 models/mob/skill/definitions/wind_slashes.cfg mode change 100644 => 100755 models/mob/skill/modifiers/split_shot.cfg mode change 100644 => 100755 models/object/Block.cpp mode change 100644 => 100755 models/object/Block.h mode change 100644 => 100755 models/object/Chunk.h mode change 100644 => 100755 models/object/Cube.h mode change 100644 => 100755 models/object/Object.h mode change 100644 => 100755 models/object/ObjectType.h mode change 100644 => 
100755 models/object/_object_list.h mode change 100644 => 100755 models/object/_object_types.h mode change 100644 => 100755 models/settings/DungeonSettings.h mode change 100644 => 100755 models/settings/ItemDistributionType.h mode change 100644 => 100755 models/settings/Settings.h mode change 100644 => 100755 models/settings/setting_types.h mode change 100644 => 100755 module/Module.h mode change 100644 => 100755 module/ModuleManager.h mode change 100644 => 100755 network/Server.h mode change 100644 => 100755 network/Socket.h mode change 100644 => 100755 network/SocketConnection.h mode change 100644 => 100755 network/packet/OMSPacket.h mode change 100644 => 100755 network/packet/PacketCache.h mode change 100644 => 100755 network/packet/PacketHeader.h mode change 100644 => 100755 network/packet/UDPPacket.h mode change 100644 => 100755 network/packet/chat/ChatMessagePacket.h mode change 100644 => 100755 network/packet/general/AckPacket.h mode change 100644 => 100755 network/packet/general/PingPacket.h mode change 100644 => 100755 network/packet/mob/MobInfoPacket.h mode change 100644 => 100755 network/packet/mob/MobStatePacket.h mode change 100644 => 100755 network/packet/mob/player/PlayerInfoPacket.h mode change 100644 => 100755 network/packet/mob/player/PlayerState.h mode change 100644 => 100755 network/packet/packet_types.h mode change 100644 => 100755 noise/FractalNoise.h mode change 100644 => 100755 noise/PerlinNoise.h mode change 100644 => 100755 noise/SimplexNoise.h mode change 100644 => 100755 noise/ValueNoise.h mode change 100644 => 100755 noise/WorleyNoise.h mode change 100644 => 100755 object/Animation.h mode change 100644 => 100755 object/Hitbox.h mode change 100644 => 100755 object/Material.h mode change 100644 => 100755 object/Materials.md mode change 100644 => 100755 object/Mesh.h mode change 100644 => 100755 object/Model.h mode change 100644 => 100755 object/Texture.h mode change 100644 => 100755 object/Vertex.h mode change 100644 => 100755 
particle/Particle.h mode change 100644 => 100755 pathfinding/Jpsp.h mode change 100644 => 100755 pathfinding/Metric2d.h mode change 100644 => 100755 pathfinding/Metric3d.h mode change 100644 => 100755 pathfinding/Path.h mode change 100644 => 100755 pathfinding/jps/Jps.h mode change 100644 => 100755 pathfinding/jps/JpsGrid.h mode change 100644 => 100755 pathfinding/jps/JpsNode.h mode change 100644 => 100755 platform/linux/Allocator.h mode change 100644 => 100755 platform/linux/ExceptionHandler.h mode change 100644 => 100755 platform/linux/FileUtils.cpp mode change 100644 => 100755 platform/linux/Library.cpp mode change 100644 => 100755 platform/linux/Library.h mode change 100644 => 100755 platform/linux/SystemInfo.cpp mode change 100644 => 100755 platform/linux/TimeUtils.h mode change 100644 => 100755 platform/linux/UtilsLinux.h mode change 100644 => 100755 platform/linux/network/Server.h mode change 100644 => 100755 platform/linux/network/Socket.h mode change 100644 => 100755 platform/linux/threading/Atomic.h mode change 100644 => 100755 platform/linux/threading/Semaphore.h mode change 100644 => 100755 platform/linux/threading/Spinlock.cpp mode change 100644 => 100755 platform/linux/threading/Spinlock.h mode change 100644 => 100755 platform/linux/threading/Thread.h mode change 100644 => 100755 platform/linux/threading/ThreadDefines.h mode change 100644 => 100755 platform/win32/Allocator.h mode change 100644 => 100755 platform/win32/Clipboard.h mode change 100644 => 100755 platform/win32/ExceptionHandler.h mode change 100644 => 100755 platform/win32/FastPipes.h mode change 100644 => 100755 platform/win32/FileUtils.cpp mode change 100644 => 100755 platform/win32/LeanWin32.h mode change 100644 => 100755 platform/win32/Library.cpp mode change 100644 => 100755 platform/win32/Library.h mode change 100644 => 100755 platform/win32/SystemInfo.cpp mode change 100644 => 100755 platform/win32/TimeUtils.h mode change 100644 => 100755 platform/win32/UtilsWin32.h mode change 
100644 => 100755 platform/win32/UtilsWindows.h mode change 100644 => 100755 platform/win32/Window.h mode change 100644 => 100755 platform/win32/audio/DirectSound.h mode change 100644 => 100755 platform/win32/audio/Wasapi.h mode change 100644 => 100755 platform/win32/audio/XAudio2.h mode change 100644 => 100755 platform/win32/input/DirectInput.h mode change 100644 => 100755 platform/win32/input/HidInput.h mode change 100644 => 100755 platform/win32/input/RawInput.h mode change 100644 => 100755 platform/win32/input/XInput.h mode change 100644 => 100755 platform/win32/input/controller/ControllerHandler.h mode change 100644 => 100755 platform/win32/input/controller/DualSense.h mode change 100644 => 100755 platform/win32/input/controller/DualShock4.h mode change 100644 => 100755 platform/win32/input/controller/XBoxS.h mode change 100644 => 100755 platform/win32/network/Client.h mode change 100644 => 100755 platform/win32/network/Server.h mode change 100644 => 100755 platform/win32/network/Socket.h mode change 100644 => 100755 platform/win32/threading/Atomic.h mode change 100644 => 100755 platform/win32/threading/Semaphore.h mode change 100644 => 100755 platform/win32/threading/Spinlock.cpp mode change 100644 => 100755 platform/win32/threading/Spinlock.h mode change 100644 => 100755 platform/win32/threading/Thread.h mode change 100644 => 100755 platform/win32/threading/ThreadDefines.h mode change 100644 => 100755 render/liquid.cpp mode change 100644 => 100755 render/mob.cpp mode change 100644 => 100755 render/object.cpp mode change 100644 => 100755 render/sky.cpp mode change 100644 => 100755 render/text.cpp mode change 100644 => 100755 scene/SceneInfo.h mode change 100644 => 100755 shaders/liquids/lava.hlsl mode change 100644 => 100755 shaders/liquids/water/cube_fragment.hlsl mode change 100644 => 100755 shaders/liquids/water/cube_vertex.hlsl mode change 100644 => 100755 shaders/liquids/water/helper.hlsli mode change 100644 => 100755 
shaders/liquids/water/sphere_fragment.hlsl mode change 100644 => 100755 shaders/liquids/water/sphere_vertex.hlsl mode change 100644 => 100755 shaders/liquids/water/water_above_fragment.hlsl mode change 100644 => 100755 shaders/liquids/water/water_below_fragment.hlsl mode change 100644 => 100755 shaders/liquids/water/water_caustics_fragment.hlsl mode change 100644 => 100755 shaders/liquids/water/water_caustics_vertex.hlsl mode change 100644 => 100755 shaders/liquids/water/water_vertex.hlsl mode change 100644 => 100755 shaders/nature/cloud.hlsl mode change 100644 => 100755 shaders/nature/fire.hlsl mode change 100644 => 100755 shaders/nature/fog.hlsl mode change 100644 => 100755 shaders/nature/godray.hlsl mode change 100644 => 100755 shaders/nature/lightning.hlsl mode change 100644 => 100755 shaders/nature/rain.hlsl mode change 100644 => 100755 shaders/nature/smoke.hlsl mode change 100644 => 100755 shaders/nature/snow.hlsl mode change 100644 => 100755 shaders/shaders.hlsl mode change 100644 => 100755 sort/BinarySearch.h mode change 100644 => 100755 sort/EytzingerSearch.h mode change 100644 => 100755 sort/HeapSort.h mode change 100644 => 100755 sort/InsertionSort.h mode change 100644 => 100755 sort/IntroSort.h mode change 100644 => 100755 sort/QuickSort.h mode change 100644 => 100755 sort/Sort.h mode change 100644 => 100755 stdlib/HashMap.h mode change 100644 => 100755 stdlib/PerfectHashMap.h mode change 100644 => 100755 stdlib/Simd.h mode change 100644 => 100755 stdlib/ThreadedHashMap.h mode change 100644 => 100755 stdlib/Types.h mode change 100644 => 100755 system/Allocator.h mode change 100644 => 100755 system/FileUtils.cpp mode change 100644 => 100755 system/Library.cpp mode change 100644 => 100755 system/Library.h mode change 100644 => 100755 system/SystemInfo.cpp mode change 100644 => 100755 system/SystemInfo.h mode change 100644 => 100755 system/Window.h mode change 100644 => 100755 tests.bat mode change 100644 => 100755 tests/.vscode/c_cpp_properties.json mode 
change 100644 => 100755 tests/.vscode/launch.json mode change 100644 => 100755 tests/.vscode/settings.json mode change 100644 => 100755 tests/.vscode/tasks.json mode change 100644 => 100755 tests/MainTest.cpp mode change 100644 => 100755 tests/TestFramework.h mode change 100644 => 100755 tests/math/EvaluatorTest.cpp mode change 100644 => 100755 tests/memory/ChunkMemoryTest.cpp mode change 100644 => 100755 tests/memory/RingMemoryTest.cpp mode change 100644 => 100755 tests/stdlib/HashMapTest.cpp mode change 100644 => 100755 tests/ui/UILayoutTest.cpp mode change 100644 => 100755 tests/ui/UIThemeTest.cpp mode change 100644 => 100755 tests/utils/BitUtilsTest.cpp mode change 100644 => 100755 tests/utils/EndianUtilsTest.cpp mode change 100644 => 100755 tests/utils/MathUtilsTest.cpp mode change 100644 => 100755 tests/utils/StringUtilsTest.cpp mode change 100644 => 100755 tests/utils/UtilsTest.cpp mode change 100644 => 100755 tests_iter.bat mode change 100644 => 100755 thread/Atomic.h mode change 100644 => 100755 thread/Semaphore.h mode change 100644 => 100755 thread/Spinlock.cpp mode change 100644 => 100755 thread/Spinlock.h mode change 100644 => 100755 thread/Thread.h mode change 100644 => 100755 thread/ThreadDefines.h mode change 100644 => 100755 thread/ThreadJob.h mode change 100644 => 100755 thread/ThreadPool.h mode change 100644 => 100755 ui/UIAlignment.h mode change 100644 => 100755 ui/UIAnimation.h mode change 100644 => 100755 ui/UIButton.h mode change 100644 => 100755 ui/UICursor.h mode change 100644 => 100755 ui/UICustom.h mode change 100644 => 100755 ui/UIElement.h mode change 100644 => 100755 ui/UIElementType.h mode change 100644 => 100755 ui/UIImage.h mode change 100644 => 100755 ui/UIInput.h mode change 100644 => 100755 ui/UILabel.h mode change 100644 => 100755 ui/UILayout.cpp mode change 100644 => 100755 ui/UILayout.h mode change 100644 => 100755 ui/UILink.h mode change 100644 => 100755 ui/UIPanel.h mode change 100644 => 100755 ui/UISelect.h mode change 
100644 => 100755 ui/UIStyleType.h mode change 100644 => 100755 ui/UITab.h mode change 100644 => 100755 ui/UITable.h mode change 100644 => 100755 ui/UIText.h mode change 100644 => 100755 ui/UITextarea.h mode change 100644 => 100755 ui/UITheme.h mode change 100644 => 100755 ui/UIWindow.h mode change 100644 => 100755 ui/attribute/UIAttribute.h mode change 100644 => 100755 ui/attribute/UIAttributeBackground.h mode change 100644 => 100755 ui/attribute/UIAttributeBorder.h mode change 100644 => 100755 ui/attribute/UIAttributeDimension.h mode change 100644 => 100755 ui/attribute/UIAttributeFont.h mode change 100644 => 100755 ui/attribute/UIAttributeShadow.h mode change 100644 => 100755 ui/attribute/UIAttributeType.h mode change 100644 => 100755 utils/BitUtils.h mode change 100644 => 100755 utils/EndianUtils.h mode change 100644 => 100755 utils/MathUtils.h mode change 100644 => 100755 utils/PerformanceProfiler.h mode change 100644 => 100755 utils/RandomUtils.h mode change 100644 => 100755 utils/StringUtils.h mode change 100644 => 100755 utils/TestUtils.h mode change 100644 => 100755 utils/TimeUtils.h mode change 100644 => 100755 utils/Utils.h diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml old mode 100644 new mode 100755 diff --git a/.github/workflows/msvc.yml b/.github/workflows/msvc.yml old mode 100644 new mode 100755 diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/Guidelines.md b/Guidelines.md old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/animation/Animation.h b/animation/Animation.h old mode 100644 new mode 100755 diff --git a/animation/AnimationEaseType.h b/animation/AnimationEaseType.h old mode 100644 new mode 100755 diff --git a/architecture/CpuInfo.cpp b/architecture/CpuInfo.cpp old mode 100644 new mode 100755 diff --git a/architecture/CpuInfo.h b/architecture/CpuInfo.h old mode 100644 new mode 100755 diff --git a/architecture/Intrinsics.h 
b/architecture/Intrinsics.h old mode 100644 new mode 100755 diff --git a/architecture/arm/CpuInfo.cpp b/architecture/arm/CpuInfo.cpp old mode 100644 new mode 100755 diff --git a/architecture/arm/Intrinsics.h b/architecture/arm/Intrinsics.h old mode 100644 new mode 100755 index 08bfb69..b727536 --- a/architecture/arm/Intrinsics.h +++ b/architecture/arm/Intrinsics.h @@ -11,6 +11,7 @@ #include #include +#include #include "../../stdlib/Types.h" #include "../../compiler/CompilerUtils.h" @@ -50,4 +51,7 @@ #define intrin_timestamp_counter() __builtin_readcyclecounter() #endif +// a * b + c +#define intrin_fmadd(a, b, c) vgetq_lane_f32(vmlaq_f32(vdupq_n_f32(c), vdupq_n_f32(a), vdupq_n_f32(b)), 0) + #endif \ No newline at end of file diff --git a/architecture/arm/neon/utils/Utils.h b/architecture/arm/neon/utils/Utils.h old mode 100644 new mode 100755 diff --git a/architecture/arm/sve/utils/Utils.h b/architecture/arm/sve/utils/Utils.h old mode 100644 new mode 100755 diff --git a/architecture/x86/CpuInfo.cpp b/architecture/x86/CpuInfo.cpp old mode 100644 new mode 100755 diff --git a/architecture/x86/Intrinsics.h b/architecture/x86/Intrinsics.h old mode 100644 new mode 100755 index 20319e9..53f08bb --- a/architecture/x86/Intrinsics.h +++ b/architecture/x86/Intrinsics.h @@ -58,6 +58,9 @@ #define intrin_prefetch_l2(mem) _mm_prefetch((const char *) (mem), _MM_HINT_T1) #define intrin_prefetch_l3(mem) _mm_prefetch((const char *) (mem), _MM_HINT_T2) +// a * b + c +#define intrin_fmadd(a, b, c) _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a), _mm_set_ss(b), _mm_set_ss(c))) + inline uint64 intrin_timestamp_counter() noexcept { _mm_mfence(); diff --git a/architecture/x86/simd/SIMD_F32.h b/architecture/x86/simd/SIMD_F32.h old mode 100644 new mode 100755 index edf374d..2051b8c --- a/architecture/x86/simd/SIMD_F32.h +++ b/architecture/x86/simd/SIMD_F32.h @@ -13,989 +13,18 @@ #include #include "../../../stdlib/Types.h" -#include "SIMD_SVML.h" -struct f32_4 { - union { - #if ARM - svfloat32_t s; - 
#else - __m128 s; - #endif +#ifdef MACRO_CPU_FEATURE_SSE42 + #include "SIMD_F32_SSE.h" +#endif - f32 v[4]; - }; -}; +#ifdef MACRO_CPU_FEATURE_AVX2 + #include "SIMD_F32_AVX2.h" +#endif -struct f32_8 { - union { - #if ARM - svfloat32_t s; - #else - __m256 s; - #endif - - f32 v[8]; - }; -}; - -struct f32_16 { - union { - #if ARM - svfloat32_t s; - #else - __m512 s; - #endif - - f32 v[16]; - }; -}; - -inline f32_4 load_f32_4(const f32* mem) -{ - f32_4 simd; - simd.s = _mm_load_ps(mem); - - return simd; -} - -inline f32_4 init_f32_4(const f32* mem) -{ - f32_4 simd; - simd.s = _mm_set_ps(mem[0], mem[1], mem[2], mem[3]); - - return simd; -} - -inline void unload_f32_4(f32_4 a, f32 *array) { _mm_store_ps(array, a.s); } - -inline f32_8 load_f32_8(const f32* mem) -{ - f32_8 simd; - simd.s = _mm256_load_ps(mem); - - return simd; -} - -inline f32_8 init_f32_8(const f32* mem) -{ - f32_8 simd; - simd.s = _mm256_set_ps(mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7]); - - return simd; -} - -inline void unload_f32_8(f32_8 a, f32 *array) { _mm256_store_ps(array, a.s); } - -inline f32_16 load_f32_16(const f32* mem) -{ - f32_16 simd; - simd.s = _mm512_load_ps(mem); - - return simd; -} - -inline f32_16 init_f32_16(const f32* mem) -{ - f32_16 simd; - simd.s = _mm512_set_ps(mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7], mem[8], mem[9], mem[10], - mem[11], mem[12], mem[13], mem[14], mem[15]); - - return simd; -} - -inline void unload_f32_16(f32_16 a, f32 *array) { _mm512_store_ps(array, a.s); } - -inline f32_4 init_zero_f32_4() -{ - f32_4 simd; - simd.s = _mm_setzero_ps(); - - return simd; -} - -inline f32_8 init_zero_f32_8() -{ - f32_8 simd; - simd.s = _mm256_setzero_ps(); - - return simd; -} - -inline f32_16 init_zero_f32_16() -{ - f32_16 simd; - simd.s = _mm512_setzero_ps(); - - return simd; -} - -inline f32_4 init_value_f32_4(f32 value) -{ - f32_4 simd; - simd.s = _mm_set1_ps(value); - - return simd; -} - -inline f32_8 init_value_f32_8(f32 value) -{ - 
f32_8 simd; - simd.s = _mm256_set1_ps(value); - - return simd; -} - -inline f32_16 init_value_f32_16(f32 value) -{ - f32_16 simd; - simd.s = _mm512_set1_ps(value); - - return simd; -} - -inline f32_4 init_values_f32_4(f32 a, f32 b, f32 c, f32 d) -{ - f32_4 simd; - simd.s = _mm_set_ps(a, b, c, d); - - return simd; -} - -inline f32_8 init_values_f32_8( - f32 a, f32 b, f32 c, f32 d, - f32 e, f32 f, f32 g, f32 h -) -{ - f32_8 simd; - simd.s = _mm256_set_ps(a, b, c, d, e, f, g, h); - - return simd; -} - -inline f32_16 init_values_f32_16( - f32 a, f32 b, f32 c, f32 d, - f32 e, f32 f, f32 g, f32 h, - f32 i, f32 j, f32 k, f32 l, - f32 m, f32 n, f32 o, f32 p -) -{ - f32_16 simd; - simd.s = _mm512_set_ps(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p); - - return simd; -} - -inline f32_4 operator+(f32_4 a, f32_4 b) -{ - f32_4 simd; - simd.s = _mm_add_ps(a.s, b.s); - - return simd; -} - -inline f32_8 operator+(f32_8 a, f32_8 b) -{ - f32_8 simd; - simd.s = _mm256_add_ps(a.s, b.s); - - return simd; -} - -inline f32_16 operator+(f32_16 a, f32_16 b) -{ - f32_16 simd; - simd.s = _mm512_add_ps(a.s, b.s); - - return simd; -} - -inline f32_4 operator-(f32_4 a, f32_4 b) -{ - f32_4 simd; - simd.s = _mm_sub_ps(a.s, b.s); - - return simd; -} - -inline f32_4 operator-(f32_4 a) { return init_zero_f32_4() - a; } - -inline f32_8 operator-(f32_8 a, f32_8 b) -{ - f32_8 simd; - simd.s = _mm256_sub_ps(a.s, b.s); - - return simd; -} - -inline f32_8 operator-(f32_8 a) { return init_zero_f32_8() - a; } - -inline f32_16 operator-(f32_16 a, f32_16 b) -{ - f32_16 simd; - simd.s = _mm512_sub_ps(a.s, b.s); - - return simd; -} - -inline f32_16 operator-(f32_16 a) { return init_zero_f32_16() - a; } - -inline f32_4 operator*(f32_4 a, f32_4 b) -{ - f32_4 simd; - simd.s = _mm_mul_ps(a.s, b.s); - - return simd; -} - -inline f32_8 operator*(f32_8 a, f32_8 b) -{ - f32_8 simd; - simd.s = _mm256_mul_ps(a.s, b.s); - - return simd; -} - -inline f32_16 operator*(f32_16 a, f32_16 b) -{ - f32_16 simd; - simd.s = 
_mm512_mul_ps(a.s, b.s); - - return simd; -} - -inline f32_4 operator/(f32_4 a, f32_4 b) -{ - f32_4 simd; - simd.s = _mm_div_ps(a.s, b.s); - - return simd; -} - -inline f32_8 operator/(f32_8 a, f32_8 b) -{ - f32_8 simd; - simd.s = _mm256_div_ps(a.s, b.s); - - return simd; -} - -inline f32_16 operator/(f32_16 a, f32_16 b) -{ - f32_16 simd; - simd.s = _mm512_div_ps(a.s, b.s); - - return simd; -} - -inline f32_4 operator^(f32_4 a, f32_4 b) -{ - f32_4 simd; - simd.s = _mm_xor_ps(a.s, b.s); - - return simd; -} - -inline f32_8 operator^(f32_8 a, f32_8 b) -{ - f32_8 simd; - simd.s = _mm256_xor_ps(a.s, b.s); - - return simd; -} - -inline f32_16 operator^(f32_16 a, f32_16 b) -{ - f32_16 simd; - simd.s = _mm512_xor_ps(a.s, b.s); - - return simd; -} - -inline f32_4 &operator-=(f32_4 &a, f32_4 b) -{ - a = a - b; - - return a; -} - -inline f32_8 &operator-=(f32_8 &a, f32_8 b) -{ - a = a - b; - - return a; -} - -inline f32_16 &operator-=(f32_16 &a, f32_16 b) -{ - a = a - b; - - return a; -} - -inline f32_4 &operator+=(f32_4 &a, f32_4 b) -{ - a = a + b; - - return a; -} - -inline f32_8 &operator+=(f32_8 &a, f32_8 b) -{ - a = a + b; - - return a; -} - -inline f32_16 &operator+=(f32_16 &a, f32_16 b) -{ - a = a + b; - - return a; -} - -inline f32_4 &operator*=(f32_4 &a, f32_4 b) -{ - a = a * b; - - return a; -} - -inline f32_8 &operator*=(f32_8 &a, f32_8 b) -{ - a = a * b; - - return a; -} - -inline f32_16 &operator*=(f32_16 &a, f32_16 b) -{ - a = a * b; - - return a; -} - -inline f32_4 &operator/=(f32_4 &a, f32_4 b) -{ - a = a / b; - - return a; -} - -inline f32_8 &operator/=(f32_8 &a, f32_8 b) -{ - a = a / b; - - return a; -} - -inline f32_16 &operator/=(f32_16 &a, f32_16 b) -{ - a = a / b; - - return a; -} - -inline f32_4 &operator^=(f32_4 &a, f32_4 b) -{ - a = a ^ b; - - return a; -} - -inline f32_8 &operator^=(f32_8 &a, f32_8 b) -{ - a = a ^ b; - - return a; -} - -inline f32_16 &operator^=(f32_16 &a, f32_16 b) -{ - a = a ^ b; - - return a; -} - -inline f32_4 operator<(f32_4 a, 
f32_4 b) -{ - f32_4 simd; - simd.s = _mm_cmplt_ps(a.s, b.s); - - return simd; -} - -inline f32_8 operator<(f32_8 a, f32_8 b) -{ - f32_8 simd; - simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_LT_OQ); - - return simd; -} - -inline f32_16 operator<(f32_16 a, f32_16 b) -{ - f32_16 simd; - simd.s = _mm512_mask_blend_ps(_mm512_cmplt_ps_mask(a.s, b.s), a.s, b.s); - - return simd; -} - -inline f32_4 operator<=(f32_4 a, f32_4 b) -{ - f32_4 simd; - simd.s = _mm_cmple_ps(a.s, b.s); - - return simd; -} - -inline f32_8 operator<=(f32_8 a, f32_8 b) -{ - f32_8 simd; - simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_LE_OQ); - - return simd; -} - -inline f32_16 operator<=(f32_16 a, f32_16 b) -{ - f32_16 simd; - simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_LE_OQ), a.s, b.s); - - return simd; -} - -inline f32_4 operator>(f32_4 a, f32_4 b) -{ - f32_4 simd; - simd.s = _mm_cmpgt_ps(a.s, b.s); - - return simd; -} - -inline f32_8 operator>(f32_8 a, f32_8 b) -{ - f32_8 simd; - simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_GT_OQ); - - return simd; -} - -inline f32_16 operator>(f32_16 a, f32_16 b) -{ - f32_16 simd; - simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_GT_OQ), a.s, b.s); - - return simd; -} - -inline f32_4 operator>=(f32_4 a, f32_4 b) -{ - f32_4 simd; - simd.s = _mm_cmpge_ps(a.s, b.s); - - return simd; -} - -inline f32_8 operator>=(f32_8 a, f32_8 b) -{ - f32_8 simd; - simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_GE_OQ); - - return simd; -} - -inline f32_16 operator>=(f32_16 a, f32_16 b) -{ - f32_16 simd; - simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_GE_OQ), a.s, b.s); - - return simd; -} - -inline f32_4 operator==(f32_4 a, f32_4 b) -{ - f32_4 simd; - simd.s = _mm_cmpeq_ps(a.s, b.s); - - return simd; -} - -inline f32_8 operator==(f32_8 a, f32_8 b) -{ - f32_8 simd; - simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_EQ_OQ); - - return simd; -} - -inline f32_16 operator==(f32_16 a, f32_16 b) -{ - f32_16 simd; - simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, 
_CMP_EQ_OQ), a.s, b.s); - - return simd; -} - -inline f32_4 operator!=(f32_4 a, f32_4 b) -{ - f32_4 simd; - simd.s = _mm_cmpneq_ps(a.s, b.s); - - return simd; -} - -inline f32_8 operator!=(f32_8 a, f32_8 b) -{ - f32_8 simd; - simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_NEQ_OQ); - - return simd; -} - -inline f32_16 operator!=(f32_16 a, f32_16 b) -{ - f32_16 simd; - simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_NEQ_OQ), a.s, b.s); - - return simd; -} - -inline f32_4 operator&(f32_4 a, f32_4 b) -{ - f32_4 simd; - simd.s = _mm_and_ps(a.s, b.s); - - return simd; -} - -inline f32_8 operator&(f32_8 a, f32_8 b) -{ - f32_8 simd; - simd.s = _mm256_and_ps(a.s, b.s); - - return simd; -} - -inline f32_16 operator&(f32_16 a, f32_16 b) -{ - f32_16 simd; - simd.s = _mm512_and_ps(a.s, b.s); - - return simd; -} - -inline f32_4 operator|(f32_4 a, f32_4 b) -{ - f32_4 simd; - simd.s = _mm_or_ps(a.s, b.s); - - return simd; -} - -inline f32_8 operator|(f32_8 a, f32_8 b) -{ - f32_8 simd; - simd.s = _mm256_or_ps(a.s, b.s); - - return simd; -} - -inline f32_16 operator|(f32_16 a, f32_16 b) -{ - f32_16 simd; - simd.s = _mm512_or_ps(a.s, b.s); - - return simd; -} - -inline f32_4 &operator&=(f32_4 &a, f32_4 b) -{ - a = a & b; - - return a; -} - -inline f32_8 &operator&=(f32_8 &a, f32_8 b) -{ - a = a & b; - - return a; -} - -inline f32_16 &operator&=(f32_16 &a, f32_16 b) -{ - a = a & b; - - return a; -} - -inline f32_4 &operator|=(f32_4 &a, f32_4 b) -{ - a = a | b; - - return a; -} - -inline f32_8 &operator|=(f32_8 &a, f32_8 b) -{ - a = a | b; - - return a; -} - -inline f32_16 &operator|=(f32_16 &a, f32_16 b) -{ - a = a | b; - - return a; -} - -inline f32_4 abs(f32_4 a) -{ - uint32 unsigned_mask = (uint32) (1U << 31); - __m128 mask = _mm_set1_ps(*(f32 *) &unsigned_mask); - - f32_4 simd; - simd.s = _mm_and_ps(a.s, mask); - - return simd; -} - -inline f32_8 abs(f32_8 a) -{ - uint32 unsigned_mask = (uint32) (1U << 31); - __m256 mask = _mm256_set1_ps(*(f32 *) &unsigned_mask); - - f32_8 
simd; - simd.s = _mm256_and_ps(a.s, mask); - - return simd; -} - -inline f32_16 abs(f32_16 a) -{ - f32_16 simd; - simd.s = _mm512_abs_ps(a.s); - - return simd; -} - -inline f32_4 simd_min(f32_4 a, f32_4 b) -{ - f32_4 simd; - simd.s = _mm_min_ps(a.s, b.s); - - return simd; -} - -inline f32_8 simd_min(f32_8 a, f32_8 b) -{ - f32_8 simd; - simd.s = _mm256_min_ps(a.s, b.s); - - return simd; -} - -inline f32_16 simd_min(f32_16 a, f32_16 b) -{ - f32_16 simd; - simd.s = _mm512_min_ps(a.s, b.s); - - return simd; -} - -inline f32_4 simd_max(f32_4 a, f32_4 b) -{ - f32_4 simd; - simd.s = _mm_max_ps(a.s, b.s); - - return simd; -} - -inline f32_8 simd_max(f32_8 a, f32_8 b) -{ - f32_8 simd; - simd.s = _mm256_max_ps(a.s, b.s); - - return simd; -} - -inline f32_16 simd_max(f32_16 a, f32_16 b) -{ - f32_16 simd; - simd.s = _mm512_max_ps(a.s, b.s); - - return simd; -} - -inline f32_4 sign(f32_4 a) -{ - uint32 umask = (uint32) (1U << 31); - __m128 mask = _mm_set1_ps(*(f32 *) &umask); - - f32_4 signBit; - signBit.s = _mm_and_ps(a.s, mask); - - f32_4 b; - b.s = _mm_set1_ps(1.0f); - - f32_4 simd = b | signBit; - - return simd; -} - -inline f32_8 sign(f32_8 a) -{ - uint32 umask = (uint32) (1U << 31); - __m256 mask = _mm256_set1_ps(*(f32 *) &umask); - - f32_8 signBit; - signBit.s = _mm256_and_ps(a.s, mask); - - f32_8 b; - b.s = _mm256_set1_ps(1.0f); - - f32_8 simd = b | signBit; - - return simd; -} - -inline f32_16 sign(f32_16 a) -{ - uint32 umask = (uint32) (1U << 31); - __m512 mask = _mm512_set1_ps(*(f32 *) &umask); - - f32_16 signBit; - signBit.s = _mm512_and_ps(a.s, mask); - - f32_16 b; - b.s = _mm512_set1_ps(1.0f); - - f32_16 simd = b | signBit; - - return simd; -} - -inline f32_4 floor(f32_4 a) -{ - f32_4 simd; - simd.s = _mm_floor_ps(a.s); - - return simd; -} - -inline f32_8 floor(f32_8 a) -{ - f32_8 simd; - simd.s = _mm256_floor_ps(a.s); - - return simd; -} - -inline f32_16 floor(f32_16 a) -{ - f32_16 simd; - simd.s = _mm512_floor_ps(a.s); - - return simd; -} - -inline f32_4 
ceil(f32_4 a) -{ - f32_4 simd; - simd.s = _mm_ceil_ps(a.s); - - return simd; -} - -inline f32_8 ceil(f32_8 a) -{ - f32_8 simd; - simd.s = _mm256_ceil_ps(a.s); - - return simd; -} - -inline f32_16 ceil(f32_16 a) -{ - f32_16 simd; - simd.s = _mm512_ceil_ps(a.s); - - return simd; -} - -inline f32_4 sqrt(f32_4 a) -{ - f32_4 simd; - simd.s = _mm_sqrt_ps(a.s); - - return simd; -} - -inline f32_8 sqrt(f32_8 a) -{ - f32_8 simd; - simd.s = _mm256_sqrt_ps(a.s); - - return simd; -} - -inline f32_16 sqrt(f32_16 a) -{ - f32_16 simd; - simd.s = _mm512_sqrt_ps(a.s); - - return simd; -} - -inline f32_4 sqrt_inv_approx(f32_4 a) -{ - f32_4 simd; - simd.s = _mm_rsqrt_ps(a.s); - - return simd; -} - -inline f32_8 sqrt_inv_approx(f32_8 a) -{ - f32_8 simd; - simd.s = _mm256_rsqrt_ps(a.s); - - return simd; -} - -inline f32_16 sqrt_inv_approx(f32_16 a) -{ - f32_16 simd; - simd.s = _mm512_rsqrt14_ps(a.s); - - return simd; -} - -inline f32_4 one_over_approx(f32_4 a) -{ - f32_4 simd; - simd.s = _mm_rcp_ps(a.s); - - return simd; -} - -inline f32_8 one_over_approx(f32_8 a) -{ - f32_8 simd; - simd.s = _mm256_rcp_ps(a.s); - - return simd; -} - -inline f32_16 one_over_approx(f32_16 a) -{ - f32_16 simd; - simd.s = _mm512_rcp14_ps(a.s); - - return simd; -} - -inline f32_4 clamp(f32_4 min_value, f32_4 a, f32_4 max_value) -{ - return simd_min(simd_max(a, min_value), max_value); -} - -inline f32_8 clamp(f32_8 min_value, f32_8 a, f32_8 max_value) -{ - return simd_min(simd_max(a, min_value), max_value); -} - -inline f32_16 clamp(f32_16 min_value, f32_16 a, f32_16 max_value) -{ - return simd_min(simd_max(a, min_value), max_value); -} - -inline int32 which_true(f32_4 a) -{ - int32 which_true = _mm_movemask_ps(a.s); - - return which_true; -} - -inline int32 which_true(f32_8 a) -{ - int32 which_true = _mm256_movemask_ps(a.s); - - return which_true; -} - -inline int32 which_true(f32_16 a) -{ - int32 which_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)); - - return which_true; -} - -inline bool 
any_true(f32_4 a) -{ - bool is_any_true = _mm_movemask_ps(a.s) > 0; - - return is_any_true; -} - -inline bool any_true(f32_8 a) -{ - bool is_any_true = _mm256_movemask_ps(a.s) > 0; - - return is_any_true; -} - -inline bool any_true(f32_16 a) -{ - bool is_any_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) > 0; - - return is_any_true; -} - -inline bool all_true(f32_4 a) -{ - bool is_true = _mm_movemask_ps(a.s) == 15; - - return is_true; -} - -inline bool all_true(f32_8 a) -{ - bool is_true = _mm256_movemask_ps(a.s) == 255; - - return is_true; -} - -inline bool all_true(f32_16 a) -{ - bool is_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 65535; - - return is_true; -} - -inline bool all_false(f32_4 a) -{ - bool is_false = _mm_movemask_ps(a.s) == 0; - - return is_false; -} - -inline bool all_false(f32_8 a) -{ - bool is_false = _mm256_movemask_ps(a.s) == 0; - - return is_false; -} - -inline bool all_false(f32_16 a) -{ - // @todo This can be optimized (requires also changes in the comparison functions return) - bool is_false = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 0; - - return is_false; -} +#ifdef MACRO_CPU_FEATURE_AVX512 + #include "SIMD_F32_AVX512.h" +#endif // @todo from down here we can optimize some of the code by NOT using the wrappers // the code is self contained and we could use te intrinsic functions directly @@ -1004,53 +33,73 @@ inline void simd_mult(const f32* a, const f32* b, f32* result, int32 size, int32 steps) { int32 i = 0; + steps = intrin_validate_steps((const byte*) a, steps); + steps = intrin_validate_steps((const byte*) b, steps); + steps = intrin_validate_steps((const byte*) result, steps); - if (steps == 16) { - __m512 a_16; - __m512 b_16; - __m512 result_16; + #ifdef MACRO_CPU_FEATURE_AVX512 + if (steps >= 16) { + steps = 16; + __m512 a_16; + __m512 b_16; + __m512 result_16; - for (; i <= size - steps; i += steps) { - a_16 = _mm512_load_ps(a); - b_16 = _mm512_load_ps(b); - result_16 = _mm512_mul_ps(a_16, b_16); - 
_mm512_store_ps(result, result_16); + for (; i <= size - steps; i += steps) { + a_16 = _mm512_load_ps(a); + b_16 = _mm512_load_ps(b); + result_16 = _mm512_mul_ps(a_16, b_16); + _mm512_store_ps(result, result_16); - a += steps; - b += steps; - result += steps; - } - } else if (steps == 8) { - __m256 a_8; - __m256 b_8; - __m256 result_8; + a += steps; + b += steps; + result += steps; + } - for (; i <= size - steps; i += steps) { - a_8 = _mm256_load_ps(a); - b_8 = _mm256_load_ps(b); - result_8 = _mm256_mul_ps(a_8, b_8); - _mm256_store_ps(result, result_8); + steps = 1; + } + #endif - a += steps; - b += steps; - result += steps; - } - } else if (steps == 4) { - __m128 a_4; - __m128 b_4; - __m128 result_4; + #ifdef MACRO_CPU_FEATURE_AVX2 + if (steps >= 8) { + steps = 8; + __m256 a_8; + __m256 b_8; + __m256 result_8; - for (; i <= size - steps; i += steps) { - a_4 = _mm_load_ps(a); - b_4 = _mm_load_ps(b); - result_4 = _mm_mul_ps(a_4, b_4); - _mm_store_ps(result, result_4); + for (; i <= size - steps; i += steps) { + a_8 = _mm256_load_ps(a); + b_8 = _mm256_load_ps(b); + result_8 = _mm256_mul_ps(a_8, b_8); + _mm256_store_ps(result, result_8); - a += steps; - b += steps; - result += steps; - } - } + a += steps; + b += steps; + result += steps; + } + + steps = 1; + } + #endif + + #ifdef MACRO_CPU_FEATURE_SSE42 + if (steps >= 4) { + steps = 4; + __m128 a_4; + __m128 b_4; + __m128 result_4; + + for (; i <= size - steps; i += steps) { + a_4 = _mm_load_ps(a); + b_4 = _mm_load_ps(b); + result_4 = _mm_mul_ps(a_4, b_4); + _mm_store_ps(result, result_4); + + a += steps; + b += steps; + result += steps; + } + } + #endif for (; i < size; ++i) { *result = *a * *b; @@ -1065,21 +114,30 @@ inline void simd_mult(const f32* a, f32 b, f32* result, int32 size, int32 steps) { int32 i = 0; + steps = intrin_validate_steps((const byte*) a, steps); + steps = intrin_validate_steps((const byte*) result, steps); - if (steps == 16) { - __m512 a_16; - __m512 b_16 = _mm512_set1_ps(b); - __m512 
result_16; + #ifdef MACRO_CPU_FEATURE_AVX512 + if (steps >= 16) { + steps = 16; + __m512 a_16; + __m512 b_16 = _mm512_set1_ps(b); + __m512 result_16; - for (; i <= size - steps; i += steps) { - a_16 = _mm512_load_ps(a); - result_16 = _mm512_mul_ps(a_16, b_16); - _mm512_store_ps(result, result_16); + for (; i <= size - steps; i += steps) { + a_16 = _mm512_load_ps(a); + result_16 = _mm512_mul_ps(a_16, b_16); + _mm512_store_ps(result, result_16); - a += steps; - result += steps; - } - } else if (steps == 8) { + a += steps; + result += steps; + } + } + #endif + + #ifdef MACRO_CPU_FEATURE_AVX2 + if (steps >= 8) { + steps = 8; __m256 a_8; __m256 b_8 = _mm256_set1_ps(b); __m256 result_8; @@ -1091,8 +149,13 @@ void simd_mult(const f32* a, f32 b, f32* result, int32 size, int32 steps) a += steps; result += steps; - } - } else if (steps == 4) { + } + } + #endif + + #ifdef MACRO_CPU_FEATURE_SSE42 + if (steps >= 4) { + steps = 4; __m128 a_4; __m128 b_4 = _mm_set1_ps(b); __m128 result_4; @@ -1104,8 +167,9 @@ void simd_mult(const f32* a, f32 b, f32* result, int32 size, int32 steps) a += steps; result += steps; - } + } } + #endif for (; i < size; ++i) { *result = *a * b; @@ -1119,48 +183,64 @@ inline void simd_div(const f32* a, f32 b, f32* result, int32 size, int32 steps) { int32 i = 0; + steps = intrin_validate_steps((const byte*) a, steps); + steps = intrin_validate_steps((const byte*) result, steps); - if (steps == 16) { - __m512 a_16; - __m512 b_16 = _mm512_set1_ps(b); - __m512 result_16; + #ifdef MACRO_CPU_FEATURE_AVX512 + if (steps >= 16) { + steps = 16; + __m512 a_16; + __m512 b_16 = _mm512_set1_ps(b); + __m512 result_16; - for (; i <= size - steps; i += steps) { - a_16 = _mm512_load_ps(a); - result_16 = _mm512_div_ps(a_16, b_16); - _mm512_store_ps(result, result_16); + for (; i <= size - steps; i += steps) { + a_16 = _mm512_load_ps(a); + result_16 = _mm512_div_ps(a_16, b_16); + _mm512_store_ps(result, result_16); - a += steps; - result += steps; - } - } else if (steps == 
8) { - __m256 a_8; - __m256 b_8 = _mm256_set1_ps(b); - __m256 result_8; + a += steps; + result += steps; + } + } + #endif - for (; i <= size - steps; i += steps) { - a_8 = _mm256_load_ps(a); - result_8 = _mm256_div_ps(a_8, b_8); - _mm256_store_ps(result, result_8); + #ifdef MACRO_CPU_FEATURE_AVX2 + if (steps >= 8) { + steps = 8; + __m256 a_8; + __m256 b_8 = _mm256_set1_ps(b); + __m256 result_8; - a += steps; - result += steps; - } - } else if (steps == 4) { - __m128 a_4; - __m128 b_4 = _mm_set1_ps(b); - __m128 result_4; + for (; i <= size - steps; i += steps) { + a_8 = _mm256_load_ps(a); + result_8 = _mm256_div_ps(a_8, b_8); + _mm256_store_ps(result, result_8); - for (; i <= size - steps; i += steps) { - a_4 = _mm_load_ps(a); - result_4 = _mm_div_ps(a_4, b_4); - _mm_store_ps(result, result_4); + a += steps; + result += steps; + } + } + #endif - a += steps; - result += steps; - } - } + #ifdef MACRO_CPU_FEATURE_SSE42 + if (steps >= 4) { + steps = 4; + __m128 a_4; + __m128 b_4 = _mm_set1_ps(b); + __m128 result_4; + for (; i <= size - steps; i += steps) { + a_4 = _mm_load_ps(a); + result_4 = _mm_div_ps(a_4, b_4); + _mm_store_ps(result, result_4); + + a += steps; + result += steps; + } + } + #endif + + // Scalar fallback for (; i < size; ++i) { *result = *a / b; @@ -1169,159 +249,4 @@ void simd_div(const f32* a, f32 b, f32* result, int32 size, int32 steps) } } -inline -void simd_div(const f32* a, f32 b, __m256* result, int32 size) -{ - int32 i = 0; - int32 j = 0; - - // @todo this his how all the functions should be implemented that take in baseic types and output basic types - __m256 a_8; - __m256 b_8 = _mm256_set1_ps(b); - __m256 result_8; - - for (; i <= size - 8; i += 8) { - a_8 = _mm256_load_ps(a); - result_8 = _mm256_div_ps(a_8, b_8); - result[j] = result_8; - - a += 8; - ++j; - } - - int32 diff = size - i; - alignas(32) f32 temp[8]; - - for (int32 k = 0; k < diff; k++) { - temp[k] = a[i + k] / b; - } - - result[j] = _mm256_load_ps(temp); -} - -inline -void 
simd_cmp_le(const __m256* a, f32 b, bool* result, int32 size) -{ - __m256 b_8 = _mm256_set1_ps(b); - - for (int32 i = 0; i < size; ++i) { - int32 mask = _mm256_movemask_ps(_mm256_cmp_ps(a[i], b_8, _CMP_LE_OQ)); - - for (int32 j = 0; j < 8; ++j) { - result[i * 8 + j] = (mask & (1 << j)) != 0; - } - } -} - -// @todo But a guard or warning on the trigonometric functions since they are only implemented for msvc/intel compiler -inline -f32_4 simd_sin(f32_4 a) -{ - f32_4 simd; - simd.s = _mm_sin_ps(a.s); - - return simd; -} - -inline -f32_8 simd_sin(f32_8 a) -{ - f32_8 simd; - simd.s = _mm256_sin_ps(a.s); - - return simd; -} - -inline -f32_16 simd_sin(f32_16 a) -{ - f32_16 simd; - simd.s = _mm512_sin_ps(a.s); - - return simd; -} - -inline -f32_4 simd_cos(f32_4 a) -{ - f32_4 simd; - simd.s = _mm_cos_ps(a.s); - - return simd; -} - -inline -f32_8 simd_cos(f32_8 a) -{ - f32_8 simd; - simd.s = _mm256_cos_ps(a.s); - - return simd; -} - -inline -f32_16 simd_cos(f32_16 a) -{ - f32_16 simd; - simd.s = _mm512_cos_ps(a.s); - - return simd; -} - -inline -f32_4 simd_asin(f32_4 a) -{ - f32_4 simd; - simd.s = _mm_asin_ps(a.s); - - return simd; -} - -inline -f32_8 simd_asin(f32_8 a) -{ - f32_8 simd; - simd.s = _mm256_asin_ps(a.s); - - return simd; -} - -inline -f32_16 simd_asin(f32_16 a) -{ - f32_16 simd; - simd.s = _mm512_asin_ps(a.s); - - return simd; -} - -inline -f32_4 simd_acos(f32_4 a) -{ - f32_4 simd; - simd.s = _mm_acos_ps(a.s); - - return simd; -} - -inline -f32_8 simd_acos(f32_8 a) -{ - f32_8 simd; - simd.s = _mm256_acos_ps(a.s); - - return simd; -} - -inline -f32_16 simd_acos(f32_16 a) -{ - f32_16 simd; - simd.s = _mm512_acos_ps(a.s); - - return simd; -} - -// @todo implement more trigonometry function - #endif diff --git a/architecture/x86/simd/SIMD_F32_AVX2.h b/architecture/x86/simd/SIMD_F32_AVX2.h new file mode 100644 index 0000000..896dedf --- /dev/null +++ b/architecture/x86/simd/SIMD_F32_AVX2.h @@ -0,0 +1,426 @@ +/** + * Jingga + * + * @copyright Jingga + * @license OMS 
License 2.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef COMS_STDLIB_SIMD_F32_H +#define COMS_STDLIB_SIMD_F32_H + +#include +#include + +#include "../../../stdlib/Types.h" +#include "SIMD_SVML_AVX2.h" + +struct f32_8 { + union { + #if ARM + svfloat32_t s; + #else + __m256 s; + #endif + + f32 v[8]; + }; +}; + +inline f32_8 load_f32_8(const f32* mem) +{ + f32_8 simd; + simd.s = _mm256_load_ps(mem); + + return simd; +} + +inline f32_8 init_f32_8(const f32* mem) +{ + f32_8 simd; + simd.s = _mm256_set_ps(mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7]); + + return simd; +} + +inline void unload_f32_8(f32_8 a, f32 *array) { _mm256_store_ps(array, a.s); } + +inline f32_8 init_zero_f32_8() +{ + f32_8 simd; + simd.s = _mm256_setzero_ps(); + + return simd; +} + +inline f32_8 init_value_f32_8(f32 value) +{ + f32_8 simd; + simd.s = _mm256_set1_ps(value); + + return simd; +} + +inline f32_8 init_values_f32_8( + f32 a, f32 b, f32 c, f32 d, + f32 e, f32 f, f32 g, f32 h +) +{ + f32_8 simd; + simd.s = _mm256_set_ps(a, b, c, d, e, f, g, h); + + return simd; +} + +inline f32_8 operator+(f32_8 a, f32_8 b) +{ + f32_8 simd; + simd.s = _mm256_add_ps(a.s, b.s); + + return simd; +} + +inline f32_8 operator-(f32_8 a, f32_8 b) +{ + f32_8 simd; + simd.s = _mm256_sub_ps(a.s, b.s); + + return simd; +} + +inline f32_8 operator-(f32_8 a) { return init_zero_f32_8() - a; } + +inline f32_8 operator*(f32_8 a, f32_8 b) +{ + f32_8 simd; + simd.s = _mm256_mul_ps(a.s, b.s); + + return simd; +} + +inline f32_8 operator/(f32_8 a, f32_8 b) +{ + f32_8 simd; + simd.s = _mm256_div_ps(a.s, b.s); + + return simd; +} + +inline f32_8 operator^(f32_8 a, f32_8 b) +{ + f32_8 simd; + simd.s = _mm256_xor_ps(a.s, b.s); + + return simd; +} + +inline f32_8 &operator-=(f32_8 &a, f32_8 b) +{ + a = a - b; + + return a; +} + +inline f32_8 &operator+=(f32_8 &a, f32_8 b) +{ + a = a + b; + + return a; +} + +inline f32_8 &operator*=(f32_8 &a, f32_8 b) +{ + a = a * b; + + return a; +} + +inline 
f32_8 &operator/=(f32_8 &a, f32_8 b) +{ + a = a / b; + + return a; +} + +inline f32_8 &operator^=(f32_8 &a, f32_8 b) +{ + a = a ^ b; + + return a; +} + +inline f32_8 operator<(f32_8 a, f32_8 b) +{ + f32_8 simd; + simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_LT_OQ); + + return simd; +} + +inline f32_8 operator<=(f32_8 a, f32_8 b) +{ + f32_8 simd; + simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_LE_OQ); + + return simd; +} + +inline f32_8 operator>(f32_8 a, f32_8 b) +{ + f32_8 simd; + simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_GT_OQ); + + return simd; +} + +inline f32_8 operator>=(f32_8 a, f32_8 b) +{ + f32_8 simd; + simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_GE_OQ); + + return simd; +} + +inline f32_8 operator==(f32_8 a, f32_8 b) +{ + f32_8 simd; + simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_EQ_OQ); + + return simd; +} + +inline f32_8 operator!=(f32_8 a, f32_8 b) +{ + f32_8 simd; + simd.s = _mm256_cmp_ps(a.s, b.s, _CMP_NEQ_OQ); + + return simd; +} + +inline f32_8 operator&(f32_8 a, f32_8 b) +{ + f32_8 simd; + simd.s = _mm256_and_ps(a.s, b.s); + + return simd; +} + +inline f32_8 operator|(f32_8 a, f32_8 b) +{ + f32_8 simd; + simd.s = _mm256_or_ps(a.s, b.s); + + return simd; +} + +inline f32_8 &operator&=(f32_8 &a, f32_8 b) +{ + a = a & b; + + return a; +} + +inline f32_8 &operator|=(f32_8 &a, f32_8 b) +{ + a = a | b; + + return a; +} + +inline f32_8 abs(f32_8 a) +{ + uint32 unsigned_mask = (uint32) (1U << 31); + __m256 mask = _mm256_set1_ps(*(f32 *) &unsigned_mask); + + f32_8 simd; + simd.s = _mm256_and_ps(a.s, mask); + + return simd; +} + +inline f32_8 simd_min(f32_8 a, f32_8 b) +{ + f32_8 simd; + simd.s = _mm256_min_ps(a.s, b.s); + + return simd; +} + +inline f32_8 simd_max(f32_8 a, f32_8 b) +{ + f32_8 simd; + simd.s = _mm256_max_ps(a.s, b.s); + + return simd; +} + +inline f32_8 sign(f32_8 a) +{ + uint32 umask = (uint32) (1U << 31); + __m256 mask = _mm256_set1_ps(*(f32 *) &umask); + + f32_8 signBit; + signBit.s = _mm256_and_ps(a.s, mask); + + f32_8 b; + b.s = _mm256_set1_ps(1.0f); + + f32_8 simd 
= b | signBit; + + return simd; +} + +inline f32_8 floor(f32_8 a) +{ + f32_8 simd; + simd.s = _mm256_floor_ps(a.s); + + return simd; +} + +inline f32_8 ceil(f32_8 a) +{ + f32_8 simd; + simd.s = _mm256_ceil_ps(a.s); + + return simd; +} + +inline f32_8 sqrt(f32_8 a) +{ + f32_8 simd; + simd.s = _mm256_sqrt_ps(a.s); + + return simd; +} + +inline f32_8 sqrt_inv_approx(f32_8 a) +{ + f32_8 simd; + simd.s = _mm256_rsqrt_ps(a.s); + + return simd; +} + +inline f32_8 one_over_approx(f32_8 a) +{ + f32_8 simd; + simd.s = _mm256_rcp_ps(a.s); + + return simd; +} + +inline f32_8 clamp(f32_8 min_value, f32_8 a, f32_8 max_value) +{ + return simd_min(simd_max(a, min_value), max_value); +} + +inline int32 which_true(f32_8 a) +{ + int32 which_true = _mm256_movemask_ps(a.s); + + return which_true; +} + +inline bool any_true(f32_8 a) +{ + bool is_any_true = _mm256_movemask_ps(a.s) > 0; + + return is_any_true; +} + +inline bool all_true(f32_8 a) +{ + bool is_true = _mm256_movemask_ps(a.s) == 255; + + return is_true; +} + +inline bool all_false(f32_8 a) +{ + bool is_false = _mm256_movemask_ps(a.s) == 0; + + return is_false; +} + +inline +void simd_cmp_le(const __m256* a, f32 b, bool* result, int32 size) +{ + __m256 b_8 = _mm256_set1_ps(b); + + for (int32 i = 0; i < size; ++i) { + int32 mask = _mm256_movemask_ps(_mm256_cmp_ps(a[i], b_8, _CMP_LE_OQ)); + + for (int32 j = 0; j < 8; ++j) { + result[i * 8 + j] = (mask & (1 << j)) != 0; + } + } +} + +inline +f32_8 simd_sin(f32_8 a) +{ + f32_8 simd; + simd.s = _mm256_sin_ps(a.s); + + return simd; +} + +inline +f32_8 simd_cos(f32_8 a) +{ + f32_8 simd; + simd.s = _mm256_cos_ps(a.s); + + return simd; +} + +inline +f32_8 simd_asin(f32_8 a) +{ + f32_8 simd; + simd.s = _mm256_asin_ps(a.s); + + return simd; +} + +inline +f32_8 simd_acos(f32_8 a) +{ + f32_8 simd; + simd.s = _mm256_acos_ps(a.s); + + return simd; +} + +inline +void simd_div(const f32* a, f32 b, __m256* result, int32 size) +{ + int32 i = 0; + int32 j = 0; + + // @todo this his how all the 
functions should be implemented that take in baseic types and output basic types + __m256 a_8; + __m256 b_8 = _mm256_set1_ps(b); + __m256 result_8; + + for (; i <= size - 8; i += 8) { + a_8 = _mm256_load_ps(a); + result_8 = _mm256_div_ps(a_8, b_8); + result[j] = result_8; + + a += 8; + ++j; + } + + int32 diff = size - i; + alignas(32) f32 temp[8]; + + for (int32 k = 0; k < diff; k++) { + temp[k] = a[i + k] / b; + } + + result[j] = _mm256_load_ps(temp); +} + +#endif diff --git a/architecture/x86/simd/SIMD_F32_AVX512.h b/architecture/x86/simd/SIMD_F32_AVX512.h new file mode 100644 index 0000000..ffa69fc --- /dev/null +++ b/architecture/x86/simd/SIMD_F32_AVX512.h @@ -0,0 +1,385 @@ +/** + * Jingga + * + * @copyright Jingga + * @license OMS License 2.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef COMS_STDLIB_SIMD_F32_H +#define COMS_STDLIB_SIMD_F32_H + +#include +#include + +#include "../../../stdlib/Types.h" +#include "SIMD_SVML_AVX512.h" + +struct f32_16 { + union { + #if ARM + svfloat32_t s; + #else + __m512 s; + #endif + + f32 v[16]; + }; +}; + +inline f32_16 load_f32_16(const f32* mem) +{ + f32_16 simd; + simd.s = _mm512_load_ps(mem); + + return simd; +} + +inline f32_16 init_f32_16(const f32* mem) +{ + f32_16 simd; + simd.s = _mm512_set_ps(mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7], mem[8], mem[9], mem[10], + mem[11], mem[12], mem[13], mem[14], mem[15]); + + return simd; +} + +inline void unload_f32_16(f32_16 a, f32 *array) { _mm512_store_ps(array, a.s); } + +inline f32_16 init_zero_f32_16() +{ + f32_16 simd; + simd.s = _mm512_setzero_ps(); + + return simd; +} + +inline f32_16 init_value_f32_16(f32 value) +{ + f32_16 simd; + simd.s = _mm512_set1_ps(value); + + return simd; +} + +inline f32_16 init_values_f32_16( + f32 a, f32 b, f32 c, f32 d, + f32 e, f32 f, f32 g, f32 h, + f32 i, f32 j, f32 k, f32 l, + f32 m, f32 n, f32 o, f32 p +) +{ + f32_16 simd; + simd.s = _mm512_set_ps(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p); + + 
return simd; +} + +inline f32_16 operator+(f32_16 a, f32_16 b) +{ + f32_16 simd; + simd.s = _mm512_add_ps(a.s, b.s); + + return simd; +} + +inline f32_16 operator-(f32_16 a, f32_16 b) +{ + f32_16 simd; + simd.s = _mm512_sub_ps(a.s, b.s); + + return simd; +} + +inline f32_16 operator-(f32_16 a) { return init_zero_f32_16() - a; } + +inline f32_16 operator*(f32_16 a, f32_16 b) +{ + f32_16 simd; + simd.s = _mm512_mul_ps(a.s, b.s); + + return simd; +} + +inline f32_16 operator/(f32_16 a, f32_16 b) +{ + f32_16 simd; + simd.s = _mm512_div_ps(a.s, b.s); + + return simd; +} + +inline f32_16 operator^(f32_16 a, f32_16 b) +{ + f32_16 simd; + simd.s = _mm512_xor_ps(a.s, b.s); + + return simd; +} + +inline f32_16 &operator-=(f32_16 &a, f32_16 b) +{ + a = a - b; + + return a; +} + +inline f32_16 &operator+=(f32_16 &a, f32_16 b) +{ + a = a + b; + + return a; +} + +inline f32_16 &operator*=(f32_16 &a, f32_16 b) +{ + a = a * b; + + return a; +} + +inline f32_16 &operator/=(f32_16 &a, f32_16 b) +{ + a = a / b; + + return a; +} + +inline f32_16 &operator^=(f32_16 &a, f32_16 b) +{ + a = a ^ b; + + return a; +} + +inline f32_16 operator<(f32_16 a, f32_16 b) +{ + f32_16 simd; + simd.s = _mm512_mask_blend_ps(_mm512_cmplt_ps_mask(a.s, b.s), a.s, b.s); + + return simd; +} + +inline f32_16 operator<=(f32_16 a, f32_16 b) +{ + f32_16 simd; + simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_LE_OQ), a.s, b.s); + + return simd; +} + +inline f32_16 operator>(f32_16 a, f32_16 b) +{ + f32_16 simd; + simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_GT_OQ), a.s, b.s); + + return simd; +} + +inline f32_16 operator>=(f32_16 a, f32_16 b) +{ + f32_16 simd; + simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_GE_OQ), a.s, b.s); + + return simd; +} + +inline f32_16 operator==(f32_16 a, f32_16 b) +{ + f32_16 simd; + simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_EQ_OQ), a.s, b.s); + + return simd; +} + +inline f32_16 operator!=(f32_16 a, f32_16 b) 
+{ + f32_16 simd; + simd.s = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(a.s, b.s, _CMP_NEQ_OQ), a.s, b.s); + + return simd; +} + +inline f32_16 operator&(f32_16 a, f32_16 b) +{ + f32_16 simd; + simd.s = _mm512_and_ps(a.s, b.s); + + return simd; +} + +inline f32_16 operator|(f32_16 a, f32_16 b) +{ + f32_16 simd; + simd.s = _mm512_or_ps(a.s, b.s); + + return simd; +} + +inline f32_16 &operator&=(f32_16 &a, f32_16 b) +{ + a = a & b; + + return a; +} + +inline f32_16 &operator|=(f32_16 &a, f32_16 b) +{ + a = a | b; + + return a; +} + +inline f32_16 abs(f32_16 a) +{ + f32_16 simd; + simd.s = _mm512_abs_ps(a.s); + + return simd; +} + +inline f32_16 simd_min(f32_16 a, f32_16 b) +{ + f32_16 simd; + simd.s = _mm512_min_ps(a.s, b.s); + + return simd; +} + +inline f32_16 simd_max(f32_16 a, f32_16 b) +{ + f32_16 simd; + simd.s = _mm512_max_ps(a.s, b.s); + + return simd; +} + +inline f32_16 sign(f32_16 a) +{ + uint32 umask = (uint32) (1U << 31); + __m512 mask = _mm512_set1_ps(*(f32 *) &umask); + + f32_16 signBit; + signBit.s = _mm512_and_ps(a.s, mask); + + f32_16 b; + b.s = _mm512_set1_ps(1.0f); + + f32_16 simd = b | signBit; + + return simd; +} + +inline f32_16 floor(f32_16 a) +{ + f32_16 simd; + simd.s = _mm512_floor_ps(a.s); + + return simd; +} + +inline f32_16 ceil(f32_16 a) +{ + f32_16 simd; + simd.s = _mm512_ceil_ps(a.s); + + return simd; +} + +inline f32_16 sqrt(f32_16 a) +{ + f32_16 simd; + simd.s = _mm512_sqrt_ps(a.s); + + return simd; +} + +inline f32_16 sqrt_inv_approx(f32_16 a) +{ + f32_16 simd; + simd.s = _mm512_rsqrt14_ps(a.s); + + return simd; +} + +inline f32_16 one_over_approx(f32_16 a) +{ + f32_16 simd; + simd.s = _mm512_rcp14_ps(a.s); + + return simd; +} + +inline f32_16 clamp(f32_16 min_value, f32_16 a, f32_16 max_value) +{ + return simd_min(simd_max(a, min_value), max_value); +} + +inline int32 which_true(f32_16 a) +{ + int32 which_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)); + + return which_true; +} + +inline bool any_true(f32_16 a) +{ + bool 
is_any_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) > 0; + + return is_any_true; +} + +inline bool all_true(f32_16 a) +{ + bool is_true = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 65535; + + return is_true; +} + +inline bool all_false(f32_16 a) +{ + // @todo This can be optimized (requires also changes in the comparison functions return) + bool is_false = _mm512_movepi32_mask(_mm512_castps_si512(a.s)) == 0; + + return is_false; +} + +inline +f32_16 simd_sin(f32_16 a) +{ + f32_16 simd; + simd.s = _mm512_sin_ps(a.s); + + return simd; +} + +inline +f32_16 simd_cos(f32_16 a) +{ + f32_16 simd; + simd.s = _mm512_cos_ps(a.s); + + return simd; +} + +inline +f32_16 simd_asin(f32_16 a) +{ + f32_16 simd; + simd.s = _mm512_asin_ps(a.s); + + return simd; +} + +inline +f32_16 simd_acos(f32_16 a) +{ + f32_16 simd; + simd.s = _mm512_acos_ps(a.s); + + return simd; +} + +// @todo implement more trigonometry function + +#endif diff --git a/architecture/x86/simd/SIMD_F32_SSE.h b/architecture/x86/simd/SIMD_F32_SSE.h new file mode 100644 index 0000000..f1dfa79 --- /dev/null +++ b/architecture/x86/simd/SIMD_F32_SSE.h @@ -0,0 +1,381 @@ +/** + * Jingga + * + * @copyright Jingga + * @license OMS License 2.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef COMS_STDLIB_SIMD_F32_SSE_H +#define COMS_STDLIB_SIMD_F32_SSE_H + +#include +#include + +#include "../../../stdlib/Types.h" +#include "SIMD_SVML_SSE.h" + +struct f32_4 { + union { + #if ARM + svfloat32_t s; + #else + __m128 s; + #endif + + f32 v[4]; + }; +}; + +inline f32_4 load_f32_4(const f32* mem) +{ + f32_4 simd; + simd.s = _mm_load_ps(mem); + + return simd; +} + +inline f32_4 init_f32_4(const f32* mem) +{ + f32_4 simd; + simd.s = _mm_set_ps(mem[0], mem[1], mem[2], mem[3]); + + return simd; +} + +inline void unload_f32_4(f32_4 a, f32 *array) { _mm_store_ps(array, a.s); } + +inline f32_4 init_zero_f32_4() +{ + f32_4 simd; + simd.s = _mm_setzero_ps(); + + return simd; +} + +inline f32_4 init_value_f32_4(f32 
value) +{ + f32_4 simd; + simd.s = _mm_set1_ps(value); + + return simd; +} + +inline f32_4 init_values_f32_4(f32 a, f32 b, f32 c, f32 d) +{ + f32_4 simd; + simd.s = _mm_set_ps(a, b, c, d); + + return simd; +} + +inline f32_4 operator+(f32_4 a, f32_4 b) +{ + f32_4 simd; + simd.s = _mm_add_ps(a.s, b.s); + + return simd; +} + +inline f32_4 operator-(f32_4 a, f32_4 b) +{ + f32_4 simd; + simd.s = _mm_sub_ps(a.s, b.s); + + return simd; +} + +inline f32_4 operator-(f32_4 a) { return init_zero_f32_4() - a; } + +inline f32_4 operator*(f32_4 a, f32_4 b) +{ + f32_4 simd; + simd.s = _mm_mul_ps(a.s, b.s); + + return simd; +} + +inline f32_4 operator/(f32_4 a, f32_4 b) +{ + f32_4 simd; + simd.s = _mm_div_ps(a.s, b.s); + + return simd; +} + +inline f32_4 operator^(f32_4 a, f32_4 b) +{ + f32_4 simd; + simd.s = _mm_xor_ps(a.s, b.s); + + return simd; +} + +inline f32_4 &operator-=(f32_4 &a, f32_4 b) +{ + a = a - b; + + return a; +} + +inline f32_4 &operator+=(f32_4 &a, f32_4 b) +{ + a = a + b; + + return a; +} + +inline f32_4 &operator*=(f32_4 &a, f32_4 b) +{ + a = a * b; + + return a; +} + +inline f32_4 &operator/=(f32_4 &a, f32_4 b) +{ + a = a / b; + + return a; +} + +inline f32_4 &operator^=(f32_4 &a, f32_4 b) +{ + a = a ^ b; + + return a; +} + +inline f32_4 operator<(f32_4 a, f32_4 b) +{ + f32_4 simd; + simd.s = _mm_cmplt_ps(a.s, b.s); + + return simd; +} + +inline f32_4 operator<=(f32_4 a, f32_4 b) +{ + f32_4 simd; + simd.s = _mm_cmple_ps(a.s, b.s); + + return simd; +} + +inline f32_4 operator>(f32_4 a, f32_4 b) +{ + f32_4 simd; + simd.s = _mm_cmpgt_ps(a.s, b.s); + + return simd; +} + +inline f32_4 operator>=(f32_4 a, f32_4 b) +{ + f32_4 simd; + simd.s = _mm_cmpge_ps(a.s, b.s); + + return simd; +} + +inline f32_4 operator==(f32_4 a, f32_4 b) +{ + f32_4 simd; + simd.s = _mm_cmpeq_ps(a.s, b.s); + + return simd; +} + +inline f32_4 operator!=(f32_4 a, f32_4 b) +{ + f32_4 simd; + simd.s = _mm_cmpneq_ps(a.s, b.s); + + return simd; +} + +inline f32_4 operator&(f32_4 a, f32_4 b) +{ + 
f32_4 simd; + simd.s = _mm_and_ps(a.s, b.s); + + return simd; +} + +inline f32_4 operator|(f32_4 a, f32_4 b) +{ + f32_4 simd; + simd.s = _mm_or_ps(a.s, b.s); + + return simd; +} + +inline f32_4 &operator&=(f32_4 &a, f32_4 b) +{ + a = a & b; + + return a; +} + +inline f32_4 &operator|=(f32_4 &a, f32_4 b) +{ + a = a | b; + + return a; +} + +inline f32_4 abs(f32_4 a) +{ + uint32 unsigned_mask = (uint32) (1U << 31); + __m128 mask = _mm_set1_ps(*(f32 *) &unsigned_mask); + + f32_4 simd; + simd.s = _mm_and_ps(a.s, mask); + + return simd; +} + +inline f32_4 simd_min(f32_4 a, f32_4 b) +{ + f32_4 simd; + simd.s = _mm_min_ps(a.s, b.s); + + return simd; +} + +inline f32_4 simd_max(f32_4 a, f32_4 b) +{ + f32_4 simd; + simd.s = _mm_max_ps(a.s, b.s); + + return simd; +} + +inline f32_4 sign(f32_4 a) +{ + uint32 umask = (uint32) (1U << 31); + __m128 mask = _mm_set1_ps(*(f32 *) &umask); + + f32_4 signBit; + signBit.s = _mm_and_ps(a.s, mask); + + f32_4 b; + b.s = _mm_set1_ps(1.0f); + + f32_4 simd = b | signBit; + + return simd; +} + +inline f32_4 floor(f32_4 a) +{ + f32_4 simd; + simd.s = _mm_floor_ps(a.s); + + return simd; +} + +inline f32_4 ceil(f32_4 a) +{ + f32_4 simd; + simd.s = _mm_ceil_ps(a.s); + + return simd; +} + +inline f32_4 sqrt(f32_4 a) +{ + f32_4 simd; + simd.s = _mm_sqrt_ps(a.s); + + return simd; +} + +inline f32_4 sqrt_inv_approx(f32_4 a) +{ + f32_4 simd; + simd.s = _mm_rsqrt_ps(a.s); + + return simd; +} + +inline f32_4 one_over_approx(f32_4 a) +{ + f32_4 simd; + simd.s = _mm_rcp_ps(a.s); + + return simd; +} + +inline f32_4 clamp(f32_4 min_value, f32_4 a, f32_4 max_value) +{ + return simd_min(simd_max(a, min_value), max_value); +} + +inline int32 which_true(f32_4 a) +{ + int32 which_true = _mm_movemask_ps(a.s); + + return which_true; +} + +inline bool any_true(f32_4 a) +{ + bool is_any_true = _mm_movemask_ps(a.s) > 0; + + return is_any_true; +} + +inline bool all_true(f32_4 a) +{ + bool is_true = _mm_movemask_ps(a.s) == 15; + + return is_true; +} + +inline bool 
all_false(f32_4 a) +{ + bool is_false = _mm_movemask_ps(a.s) == 0; + + return is_false; +} + +inline +f32_4 simd_sin(f32_4 a) +{ + f32_4 simd; + simd.s = _mm_sin_ps(a.s); + + return simd; +} + +inline +f32_4 simd_cos(f32_4 a) +{ + f32_4 simd; + simd.s = _mm_cos_ps(a.s); + + return simd; +} + +inline +f32_4 simd_asin(f32_4 a) +{ + f32_4 simd; + simd.s = _mm_asin_ps(a.s); + + return simd; +} + +inline +f32_4 simd_acos(f32_4 a) +{ + f32_4 simd; + simd.s = _mm_acos_ps(a.s); + + return simd; +} + +// @todo implement more trigonometry function + +#endif diff --git a/architecture/x86/simd/SIMD_F64.h b/architecture/x86/simd/SIMD_F64.h old mode 100644 new mode 100755 index 9166cb2..57101fc --- a/architecture/x86/simd/SIMD_F64.h +++ b/architecture/x86/simd/SIMD_F64.h @@ -14,40 +14,16 @@ #include "../../../stdlib/Types.h" -struct f64_2 { - union { - #if ARM - svfloat64_t s; - #else - __m128 s; - #endif +#ifdef MACRO_CPU_FEATURE_SSE42 + #include "SIMD_F64_SSE.h" +#endif - f64 v[2]; - }; -}; +#ifdef MACRO_CPU_FEATURE_AVX2 + #include "SIMD_F64_AVX2.h" +#endif -struct f64_4 { - union { - #if ARM - svfloat64_t s; - #else - __m256 s; - #endif - - f64 v[4]; - }; -}; - -struct f64_8 { - union { - #if ARM - svfloat64_t s; - #else - __m512 s; - #endif - - f64 v[8]; - }; -}; +#ifdef MACRO_CPU_FEATURE_AVX512 + #include "SIMD_F64_AVX512.h" +#endif #endif \ No newline at end of file diff --git a/architecture/x86/simd/SIMD_F64_AVX2.h b/architecture/x86/simd/SIMD_F64_AVX2.h new file mode 100644 index 0000000..278c508 --- /dev/null +++ b/architecture/x86/simd/SIMD_F64_AVX2.h @@ -0,0 +1,30 @@ +/** + * Jingga + * + * @copyright Jingga + * @license OMS License 2.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef COMS_STDLIB_SIMD_F64_AVX2_H +#define COMS_STDLIB_SIMD_F64_AVX2_H + +#include +#include + +#include "../../../stdlib/Types.h" + +struct f64_4 { + union { + #if ARM + svfloat64_t s; + #else + __m256 s; + #endif + + f64 v[4]; + }; +}; + + +#endif \ No newline at end of file 
diff --git a/architecture/x86/simd/SIMD_F64_AVX512.h b/architecture/x86/simd/SIMD_F64_AVX512.h new file mode 100644 index 0000000..d3aa225 --- /dev/null +++ b/architecture/x86/simd/SIMD_F64_AVX512.h @@ -0,0 +1,29 @@ +/** + * Jingga + * + * @copyright Jingga + * @license OMS License 2.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef COMS_STDLIB_SIMD_F64_AVX512_H +#define COMS_STDLIB_SIMD_F64_AVX512_H + +#include +#include + +#include "../../../stdlib/Types.h" + +struct f64_8 { + union { + #if ARM + svfloat64_t s; + #else + __m512 s; + #endif + + f64 v[8]; + }; +}; + +#endif \ No newline at end of file diff --git a/architecture/x86/simd/SIMD_F64_SSE.h b/architecture/x86/simd/SIMD_F64_SSE.h new file mode 100644 index 0000000..368b93c --- /dev/null +++ b/architecture/x86/simd/SIMD_F64_SSE.h @@ -0,0 +1,29 @@ +/** + * Jingga + * + * @copyright Jingga + * @license OMS License 2.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef COMS_STDLIB_SIMD_F64_SSE_H +#define COMS_STDLIB_SIMD_F64_SSE_H + +#include +#include + +#include "../../../stdlib/Types.h" + +struct f64_2 { + union { + #if ARM + svfloat64_t s; + #else + __m128 s; + #endif + + f64 v[2]; + }; +}; + +#endif \ No newline at end of file diff --git a/architecture/x86/simd/SIMD_I16.h b/architecture/x86/simd/SIMD_I16.h old mode 100644 new mode 100755 index 190b7f8..09ff02e --- a/architecture/x86/simd/SIMD_I16.h +++ b/architecture/x86/simd/SIMD_I16.h @@ -14,742 +14,17 @@ #include "../../../stdlib/Types.h" -struct int16_8 { - union { - #if ARM - svint16_t s; - #else - __m128i s; - #endif +#ifdef MACRO_CPU_FEATURE_SSE42 + #include "SIMD_I16_SSE.h" +#endif - int16 v[8]; - }; -}; +#ifdef MACRO_CPU_FEATURE_AVX2 + #include "SIMD_I16_AVX2.h" +#endif -struct int16_16 { - union { - #if ARM - svint16_t s; - #else - __m256i s; - #endif - - int16 v[16]; - }; -}; - -struct int16_32 { - union { - #if ARM - svint16_t s; - #else - __m512i s; - #endif - - int16 v[32]; - }; -}; - - -inline int16_8 
load_int16_8(const int16* mem) -{ - int16_8 simd; - simd.s = _mm_load_si128((__m128i *) mem); - - return simd; -} - -inline int16_8 init_int16_8(const int16* mem) -{ - int16_8 simd; - simd.s = _mm_set_epi16( - mem[0], mem[1], mem[2], mem[3], - mem[4], mem[5], mem[6], mem[7] - ); - - return simd; -} - -inline void unload_int16_8(int16_8 a, int16 *array) { _mm_store_si128((__m128i *) array, a.s); } - -inline int16_16 load_int16_16(const int16* mem) -{ - int16_16 simd; - simd.s = _mm256_load_si256((__m256i *) mem); - - return simd; -} - -inline int16_16 init_int16_16(const int16* mem) -{ - int16_16 simd; - simd.s = _mm256_set_epi16( - mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7], - mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15] - ); - - return simd; -} - -inline void unload_int16_16(int16_16 a, int16 *array) { _mm256_store_si256((__m256i *) array, a.s); } - -inline int16_32 load_int16_32(const int16* mem) -{ - int16_32 simd; - simd.s = _mm512_load_si512((__m512i *) mem); - - return simd; -} - -inline int16_32 init_int16_32(const int16* mem) -{ - int16_32 simd; - simd.s = _mm512_set_epi16( - mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7], - mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15], - mem[16], mem[17], mem[18], mem[19], mem[20], mem[21], mem[22], mem[23], - mem[24], mem[25], mem[26], mem[27], mem[28], mem[29], mem[30], mem[31] - ); - - return simd; -} - -inline void unload_int16_32(int16_32 a, int16 *array) { _mm512_storeu_epi16(array, a.s); } - -inline int16_8 init_zero_int16_8() -{ - int16_8 simd; - simd.s = _mm_setzero_si128(); - - return simd; -} - -inline int16_16 init_zero_int16_16() -{ - int16_16 simd; - simd.s = _mm256_setzero_si256(); - - return simd; -} - -inline int16_32 init_zero_int16_32() -{ - int16_32 simd; - simd.s = _mm512_setzero_si512(); - - return simd; -} - -inline int16_8 init_value_int16_8(int16 value) -{ - int16_8 simd; - simd.s = _mm_set1_epi16(value); - - return 
simd; -} - -inline int16_16 init_value_int16_16(int16 value) -{ - int16_16 simd; - simd.s = _mm256_set1_epi16(value); - - return simd; -} - -inline int16_32 init_value_int16_32(int16 value) -{ - int16_32 simd; - simd.s = _mm512_set1_epi16(value); - - return simd; -} - -inline int16_8 operator+(int16_8 a, int16_8 b) -{ - int16_8 simd; - simd.s = _mm_add_epi16(a.s, b.s); - - return simd; -} - -inline int16_16 operator+(int16_16 a, int16_16 b) -{ - int16_16 simd; - simd.s = _mm256_add_epi16(a.s, b.s); - - return simd; -} - -inline int16_32 operator+(int16_32 a, int16_32 b) -{ - int16_32 simd; - simd.s = _mm512_add_epi16(a.s, b.s); - - return simd; -} - -inline int16_8 operator-(int16_8 a, int16_8 b) -{ - int16_8 simd; - simd.s = _mm_sub_epi16(a.s, b.s); - - return simd; -} - -inline int16_8 operator-(int16_8 a) { return init_zero_int16_8() - a; } - -inline int16_16 operator-(int16_16 a, int16_16 b) -{ - int16_16 simd; - simd.s = _mm256_sub_epi16(a.s, b.s); - - return simd; -} - -inline int16_16 operator-(int16_16 a) { return init_zero_int16_16() - a; } - -inline int16_32 operator-(int16_32 a, int16_32 b) -{ - int16_32 simd; - simd.s = _mm512_sub_epi16(a.s, b.s); - - return simd; -} - -inline int16_32 operator-(int16_32 a) { return init_zero_int16_32() - a; } - -inline int16_8 operator*(int16_8 a, int16_8 b) -{ - int16_8 simd; - simd.s = _mm_mul_epi32(a.s, b.s); - - return simd; -} - -inline int16_16 operator*(int16_16 a, int16_16 b) -{ - int16_16 simd; - simd.s = _mm256_mul_epi32(a.s, b.s); - - return simd; -} - -inline int16_32 operator*(int16_32 a, int16_32 b) -{ - int16_32 simd; - simd.s = _mm512_mul_epi32(a.s, b.s); - - return simd; -} - -inline int16_8 operator^(int16_8 a, int16_8 b) -{ - int16_8 simd; - simd.s = _mm_xor_si128(a.s, b.s); - - return simd; -} - -inline int16_16 operator^(int16_16 a, int16_16 b) -{ - int16_16 simd; - simd.s = _mm256_xor_si256(a.s, b.s); - - return simd; -} - -inline int16_32 operator^(int16_32 a, int16_32 b) -{ - int16_32 simd; - 
simd.s = _mm512_xor_si512(a.s, b.s); - - return simd; -} - -inline int16_8 &operator-=(int16_8 &a, int16_8 b) -{ - a = a - b; - - return a; -} - -inline int16_16 &operator-=(int16_16 &a, int16_16 b) -{ - a = a - b; - - return a; -} - -inline int16_32 &operator-=(int16_32 &a, int16_32 b) -{ - a = a - b; - - return a; -} - -inline int16_8 &operator+=(int16_8 &a, int16_8 b) -{ - a = a + b; - - return a; -} - -inline int16_16 &operator+=(int16_16 &a, int16_16 b) -{ - a = a + b; - - return a; -} - -inline int16_32 &operator+=(int16_32 &a, int16_32 b) -{ - a = a + b; - - return a; -} - -inline int16_8 &operator*=(int16_8 &a, int16_8 b) -{ - a = a * b; - - return a; -} - -inline int16_16 &operator*=(int16_16 &a, int16_16 b) -{ - a = a * b; - - return a; -} - -inline int16_32 &operator*=(int16_32 &a, int16_32 b) -{ - a = a * b; - - return a; -} - -inline int16_8 &operator^=(int16_8 &a, int16_8 b) -{ - a = a ^ b; - - return a; -} - -inline int16_16 &operator^=(int16_16 &a, int16_16 b) -{ - a = a ^ b; - - return a; -} - -inline int16_32 &operator^=(int16_32 &a, int16_32 b) -{ - a = a ^ b; - - return a; -} - -inline int16_8 operator<(int16_8 a, int16_8 b) -{ - int16_8 simd; - simd.s = _mm_cmplt_epi16(a.s, b.s); - - return simd; -} - -inline int16_16 operator<(int16_16 a, int16_16 b) -{ - int16_16 simd; - simd.s = _mm256_xor_si256(_mm256_cmpgt_epi16(a.s, b.s), _mm256_set1_epi16(-1)); - - return simd; -} - -inline int16_32 operator<(int16_32 a, int16_32 b) -{ - int16_32 simd; - simd.s = _mm512_mask_blend_epi16(_mm512_cmplt_epi16_mask(a.s, b.s), a.s, b.s); - - return simd; -} - -inline int16_8 operator<=(int16_8 a, int16_8 b) -{ - int16_8 simd; - simd.s = _mm_andnot_si128(_mm_cmplt_epi16(b.s, a.s), _mm_set1_epi16(-1)); - - return simd; -} - -inline int16_16 operator<=(int16_16 a, int16_16 b) -{ - int16_16 simd; - simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi16(a.s, b.s), _mm256_set1_epi16(-1)); - - return simd; -} - -inline int16_32 operator<=(int16_32 a, int16_32 b) -{ - 
int16_32 simd; - __mmask32 mask = _mm512_cmp_epi16_mask(a.s, b.s, _MM_CMPINT_LE); - simd.s = _mm512_mask_blend_epi16(mask, b.s, a.s); - - return simd; -} - -inline int16_8 operator>(int16_8 a, int16_8 b) -{ - int16_8 simd; - simd.s = _mm_cmpgt_epi16(a.s, b.s); - - return simd; -} - -inline int16_16 operator>(int16_16 a, int16_16 b) -{ - int16_16 simd; - simd.s = _mm256_cmpgt_epi16(a.s, b.s); - - return simd; -} - -inline int16_32 operator>(int16_32 a, int16_32 b) -{ - int16_32 simd; - simd.s = _mm512_mask_blend_epi16(_mm512_cmpgt_epi16_mask(a.s, b.s), a.s, b.s); - - return simd; -} - -inline int16_8 operator>=(int16_8 a, int16_8 b) -{ - int16_8 simd; - simd.s = _mm_andnot_si128(_mm_cmplt_epi16(a.s, b.s), _mm_set1_epi16(-1)); - - return simd; -} - -inline int16_16 operator>=(int16_16 a, int16_16 b) -{ - int16_16 simd; - simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi16(b.s, a.s), _mm256_set1_epi16(-1)); - - return simd; -} - -inline int16_32 operator>=(int16_32 a, int16_32 b) -{ - int16_32 simd; - simd.s = _mm512_mask_blend_epi16(_mm512_cmpge_epi16_mask(a.s, b.s), a.s, b.s); - - return simd; -} - -inline int16_8 operator==(int16_8 a, int16_8 b) -{ - int16_8 simd; - simd.s = _mm_cmpeq_epi16(a.s, b.s); - - return simd; -} - -inline int16_16 operator==(int16_16 a, int16_16 b) -{ - int16_16 simd; - simd.s = _mm256_cmpeq_epi16(a.s, b.s); - - return simd; -} - -inline int16_32 operator==(int16_32 a, int16_32 b) -{ - int16_32 simd; - simd.s = _mm512_mask_blend_epi16(_mm512_cmpeq_epi16_mask(a.s, b.s), a.s, b.s); - - return simd; -} - -inline int16_8 operator!=(int16_8 a, int16_8 b) -{ - int16_8 simd; - simd.s = _mm_andnot_si128(_mm_cmpeq_epi16(a.s, b.s), _mm_set1_epi16(-1)); - - return simd; -} - -inline int16_16 operator!=(int16_16 a, int16_16 b) -{ - int16_16 simd; - simd.s = _mm256_mask_blend_epi16(_mm256_cmp_epi16_mask(a.s, b.s, _MM_CMPINT_NE), a.s, b.s); - - return simd; -} - -inline int16_32 operator!=(int16_32 a, int16_32 b) -{ - int16_32 simd; - simd.s = 
_mm512_mask_blend_epi16(_mm512_cmp_epi16_mask(a.s, b.s, _MM_CMPINT_NE), a.s, b.s); - - return simd; -} - -inline int16_8 operator&(int16_8 a, int16_8 b) -{ - int16_8 simd; - simd.s = _mm_and_si128(a.s, b.s); - - return simd; -} - -inline int16_16 operator&(int16_16 a, int16_16 b) -{ - int16_16 simd; - simd.s = _mm256_and_si256(a.s, b.s); - - return simd; -} - -inline int16_32 operator&(int16_32 a, int16_32 b) -{ - int16_32 simd; - simd.s = _mm512_and_si512(a.s, b.s); - - return simd; -} - -inline int16_8 operator|(int16_8 a, int16_8 b) -{ - int16_8 simd; - simd.s = _mm_or_si128(a.s, b.s); - - return simd; -} - -inline int16_16 operator|(int16_16 a, int16_16 b) -{ - int16_16 simd; - simd.s = _mm256_or_si256(a.s, b.s); - - return simd; -} - -inline int16_32 operator|(int16_32 a, int16_32 b) -{ - int16_32 simd; - simd.s = _mm512_or_si512(a.s, b.s); - - return simd; -} - -inline int16_8 &operator&=(int16_8 &a, int16_8 b) -{ - a = a & b; - - return a; -} - -inline int16_16 &operator&=(int16_16 &a, int16_16 b) -{ - a = a & b; - - return a; -} - -inline int16_32 &operator&=(int16_32 &a, int16_32 b) -{ - a = a & b; - - return a; -} - -inline int16_8 &operator|=(int16_8 &a, int16_8 b) -{ - a = a | b; - - return a; -} - -inline int16_16 &operator|=(int16_16 &a, int16_16 b) -{ - a = a | b; - - return a; -} - -inline int16_32 &operator|=(int16_32 &a, int16_32 b) -{ - a = a | b; - - return a; -} - -inline int16_8 abs(int16_8 a) -{ - int16_8 simd; - simd.s = _mm_abs_epi16(a.s); - - return simd; -} - -inline int16_16 abs(int16_16 a) -{ - int16_16 simd; - simd.s = _mm256_abs_epi16(a.s); - - return simd; -} - -inline int16_32 abs(int16_32 a) -{ - int16_32 simd; - simd.s = _mm512_abs_epi16(a.s); - - return simd; -} - -inline int16_8 simd_min(int16_8 a, int16_8 b) -{ - int16_8 simd; - simd.s = _mm_min_epi16(a.s, b.s); - - return simd; -} - -inline int16_16 simd_min(int16_16 a, int16_16 b) -{ - int16_16 simd; - simd.s = _mm256_min_epi16(a.s, b.s); - - return simd; -} - -inline 
int16_32 simd_min(int16_32 a, int16_32 b) -{ - int16_32 simd; - simd.s = _mm512_min_epi16(a.s, b.s); - - return simd; -} - -inline int16_8 simd_max(int16_8 a, int16_8 b) -{ - int16_8 simd; - simd.s = _mm_max_epi16(a.s, b.s); - - return simd; -} - -inline int16_16 simd_max(int16_16 a, int16_16 b) -{ - int16_16 simd; - simd.s = _mm256_max_epi16(a.s, b.s); - - return simd; -} - -inline int16_32 simd_max(int16_32 a, int16_32 b) -{ - int16_32 simd; - simd.s = _mm512_max_epi16(a.s, b.s); - - return simd; -} - -inline int16_8 clamp(int16_8 min_value, int16_8 a, int16_8 max_value) -{ - return simd_min(simd_max(a, min_value), max_value); -} - -inline int16_16 clamp(int16_16 min_value, int16_16 a, int16_16 max_value) -{ - return simd_min(simd_max(a, min_value), max_value); -} - -inline int16_32 clamp(int16_32 min_value, int16_32 a, int16_32 max_value) -{ - return simd_min(simd_max(a, min_value), max_value); -} - -inline int32 which_true(int16_8 a) -{ - return _mm_movemask_epi8(a.s); -} - -inline int32 which_true(int16_16 a) -{ - return _mm256_movemask_epi8(a.s); -} - -inline int32 which_true(int16_32 a) -{ - return _mm512_movepi16_mask(a.s); -} - -inline bool any_true(int16_8 a) -{ - bool is_any_true = _mm_movemask_epi8(a.s) > 0; - - return is_any_true; -} - -inline bool any_true(int16_16 a) -{ - bool is_any_true = _mm256_movemask_epi8(a.s) > 0; - - return is_any_true; -} - -inline bool any_true(int16_32 a) -{ - bool is_any_true = _mm512_movepi16_mask(a.s) > 0; - - return is_any_true; -} - -inline bool all_true(int16_8 a) -{ - bool is_true = _mm_movemask_epi8(a.s) == 15; - - return is_true; -} - -inline bool all_true(int16_16 a) -{ - bool is_true = _mm256_movemask_epi8(a.s) == 255; - - return is_true; -} - -inline bool all_true(int16_32 a) -{ - bool is_true = _mm512_movepi16_mask(a.s) == 65535; - - return is_true; -} - -inline bool all_false(int16_8 a) -{ - bool is_false = _mm_movemask_epi8(a.s) == 0; - - return is_false; -} - -inline bool all_false(int16_16 a) -{ - bool 
is_false = _mm256_movemask_epi8(a.s) == 0; - - return is_false; -} - -inline bool all_false(int16_32 a) -{ - // @todo This can be optimized (requires also changes in the comparison functions return) - bool is_false = _mm512_movepi16_mask(a.s) == 0; - - return is_false; -} +#ifdef MACRO_CPU_FEATURE_AVX512 + #include "SIMD_I16_AVX512.h" +#endif // @todo from down here we can optimize some of the code by NOT using the wrappers // the code is self contained and we could use te intrinsic functions directly @@ -758,74 +33,93 @@ inline void simd_mult(const int16* a, f32 b, int16* result, int32 size, int32 steps) { int32 i = 0; + steps = intrin_validate_steps((const byte*) a, steps); + steps = intrin_validate_steps((const byte*) result, steps); - if (steps == 16) { - __m512i a_16; - __m512 af_lo, af_hi; - __m512 b_16 = _mm512_set1_ps(b); - __m512 result_lo, result_hi; - __m512i result_16; + #ifdef MACRO_CPU_FEATURE_AVX512 + if (steps >= 16) { + steps = 16; + __m512i a_16; + __m512 af_lo, af_hi; + __m512 b_16 = _mm512_set1_ps(b); + __m512 result_lo, result_hi; + __m512i result_16; - for (; i <= size - steps; i += steps) { - a_16 = _mm512_loadu_si512((__m512i*) a); + for (; i <= size - steps; i += steps) { + a_16 = _mm512_load_si512((__m512i*) a); - af_lo = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(a_16, 0))); - af_hi = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(a_16, 1))); + af_lo = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(a_16, 0))); + af_hi = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(a_16, 1))); - result_lo = _mm512_mul_ps(af_lo, b_16); - result_hi = _mm512_mul_ps(af_hi, b_16); + result_lo = _mm512_mul_ps(af_lo, b_16); + result_hi = _mm512_mul_ps(af_hi, b_16); - result_16 = _mm512_packs_epi32(_mm512_cvtps_epi32(result_lo), _mm512_cvtps_epi32(result_hi)); - _mm512_storeu_si512((__m512i*) result, result_16); + result_16 = _mm512_packs_epi32(_mm512_cvtps_epi32(result_lo), 
_mm512_cvtps_epi32(result_hi)); + _mm512_store_si512((__m512i*) result, result_16); - a += steps; - result += steps; + a += steps; + result += steps; + } + + steps = 1; } - } else if (steps == 8) { - __m256i a_8; - __m256 af_lo, af_hi; - __m256 b_8 = _mm256_set1_ps(b); - __m256 result_lo, result_hi; - __m256i result_8; + #endif - for (; i <= size - steps; i += steps) { - a_8 = _mm256_loadu_si256((__m256i*) a); + #ifdef MACRO_CPU_FEATURE_AVX2 + if (steps >= 8) { + steps = 8; + __m256i a_8; + __m256 af_lo, af_hi; + __m256 b_8 = _mm256_set1_ps(b); + __m256 result_lo, result_hi; + __m256i result_8; - af_lo = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(a_8, 0))); - af_hi = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(a_8, 1))); + for (; i <= size - steps; i += steps) { + a_8 = _mm256_load_si256((__m256i*) a); - result_lo = _mm256_mul_ps(af_lo, b_8); - result_hi = _mm256_mul_ps(af_hi, b_8); + af_lo = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(a_8, 0))); + af_hi = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(a_8, 1))); - result_8 = _mm256_packs_epi32(_mm256_cvtps_epi32(result_lo), _mm256_cvtps_epi32(result_hi)); - _mm256_storeu_si256((__m256i*) result, result_8); + result_lo = _mm256_mul_ps(af_lo, b_8); + result_hi = _mm256_mul_ps(af_hi, b_8); - a += steps; - result += steps; + result_8 = _mm256_packs_epi32(_mm256_cvtps_epi32(result_lo), _mm256_cvtps_epi32(result_hi)); + _mm256_store_si256((__m256i*) result, result_8); + + a += steps; + result += steps; + } + + steps = 1; } - } else if (steps == 4) { - __m128i a_4; - __m128 af_lo, af_hi; - __m128 b_4 = _mm_set1_ps(b); - __m128 result_lo, result_hi; - __m128i result_4; + #endif - for (; i <= size - steps; i += steps) { - a_4 = _mm_loadu_si128((__m128i*) a); + #ifdef MACRO_CPU_FEATURE_SSE42 + if (steps >= 4) { + steps = 4; + __m128i a_4; + __m128 af_lo, af_hi; + __m128 b_4 = _mm_set1_ps(b); + __m128 result_lo, result_hi; + __m128i 
result_4; - af_lo = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_4)); - af_hi = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128(a_4, 8))); + for (; i <= size - steps; i += steps) { + a_4 = _mm_load_si128((__m128i*) a); - result_lo = _mm_mul_ps(af_lo, b_4); - result_hi = _mm_mul_ps(af_hi, b_4); + af_lo = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_4)); + af_hi = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128(a_4, 8))); - result_4 = _mm_packs_epi32(_mm_cvtps_epi32(result_lo), _mm_cvtps_epi32(result_hi)); - _mm_storeu_si128((__m128i*) result, result_4); + result_lo = _mm_mul_ps(af_lo, b_4); + result_hi = _mm_mul_ps(af_hi, b_4); - a += steps; - result += steps; + result_4 = _mm_packs_epi32(_mm_cvtps_epi32(result_lo), _mm_cvtps_epi32(result_hi)); + _mm_store_si128((__m128i*) result, result_4); + + a += steps; + result += steps; + } } - } + #endif // Handle any remaining elements for (; i < size; ++i) { diff --git a/architecture/x86/simd/SIMD_I16_AVX2.h b/architecture/x86/simd/SIMD_I16_AVX2.h new file mode 100644 index 0000000..90b2cd7 --- /dev/null +++ b/architecture/x86/simd/SIMD_I16_AVX2.h @@ -0,0 +1,262 @@ +/** + * Jingga + * + * @copyright Jingga + * @license OMS License 2.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef COMS_TOS_STDLIB_SIMD_I16_AVX2_H +#define COMS_TOS_STDLIB_SIMD_I16_AVX2_H + +#include +#include + +#include "../../../stdlib/Types.h" + +struct int16_16 { + union { + #if ARM + svint16_t s; + #else + __m256i s; + #endif + + int16 v[16]; + }; +}; + +inline int16_16 load_int16_16(const int16* mem) +{ + int16_16 simd; + simd.s = _mm256_load_si256((__m256i *) mem); + + return simd; +} + +inline int16_16 init_int16_16(const int16* mem) +{ + int16_16 simd; + simd.s = _mm256_set_epi16( + mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7], + mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15] + ); + + return simd; +} + +inline void unload_int16_16(int16_16 a, int16 *array) { _mm256_store_si256((__m256i *) array, a.s); } + 
+inline int16_16 init_zero_int16_16()
+{
+    int16_16 simd;
+    simd.s = _mm256_setzero_si256();
+
+    return simd;
+}
+
+inline int16_16 init_value_int16_16(int16 value)
+{
+    int16_16 simd;
+    simd.s = _mm256_set1_epi16(value);
+
+    return simd;
+}
+
+inline int16_16 operator+(int16_16 a, int16_16 b)
+{
+    int16_16 simd;
+    simd.s = _mm256_add_epi16(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_16 operator-(int16_16 a, int16_16 b)
+{
+    int16_16 simd;
+    simd.s = _mm256_sub_epi16(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_16 operator-(int16_16 a) { return init_zero_int16_16() - a; }
+
+inline int16_16 operator*(int16_16 a, int16_16 b)
+{
+    int16_16 simd;
+    simd.s = _mm256_mullo_epi16(a.s, b.s); // low 16 bits of each 16-bit lane product; _mm256_mul_epi32 multiplies 32-bit lanes
+
+    return simd;
+}
+
+inline int16_16 operator^(int16_16 a, int16_16 b)
+{
+    int16_16 simd;
+    simd.s = _mm256_xor_si256(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_16 &operator-=(int16_16 &a, int16_16 b)
+{
+    a = a - b;
+
+    return a;
+}
+
+inline int16_16 &operator+=(int16_16 &a, int16_16 b)
+{
+    a = a + b;
+
+    return a;
+}
+
+inline int16_16 &operator*=(int16_16 &a, int16_16 b)
+{
+    a = a * b;
+
+    return a;
+}
+
+inline int16_16 &operator^=(int16_16 &a, int16_16 b)
+{
+    a = a ^ b;
+
+    return a;
+}
+
+inline int16_16 operator<(int16_16 a, int16_16 b)
+{
+    int16_16 simd;
+    simd.s = _mm256_cmpgt_epi16(b.s, a.s); // a < b is b > a; NOT(a > b) would also flag equal lanes
+
+    return simd;
+}
+
+inline int16_16 operator<=(int16_16 a, int16_16 b)
+{
+    int16_16 simd;
+    simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi16(a.s, b.s), _mm256_set1_epi16(-1));
+
+    return simd;
+}
+
+inline int16_16 operator>(int16_16 a, int16_16 b)
+{
+    int16_16 simd;
+    simd.s = _mm256_cmpgt_epi16(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_16 operator>=(int16_16 a, int16_16 b)
+{
+    int16_16 simd;
+    simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi16(b.s, a.s), _mm256_set1_epi16(-1));
+
+    return simd;
+}
+
+inline int16_16 operator==(int16_16 a, int16_16 b)
+{
+    int16_16 simd;
+    simd.s = _mm256_cmpeq_epi16(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_16 operator!=(int16_16 a, int16_16 b)
+{
+    int16_16 simd;
+    simd.s = _mm256_andnot_si256(_mm256_cmpeq_epi16(a.s, b.s), _mm256_set1_epi16(-1)); // NOT(a == b) as a lane mask using AVX2 only (no AVX-512VL/BW mask ops)
+
+    return simd;
+}
+
+
+inline int16_16 operator&(int16_16 a, int16_16 b)
+{
+    int16_16 simd;
+    simd.s = _mm256_and_si256(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_16 operator|(int16_16 a, int16_16 b)
+{
+    int16_16 simd;
+    simd.s = _mm256_or_si256(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_16 &operator&=(int16_16 &a, int16_16 b)
+{
+    a = a & b;
+
+    return a;
+}
+
+inline int16_16 &operator|=(int16_16 &a, int16_16 b)
+{
+    a = a | b;
+
+    return a;
+}
+
+inline int16_16 abs(int16_16 a)
+{
+    int16_16 simd;
+    simd.s = _mm256_abs_epi16(a.s);
+
+    return simd;
+}
+
+inline int16_16 simd_min(int16_16 a, int16_16 b)
+{
+    int16_16 simd;
+    simd.s = _mm256_min_epi16(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_16 simd_max(int16_16 a, int16_16 b)
+{
+    int16_16 simd;
+    simd.s = _mm256_max_epi16(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_16 clamp(int16_16 min_value, int16_16 a, int16_16 max_value)
+{
+    return simd_min(simd_max(a, min_value), max_value);
+}
+
+inline int32 which_true(int16_16 a)
+{
+    return _mm256_movemask_epi8(a.s);
+}
+
+inline bool any_true(int16_16 a)
+{
+    bool is_any_true = _mm256_movemask_epi8(a.s) != 0; // the int mask is negative once bit 31 is set, so "> 0" missed those cases
+
+    return is_any_true;
+}
+
+inline bool all_true(int16_16 a)
+{
+    bool is_true = _mm256_movemask_epi8(a.s) == -1; // one bit per byte = 32 bits; all set reads as -1, not 255
+
+    return is_true;
+}
+
+inline bool all_false(int16_16 a)
+{
+    bool is_false = _mm256_movemask_epi8(a.s) == 0;
+
+    return is_false;
+}
+
+#endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_I16_AVX512.h b/architecture/x86/simd/SIMD_I16_AVX512.h
new file mode 100644
index 0000000..26c7a9d
--- /dev/null
+++ b/architecture/x86/simd/SIMD_I16_AVX512.h
@@ -0,0 +1,265 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license OMS License 2.0
+ * @version 1.0.0
+ * @link https://jingga.app
+ */
+#ifndef COMS_TOS_STDLIB_SIMD_I16_AVX512_H
+#define COMS_TOS_STDLIB_SIMD_I16_AVX512_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+#include "../../../stdlib/Types.h"
+
+struct int16_32 {
+    union {
+        #if ARM
+            svint16_t s;
+        #else
+            __m512i s;
+        #endif
+
+        int16 v[32];
+    };
+};
+
+inline int16_32 load_int16_32(const int16* mem)
+{
+    int16_32 simd;
+    simd.s = _mm512_load_si512((__m512i *) mem);
+
+    return simd;
+}
+
+inline int16_32 init_int16_32(const int16* mem)
+{
+    int16_32 simd;
+    simd.s = _mm512_set_epi16(
+        mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7],
+        mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15],
+        mem[16], mem[17], mem[18], mem[19], mem[20], mem[21], mem[22], mem[23],
+        mem[24], mem[25], mem[26], mem[27], mem[28], mem[29], mem[30], mem[31]
+    );
+
+    return simd;
+}
+
+inline void unload_int16_32(int16_32 a, int16 *array) { _mm512_storeu_epi16(array, a.s); }
+
+inline int16_32 init_zero_int16_32()
+{
+    int16_32 simd;
+    simd.s = _mm512_setzero_si512();
+
+    return simd;
+}
+
+inline int16_32 init_value_int16_32(int16 value)
+{
+    int16_32 simd;
+    simd.s = _mm512_set1_epi16(value);
+
+    return simd;
+}
+
+inline int16_32 operator+(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    simd.s = _mm512_add_epi16(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_32 operator-(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    simd.s = _mm512_sub_epi16(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_32 operator-(int16_32 a) { return init_zero_int16_32() - a; }
+
+inline int16_32 operator*(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    simd.s = _mm512_mullo_epi16(a.s, b.s); // low 16 bits of each 16-bit lane product; _mm512_mul_epi32 multiplies 32-bit lanes
+
+    return simd;
+}
+
+inline int16_32 operator^(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    simd.s = _mm512_xor_si512(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_32 &operator-=(int16_32 &a, int16_32 b)
+{
+    a = a - b;
+
+    return a;
+}
+
+inline int16_32 &operator+=(int16_32 &a, int16_32 b)
+{
+    a = a + b;
+
+    return a;
+}
+
+inline int16_32 &operator*=(int16_32 &a, int16_32 b)
+{
+    a = a * b;
+
+    return a;
+}
+
+inline int16_32 &operator^=(int16_32 &a, int16_32 b)
+{
+    a = a ^ b;
+
+    return a;
+}
+
+inline int16_32 operator<(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    simd.s = _mm512_movm_epi16(_mm512_cmplt_epi16_mask(a.s, b.s)); // all-ones/zero lane mask like the 128/256-bit versions; the blend returned a mix of a and b
+
+    return simd;
+}
+
+inline int16_32 operator<=(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    __mmask32 mask = _mm512_cmp_epi16_mask(a.s, b.s, _MM_CMPINT_LE);
+    simd.s = _mm512_movm_epi16(mask); // expand the k-mask to an all-ones/zero lane mask instead of blending a and b
+
+    return simd;
+}
+
+inline int16_32 operator>(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    simd.s = _mm512_movm_epi16(_mm512_cmpgt_epi16_mask(a.s, b.s)); // lane mask, so which_true/any_true/all_true see the comparison result
+
+    return simd;
+}
+
+inline int16_32 operator>=(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    simd.s = _mm512_movm_epi16(_mm512_cmpge_epi16_mask(a.s, b.s)); // lane mask instead of a blend of a and b
+
+    return simd;
+}
+
+inline int16_32 operator==(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    simd.s = _mm512_movm_epi16(_mm512_cmpeq_epi16_mask(a.s, b.s)); // lane mask instead of a blend of a and b
+
+    return simd;
+}
+
+inline int16_32 operator!=(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    simd.s = _mm512_movm_epi16(_mm512_cmp_epi16_mask(a.s, b.s, _MM_CMPINT_NE)); // lane mask instead of a blend of a and b
+
+    return simd;
+}
+
+inline int16_32 operator&(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    simd.s = _mm512_and_si512(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_32 operator|(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    simd.s = _mm512_or_si512(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_32 &operator&=(int16_32 &a, int16_32 b)
+{
+    a = a & b;
+
+    return a;
+}
+
+inline int16_32 &operator|=(int16_32 &a, int16_32 b)
+{
+    a = a | b;
+
+    return a;
+}
+
+inline int16_32 abs(int16_32 a)
+{
+    int16_32 simd;
+    simd.s = _mm512_abs_epi16(a.s);
+
+    return simd;
+}
+
+inline int16_32 simd_min(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    simd.s = _mm512_min_epi16(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_32 simd_max(int16_32 a, int16_32 b)
+{
+    int16_32 simd;
+    simd.s = _mm512_max_epi16(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_32 clamp(int16_32 min_value, int16_32 a, int16_32 max_value)
+{
+    return simd_min(simd_max(a, min_value), max_value);
+}
+
+inline int32 which_true(int16_32 a)
+{
+    return _mm512_movepi16_mask(a.s);
+}
+
+inline bool any_true(int16_32 a)
+{
+    bool is_any_true = _mm512_movepi16_mask(a.s) > 0;
+
+    return is_any_true;
+}
+
+inline bool all_true(int16_32 a)
+{
+    bool is_true = _mm512_movepi16_mask(a.s) == 0xFFFFFFFFu; // one mask bit per 16-bit lane = 32 bits, not 16
+
+    return is_true;
+}
+
+inline bool all_false(int16_32 a)
+{
+    // @todo This can be optimized (requires also changes in the comparison functions return)
+    bool is_false = _mm512_movepi16_mask(a.s) == 0;
+
+    return is_false;
+}
+
+#endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_I16_SSE.h b/architecture/x86/simd/SIMD_I16_SSE.h
new file mode 100644
index 0000000..ea5d295
--- /dev/null
+++ b/architecture/x86/simd/SIMD_I16_SSE.h
@@ -0,0 +1,261 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license OMS License 2.0
+ * @version 1.0.0
+ * @link https://jingga.app
+ */
+#ifndef COMS_TOS_STDLIB_SIMD_I16_SSE_H
+#define COMS_TOS_STDLIB_SIMD_I16_SSE_H
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+#include "../../../stdlib/Types.h"
+
+struct int16_8 {
+    union {
+        #if ARM
+            svint16_t s;
+        #else
+            __m128i s;
+        #endif
+
+        int16 v[8];
+    };
+};
+
+inline int16_8 load_int16_8(const int16* mem)
+{
+    int16_8 simd;
+    simd.s = _mm_load_si128((__m128i *) mem);
+
+    return simd;
+}
+
+inline int16_8 init_int16_8(const int16* mem)
+{
+    int16_8 simd;
+    simd.s = _mm_set_epi16(
+        mem[0], mem[1], mem[2], mem[3],
+        mem[4], mem[5], mem[6], mem[7]
+    );
+
+    return simd;
+}
+
+inline void unload_int16_8(int16_8 a, int16 *array) { _mm_store_si128((__m128i *) array, a.s); }
+
+inline int16_8 init_zero_int16_8()
+{
+    int16_8 simd;
+    simd.s = _mm_setzero_si128();
+
+    return simd;
+}
+
+inline int16_8 init_value_int16_8(int16 value)
+{
+    int16_8 simd;
+    simd.s = _mm_set1_epi16(value);
+
+    return simd;
+}
+
+inline int16_8 operator+(int16_8 a, int16_8 b)
+{
+    int16_8 simd;
+    simd.s = _mm_add_epi16(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_8 operator-(int16_8 a, int16_8 b)
+{
+    int16_8 simd;
+    simd.s = _mm_sub_epi16(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_8 operator-(int16_8 a) { return init_zero_int16_8() - a; }
+
+inline int16_8 operator*(int16_8 a, int16_8 b)
+{
+    int16_8 simd;
+    simd.s = _mm_mullo_epi16(a.s, b.s); // low 16 bits of each 16-bit lane product; _mm_mul_epi32 multiplies 32-bit lanes
+
+    return simd;
+}
+
+inline int16_8 operator^(int16_8 a, int16_8 b)
+{
+    int16_8 simd;
+    simd.s = _mm_xor_si128(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_8 &operator-=(int16_8 &a, int16_8 b)
+{
+    a = a - b;
+
+    return a;
+}
+
+inline int16_8 &operator+=(int16_8 &a, int16_8 b)
+{
+    a = a + b;
+
+    return a;
+}
+
+inline int16_8 &operator*=(int16_8 &a, int16_8 b)
+{
+    a = a * b;
+
+    return a;
+}
+
+inline int16_8 &operator^=(int16_8 &a, int16_8 b)
+{
+    a = a ^ b;
+
+    return a;
+}
+
+inline int16_8 operator<(int16_8 a, int16_8 b)
+{
+    int16_8 simd;
+    simd.s = _mm_cmplt_epi16(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_8 operator<=(int16_8 a, int16_8 b)
+{
+    int16_8 simd;
+    simd.s = _mm_andnot_si128(_mm_cmplt_epi16(b.s, a.s), _mm_set1_epi16(-1));
+
+    return simd;
+}
+
+inline int16_8 operator>(int16_8 a, int16_8 b)
+{
+    int16_8 simd;
+    simd.s = _mm_cmpgt_epi16(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_8 operator>=(int16_8 a, int16_8 b)
+{
+    int16_8 simd;
+    simd.s = _mm_andnot_si128(_mm_cmplt_epi16(a.s, b.s), _mm_set1_epi16(-1));
+
+    return simd;
+}
+
+inline int16_8 operator==(int16_8 a, int16_8 b)
+{
+    int16_8 simd;
+    simd.s = _mm_cmpeq_epi16(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_8 operator!=(int16_8 a, int16_8 b)
+{
+    int16_8 simd;
+    simd.s = _mm_andnot_si128(_mm_cmpeq_epi16(a.s, b.s), _mm_set1_epi16(-1));
+
+    return simd;
+}
+
+inline int16_8 operator&(int16_8 a, int16_8 b)
+{
+    int16_8 simd;
+    simd.s = _mm_and_si128(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_8 operator|(int16_8 a, int16_8 b)
+{
+    int16_8 simd;
+    simd.s = _mm_or_si128(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_8 &operator&=(int16_8 &a, int16_8 b)
+{
+    a = a & b;
+
+    return a;
+}
+
+inline int16_8 &operator|=(int16_8 &a, int16_8 b)
+{
+    a = a | b;
+
+    return a;
+}
+
+inline int16_8 abs(int16_8 a)
+{
+    int16_8 simd;
+    simd.s = _mm_abs_epi16(a.s);
+
+    return simd;
+}
+
+inline int16_8 simd_min(int16_8 a, int16_8 b)
+{
+    int16_8 simd;
+    simd.s = _mm_min_epi16(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_8 simd_max(int16_8 a, int16_8 b)
+{
+    int16_8 simd;
+    simd.s = _mm_max_epi16(a.s, b.s);
+
+    return simd;
+}
+
+inline int16_8 clamp(int16_8 min_value, int16_8 a, int16_8 max_value)
+{
+    return simd_min(simd_max(a, min_value), max_value);
+}
+
+inline int32 which_true(int16_8 a)
+{
+    return _mm_movemask_epi8(a.s);
+}
+
+inline bool any_true(int16_8 a)
+{
+    bool is_any_true = _mm_movemask_epi8(a.s) > 0;
+
+    return is_any_true;
+}
+
+inline bool all_true(int16_8 a)
+{
+    bool is_true = _mm_movemask_epi8(a.s) == 0xFFFF; // one bit per byte = 16 bits; 15 only covers the first two lanes
+
+    return is_true;
+}
+
+inline bool all_false(int16_8 a)
+{
+    bool is_false = _mm_movemask_epi8(a.s) == 0;
+
+    return is_false;
+}
+
+#endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_I32.h b/architecture/x86/simd/SIMD_I32.h
old mode 100644
new mode 100755
index 3633319..86f23f1
--- a/architecture/x86/simd/SIMD_I32.h
+++ b/architecture/x86/simd/SIMD_I32.h
@@ -15,1086 +15,90 @@ #include "../../../stdlib/Types.h" #include "../../../utils/BitUtils.h" -#include "SIMD_F32.h" -// @todo a lot of sse functions require high level (e.g. sse4.1) this needs to be changed to be more general -// or better create alternative functions for the available sse version. +#ifdef MACRO_CPU_FEATURE_SSE42 + #include "SIMD_I32_SSE.h" +#endif -// @question why are we passing structs by value?
+#ifdef MACRO_CPU_FEATURE_AVX2 + #include "SIMD_I32_AVX2.h" +#endif -struct int32_4 { - union { - #if ARM - svint32_t s; - #else - __m128i s; - #endif - - int32 v[4]; - }; -}; - -struct int32_8 { - union { - #if ARM - svint32_t s; - #else - __m256i s; - #endif - - int32 v[8]; - }; -}; - -struct int32_16 { - union { - #if ARM - svint32_t s; - #else - __m512i s; - #endif - - int32 v[16]; - }; -}; - -inline int32_4 load_int32_4(const int32* mem) -{ - int32_4 simd; - simd.s = _mm_load_si128((__m128i *) mem); - - return simd; -} - -inline int32_4 init_int32_4(const int32* mem) -{ - int32_4 simd; - simd.s = _mm_set_epi32(mem[0], mem[1], mem[2], mem[3]); - - return simd; -} - -inline void unload_int32_4(int32_4 a, int32 *array) { _mm_store_si128((__m128i *) array, a.s); } - -inline int32_8 load_int32_8(const int32* mem) -{ - int32_8 simd; - simd.s = _mm256_load_si256((__m256i *) mem); - - return simd; -} - -inline int32_8 init_int32_8(const int32* mem) -{ - int32_8 simd; - simd.s = _mm256_set_epi32(mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7]); - - return simd; -} - -inline void unload_int32_8(int32_8 a, int32 *array) { _mm256_store_si256((__m256i *) array, a.s); } - -inline int32_16 load_int32_16(const int32* mem) -{ - int32_16 simd; - simd.s = _mm512_load_epi32(mem); - - return simd; -} - -inline int32_16 init_int32_16(const int32* mem) -{ - int32_16 simd; - simd.s = _mm512_set_epi32( - mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7], - mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15]); - - return simd; -} - -inline void unload_int32_16(int32_16 a, int32 *array) { _mm512_store_epi32(array, a.s); } - -inline int32_4 init_zero_int32_4() -{ - int32_4 simd; - simd.s = _mm_setzero_si128(); - - return simd; -} - -inline int32_8 init_zero_int32_8() -{ - int32_8 simd; - simd.s = _mm256_setzero_si256(); - - return simd; -} - -inline int32_16 init_zero_int32_16() -{ - int32_16 simd; - simd.s = _mm512_setzero_epi32(); - - return 
simd; -} - -inline int32_4 init_value_int32_4(int32 value) -{ - int32_4 simd; - simd.s = _mm_set1_epi32(value); - - return simd; -} - -inline int32_8 init_value_int32_8(int32 value) -{ - int32_8 simd; - simd.s = _mm256_set1_epi32(value); - - return simd; -} - -inline int32_16 init_value_int32_16(int32 value) -{ - int32_16 simd; - simd.s = _mm512_set1_epi32(value); - - return simd; -} - -inline int32_4 init_values_int32_4(int32 a, int32 b, int32 c, int32 d) -{ - int32_4 simd; - simd.s = _mm_set_epi32(a, b, c, d); - - return simd; -} - -inline int32_8 init_values_int32_8( - int32 a, int32 b, int32 c, int32 d, - int32 e, int32 f, int32 g, int32 h -) -{ - int32_8 simd; - simd.s = _mm256_set_epi32(a, b, c, d, e, f, g, h); - - return simd; -} - -inline int32_16 init_values_int32_16( - int32 a, int32 b, int32 c, int32 d, - int32 e, int32 f, int32 g, int32 h, - int32 i, int32 j, int32 k, int32 l, - int32 m, int32 n, int32 o, int32 p -) -{ - int32_16 simd; - simd.s = _mm512_set_epi32(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p); - - return simd; -} - -inline -int32_4 f32_4_to_int32_4(f32_4 a) -{ - int32_4 result; - result.s = _mm_cvtps_epi32(a.s); - - return result; -} - -inline -f32_4 int32_4_to_f32_4(int32_4 a) -{ - f32_4 result; - result.s = _mm_cvtepi32_ps(a.s); - - return result; -} - -inline -int32_8 f32_8_to_int32_8(f32_8 a) -{ - int32_8 result; - result.s = _mm256_cvtps_epi32(a.s); - - return result; -} - -inline -f32_8 int32_8_to_f32_8(int32_8 a) -{ - f32_8 result; - result.s = _mm256_cvtepi32_ps(a.s); - - return result; -} - -inline -int32_16 f32_16_to_int32_16(f32_16 a) -{ - int32_16 result; - result.s = _mm512_cvtps_epi32(a.s); - - return result; -} - -inline -f32_16 int32_16_to_f32_16(int32_16 a) -{ - f32_16 result; - result.s = _mm512_cvtepi32_ps(a.s); - - return result; -} - -inline int32_4 operator+(int32_4 a, int32_4 b) -{ - int32_4 simd; - simd.s = _mm_add_epi32(a.s, b.s); - - return simd; -} - -inline int32_8 operator+(int32_8 a, int32_8 b) -{ - 
int32_8 simd; - simd.s = _mm256_add_epi32(a.s, b.s); - - return simd; -} - -inline int32_16 operator+(int32_16 a, int32_16 b) -{ - int32_16 simd; - simd.s = _mm512_add_epi32(a.s, b.s); - - return simd; -} - -inline int32_4 operator-(int32_4 a, int32_4 b) -{ - int32_4 simd; - simd.s = _mm_sub_epi32(a.s, b.s); - - return simd; -} - -inline int32_4 operator-(int32_4 a) { return init_zero_int32_4() - a; } - -inline int32_8 operator-(int32_8 a, int32_8 b) -{ - int32_8 simd; - simd.s = _mm256_sub_epi32(a.s, b.s); - - return simd; -} - -inline int32_8 operator-(int32_8 a) { return init_zero_int32_8() - a; } - -inline int32_16 operator-(int32_16 a, int32_16 b) -{ - int32_16 simd; - simd.s = _mm512_sub_epi32(a.s, b.s); - - return simd; -} - -inline int32_16 operator-(int32_16 a) { return init_zero_int32_16() - a; } - -inline int32_4 operator*(int32_4 a, int32_4 b) -{ - int32_4 simd; - simd.s = _mm_mul_epi32(a.s, b.s); - - return simd; -} - -inline int32_8 operator*(int32_8 a, int32_8 b) -{ - int32_8 simd; - simd.s = _mm256_mul_epi32(a.s, b.s); - - return simd; -} - -inline int32_16 operator*(int32_16 a, int32_16 b) -{ - int32_16 simd; - simd.s = _mm512_mul_epi32(a.s, b.s); - - return simd; -} - -inline int32_4 operator/(int32_4 a, int32_4 b) -{ - int32_4 simd; - simd.s = _mm_div_epi32(a.s, b.s); - - return simd; -} - -inline int32_8 operator/(int32_8 a, int32_8 b) -{ - int32_8 simd; - simd.s = _mm256_div_epi32(a.s, b.s); - - return simd; -} - -inline int32_16 operator/(int32_16 a, int32_16 b) -{ - int32_16 simd; - simd.s = _mm512_div_epi32(a.s, b.s); - - return simd; -} - -inline f32_4 operator/(f32_4 a, int32_4 b) -{ - f32_4 simd; - simd.s = _mm_div_ps(a.s, _mm_cvtepi32_ps(b.s)); - - return simd; -} - -inline f32_8 operator/(f32_8 a, int32_8 b) -{ - f32_8 simd; - simd.s = _mm256_div_ps(a.s, _mm256_cvtepi32_ps(b.s)); - - return simd; -} - -inline f32_16 operator/(f32_16 a, int32_16 b) -{ - f32_16 simd; - simd.s = _mm512_div_ps(a.s, _mm512_cvtepi32_ps(b.s)); - - return simd; 
-} - -inline f32_4 operator/(int32_4 a, f32_4 b) -{ - f32_4 simd; - simd.s = _mm_div_ps(_mm_cvtepi32_ps(a.s), b.s); - - return simd; -} - -inline f32_8 operator/(int32_8 a, f32_8 b) -{ - f32_8 simd; - simd.s = _mm256_div_ps(_mm256_cvtepi32_ps(a.s), b.s); - - return simd; -} - -inline f32_16 operator/(int32_16 a, f32_16 b) -{ - f32_16 simd; - simd.s = _mm512_div_ps(_mm512_cvtepi32_ps(a.s), b.s); - - return simd; -} - -inline int32_4 operator^(int32_4 a, int32_4 b) -{ - int32_4 simd; - simd.s = _mm_xor_epi32(a.s, b.s); - - return simd; -} - -inline int32_8 operator^(int32_8 a, int32_8 b) -{ - int32_8 simd; - simd.s = _mm256_xor_epi32(a.s, b.s); - - return simd; -} - -inline int32_16 operator^(int32_16 a, int32_16 b) -{ - int32_16 simd; - simd.s = _mm512_xor_epi32(a.s, b.s); - - return simd; -} - -inline int32_4 &operator-=(int32_4 &a, int32_4 b) -{ - a = a - b; - - return a; -} - -inline int32_8 &operator-=(int32_8 &a, int32_8 b) -{ - a = a - b; - - return a; -} - -inline int32_16 &operator-=(int32_16 &a, int32_16 b) -{ - a = a - b; - - return a; -} - -inline int32_4 &operator+=(int32_4 &a, int32_4 b) -{ - a = a + b; - - return a; -} - -inline int32_8 &operator+=(int32_8 &a, int32_8 b) -{ - a = a + b; - - return a; -} - -inline int32_16 &operator+=(int32_16 &a, int32_16 b) -{ - a = a + b; - - return a; -} - -inline int32_4 &operator*=(int32_4 &a, int32_4 b) -{ - a = a * b; - - return a; -} - -inline int32_8 &operator*=(int32_8 &a, int32_8 b) -{ - a = a * b; - - return a; -} - -inline int32_16 &operator*=(int32_16 &a, int32_16 b) -{ - a = a * b; - - return a; -} - -inline int32_4 &operator/=(int32_4 &a, int32_4 b) -{ - a.s = (a / b).s; - - return a; -} - -inline int32_8 &operator/=(int32_8 &a, int32_8 b) -{ - a.s = (a / b).s; - - return a; -} - -inline int32_16 &operator/=(int32_16 &a, int32_16 b) -{ - a.s = (a / b).s; - - return a; -} - -inline int32_4 &operator^=(int32_4 &a, int32_4 b) -{ - a = a ^ b; - - return a; -} - -inline int32_8 &operator^=(int32_8 &a, 
int32_8 b) -{ - a = a ^ b; - - return a; -} - -inline int32_16 &operator^=(int32_16 &a, int32_16 b) -{ - a = a ^ b; - - return a; -} - -inline int32_4 operator<(int32_4 a, int32_4 b) -{ - int32_4 simd; - simd.s = _mm_cmplt_epi32(a.s, b.s); - - return simd; -} - -inline int32_8 operator<(int32_8 a, int32_8 b) -{ - int32_8 simd; - simd.s = _mm256_xor_si256(_mm256_cmpgt_epi32(a.s, b.s), _mm256_set1_epi32(-1)); - - return simd; -} - -inline int32_16 operator<(int32_16 a, int32_16 b) -{ - int32_16 simd; - simd.s = _mm512_mask_blend_epi32(_mm512_cmplt_epi32_mask(a.s, b.s), a.s, b.s); - - return simd; -} - -inline int32_4 operator<=(int32_4 a, int32_4 b) -{ - int32_4 simd; - simd.s = _mm_andnot_si128(_mm_cmplt_epi32(b.s, a.s), _mm_set1_epi32(-1)); - - return simd; -} - -inline int32_8 operator<=(int32_8 a, int32_8 b) -{ - int32_8 simd; - simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi32(a.s, b.s), _mm256_set1_epi32(-1)); - - return simd; -} - -inline int32_16 operator<=(int32_16 a, int32_16 b) -{ - int32_16 simd; - simd.s = _mm512_mask_blend_epi32(_mm512_knot(_mm512_cmpgt_epi32_mask(b.s, a.s)), b.s, a.s); - - return simd; -} - -inline int32_4 operator>(int32_4 a, int32_4 b) -{ - int32_4 simd; - simd.s = _mm_cmpgt_epi32(a.s, b.s); - - return simd; -} - -inline int32_8 operator>(int32_8 a, int32_8 b) -{ - int32_8 simd; - simd.s = _mm256_cmpgt_epi32(a.s, b.s); - - return simd; -} - -inline int32_16 operator>(int32_16 a, int32_16 b) -{ - int32_16 simd; - simd.s = _mm512_mask_blend_epi32(_mm512_cmpgt_epi32_mask(a.s, b.s), a.s, b.s); - - return simd; -} - -inline int32_4 operator>=(int32_4 a, int32_4 b) -{ - int32_4 simd; - simd.s = _mm_andnot_si128(_mm_cmplt_epi32(a.s, b.s), _mm_set1_epi32(-1)); - - return simd; -} - -inline int32_8 operator>=(int32_8 a, int32_8 b) -{ - int32_8 simd; - simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi32(b.s, a.s), _mm256_set1_epi32(-1)); - - return simd; -} - -inline int32_16 operator>=(int32_16 a, int32_16 b) -{ - int32_16 simd; - simd.s = 
_mm512_mask_blend_epi32(_mm512_cmpge_epi32_mask(a.s, b.s), a.s, b.s); - - return simd; -} - -inline int32_4 operator==(int32_4 a, int32_4 b) -{ - int32_4 simd; - simd.s = _mm_cmpeq_epi32(a.s, b.s); - - return simd; -} - -inline int32_8 operator==(int32_8 a, int32_8 b) -{ - int32_8 simd; - simd.s = _mm256_cmpeq_epi32(a.s, b.s); - - return simd; -} - -inline int32_16 operator==(int32_16 a, int32_16 b) -{ - int32_16 simd; - simd.s = _mm512_mask_blend_epi32(_mm512_cmpeq_epi32_mask(a.s, b.s), a.s, b.s); - - return simd; -} - -inline int32_4 operator!=(int32_4 a, int32_4 b) -{ - int32_4 simd; - simd.s = _mm_andnot_si128(_mm_cmpeq_epi32(a.s, b.s), _mm_set1_epi32(-1)); - - return simd; -} - -inline int32_8 operator!=(int32_8 a, int32_8 b) -{ - int32_8 simd; - simd.s = _mm256_mask_blend_epi32(_mm256_cmp_epi32_mask(a.s, b.s, _MM_CMPINT_NE), a.s, b.s); - - return simd; -} - -inline int32_16 operator!=(int32_16 a, int32_16 b) -{ - int32_16 simd; - simd.s = _mm512_mask_blend_epi32(_mm512_cmp_epi32_mask(a.s, b.s, _MM_CMPINT_NE), a.s, b.s); - - return simd; -} - -inline int32_4 operator&(int32_4 a, int32_4 b) -{ - int32_4 simd; - simd.s = _mm_and_si128(a.s, b.s); - - return simd; -} - -inline int32_8 operator&(int32_8 a, int32_8 b) -{ - int32_8 simd; - simd.s = _mm256_and_si256(a.s, b.s); - - return simd; -} - -inline int32_16 operator&(int32_16 a, int32_16 b) -{ - int32_16 simd; - simd.s = _mm512_and_si512(a.s, b.s); - - return simd; -} - -inline int32_4 operator|(int32_4 a, int32_4 b) -{ - int32_4 simd; - simd.s = _mm_or_epi32(a.s, b.s); - - return simd; -} - -inline int32_8 operator|(int32_8 a, int32_8 b) -{ - int32_8 simd; - simd.s = _mm256_or_epi32(a.s, b.s); - - return simd; -} - -inline int32_16 operator|(int32_16 a, int32_16 b) -{ - int32_16 simd; - simd.s = _mm512_or_epi32(a.s, b.s); - - return simd; -} - -inline int32_4 &operator&=(int32_4 &a, int32_4 b) -{ - a = a & b; - - return a; -} - -inline int32_8 &operator&=(int32_8 &a, int32_8 b) -{ - a = a & b; - - return a; 
-} - -inline int32_16 &operator&=(int32_16 &a, int32_16 b) -{ - a = a & b; - - return a; -} - -inline int32_4 &operator|=(int32_4 &a, int32_4 b) -{ - a = a | b; - - return a; -} - -inline int32_8 &operator|=(int32_8 &a, int32_8 b) -{ - a = a | b; - - return a; -} - -inline int32_16 &operator|=(int32_16 &a, int32_16 b) -{ - a = a | b; - - return a; -} - -inline int32_4 abs(int32_4 a) -{ - int32_4 simd; - simd.s = _mm_abs_epi32(a.s); - - return simd; -} - -inline int32_8 abs(int32_8 a) -{ - int32_8 simd; - simd.s = _mm256_abs_epi32(a.s); - - return simd; -} - -inline int32_16 abs(int32_16 a) -{ - int32_16 simd; - simd.s = _mm512_abs_epi64(a.s); - - return simd; -} - -inline int32_4 simd_min(int32_4 a, int32_4 b) -{ - int32_4 simd; - simd.s = _mm_min_epi32(a.s, b.s); - - return simd; -} - -inline int32_8 simd_min(int32_8 a, int32_8 b) -{ - int32_8 simd; - simd.s = _mm256_min_epi32(a.s, b.s); - - return simd; -} - -inline int32_16 simd_min(int32_16 a, int32_16 b) -{ - int32_16 simd; - simd.s = _mm512_min_epi32(a.s, b.s); - - return simd; -} - -inline int32_4 simd_max(int32_4 a, int32_4 b) -{ - int32_4 simd; - simd.s = _mm_max_epi32(a.s, b.s); - - return simd; -} - -inline int32_8 simd_max(int32_8 a, int32_8 b) -{ - int32_8 simd; - simd.s = _mm256_max_epi32(a.s, b.s); - - return simd; -} - -inline int32_16 simd_max(int32_16 a, int32_16 b) -{ - int32_16 simd; - simd.s = _mm512_max_epi32(a.s, b.s); - - return simd; -} - -inline int32_4 sign(int32_4 a) -{ - __m128i mask = _mm_set1_epi32(0x80000000); - __m128i signBit = _mm_and_si128(a.s, mask); - __m128i b = _mm_set1_epi32(1); - - int32_4 simd; - simd.s = _mm_or_si128(b, signBit); - - return simd; -} - -inline int32_8 sign(int32_8 a) -{ - __m256i mask = _mm256_set1_epi32(0x80000000); - __m256i signBit = _mm256_and_si256(a.s, mask); - __m256i b = _mm256_set1_epi32(1); - - int32_8 simd; - simd.s = _mm256_or_si256(b, signBit); - - return simd; -} - -inline int32_16 sign(int32_16 a) -{ - __m512i mask = 
_mm512_set1_epi32(0x80000000); - __m512i signBit = _mm512_and_si512(a.s, mask); - __m512i b = _mm512_set1_epi32(1); - int32_16 simd; - - simd.s = _mm512_or_si512(b, signBit); - - return simd; -} - -inline f32_4 sqrt(int32_4 a) -{ - f32_4 simd; - simd.s = _mm_sqrt_ps(_mm_cvtepi32_ps(a.s)); - - return simd; -} - -inline f32_8 sqrt(int32_8 a) -{ - f32_8 simd; - simd.s = _mm256_sqrt_ps(_mm256_cvtepi32_ps(a.s)); - - return simd; -} - -inline f32_16 sqrt(int32_16 a) -{ - f32_16 simd; - simd.s = _mm512_sqrt_ps(_mm512_cvtepi32_ps(a.s)); - - return simd; -} - -inline f32_4 sqrt_inv_approx(int32_4 a) -{ - f32_4 simd; - simd.s = _mm_rsqrt_ps(_mm_cvtepi32_ps(a.s)); - - return simd; -} - -inline f32_8 sqrt_inv_approx(int32_8 a) -{ - f32_8 simd; - simd.s = _mm256_rsqrt_ps(_mm256_cvtepi32_ps(a.s)); - - return simd; -} - -inline f32_16 sqrt_inv_approx(int32_16 a) -{ - f32_16 simd; - simd.s = _mm512_rsqrt14_ps(_mm512_cvtepi32_ps(a.s)); - - return simd; -} - -inline f32_4 one_over_approx(int32_4 a) -{ - f32_4 simd; - simd.s = _mm_rcp_ps(_mm_cvtepi32_ps(a.s)); - - return simd; -} - -inline f32_8 one_over_approx(int32_8 a) -{ - f32_8 simd; - simd.s = _mm256_rcp_ps(_mm256_cvtepi32_ps(a.s)); - - return simd; -} - -inline f32_16 one_over_approx(int32_16 a) -{ - f32_16 simd; - simd.s = _mm512_rcp14_ps(_mm512_cvtepi32_ps(a.s)); - - return simd; -} - -inline int32_4 clamp(int32_4 min_value, int32_4 a, int32_4 max_value) -{ - return simd_min(simd_max(a, min_value), max_value); -} - -inline int32_8 clamp(int32_8 min_value, int32_8 a, int32_8 max_value) -{ - return simd_min(simd_max(a, min_value), max_value); -} - -inline int32_16 clamp(int32_16 min_value, int32_16 a, int32_16 max_value) -{ - return simd_min(simd_max(a, min_value), max_value); -} - -inline int32 which_true(int32_4 a) -{ - int32 which_true = _mm_movemask_epi8(a.s); - - return which_true; -} - -inline int32 which_true(int32_8 a) -{ - int32 which_true = _mm256_movemask_epi8(a.s); - - return which_true; -} - -inline int32 
which_true(int32_16 a) -{ - int32 which_true = _mm512_movepi32_mask(a.s); - - return which_true; -} - -inline bool any_true(int32_4 a) -{ - bool is_any_true = _mm_movemask_epi8(a.s) > 0; - - return is_any_true; -} - -inline bool any_true(int32_8 a) -{ - bool is_any_true = _mm256_movemask_epi8(a.s) > 0; - - return is_any_true; -} - -inline bool any_true(int32_16 a) -{ - bool is_any_true = _mm512_movepi32_mask(a.s) > 0; - - return is_any_true; -} - -inline bool all_true(int32_4 a) -{ - bool is_true = _mm_movemask_epi8(a.s) == 15; - - return is_true; -} - -inline bool all_true(int32_8 a) -{ - bool is_true = _mm256_movemask_epi8(a.s) == 255; - - return is_true; -} - -inline bool all_true(int32_16 a) -{ - bool is_true = _mm512_movepi32_mask(a.s) == 65535; - - return is_true; -} - -inline bool all_false(int32_4 a) -{ - bool is_false = _mm_movemask_epi8(a.s) == 0; - - return is_false; -} - -inline bool all_false(int32_8 a) -{ - bool is_false = _mm256_movemask_epi8(a.s) == 0; - - return is_false; -} - -inline bool all_false(int32_16 a) -{ - // @todo This can be optimized (requires also changes in the comparison functions return) - bool is_false = _mm512_movepi32_mask(a.s) == 0; - - return is_false; -} - -// @todo from down here we can optimize some of the code by NOT using the wrappers -// the code is self contained and we could use te intrinsic functions directly +#ifdef MACRO_CPU_FEATURE_AVX512 + #include "SIMD_I32_AVX512.h" +#endif inline void simd_mult(const int32* a, const int32* b, int32* result, int32 size, int32 steps) { int32 i = 0; + steps = intrin_validate_steps((const byte*) a, steps); + steps = intrin_validate_steps((const byte*) b, steps); + steps = intrin_validate_steps((const byte*) result, steps); - if (steps == 16) { - __m512i a_16; - __m512i b_16; - __m512i result_16; + #ifdef MACRO_CPU_FEATURE_AVX512 + if (steps >= 16) { + steps = 16; + __m512i a_16; + __m512i b_16; + __m512i result_16; - for (; i <= size - steps; i += steps) { - a_16 = 
_mm512_load_epi32(a); - b_16 = _mm512_load_epi32(b); - result_16 = _mm512_mul_epi32(a_16, b_16); - _mm512_store_epi32(result, result_16); + for (; i <= size - steps; i += steps) { + a_16 = _mm512_load_epi32(a); + b_16 = _mm512_load_epi32(b); + result_16 = _mm512_mul_epi32(a_16, b_16); + _mm512_store_epi32(result, result_16); - a += steps; - b += steps; - result += steps; - } - } else if (steps == 8) { - __m256i a_8; - __m256i b_8; - __m256i result_8; + a += steps; + b += steps; + result += steps; + } - for (; i <= size - steps; i += steps) { - a_8 = _mm256_load_si256((__m256i *) a); - b_8 = _mm256_load_si256((__m256i *) b); - result_8 = _mm256_mul_epi32(a_8, b_8); - _mm256_store_si256((__m256i *) result, result_8); + steps = 1; + } + #endif - a += steps; - b += steps; - result += steps; - } - } else if (steps == 4) { - __m128i a_4; - __m128i b_4; - __m128i result_4; + #ifdef MACRO_CPU_FEATURE_AVX2 + if (steps >= 8) { + steps = 8; + __m256i a_8; + __m256i b_8; + __m256i result_8; - for (; i <= size - steps; i += steps) { - a_4 = _mm_load_si128((__m128i *) a); - b_4 = _mm_load_si128((__m128i *) b); - result_4 = _mm_mul_epi32(a_4, b_4); - _mm_store_si128((__m128i *) result, result_4); + for (; i <= size - steps; i += steps) { + a_8 = _mm256_load_si256((__m256i *) a); + b_8 = _mm256_load_si256((__m256i *) b); + result_8 = _mm256_mul_epi32(a_8, b_8); + _mm256_store_si256((__m256i *) result, result_8); - a += steps; - b += steps; - result += steps; - } - } + a += steps; + b += steps; + result += steps; + } + + steps = 1; + } + #endif + + #ifdef MACRO_CPU_FEATURE_SSE42 + if (steps >= 4) { + steps = 4; + __m128i a_4; + __m128i b_4; + __m128i result_4; + + for (; i <= size - steps; i += steps) { + a_4 = _mm_load_si128((__m128i *) a); + b_4 = _mm_load_si128((__m128i *) b); + result_4 = _mm_mul_epi32(a_4, b_4); + _mm_store_si128((__m128i *) result, result_4); + + a += steps; + b += steps; + result += steps; + } + } + #endif for (; i < size; ++i) { ++a; @@ -1109,59 +113,79 @@ 
inline void simd_mult(const int32* a, const f32* b, f32* result, int32 size, int32 steps) { int32 i = 0; + steps = intrin_validate_steps((const byte*) a, steps); + steps = intrin_validate_steps((const byte*) b, steps); + steps = intrin_validate_steps((const byte*) result, steps); - if (steps == 16) { - __m512i a_16; - __m512 af_16; - __m512 b_16; - __m512 result_16; + #ifdef MACRO_CPU_FEATURE_AVX512 + if (steps >= 16) { + steps = 16; + __m512i a_16; + __m512 af_16; + __m512 b_16; + __m512 result_16; - for (; i <= size - steps; i += steps) { - a_16 = _mm512_load_epi32(a); - af_16 = _mm512_cvtepi32_ps(a_16); - b_16 = _mm512_load_ps(b); - result_16 = _mm512_mul_ps(af_16, b_16); - _mm512_store_ps(result, result_16); + for (; i <= size - steps; i += steps) { + a_16 = _mm512_load_epi32(a); + af_16 = _mm512_cvtepi32_ps(a_16); + b_16 = _mm512_load_ps(b); + result_16 = _mm512_mul_ps(af_16, b_16); + _mm512_store_ps(result, result_16); - a += steps; - b += steps; - result += steps; - } - } else if (steps == 8) { - __m256i a_8; - __m256 af_8; - __m256 b_8; - __m256 result_8; + a += steps; + b += steps; + result += steps; + } - for (; i <= size - steps; i += steps) { - a_8 = _mm256_load_si256((__m256i *) a); - af_8 = _mm256_cvtepi32_ps(a_8); - b_8 = _mm256_load_ps(b); - result_8 = _mm256_mul_ps(af_8, b_8); - _mm256_store_ps(result, result_8); + steps = 1; + } + #endif - a += steps; - b += steps; - result += steps; - } - } else if (steps == 4) { - __m128i a_4; - __m128 af_4; - __m128 b_4; - __m128 result_4; + #ifdef MACRO_CPU_FEATURE_AVX2 + if (steps >= 8) { + steps = 8; + __m256i a_8; + __m256 af_8; + __m256 b_8; + __m256 result_8; - for (; i <= size - steps; i += steps) { - a_4 = _mm_load_si128((__m128i *) a); - af_4 = _mm_cvtepi32_ps(a_4); - b_4 = _mm_load_ps(b); - result_4 = _mm_mul_ps(af_4, b_4); - _mm_store_ps(result, result_4); + for (; i <= size - steps; i += steps) { + a_8 = _mm256_load_si256((__m256i *) a); + af_8 = _mm256_cvtepi32_ps(a_8); + b_8 = _mm256_load_ps(b); + 
result_8 = _mm256_mul_ps(af_8, b_8); + _mm256_store_ps(result, result_8); - a += steps; - b += steps; - result += steps; - } - } + a += steps; + b += steps; + result += steps; + } + + steps = 1; + } + #endif + + #ifdef MACRO_CPU_FEATURE_SSE42 + if (steps >= 4) { + steps = 4; + __m128i a_4; + __m128 af_4; + __m128 b_4; + __m128 result_4; + + for (; i <= size - steps; i += steps) { + a_4 = _mm_load_si128((__m128i *) a); + af_4 = _mm_cvtepi32_ps(a_4); + b_4 = _mm_load_ps(b); + result_4 = _mm_mul_ps(af_4, b_4); + _mm_store_ps(result, result_4); + + a += steps; + b += steps; + result += steps; + } + } + #endif for (; i < size; ++i) { *result = *a * *b; @@ -1176,65 +200,85 @@ inline void simd_mult(const int32* a, const f32* b, int32* result, int32 size, int32 steps) { int32 i = 0; + steps = intrin_validate_steps((const byte*) a, steps); + steps = intrin_validate_steps((const byte*) b, steps); + steps = intrin_validate_steps((const byte*) result, steps); - if (steps == 16) { - __m512i a_16; - __m512 af_16; - __m512 b_16; - __m512 result_16; - __m512i resulti_16; + #ifdef MACRO_CPU_FEATURE_AVX512 + if (steps >= 16) { + steps = 16; + __m512i a_16; + __m512 af_16; + __m512 b_16; + __m512 result_16; + __m512i resulti_16; - for (; i <= size - steps; i += steps) { - a_16 = _mm512_load_epi32(a); - af_16 = _mm512_cvtepi32_ps(a_16); - b_16 = _mm512_load_ps(b); - result_16 = _mm512_mul_ps(af_16, b_16); - resulti_16 = _mm512_cvtps_epi32(result_16); - _mm512_store_epi32(result, resulti_16); + for (; i <= size - steps; i += steps) { + a_16 = _mm512_load_epi32(a); + af_16 = _mm512_cvtepi32_ps(a_16); + b_16 = _mm512_load_ps(b); + result_16 = _mm512_mul_ps(af_16, b_16); + resulti_16 = _mm512_cvtps_epi32(result_16); + _mm512_store_epi32(result, resulti_16); - a += steps; - b += steps; - result += steps; - } - } else if (steps == 8) { - __m256i a_8; - __m256 af_8; - __m256 b_8; - __m256 result_8; - __m256i resulti_8; + a += steps; + b += steps; + result += steps; + } - for (; i <= size - 
steps; i += steps) { - a_8 = _mm256_load_si256((__m256i *) a); - af_8 = _mm256_cvtepi32_ps(a_8); - b_8 = _mm256_load_ps(b); - result_8 = _mm256_mul_ps(af_8, b_8); - resulti_8 = _mm256_cvtps_epi32(result_8); - _mm256_store_si256((__m256i *) result, resulti_8); + steps = 1; + } + #endif - a += steps; - b += steps; - result += steps; - } - } else if (steps == 4) { - __m128i a_4; - __m128 af_4; - __m128 b_4; - __m128 result_4; - __m128i resulti_4; + #ifdef MACRO_CPU_FEATURE_AVX2 + if (steps >= 8) { + steps = 8; + __m256i a_8; + __m256 af_8; + __m256 b_8; + __m256 result_8; + __m256i resulti_8; - for (; i <= size - steps; i += steps) { - a_4 = _mm_load_si128((__m128i *) a); - af_4 = _mm_cvtepi32_ps(a_4); - b_4 = _mm_load_ps(b); - result_4 = _mm_mul_ps(af_4, b_4); - resulti_4 = _mm_cvtps_epi32(result_4); - _mm_store_si128((__m128i *) result, resulti_4); + for (; i <= size - steps; i += steps) { + a_8 = _mm256_load_si256((__m256i *) a); + af_8 = _mm256_cvtepi32_ps(a_8); + b_8 = _mm256_load_ps(b); + result_8 = _mm256_mul_ps(af_8, b_8); + resulti_8 = _mm256_cvtps_epi32(result_8); + _mm256_store_si256((__m256i *) result, resulti_8); - a += steps; - b += steps; - result += steps; - } - } + a += steps; + b += steps; + result += steps; + } + + steps = 1; + } + #endif + + #ifdef MACRO_CPU_FEATURE_SSE42 + if (steps >= 4) { + steps = 4; + __m128i a_4; + __m128 af_4; + __m128 b_4; + __m128 result_4; + __m128i resulti_4; + + for (; i <= size - steps; i += steps) { + a_4 = _mm_load_si128((__m128i *) a); + af_4 = _mm_cvtepi32_ps(a_4); + b_4 = _mm_load_ps(b); + result_4 = _mm_mul_ps(af_4, b_4); + resulti_4 = _mm_cvtps_epi32(result_4); + _mm_store_si128((__m128i *) result, resulti_4); + + a += steps; + b += steps; + result += steps; + } + } + #endif for (; i < size; ++i) { *result = (int32) (*a * *b); @@ -1249,59 +293,78 @@ inline void simd_mult(const int32* a, f32 b, int32* result, int32 size, int32 steps) { int32 i = 0; + steps = intrin_validate_steps((const byte*) a, steps); + steps 
= intrin_validate_steps((const byte*) result, steps); - if (steps == 16) { - __m512i a_16; - __m512 af_16; - __m512 b_16 = _mm512_set1_ps(b); - __m512 result_16; - __m512i resulti_16; + #ifdef MACRO_CPU_FEATURE_AVX512 + if (steps >= 16) { + steps = 16; + __m512i a_16; + __m512 af_16; + __m512 b_16 = _mm512_set1_ps(b); + __m512 result_16; + __m512i resulti_16; - for (; i <= size - steps; i += steps) { - a_16 = _mm512_load_epi32(a); - af_16 = _mm512_cvtepi32_ps(a_16); - result_16 = _mm512_mul_ps(af_16, b_16); - resulti_16 = _mm512_cvtps_epi32(result_16); - _mm512_store_epi32(result, resulti_16); + for (; i <= size - steps; i += steps) { + a_16 = _mm512_load_epi32(a); + af_16 = _mm512_cvtepi32_ps(a_16); + result_16 = _mm512_mul_ps(af_16, b_16); + resulti_16 = _mm512_cvtps_epi32(result_16); + _mm512_store_epi32(result, resulti_16); - a += steps; - result += steps; - } - } else if (steps == 8) { - __m256i a_8; - __m256 af_8; - __m256 b_8 = _mm256_set1_ps(b); - __m256 result_8; - __m256i resulti_8; + a += steps; + result += steps; + } - for (; i <= size - steps; i += steps) { - a_8 = _mm256_load_si256((__m256i *) a); - af_8 = _mm256_cvtepi32_ps(a_8); - result_8 = _mm256_mul_ps(af_8, b_8); - resulti_8 = _mm256_cvtps_epi32(result_8); - _mm256_store_si256((__m256i *) result, resulti_8); + steps = 1; + } + #endif - a += steps; - result += steps; - } - } else if (steps == 4) { - __m128i a_4; - __m128 af_4; - __m128 b_4 = _mm_set1_ps(b); - __m128 result_4; - __m128i resulti_4; + #ifdef MACRO_CPU_FEATURE_AVX2 + if (steps >= 8) { + steps = 8; + __m256i a_8; + __m256 af_8; + __m256 b_8 = _mm256_set1_ps(b); + __m256 result_8; + __m256i resulti_8; - for (; i <= size - steps; i += steps) { - a_4 = _mm_load_si128((__m128i *) a); - af_4 = _mm_cvtepi32_ps(a_4); - result_4 = _mm_mul_ps(af_4, b_4); - resulti_4 = _mm_cvtps_epi32(result_4); - _mm_store_si128((__m128i *) result, resulti_4); + for (; i <= size - steps; i += steps) { + a_8 = _mm256_load_si256((__m256i *) a); + af_8 = 
_mm256_cvtepi32_ps(a_8); + result_8 = _mm256_mul_ps(af_8, b_8); + resulti_8 = _mm256_cvtps_epi32(result_8); + _mm256_store_si256((__m256i *) result, resulti_8); - a += steps; - result += steps; - } - } + a += steps; + result += steps; + } + + steps = 1; + } + #endif + + #ifdef MACRO_CPU_FEATURE_SSE42 + if (steps >= 4) { + steps = 4; + __m128i a_4; + __m128 af_4; + __m128 b_4 = _mm_set1_ps(b); + __m128 result_4; + __m128i resulti_4; + + for (; i <= size - steps; i += steps) { + a_4 = _mm_load_si128((__m128i *) a); + af_4 = _mm_cvtepi32_ps(a_4); + result_4 = _mm_mul_ps(af_4, b_4); + resulti_4 = _mm_cvtps_epi32(result_4); + _mm_store_si128((__m128i *) result, resulti_4); + + a += steps; + result += steps; + } + } + #endif for (; i < size; ++i) { *result = (int32) (*a * b); @@ -1315,53 +378,72 @@ inline void simd_div(const int32* a, f32 b, f32* result, int32 size, int32 steps) { int32 i = 0; + steps = intrin_validate_steps((const byte*) a, steps); + steps = intrin_validate_steps((const byte*) result, steps); - if (steps == 16) { - __m512i a_16; - __m512 af_16; - __m512 b_16 = _mm512_set1_ps(b); - __m512 result_16; + #ifdef MACRO_CPU_FEATURE_AVX512 + if (steps >= 16) { + steps = 16; + __m512i a_16; + __m512 af_16; + __m512 b_16 = _mm512_set1_ps(b); + __m512 result_16; - for (; i <= size - steps; i += steps) { - a_16 = _mm512_load_epi32(a); - af_16 = _mm512_cvtepi32_ps(a_16); - result_16 = _mm512_div_ps(af_16, b_16); - _mm512_store_ps(result, result_16); + for (; i <= size - steps; i += steps) { + a_16 = _mm512_load_epi32(a); + af_16 = _mm512_cvtepi32_ps(a_16); + result_16 = _mm512_div_ps(af_16, b_16); + _mm512_store_ps(result, result_16); - a += steps; - result += steps; - } - } else if (steps == 8) { - __m256i a_8; - __m256 af_8; - __m256 b_8 = _mm256_set1_ps(b); - __m256 result_8; + a += steps; + result += steps; + } - for (; i <= size - steps; i += steps) { - a_8 = _mm256_load_si256((__m256i *) a); - af_8 = _mm256_cvtepi32_ps(a_8); - result_8 = _mm256_div_ps(af_8, 
b_8); - _mm256_store_ps(result, result_8); + steps = 1; + } + #endif - a += steps; - result += steps; - } - } else if (steps == 4) { - __m128i a_4; - __m128 af_4; - __m128 b_4 = _mm_set1_ps(b); - __m128 result_4; + #ifdef MACRO_CPU_FEATURE_AVX2 + if (steps >= 8) { + steps = 8; + __m256i a_8; + __m256 af_8; + __m256 b_8 = _mm256_set1_ps(b); + __m256 result_8; - for (; i <= size - steps; i += steps) { - a_4 = _mm_load_si128((__m128i *) a); - af_4 = _mm_cvtepi32_ps(a_4); - result_4 = _mm_div_ps(af_4, b_4); - _mm_store_ps(result, result_4); + for (; i <= size - steps; i += steps) { + a_8 = _mm256_load_si256((__m256i *) a); + af_8 = _mm256_cvtepi32_ps(a_8); + result_8 = _mm256_div_ps(af_8, b_8); + _mm256_store_ps(result, result_8); - a += steps; - result += steps; - } - } + a += steps; + result += steps; + } + + steps = 1; + } + #endif + + #ifdef MACRO_CPU_FEATURE_SSE42 + if (steps >= 4) { + steps = 4; + __m128i a_4; + __m128 af_4; + __m128 b_4 = _mm_set1_ps(b); + __m128 result_4; + + for (; i <= size - steps; i += steps) { + a_4 = _mm_load_si128((__m128i *) a); + af_4 = _mm_cvtepi32_ps(a_4); + result_4 = _mm_div_ps(af_4, b_4); + _mm_store_ps(result, result_4); + + a += steps; + result += steps; + } + } + #endif for (; i < size; ++i) { *result = *a / b; @@ -1375,53 +457,73 @@ inline void simd_add(const int32* a, const int32* b, int32* result, int32 size, int32 steps) { int32 i = 0; + steps = intrin_validate_steps((const byte*) a, steps); + steps = intrin_validate_steps((const byte*) b, steps); + steps = intrin_validate_steps((const byte*) result, steps); - if (steps == 16) { - __m512i a_16; - __m512i b_16; - __m512i result_16; + #ifdef MACRO_CPU_FEATURE_AVX512 + if (steps >= 16) { + steps = 16; + __m512i a_16; + __m512i b_16; + __m512i result_16; - for (; i <= size - steps; i += steps) { - a_16 = _mm512_load_epi32(a); - b_16 = _mm512_load_epi32(b); - result_16 = _mm512_add_epi32(a_16, b_16); - _mm512_store_epi32(result, result_16); + for (; i <= size - steps; i += 
steps) { + a_16 = _mm512_load_epi32(a); + b_16 = _mm512_load_epi32(b); + result_16 = _mm512_add_epi32(a_16, b_16); + _mm512_store_epi32(result, result_16); - a += steps; - b += steps; - result += steps; - } - } else if (steps == 8) { - __m256i a_8; - __m256i b_8; - __m256i result_8; + a += steps; + b += steps; + result += steps; + } - for (; i <= size - steps; i += steps) { - a_8 = _mm256_load_si256((__m256i *) a); - b_8 = _mm256_load_si256((__m256i *) b); - result_8 = _mm256_add_epi32(a_8, b_8); - _mm256_store_si256((__m256i *) result, result_8); + steps = 1; + } + #endif - a += steps; - b += steps; - result += steps; - } - } else if (steps == 4) { - __m128i a_4; - __m128i b_4; - __m128i result_4; + #ifdef MACRO_CPU_FEATURE_AVX2 + if (steps >= 8) { + steps = 8; + __m256i a_8; + __m256i b_8; + __m256i result_8; - for (; i <= size - steps; i += steps) { - a_4 = _mm_load_si128((__m128i *) a); - b_4 = _mm_load_si128((__m128i *) b); - result_4 = _mm_add_epi32(a_4, b_4); - _mm_store_si128((__m128i *) result, result_4); + for (; i <= size - steps; i += steps) { + a_8 = _mm256_load_si256((__m256i *) a); + b_8 = _mm256_load_si256((__m256i *) b); + result_8 = _mm256_add_epi32(a_8, b_8); + _mm256_store_si256((__m256i *) result, result_8); - a += steps; - b += steps; - result += steps; - } - } + a += steps; + b += steps; + result += steps; + } + + steps = 1; + } + #endif + + #ifdef MACRO_CPU_FEATURE_SSE42 + if (steps >= 4) { + steps = 4; + __m128i a_4; + __m128i b_4; + __m128i result_4; + + for (; i <= size - steps; i += steps) { + a_4 = _mm_load_si128((__m128i *) a); + b_4 = _mm_load_si128((__m128i *) b); + result_4 = _mm_add_epi32(a_4, b_4); + _mm_store_si128((__m128i *) result, result_4); + + a += steps; + b += steps; + result += steps; + } + } + #endif for (; i < size; ++i) { *result = *a + *b; @@ -1436,59 +538,79 @@ inline void simd_add(const int32* a, const f32* b, f32* result, int32 size, int32 steps) { int32 i = 0; + steps = intrin_validate_steps((const byte*) a, 
steps); + steps = intrin_validate_steps((const byte*) b, steps); + steps = intrin_validate_steps((const byte*) result, steps); - if (steps == 16) { - __m512i a_16; - __m512 af_16; - __m512 b_16; - __m512 result_16; + #ifdef MACRO_CPU_FEATURE_AVX512 + if (steps >= 16) { + steps = 16; + __m512i a_16; + __m512 af_16; + __m512 b_16; + __m512 result_16; - for (; i <= size - steps; i += steps) { - a_16 = _mm512_load_epi32(a); - af_16 = _mm512_cvtepi32_ps(a_16); - b_16 = _mm512_load_ps(b); - result_16 = _mm512_add_ps(af_16, b_16); - _mm512_store_ps(result, result_16); + for (; i <= size - steps; i += steps) { + a_16 = _mm512_load_epi32(a); + af_16 = _mm512_cvtepi32_ps(a_16); + b_16 = _mm512_load_ps(b); + result_16 = _mm512_add_ps(af_16, b_16); + _mm512_store_ps(result, result_16); - a += steps; - b += steps; - result += steps; - } - } else if (steps == 8) { - __m256i a_8; - __m256 af_8; - __m256 b_8; - __m256 result_8; + a += steps; + b += steps; + result += steps; + } - for (; i <= size - steps; i += steps) { - a_8 = _mm256_load_si256((__m256i *) a); - af_8 = _mm256_cvtepi32_ps(a_8); - b_8 = _mm256_load_ps(b); - result_8 = _mm256_add_ps(af_8, b_8); - _mm256_store_ps(result, result_8); + steps = 1; + } + #endif - a += steps; - b += steps; - result += steps; - } - } else if (steps == 4) { - __m128i a_4; - __m128 af_4; - __m128 b_4; - __m128 result_4; + #ifdef MACRO_CPU_FEATURE_AVX2 + if (steps >= 8) { + steps = 8; + __m256i a_8; + __m256 af_8; + __m256 b_8; + __m256 result_8; - for (; i <= size - steps; i += steps) { - a_4 = _mm_load_si128((__m128i *) a); - af_4 = _mm_cvtepi32_ps(a_4); - b_4 = _mm_load_ps(b); - result_4 = _mm_add_ps(af_4, b_4); - _mm_store_ps(result, result_4); + for (; i <= size - steps; i += steps) { + a_8 = _mm256_load_si256((__m256i *) a); + af_8 = _mm256_cvtepi32_ps(a_8); + b_8 = _mm256_load_ps(b); + result_8 = _mm256_add_ps(af_8, b_8); + _mm256_store_ps(result, result_8); - a += steps; - b += steps; - result += steps; - } - } + a += steps; + b += 
steps; + result += steps; + } + + steps = 1; + } + #endif + + #ifdef MACRO_CPU_FEATURE_SSE42 + if (steps >= 4) { + steps = 4; + __m128i a_4; + __m128 af_4; + __m128 b_4; + __m128 result_4; + + for (; i <= size - steps; i += steps) { + a_4 = _mm_load_si128((__m128i *) a); + af_4 = _mm_cvtepi32_ps(a_4); + b_4 = _mm_load_ps(b); + result_4 = _mm_add_ps(af_4, b_4); + _mm_store_ps(result, result_4); + + a += steps; + b += steps; + result += steps; + } + } + #endif for (; i < size; ++i) { *result = *a + *b; @@ -1503,65 +625,85 @@ inline void simd_add(const int32* a, const f32* b, int32* result, int32 size, int32 steps) { int32 i = 0; + steps = intrin_validate_steps((const byte*) a, steps); + steps = intrin_validate_steps((const byte*) b, steps); + steps = intrin_validate_steps((const byte*) result, steps); - if (steps == 16) { - __m512i a_16; - __m512 af_16; - __m512 b_16; - __m512 result_16; - __m512i resulti_16; + #ifdef MACRO_CPU_FEATURE_AVX512 + if (steps >= 16) { + steps = 16; + __m512i a_16; + __m512 af_16; + __m512 b_16; + __m512 result_16; + __m512i resulti_16; - for (; i <= size - steps; i += steps) { - a_16 = _mm512_load_epi32(a); - af_16 = _mm512_cvtepi32_ps(a_16); - b_16 = _mm512_load_ps(b); - result_16 = _mm512_add_ps(af_16, b_16); - resulti_16 = _mm512_cvtps_epi32(result_16); - _mm512_store_epi32(result, resulti_16); + for (; i <= size - steps; i += steps) { + a_16 = _mm512_load_epi32(a); + af_16 = _mm512_cvtepi32_ps(a_16); + b_16 = _mm512_load_ps(b); + result_16 = _mm512_add_ps(af_16, b_16); + resulti_16 = _mm512_cvtps_epi32(result_16); + _mm512_store_epi32(result, resulti_16); - a += steps; - b += steps; - result += steps; - } - } else if (steps == 8) { - __m256i a_8; - __m256 af_8; - __m256 b_8; - __m256 result_8; - __m256i resulti_8; + a += steps; + b += steps; + result += steps; + } - for (; i <= size - steps; i += steps) { - a_8 = _mm256_load_si256((__m256i *) a); - af_8 = _mm256_cvtepi32_ps(a_8); - b_8 = _mm256_load_ps(b); - result_8 = 
_mm256_add_ps(af_8, b_8); - resulti_8 = _mm256_cvtps_epi32(result_8); - _mm256_store_si256((__m256i *) result, resulti_8); + steps = 1; + } + #endif - a += steps; - b += steps; - result += steps; - } - } else if (steps == 4) { - __m128i a_4; - __m128 af_4; - __m128 b_4; - __m128 result_4; - __m128i resulti_4; + #ifdef MACRO_CPU_FEATURE_AVX2 + if (steps >= 8) { + steps = 8; + __m256i a_8; + __m256 af_8; + __m256 b_8; + __m256 result_8; + __m256i resulti_8; - for (; i <= size - steps; i += steps) { - a_4 = _mm_load_si128((__m128i *) a); - af_4 = _mm_cvtepi32_ps(a_4); - b_4 = _mm_load_ps(b); - result_4 = _mm_add_ps(af_4, b_4); - resulti_4 = _mm_cvtps_epi32(result_4); - _mm_store_si128((__m128i *) result, resulti_4); + for (; i <= size - steps; i += steps) { + a_8 = _mm256_load_si256((__m256i *) a); + af_8 = _mm256_cvtepi32_ps(a_8); + b_8 = _mm256_load_ps(b); + result_8 = _mm256_add_ps(af_8, b_8); + resulti_8 = _mm256_cvtps_epi32(result_8); + _mm256_store_si256((__m256i *) result, resulti_8); - a += steps; - b += steps; - result += steps; - } - } + a += steps; + b += steps; + result += steps; + } + + steps = 1; + } + #endif + + #ifdef MACRO_CPU_FEATURE_SSE42 + if (steps >= 4) { + steps = 4; + __m128i a_4; + __m128 af_4; + __m128 b_4; + __m128 result_4; + __m128i resulti_4; + + for (; i <= size - steps; i += steps) { + a_4 = _mm_load_si128((__m128i *) a); + af_4 = _mm_cvtepi32_ps(a_4); + b_4 = _mm_load_ps(b); + result_4 = _mm_add_ps(af_4, b_4); + resulti_4 = _mm_cvtps_epi32(result_4); + _mm_store_si128((__m128i *) result, resulti_4); + + a += steps; + b += steps; + result += steps; + } + } + #endif for (; i < size; ++i) { *result = (int32) (*a + *b); @@ -1572,49 +714,52 @@ void simd_add(const int32* a, const f32* b, int32* result, int32 size, int32 ste } } -// WARNING: only works with SSE4.2 -// WARNING: incl. 
\0 both strings must be <= 16 length -bool str_compare_avx512(const char* str1, const char* str2) { - __m128i s1 = _mm_load_si128((__m128i *) (const __m128i *) str1); - __m128i s2 = _mm_load_si128((__m128i *) (const __m128i *) str2); - - return _mm_cmpistrc(s1, s2, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH) == 0; -} - void endian_swap(const int32* val, int32* result, int32 size, int32 steps) { int32 i = 0; + steps = intrin_validate_steps((const byte*) val, steps); + steps = intrin_validate_steps((const byte*) result, steps); - if (steps >= 8) { - const __m256i mask_256 = _mm256_setr_epi8( - 3, 2, 1, 0, 7, 6, 5, 4, - 11, 10, 9, 8, 15, 14, 13, 12, - 19, 18, 17, 16, 23, 22, 21, 20, - 27, 26, 25, 24, 31, 30, 29, 28 - ); + #ifdef MACRO_CPU_FEATURE_AVX2 + if (steps >= 8) { + steps = 8; + const __m256i mask_256 = _mm256_setr_epi8( + 3, 2, 1, 0, 7, 6, 5, 4, + 11, 10, 9, 8, 15, 14, 13, 12, + 19, 18, 17, 16, 23, 22, 21, 20, + 27, 26, 25, 24, 31, 30, 29, 28 + ); - for (i = 0; i <= size - steps; i += steps) { - __m256i vec = _mm256_load_si256((const __m256i *) (val + i)); - vec = _mm256_shuffle_epi8(vec, mask_256); + for (i = 0; i <= size - steps; i += steps) { + __m256i vec = _mm256_load_si256((const __m256i *) (val + i)); + vec = _mm256_shuffle_epi8(vec, mask_256); - _mm256_storeu_si256((__m256i *) (result + i), vec); + _mm256_store_si256((__m256i *) (result + i), vec); + } + + steps = 1; } - } else if (steps == 4) { - const __m128i mask_128 = _mm_setr_epi8( - 3, 2, 1, 0, - 7, 6, 5, 4, - 11, 10, 9, 8, - 15, 14, 13, 12 - ); + #endif - for (i = 0; i <= size - steps; i += steps) { - __m128i vec = _mm_load_si128((__m128i *) (const __m128i *) (val + i)); - vec = _mm_shuffle_epi8(vec, mask_128); + #ifdef MACRO_CPU_FEATURE_SSE42 + if (steps >= 4) { + steps = 4; + const __m128i mask_128 = _mm_setr_epi8( + 3, 2, 1, 0, + 7, 6, 5, 4, + 11, 10, 9, 8, + 15, 14, 13, 12 + ); - _mm_storeu_si128((__m128i *) (result + i), vec); + for (i = 0; i <= size - steps; i += steps) { + __m128i vec = 
_mm_load_si128((__m128i *) (const __m128i *) (val + i)); + vec = _mm_shuffle_epi8(vec, mask_128); + + _mm_store_si128((__m128i *) (result + i), vec); + } } - } + #endif for (; i < size; ++i) { uint32 v = ((uint32 *) val)[i]; @@ -1629,36 +774,48 @@ void endian_swap(const uint32* val, uint32* result, int32 size, int32 steps) { int32 i = 0; + steps = intrin_validate_steps((const byte*) val, steps); + steps = intrin_validate_steps((const byte*) result, steps); - if (steps >= 8) { - const __m256i mask_256 = _mm256_setr_epi8( - 3, 2, 1, 0, 7, 6, 5, 4, - 11, 10, 9, 8, 15, 14, 13, 12, - 19, 18, 17, 16, 23, 22, 21, 20, - 27, 26, 25, 24, 31, 30, 29, 28 - ); + #ifdef MACRO_CPU_FEATURE_AVX2 + if (steps >= 8) { + steps = 8; + const __m256i mask_256 = _mm256_setr_epi8( + 3, 2, 1, 0, 7, 6, 5, 4, + 11, 10, 9, 8, 15, 14, 13, 12, + 19, 18, 17, 16, 23, 22, 21, 20, + 27, 26, 25, 24, 31, 30, 29, 28 + ); - for (i = 0; i <= size - steps; i += steps) { - __m256i vec = _mm256_load_si256((const __m256i *) (val + i)); - vec = _mm256_shuffle_epi8(vec, mask_256); + for (i = 0; i <= size - steps; i += steps) { + __m256i vec = _mm256_load_si256((const __m256i *) (val + i)); + vec = _mm256_shuffle_epi8(vec, mask_256); - _mm256_storeu_si256((__m256i *) (result + i), vec); + _mm256_store_si256((__m256i *) (result + i), vec); + } + + steps = 1; } - } else if (steps == 4) { - const __m128i mask_128 = _mm_setr_epi8( - 3, 2, 1, 0, - 7, 6, 5, 4, - 11, 10, 9, 8, - 15, 14, 13, 12 - ); + #endif - for (i = 0; i <= size - steps; i += steps) { - __m128i vec = _mm_load_si128((__m128i *) (const __m128i *) (val + i)); - vec = _mm_shuffle_epi8(vec, mask_128); + #ifdef MACRO_CPU_FEATURE_SSE42 + if (steps >= 4) { + steps = 4; + const __m128i mask_128 = _mm_setr_epi8( + 3, 2, 1, 0, + 7, 6, 5, 4, + 11, 10, 9, 8, + 15, 14, 13, 12 + ); - _mm_storeu_si128((__m128i *) (result + i), vec); + for (i = 0; i <= size - steps; i += steps) { + __m128i vec = _mm_load_si128((__m128i *) (const __m128i *) (val + i)); + vec = 
_mm_shuffle_epi8(vec, mask_128); + + _mm_store_si128((__m128i *) (result + i), vec); + } } - } + #endif for (; i < size; ++i) { uint32 v = ((uint32 *) val)[i]; @@ -1672,8 +829,12 @@ endian_swap(const uint32* val, uint32* result, int32 size, int32 steps) void endian_swap(const int16* val, int16* result, int32 size, int32 steps) { int32 i = 0; + steps = intrin_validate_steps((const byte*) val, steps); + steps = intrin_validate_steps((const byte*) result, steps); + #ifdef MACRO_CPU_FEATURE_AVX2 if (steps >= 8) { + steps = 8; const __m256i mask_256 = _mm256_setr_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 @@ -1683,22 +844,29 @@ void endian_swap(const int16* val, int16* result, int32 size, int32 steps) __m256i vec = _mm256_load_si256((const __m256i *) (val + i)); vec = _mm256_shuffle_epi8(vec, mask_256); - _mm256_storeu_si256((__m256i *) (result + i), vec); + _mm256_store_si256((__m256i *) (result + i), vec); } - } else if (steps == 4) { - const __m128i mask_128 = _mm_setr_epi8( - 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 - ); - for (i = 0; i <= size - steps; i += steps) { - __m128i vec = _mm_load_si128((const __m128i *) (val + i)); - vec = _mm_shuffle_epi8(vec, mask_128); - - _mm_storeu_si128((__m128i *) (result + i), vec); - } + steps = 1; } + #endif + + #ifdef MACRO_CPU_FEATURE_SSE42 + if (steps >= 4) { + steps = 4; + const __m128i mask_128 = _mm_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 + ); + + for (i = 0; i <= size - steps; i += steps) { + __m128i vec = _mm_load_si128((const __m128i *) (val + i)); + vec = _mm_shuffle_epi8(vec, mask_128); + + _mm_store_si128((__m128i *) (result + i), vec); + } + } + #endif - // Handle remaining elements for (; i < size; ++i) { uint16 v = ((uint16 *) val)[i]; ((int16 *) result)[i] = ((v << 8) | (v >> 8)); @@ -1708,33 +876,44 @@ void endian_swap(const int16* val, int16* result, int32 size, int32 steps) void 
endian_swap(const uint16* val, uint16* result, int32 size, int32 steps) { int32 i = 0; + steps = intrin_validate_steps((const byte*) val, steps); + steps = intrin_validate_steps((const byte*) result, steps); - if (steps >= 8) { - const __m256i mask_256 = _mm256_setr_epi8( - 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, - 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 - ); + #ifdef MACRO_CPU_FEATURE_AVX2 + if (steps >= 8) { + steps = 8; + const __m256i mask_256 = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, + 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 + ); - for (i = 0; i <= size - steps; i += steps) { - __m256i vec = _mm256_load_si256((const __m256i *) (val + i)); - vec = _mm256_shuffle_epi8(vec, mask_256); + for (i = 0; i <= size - steps; i += steps) { + __m256i vec = _mm256_load_si256((const __m256i *) (val + i)); + vec = _mm256_shuffle_epi8(vec, mask_256); - _mm256_storeu_si256((__m256i *) (result + i), vec); + _mm256_store_si256((__m256i *) (result + i), vec); + } + + steps = 1; } - } else if (steps == 4) { - const __m128i mask_128 = _mm_setr_epi8( - 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 - ); + #endif - for (i = 0; i <= size - steps; i += steps) { - __m128i vec = _mm_load_si128((const __m128i *) (val + i)); - vec = _mm_shuffle_epi8(vec, mask_128); + #ifdef MACRO_CPU_FEATURE_SSE42 + if (steps >= 4) { + steps = 4; + const __m128i mask_128 = _mm_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 + ); - _mm_storeu_si128((__m128i *) (result + i), vec); + for (i = 0; i <= size - steps; i += steps) { + __m128i vec = _mm_load_si128((const __m128i *) (val + i)); + vec = _mm_shuffle_epi8(vec, mask_128); + + _mm_store_si128((__m128i *) (result + i), vec); + } } - } + #endif - // Handle remaining elements for (; i < size; ++i) { uint16 v = ((uint16 *) val)[i]; ((uint16 *) result)[i] = ((v << 8) | (v >> 8)); diff --git a/architecture/x86/simd/SIMD_I32_AVX2.h 
b/architecture/x86/simd/SIMD_I32_AVX2.h
new file mode 100644
index 0000000..6cb7b0e
--- /dev/null
+++ b/architecture/x86/simd/SIMD_I32_AVX2.h
@@ -0,0 +1,288 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license OMS License 2.0
+ * @version 1.0.0
+ * @link https://jingga.app
+ */
+#ifndef COMS_TOS_STDLIB_SIMD_I32_AVX2_H // one guard per variant: a guard shared by the SSE/AVX2/AVX512 headers lets only the first one included take effect
+#define COMS_TOS_STDLIB_SIMD_I32_AVX2_H
+
+#include <immintrin.h> // declares the _mm256_* intrinsics used below
+#include <xmmintrin.h> // NOTE(review): the header names on these three lines were lost in this copy of the patch;
+#include <emmintrin.h> // <immintrin.h> is required, the other two are assumed - confirm against the repository
+
+#include "../../../stdlib/Types.h"
+
+// @todo a lot of sse functions require high level (e.g. sse4.1) this needs to be changed to be more general
+// or better create alternative functions for the available sse version.
+
+// @question why are we passing structs by value?
+struct int32_8 { // eight 32-bit signed lanes; s is the register view, v the per-lane view
+    union {
+        #if ARM
+            svint32_t s;
+        #else
+            __m256i s;
+        #endif
+
+        int32 v[8];
+    };
+};
+
+inline int32_8 load_int32_8(const int32* mem) // aligned load (_mm256_load_si256 requires 32-byte alignment)
+{
+    int32_8 simd;
+    simd.s = _mm256_load_si256((const __m256i *) mem); // keep the const qualifier of mem
+
+    return simd;
+}
+
+inline int32_8 init_int32_8(const int32* mem) // builds the vector from eight scalars, no alignment requirement
+{
+    int32_8 simd; // NOTE(review): _mm256_set_epi32 takes the highest lane first, so mem[0] ends up in v[7] (reverse of load_int32_8) - confirm intended
+    simd.s = _mm256_set_epi32(mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7]);
+
+    return simd;
+}
+
+inline void unload_int32_8(int32_8 a, int32 *array) { _mm256_store_si256((__m256i *) array, a.s); } // aligned store
+
+inline int32_8 init_zero_int32_8() // all lanes 0
+{
+    int32_8 simd;
+    simd.s = _mm256_setzero_si256();
+
+    return simd;
+}
+
+inline int32_8 init_value_int32_8(int32 value) // broadcast value to all lanes
+{
+    int32_8 simd;
+    simd.s = _mm256_set1_epi32(value);
+
+    return simd;
+}
+
+inline int32_8 init_values_int32_8(
+    int32 a, int32 b, int32 c, int32 d,
+    int32 e, int32 f, int32 g, int32 h
+)
+{
+    int32_8 simd; // same lane order as init_int32_8: a is placed in the highest lane
+    simd.s = _mm256_set_epi32(a, b, c, d, e, f, g, h);
+
+    return simd;
+}
+
+inline int32_8 operator+(int32_8 a, int32_8 b) // lane-wise wrapping add
+{
+    int32_8 simd;
+    simd.s = _mm256_add_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_8 operator-(int32_8 a, int32_8 b) // lane-wise wrapping subtract
+{
+    int32_8 simd;
+    simd.s = _mm256_sub_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_8 operator-(int32_8 a) { return init_zero_int32_8() - a; } // negation as 0 - a
+
+inline int32_8 operator*(int32_8 a, int32_8 b) // lane-wise product, low 32 bits of each result
+{
+    int32_8 simd;
+    simd.s = _mm256_mullo_epi32(a.s, b.s); // _mm256_mul_epi32 would multiply only the even lanes into 64-bit results
+
+    return simd;
+}
+
+inline int32_8 operator^(int32_8 a, int32_8 b) // bitwise xor
+{
+    int32_8 simd;
+    simd.s = _mm256_xor_si256(a.s, b.s); // AVX2 form; _mm256_xor_epi32 needs AVX-512VL
+
+    return simd;
+}
+
+inline int32_8 &operator-=(int32_8 &a, int32_8 b)
+{
+    a = a - b;
+
+    return a;
+}
+
+inline int32_8 &operator+=(int32_8 &a, int32_8 b)
+{
+    a = a + b;
+
+    return a;
+}
+
+inline int32_8 &operator*=(int32_8 &a, int32_8 b)
+{
+    a = a * b;
+
+    return a;
+}
+
+inline int32_8 &operator^=(int32_8 &a, int32_8 b)
+{
+    a = a ^ b;
+
+    return a;
+}
+
+inline int32_8 operator<(int32_8 a, int32_8 b) // comparisons return a lane mask: all bits set where true, 0 where false
+{
+    int32_8 simd;
+    simd.s = _mm256_cmpgt_epi32(b.s, a.s); // a < b <=> b > a; negating (a > b) would give a <= b
+
+    return simd;
+}
+
+inline int32_8 operator<=(int32_8 a, int32_8 b) // a <= b <=> !(a > b)
+{
+    int32_8 simd;
+    simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi32(a.s, b.s), _mm256_set1_epi32(-1));
+
+    return simd;
+}
+
+inline int32_8 operator>(int32_8 a, int32_8 b)
+{
+    int32_8 simd;
+    simd.s = _mm256_cmpgt_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_8 operator>=(int32_8 a, int32_8 b) // a >= b <=> !(b > a)
+{
+    int32_8 simd;
+    simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi32(b.s, a.s), _mm256_set1_epi32(-1));
+
+    return simd;
+}
+
+inline int32_8 operator==(int32_8 a, int32_8 b)
+{
+    int32_8 simd;
+    simd.s = _mm256_cmpeq_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_8 operator!=(int32_8 a, int32_8 b) // !(a == b) as a lane mask, same construction as the SSE variant
+{
+    int32_8 simd;
+    simd.s = _mm256_andnot_si256(_mm256_cmpeq_epi32(a.s, b.s), _mm256_set1_epi32(-1)); // the mask/blend form needs AVX-512VL and returned operand values instead of a mask
+
+    return simd;
+}
+
+inline int32_8 operator&(int32_8 a, int32_8 b) // bitwise and
+{
+    int32_8 simd;
+    simd.s = _mm256_and_si256(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_8 operator|(int32_8 a, int32_8 b) // bitwise or
+{
+    int32_8 simd;
+    simd.s = _mm256_or_si256(a.s, b.s); // AVX2 form; _mm256_or_epi32 needs AVX-512VL
+
+    return simd;
+}
+
+inline int32_8 &operator&=(int32_8 &a, int32_8 b)
+{
+    a = a & b;
+
+    return a;
+}
+
+inline int32_8 &operator|=(int32_8 &a, int32_8 b)
+{
+    a = a | b;
+
+    return a;
+}
+
+inline int32_8 abs(int32_8 a) // lane-wise absolute value
+{
+    int32_8 simd;
+    simd.s = _mm256_abs_epi32(a.s);
+
+    return simd;
+}
+
+inline int32_8 simd_min(int32_8 a, int32_8 b) // lane-wise signed minimum
+{
+    int32_8 simd;
+    simd.s = _mm256_min_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_8 simd_max(int32_8 a, int32_8 b) // lane-wise signed maximum
+{
+    int32_8 simd;
+    simd.s = _mm256_max_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_8 sign(int32_8 a) // -1 for negative lanes, +1 otherwise (0 maps to +1)
+{
+    // OR-ing only the sign bit into 1 yields 0x80000001 for negative lanes, not -1
+    __m256i signFill = _mm256_srai_epi32(a.s, 31); // arithmetic shift: -1 for negative lanes, 0 otherwise
+    __m256i b = _mm256_set1_epi32(1);
+
+    int32_8 simd;
+    simd.s = _mm256_or_si256(b, signFill); // -1 | 1 == -1, 0 | 1 == 1
+
+    return simd;
+}
+
+inline int32_8 clamp(int32_8 min_value, int32_8 a, int32_8 max_value) // limits every lane of a to [min_value, max_value]
+{
+    return simd_min(simd_max(a, min_value), max_value);
+}
+
+inline int32 which_true(int32_8 a) // byte-granular mask: bit i is the top bit of byte i, i.e. 4 bits per 32-bit lane
+{
+    int32 which_true = _mm256_movemask_epi8(a.s);
+
+    return which_true;
+}
+
+inline bool any_true(int32_8 a)
+{
+    bool is_any_true = _mm256_movemask_epi8(a.s) != 0; // "> 0" missed lane 7: bit 31 makes the int negative
+
+    return is_any_true;
+}
+
+inline bool all_true(int32_8 a)
+{
+    bool is_true = _mm256_movemask_epi8(a.s) == -1; // all 32 byte bits set; the mask is 32 bits wide, so 255 never matched
+
+    return is_true;
+}
+
+inline bool all_false(int32_8 a)
+{
+    bool is_false = _mm256_movemask_epi8(a.s) == 0;
+
+    return is_false;
+}
+
+#endif
diff --git a/architecture/x86/simd/SIMD_I32_AVX512.h b/architecture/x86/simd/SIMD_I32_AVX512.h
new file mode 100644
index 0000000..cd56539
--- /dev/null
+++ b/architecture/x86/simd/SIMD_I32_AVX512.h
@@ -0,0 +1,309 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license OMS License 2.0
+ * @version 1.0.0
+ * @link https://jingga.app
+ */
+#ifndef COMS_TOS_STDLIB_SIMD_I32_AVX512_H // one guard per variant, see the AVX2 header
+#define COMS_TOS_STDLIB_SIMD_I32_AVX512_H
+
+#include <immintrin.h> // declares the _mm512_* intrinsics used below
+#include <xmmintrin.h> // NOTE(review): the header names on these three lines were lost in this copy of the patch;
+#include <emmintrin.h> // <immintrin.h> is required, the other two are assumed - confirm against the repository
+
+#include "../../../stdlib/Types.h"
+#include "SIMD_SVML_AVX512.h"
+
+// @todo a lot of sse functions require high level (e.g. sse4.1) this needs to be changed to be more general
+// or better create alternative functions for the available sse version.
+
+// @question why are we passing structs by value?
+struct int32_16 { // sixteen 32-bit signed lanes; s is the register view, v the per-lane view
+    union {
+        #if ARM
+            svint32_t s;
+        #else
+            __m512i s;
+        #endif
+
+        int32 v[16];
+    };
+};
+
+inline int32_16 load_int32_16(const int32* mem) // aligned load (_mm512_load_epi32 requires 64-byte alignment)
+{
+    int32_16 simd;
+    simd.s = _mm512_load_epi32(mem);
+
+    return simd;
+}
+
+inline int32_16 init_int32_16(const int32* mem) // builds the vector from sixteen scalars, no alignment requirement
+{
+    int32_16 simd; // NOTE(review): _mm512_set_epi32 takes the highest lane first, so mem[0] ends up in v[15] (reverse of load_int32_16) - confirm intended
+    simd.s = _mm512_set_epi32(
+        mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7],
+        mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15]);
+
+    return simd;
+}
+
+inline void unload_int32_16(int32_16 a, int32 *array) { _mm512_store_epi32(array, a.s); } // aligned store
+
+inline int32_16 init_zero_int32_16() // all lanes 0
+{
+    int32_16 simd;
+    simd.s = _mm512_setzero_epi32();
+
+    return simd;
+}
+
+inline int32_16 init_value_int32_16(int32 value) // broadcast value to all lanes
+{
+    int32_16 simd;
+    simd.s = _mm512_set1_epi32(value);
+
+    return simd;
+}
+
+inline int32_16 init_values_int32_16(
+    int32 a, int32 b, int32 c, int32 d,
+    int32 e, int32 f, int32 g, int32 h,
+    int32 i, int32 j, int32 k, int32 l,
+    int32 m, int32 n, int32 o, int32 p
+)
+{
+    int32_16 simd; // same lane order as init_int32_16: a is placed in the highest lane
+    simd.s = _mm512_set_epi32(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
+
+    return simd;
+}
+
+inline int32_16 operator+(int32_16 a, int32_16 b) // lane-wise wrapping add
+{
+    int32_16 simd;
+    simd.s = _mm512_add_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_16 operator-(int32_16 a, int32_16 b) // lane-wise wrapping subtract
+{
+    int32_16 simd;
+    simd.s = _mm512_sub_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_16 operator-(int32_16 a) { return init_zero_int32_16() - a; } // negation as 0 - a
+
+inline int32_16 operator*(int32_16 a, int32_16 b) // lane-wise product, low 32 bits of each result
+{
+    int32_16 simd;
+    simd.s = _mm512_mullo_epi32(a.s, b.s); // _mm512_mul_epi32 would multiply only the even lanes into 64-bit results
+
+    return simd;
+}
+
+inline int32_16 operator/(int32_16 a, int32_16 b) // lane-wise division; _mm512_div_epi32 is an SVML routine, presumably supplied via SIMD_SVML_AVX512.h - verify
+{
+    int32_16 simd;
+    simd.s = _mm512_div_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_16 operator^(int32_16 a, int32_16 b) // bitwise xor
+{
+    int32_16 simd;
+    simd.s = _mm512_xor_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_16 &operator-=(int32_16 &a, int32_16 b)
+{
+    a = a - b;
+
+    return a;
+}
+
+inline int32_16 &operator+=(int32_16 &a, int32_16 b)
+{
+    a = a + b;
+
+    return a;
+}
+
+inline int32_16 &operator*=(int32_16 &a, int32_16 b)
+{
+    a = a * b;
+
+    return a;
+}
+
+inline int32_16 &operator/=(int32_16 &a, int32_16 b)
+{
+    a.s = (a / b).s;
+
+    return a;
+}
+
+inline int32_16 &operator^=(int32_16 &a, int32_16 b)
+{
+    a = a ^ b;
+
+    return a;
+}
+
+inline int32_16 operator<(int32_16 a, int32_16 b) // comparisons return a lane mask (all bits set where true, 0 where false), as the SSE/AVX2 variants do
+{
+    int32_16 simd;
+    simd.s = _mm512_movm_epi32(_mm512_cmplt_epi32_mask(a.s, b.s)); // expand the k-mask to lanes; the old blend returned operand values, which which_true/any_true cannot interpret
+
+    return simd;
+}
+
+inline int32_16 operator<=(int32_16 a, int32_16 b)
+{
+    int32_16 simd;
+    simd.s = _mm512_movm_epi32(_mm512_cmple_epi32_mask(a.s, b.s)); // previously !(b > a), which is a >= b
+
+    return simd;
+}
+
+inline int32_16 operator>(int32_16 a, int32_16 b)
+{
+    int32_16 simd;
+    simd.s = _mm512_movm_epi32(_mm512_cmpgt_epi32_mask(a.s, b.s));
+
+    return simd;
+}
+
+inline int32_16 operator>=(int32_16 a, int32_16 b)
+{
+    int32_16 simd;
+    simd.s = _mm512_movm_epi32(_mm512_cmpge_epi32_mask(a.s, b.s));
+
+    return simd;
+}
+
+inline int32_16 operator==(int32_16 a, int32_16 b)
+{
+    int32_16 simd;
+    simd.s = _mm512_movm_epi32(_mm512_cmpeq_epi32_mask(a.s, b.s));
+
+    return simd;
+}
+
+inline int32_16 operator!=(int32_16 a, int32_16 b)
+{
+    int32_16 simd;
+    simd.s = _mm512_movm_epi32(_mm512_cmp_epi32_mask(a.s, b.s, _MM_CMPINT_NE));
+
+    return simd;
+}
+
+inline int32_16 operator&(int32_16 a, int32_16 b) // bitwise and
+{
+    int32_16 simd;
+    simd.s = _mm512_and_si512(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_16 operator|(int32_16 a, int32_16 b) // bitwise or
+{
+    int32_16 simd;
+    simd.s = _mm512_or_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_16 &operator&=(int32_16 &a, int32_16 b)
+{
+    a = a & b;
+
+    return a;
+}
+
+inline int32_16 &operator|=(int32_16 &a, int32_16 b)
+{
+    a = a | b;
+
+    return a;
+}
+
+inline int32_16 abs(int32_16 a) // lane-wise absolute value
+{
+    int32_16 simd;
+    simd.s = _mm512_abs_epi32(a.s); // 32-bit lanes; _mm512_abs_epi64 would treat lane pairs as one 64-bit value
+
+    return simd;
+}
+
+inline int32_16 simd_min(int32_16 a, int32_16 b) // lane-wise signed minimum
+{
+    int32_16 simd;
+    simd.s = _mm512_min_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_16 simd_max(int32_16 a, int32_16 b) // lane-wise signed maximum
+{
+    int32_16 simd;
+    simd.s = _mm512_max_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_16 sign(int32_16 a) // -1 for negative lanes, +1 otherwise (0 maps to +1)
+{
+    // OR-ing only the sign bit into 1 yields 0x80000001 for negative lanes, not -1
+    __m512i signFill = _mm512_srai_epi32(a.s, 31); // arithmetic shift: -1 for negative lanes, 0 otherwise
+    __m512i b = _mm512_set1_epi32(1);
+    int32_16 simd;
+
+    simd.s = _mm512_or_si512(b, signFill); // -1 | 1 == -1, 0 | 1 == 1
+
+    return simd;
+}
+
+inline int32_16 clamp(int32_16 min_value, int32_16 a, int32_16 max_value) // limits every lane of a to [min_value, max_value]
+{
+    return simd_min(simd_max(a, min_value), max_value);
+}
+
+inline int32 which_true(int32_16 a) // one bit per lane, taken from each lane's top bit
+{
+    int32 which_true = _mm512_movepi32_mask(a.s);
+
+    return which_true;
+}
+
+inline bool any_true(int32_16 a)
+{
+    bool is_any_true = _mm512_movepi32_mask(a.s) > 0;
+
+    return is_any_true;
+}
+
+inline bool all_true(int32_16 a)
+{
+    bool is_true = _mm512_movepi32_mask(a.s) == 65535; // 16 lanes -> 16 mask bits
+
+    return is_true;
+}
+
+inline bool all_false(int32_16 a)
+{
+    // @todo This can be optimized (requires also changes in the comparison functions return)
+    bool is_false = _mm512_movepi32_mask(a.s) == 0;
+
+    return is_false;
+}
+
+#endif
diff --git a/architecture/x86/simd/SIMD_I32_SSE.h b/architecture/x86/simd/SIMD_I32_SSE.h
new file mode 100644
index 0000000..399c49f
--- /dev/null
+++ b/architecture/x86/simd/SIMD_I32_SSE.h
@@ -0,0 +1,286 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license OMS License 2.0
+ * @version 1.0.0
+ * @link https://jingga.app
+ */
+#ifndef COMS_TOS_STDLIB_SIMD_I32_SSE_H // one guard per variant: a guard shared by the SSE/AVX2/AVX512 headers lets only the first one included take effect
+#define COMS_TOS_STDLIB_SIMD_I32_SSE_H
+
+#include <immintrin.h> // declares the _mm_* intrinsics used below
+#include <xmmintrin.h> // NOTE(review): the header names on these three lines were lost in this copy of the patch;
+#include <emmintrin.h> // <immintrin.h> is sufficient, the other two are assumed - confirm against the repository
+
+#include "../../../stdlib/Types.h"
+
+// @todo a lot of sse functions require high level (e.g. sse4.1) this needs to be changed to be more general
+// or better create alternative functions for the available sse version.
+
+// @question why are we passing structs by value?
+
+struct int32_4 { // four 32-bit signed lanes; s is the register view, v the per-lane view
+    union {
+        #if ARM
+            svint32_t s;
+        #else
+            __m128i s;
+        #endif
+
+        int32 v[4];
+    };
+};
+
+inline int32_4 load_int32_4(const int32* mem) // aligned load (_mm_load_si128 requires 16-byte alignment)
+{
+    int32_4 simd;
+    simd.s = _mm_load_si128((const __m128i *) mem); // keep the const qualifier of mem
+
+    return simd;
+}
+
+inline int32_4 init_int32_4(const int32* mem) // builds the vector from four scalars, no alignment requirement
+{
+    int32_4 simd; // NOTE(review): _mm_set_epi32 takes the highest lane first, so mem[0] ends up in v[3] (reverse of load_int32_4) - confirm intended
+    simd.s = _mm_set_epi32(mem[0], mem[1], mem[2], mem[3]);
+
+    return simd;
+}
+
+inline void unload_int32_4(int32_4 a, int32 *array) { _mm_store_si128((__m128i *) array, a.s); } // aligned store
+
+inline int32_4 init_zero_int32_4() // all lanes 0
+{
+    int32_4 simd;
+    simd.s = _mm_setzero_si128();
+
+    return simd;
+}
+
+inline int32_4 init_value_int32_4(int32 value) // broadcast value to all lanes
+{
+    int32_4 simd;
+    simd.s = _mm_set1_epi32(value);
+
+    return simd;
+}
+
+inline int32_4 init_values_int32_4(int32 a, int32 b, int32 c, int32 d) // same lane order as init_int32_4: a is placed in the highest lane
+{
+    int32_4 simd;
+    simd.s = _mm_set_epi32(a, b, c, d);
+
+    return simd;
+}
+
+inline int32_4 operator+(int32_4 a, int32_4 b) // lane-wise wrapping add
+{
+    int32_4 simd;
+    simd.s = _mm_add_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_4 operator-(int32_4 a, int32_4 b) // lane-wise wrapping subtract
+{
+    int32_4 simd;
+    simd.s = _mm_sub_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_4 operator-(int32_4 a) { return init_zero_int32_4() - a; } // negation as 0 - a
+
+inline int32_4 operator*(int32_4 a, int32_4 b) // lane-wise product, low 32 bits of each result
+{
+    int32_4 simd;
+    simd.s = _mm_mullo_epi32(a.s, b.s); // SSE4.1; _mm_mul_epi32 would multiply only lanes 0 and 2 into 64-bit results
+
+    return simd;
+}
+
+inline int32_4 operator^(int32_4 a, int32_4 b) // bitwise xor
+{
+    int32_4 simd;
+    simd.s = _mm_xor_si128(a.s, b.s); // SSE2 form; _mm_xor_epi32 needs AVX-512VL
+
+    return simd;
+}
+
+inline int32_4 &operator-=(int32_4 &a, int32_4 b)
+{
+    a = a - b;
+
+    return a;
+}
+
+inline int32_4 &operator+=(int32_4 &a, int32_4 b)
+{
+    a = a + b;
+
+    return a;
+}
+
+inline int32_4 &operator*=(int32_4 &a, int32_4 b)
+{
+    a = a * b;
+
+    return a;
+}
+
+inline int32_4 &operator^=(int32_4 &a, int32_4 b)
+{
+    a = a ^ b;
+
+    return a;
+}
+
+inline int32_4 operator<(int32_4 a, int32_4 b) // comparisons return a lane mask: all bits set where true, 0 where false
+{
+    int32_4 simd;
+    simd.s = _mm_cmplt_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_4 operator<=(int32_4 a, int32_4 b) // a <= b <=> !(b < a)
+{
+    int32_4 simd;
+    simd.s = _mm_andnot_si128(_mm_cmplt_epi32(b.s, a.s), _mm_set1_epi32(-1));
+
+    return simd;
+}
+
+inline int32_4 operator>(int32_4 a, int32_4 b)
+{
+    int32_4 simd;
+    simd.s = _mm_cmpgt_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_4 operator>=(int32_4 a, int32_4 b) // a >= b <=> !(a < b)
+{
+    int32_4 simd;
+    simd.s = _mm_andnot_si128(_mm_cmplt_epi32(a.s, b.s), _mm_set1_epi32(-1));
+
+    return simd;
+}
+
+inline int32_4 operator==(int32_4 a, int32_4 b)
+{
+    int32_4 simd;
+    simd.s = _mm_cmpeq_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_4 operator!=(int32_4 a, int32_4 b) // !(a == b)
+{
+    int32_4 simd;
+    simd.s = _mm_andnot_si128(_mm_cmpeq_epi32(a.s, b.s), _mm_set1_epi32(-1));
+
+    return simd;
+}
+
+inline int32_4 operator&(int32_4 a, int32_4 b) // bitwise and
+{
+    int32_4 simd;
+    simd.s = _mm_and_si128(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_4 operator|(int32_4 a, int32_4 b) // bitwise or
+{
+    int32_4 simd;
+    simd.s = _mm_or_si128(a.s, b.s); // SSE2 form; _mm_or_epi32 needs AVX-512VL
+
+    return simd;
+}
+
+inline int32_4 &operator&=(int32_4 &a, int32_4 b)
+{
+    a = a & b;
+
+    return a;
+}
+
+inline int32_4 &operator|=(int32_4 &a, int32_4 b)
+{
+    a = a | b;
+
+    return a;
+}
+
+inline int32_4 abs(int32_4 a) // lane-wise absolute value
+{
+    int32_4 simd;
+    simd.s = _mm_abs_epi32(a.s);
+
+    return simd;
+}
+
+inline int32_4 simd_min(int32_4 a, int32_4 b) // lane-wise signed minimum
+{
+    int32_4 simd;
+    simd.s = _mm_min_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_4 simd_max(int32_4 a, int32_4 b) // lane-wise signed maximum
+{
+    int32_4 simd;
+    simd.s = _mm_max_epi32(a.s, b.s);
+
+    return simd;
+}
+
+inline int32_4 sign(int32_4 a) // -1 for negative lanes, +1 otherwise (0 maps to +1)
+{
+    // OR-ing only the sign bit into 1 yields 0x80000001 for negative lanes, not -1
+    __m128i signFill = _mm_srai_epi32(a.s, 31); // arithmetic shift: -1 for negative lanes, 0 otherwise
+    __m128i b = _mm_set1_epi32(1);
+
+    int32_4 simd;
+    simd.s = _mm_or_si128(b, signFill); // -1 | 1 == -1, 0 | 1 == 1
+
+    return simd;
+}
+
+inline int32_4 clamp(int32_4 min_value, int32_4 a, int32_4 max_value) // limits every lane of a to [min_value, max_value]
+{
+    return simd_min(simd_max(a, min_value), max_value);
+}
+
+inline int32 which_true(int32_4 a) // byte-granular mask: bit i is the top bit of byte i, i.e. 4 bits per 32-bit lane
+{
+    int32 which_true = _mm_movemask_epi8(a.s);
+
+    return which_true;
+}
+
+inline bool any_true(int32_4 a)
+{
+    bool is_any_true = _mm_movemask_epi8(a.s) > 0;
+
+    return is_any_true;
+}
+
+inline bool all_true(int32_4 a)
+{
+    bool is_true = _mm_movemask_epi8(a.s) == 0xFFFF; // the byte mask is 16 bits wide; 15 only matched "lane 0 set, lanes 1-3 clear"
+
+    return is_true;
+}
+
+inline bool all_false(int32_4 a)
+{
+    bool is_false = _mm_movemask_epi8(a.s) == 0;
+
+    return is_false;
+}
+
+#endif
diff --git a/architecture/x86/simd/SIMD_I64.h b/architecture/x86/simd/SIMD_I64.h
old mode 100644
new mode 100755
index 33bbc41..151c222
--- a/architecture/x86/simd/SIMD_I64.h
+++ b/architecture/x86/simd/SIMD_I64.h
@@ -13,42 +13,17 @@
 #include
 #include "../../../stdlib/Types.h"
-#include "SIMD_F64.h"
-struct int64_2 {
-    union {
-        #if ARM
-            svint64_t s;
-        #else
-            __m128i s;
-        #endif
+#ifdef MACRO_CPU_FEATURE_SSE42
+    #include "SIMD_I64_SSE.h"
+#endif
-        int64 v[2];
-    };
-};
+#ifdef MACRO_CPU_FEATURE_AVX2
+    #include "SIMD_I64_AVX2.h"
+#endif
-struct int64_4 {
-    union {
-        #if ARM
-            svint64_t s;
-        #else
-            __m256i s;
-        #endif
-
-        int64 v[4];
-    };
-};
-
-struct int64_8 {
-    union {
-        #if ARM
-            svint64_t s;
-        #else
-            __m512i s;
-        #endif
-
-        int64 v[8];
-    };
-};
+#ifdef MACRO_CPU_FEATURE_AVX512
+    #include "SIMD_I64_AVX512.h"
+#endif
 #endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_I64_AVX2.h b/architecture/x86/simd/SIMD_I64_AVX2.h
new file mode 100644
index 0000000..aa61750
--- /dev/null
+++ b/architecture/x86/simd/SIMD_I64_AVX2.h
@@ -0,0 +1,29 @@
+/**
+ * Jingga
+ *
+ * @copyright Jingga
+ * @license OMS License 2.0
+ * @version 1.0.0
+ * @link https://jingga.app
+ */
+#ifndef COMS_TOS_STDLIB_SIMD_I64_AVX2_H
+#define COMS_TOS_STDLIB_SIMD_I64_AVX2_H
+
+#include <immintrin.h> // declares __m256i; NOTE(review): both header names were lost in this copy of the patch,
+#include <xmmintrin.h> // the second one is assumed - confirm against the repository
+
+#include "../../../stdlib/Types.h"
+
+struct int64_4 { // four 64-bit signed lanes; s is the register view, v the per-lane view
+    union {
+        #if ARM
+            svint64_t s;
+        #else
+            __m256i s;
+        #endif
+
+        int64 v[4];
+    };
+};
+
+#endif
\ No newline at end of file
diff --git a/architecture/x86/simd/SIMD_I64_AVX512.h b/architecture/x86/simd/SIMD_I64_AVX512.h
new file mode 100644
index 0000000..8c49d05
--- /dev/null
+++ b/architecture/x86/simd/SIMD_I64_AVX512.h
@@ -0,0 +1,29 @@
+/**
+ * 
Jingga + * + * @copyright Jingga + * @license OMS License 2.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef COMS_TOS_STDLIB_SIMD_I64_AVX512_H +#define COMS_TOS_STDLIB_SIMD_I64_AVX512_H + +#include +#include + +#include "../../../stdlib/Types.h" + +struct int64_8 { + union { + #if ARM + svint64_t s; + #else + __m512i s; + #endif + + int64 v[8]; + }; +}; + +#endif \ No newline at end of file diff --git a/architecture/x86/simd/SIMD_I64_SSE.h b/architecture/x86/simd/SIMD_I64_SSE.h new file mode 100644 index 0000000..82ea226 --- /dev/null +++ b/architecture/x86/simd/SIMD_I64_SSE.h @@ -0,0 +1,29 @@ +/** + * Jingga + * + * @copyright Jingga + * @license OMS License 2.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef COMS_TOS_STDLIB_SIMD_I64_SSE_H +#define COMS_TOS_STDLIB_SIMD_I64_SSE_H + +#include +#include + +#include "../../../stdlib/Types.h" + +struct int64_2 { + union { + #if ARM + svint64_t s; + #else + __m128i s; + #endif + + int64 v[2]; + }; +}; + +#endif \ No newline at end of file diff --git a/architecture/x86/simd/SIMD_I8.h b/architecture/x86/simd/SIMD_I8.h old mode 100644 new mode 100755 index 6885808..4cb5205 --- a/architecture/x86/simd/SIMD_I8.h +++ b/architecture/x86/simd/SIMD_I8.h @@ -13,906 +13,108 @@ #include #include "../../../stdlib/Types.h" -#include "SIMD_F32.h" -#include "SIMD_I32.h" -struct int8_16 { - union { - #if ARM - svint8_t s; - #else - __m128i s; - #endif +#ifdef MACRO_CPU_FEATURE_SSE42 + #include "SIMD_I8_SSE.h" +#endif - int8 v[16]; - }; -}; +#ifdef MACRO_CPU_FEATURE_AVX2 + #include "SIMD_I8_AVX2.h" +#endif -struct int8_32 { - union { - #if ARM - svint8_t s; - #else - __m256i s; - #endif +#ifdef MACRO_CPU_FEATURE_AVX512 + #include "SIMD_I8_AVX512.h" +#endif - int8 v[32]; - }; -}; - -struct int8_64 { - union { - #if ARM - svint8_t s; - #else - __m512i s; - #endif - - int8 v[64]; - }; -}; - -inline int8_16 load_int8_16(const int8* mem) -{ - int8_16 simd; - simd.s = _mm_load_si128((__m128i *) mem); - - return 
simd; -} - -inline int8_16 init_int8_16(const int8* mem) -{ - int8_16 simd; - simd.s = _mm_set_epi8( - mem[0], mem[1], mem[2], mem[3], - mem[4], mem[5], mem[6], mem[7], - mem[8], mem[9], mem[10], mem[11], - mem[12], mem[13], mem[14], mem[15] - ); - - return simd; -} - -inline void unload_int8_16(int8_16 a, int8 *array) { _mm_store_si128((__m128i *) array, a.s); } - -inline int8_32 load_int8_32(const int8* mem) -{ - int8_32 simd; - simd.s = _mm256_load_si256((__m256i *) mem); - - return simd; -} - -inline int8_32 init_int8_32(const int8* mem) -{ - int8_32 simd; - simd.s = _mm256_set_epi8( - mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7], - mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15], - mem[16], mem[17], mem[18], mem[19], mem[20], mem[21], mem[22], mem[23], - mem[24], mem[25], mem[26], mem[27], mem[28], mem[29], mem[30], mem[31] - ); - - return simd; -} - -inline void unload_int8_32(int8_32 a, int8 *array) { _mm256_store_si256((__m256i *) array, a.s); } - -inline int8_64 load_int8_64(const int8* mem) -{ - int8_64 simd; - simd.s = _mm512_load_si512((__m512i *) mem); - - return simd; -} - -inline int8_64 init_int8_64(const int8* mem) -{ - int8_64 simd; - simd.s = _mm512_set_epi8( - mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7], - mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15], - mem[16], mem[17], mem[18], mem[19], mem[20], mem[21], mem[22], mem[23], - mem[24], mem[25], mem[26], mem[27], mem[28], mem[29], mem[30], mem[31], - mem[32], mem[33], mem[34], mem[35], mem[36], mem[37], mem[38], mem[39], - mem[40], mem[41], mem[42], mem[43], mem[44], mem[45], mem[46], mem[47], - mem[48], mem[49], mem[50], mem[51], mem[52], mem[53], mem[54], mem[55], - mem[56], mem[57], mem[58], mem[59], mem[60], mem[61], mem[62], mem[63] - ); - - return simd; -} - -inline void unload_int8_64(int8_64 a, int8 *array) { _mm512_storeu_epi8(array, a.s); } - -inline int8_16 init_zero_int8_16() -{ - int8_16 simd; - simd.s = 
_mm_setzero_si128(); - - return simd; -} - -inline int8_32 init_zero_int8_32() -{ - int8_32 simd; - simd.s = _mm256_setzero_si256(); - - return simd; -} - -inline int8_64 init_zero_int8_64() -{ - int8_64 simd; - simd.s = _mm512_setzero_si512(); - - return simd; -} - -inline int8_16 init_value_int8_16(int8 value) -{ - int8_16 simd; - simd.s = _mm_set1_epi8(value); - - return simd; -} - -inline int8_32 init_value_int8_32(int8 value) -{ - int8_32 simd; - simd.s = _mm256_set1_epi8(value); - - return simd; -} - -inline int8_64 init_value_int8_64(int8 value) -{ - int8_64 simd; - simd.s = _mm512_set1_epi8(value); - - return simd; -} - -inline -f32_4 int8_16_to_f32_4(int8_16 a) -{ - f32_4 result; - result.s = _mm_cvtepi32_ps(a.s); - - return result; -} - -inline -f32_8 int8_16_to_f32_8(int8_16 a) -{ - f32_8 result; - result.s = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(a.s)); - - return result; -} - -inline -f32_16 int8_16_to_f32_16(int8_16 a) -{ - f32_16 result; - result.s = _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(a.s)); - - return result; -} - -inline int8_16 operator+(int8_16 a, int8_16 b) -{ - int8_16 simd; - simd.s = _mm_add_epi8(a.s, b.s); - - return simd; -} - -inline int8_32 operator+(int8_32 a, int8_32 b) -{ - int8_32 simd; - simd.s = _mm256_add_epi8(a.s, b.s); - - return simd; -} - -inline int8_64 operator+(int8_64 a, int8_64 b) -{ - int8_64 simd; - simd.s = _mm512_add_epi8(a.s, b.s); - - return simd; -} - -inline int8_16 operator-(int8_16 a, int8_16 b) -{ - int8_16 simd; - simd.s = _mm_sub_epi8(a.s, b.s); - - return simd; -} - -inline int8_16 operator-(int8_16 a) { return init_zero_int8_16() - a; } - -inline int8_32 operator-(int8_32 a, int8_32 b) -{ - int8_32 simd; - simd.s = _mm256_sub_epi8(a.s, b.s); - - return simd; -} - -inline int8_32 operator-(int8_32 a) { return init_zero_int8_32() - a; } - -inline int8_64 operator-(int8_64 a, int8_64 b) -{ - int8_64 simd; - simd.s = _mm512_sub_epi8(a.s, b.s); - - return simd; -} - -inline int8_64 operator-(int8_64 a) { 
return init_zero_int8_64() - a; } - -inline int8_16 operator*(int8_16 a, int8_16 b) -{ - int8_16 simd; - simd.s = _mm_mul_epi32(a.s, b.s); - - return simd; -} - -inline int8_32 operator*(int8_32 a, int8_32 b) -{ - int8_32 simd; - simd.s = _mm256_mul_epi32(a.s, b.s); - - return simd; -} - -inline int8_64 operator*(int8_64 a, int8_64 b) -{ - int8_64 simd; - simd.s = _mm512_mul_epi32(a.s, b.s); - - return simd; -} - -inline int8_16 operator^(int8_16 a, int8_16 b) -{ - int8_16 simd; - simd.s = _mm_xor_si128(a.s, b.s); - - return simd; -} - -inline int8_32 operator^(int8_32 a, int8_32 b) -{ - int8_32 simd; - simd.s = _mm256_xor_si256(a.s, b.s); - - return simd; -} - -inline int8_64 operator^(int8_64 a, int8_64 b) -{ - int8_64 simd; - simd.s = _mm512_xor_si512(a.s, b.s); - - return simd; -} - -inline int8_16 &operator-=(int8_16 &a, int8_16 b) -{ - a = a - b; - - return a; -} - -inline int8_32 &operator-=(int8_32 &a, int8_32 b) -{ - a = a - b; - - return a; -} - -inline int8_64 &operator-=(int8_64 &a, int8_64 b) -{ - a = a - b; - - return a; -} - -inline int8_16 &operator+=(int8_16 &a, int8_16 b) -{ - a = a + b; - - return a; -} - -inline int8_32 &operator+=(int8_32 &a, int8_32 b) -{ - a = a + b; - - return a; -} - -inline int8_64 &operator+=(int8_64 &a, int8_64 b) -{ - a = a + b; - - return a; -} - -inline int8_16 &operator*=(int8_16 &a, int8_16 b) -{ - a = a * b; - - return a; -} - -inline int8_32 &operator*=(int8_32 &a, int8_32 b) -{ - a = a * b; - - return a; -} - -inline int8_64 &operator*=(int8_64 &a, int8_64 b) -{ - a = a * b; - - return a; -} - -inline int8_16 &operator^=(int8_16 &a, int8_16 b) -{ - a = a ^ b; - - return a; -} - -inline int8_32 &operator^=(int8_32 &a, int8_32 b) -{ - a = a ^ b; - - return a; -} - -inline int8_64 &operator^=(int8_64 &a, int8_64 b) -{ - a = a ^ b; - - return a; -} - -inline int8_16 operator<(int8_16 a, int8_16 b) -{ - int8_16 simd; - simd.s = _mm_cmplt_epi8(a.s, b.s); - - return simd; -} - -inline int8_32 operator<(int8_32 a, 
int8_32 b) -{ - int8_32 simd; - simd.s = _mm256_xor_si256(_mm256_cmpgt_epi8(a.s, b.s), _mm256_set1_epi8(-1)); - - return simd; -} - -inline int8_64 operator<(int8_64 a, int8_64 b) -{ - int8_64 simd; - simd.s = _mm512_mask_blend_epi8(_mm512_cmplt_epi8_mask(a.s, b.s), a.s, b.s); - - return simd; -} - -inline int8_16 operator<=(int8_16 a, int8_16 b) -{ - int8_16 simd; - simd.s = _mm_andnot_si128(_mm_cmplt_epi8(b.s, a.s), _mm_set1_epi8(-1)); - - return simd; -} - -inline int8_32 operator<=(int8_32 a, int8_32 b) -{ - int8_32 simd; - simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi8(a.s, b.s), _mm256_set1_epi8(-1)); - - return simd; -} - -inline int8_64 operator<=(int8_64 a, int8_64 b) -{ - int8_64 simd; - simd.s = _mm512_mask_blend_epi8(_mm512_cmple_epi8_mask(a.s, b.s), b.s, a.s); - - return simd; -} - -inline int8_16 operator>(int8_16 a, int8_16 b) -{ - int8_16 simd; - simd.s = _mm_cmpgt_epi8(a.s, b.s); - - return simd; -} - -inline int8_32 operator>(int8_32 a, int8_32 b) -{ - int8_32 simd; - simd.s = _mm256_cmpgt_epi8(a.s, b.s); - - return simd; -} - -inline int8_64 operator>(int8_64 a, int8_64 b) -{ - int8_64 simd; - simd.s = _mm512_mask_blend_epi8(_mm512_cmpgt_epi8_mask(a.s, b.s), a.s, b.s); - - return simd; -} - -inline int8_16 operator>=(int8_16 a, int8_16 b) -{ - int8_16 simd; - simd.s = _mm_andnot_si128(_mm_cmplt_epi8(a.s, b.s), _mm_set1_epi8(-1)); - - return simd; -} - -inline int8_32 operator>=(int8_32 a, int8_32 b) -{ - int8_32 simd; - simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi8(b.s, a.s), _mm256_set1_epi8(-1)); - - return simd; -} - -inline int8_64 operator>=(int8_64 a, int8_64 b) -{ - int8_64 simd; - simd.s = _mm512_mask_blend_epi8(_mm512_cmpge_epi8_mask(a.s, b.s), a.s, b.s); - - return simd; -} - -inline int8_16 operator==(int8_16 a, int8_16 b) -{ - int8_16 simd; - simd.s = _mm_cmpeq_epi8(a.s, b.s); - - return simd; -} - -inline int8_32 operator==(int8_32 a, int8_32 b) -{ - int8_32 simd; - simd.s = _mm256_cmpeq_epi8(a.s, b.s); - - return simd; -} - -inline 
int8_64 operator==(int8_64 a, int8_64 b) -{ - int8_64 simd; - simd.s = _mm512_mask_blend_epi8(_mm512_cmpeq_epi8_mask(a.s, b.s), a.s, b.s); - - return simd; -} - -inline int8_16 operator!=(int8_16 a, int8_16 b) -{ - int8_16 simd; - simd.s = _mm_andnot_si128(_mm_cmpeq_epi8(a.s, b.s), _mm_set1_epi8(-1)); - - return simd; -} - -inline int8_32 operator!=(int8_32 a, int8_32 b) -{ - int8_32 simd; - simd.s = _mm256_mask_blend_epi8(_mm256_cmp_epi8_mask(a.s, b.s, _MM_CMPINT_NE), a.s, b.s); - - return simd; -} - -inline int8_64 operator!=(int8_64 a, int8_64 b) -{ - int8_64 simd; - simd.s = _mm512_mask_blend_epi8(_mm512_cmp_epi8_mask(a.s, b.s, _MM_CMPINT_NE), a.s, b.s); - - return simd; -} - -inline int8_16 operator&(int8_16 a, int8_16 b) -{ - int8_16 simd; - simd.s = _mm_and_si128(a.s, b.s); - - return simd; -} - -inline int8_32 operator&(int8_32 a, int8_32 b) -{ - int8_32 simd; - simd.s = _mm256_and_si256(a.s, b.s); - - return simd; -} - -inline int8_64 operator&(int8_64 a, int8_64 b) -{ - int8_64 simd; - simd.s = _mm512_and_si512(a.s, b.s); - - return simd; -} - -inline int8_16 operator|(int8_16 a, int8_16 b) -{ - int8_16 simd; - simd.s = _mm_or_si128(a.s, b.s); - - return simd; -} - -inline int8_32 operator|(int8_32 a, int8_32 b) -{ - int8_32 simd; - simd.s = _mm256_or_si256(a.s, b.s); - - return simd; -} - -inline int8_64 operator|(int8_64 a, int8_64 b) -{ - int8_64 simd; - simd.s = _mm512_or_si512(a.s, b.s); - - return simd; -} - -inline int8_16 &operator&=(int8_16 &a, int8_16 b) -{ - a = a & b; - - return a; -} - -inline int8_32 &operator&=(int8_32 &a, int8_32 b) -{ - a = a & b; - - return a; -} - -inline int8_64 &operator&=(int8_64 &a, int8_64 b) -{ - a = a & b; - - return a; -} - -inline int8_16 &operator|=(int8_16 &a, int8_16 b) -{ - a = a | b; - - return a; -} - -inline int8_32 &operator|=(int8_32 &a, int8_32 b) -{ - a = a | b; - - return a; -} - -inline int8_64 &operator|=(int8_64 &a, int8_64 b) -{ - a = a | b; - - return a; -} - -inline int8_16 abs(int8_16 a) -{ - 
int8_16 simd; - simd.s = _mm_abs_epi8(a.s); - - return simd; -} - -inline int8_32 abs(int8_32 a) -{ - int8_32 simd; - simd.s = _mm256_abs_epi16(a.s); - - return simd; -} - -inline int8_64 abs(int8_64 a) -{ - int8_64 simd; - simd.s = _mm512_abs_epi16(a.s); - - return simd; -} - -inline int8_16 simd_min(int8_16 a, int8_16 b) -{ - int8_16 simd; - simd.s = _mm_min_epi8(a.s, b.s); - - return simd; -} - -inline int8_32 simd_min(int8_32 a, int8_32 b) -{ - int8_32 simd; - simd.s = _mm256_min_epi8(a.s, b.s); - - return simd; -} - -inline int8_64 simd_min(int8_64 a, int8_64 b) -{ - int8_64 simd; - simd.s = _mm512_min_epi8(a.s, b.s); - - return simd; -} - -inline int8_16 simd_max(int8_16 a, int8_16 b) -{ - int8_16 simd; - simd.s = _mm_max_epi8(a.s, b.s); - - return simd; -} - -inline int8_32 simd_max(int8_32 a, int8_32 b) -{ - int8_32 simd; - simd.s = _mm256_max_epi8(a.s, b.s); - - return simd; -} - -inline int8_64 simd_max(int8_64 a, int8_64 b) -{ - int8_64 simd; - simd.s = _mm512_max_epi8(a.s, b.s); - - return simd; -} - -inline int8_16 clamp(int8_16 min_value, int8_16 a, int8_16 max_value) -{ - return simd_min(simd_max(a, min_value), max_value); -} - -inline int8_32 clamp(int8_32 min_value, int8_32 a, int8_32 max_value) -{ - return simd_min(simd_max(a, min_value), max_value); -} - -inline int8_64 clamp(int8_64 min_value, int8_64 a, int8_64 max_value) -{ - return simd_min(simd_max(a, min_value), max_value); -} - -inline int32 which_true(int8_16 a) -{ - int32 which_true = _mm_movemask_epi8(a.s); - - return which_true; -} - -inline int32 which_true(int8_32 a) -{ - int32 which_true = _mm256_movemask_epi8(a.s); - - return which_true; -} - -inline int64 which_true(int8_64 a) -{ - int64 which_true = _mm512_movepi8_mask(a.s); - - return which_true; -} - -inline bool any_true(int8_16 a) -{ - bool is_any_true = _mm_movemask_epi8(a.s) > 0; - - return is_any_true; -} - -inline bool any_true(int8_32 a) -{ - bool is_any_true = _mm256_movemask_epi8(a.s) > 0; - - return is_any_true; -} - 
-inline bool any_true(int8_64 a) -{ - bool is_any_true = _mm512_movepi8_mask(a.s) > 0; - - return is_any_true; -} - -inline bool all_true(int8_16 a) -{ - bool is_true = _mm_movemask_epi8(a.s) == 15; - - return is_true; -} - -inline bool all_true(int8_32 a) -{ - bool is_true = _mm256_movemask_epi8(a.s) == 255; - - return is_true; -} - -inline bool all_true(int8_64 a) -{ - bool is_true = _mm512_movepi8_mask(a.s) == 65535; - - return is_true; -} - -inline bool all_false(int8_16 a) -{ - bool is_false = _mm_movemask_epi8(a.s) == 0; - - return is_false; -} - -inline bool all_false(int8_32 a) -{ - bool is_false = _mm256_movemask_epi8(a.s) == 0; - - return is_false; -} - -inline bool all_false(int8_64 a) -{ - // @todo This can be optimized (requires also changes in the comparison functions return) - bool is_false = _mm512_movepi8_mask(a.s) == 0; - - return is_false; -} - -// @todo from down here we can optimize some of the code by NOT using the wrappers -// the code is self contained and we could use te intrinsic functions directly - -/* -inline -f32 simd_mult(const int8* a, f32 b, int32 size, int32 steps) -{ - if (steps == 16) { - __m512i a_16 = _mm512_load_si512((__m512i *) a); - __m512 af_16 = _mm512_cvtepi32_ps(a_16); - __m512 b_16 = _mm512_set1_ps(b); - - __m512 result = _mm512_mul_ps(af_16, b_16); - } else if (steps == 8) { - __m256i a_8 = _mm256_load_si256((__m256i *) a); - __m256 af_8 = _mm256_cvtepi32_ps(a_8); - __m256 b_8 = _mm256_set1_ps(b); - - __m256 result = _mm256_mul_ps(af_8, b_8); - } else if (steps == 4) { - __m128i a_4 = _mm_load_si128((__m128i *) a); - __m128 af_4 = _mm_cvtepi32_ps(a_4); - __m128 b_4 = _mm_set1_ps(b); - - __m128 result = _mm_mul_ps(af_4, b_4); - } else { - - } -} -*/ - -bool simd_compare_64(const byte* a, const byte* b) -{ - __m256i chunk1_a = _mm256_load_si256((__m256i*) a); - __m256i chunk1_b = _mm256_load_si256((__m256i*) b); - - __m256i chunk2_a = _mm256_load_si256((__m256i*) (a + 32)); - __m256i chunk2_b = 
_mm256_load_si256((__m256i*) (b + 32)); - - __m256i result1 = _mm256_cmpeq_epi8(chunk1_a, chunk1_b); - __m256i result2 = _mm256_cmpeq_epi8(chunk2_a, chunk2_b); - - __m256i combined = _mm256_and_si256(result1, result2); - - return _mm256_testc_si256(combined, _mm256_set1_epi8(-1)) != 1; -} - -int simd_compare(const byte* a, const byte* b, uint32 size, uint32 steps = 8) { +int simd_equal(const byte* a, const byte* b, uint32 size, uint32 steps = 8) { uint32 i = 0; + steps = intrin_validate_steps((const byte*) a, steps); + steps = intrin_validate_steps((const byte*) b, steps); - if (steps == 16) { - if (size >= 128) { - __m512i a_16; - __m512i b_16; - __mmask64 result_mask; + #ifdef MACRO_CPU_FEATURE_AVX512 + if (steps >= 16) { + steps = 16; + if (size >= 128) { + __m512i a_16; + __m512i b_16; + __mmask64 result_mask; - for (; i <= size - 64; i += 64) { // 64 bytes per iteration - a_16 = _mm512_load_si512((__m512i*) a); - b_16 = _mm512_load_si512((__m512i*) b); + for (; i <= size - 64; i += 64) { // 64 bytes per iteration + a_16 = _mm512_load_si512((__m512i*) a); + b_16 = _mm512_load_si512((__m512i*) b); - result_mask = _mm512_cmpeq_epi8_mask(a_16, b_16); + result_mask = _mm512_cmpeq_epi8_mask(a_16, b_16); - if (result_mask != 0xFFFFFFFFFFFFFFFF) { - return false; + if (result_mask != 0xFFFFFFFFFFFFFFFF) { + return false; + } + + a += 64; + b += 64; } + } - a += 64; - b += 64; + if (size - i >= 64) { + return simd_equal(a, b, size - i, 8); + } else if (size - i >= 32) { + return simd_equal(a, b, size - i, 4); } } + #endif - if (size - i >= 64) { - return simd_compare(a, b, size - i, 8); - } else if (size - i >= 32) { - return simd_compare(a, b, size - i, 4); - } - } else if (steps == 8) { - if (size >= 64) { - __m256i a_8; - __m256i b_8; - __m256i result_8; + #ifdef MACRO_CPU_FEATURE_AVX2 + if (steps >= 8) { + steps = 8; + if (size >= 64) { + __m256i a_8; + __m256i b_8; + __m256i result_8; - for (; i <= size - steps; i += steps) { - a_8 = _mm256_load_si256((__m256i*) 
a); - b_8 = _mm256_load_si256((__m256i*) b); + for (; i <= size - 32; i += 32) { // 32 bytes per iteration + a_8 = _mm256_load_si256((__m256i*) a); + b_8 = _mm256_load_si256((__m256i*) b); - result_8 = _mm256_cmpeq_epi8(a_8, b_8); + result_8 = _mm256_cmpeq_epi8(a_8, b_8); - if (_mm256_testc_si256(result_8, _mm256_set1_epi8(-1)) != 1) { - return false; + if (_mm256_testc_si256(result_8, _mm256_set1_epi8(-1)) != 1) { + return false; + } + + a += 32; + b += 32; } + } - a += steps; - b += steps; + if (size - i >= 32) { + return simd_equal(a, b, size - i, 4); } } + #endif - if (size - i >= 32) { - return simd_compare(a, b, size - i, 4); - } - } else if (steps == 4) { - if (size >= 16) { - __m128i a_4; - __m128i b_4; - __m128i result_4; + #ifdef MACRO_CPU_FEATURE_SSE42 + if (steps >= 4) { + steps = 4; + if (size >= 16) { + __m128i a_4; + __m128i b_4; + __m128i result_4; - for (; i <= size - steps; i += steps) { - a_4 = _mm_load_si128((__m128i*) a); - b_4 = _mm_load_si128((__m128i*) b); + for (; i <= size - 16; i += 16) { // 16 bytes per iteration + a_4 = _mm_load_si128((__m128i*) a); + b_4 = _mm_load_si128((__m128i*) b); - result_4 = _mm_cmpeq_epi8(a_4, b_4); + result_4 = _mm_cmpeq_epi8(a_4, b_4); - if (_mm_movemask_epi8(result_4) != 0xFFFF) { - return false; + if (_mm_movemask_epi8(result_4) != 0xFFFF) { + return false; + } + + a += 16; + b += 16; } - - a += steps; - b += steps; } } - } + #endif for (; i < size; ++i) { if (*a++ != *b++) { diff --git a/architecture/x86/simd/SIMD_I8_AVX2.h b/architecture/x86/simd/SIMD_I8_AVX2.h new file mode 100644 index 0000000..462beaa --- /dev/null +++ b/architecture/x86/simd/SIMD_I8_AVX2.h @@ -0,0 +1,265 @@ +/** + * Jingga + * + * @copyright Jingga + * @license OMS License 2.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef COMS_TOS_STDLIB_SIMD_I8_AVX2_H +#define COMS_TOS_STDLIB_SIMD_I8_AVX2_H + +#include +#include + +#include "../../../stdlib/Types.h" + +struct int8_32 { + union { + #if ARM + svint8_t s; + #else + __m256i s; + #endif + + int8 v[32]; + }; 
+}; + +inline int8_32 load_int8_32(const int8* mem) +{ + int8_32 simd; + simd.s = _mm256_load_si256((__m256i *) mem); + + return simd; +} + +inline int8_32 init_int8_32(const int8* mem) +{ + int8_32 simd; + simd.s = _mm256_set_epi8( + mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7], + mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15], + mem[16], mem[17], mem[18], mem[19], mem[20], mem[21], mem[22], mem[23], + mem[24], mem[25], mem[26], mem[27], mem[28], mem[29], mem[30], mem[31] + ); + + return simd; +} + +inline void unload_int8_32(int8_32 a, int8 *array) { _mm256_store_si256((__m256i *) array, a.s); } + +inline int8_32 init_zero_int8_32() +{ + int8_32 simd; + simd.s = _mm256_setzero_si256(); + + return simd; +} + +inline int8_32 init_value_int8_32(int8 value) +{ + int8_32 simd; + simd.s = _mm256_set1_epi8(value); + + return simd; +} + +inline int8_32 operator+(int8_32 a, int8_32 b) +{ + int8_32 simd; + simd.s = _mm256_add_epi8(a.s, b.s); + + return simd; +} + +inline int8_32 operator-(int8_32 a, int8_32 b) +{ + int8_32 simd; + simd.s = _mm256_sub_epi8(a.s, b.s); + + return simd; +} + +inline int8_32 operator-(int8_32 a) { return init_zero_int8_32() - a; } + +inline int8_32 operator*(int8_32 a, int8_32 b) +{ + int8_32 simd; + simd.s = _mm256_or_si256(_mm256_slli_epi16(_mm256_mullo_epi16(_mm256_srli_epi16(a.s, 8), _mm256_srli_epi16(b.s, 8)), 8), _mm256_srli_epi16(_mm256_slli_epi16(_mm256_mullo_epi16(a.s, b.s), 8), 8)); // bytewise wrapping product built from 16 bit lanes + + return simd; +} + +inline int8_32 operator^(int8_32 a, int8_32 b) +{ + int8_32 simd; + simd.s = _mm256_xor_si256(a.s, b.s); + + return simd; +} + +inline int8_32 &operator-=(int8_32 &a, int8_32 b) +{ + a = a - b; + + return a; +} + +inline int8_32 &operator+=(int8_32 &a, int8_32 b) +{ + a = a + b; + + return a; +} + +inline int8_32 &operator*=(int8_32 &a, int8_32 b) +{ + a = a * b; + + return a; +} + +inline int8_32 &operator^=(int8_32 &a, int8_32 b) +{ + a = a ^ b; + + return a; +} + +inline int8_32 operator<(int8_32 a, int8_32 b) +{ + int8_32 simd; + simd.s = _mm256_cmpgt_epi8(b.s, a.s); // a < b is the same as b > a + + return simd; +} + +inline int8_32 
operator<=(int8_32 a, int8_32 b) +{ + int8_32 simd; + simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi8(a.s, b.s), _mm256_set1_epi8(-1)); + + return simd; +} + +inline int8_32 operator>(int8_32 a, int8_32 b) +{ + int8_32 simd; + simd.s = _mm256_cmpgt_epi8(a.s, b.s); + + return simd; +} + +inline int8_32 operator>=(int8_32 a, int8_32 b) +{ + int8_32 simd; + simd.s = _mm256_andnot_si256(_mm256_cmpgt_epi8(b.s, a.s), _mm256_set1_epi8(-1)); + + return simd; +} + +inline int8_32 operator==(int8_32 a, int8_32 b) +{ + int8_32 simd; + simd.s = _mm256_cmpeq_epi8(a.s, b.s); + + return simd; +} + +inline int8_32 operator!=(int8_32 a, int8_32 b) +{ + int8_32 simd; + simd.s = _mm256_xor_si256(_mm256_cmpeq_epi8(a.s, b.s), _mm256_set1_epi8(-1)); // inverted equality mask, needs AVX2 only + + return simd; +} + +inline int8_32 operator&(int8_32 a, int8_32 b) +{ + int8_32 simd; + simd.s = _mm256_and_si256(a.s, b.s); + + return simd; +} + +inline int8_32 operator|(int8_32 a, int8_32 b) +{ + int8_32 simd; + simd.s = _mm256_or_si256(a.s, b.s); + + return simd; +} + +inline int8_32 &operator&=(int8_32 &a, int8_32 b) +{ + a = a & b; + + return a; +} + +inline int8_32 &operator|=(int8_32 &a, int8_32 b) +{ + a = a | b; + + return a; +} + +inline int8_32 abs(int8_32 a) +{ + int8_32 simd; + simd.s = _mm256_abs_epi8(a.s); + + return simd; +} + +inline int8_32 simd_min(int8_32 a, int8_32 b) +{ + int8_32 simd; + simd.s = _mm256_min_epi8(a.s, b.s); + + return simd; +} + +inline int8_32 simd_max(int8_32 a, int8_32 b) +{ + int8_32 simd; + simd.s = _mm256_max_epi8(a.s, b.s); + + return simd; +} + +inline int8_32 clamp(int8_32 min_value, int8_32 a, int8_32 max_value) +{ + return simd_min(simd_max(a, min_value), max_value); +} + +inline int32 which_true(int8_32 a) +{ + int32 which_true = _mm256_movemask_epi8(a.s); + + return which_true; +} + +inline bool any_true(int8_32 a) +{ + bool is_any_true = _mm256_movemask_epi8(a.s) != 0; // the mask is a signed int, lane 31 makes it negative + + return is_any_true; +} + +inline bool all_true(int8_32 a) +{ + bool is_true = _mm256_movemask_epi8(a.s) 
== -1; // all 32 lane bits set + + return is_true; +} + +inline bool all_false(int8_32 a) +{ + bool is_false = _mm256_movemask_epi8(a.s) == 0; + + return is_false; +} + +#endif \ No newline at end of file diff --git a/architecture/x86/simd/SIMD_I8_AVX512.h b/architecture/x86/simd/SIMD_I8_AVX512.h new file mode 100644 index 0000000..a14047d --- /dev/null +++ b/architecture/x86/simd/SIMD_I8_AVX512.h @@ -0,0 +1,270 @@ +/** + * Jingga + * + * @copyright Jingga + * @license OMS License 2.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef COMS_TOS_STDLIB_SIMD_I8_AVX512_H +#define COMS_TOS_STDLIB_SIMD_I8_AVX512_H + +#include +#include + +#include "../../../stdlib/Types.h" + +struct int8_64 { + union { + #if ARM + svint8_t s; + #else + __m512i s; + #endif + + int8 v[64]; + }; +}; + +inline int8_64 load_int8_64(const int8* mem) +{ + int8_64 simd; + simd.s = _mm512_load_si512((__m512i *) mem); + + return simd; +} + +inline int8_64 init_int8_64(const int8* mem) +{ + int8_64 simd; + simd.s = _mm512_set_epi8( + mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], mem[6], mem[7], + mem[8], mem[9], mem[10], mem[11], mem[12], mem[13], mem[14], mem[15], + mem[16], mem[17], mem[18], mem[19], mem[20], mem[21], mem[22], mem[23], + mem[24], mem[25], mem[26], mem[27], mem[28], mem[29], mem[30], mem[31], + mem[32], mem[33], mem[34], mem[35], mem[36], mem[37], mem[38], mem[39], + mem[40], mem[41], mem[42], mem[43], mem[44], mem[45], mem[46], mem[47], + mem[48], mem[49], mem[50], mem[51], mem[52], mem[53], mem[54], mem[55], + mem[56], mem[57], mem[58], mem[59], mem[60], mem[61], mem[62], mem[63] + ); + + return simd; +} + +inline void unload_int8_64(int8_64 a, int8 *array) { _mm512_storeu_epi8(array, a.s); } + +inline int8_64 init_zero_int8_64() +{ + int8_64 simd; + simd.s = _mm512_setzero_si512(); + + return simd; +} + +inline int8_64 init_value_int8_64(int8 value) +{ + int8_64 simd; + simd.s = _mm512_set1_epi8(value); + + return simd; +} + +inline int8_64 operator+(int8_64 a, int8_64 b) +{ + int8_64 simd; + simd.s = 
_mm512_add_epi8(a.s, b.s); + + return simd; +} + +inline int8_64 operator-(int8_64 a, int8_64 b) +{ + int8_64 simd; + simd.s = _mm512_sub_epi8(a.s, b.s); + + return simd; +} + +inline int8_64 operator-(int8_64 a) { return init_zero_int8_64() - a; } + +inline int8_64 operator*(int8_64 a, int8_64 b) +{ + int8_64 simd; + simd.s = _mm512_or_si512(_mm512_slli_epi16(_mm512_mullo_epi16(_mm512_srli_epi16(a.s, 8), _mm512_srli_epi16(b.s, 8)), 8), _mm512_srli_epi16(_mm512_slli_epi16(_mm512_mullo_epi16(a.s, b.s), 8), 8)); // bytewise wrapping product built from 16 bit lanes + + return simd; +} + +inline int8_64 operator^(int8_64 a, int8_64 b) +{ + int8_64 simd; + simd.s = _mm512_xor_si512(a.s, b.s); + + return simd; +} + +inline int8_64 &operator-=(int8_64 &a, int8_64 b) +{ + a = a - b; + + return a; +} + +inline int8_64 &operator+=(int8_64 &a, int8_64 b) +{ + a = a + b; + + return a; +} + +inline int8_64 &operator*=(int8_64 &a, int8_64 b) +{ + a = a * b; + + return a; +} + +inline int8_64 &operator^=(int8_64 &a, int8_64 b) +{ + a = a ^ b; + + return a; +} + +inline int8_64 operator<(int8_64 a, int8_64 b) +{ + int8_64 simd; + simd.s = _mm512_movm_epi8(_mm512_cmplt_epi8_mask(a.s, b.s)); // expand the k mask to 0xFF per true lane + + return simd; +} + +inline int8_64 operator<=(int8_64 a, int8_64 b) +{ + int8_64 simd; + simd.s = _mm512_movm_epi8(_mm512_cmple_epi8_mask(a.s, b.s)); + + return simd; +} + +inline int8_64 operator>(int8_64 a, int8_64 b) +{ + int8_64 simd; + simd.s = _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(a.s, b.s)); + + return simd; +} + +inline int8_64 operator>=(int8_64 a, int8_64 b) +{ + int8_64 simd; + simd.s = _mm512_movm_epi8(_mm512_cmpge_epi8_mask(a.s, b.s)); + + return simd; +} + +inline int8_64 operator==(int8_64 a, int8_64 b) +{ + int8_64 simd; + simd.s = _mm512_movm_epi8(_mm512_cmpeq_epi8_mask(a.s, b.s)); + + return simd; +} + +inline int8_64 operator!=(int8_64 a, int8_64 b) +{ + int8_64 simd; + simd.s = _mm512_movm_epi8(_mm512_cmp_epi8_mask(a.s, b.s, _MM_CMPINT_NE)); + + return simd; +} + +inline int8_64 operator&(int8_64 a, int8_64 b) +{ + int8_64 simd; + simd.s = _mm512_and_si512(a.s, b.s); + + return simd; 
+} + +inline int8_64 operator|(int8_64 a, int8_64 b) +{ + int8_64 simd; + simd.s = _mm512_or_si512(a.s, b.s); + + return simd; +} + +inline int8_64 &operator&=(int8_64 &a, int8_64 b) +{ + a = a & b; + + return a; +} + +inline int8_64 &operator|=(int8_64 &a, int8_64 b) +{ + a = a | b; + + return a; +} + +inline int8_64 abs(int8_64 a) +{ + int8_64 simd; + simd.s = _mm512_abs_epi8(a.s); + + return simd; +} + +inline int8_64 simd_min(int8_64 a, int8_64 b) +{ + int8_64 simd; + simd.s = _mm512_min_epi8(a.s, b.s); + + return simd; +} + +inline int8_64 simd_max(int8_64 a, int8_64 b) +{ + int8_64 simd; + simd.s = _mm512_max_epi8(a.s, b.s); + + return simd; +} + +inline int8_64 clamp(int8_64 min_value, int8_64 a, int8_64 max_value) +{ + return simd_min(simd_max(a, min_value), max_value); +} + +inline int64 which_true(int8_64 a) +{ + int64 which_true = _mm512_movepi8_mask(a.s); + + return which_true; +} + +inline bool any_true(int8_64 a) +{ + bool is_any_true = _mm512_movepi8_mask(a.s) > 0; + + return is_any_true; +} + +inline bool all_true(int8_64 a) +{ + bool is_true = _mm512_movepi8_mask(a.s) == 0xFFFFFFFFFFFFFFFF; // all 64 lane bits set + + return is_true; +} + +inline bool all_false(int8_64 a) +{ + // @todo This can be optimized (requires also changes in the comparison functions return) + bool is_false = _mm512_movepi8_mask(a.s) == 0; + + return is_false; +} + +#endif \ No newline at end of file diff --git a/architecture/x86/simd/SIMD_I8_SSE.h b/architecture/x86/simd/SIMD_I8_SSE.h new file mode 100644 index 0000000..e676bc6 --- /dev/null +++ b/architecture/x86/simd/SIMD_I8_SSE.h @@ -0,0 +1,265 @@ +/** + * Jingga + * + * @copyright Jingga + * @license OMS License 2.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef COMS_TOS_STDLIB_SIMD_I8_SSE_H +#define COMS_TOS_STDLIB_SIMD_I8_SSE_H + +#include +#include + +#include "../../../stdlib/Types.h" + +struct int8_16 { + union { + #if ARM + svint8_t s; + #else + __m128i s; + #endif + + int8 v[16]; + }; +}; + +inline int8_16 load_int8_16(const int8* mem) +{ + 
int8_16 simd; + simd.s = _mm_load_si128((__m128i *) mem); + + return simd; +} + +inline int8_16 init_int8_16(const int8* mem) +{ + int8_16 simd; + simd.s = _mm_set_epi8( + mem[0], mem[1], mem[2], mem[3], + mem[4], mem[5], mem[6], mem[7], + mem[8], mem[9], mem[10], mem[11], + mem[12], mem[13], mem[14], mem[15] + ); + + return simd; +} + +inline void unload_int8_16(int8_16 a, int8 *array) { _mm_store_si128((__m128i *) array, a.s); } + +inline int8_16 init_zero_int8_16() +{ + int8_16 simd; + simd.s = _mm_setzero_si128(); + + return simd; +} + +inline int8_16 init_value_int8_16(int8 value) +{ + int8_16 simd; + simd.s = _mm_set1_epi8(value); + + return simd; +} + +inline int8_16 operator+(int8_16 a, int8_16 b) +{ + int8_16 simd; + simd.s = _mm_add_epi8(a.s, b.s); + + return simd; +} + +inline int8_16 operator-(int8_16 a, int8_16 b) +{ + int8_16 simd; + simd.s = _mm_sub_epi8(a.s, b.s); + + return simd; +} + +inline int8_16 operator-(int8_16 a) { return init_zero_int8_16() - a; } + +inline int8_16 operator*(int8_16 a, int8_16 b) +{ + int8_16 simd; + simd.s = _mm_or_si128(_mm_slli_epi16(_mm_mullo_epi16(_mm_srli_epi16(a.s, 8), _mm_srli_epi16(b.s, 8)), 8), _mm_srli_epi16(_mm_slli_epi16(_mm_mullo_epi16(a.s, b.s), 8), 8)); // bytewise wrapping product built from 16 bit lanes + + return simd; +} + +inline int8_16 operator^(int8_16 a, int8_16 b) +{ + int8_16 simd; + simd.s = _mm_xor_si128(a.s, b.s); + + return simd; +} + +inline int8_16 &operator-=(int8_16 &a, int8_16 b) +{ + a = a - b; + + return a; +} + +inline int8_16 &operator+=(int8_16 &a, int8_16 b) +{ + a = a + b; + + return a; +} + +inline int8_16 &operator*=(int8_16 &a, int8_16 b) +{ + a = a * b; + + return a; +} + +inline int8_16 &operator^=(int8_16 &a, int8_16 b) +{ + a = a ^ b; + + return a; +} + +inline int8_16 operator<(int8_16 a, int8_16 b) +{ + int8_16 simd; + simd.s = _mm_cmplt_epi8(a.s, b.s); + + return simd; +} + +inline int8_16 operator<=(int8_16 a, int8_16 b) +{ + int8_16 simd; + simd.s = _mm_andnot_si128(_mm_cmplt_epi8(b.s, a.s), _mm_set1_epi8(-1)); + + return simd; +} + +inline int8_16 operator>(int8_16 a, int8_16 b) +{ + int8_16 simd; + simd.s = _mm_cmpgt_epi8(a.s, b.s); + + return simd; +} + 
+inline int8_16 operator>=(int8_16 a, int8_16 b) +{ + int8_16 simd; + simd.s = _mm_andnot_si128(_mm_cmplt_epi8(a.s, b.s), _mm_set1_epi8(-1)); + + return simd; +} + +inline int8_16 operator==(int8_16 a, int8_16 b) +{ + int8_16 simd; + simd.s = _mm_cmpeq_epi8(a.s, b.s); + + return simd; +} + +inline int8_16 operator!=(int8_16 a, int8_16 b) +{ + int8_16 simd; + simd.s = _mm_andnot_si128(_mm_cmpeq_epi8(a.s, b.s), _mm_set1_epi8(-1)); + + return simd; +} + +inline int8_16 operator&(int8_16 a, int8_16 b) +{ + int8_16 simd; + simd.s = _mm_and_si128(a.s, b.s); + + return simd; +} + +inline int8_16 operator|(int8_16 a, int8_16 b) +{ + int8_16 simd; + simd.s = _mm_or_si128(a.s, b.s); + + return simd; +} + +inline int8_16 &operator&=(int8_16 &a, int8_16 b) +{ + a = a & b; + + return a; +} + +inline int8_16 &operator|=(int8_16 &a, int8_16 b) +{ + a = a | b; + + return a; +} + +inline int8_16 abs(int8_16 a) +{ + int8_16 simd; + simd.s = _mm_abs_epi8(a.s); + + return simd; +} + +inline int8_16 simd_min(int8_16 a, int8_16 b) +{ + int8_16 simd; + simd.s = _mm_min_epi8(a.s, b.s); + + return simd; +} + +inline int8_16 simd_max(int8_16 a, int8_16 b) +{ + int8_16 simd; + simd.s = _mm_max_epi8(a.s, b.s); + + return simd; +} + +inline int8_16 clamp(int8_16 min_value, int8_16 a, int8_16 max_value) +{ + return simd_min(simd_max(a, min_value), max_value); +} + +inline int32 which_true(int8_16 a) +{ + int32 which_true = _mm_movemask_epi8(a.s); + + return which_true; +} + +inline bool any_true(int8_16 a) +{ + bool is_any_true = _mm_movemask_epi8(a.s) > 0; + + return is_any_true; +} + +inline bool all_true(int8_16 a) +{ + bool is_true = _mm_movemask_epi8(a.s) == 0xFFFF; // all 16 lane bits set + + return is_true; +} + +inline bool all_false(int8_16 a) +{ + bool is_false = _mm_movemask_epi8(a.s) == 0; + + return is_false; +} + +#endif \ No newline at end of file diff --git a/architecture/x86/simd/SIMD_SVML.h b/architecture/x86/simd/SIMD_SVML.h index 022e658..83a5c10 100644 --- a/architecture/x86/simd/SIMD_SVML.h +++ 
b/architecture/x86/simd/SIMD_SVML.h @@ -9,160 +9,16 @@ #ifndef COMS_STDLIB_SIMD_SVML_H #define COMS_STDLIB_SIMD_SVML_H -#include -#include +#ifdef MACRO_CPU_FEATURE_SSE42 + #include "SIMD_SVML_SSE.h" +#endif -#include "../../../stdlib/Types.h" +#ifdef MACRO_CPU_FEATURE_AVX2 + #include "SIMD_SVML_AVX2.h" +#endif -#if __linux__ - #include "math.h" - - inline __m128i _mm_div_epi32(__m128i a, __m128i b) { - alignas(16) int32 a_array[4], b_array[4], result[4]; - - _mm_storeu_si128((__m128i*) a_array, a); - _mm_storeu_si128((__m128i*) b_array, b); - - for (int32 i = 0; i < 4; ++i) { - result[i] = a_array[i] / b_array[i]; - } - - return _mm_load_si128((__m128i*) result); - } - - inline __m256i _mm256_div_epi32(__m256i a, __m256i b) { - alignas(32) int32 a_array[8], b_array[8], result[8]; - - _mm256_storeu_si256((__m256i*) a_array, a); - _mm256_storeu_si256((__m256i*) b_array, b); - - for (int32 i = 0; i < 8; ++i) { - result[i] = a_array[i] / b_array[i]; - } - - return _mm256_load_si256((__m256i*) result); - } - - inline __m512i _mm512_div_epi32(__m512i a, __m512i b) { - alignas(64) int32 a_array[16], b_array[16], result[16]; - - _mm512_storeu_si512((__m512i*) a_array, a); - _mm512_storeu_si512((__m512i*) b_array, b); - - for (int32 i = 0; i < 16; ++i) { - result[i] = a_array[i] / b_array[i]; - } - - return _mm512_load_si512((__m512i*) result); - } - - inline __m128 _mm_sin_ps(__m128 a) { - alignas(16) f32 a_array[4], result[4]; - _mm_storeu_ps(a_array, a); - for (int32 i = 0; i < 4; ++i) { - result[i] = sinf(a_array[i]); - } - return _mm_load_ps(result); - } - - inline __m128 _mm_cos_ps(__m128 a) { - alignas(16) f32 a_array[4], result[4]; - _mm_storeu_ps(a_array, a); - for (int32 i = 0; i < 4; ++i) { - result[i] = cosf(a_array[i]); - } - return _mm_load_ps(result); - } - - inline __m128 _mm_asin_ps(__m128 a) { - alignas(16) f32 a_array[4], result[4]; - _mm_storeu_ps(a_array, a); - for (int32 i = 0; i < 4; ++i) { - result[i] = asinf(a_array[i]); - } - return 
_mm_load_ps(result); - } - - inline __m128 _mm_acos_ps(__m128 a) { - alignas(16) f32 a_array[4], result[4]; - _mm_storeu_ps(a_array, a); - for (int32 i = 0; i < 4; ++i) { - result[i] = acosf(a_array[i]); - } - return _mm_load_ps(result); - } - - inline __m256 _mm256_sin_ps(__m256 a) { - alignas(32) f32 a_array[8], result[8]; - _mm256_storeu_ps(a_array, a); - for (int32 i = 0; i < 8; ++i) { - result[i] = sinf(a_array[i]); - } - return _mm256_load_ps(result); - } - - inline __m256 _mm256_cos_ps(__m256 a) { - alignas(32) f32 a_array[8], result[8]; - _mm256_storeu_ps(a_array, a); - for (int32 i = 0; i < 8; ++i) { - result[i] = cosf(a_array[i]); - } - return _mm256_load_ps(result); - } - - inline __m256 _mm256_asin_ps(__m256 a) { - alignas(32) f32 a_array[8], result[8]; - _mm256_storeu_ps(a_array, a); - for (int32 i = 0; i < 8; ++i) { - result[i] = asinf(a_array[i]); - } - return _mm256_load_ps(result); - } - - inline __m256 _mm256_acos_ps(__m256 a) { - alignas(32) f32 a_array[8], result[8]; - _mm256_storeu_ps(a_array, a); - for (int32 i = 0; i < 16; ++i) { - result[i] = acosf(a_array[i]); - } - return _mm256_load_ps(result); - } - - inline __m512 _mm512_sin_ps(__m512 a) { - alignas(64) f32 a_array[8], result[8]; - _mm512_storeu_ps(a_array, a); - for (int32 i = 0; i < 16; ++i) { - result[i] = sinf(a_array[i]); - } - return _mm512_load_ps(result); - } - - inline __m512 _mm512_cos_ps(__m512 a) { - alignas(64) f32 a_array[8], result[8]; - _mm512_storeu_ps(a_array, a); - for (int32 i = 0; i < 16; ++i) { - result[i] = cosf(a_array[i]); - } - return _mm512_load_ps(result); - } - - inline __m512 _mm512_asin_ps(__m512 a) { - alignas(64) f32 a_array[8], result[8]; - _mm512_storeu_ps(a_array, a); - for (int32 i = 0; i < 16; ++i) { - result[i] = asinf(a_array[i]); - } - return _mm512_load_ps(result); - } - - inline __m512 _mm512_acos_ps(__m512 a) { - alignas(64) f32 a_array[16], result[16]; - _mm512_storeu_ps(a_array, a); - for (int32 i = 0; i < 16; ++i) { - result[i] = 
acosf(a_array[i]); - } - return _mm512_load_ps(result); - } +#ifdef MACRO_CPU_FEATURE_AVX512 + #include "SIMD_SVML_AVX512.h" #endif #endif \ No newline at end of file diff --git a/architecture/x86/simd/SIMD_SVML_AVX2.h b/architecture/x86/simd/SIMD_SVML_AVX2.h new file mode 100644 index 0000000..2365c12 --- /dev/null +++ b/architecture/x86/simd/SIMD_SVML_AVX2.h @@ -0,0 +1,69 @@ +/** + * Jingga + * + * @copyright Jingga + * @license OMS License 2.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef COMS_STDLIB_SIMD_SVML_AVX2_H +#define COMS_STDLIB_SIMD_SVML_AVX2_H + +#include +#include + +#include "../../../stdlib/Types.h" + +#if __linux__ + #include + inline __m256i _mm256_div_epi32(__m256i a, __m256i b) { + alignas(32) int32 a_array[8], b_array[8], result[8]; + + _mm256_store_si256((__m256i*) a_array, a); + _mm256_store_si256((__m256i*) b_array, b); + + for (int32 i = 0; i < 8; ++i) { + result[i] = a_array[i] / b_array[i]; + } + + return _mm256_load_si256((__m256i*) result); + } + + inline __m256 _mm256_sin_ps(__m256 a) { + alignas(32) f32 a_array[8], result[8]; + _mm256_store_ps(a_array, a); + for (int32 i = 0; i < 8; ++i) { + result[i] = sinf(a_array[i]); + } + return _mm256_load_ps(result); + } + + inline __m256 _mm256_cos_ps(__m256 a) { + alignas(32) f32 a_array[8], result[8]; + _mm256_store_ps(a_array, a); + for (int32 i = 0; i < 8; ++i) { + result[i] = cosf(a_array[i]); + } + return _mm256_load_ps(result); + } + + inline __m256 _mm256_asin_ps(__m256 a) { + alignas(32) f32 a_array[8], result[8]; + _mm256_store_ps(a_array, a); + for (int32 i = 0; i < 8; ++i) { + result[i] = asinf(a_array[i]); + } + return _mm256_load_ps(result); + } + + inline __m256 _mm256_acos_ps(__m256 a) { + alignas(32) f32 a_array[8], result[8]; + _mm256_store_ps(a_array, a); + for (int32 i = 0; i < 8; ++i) { // a __m256 holds 8 floats + result[i] = acosf(a_array[i]); + } + return _mm256_load_ps(result); + } +#endif + +#endif \ No newline at end of file diff --git 
a/architecture/x86/simd/SIMD_SVML_AVX512.h b/architecture/x86/simd/SIMD_SVML_AVX512.h new file mode 100755 index 0000000..d8b15e5 --- /dev/null +++ b/architecture/x86/simd/SIMD_SVML_AVX512.h @@ -0,0 +1,70 @@ +/** + * Jingga + * + * @copyright Jingga + * @license OMS License 2.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef COMS_STDLIB_SIMD_SVML_AVX512_H +#define COMS_STDLIB_SIMD_SVML_AVX512_H + +#include +#include + +#include "../../../stdlib/Types.h" + +#if __linux__ + #include + + inline __m512i _mm512_div_epi32(__m512i a, __m512i b) { + alignas(64) int32 a_array[16], b_array[16], result[16]; + + _mm512_store_si512((__m512i*) a_array, a); + _mm512_store_si512((__m512i*) b_array, b); + + for (int32 i = 0; i < 16; ++i) { + result[i] = a_array[i] / b_array[i]; + } + + return _mm512_load_si512((__m512i*) result); + } + + inline __m512 _mm512_sin_ps(__m512 a) { + alignas(64) f32 a_array[16], result[16]; // a __m512 holds 16 floats + _mm512_store_ps(a_array, a); + for (int32 i = 0; i < 16; ++i) { + result[i] = sinf(a_array[i]); + } + return _mm512_load_ps(result); + } + + inline __m512 _mm512_cos_ps(__m512 a) { + alignas(64) f32 a_array[16], result[16]; // a __m512 holds 16 floats + _mm512_store_ps(a_array, a); + for (int32 i = 0; i < 16; ++i) { + result[i] = cosf(a_array[i]); + } + return _mm512_load_ps(result); + } + + inline __m512 _mm512_asin_ps(__m512 a) { + alignas(64) f32 a_array[16], result[16]; // a __m512 holds 16 floats + _mm512_store_ps(a_array, a); + for (int32 i = 0; i < 16; ++i) { + result[i] = asinf(a_array[i]); + } + return _mm512_load_ps(result); + } + + inline __m512 _mm512_acos_ps(__m512 a) { + alignas(64) f32 a_array[16], result[16]; + _mm512_store_ps(a_array, a); + for (int32 i = 0; i < 16; ++i) { + result[i] = acosf(a_array[i]); + } + return _mm512_load_ps(result); + } +#endif + +#endif \ No newline at end of file diff --git a/architecture/x86/simd/SIMD_SVML_SSE.h b/architecture/x86/simd/SIMD_SVML_SSE.h new file mode 100644 index 0000000..683f554 --- /dev/null +++ b/architecture/x86/simd/SIMD_SVML_SSE.h @@ -0,0 +1,70 
@@ +/** + * Jingga + * + * @copyright Jingga + * @license OMS License 2.0 + * @version 1.0.0 + * @link https://jingga.app + */ +#ifndef COMS_STDLIB_SIMD_SVML_SSE_H +#define COMS_STDLIB_SIMD_SVML_SSE_H + +#include +#include + +#include "../../../stdlib/Types.h" + +#if __linux__ /* Linux only: per-lane scalar implementations of the SVML-style 128-bit intrinsics below */ + #include + + inline __m128i _mm_div_epi32(__m128i a, __m128i b) { /* Divides each of the 4 int32 lanes of a by the matching lane of b; zero divisors are not guarded */ + alignas(16) int32 a_array[4], b_array[4], result[4]; /* 16-byte alignment is required by the aligned store/load intrinsics used here */ + + _mm_store_si128((__m128i*) a_array, a); + _mm_store_si128((__m128i*) b_array, b); + + for (int32 i = 0; i < 4; ++i) { + result[i] = a_array[i] / b_array[i]; + } + + return _mm_load_si128((__m128i*) result); + } + + inline __m128 _mm_sin_ps(__m128 a) { /* Applies sinf to each of the 4 f32 lanes of a */ + alignas(16) f32 a_array[4], result[4]; + _mm_store_ps(a_array, a); + for (int32 i = 0; i < 4; ++i) { + result[i] = sinf(a_array[i]); + } + return _mm_load_ps(result); + } + + inline __m128 _mm_cos_ps(__m128 a) { /* Applies cosf to each of the 4 f32 lanes of a */ + alignas(16) f32 a_array[4], result[4]; + _mm_store_ps(a_array, a); + for (int32 i = 0; i < 4; ++i) { + result[i] = cosf(a_array[i]); + } + return _mm_load_ps(result); + } + + inline __m128 _mm_asin_ps(__m128 a) { /* Applies asinf to each of the 4 f32 lanes of a */ + alignas(16) f32 a_array[4], result[4]; + _mm_store_ps(a_array, a); + for (int32 i = 0; i < 4; ++i) { + result[i] = asinf(a_array[i]); + } + return _mm_load_ps(result); + } + + inline __m128 _mm_acos_ps(__m128 a) { /* Applies acosf to each of the 4 f32 lanes of a */ + alignas(16) f32 a_array[4], result[4]; + _mm_store_ps(a_array, a); + for (int32 i = 0; i < 4; ++i) { + result[i] = acosf(a_array[i]); + } + return _mm_load_ps(result); + } +#endif + +#endif \ No newline at end of file diff --git a/architecture/x86/simd/utils/Utils.h b/architecture/x86/simd/utils/Utils.h old mode 100644 new mode 100755 index 9554392..b5633c6 --- a/architecture/x86/simd/utils/Utils.h +++ b/architecture/x86/simd/utils/Utils.h @@ -17,6 +17,7 @@ // Only allowed for data >= 64 bits bool is_empty(const byte* region, uint64 size, int32 steps = 8) { + // Quick check of first 8 bytes if (*((uint64 *) region) != 0) { return false; } @@ -25,40 +26,52 @@ bool is_empty(const byte* 
region, uint64 size, int32 steps = 8) steps = intrin_validate_steps(region, steps); switch (steps) { - case 16: { - while (region + 64 <= end) { - __m512i chunk = _mm512_loadu_si512((const __m512i *) region); - __mmask64 mask = _mm512_cmpeq_epi8_mask(chunk, _mm512_setzero_si512()); - if (mask != 0xFFFFFFFFFFFFFFFF) { + #ifdef MACRO_CPU_FEATURE_AVX512 + case 16: { + while (region + 64 <= end) { + __m512i chunk = _mm512_load_si512((const __m512i *) region); + __mmask64 mask = _mm512_cmpeq_epi8_mask(chunk, _mm512_setzero_si512()); + if (mask != 0xFFFFFFFFFFFFFFFF) { + return false; + } + + region += 64; + } + }; + [[fallthrough]]; + #else + case 16: [[fallthrough]]; + #endif + #ifdef MACRO_CPU_FEATURE_AVX2 + case 8: { + while (region + 32 <= end) { + __m256i chunk = _mm256_load_si256((const __m256i *) region); + if (!_mm256_testz_si256(chunk, chunk)) { return false; } - region += 64; + region += 32; } }; [[fallthrough]]; - case 8: { - while (region + 32 <= end) { - __m256i chunk = _mm256_loadu_si256((const __m256i *) region); - if (!_mm256_testz_si256(chunk, chunk)) { - return false; - } + #else + case 8: [[fallthrough]]; + #endif + #ifdef MACRO_CPU_FEATURE_SSE42 + case 4: { + while (region + 16 <= end) { + __m128i chunk = _mm_load_si128((const __m128i *) region); + if (!_mm_testz_si128(chunk, chunk)) { + return false; + } - region += 32; - } - }; - [[fallthrough]]; - case 4: { - while (region + 16 <= end) { - __m128i chunk = _mm_loadu_si128((const __m128i *) region); - if (!_mm_testz_si128(chunk, chunk)) { - return false; + region += 16; } - - region += 16; } - } - [[fallthrough]]; + [[fallthrough]]; + #else + case 4: [[fallthrough]]; + #endif case 1: { while (region + 4 <= end) { if (*((const uint32_t *) region) != 0) { diff --git a/asset/Asset.h b/asset/Asset.h old mode 100644 new mode 100755 diff --git a/asset/AssetArchive.h b/asset/AssetArchive.h old mode 100644 new mode 100755 diff --git a/asset/AssetManagementSystem.h b/asset/AssetManagementSystem.h old mode 
100644 new mode 100755 diff --git a/asset/AssetType.h b/asset/AssetType.h old mode 100644 new mode 100755 diff --git a/audio/Audio.cpp b/audio/Audio.cpp old mode 100644 new mode 100755 diff --git a/audio/Audio.h b/audio/Audio.h old mode 100644 new mode 100755 diff --git a/audio/AudioMixer.h b/audio/AudioMixer.h old mode 100644 new mode 100755 diff --git a/audio/AudioSetting.h b/audio/AudioSetting.h old mode 100644 new mode 100755 diff --git a/audio/Qoa.h b/audio/Qoa.h old mode 100644 new mode 100755 diff --git a/audio/QoaSimd.h b/audio/QoaSimd.h old mode 100644 new mode 100755 diff --git a/audio/Wav.h b/audio/Wav.h old mode 100644 new mode 100755 diff --git a/auth/Auth.h b/auth/Auth.h old mode 100644 new mode 100755 diff --git a/camera/Camera.h b/camera/Camera.h old mode 100644 new mode 100755 diff --git a/camera/CameraMovement.h b/camera/CameraMovement.h old mode 100644 new mode 100755 diff --git a/color/ColorVisionDeficiency.h b/color/ColorVisionDeficiency.h old mode 100644 new mode 100755 diff --git a/command/AppCmdBuffer.cpp b/command/AppCmdBuffer.cpp old mode 100644 new mode 100755 diff --git a/command/AppCmdBuffer.h b/command/AppCmdBuffer.h old mode 100644 new mode 100755 diff --git a/command/Command.h b/command/Command.h old mode 100644 new mode 100755 diff --git a/compiler/CompilerUtils.h b/compiler/CompilerUtils.h old mode 100644 new mode 100755 diff --git a/compiler/gcc/Atomic.h b/compiler/gcc/Atomic.h old mode 100644 new mode 100755 diff --git a/compiler/gcc/CompilerUtils.h b/compiler/gcc/CompilerUtils.h old mode 100644 new mode 100755 diff --git a/compiler/msvc/CompilerUtils.h b/compiler/msvc/CompilerUtils.h old mode 100644 new mode 100755 diff --git a/compression/Huffman.h b/compression/Huffman.h old mode 100644 new mode 100755 diff --git a/compression/LZP.h b/compression/LZP.h old mode 100644 new mode 100755 diff --git a/compression/RLE.h b/compression/RLE.h old mode 100644 new mode 100755 diff --git a/database/Database.h b/database/Database.h old 
mode 100644 new mode 100755 diff --git a/database/DatabaseConnection.h b/database/DatabaseConnection.h old mode 100644 new mode 100755 diff --git a/database/DatabaseType.h b/database/DatabaseType.h old mode 100644 new mode 100755 diff --git a/encryption/CeasarEncryption.h b/encryption/CeasarEncryption.h old mode 100644 new mode 100755 diff --git a/encryption/XorEncryption.h b/encryption/XorEncryption.h old mode 100644 new mode 100755 diff --git a/entity/AnimationEntityComponent.h b/entity/AnimationEntityComponent.h old mode 100644 new mode 100755 diff --git a/entity/CursorEntity.h b/entity/CursorEntity.h old mode 100644 new mode 100755 diff --git a/entity/Entity.h b/entity/Entity.h old mode 100644 new mode 100755 diff --git a/entity/EntityComponentSystem.h b/entity/EntityComponentSystem.h old mode 100644 new mode 100755 diff --git a/entity/EntitySize.h b/entity/EntitySize.h old mode 100644 new mode 100755 diff --git a/environment/Globe.h b/environment/Globe.h old mode 100644 new mode 100755 diff --git a/environment/Universe.h b/environment/Universe.h old mode 100644 new mode 100755 diff --git a/error/HammingCodes.h b/error/HammingCodes.h old mode 100644 new mode 100755 diff --git a/font/Font.h b/font/Font.h old mode 100644 new mode 100755 diff --git a/font/font_characters.txt b/font/font_characters.txt old mode 100644 new mode 100755 diff --git a/gpuapi/AntiAliasing.h b/gpuapi/AntiAliasing.h old mode 100644 new mode 100755 diff --git a/gpuapi/GpuApiType.h b/gpuapi/GpuApiType.h old mode 100644 new mode 100755 diff --git a/gpuapi/GpuAttributeType.h b/gpuapi/GpuAttributeType.h old mode 100644 new mode 100755 diff --git a/gpuapi/RenderUtils.h b/gpuapi/RenderUtils.h old mode 100644 new mode 100755 diff --git a/gpuapi/ShaderType.h b/gpuapi/ShaderType.h old mode 100644 new mode 100755 diff --git a/gpuapi/direct3d/AppCmdBuffer.h b/gpuapi/direct3d/AppCmdBuffer.h old mode 100644 new mode 100755 diff --git a/gpuapi/direct3d/DirectXUtils.h b/gpuapi/direct3d/DirectXUtils.h old 
mode 100644 new mode 100755 diff --git a/gpuapi/direct3d/FramesInFlightContainer.h b/gpuapi/direct3d/FramesInFlightContainer.h old mode 100644 new mode 100755 diff --git a/gpuapi/direct3d/Shader.h b/gpuapi/direct3d/Shader.h old mode 100644 new mode 100755 diff --git a/gpuapi/direct3d/ShaderUtils.h b/gpuapi/direct3d/ShaderUtils.h old mode 100644 new mode 100755 diff --git a/gpuapi/opengl/AppCmdBuffer.h b/gpuapi/opengl/AppCmdBuffer.h old mode 100644 new mode 100755 diff --git a/gpuapi/opengl/FramesInFlightContainer.h b/gpuapi/opengl/FramesInFlightContainer.h old mode 100644 new mode 100755 diff --git a/gpuapi/opengl/Opengl.h b/gpuapi/opengl/Opengl.h old mode 100644 new mode 100755 diff --git a/gpuapi/opengl/OpenglDefines.h b/gpuapi/opengl/OpenglDefines.h old mode 100644 new mode 100755 diff --git a/gpuapi/opengl/OpenglDescriptorSetLayoutBinding.h b/gpuapi/opengl/OpenglDescriptorSetLayoutBinding.h old mode 100644 new mode 100755 diff --git a/gpuapi/opengl/OpenglLinux.h b/gpuapi/opengl/OpenglLinux.h old mode 100644 new mode 100755 diff --git a/gpuapi/opengl/OpenglUtils.h b/gpuapi/opengl/OpenglUtils.h old mode 100644 new mode 100755 diff --git a/gpuapi/opengl/OpenglWin32.h b/gpuapi/opengl/OpenglWin32.h old mode 100644 new mode 100755 diff --git a/gpuapi/opengl/Shader.h b/gpuapi/opengl/Shader.h old mode 100644 new mode 100755 diff --git a/gpuapi/opengl/ShaderUtils.h b/gpuapi/opengl/ShaderUtils.h old mode 100644 new mode 100755 diff --git a/gpuapi/vulkan/AppCmdBuffer.h b/gpuapi/vulkan/AppCmdBuffer.h old mode 100644 new mode 100755 diff --git a/gpuapi/vulkan/FramesInFlightContainer.h b/gpuapi/vulkan/FramesInFlightContainer.h old mode 100644 new mode 100755 diff --git a/gpuapi/vulkan/Shader.h b/gpuapi/vulkan/Shader.h old mode 100644 new mode 100755 diff --git a/gpuapi/vulkan/ShaderUtils.h b/gpuapi/vulkan/ShaderUtils.h old mode 100644 new mode 100755 diff --git a/gpuapi/vulkan/VulkanUtils.h b/gpuapi/vulkan/VulkanUtils.h old mode 100644 new mode 100755 diff --git a/hash/Crc.h 
b/hash/Crc.h old mode 100644 new mode 100755 diff --git a/hash/GeneralHash.h b/hash/GeneralHash.h old mode 100644 new mode 100755 index 9a5901f..bc4cd02 --- a/hash/GeneralHash.h +++ b/hash/GeneralHash.h @@ -402,8 +402,8 @@ uint32 intrin_hash(uint64 a, uint64 b = 0) noexcept }; __m128i hash = _mm_set_epi64x(a, b); - hash = _mm_aesdec_si128(hash, _mm_loadu_si128((__m128i *) seed)); - hash = _mm_aesdec_si128(hash, _mm_loadu_si128((__m128i *) seed)); + hash = _mm_aesdec_si128(hash, _mm_load_si128((__m128i *) seed)); + hash = _mm_aesdec_si128(hash, _mm_load_si128((__m128i *) seed)); return _mm_extract_epi32(hash, 0); } diff --git a/html/template/HtmlTemplate.h b/html/template/HtmlTemplate.h old mode 100644 new mode 100755 diff --git a/html/template/HtmlTemplateCache.h b/html/template/HtmlTemplateCache.h old mode 100644 new mode 100755 diff --git a/html/template/HtmlTemplateContext.h b/html/template/HtmlTemplateContext.h old mode 100644 new mode 100755 diff --git a/html/template/HtmlTemplateInterpreter.h b/html/template/HtmlTemplateInterpreter.h old mode 100644 new mode 100755 diff --git a/html/template/HtmlTemplateLexer.h b/html/template/HtmlTemplateLexer.h old mode 100644 new mode 100755 diff --git a/html/template/HtmlTemplateParser.h b/html/template/HtmlTemplateParser.h old mode 100644 new mode 100755 diff --git a/image/Bitmap.h b/image/Bitmap.h old mode 100644 new mode 100755 diff --git a/image/Image.cpp b/image/Image.cpp old mode 100644 new mode 100755 diff --git a/image/Image.h b/image/Image.h old mode 100644 new mode 100755 diff --git a/image/Png.h b/image/Png.h old mode 100644 new mode 100755 diff --git a/image/Qoi.h b/image/Qoi.h old mode 100644 new mode 100755 diff --git a/image/Tga.h b/image/Tga.h old mode 100644 new mode 100755 diff --git a/image/default_colors.h b/image/default_colors.h old mode 100644 new mode 100755 diff --git a/image/default_colors.htm b/image/default_colors.htm old mode 100644 new mode 100755 diff --git a/image/stb_image.h 
b/image/stb_image.h old mode 100644 new mode 100755 diff --git a/input/ControllerInput.h b/input/ControllerInput.h old mode 100644 new mode 100755 diff --git a/input/ControllerType.h b/input/ControllerType.h old mode 100644 new mode 100755 diff --git a/input/Input.h b/input/Input.h old mode 100644 new mode 100755 diff --git a/input/InputConnectionType.h b/input/InputConnectionType.h old mode 100644 new mode 100755 diff --git a/light/Material.h b/light/Material.h old mode 100644 new mode 100755 diff --git a/localization/Dialog.h b/localization/Dialog.h old mode 100644 new mode 100755 diff --git a/localization/Language.h b/localization/Language.h old mode 100644 new mode 100755 diff --git a/log/DebugContainer.h b/log/DebugContainer.h old mode 100644 new mode 100755 diff --git a/log/DebugMemory.h b/log/DebugMemory.h old mode 100644 new mode 100755 diff --git a/log/Log.h b/log/Log.h old mode 100644 new mode 100755 index 9fc45ee..0d6a085 --- a/log/Log.h +++ b/log/Log.h @@ -20,8 +20,6 @@ * Debug builds also log to the debug console, or alternative standard output if no dedicated debug console is available */ -#define LOG_DATA_ARRAY 5 - #ifndef LOG_LEVEL // 0 = no logging at all // 1 = release logging @@ -93,6 +91,7 @@ struct LogData { void* value; }; +#define LOG_DATA_ARRAY 5 struct LogDataArray{ LogData data[LOG_DATA_ARRAY]; }; @@ -143,6 +142,7 @@ void log_to_file() } // Same as log_to_file with the exception that reset the log pos to avoid repeated output +inline void log_flush() { if (!_log_memory || _log_memory->pos == 0 || !_log_fp) { @@ -178,8 +178,12 @@ void log(const char* str, const char* file, const char* function, int32 line) str += message_length; len -= MAX_LOG_LENGTH - sizeof(LogMessage); - #if DEBUG + #if DEBUG || VERBOSE // In debug mode we always output the log message to the debug console + char time_str[9]; + format_time_hh_mm_ss(time_str, msg->time / 1000000ULL); + compiler_debug_print(time_str); + compiler_debug_print(" "); 
compiler_debug_print(msg->message); compiler_debug_print("\n"); #endif @@ -260,6 +264,10 @@ void log(const char* format, LogDataArray data, const char* file, const char* fu #if DEBUG || VERBOSE // In debug mode we always output the log message to the debug console + char time_str[9]; + format_time_hh_mm_ss(time_str, msg->time / 1000000ULL); + compiler_debug_print(time_str); + compiler_debug_print(" "); compiler_debug_print(msg->message); compiler_debug_print("\n"); #endif diff --git a/log/PerformanceProfiler.h b/log/PerformanceProfiler.h old mode 100644 new mode 100755 diff --git a/log/Stats.h b/log/Stats.h old mode 100644 new mode 100755 diff --git a/math/Evaluator.h b/math/Evaluator.h old mode 100644 new mode 100755 diff --git a/math/PerlinNoise.h b/math/PerlinNoise.h old mode 100644 new mode 100755 diff --git a/math/matrix/MatrixFloat32.h b/math/matrix/MatrixFloat32.h old mode 100644 new mode 100755 index faca090..ce2ab1a --- a/math/matrix/MatrixFloat32.h +++ b/math/matrix/MatrixFloat32.h @@ -518,13 +518,13 @@ void mat4mat4_mult(const f32* __restrict a, const f32* __restrict b, f32* __rest inline void mat4mat4_mult_simd(const f32* __restrict a, const f32* __restrict b, f32* __restrict result) noexcept { - __m128 row1 = _mm_loadu_ps(&b[0]); - __m128 row2 = _mm_loadu_ps(&b[4]); - __m128 row3 = _mm_loadu_ps(&b[8]); - __m128 row4 = _mm_loadu_ps(&b[12]); + __m128 row1 = _mm_load_ps(&b[0]); + __m128 row2 = _mm_load_ps(&b[4]); + __m128 row3 = _mm_load_ps(&b[8]); + __m128 row4 = _mm_load_ps(&b[12]); for (int32 i = 3; i >= 0; --i) { - __m128 vW = _mm_loadu_ps(&a[i * 4]); + __m128 vW = _mm_load_ps(&a[i * 4]); __m128 vX = _mm_shuffle_ps(vW, vW, _MM_SHUFFLE(0, 0, 0, 0)); __m128 vY = _mm_shuffle_ps(vW, vW, _MM_SHUFFLE(1, 1, 1, 1)); @@ -540,7 +540,7 @@ void mat4mat4_mult_simd(const f32* __restrict a, const f32* __restrict b, f32* _ vY = _mm_add_ps(vY, vW); vX = _mm_add_ps(vX, vY); - _mm_storeu_ps(&result[i * 4], vX); + _mm_store_ps(&result[i * 4], vX); } } diff --git 
a/math/matrix/MatrixInt32.h b/math/matrix/MatrixInt32.h old mode 100644 new mode 100755 diff --git a/math/matrix/MatrixInt64.h b/math/matrix/MatrixInt64.h old mode 100644 new mode 100755 diff --git a/math/matrix/QuaternionFloat32.h b/math/matrix/QuaternionFloat32.h old mode 100644 new mode 100755 diff --git a/math/matrix/VectorFloat32.h b/math/matrix/VectorFloat32.h old mode 100644 new mode 100755 diff --git a/math/matrix/VectorFloat64.h b/math/matrix/VectorFloat64.h old mode 100644 new mode 100755 diff --git a/math/matrix/VectorInt32.h b/math/matrix/VectorInt32.h old mode 100644 new mode 100755 diff --git a/math/matrix/VectorInt64.h b/math/matrix/VectorInt64.h old mode 100644 new mode 100755 diff --git a/math/random/BlueNoise.h b/math/random/BlueNoise.h old mode 100644 new mode 100755 diff --git a/memory/BufferMemory.h b/memory/BufferMemory.h old mode 100644 new mode 100755 diff --git a/memory/ChunkMemory.h b/memory/ChunkMemory.h old mode 100644 new mode 100755 diff --git a/memory/Heap.h b/memory/Heap.h old mode 100644 new mode 100755 diff --git a/memory/Queue.h b/memory/Queue.h old mode 100644 new mode 100755 diff --git a/memory/RingMemory.h b/memory/RingMemory.h old mode 100644 new mode 100755 diff --git a/memory/ThreadedChunkMemory.h b/memory/ThreadedChunkMemory.h old mode 100644 new mode 100755 diff --git a/memory/ThreadedQueue.h b/memory/ThreadedQueue.h old mode 100644 new mode 100755 diff --git a/memory/ThreadedRingMemory.h b/memory/ThreadedRingMemory.h old mode 100644 new mode 100755 diff --git a/models/Colors.h b/models/Colors.h old mode 100644 new mode 100755 diff --git a/models/Location.h b/models/Location.h old mode 100644 new mode 100755 diff --git a/models/Map/map_chunks.h b/models/Map/map_chunks.h old mode 100644 new mode 100755 diff --git a/models/Obj.h b/models/Obj.h old mode 100644 new mode 100755 diff --git a/models/Sound.h b/models/Sound.h old mode 100644 new mode 100755 diff --git a/models/account/Account.h b/models/account/Account.h old mode 
100644 new mode 100755 diff --git a/models/bracket/Bracket.h b/models/bracket/Bracket.h old mode 100644 new mode 100755 diff --git a/models/bracket/BracketMatch.h b/models/bracket/BracketMatch.h old mode 100644 new mode 100755 diff --git a/models/bracket/BracketSeeding.h b/models/bracket/BracketSeeding.h old mode 100644 new mode 100755 diff --git a/models/bracket/BracketTeam.h b/models/bracket/BracketTeam.h old mode 100644 new mode 100755 diff --git a/models/chat/Chat.h b/models/chat/Chat.h old mode 100644 new mode 100755 diff --git a/models/chat/ChatLevel.h b/models/chat/ChatLevel.h old mode 100644 new mode 100755 diff --git a/models/chat/ChatStatus.h b/models/chat/ChatStatus.h old mode 100644 new mode 100755 diff --git a/models/chat/ChatType.h b/models/chat/ChatType.h old mode 100644 new mode 100755 diff --git a/models/event/Event.h b/models/event/Event.h old mode 100644 new mode 100755 diff --git a/models/event/EventTaskType.h b/models/event/EventTaskType.h old mode 100644 new mode 100755 diff --git a/models/event/tmp b/models/event/tmp old mode 100644 new mode 100755 diff --git a/models/extension/ExtensionType.h b/models/extension/ExtensionType.h old mode 100644 new mode 100755 diff --git a/models/guild/GuildBanner.h b/models/guild/GuildBanner.h old mode 100644 new mode 100755 diff --git a/models/item/Consumable.h b/models/item/Consumable.h old mode 100644 new mode 100755 diff --git a/models/item/ConsumableType.h b/models/item/ConsumableType.h old mode 100644 new mode 100755 diff --git a/models/item/Equipment.cpp b/models/item/Equipment.cpp old mode 100644 new mode 100755 diff --git a/models/item/Equipment.h b/models/item/Equipment.h old mode 100644 new mode 100755 diff --git a/models/item/EquipmentType.h b/models/item/EquipmentType.h old mode 100644 new mode 100755 diff --git a/models/item/Item.h b/models/item/Item.h old mode 100644 new mode 100755 diff --git a/models/item/ItemAffixDistribution.h b/models/item/ItemAffixDistribution.h old mode 100644 new mode 
100755 diff --git a/models/item/ItemLevelStats.h b/models/item/ItemLevelStats.h old mode 100644 new mode 100755 diff --git a/models/item/ItemRarityDefinition.h b/models/item/ItemRarityDefinition.h old mode 100644 new mode 100755 diff --git a/models/item/ItemRarityStats.h b/models/item/ItemRarityStats.h old mode 100644 new mode 100755 diff --git a/models/item/ItemStatsDistribution.h b/models/item/ItemStatsDistribution.h old mode 100644 new mode 100755 diff --git a/models/item/MobLevelStats.h b/models/item/MobLevelStats.h old mode 100644 new mode 100755 diff --git a/models/item/_equipment_slots.h b/models/item/_equipment_slots.h old mode 100644 new mode 100755 diff --git a/models/item/_equipment_types.h b/models/item/_equipment_types.h old mode 100644 new mode 100755 diff --git a/models/item/_item_rarity.h b/models/item/_item_rarity.h old mode 100644 new mode 100755 diff --git a/models/map.h b/models/map.h old mode 100644 new mode 100755 diff --git a/models/mob/ActivityStats.h b/models/mob/ActivityStats.h old mode 100644 new mode 100755 diff --git a/models/mob/FixedStats.h b/models/mob/FixedStats.h old mode 100644 new mode 100755 diff --git a/models/mob/Mob.cpp b/models/mob/Mob.cpp old mode 100644 new mode 100755 diff --git a/models/mob/Mob.h b/models/mob/Mob.h old mode 100644 new mode 100755 diff --git a/models/mob/MobAction.h b/models/mob/MobAction.h old mode 100644 new mode 100755 diff --git a/models/mob/MobCategory.h b/models/mob/MobCategory.h old mode 100644 new mode 100755 diff --git a/models/mob/MobState.h b/models/mob/MobState.h old mode 100644 new mode 100755 diff --git a/models/mob/MobStats.cpp b/models/mob/MobStats.cpp old mode 100644 new mode 100755 diff --git a/models/mob/MobStats.h b/models/mob/MobStats.h old mode 100644 new mode 100755 diff --git a/models/mob/MobStatsType.h b/models/mob/MobStatsType.h old mode 100644 new mode 100755 diff --git a/models/mob/PrimaryStatsPoints.cpp b/models/mob/PrimaryStatsPoints.cpp old mode 100644 new mode 100755 diff 
--git a/models/mob/PrimaryStatsPoints.h b/models/mob/PrimaryStatsPoints.h old mode 100644 new mode 100755 diff --git a/models/mob/SecondaryStatsPoints.cpp b/models/mob/SecondaryStatsPoints.cpp old mode 100644 new mode 100755 diff --git a/models/mob/SecondaryStatsPoints.h b/models/mob/SecondaryStatsPoints.h old mode 100644 new mode 100755 diff --git a/models/mob/_mob_category.h b/models/mob/_mob_category.h old mode 100644 new mode 100755 diff --git a/models/mob/_mob_list.h b/models/mob/_mob_list.h old mode 100644 new mode 100755 diff --git a/models/mob/monster/Drop.h b/models/mob/monster/Drop.h old mode 100644 new mode 100755 diff --git a/models/mob/monster/LootTable.h b/models/mob/monster/LootTable.h old mode 100644 new mode 100755 diff --git a/models/mob/monster/Monster.h b/models/mob/monster/Monster.h old mode 100644 new mode 100755 diff --git a/models/mob/monster/MonsterStats.h b/models/mob/monster/MonsterStats.h old mode 100644 new mode 100755 diff --git a/models/mob/player/Backpack.h b/models/mob/player/Backpack.h old mode 100644 new mode 100755 diff --git a/models/mob/player/Guild.h b/models/mob/player/Guild.h old mode 100644 new mode 100755 diff --git a/models/mob/player/LootFilter.h b/models/mob/player/LootFilter.h old mode 100644 new mode 100755 diff --git a/models/mob/player/Player.cpp b/models/mob/player/Player.cpp old mode 100644 new mode 100755 diff --git a/models/mob/player/Player.h b/models/mob/player/Player.h old mode 100644 new mode 100755 diff --git a/models/mob/player/PlayerStats.h b/models/mob/player/PlayerStats.h old mode 100644 new mode 100755 diff --git a/models/mob/player/PlayerXPRequirement.h b/models/mob/player/PlayerXPRequirement.h old mode 100644 new mode 100755 diff --git a/models/mob/player/Reputation.h b/models/mob/player/Reputation.h old mode 100644 new mode 100755 diff --git a/models/mob/player/_player_class.h b/models/mob/player/_player_class.h old mode 100644 new mode 100755 diff --git a/models/mob/skill/AoeDistribution.h 
b/models/mob/skill/AoeDistribution.h old mode 100644 new mode 100755 diff --git a/models/mob/skill/AoeShape.h b/models/mob/skill/AoeShape.h old mode 100644 new mode 100755 diff --git a/models/mob/skill/ProjectileDistribution.h b/models/mob/skill/ProjectileDistribution.h old mode 100644 new mode 100755 diff --git a/models/mob/skill/Skill.h b/models/mob/skill/Skill.h old mode 100644 new mode 100755 diff --git a/models/mob/skill/SkillLocation.h b/models/mob/skill/SkillLocation.h old mode 100644 new mode 100755 diff --git a/models/mob/skill/StatsTarget.h b/models/mob/skill/StatsTarget.h old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/arcane_bolt.cfg b/models/mob/skill/definitions/arcane_bolt.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/arise.cfg b/models/mob/skill/definitions/arise.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/back_fist.cfg b/models/mob/skill/definitions/back_fist.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/beam.cfg b/models/mob/skill/definitions/beam.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/black_fist.cfg b/models/mob/skill/definitions/black_fist.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/chain.cfg b/models/mob/skill/definitions/chain.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/chain_lightning.cfg b/models/mob/skill/definitions/chain_lightning.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/corruption_bolt.cfg b/models/mob/skill/definitions/corruption_bolt.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/cyclone.cfg b/models/mob/skill/definitions/cyclone.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/dodge.cfg b/models/mob/skill/definitions/dodge.cfg old mode 100644 new mode 100755 diff --git 
a/models/mob/skill/definitions/earth_bolt.cfg b/models/mob/skill/definitions/earth_bolt.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/elemental_pilar.cfg b/models/mob/skill/definitions/elemental_pilar.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/fear.cfg b/models/mob/skill/definitions/fear.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/fire_bolt.cfg b/models/mob/skill/definitions/fire_bolt.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/frost_bolt.cfg b/models/mob/skill/definitions/frost_bolt.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/ghost_walk.cfg b/models/mob/skill/definitions/ghost_walk.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/health_inverse_dmg.cfg b/models/mob/skill/definitions/health_inverse_dmg.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/health_to_dmg.cfg b/models/mob/skill/definitions/health_to_dmg.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/holy_bolt.cfg b/models/mob/skill/definitions/holy_bolt.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/hook.cfg b/models/mob/skill/definitions/hook.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/kick.cfg b/models/mob/skill/definitions/kick.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/launch_strike.cfg b/models/mob/skill/definitions/launch_strike.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/lightning_bolt.cfg b/models/mob/skill/definitions/lightning_bolt.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/meteor_strike.cfg b/models/mob/skill/definitions/meteor_strike.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/mirage.cfg b/models/mob/skill/definitions/mirage.cfg old mode 
100644 new mode 100755 diff --git a/models/mob/skill/definitions/net.cfg b/models/mob/skill/definitions/net.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/palm_strike.cfg b/models/mob/skill/definitions/palm_strike.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/poison_strike.cfg b/models/mob/skill/definitions/poison_strike.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/pull.cfg b/models/mob/skill/definitions/pull.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/punch.cfg b/models/mob/skill/definitions/punch.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/push.cfg b/models/mob/skill/definitions/push.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/reflect.cfg b/models/mob/skill/definitions/reflect.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/revive.cfg b/models/mob/skill/definitions/revive.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/root.cfg b/models/mob/skill/definitions/root.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/sacrafice.cfg b/models/mob/skill/definitions/sacrafice.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/shield.cfg b/models/mob/skill/definitions/shield.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/side_kick.cfg b/models/mob/skill/definitions/side_kick.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/spikes.cfg b/models/mob/skill/definitions/spikes.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/sprint.cfg b/models/mob/skill/definitions/sprint.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/stomp.cfg b/models/mob/skill/definitions/stomp.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/summon.cfg 
b/models/mob/skill/definitions/summon.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/sword_dance.cfg b/models/mob/skill/definitions/sword_dance.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/teleport.cfg b/models/mob/skill/definitions/teleport.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/totem.cfg b/models/mob/skill/definitions/totem.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/uppercut.cfg b/models/mob/skill/definitions/uppercut.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/whirlwind.cfg b/models/mob/skill/definitions/whirlwind.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/definitions/wind_slashes.cfg b/models/mob/skill/definitions/wind_slashes.cfg old mode 100644 new mode 100755 diff --git a/models/mob/skill/modifiers/split_shot.cfg b/models/mob/skill/modifiers/split_shot.cfg old mode 100644 new mode 100755 diff --git a/models/object/Block.cpp b/models/object/Block.cpp old mode 100644 new mode 100755 diff --git a/models/object/Block.h b/models/object/Block.h old mode 100644 new mode 100755 diff --git a/models/object/Chunk.h b/models/object/Chunk.h old mode 100644 new mode 100755 diff --git a/models/object/Cube.h b/models/object/Cube.h old mode 100644 new mode 100755 diff --git a/models/object/Object.h b/models/object/Object.h old mode 100644 new mode 100755 diff --git a/models/object/ObjectType.h b/models/object/ObjectType.h old mode 100644 new mode 100755 diff --git a/models/object/_object_list.h b/models/object/_object_list.h old mode 100644 new mode 100755 diff --git a/models/object/_object_types.h b/models/object/_object_types.h old mode 100644 new mode 100755 diff --git a/models/settings/DungeonSettings.h b/models/settings/DungeonSettings.h old mode 100644 new mode 100755 diff --git a/models/settings/ItemDistributionType.h b/models/settings/ItemDistributionType.h old mode 100644 
new mode 100755 diff --git a/models/settings/Settings.h b/models/settings/Settings.h old mode 100644 new mode 100755 diff --git a/models/settings/setting_types.h b/models/settings/setting_types.h old mode 100644 new mode 100755 diff --git a/module/Module.h b/module/Module.h old mode 100644 new mode 100755 diff --git a/module/ModuleManager.h b/module/ModuleManager.h old mode 100644 new mode 100755 diff --git a/network/Server.h b/network/Server.h old mode 100644 new mode 100755 diff --git a/network/Socket.h b/network/Socket.h old mode 100644 new mode 100755 diff --git a/network/SocketConnection.h b/network/SocketConnection.h old mode 100644 new mode 100755 diff --git a/network/packet/OMSPacket.h b/network/packet/OMSPacket.h old mode 100644 new mode 100755 diff --git a/network/packet/PacketCache.h b/network/packet/PacketCache.h old mode 100644 new mode 100755 diff --git a/network/packet/PacketHeader.h b/network/packet/PacketHeader.h old mode 100644 new mode 100755 diff --git a/network/packet/UDPPacket.h b/network/packet/UDPPacket.h old mode 100644 new mode 100755 diff --git a/network/packet/chat/ChatMessagePacket.h b/network/packet/chat/ChatMessagePacket.h old mode 100644 new mode 100755 diff --git a/network/packet/general/AckPacket.h b/network/packet/general/AckPacket.h old mode 100644 new mode 100755 diff --git a/network/packet/general/PingPacket.h b/network/packet/general/PingPacket.h old mode 100644 new mode 100755 diff --git a/network/packet/mob/MobInfoPacket.h b/network/packet/mob/MobInfoPacket.h old mode 100644 new mode 100755 diff --git a/network/packet/mob/MobStatePacket.h b/network/packet/mob/MobStatePacket.h old mode 100644 new mode 100755 diff --git a/network/packet/mob/player/PlayerInfoPacket.h b/network/packet/mob/player/PlayerInfoPacket.h old mode 100644 new mode 100755 diff --git a/network/packet/mob/player/PlayerState.h b/network/packet/mob/player/PlayerState.h old mode 100644 new mode 100755 diff --git a/network/packet/packet_types.h 
b/network/packet/packet_types.h old mode 100644 new mode 100755 diff --git a/noise/FractalNoise.h b/noise/FractalNoise.h old mode 100644 new mode 100755 diff --git a/noise/PerlinNoise.h b/noise/PerlinNoise.h old mode 100644 new mode 100755 diff --git a/noise/SimplexNoise.h b/noise/SimplexNoise.h old mode 100644 new mode 100755 diff --git a/noise/ValueNoise.h b/noise/ValueNoise.h old mode 100644 new mode 100755 diff --git a/noise/WorleyNoise.h b/noise/WorleyNoise.h old mode 100644 new mode 100755 diff --git a/object/Animation.h b/object/Animation.h old mode 100644 new mode 100755 diff --git a/object/Hitbox.h b/object/Hitbox.h old mode 100644 new mode 100755 diff --git a/object/Material.h b/object/Material.h old mode 100644 new mode 100755 diff --git a/object/Materials.md b/object/Materials.md old mode 100644 new mode 100755 diff --git a/object/Mesh.h b/object/Mesh.h old mode 100644 new mode 100755 diff --git a/object/Model.h b/object/Model.h old mode 100644 new mode 100755 diff --git a/object/Texture.h b/object/Texture.h old mode 100644 new mode 100755 diff --git a/object/Vertex.h b/object/Vertex.h old mode 100644 new mode 100755 diff --git a/particle/Particle.h b/particle/Particle.h old mode 100644 new mode 100755 diff --git a/pathfinding/Jpsp.h b/pathfinding/Jpsp.h old mode 100644 new mode 100755 diff --git a/pathfinding/Metric2d.h b/pathfinding/Metric2d.h old mode 100644 new mode 100755 diff --git a/pathfinding/Metric3d.h b/pathfinding/Metric3d.h old mode 100644 new mode 100755 diff --git a/pathfinding/Path.h b/pathfinding/Path.h old mode 100644 new mode 100755 diff --git a/pathfinding/jps/Jps.h b/pathfinding/jps/Jps.h old mode 100644 new mode 100755 diff --git a/pathfinding/jps/JpsGrid.h b/pathfinding/jps/JpsGrid.h old mode 100644 new mode 100755 diff --git a/pathfinding/jps/JpsNode.h b/pathfinding/jps/JpsNode.h old mode 100644 new mode 100755 diff --git a/platform/linux/Allocator.h b/platform/linux/Allocator.h old mode 100644 new mode 100755 diff --git 
a/platform/linux/ExceptionHandler.h b/platform/linux/ExceptionHandler.h old mode 100644 new mode 100755 diff --git a/platform/linux/FileUtils.cpp b/platform/linux/FileUtils.cpp old mode 100644 new mode 100755 diff --git a/platform/linux/Library.cpp b/platform/linux/Library.cpp old mode 100644 new mode 100755 diff --git a/platform/linux/Library.h b/platform/linux/Library.h old mode 100644 new mode 100755 diff --git a/platform/linux/SystemInfo.cpp b/platform/linux/SystemInfo.cpp old mode 100644 new mode 100755 diff --git a/platform/linux/TimeUtils.h b/platform/linux/TimeUtils.h old mode 100644 new mode 100755 index c05a0d0..0ea6c0a --- a/platform/linux/TimeUtils.h +++ b/platform/linux/TimeUtils.h @@ -14,10 +14,10 @@ #include "../../stdlib/Types.h" uint64 system_time() { - struct timeval tv; - gettimeofday(&tv, NULL); + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); - return (uint64) tv.tv_sec * 1000000ULL + (uint64) tv.tv_usec; + return (uint64_t) ts.tv_sec * 1000000ULL + (uint64_t) ts.tv_nsec / 1000ULL; } uint64 time_mu() { diff --git a/platform/linux/UtilsLinux.h b/platform/linux/UtilsLinux.h old mode 100644 new mode 100755 diff --git a/platform/linux/network/Server.h b/platform/linux/network/Server.h old mode 100644 new mode 100755 diff --git a/platform/linux/network/Socket.h b/platform/linux/network/Socket.h old mode 100644 new mode 100755 diff --git a/platform/linux/threading/Atomic.h b/platform/linux/threading/Atomic.h old mode 100644 new mode 100755 diff --git a/platform/linux/threading/Semaphore.h b/platform/linux/threading/Semaphore.h old mode 100644 new mode 100755 diff --git a/platform/linux/threading/Spinlock.cpp b/platform/linux/threading/Spinlock.cpp old mode 100644 new mode 100755 diff --git a/platform/linux/threading/Spinlock.h b/platform/linux/threading/Spinlock.h old mode 100644 new mode 100755 diff --git a/platform/linux/threading/Thread.h b/platform/linux/threading/Thread.h old mode 100644 new mode 100755 diff --git 
a/platform/linux/threading/ThreadDefines.h b/platform/linux/threading/ThreadDefines.h old mode 100644 new mode 100755 diff --git a/platform/win32/Allocator.h b/platform/win32/Allocator.h old mode 100644 new mode 100755 diff --git a/platform/win32/Clipboard.h b/platform/win32/Clipboard.h old mode 100644 new mode 100755 diff --git a/platform/win32/ExceptionHandler.h b/platform/win32/ExceptionHandler.h old mode 100644 new mode 100755 diff --git a/platform/win32/FastPipes.h b/platform/win32/FastPipes.h old mode 100644 new mode 100755 diff --git a/platform/win32/FileUtils.cpp b/platform/win32/FileUtils.cpp old mode 100644 new mode 100755 diff --git a/platform/win32/LeanWin32.h b/platform/win32/LeanWin32.h old mode 100644 new mode 100755 diff --git a/platform/win32/Library.cpp b/platform/win32/Library.cpp old mode 100644 new mode 100755 diff --git a/platform/win32/Library.h b/platform/win32/Library.h old mode 100644 new mode 100755 diff --git a/platform/win32/SystemInfo.cpp b/platform/win32/SystemInfo.cpp old mode 100644 new mode 100755 diff --git a/platform/win32/TimeUtils.h b/platform/win32/TimeUtils.h old mode 100644 new mode 100755 diff --git a/platform/win32/UtilsWin32.h b/platform/win32/UtilsWin32.h old mode 100644 new mode 100755 diff --git a/platform/win32/UtilsWindows.h b/platform/win32/UtilsWindows.h old mode 100644 new mode 100755 diff --git a/platform/win32/Window.h b/platform/win32/Window.h old mode 100644 new mode 100755 diff --git a/platform/win32/audio/DirectSound.h b/platform/win32/audio/DirectSound.h old mode 100644 new mode 100755 diff --git a/platform/win32/audio/Wasapi.h b/platform/win32/audio/Wasapi.h old mode 100644 new mode 100755 diff --git a/platform/win32/audio/XAudio2.h b/platform/win32/audio/XAudio2.h old mode 100644 new mode 100755 diff --git a/platform/win32/input/DirectInput.h b/platform/win32/input/DirectInput.h old mode 100644 new mode 100755 diff --git a/platform/win32/input/HidInput.h b/platform/win32/input/HidInput.h old mode 100644 
new mode 100755 diff --git a/platform/win32/input/RawInput.h b/platform/win32/input/RawInput.h old mode 100644 new mode 100755 diff --git a/platform/win32/input/XInput.h b/platform/win32/input/XInput.h old mode 100644 new mode 100755 diff --git a/platform/win32/input/controller/ControllerHandler.h b/platform/win32/input/controller/ControllerHandler.h old mode 100644 new mode 100755 diff --git a/platform/win32/input/controller/DualSense.h b/platform/win32/input/controller/DualSense.h old mode 100644 new mode 100755 diff --git a/platform/win32/input/controller/DualShock4.h b/platform/win32/input/controller/DualShock4.h old mode 100644 new mode 100755 diff --git a/platform/win32/input/controller/XBoxS.h b/platform/win32/input/controller/XBoxS.h old mode 100644 new mode 100755 diff --git a/platform/win32/network/Client.h b/platform/win32/network/Client.h old mode 100644 new mode 100755 diff --git a/platform/win32/network/Server.h b/platform/win32/network/Server.h old mode 100644 new mode 100755 diff --git a/platform/win32/network/Socket.h b/platform/win32/network/Socket.h old mode 100644 new mode 100755 diff --git a/platform/win32/threading/Atomic.h b/platform/win32/threading/Atomic.h old mode 100644 new mode 100755 diff --git a/platform/win32/threading/Semaphore.h b/platform/win32/threading/Semaphore.h old mode 100644 new mode 100755 diff --git a/platform/win32/threading/Spinlock.cpp b/platform/win32/threading/Spinlock.cpp old mode 100644 new mode 100755 diff --git a/platform/win32/threading/Spinlock.h b/platform/win32/threading/Spinlock.h old mode 100644 new mode 100755 diff --git a/platform/win32/threading/Thread.h b/platform/win32/threading/Thread.h old mode 100644 new mode 100755 diff --git a/platform/win32/threading/ThreadDefines.h b/platform/win32/threading/ThreadDefines.h old mode 100644 new mode 100755 diff --git a/render/liquid.cpp b/render/liquid.cpp old mode 100644 new mode 100755 diff --git a/render/mob.cpp b/render/mob.cpp old mode 100644 new mode 100755 
diff --git a/render/object.cpp b/render/object.cpp old mode 100644 new mode 100755 diff --git a/render/sky.cpp b/render/sky.cpp old mode 100644 new mode 100755 diff --git a/render/text.cpp b/render/text.cpp old mode 100644 new mode 100755 diff --git a/scene/SceneInfo.h b/scene/SceneInfo.h old mode 100644 new mode 100755 diff --git a/shaders/liquids/lava.hlsl b/shaders/liquids/lava.hlsl old mode 100644 new mode 100755 diff --git a/shaders/liquids/water/cube_fragment.hlsl b/shaders/liquids/water/cube_fragment.hlsl old mode 100644 new mode 100755 diff --git a/shaders/liquids/water/cube_vertex.hlsl b/shaders/liquids/water/cube_vertex.hlsl old mode 100644 new mode 100755 diff --git a/shaders/liquids/water/helper.hlsli b/shaders/liquids/water/helper.hlsli old mode 100644 new mode 100755 diff --git a/shaders/liquids/water/sphere_fragment.hlsl b/shaders/liquids/water/sphere_fragment.hlsl old mode 100644 new mode 100755 diff --git a/shaders/liquids/water/sphere_vertex.hlsl b/shaders/liquids/water/sphere_vertex.hlsl old mode 100644 new mode 100755 diff --git a/shaders/liquids/water/water_above_fragment.hlsl b/shaders/liquids/water/water_above_fragment.hlsl old mode 100644 new mode 100755 diff --git a/shaders/liquids/water/water_below_fragment.hlsl b/shaders/liquids/water/water_below_fragment.hlsl old mode 100644 new mode 100755 diff --git a/shaders/liquids/water/water_caustics_fragment.hlsl b/shaders/liquids/water/water_caustics_fragment.hlsl old mode 100644 new mode 100755 diff --git a/shaders/liquids/water/water_caustics_vertex.hlsl b/shaders/liquids/water/water_caustics_vertex.hlsl old mode 100644 new mode 100755 diff --git a/shaders/liquids/water/water_vertex.hlsl b/shaders/liquids/water/water_vertex.hlsl old mode 100644 new mode 100755 diff --git a/shaders/nature/cloud.hlsl b/shaders/nature/cloud.hlsl old mode 100644 new mode 100755 diff --git a/shaders/nature/fire.hlsl b/shaders/nature/fire.hlsl old mode 100644 new mode 100755 diff --git a/shaders/nature/fog.hlsl 
b/shaders/nature/fog.hlsl old mode 100644 new mode 100755 diff --git a/shaders/nature/godray.hlsl b/shaders/nature/godray.hlsl old mode 100644 new mode 100755 diff --git a/shaders/nature/lightning.hlsl b/shaders/nature/lightning.hlsl old mode 100644 new mode 100755 diff --git a/shaders/nature/rain.hlsl b/shaders/nature/rain.hlsl old mode 100644 new mode 100755 diff --git a/shaders/nature/smoke.hlsl b/shaders/nature/smoke.hlsl old mode 100644 new mode 100755 diff --git a/shaders/nature/snow.hlsl b/shaders/nature/snow.hlsl old mode 100644 new mode 100755 diff --git a/shaders/shaders.hlsl b/shaders/shaders.hlsl old mode 100644 new mode 100755 diff --git a/sort/BinarySearch.h b/sort/BinarySearch.h old mode 100644 new mode 100755 diff --git a/sort/EytzingerSearch.h b/sort/EytzingerSearch.h old mode 100644 new mode 100755 diff --git a/sort/HeapSort.h b/sort/HeapSort.h old mode 100644 new mode 100755 diff --git a/sort/InsertionSort.h b/sort/InsertionSort.h old mode 100644 new mode 100755 diff --git a/sort/IntroSort.h b/sort/IntroSort.h old mode 100644 new mode 100755 diff --git a/sort/QuickSort.h b/sort/QuickSort.h old mode 100644 new mode 100755 diff --git a/sort/Sort.h b/sort/Sort.h old mode 100644 new mode 100755 diff --git a/stdlib/HashMap.h b/stdlib/HashMap.h old mode 100644 new mode 100755 diff --git a/stdlib/PerfectHashMap.h b/stdlib/PerfectHashMap.h old mode 100644 new mode 100755 diff --git a/stdlib/Simd.h b/stdlib/Simd.h old mode 100644 new mode 100755 index 44d87e4..1e8e85e --- a/stdlib/Simd.h +++ b/stdlib/Simd.h @@ -9,9 +9,16 @@ #ifndef COMS_STDLIB_SIMD_H #define COMS_STDLIB_SIMD_H +#include "../utils/TestUtils.h" + // Adjusts the step size based on the memory alignment inline int32 intrin_validate_steps(const byte* mem, int32 steps) { + // During development we want to spot invalid alignment + ASSERT_SIMPLE(steps < 16 || (steps >= 16 && ((uintptr_t) mem & 63) == 0)); + ASSERT_SIMPLE(steps < 8 || (steps >= 8 && ((uintptr_t) mem & 31) == 0)); + 
ASSERT_SIMPLE(steps < 4 || (steps >= 4 && ((uintptr_t) mem & 15) == 0)); + if (steps >= 16 && ((uintptr_t) mem & 63) == 0) { return 16; } else if (steps >= 8 && ((uintptr_t) mem & 31) == 0) { @@ -35,5 +42,4 @@ int32 intrin_validate_steps(const byte* mem, int32 steps) { #include "../architecture/x86/simd/SIMD_SVML.h" #endif - #endif \ No newline at end of file diff --git a/stdlib/ThreadedHashMap.h b/stdlib/ThreadedHashMap.h old mode 100644 new mode 100755 diff --git a/stdlib/Types.h b/stdlib/Types.h old mode 100644 new mode 100755 diff --git a/system/Allocator.h b/system/Allocator.h old mode 100644 new mode 100755 diff --git a/system/FileUtils.cpp b/system/FileUtils.cpp old mode 100644 new mode 100755 diff --git a/system/Library.cpp b/system/Library.cpp old mode 100644 new mode 100755 diff --git a/system/Library.h b/system/Library.h old mode 100644 new mode 100755 diff --git a/system/SystemInfo.cpp b/system/SystemInfo.cpp old mode 100644 new mode 100755 diff --git a/system/SystemInfo.h b/system/SystemInfo.h old mode 100644 new mode 100755 diff --git a/system/Window.h b/system/Window.h old mode 100644 new mode 100755 diff --git a/tests.bat b/tests.bat old mode 100644 new mode 100755 diff --git a/tests/.vscode/c_cpp_properties.json b/tests/.vscode/c_cpp_properties.json old mode 100644 new mode 100755 diff --git a/tests/.vscode/launch.json b/tests/.vscode/launch.json old mode 100644 new mode 100755 diff --git a/tests/.vscode/settings.json b/tests/.vscode/settings.json old mode 100644 new mode 100755 diff --git a/tests/.vscode/tasks.json b/tests/.vscode/tasks.json old mode 100644 new mode 100755 diff --git a/tests/MainTest.cpp b/tests/MainTest.cpp old mode 100644 new mode 100755 diff --git a/tests/TestFramework.h b/tests/TestFramework.h old mode 100644 new mode 100755 diff --git a/tests/math/EvaluatorTest.cpp b/tests/math/EvaluatorTest.cpp old mode 100644 new mode 100755 diff --git a/tests/memory/ChunkMemoryTest.cpp b/tests/memory/ChunkMemoryTest.cpp old mode 100644 new 
mode 100755 diff --git a/tests/memory/RingMemoryTest.cpp b/tests/memory/RingMemoryTest.cpp old mode 100644 new mode 100755 diff --git a/tests/stdlib/HashMapTest.cpp b/tests/stdlib/HashMapTest.cpp old mode 100644 new mode 100755 diff --git a/tests/ui/UILayoutTest.cpp b/tests/ui/UILayoutTest.cpp old mode 100644 new mode 100755 diff --git a/tests/ui/UIThemeTest.cpp b/tests/ui/UIThemeTest.cpp old mode 100644 new mode 100755 diff --git a/tests/utils/BitUtilsTest.cpp b/tests/utils/BitUtilsTest.cpp old mode 100644 new mode 100755 diff --git a/tests/utils/EndianUtilsTest.cpp b/tests/utils/EndianUtilsTest.cpp old mode 100644 new mode 100755 diff --git a/tests/utils/MathUtilsTest.cpp b/tests/utils/MathUtilsTest.cpp old mode 100644 new mode 100755 diff --git a/tests/utils/StringUtilsTest.cpp b/tests/utils/StringUtilsTest.cpp old mode 100644 new mode 100755 diff --git a/tests/utils/UtilsTest.cpp b/tests/utils/UtilsTest.cpp old mode 100644 new mode 100755 diff --git a/tests_iter.bat b/tests_iter.bat old mode 100644 new mode 100755 diff --git a/thread/Atomic.h b/thread/Atomic.h old mode 100644 new mode 100755 diff --git a/thread/Semaphore.h b/thread/Semaphore.h old mode 100644 new mode 100755 diff --git a/thread/Spinlock.cpp b/thread/Spinlock.cpp old mode 100644 new mode 100755 diff --git a/thread/Spinlock.h b/thread/Spinlock.h old mode 100644 new mode 100755 diff --git a/thread/Thread.h b/thread/Thread.h old mode 100644 new mode 100755 diff --git a/thread/ThreadDefines.h b/thread/ThreadDefines.h old mode 100644 new mode 100755 diff --git a/thread/ThreadJob.h b/thread/ThreadJob.h old mode 100644 new mode 100755 diff --git a/thread/ThreadPool.h b/thread/ThreadPool.h old mode 100644 new mode 100755 diff --git a/ui/UIAlignment.h b/ui/UIAlignment.h old mode 100644 new mode 100755 diff --git a/ui/UIAnimation.h b/ui/UIAnimation.h old mode 100644 new mode 100755 diff --git a/ui/UIButton.h b/ui/UIButton.h old mode 100644 new mode 100755 diff --git a/ui/UICursor.h b/ui/UICursor.h old 
mode 100644 new mode 100755 diff --git a/ui/UICustom.h b/ui/UICustom.h old mode 100644 new mode 100755 diff --git a/ui/UIElement.h b/ui/UIElement.h old mode 100644 new mode 100755 diff --git a/ui/UIElementType.h b/ui/UIElementType.h old mode 100644 new mode 100755 diff --git a/ui/UIImage.h b/ui/UIImage.h old mode 100644 new mode 100755 diff --git a/ui/UIInput.h b/ui/UIInput.h old mode 100644 new mode 100755 diff --git a/ui/UILabel.h b/ui/UILabel.h old mode 100644 new mode 100755 diff --git a/ui/UILayout.cpp b/ui/UILayout.cpp old mode 100644 new mode 100755 diff --git a/ui/UILayout.h b/ui/UILayout.h old mode 100644 new mode 100755 diff --git a/ui/UILink.h b/ui/UILink.h old mode 100644 new mode 100755 diff --git a/ui/UIPanel.h b/ui/UIPanel.h old mode 100644 new mode 100755 diff --git a/ui/UISelect.h b/ui/UISelect.h old mode 100644 new mode 100755 diff --git a/ui/UIStyleType.h b/ui/UIStyleType.h old mode 100644 new mode 100755 diff --git a/ui/UITab.h b/ui/UITab.h old mode 100644 new mode 100755 diff --git a/ui/UITable.h b/ui/UITable.h old mode 100644 new mode 100755 diff --git a/ui/UIText.h b/ui/UIText.h old mode 100644 new mode 100755 diff --git a/ui/UITextarea.h b/ui/UITextarea.h old mode 100644 new mode 100755 diff --git a/ui/UITheme.h b/ui/UITheme.h old mode 100644 new mode 100755 diff --git a/ui/UIWindow.h b/ui/UIWindow.h old mode 100644 new mode 100755 diff --git a/ui/attribute/UIAttribute.h b/ui/attribute/UIAttribute.h old mode 100644 new mode 100755 diff --git a/ui/attribute/UIAttributeBackground.h b/ui/attribute/UIAttributeBackground.h old mode 100644 new mode 100755 diff --git a/ui/attribute/UIAttributeBorder.h b/ui/attribute/UIAttributeBorder.h old mode 100644 new mode 100755 diff --git a/ui/attribute/UIAttributeDimension.h b/ui/attribute/UIAttributeDimension.h old mode 100644 new mode 100755 diff --git a/ui/attribute/UIAttributeFont.h b/ui/attribute/UIAttributeFont.h old mode 100644 new mode 100755 diff --git a/ui/attribute/UIAttributeShadow.h 
b/ui/attribute/UIAttributeShadow.h old mode 100644 new mode 100755 diff --git a/ui/attribute/UIAttributeType.h b/ui/attribute/UIAttributeType.h old mode 100644 new mode 100755 diff --git a/utils/BitUtils.h b/utils/BitUtils.h old mode 100644 new mode 100755 diff --git a/utils/EndianUtils.h b/utils/EndianUtils.h old mode 100644 new mode 100755 diff --git a/utils/MathUtils.h b/utils/MathUtils.h old mode 100644 new mode 100755 diff --git a/utils/PerformanceProfiler.h b/utils/PerformanceProfiler.h old mode 100644 new mode 100755 diff --git a/utils/RandomUtils.h b/utils/RandomUtils.h old mode 100644 new mode 100755 diff --git a/utils/StringUtils.h b/utils/StringUtils.h old mode 100644 new mode 100755 index 4029ac1..59acd69 --- a/utils/StringUtils.h +++ b/utils/StringUtils.h @@ -18,8 +18,7 @@ #define HAS_CHAR(x, c) (HAS_ZERO((x) ^ (((size_t)-1 / 0xFF) * (c)))) inline constexpr -size_t str_length(const char* str) noexcept -{ +size_t str_length(const char* str) noexcept { const char* ptr = str; // Align the pointer to the size of size_t @@ -32,22 +31,15 @@ size_t str_length(const char* str) noexcept // Check one longword (size_t) at a time const size_t* longword_ptr = (const size_t *) ptr; while (true) { - size_t longword = *longword_ptr++; - if (HAS_ZERO(longword)) { - const char* cp = (const char *) (longword_ptr - 1); - if (cp[0] == '\0') return cp - str; - if (cp[1] == '\0') return cp + 1 - str; - if (cp[2] == '\0') return cp + 2 - str; - if (cp[3] == '\0') return cp + 3 - str; - - // Are we using 8bytes for size_t? 
- #if SIZE_MAX > 0xFFFFFFFF - if (cp[4] == '\0') return cp + 4 - str; - if (cp[5] == '\0') return cp + 5 - str; - if (cp[6] == '\0') return cp + 6 - str; - if (cp[7] == '\0') return cp + 7 - str; - #endif + // Ensure we don't read past the end of the string + const char* end_ptr = (const char *) longword_ptr + sizeof(size_t); + for (const char* cp = (const char *) longword_ptr; cp < end_ptr; ++cp) { + if (*cp == '\0') { + return cp - str; + } } + + ++longword_ptr; } } diff --git a/utils/TestUtils.h b/utils/TestUtils.h old mode 100644 new mode 100755 diff --git a/utils/TimeUtils.h b/utils/TimeUtils.h old mode 100644 new mode 100755 diff --git a/utils/Utils.h b/utils/Utils.h old mode 100644 new mode 100755