From e9fdffc2dd53f11c9285bb8300522ebff93a4513 Mon Sep 17 00:00:00 2001
From: marauder2k7
Date: Thu, 26 Feb 2026 14:57:16 +0000
Subject: [PATCH] math backend setup

setup libraries for different simd isa's
add float4 functions for c sse2 and avx2 (placeholder file for neon to be implemented on mac)
---
 Engine/source/CMakeLists.txt                  |  11 +-
 Engine/source/math/impl/float4_c.cpp          |  60 ++++++++++
 Engine/source/math/impl/float4_impl.inl       | 113 ++++++++++++++++++
 Engine/source/math/isa/avx2/float4.cpp        |  58 +++++++++
 Engine/source/math/isa/neon/float4.cpp        |   0
 Engine/source/math/isa/sse2/float4.cpp        |  58 +++++++++
 Engine/source/math/public/float4_dispatch.cpp |   7 ++
 Engine/source/math/public/float4_dispatch.h   |  34 ++++++
 Tools/CMake/torque_macros.cmake               |  47 +++++++-
 9 files changed, 386 insertions(+), 2 deletions(-)
 create mode 100644 Engine/source/math/impl/float4_c.cpp
 create mode 100644 Engine/source/math/impl/float4_impl.inl
 create mode 100644 Engine/source/math/isa/avx2/float4.cpp
 create mode 100644 Engine/source/math/isa/neon/float4.cpp
 create mode 100644 Engine/source/math/isa/sse2/float4.cpp
 create mode 100644 Engine/source/math/public/float4_dispatch.cpp
 create mode 100644 Engine/source/math/public/float4_dispatch.h

diff --git a/Engine/source/CMakeLists.txt b/Engine/source/CMakeLists.txt
index 029ef89ec..fd225b579 100644
--- a/Engine/source/CMakeLists.txt
+++ b/Engine/source/CMakeLists.txt
@@ -130,7 +130,7 @@ torqueAddSourceDirectories("windowManager" "windowManager/torque" "windowManager
 torqueAddSourceDirectories("scene" "scene/culling" "scene/zones" "scene/mixin")
 
 # Handle math
-torqueAddSourceDirectories("math" "math/util")
+torqueAddSourceDirectories("math" "math/util" "math/public" "math/impl") # note impl must skip the .inl files, never use them in engine code.
 
 # Handle persistence
 set(TORQUE_INCLUDE_DIRECTORIES ${TORQUE_INCLUDE_DIRECTORIES} "persistence/rapidjson")
@@ -496,6 +496,15 @@ else()
     set_target_properties(${TORQUE_APP_NAME} PROPERTIES LINK_FLAGS "-Wl,-rpath,./")
 endif()
 
+add_math_backend(scalar MATH_SIMD_SCALAR) # no-op while math/isa/scalar has no sources (scalar code lives in math/impl/float4_c.cpp)
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64") # ARM targets (Apple reports lowercase "arm64"): NEON only
+    add_math_backend(neon MATH_SIMD_NEON)
+else() # everything else keeps the x86 backends; their -m flags and intrinsic headers do not exist on ARM
+    add_math_backend(sse2 MATH_SIMD_SSE2)
+    add_math_backend(sse41 MATH_SIMD_SSE41)
+    add_math_backend(avx MATH_SIMD_AVX)
+    add_math_backend(avx2 MATH_SIMD_AVX2)
+endif()
 
 if(MSVC)
     # Match projectGenerator naming for executables
diff --git a/Engine/source/math/impl/float4_c.cpp b/Engine/source/math/impl/float4_c.cpp
new file mode 100644
index 000000000..b3d6559f9
--- /dev/null
+++ b/Engine/source/math/impl/float4_c.cpp
@@ -0,0 +1,60 @@
+#include "math/public/float4_dispatch.h"
+#include "math/mConstants.h"
+#include <cmath>
+
+namespace math_backend::float4::dispatch
+{
+    void install_scalar() // fills every gFloat4 slot with a plain C++ loop implementation (no intrinsics)
+    {
+        gFloat4.add = [](const float* a, const float* b, float* r) {
+            for (int i = 0; i < 4; i++) r[i] = a[i] + b[i];
+        };
+
+        gFloat4.sub = [](const float* a, const float* b, float* r) {
+            for (int i = 0; i < 4; i++) r[i] = a[i] - b[i];
+        };
+
+        gFloat4.mul = [](const float* a, const float* b, float* r) {
+            for (int i = 0; i < 4; i++) r[i] = a[i] * b[i];
+        };
+
+        gFloat4.mul_scalar = [](const float* a, float s, float* r) {
+            for (int i = 0; i < 4; i++) r[i] = a[i] * s;
+        };
+
+        gFloat4.div = [](const float* a, const float* b, float* r) {
+            for (int i = 0; i < 4; i++) r[i] = a[i] / b[i];
+        };
+
+        gFloat4.div_scalar = [](const float* a, float s, float* r) {
+            for (int i = 0; i < 4; i++) r[i] = a[i] / s;
+        };
+
+        gFloat4.dot = [](const float* a, const float* b) {
+            float sum = 0.f;
+            for (int i = 0; i < 4; i++) sum += a[i] * b[i];
+            return sum;
+        };
+
+        gFloat4.length = [](const float* a) {
+            float sum = 0.f;
+            for (int i = 0; i < 4; i++) sum += a[i] * a[i];
+            return sqrtf(sum);
+        };
+
+        gFloat4.lengthSquared = [](const float* a) {
+            float sum = 0.f;
+            for (int i = 0; i < 4; i++) sum += a[i] * a[i];
+            return (sum);
+        };
+
+        gFloat4.normalize = [](float* a) { // in-place; vectors with length <= POINT_EPSILON are left untouched
+            float len = gFloat4.length(a);
+            if (len > POINT_EPSILON) for (int i = 0; i < 4; i++) a[i] /= len;
+        };
+
+        gFloat4.lerp = [](const float* from, const float* to, float f, float* r) {
+            for (int i = 0; i < 4; i++) r[i] = from[i] + (to[i] - from[i]) * f;
+        };
+    }
+}
diff --git a/Engine/source/math/impl/float4_impl.inl b/Engine/source/math/impl/float4_impl.inl
new file mode 100644
index 000000000..4871d3161
--- /dev/null
+++ b/Engine/source/math/impl/float4_impl.inl
@@ -0,0 +1,113 @@
+#pragma once
+#include <cmath> // for std::sqrt
+// The including .cpp must define f32x4, v_load, v_store, v_set1, v_add, v_sub, v_mul, v_div and v_hadd4 first.
+namespace math_backend::float4
+{
+    // Every function below is `static` on purpose: this file is compiled once per ISA translation unit with
+    // different flags/helpers, and external-linkage inline copies would be merged by the linker (ODR violation).
+    // Add two float4 vectors: r = a + b
+    static inline void float4_add_impl(const float* a, const float* b, float* r)
+    {
+        f32x4 va = v_load(a);
+        f32x4 vb = v_load(b);
+        f32x4 vr = v_add(va, vb);
+        v_store(r, vr);
+    }
+
+    // Subtract: r = a - b
+    static inline void float4_sub_impl(const float* a, const float* b, float* r)
+    {
+        f32x4 va = v_load(a);
+        f32x4 vb = v_load(b);
+        f32x4 vr = v_sub(va, vb);
+        v_store(r, vr);
+    }
+
+    // Multiply element-wise: r = a * b
+    static inline void float4_mul_impl(const float* a, const float* b, float* r)
+    {
+        f32x4 va = v_load(a);
+        f32x4 vb = v_load(b);
+        f32x4 vr = v_mul(va, vb);
+        v_store(r, vr);
+    }
+
+    // Multiply by scalar: r = a * s
+    static inline void float4_mul_scalar_impl(const float* a, float s, float* r)
+    {
+        f32x4 va = v_load(a);
+        f32x4 vs = v_set1(s);
+        f32x4 vr = v_mul(va, vs);
+        v_store(r, vr);
+    }
+
+    // Divide element-wise: r = a / b
+    static inline void float4_div_impl(const float* a, const float* b, float* r)
+    {
+        f32x4 va = v_load(a);
+        f32x4 vb = v_load(b);
+        f32x4 vr = v_div(va, vb); // go through the v_* wrapper so this file stays ISA-neutral
+        v_store(r, vr);
+    }
+
+    // Divide by scalar: r = a / s
+    static inline void float4_div_scalar_impl(const float* a, float s, float* r)
+    {
+        f32x4 va = v_load(a);
+        f32x4 vs = v_set1(s);
+        f32x4 vr = v_div(va, vs);
+        v_store(r, vr);
+    }
+
+    // Dot product: returns the sum of the four element-wise products
+    static inline float float4_dot_impl(const float* a, const float* b)
+    {
+        f32x4 va = v_load(a);
+        f32x4 vb = v_load(b);
+        f32x4 vmul = v_mul(va, vb);
+        return v_hadd4(vmul);
+    }
+
+    // Length squared: dot(a, a)
+    static inline float float4_length_squared_impl(const float* a)
+    {
+        return float4_dot_impl(a, a);
+    }
+
+    // Length: sqrt(dot(a, a))
+    static inline float float4_length_impl(const float* a)
+    {
+        return std::sqrt(float4_length_squared_impl(a));
+    }
+
+    // Normalize in-place; a is left unchanged when its length is <= 1e-6
+    static inline void float4_normalize_impl(float* a)
+    {
+        float len = float4_length_impl(a);
+        if (len > 1e-6f) // safe threshold
+        {
+            float4_mul_scalar_impl(a, 1.0f / len, a);
+        }
+    }
+
+    // Normalize with magnitude, in-place: a = normalize(a) * r (a unchanged when its length is <= 1e-6)
+    static inline void float4_normalize_mag_impl(float* a, float r)
+    {
+        float len = float4_length_impl(a);
+        if (len > 1e-6f)
+        {
+            float4_mul_scalar_impl(a, r / len, a);
+        }
+    }
+
+    // Linear interpolation: r = from + (to - from) * f
+    static inline void float4_lerp_impl(const float* from, const float* to, float f, float* r)
+    {
+        f32x4 vfrom = v_load(from);
+        f32x4 vto = v_load(to);
+        f32x4 vf = v_set1(f);
+        f32x4 vr = v_add(vfrom, v_mul(vf, v_sub(vto, vfrom)));
+        v_store(r, vr);
+    }
+
+} // namespace math_backend::float4
diff --git a/Engine/source/math/isa/avx2/float4.cpp b/Engine/source/math/isa/avx2/float4.cpp
new file mode 100644
index 000000000..c6e2ffa99
--- /dev/null
+++ b/Engine/source/math/isa/avx2/float4.cpp
@@ -0,0 +1,58 @@
+
+#include "float4_dispatch.h"
+#include <immintrin.h> // AVX/AVX2 intrinsics
+
+namespace
+{
+    typedef __m128 f32x4;
+
+    // Load 4 floats from memory into a SIMD register (unaligned load)
+    inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }
+
+    // Store 4 floats from SIMD register back to memory (unaligned store)
+    inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }
+
+    // Broadcast a single float across all 4 lanes
+    inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
+
+    // Element-wise multiply
+    inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
+
+    // Element-wise divide
+    inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
+
+    // Element-wise add
+    inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
+
+    // Element-wise subtract
+    inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
+
+    // Horizontal sum of all 4 elements (for dot product, length, etc.)
+    inline float v_hadd4(f32x4 a)
+    {
+        __m128 t1 = _mm_hadd_ps(a, a);   // sums pairs: [a0+a1, a2+a3, ...]
+        __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3
+        return _mm_cvtss_f32(t2);        // extract first element
+    }
+}
+
+#include "float4_impl.inl"
+
+namespace math_backend::float4::dispatch
+{
+    // Install AVX2 backend: point every gFloat4 slot at the functions built in this translation unit
+    void install_avx2()
+    {
+        gFloat4.add = float4_add_impl;
+        gFloat4.sub = float4_sub_impl;
+        gFloat4.mul = float4_mul_impl;
+        gFloat4.mul_scalar = float4_mul_scalar_impl;
+        gFloat4.div = float4_div_impl;
+        gFloat4.div_scalar = float4_div_scalar_impl;
+        gFloat4.dot = float4_dot_impl;
+        gFloat4.length = float4_length_impl;
+        gFloat4.lengthSquared = float4_length_squared_impl;
+        gFloat4.normalize = float4_normalize_impl;
+        gFloat4.lerp = float4_lerp_impl;
+    }
+}
diff --git a/Engine/source/math/isa/neon/float4.cpp b/Engine/source/math/isa/neon/float4.cpp
new file mode 100644
index 000000000..e69de29bb
diff --git a/Engine/source/math/isa/sse2/float4.cpp b/Engine/source/math/isa/sse2/float4.cpp
new file mode 100644
index 000000000..3b9e80e28
--- /dev/null
+++ b/Engine/source/math/isa/sse2/float4.cpp
@@ -0,0 +1,58 @@
+#include "float4_dispatch.h"
+#include <emmintrin.h> // SSE2 intrinsics
+namespace
+{
+    typedef __m128 f32x4;
+
+    // Load 4 floats from memory into a SIMD register (unaligned load)
+    inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }
+
+    // Store 4 floats from SIMD register back to memory (unaligned store)
+    inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }
+
+    // Broadcast a single float across all 4 lanes
+    inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
+
+    // Element-wise multiply
+    inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
+
+    // Element-wise divide
+    inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
+
+    // Element-wise add
+    inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
+
+    // Element-wise subtract
+    inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
+
+    // Horizontal sum of all 4 elements (for dot product, length, etc.)
+    inline float v_hadd4(f32x4 a)
+    {
+        __m128 shuf = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); // swap pairs
+        __m128 sums = _mm_add_ps(a, shuf); // sums: [a0+a1 a1+a0 a2+a3 a3+a2]
+        shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)); // move high pair to low
+        sums = _mm_add_ps(sums, shuf); // total sum in lower float
+        return _mm_cvtss_f32(sums);
+    }
+}
+
+#include "../../impl/float4_impl.inl"
+
+namespace math_backend::float4::dispatch
+{
+    // Install SSE2 backend: point every gFloat4 slot at the functions built in this translation unit
+    void install_sse2()
+    {
+        gFloat4.add = float4_add_impl;
+        gFloat4.sub = float4_sub_impl;
+        gFloat4.mul = float4_mul_impl;
+        gFloat4.mul_scalar = float4_mul_scalar_impl;
+        gFloat4.div = float4_div_impl;
+        gFloat4.div_scalar = float4_div_scalar_impl;
+        gFloat4.dot = float4_dot_impl;
+        gFloat4.length = float4_length_impl;
+        gFloat4.lengthSquared = float4_length_squared_impl;
+        gFloat4.normalize = float4_normalize_impl;
+        gFloat4.lerp = float4_lerp_impl;
+    }
+}
diff --git a/Engine/source/math/public/float4_dispatch.cpp b/Engine/source/math/public/float4_dispatch.cpp
new file mode 100644
index 000000000..810eb0e46
--- /dev/null
+++ b/Engine/source/math/public/float4_dispatch.cpp
@@ -0,0 +1,7 @@
+#include "math/public/float4_dispatch.h"
+
+namespace math_backend::float4::dispatch
+{
+    // Single definition of the global dispatch table; every slot is nullptr until an install_*() call fills it
+    Float4Funcs gFloat4{};
+}
diff --git a/Engine/source/math/public/float4_dispatch.h b/Engine/source/math/public/float4_dispatch.h
new file mode 100644
index 000000000..68f9a6520
--- /dev/null
+++ b/Engine/source/math/public/float4_dispatch.h
@@ -0,0 +1,34 @@
+#pragma once
+#include <cstdint>
+
+namespace math_backend::float4::dispatch
+{
+    struct Float4Funcs // table of float4 kernels; every pointer argument addresses 4 consecutive floats
+    {
+        void (*add)(const float*, const float*, float*) = nullptr;         // (a, b, r): r = a + b
+        void (*sub)(const float*, const float*, float*) = nullptr;         // (a, b, r): r = a - b
+        void (*mul)(const float*, const float*, float*) = nullptr;         // (a, b, r): r = a * b, element-wise
+        void (*mul_scalar)(const float*, float, float*) = nullptr;         // (a, s, r): r = a * s
+        void (*div)(const float*, const float*, float*) = nullptr;         // (a, b, r): r = a / b, element-wise
+        void (*div_scalar)(const float*, float, float*) = nullptr;         // (a, s, r): r = a / s
+        float (*dot)(const float*, const float*) = nullptr;                // (a, b): sum of a[i] * b[i]
+        float (*length)(const float*) = nullptr;                           // (a): sqrt(dot(a, a))
+        float (*lengthSquared)(const float*) = nullptr;                    // (a): dot(a, a)
+        void (*normalize)(float*) = nullptr;                               // (a): in-place, skipped for near-zero length
+        void (*lerp)(const float*, const float*, float, float*) = nullptr; // (from, to, f, r): r = from + (to - from) * f
+    };
+
+    // Global dispatch table (defined in float4_dispatch.cpp)
+    extern Float4Funcs gFloat4;
+
+    // Backend installers (defined in ISA libraries)
+    void install_scalar();
+    void install_sse2();
+    void install_sse41(); // declaration only: this patch adds no definition yet
+    void install_avx();   // declaration only: this patch adds no definition yet
+    void install_avx2();
+    void install_neon();  // declaration only: math/isa/neon/float4.cpp is still an empty placeholder
+
+    // Centralized installer (engine calls this once); declaration only, no definition in this patch yet
+    void install_preferred();
+}
diff --git a/Tools/CMake/torque_macros.cmake b/Tools/CMake/torque_macros.cmake
index 48bb6896e..23c334780 100644
--- a/Tools/CMake/torque_macros.cmake
+++ b/Tools/CMake/torque_macros.cmake
@@ -133,4 +133,49 @@ macro(addFramework framework)
         find_library(_${framework}_FRAMEWORK_PATH ${framework} PATHS /System/Library/Frameworks /Library/Frameworks)
         set(TORQUE_LINK_FRAMEWORKS ${TORQUE_LINK_FRAMEWORKS} "${_${framework}_FRAMEWORK_PATH}")
     endif()
-endmacro()
\ No newline at end of file
+endmacro()
+
+function(add_math_backend name compile_defs) # builds math/isa/<name>/*.cpp as an OBJECT library with that ISA's flags
+    file(GLOB_RECURSE SRC CONFIGURE_DEPENDS "math/isa/${name}/*.cpp")
+
+    if(NOT SRC) # no sources for this backend: nothing to build
+        return()
+    endif()
+
+    add_library(math_${name} OBJECT ${SRC})
+
+    target_include_directories(math_${name} PUBLIC
+        "math/public"
+        "math/impl"
+        "math/isa/${name}"
+    )
+
+    target_compile_definitions(math_${name} PRIVATE ${compile_defs})
+
+    # ISA flags (applied only to this backend's objects, never to the rest of the engine)
+    if(MSVC)
+        if(name STREQUAL "sse2" OR name STREQUAL "sse41")
+            target_compile_options(math_${name} PRIVATE /arch:SSE2)
+        elseif(name STREQUAL "avx")
+            target_compile_options(math_${name} PRIVATE /arch:AVX)
+        elseif(name STREQUAL "avx2")
+            target_compile_options(math_${name} PRIVATE /arch:AVX2)
+        endif()
+    else()
+        if(name STREQUAL "sse2")
+            target_compile_options(math_${name} PRIVATE -msse2)
+        elseif(name STREQUAL "sse41")
+            target_compile_options(math_${name} PRIVATE -msse4.1)
+        elseif(name STREQUAL "avx")
+            target_compile_options(math_${name} PRIVATE -mavx)
+        elseif(name STREQUAL "avx2")
+            target_compile_options(math_${name} PRIVATE -mavx2 -mfma)
+        elseif(name STREQUAL "neon")
+            target_compile_options(math_${name} PRIVATE -march=armv8-a+simd)
+        endif()
+    endif()
+
+    # Inject objects into engine
+    target_sources(${TORQUE_APP_NAME} PRIVATE $<TARGET_OBJECTS:math_${name}>)
+    set_target_properties(math_${name} PROPERTIES FOLDER "Libraries/Math")
+endfunction()
\ No newline at end of file