// Patch summary (git unified diff; the line breaks inside the hunks were lost when this text was flattened):
//  * math/impl/float4_impl.inl: includes "../mConstants.h", divides through v_div instead of calling
//    _mm_div_ps directly, and compares the vector length against POINT_EPSILON instead of the literal
//    1e-6f in float4_normalize_impl and float4_normalize_mag_impl.
//    NOTE(review): POINT_EPSILON's value is not visible here; confirm the new threshold is intended.
//  * math/isa/avx/float4.cpp and math/isa/sse41/float4.cpp (new files): define the v_* wrappers over the
//    _mm_*_ps intrinsics, a dot product built on _mm_dp_ps with mask 0xF1, and install_avx() /
//    install_sse41(), which assign the gFloat4 entries.
//  * math/isa/avx2/float4.cpp: adds float4_dot_avx and installs it as gFloat4.dot in place of float4_dot_impl.
//  * math/isa/sse2/float4.cpp: corrects the installer comment to say SSE2; the removed and re-added
//    assignments read the same in this text, so that part appears to be an alignment-only change.
//  * math/public/float4_dispatch.h: adds the opening #ifndef/#define of a _FLOAT4_DISPATCH_H_ include
//    guard after the existing #pragma once.
//  NOTE(review): several #include lines below appear without a header name; check them against the real patch.
diff --git a/Engine/source/math/impl/float4_impl.inl b/Engine/source/math/impl/float4_impl.inl index 4871d3161..cb61ed4fc 100644 --- a/Engine/source/math/impl/float4_impl.inl +++ b/Engine/source/math/impl/float4_impl.inl @@ -1,5 +1,6 @@ #pragma once #include // for sqrtf, etc. +#include "../mConstants.h" namespace math_backend::float4 { @@ -46,7 +47,7 @@ namespace math_backend::float4 { f32x4 va = v_load(a); f32x4 vb = v_load(b); - f32x4 vr = _mm_div_ps(va, vb); + f32x4 vr = v_div(va, vb); v_store(r, vr); } @@ -84,7 +85,7 @@ namespace math_backend::float4 inline void float4_normalize_impl(float* a) { float len = float4_length_impl(a); - if (len > 1e-6f) // safe threshold + if (len > POINT_EPSILON) // safe threshold { float4_mul_scalar_impl(a, 1.0f / len, a); } @@ -94,7 +95,7 @@ namespace math_backend::float4 inline void float4_normalize_mag_impl(float* a, float r) { float len = float4_length_impl(a); - if (len > 1e-6f) + if (len > POINT_EPSILON) { float4_mul_scalar_impl(a, r / len, a); } diff --git a/Engine/source/math/isa/avx/float4.cpp b/Engine/source/math/isa/avx/float4.cpp new file mode 100644 index 000000000..1e23fb8b1 --- /dev/null +++ b/Engine/source/math/isa/avx/float4.cpp @@ -0,0 +1,67 @@ + +#include "float4_dispatch.h" +#include // AVX/AVX2 intrinsics + +namespace +{ + typedef __m128 f32x4; + + // Load 4 floats from memory into a SIMD register + inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + // Store 4 floats from SIMD register back to memory + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + // Broadcast a single float across all 4 lanes + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline 
f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + // Horizontal sum of all 4 elements (for dot product, length, etc.) + inline float v_hadd4(f32x4 a) + { + __m128 t1 = _mm_hadd_ps(a, a); // sums pairs: [a0+a1, a2+a3, ...] + __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3 + return _mm_cvtss_f32(t2); // extract first element + } + + // specialized dot product for AVX + float float4_dot_avx(const float* a, const float* b) + { + f32x4 va = _mm_loadu_ps(a); + f32x4 vb = _mm_loadu_ps(b); + __m128 dp = _mm_dp_ps(va, vb, 0xF1); // multiply all 4, sum all 4, lowest lane + return _mm_cvtss_f32(dp); + } +} + +#include "float4_impl.inl" + +namespace math_backend::float4::dispatch +{ + // Install AVX backend + void install_avx() + { + gFloat4.add = float4_add_impl; + gFloat4.sub = float4_sub_impl; + gFloat4.mul = float4_mul_impl; + gFloat4.mul_scalar = float4_mul_scalar_impl; + gFloat4.div = float4_div_impl; + gFloat4.div_scalar = float4_div_scalar_impl; + gFloat4.dot = float4_dot_avx; + gFloat4.length = float4_length_impl; + gFloat4.lengthSquared = float4_length_squared_impl; + gFloat4.normalize = float4_normalize_impl; + gFloat4.lerp = float4_lerp_impl; + } +} diff --git a/Engine/source/math/isa/avx2/float4.cpp b/Engine/source/math/isa/avx2/float4.cpp index c6e2ffa99..439d0e2d0 100644 --- a/Engine/source/math/isa/avx2/float4.cpp +++ b/Engine/source/math/isa/avx2/float4.cpp @@ -34,6 +34,15 @@ namespace __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3 return _mm_cvtss_f32(t2); // extract first element } + + // specialized dot product for AVX + float float4_dot_avx(const float* a, const float* b) + { + f32x4 va = _mm_loadu_ps(a); + f32x4 vb = _mm_loadu_ps(b); + __m128 dp = _mm_dp_ps(va, vb, 0xF1); // multiply all 4, sum all 4, lowest lane + return _mm_cvtss_f32(dp); + } } #include "float4_impl.inl" @@ -49,7 +58,7 @@ namespace math_backend::float4::dispatch gFloat4.mul_scalar = float4_mul_scalar_impl; gFloat4.div 
= float4_div_impl; gFloat4.div_scalar = float4_div_scalar_impl; - gFloat4.dot = float4_dot_impl; + gFloat4.dot = float4_dot_avx; gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; gFloat4.normalize = float4_normalize_impl; diff --git a/Engine/source/math/isa/sse2/float4.cpp b/Engine/source/math/isa/sse2/float4.cpp index 3b9e80e28..00850560a 100644 --- a/Engine/source/math/isa/sse2/float4.cpp +++ b/Engine/source/math/isa/sse2/float4.cpp @@ -40,19 +40,19 @@ namespace namespace math_backend::float4::dispatch { - // Install AVX2 backend + // Install SSE2 backend void install_sse2() { - gFloat4.add = float4_add_impl; - gFloat4.sub = float4_sub_impl; - gFloat4.mul = float4_mul_impl; - gFloat4.mul_scalar = float4_mul_scalar_impl; - gFloat4.div = float4_div_impl; - gFloat4.div_scalar = float4_div_scalar_impl; - gFloat4.dot = float4_dot_impl; - gFloat4.length = float4_length_impl; + gFloat4.add = float4_add_impl; + gFloat4.sub = float4_sub_impl; + gFloat4.mul = float4_mul_impl; + gFloat4.mul_scalar = float4_mul_scalar_impl; + gFloat4.div = float4_div_impl; + gFloat4.div_scalar = float4_div_scalar_impl; + gFloat4.dot = float4_dot_impl; + gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; - gFloat4.normalize = float4_normalize_impl; - gFloat4.lerp = float4_lerp_impl; + gFloat4.normalize = float4_normalize_impl; + gFloat4.lerp = float4_lerp_impl; } } diff --git a/Engine/source/math/isa/sse41/float4.cpp b/Engine/source/math/isa/sse41/float4.cpp new file mode 100644 index 000000000..80127acb9 --- /dev/null +++ b/Engine/source/math/isa/sse41/float4.cpp @@ -0,0 +1,67 @@ + +#include "float4_dispatch.h" +#include // SSE41 intrinsics + +namespace +{ + typedef __m128 f32x4; + + // Load 4 floats from memory into a SIMD register + inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + // Store 4 floats from SIMD register back to memory + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, 
v); } + + // Broadcast a single float across all 4 lanes + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + // Horizontal sum of all 4 elements (for dot product, length, etc.) + inline float v_hadd4(f32x4 a) + { + __m128 t1 = _mm_hadd_ps(a, a); // sums pairs: [a0+a1, a2+a3, ...] + __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3 + return _mm_cvtss_f32(t2); // extract first element + } + + // specialized dot product for SSE4.1 + float float4_dot_sse41(const float* a, const float* b) + { + f32x4 va = _mm_loadu_ps(a); + f32x4 vb = _mm_loadu_ps(b); + __m128 dp = _mm_dp_ps(va, vb, 0xF1); // multiply all 4, sum all 4, lowest lane + return _mm_cvtss_f32(dp); + } +} + +#include "float4_impl.inl" + +namespace math_backend::float4::dispatch +{ + // Install SSE41 backend + void install_sse41() + { + gFloat4.add = float4_add_impl; + gFloat4.sub = float4_sub_impl; + gFloat4.mul = float4_mul_impl; + gFloat4.mul_scalar = float4_mul_scalar_impl; + gFloat4.div = float4_div_impl; + gFloat4.div_scalar = float4_div_scalar_impl; + gFloat4.dot = float4_dot_sse41; + gFloat4.length = float4_length_impl; + gFloat4.lengthSquared = float4_length_squared_impl; + gFloat4.normalize = float4_normalize_impl; + gFloat4.lerp = float4_lerp_impl; + } +} diff --git a/Engine/source/math/public/float4_dispatch.h b/Engine/source/math/public/float4_dispatch.h index 68f9a6520..319b1893f 100644 --- a/Engine/source/math/public/float4_dispatch.h +++ b/Engine/source/math/public/float4_dispatch.h @@ -1,4 +1,8 @@ #pragma once +#ifndef _FLOAT4_DISPATCH_H_ +#define _FLOAT4_DISPATCH_H_ + + #include 
namespace math_backend::float4::dispatch @@ -32,3 +36,5 @@ namespace math_backend::float4::dispatch // Centralized installer (engine calls this once) void install_preferred(); } + +#endif // !_FLOAT4_DISPATCH_H_ diff --git a/Engine/source/math/public/math_backend.cpp b/Engine/source/math/public/math_backend.cpp new file mode 100644 index 000000000..7998924ee --- /dev/null +++ b/Engine/source/math/public/math_backend.cpp @@ -0,0 +1,55 @@ +// Chooses a math backend from CPU feature flags and calls the matching float4 installer. +#include "math/public/math_backend.h" + +// Returns the first backend whose flag is set in cpu_flags, testing AVX2, AVX, SSE4.1, SSE2 on x86 builds or NEON on ARM builds; scalar when none match. +math_backend::backend math_backend::choose_backend(U32 cpu_flags) +{ +#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(__i386__) + + if (cpu_flags & CPU_PROP_AVX2) return backend::avx2; + if (cpu_flags & CPU_PROP_AVX) return backend::avx; + if (cpu_flags & CPU_PROP_SSE4_1) return backend::sse41; + if (cpu_flags & CPU_PROP_SSE2) return backend::sse2; + +#elif defined(__aarch64__) || defined(__ARM_NEON) + + if (cpu_flags & CPU_NEON) return backend::neon; // NOTE(review): the x86 flags use the CPU_PROP_ prefix; confirm CPU_NEON is the intended name + +#endif + return backend::scalar; +} + +// Stores the result of choose_backend(cpu_flags) in g_backend and calls that backend's float4::dispatch installer. +void math_backend::install_from_cpu_flags(uint32_t cpu_flags) +{ + { + g_backend = choose_backend(cpu_flags); + + switch (g_backend) + { + case backend::avx2: + float4::dispatch::install_avx2(); + break; + + case backend::avx: + float4::dispatch::install_sse41(); // TODO: call install_avx() once it is enabled; until then use SSE4.1 (available on AVX-capable CPUs) so this case no longer installs nothing + break; + + case backend::sse41: + float4::dispatch::install_sse41(); + break; + + case backend::sse2: + float4::dispatch::install_sse2(); + break; + + case backend::neon: + float4::dispatch::install_neon(); + break; + + default: + float4::dispatch::install_scalar(); + break; + } + } +} diff --git a/Engine/source/math/public/math_backend.h b/Engine/source/math/public/math_backend.h new file mode 100644 index 000000000..40476e7f0 --- /dev/null +++ b/Engine/source/math/public/math_backend.h @@ -0,0 +1,29 @@ +#pragma once +#ifndef _MCONSTANTS_H_ +#include "math/mConstants.h" +#endif +#ifndef _PLATFORMASSERT_H_ +#include "platform/platformAssert.h" +#endif +#ifndef _FLOAT4_DISPATCH_H_ +#include "math/public/float4_dispatch.h" 
+#endif + +namespace math_backend +{ + // Backends that choose_backend() can return. + enum class backend + { + scalar, + sse2, + sse41, + avx, + avx2, + neon + }; + + // Backend recorded by install_from_cpu_flags(); 'inline' gives one shared instance, whereas 'static' gave every translation unit its own copy. + inline backend g_backend = backend::scalar; + backend choose_backend(U32 cpu_flags); + void install_from_cpu_flags(uint32_t cpu_flags); +} diff --git a/Tools/CMake/torque_macros.cmake b/Tools/CMake/torque_macros.cmake index 23c334780..5a7928b38 100644 --- a/Tools/CMake/torque_macros.cmake +++ b/Tools/CMake/torque_macros.cmake @@ -154,7 +154,9 @@ function(add_math_backend name compile_defs) # ISA flags if(MSVC) - if(name STREQUAL "sse2" OR name STREQUAL "sse41") + if(name STREQUAL "sse2") + target_compile_options(math_${name} PRIVATE /arch:SSE2) + elseif(name STREQUAL "sse41") target_compile_options(math_${name} PRIVATE /arch:SSE2) elseif(name STREQUAL "avx") target_compile_options(math_${name} PRIVATE /arch:AVX)