From 8c1acbd1da4b86ebca534b28bac4252bb13f7969 Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Wed, 4 Mar 2026 23:49:08 +0000 Subject: [PATCH] add normal safety wrap safety around normal checks; this was done on the scalar math, so may as well do it here just in case. Add the impl.inl files to libraries so they can actually be found. --- Engine/source/math/impl/float3_impl.inl | 5 ++--- Engine/source/math/isa/avx/avx_intrinsics.h | 11 +++++++++-- Engine/source/math/isa/avx2/avx2_intrinsics.h | 11 +++++++++-- Engine/source/math/isa/neon/neon_intrinsics.h | 15 +++++++++++++-- Engine/source/math/isa/sse2/sse2_intrinsics.h | 18 ++++++++++++++++-- .../source/math/isa/sse41/sse41_intrinsics.h | 11 +++++++++-- Tools/CMake/torque_macros.cmake | 4 +++- 7 files changed, 61 insertions(+), 14 deletions(-) diff --git a/Engine/source/math/impl/float3_impl.inl b/Engine/source/math/impl/float3_impl.inl index 216fc99ba..ec67fff8d 100644 --- a/Engine/source/math/impl/float3_impl.inl +++ b/Engine/source/math/impl/float3_impl.inl @@ -85,9 +85,8 @@ namespace math_backend::float3 inline void float3_normalize_impl(float* a) { f32x4 va = v_load3_vec(a); - f32x4 invLen = v_rsqrt_nr(v_dot3(va, va)); // fully abstracted - f32x4 vnorm = v_mul(va, invLen); - v_store3(a, vnorm); + f32x4 vr = v_normalize3(va); + v_store3(a, vr); } // Normalize with magnitude: r = normalize(a) * r diff --git a/Engine/source/math/isa/avx/avx_intrinsics.h b/Engine/source/math/isa/avx/avx_intrinsics.h index a67f900b4..2dcdf513b 100644 --- a/Engine/source/math/isa/avx/avx_intrinsics.h +++ b/Engine/source/math/isa/avx/avx_intrinsics.h @@ -183,8 +183,15 @@ namespace inline f32x4 v_normalize3(f32x4 v) { - f32x4 inv = v_rsqrt_nr(v_dot3(v, v)); - return _mm_mul_ps(v, inv); + const f32x4 zero = _mm_setzero_ps(); + const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0} + f32x4 dot = v_dot3(v, v); + + f32x4 inv = v_rsqrt_nr(dot); + f32x4 isZero = _mm_cmpeq_ps(dot, zero); + f32x4 norm = _mm_mul_ps(v, inv); + + 
return _mm_blendv_ps(norm, fallback, isZero); } // adds all 4 lanes together. diff --git a/Engine/source/math/isa/avx2/avx2_intrinsics.h b/Engine/source/math/isa/avx2/avx2_intrinsics.h index 67f0df04a..e276e4c6e 100644 --- a/Engine/source/math/isa/avx2/avx2_intrinsics.h +++ b/Engine/source/math/isa/avx2/avx2_intrinsics.h @@ -183,8 +183,15 @@ namespace inline f32x4 v_normalize3(f32x4 v) { - f32x4 inv = v_rsqrt_nr(v_dot3(v, v)); - return _mm_mul_ps(v, inv); + const f32x4 zero = _mm_setzero_ps(); + const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0} + f32x4 dot = v_dot3(v, v); + + f32x4 inv = v_rsqrt_nr(dot); + f32x4 isZero = _mm_cmpeq_ps(dot, zero); + f32x4 norm = _mm_mul_ps(v, inv); + + return _mm_blendv_ps(norm, fallback, isZero); } // adds all 4 lanes together. diff --git a/Engine/source/math/isa/neon/neon_intrinsics.h b/Engine/source/math/isa/neon/neon_intrinsics.h index e6e8ef123..888adab45 100644 --- a/Engine/source/math/isa/neon/neon_intrinsics.h +++ b/Engine/source/math/isa/neon/neon_intrinsics.h @@ -205,8 +205,19 @@ namespace inline f32x4 v_normalize3(f32x4 v) { - f32x4 inv = v_rsqrt_nr(v_dot3(v,v)); - return vmulq_f32(v, inv); + const float32x4_t zero = vdupq_n_f32(0.0f); + const float32x4_t fallback = {0.0f, 0.0f, 1.0f, 0.0f}; + + f32x4 dot = v_dot3(v, v); + + // dot == 0? 
+ uint32x4_t isZero = vceqq_f32(dot, zero); + + f32x4 inv = v_rsqrt_nr(dot); + f32x4 norm = vmulq_f32(v, inv); + + // Select fallback when zero + return vbslq_f32(isZero, fallback, norm); } inline f32x4 v_hadd4(f32x4 a) diff --git a/Engine/source/math/isa/sse2/sse2_intrinsics.h b/Engine/source/math/isa/sse2/sse2_intrinsics.h index 71a95c1b7..63243d2d3 100644 --- a/Engine/source/math/isa/sse2/sse2_intrinsics.h +++ b/Engine/source/math/isa/sse2/sse2_intrinsics.h @@ -216,8 +216,22 @@ namespace inline f32x4 v_normalize3(f32x4 v) { - f32x4 inv = v_rsqrt_nr(v_dot3(v, v)); - return _mm_mul_ps(v, inv); + const f32x4 zero = _mm_setzero_ps(); + const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0} + f32x4 dot = v_dot3(v, v); + + f32x4 inv = v_rsqrt_nr(dot); + f32x4 isZero = _mm_cmpeq_ps(dot, zero); + + f32x4 norm = _mm_mul_ps(v, inv); + + // vbsl equivalent + f32x4 result = _mm_or_ps( + _mm_and_ps(isZero, fallback), + _mm_andnot_ps(isZero, norm) + ); + + return result; } // adds all 4 lanes together. diff --git a/Engine/source/math/isa/sse41/sse41_intrinsics.h b/Engine/source/math/isa/sse41/sse41_intrinsics.h index 2ea63e6b5..58d66ca6c 100644 --- a/Engine/source/math/isa/sse41/sse41_intrinsics.h +++ b/Engine/source/math/isa/sse41/sse41_intrinsics.h @@ -195,8 +195,15 @@ namespace inline f32x4 v_normalize3(f32x4 v) { - f32x4 inv = v_rsqrt_nr(v_dot3(v, v)); - return _mm_mul_ps(v, inv); + const f32x4 zero = _mm_setzero_ps(); + const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0} + f32x4 dot = v_dot3(v, v); + + f32x4 inv = v_rsqrt_nr(dot); + f32x4 isZero = _mm_cmpeq_ps(dot, zero); + f32x4 norm = _mm_mul_ps(v, inv); + + return _mm_blendv_ps(norm, fallback, isZero); } // adds all 4 lanes together. 
diff --git a/Tools/CMake/torque_macros.cmake b/Tools/CMake/torque_macros.cmake index f799e8c39..9b98a5b99 100644 --- a/Tools/CMake/torque_macros.cmake +++ b/Tools/CMake/torque_macros.cmake @@ -142,7 +142,9 @@ function(add_math_backend name compile_defs) return() endif() - add_library(math_${name} OBJECT ${SRC}) + file(GLOB_RECURSE INL CONFIGURE_DEPENDS "math/impl/*.inl") + + add_library(math_${name} OBJECT ${SRC} ${INL}) message(STATUS "adding math library for isa ${name}") target_include_directories(math_${name} PUBLIC