diff --git a/Engine/source/math/impl/float3_impl.inl b/Engine/source/math/impl/float3_impl.inl
index 216fc99ba..ec67fff8d 100644
--- a/Engine/source/math/impl/float3_impl.inl
+++ b/Engine/source/math/impl/float3_impl.inl
@@ -85,9 +85,9 @@ namespace math_backend::float3
     inline void float3_normalize_impl(float* a)
     {
         f32x4 va = v_load3_vec(a);
-        f32x4 invLen = v_rsqrt_nr(v_dot3(va, va)); // fully abstracted
-        f32x4 vnorm = v_mul(va, invLen);
-        v_store3(a, vnorm);
+        // Delegate to the ISA-specific v_normalize3, which also handles zero-length input.
+        f32x4 vr = v_normalize3(va);
+        v_store3(a, vr);
     }
 
     // Normalize with magnitude: r = normalize(a) * r
diff --git a/Engine/source/math/isa/avx/avx_intrinsics.h b/Engine/source/math/isa/avx/avx_intrinsics.h
index a67f900b4..2dcdf513b 100644
--- a/Engine/source/math/isa/avx/avx_intrinsics.h
+++ b/Engine/source/math/isa/avx/avx_intrinsics.h
@@ -183,8 +183,18 @@ namespace
 
     inline f32x4 v_normalize3(f32x4 v)
     {
-        f32x4 inv = v_rsqrt_nr(v_dot3(v, v));
-        return _mm_mul_ps(v, inv);
+        // A zero-length input cannot be normalized: where dot3(v, v) == 0 the
+        // result is replaced with the fixed fallback vector below.
+        const f32x4 zero = _mm_setzero_ps();
+        const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // lanes 0..3 = {0,0,1,0}
+        f32x4 dot = v_dot3(v, v);
+
+        f32x4 inv = v_rsqrt_nr(dot);
+        f32x4 isZero = _mm_cmpeq_ps(dot, zero);
+        f32x4 norm = _mm_mul_ps(v, inv);
+
+        // Lanes whose mask is set take fallback; the rest keep norm.
+        return _mm_blendv_ps(norm, fallback, isZero);
     }
 
     // adds all 4 lanes together.
diff --git a/Engine/source/math/isa/avx2/avx2_intrinsics.h b/Engine/source/math/isa/avx2/avx2_intrinsics.h
index 67f0df04a..e276e4c6e 100644
--- a/Engine/source/math/isa/avx2/avx2_intrinsics.h
+++ b/Engine/source/math/isa/avx2/avx2_intrinsics.h
@@ -183,8 +183,18 @@ namespace
 
     inline f32x4 v_normalize3(f32x4 v)
     {
-        f32x4 inv = v_rsqrt_nr(v_dot3(v, v));
-        return _mm_mul_ps(v, inv);
+        // A zero-length input cannot be normalized: where dot3(v, v) == 0 the
+        // result is replaced with the fixed fallback vector below.
+        const f32x4 zero = _mm_setzero_ps();
+        const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // lanes 0..3 = {0,0,1,0}
+        f32x4 dot = v_dot3(v, v);
+
+        f32x4 inv = v_rsqrt_nr(dot);
+        f32x4 isZero = _mm_cmpeq_ps(dot, zero);
+        f32x4 norm = _mm_mul_ps(v, inv);
+
+        // Lanes whose mask is set take fallback; the rest keep norm.
+        return _mm_blendv_ps(norm, fallback, isZero);
     }
 
     // adds all 4 lanes together.
diff --git a/Engine/source/math/isa/neon/neon_intrinsics.h b/Engine/source/math/isa/neon/neon_intrinsics.h
index e6e8ef123..888adab45 100644
--- a/Engine/source/math/isa/neon/neon_intrinsics.h
+++ b/Engine/source/math/isa/neon/neon_intrinsics.h
@@ -205,8 +205,23 @@ namespace
 
     inline f32x4 v_normalize3(f32x4 v)
     {
-        f32x4 inv = v_rsqrt_nr(v_dot3(v,v));
-        return vmulq_f32(v, inv);
+        // A zero-length input cannot be normalized: where dot3(v, v) == 0 the
+        // result is replaced with the fixed fallback vector below.
+        const float32x4_t zero = vdupq_n_f32(0.0f);
+        // lanes 0..3 = {0,0,1,0}; built with an intrinsic because brace-initialising
+        // a NEON vector is not portable across compilers.
+        const float32x4_t fallback = vsetq_lane_f32(1.0f, zero, 2);
+
+        f32x4 dot = v_dot3(v, v);
+
+        // dot == 0?
+        uint32x4_t isZero = vceqq_f32(dot, zero);
+
+        f32x4 inv = v_rsqrt_nr(dot);
+        f32x4 norm = vmulq_f32(v, inv);
+
+        // Select fallback when zero
+        return vbslq_f32(isZero, fallback, norm);
     }
 
     inline f32x4 v_hadd4(f32x4 a)
diff --git a/Engine/source/math/isa/sse2/sse2_intrinsics.h b/Engine/source/math/isa/sse2/sse2_intrinsics.h
index 71a95c1b7..63243d2d3 100644
--- a/Engine/source/math/isa/sse2/sse2_intrinsics.h
+++ b/Engine/source/math/isa/sse2/sse2_intrinsics.h
@@ -216,8 +216,25 @@ namespace
 
     inline f32x4 v_normalize3(f32x4 v)
     {
-        f32x4 inv = v_rsqrt_nr(v_dot3(v, v));
-        return _mm_mul_ps(v, inv);
+        // A zero-length input cannot be normalized: where dot3(v, v) == 0 the
+        // result is replaced with the fixed fallback vector below.
+        const f32x4 zero = _mm_setzero_ps();
+        const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // lanes 0..3 = {0,0,1,0}
+        f32x4 dot = v_dot3(v, v);
+
+        f32x4 inv = v_rsqrt_nr(dot);
+        f32x4 isZero = _mm_cmpeq_ps(dot, zero);
+
+        f32x4 norm = _mm_mul_ps(v, inv);
+
+        // SSE2 has no blendv instruction, so build the select from bit masks:
+        // (isZero & fallback) | (~isZero & norm)
+        f32x4 result = _mm_or_ps(
+            _mm_and_ps(isZero, fallback),
+            _mm_andnot_ps(isZero, norm)
+        );
+
+        return result;
     }
 
     // adds all 4 lanes together.
diff --git a/Engine/source/math/isa/sse41/sse41_intrinsics.h b/Engine/source/math/isa/sse41/sse41_intrinsics.h
index 2ea63e6b5..58d66ca6c 100644
--- a/Engine/source/math/isa/sse41/sse41_intrinsics.h
+++ b/Engine/source/math/isa/sse41/sse41_intrinsics.h
@@ -195,8 +195,18 @@ namespace
 
     inline f32x4 v_normalize3(f32x4 v)
     {
-        f32x4 inv = v_rsqrt_nr(v_dot3(v, v));
-        return _mm_mul_ps(v, inv);
+        // A zero-length input cannot be normalized: where dot3(v, v) == 0 the
+        // result is replaced with the fixed fallback vector below.
+        const f32x4 zero = _mm_setzero_ps();
+        const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // lanes 0..3 = {0,0,1,0}
+        f32x4 dot = v_dot3(v, v);
+
+        f32x4 inv = v_rsqrt_nr(dot);
+        f32x4 isZero = _mm_cmpeq_ps(dot, zero);
+        f32x4 norm = _mm_mul_ps(v, inv);
+
+        // Lanes whose mask is set take fallback; the rest keep norm.
+        return _mm_blendv_ps(norm, fallback, isZero);
     }
 
     // adds all 4 lanes together.
diff --git a/Tools/CMake/torque_macros.cmake b/Tools/CMake/torque_macros.cmake
index f799e8c39..9b98a5b99 100644
--- a/Tools/CMake/torque_macros.cmake
+++ b/Tools/CMake/torque_macros.cmake
@@ -142,7 +142,10 @@ function(add_math_backend name compile_defs)
         return()
     endif()
 
-    add_library(math_${name} OBJECT ${SRC})
+    # Also list the shared math/impl/*.inl implementation files on the backend target.
+    file(GLOB_RECURSE INL CONFIGURE_DEPENDS "math/impl/*.inl")
+
+    add_library(math_${name} OBJECT ${SRC} ${INL})
 
     message(STATUS "adding math library for isa ${name}")
     target_include_directories(math_${name} PUBLIC