add normal safety

Wrap zero-length safety checks around normalization. This was already done in the scalar math, so we may as well do it here too, just in case.

Add the impl.inl files to the libraries so they can actually be found.
This commit is contained in:
marauder2k7 2026-03-04 23:49:08 +00:00
parent 5a6467d54a
commit 8c1acbd1da
7 changed files with 61 additions and 14 deletions

View file

@ -85,9 +85,8 @@ namespace math_backend::float3
// Normalize a 3-component float array in place: a = a / |a|.
// Delegates to v_normalize3 so the zero-length safety fallback lives in
// one place instead of being re-derived from rsqrt(dot) here.
inline void float3_normalize_impl(float* a)
{
   f32x4 va = v_load3_vec(a);
   f32x4 vr = v_normalize3(va);
   v_store3(a, vr);
}
// Normalize with magnitude: r = normalize(a) * r

View file

@ -183,8 +183,15 @@ namespace
// Normalize the xyz lanes of v: v * rsqrt(dot3(v, v)).
// Safety: a zero-length input would feed 0 into rsqrt and produce inf/NaN,
// so those lanes are replaced with a unit-z fallback {0,0,1,0}.
// NOTE(review): only an exact dot == 0 is caught; denormal-length vectors
// still go through rsqrt — confirm that is acceptable for callers.
inline f32x4 v_normalize3(f32x4 v)
{
   const f32x4 zero     = _mm_setzero_ps();
   const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0} unit z
   f32x4 dot    = v_dot3(v, v);
   f32x4 inv    = v_rsqrt_nr(dot);
   f32x4 isZero = _mm_cmpeq_ps(dot, zero); // all-ones lane mask where dot == 0
   f32x4 norm   = _mm_mul_ps(v, inv);
   // Per-lane select: fallback where isZero, norm elsewhere (SSE4.1 blendv).
   return _mm_blendv_ps(norm, fallback, isZero);
}
// adds all 4 lanes together.

View file

@ -183,8 +183,15 @@ namespace
// Normalize the xyz lanes of v: v * rsqrt(dot3(v, v)).
// Safety: a zero-length input would feed 0 into rsqrt and produce inf/NaN,
// so those lanes are replaced with a unit-z fallback {0,0,1,0}.
// NOTE(review): only an exact dot == 0 is caught; denormal-length vectors
// still go through rsqrt — confirm that is acceptable for callers.
inline f32x4 v_normalize3(f32x4 v)
{
   const f32x4 zero     = _mm_setzero_ps();
   const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0} unit z
   f32x4 dot    = v_dot3(v, v);
   f32x4 inv    = v_rsqrt_nr(dot);
   f32x4 isZero = _mm_cmpeq_ps(dot, zero); // all-ones lane mask where dot == 0
   f32x4 norm   = _mm_mul_ps(v, inv);
   // Per-lane select: fallback where isZero, norm elsewhere (SSE4.1 blendv).
   return _mm_blendv_ps(norm, fallback, isZero);
}
// adds all 4 lanes together.

View file

@ -205,8 +205,19 @@ namespace
// Normalize the xyz lanes of v: v * rsqrt(dot3(v, v)) — NEON backend.
// Safety: a zero-length input would feed 0 into rsqrt and produce inf/NaN,
// so those lanes are replaced with a unit-z fallback {0,0,1,0}.
// NOTE(review): only an exact dot == 0 is caught; denormal-length vectors
// still go through rsqrt — confirm that is acceptable for callers.
inline f32x4 v_normalize3(f32x4 v)
{
   const float32x4_t zero     = vdupq_n_f32(0.0f);
   const float32x4_t fallback = {0.0f, 0.0f, 1.0f, 0.0f}; // unit z
   f32x4 dot = v_dot3(v, v);
   // All-ones lane mask where dot == 0 (zero-length input).
   uint32x4_t isZero = vceqq_f32(dot, zero);
   f32x4 inv  = v_rsqrt_nr(dot);
   f32x4 norm = vmulq_f32(v, inv);
   // Bit-select: fallback where the mask is set, norm elsewhere.
   return vbslq_f32(isZero, fallback, norm);
}
inline f32x4 v_hadd4(f32x4 a)

View file

@ -216,8 +216,22 @@ namespace
// Normalize the xyz lanes of v: v * rsqrt(dot3(v, v)) — SSE2-only backend.
// Safety: a zero-length input would feed 0 into rsqrt and produce inf/NaN,
// so those lanes are replaced with a unit-z fallback {0,0,1,0}.
// NOTE(review): only an exact dot == 0 is caught; denormal-length vectors
// still go through rsqrt — confirm that is acceptable for callers.
inline f32x4 v_normalize3(f32x4 v)
{
   const f32x4 zero     = _mm_setzero_ps();
   const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0} unit z
   f32x4 dot    = v_dot3(v, v);
   f32x4 inv    = v_rsqrt_nr(dot);
   f32x4 isZero = _mm_cmpeq_ps(dot, zero); // all-ones lane mask where dot == 0
   f32x4 norm   = _mm_mul_ps(v, inv);
   // SSE2-compatible lane select (no SSE4.1 blendv available):
   // (mask & fallback) | (~mask & norm) — the x86 equivalent of NEON's vbsl.
   f32x4 result = _mm_or_ps(
      _mm_and_ps(isZero, fallback),
      _mm_andnot_ps(isZero, norm));
   return result;
}
// adds all 4 lanes together.

View file

@ -195,8 +195,15 @@ namespace
// Normalize the xyz lanes of v: v * rsqrt(dot3(v, v)).
// Safety: a zero-length input would feed 0 into rsqrt and produce inf/NaN,
// so those lanes are replaced with a unit-z fallback {0,0,1,0}.
// NOTE(review): only an exact dot == 0 is caught; denormal-length vectors
// still go through rsqrt — confirm that is acceptable for callers.
inline f32x4 v_normalize3(f32x4 v)
{
   const f32x4 zero     = _mm_setzero_ps();
   const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0} unit z
   f32x4 dot    = v_dot3(v, v);
   f32x4 inv    = v_rsqrt_nr(dot);
   f32x4 isZero = _mm_cmpeq_ps(dot, zero); // all-ones lane mask where dot == 0
   f32x4 norm   = _mm_mul_ps(v, inv);
   // Per-lane select: fallback where isZero, norm elsewhere (SSE4.1 blendv).
   return _mm_blendv_ps(norm, fallback, isZero);
}
// adds all 4 lanes together.