From ac6ec05690e52c85c9858a134d02050b8054559b Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Thu, 5 Mar 2026 14:16:11 +0000 Subject: [PATCH] fix drift turns out we require the dot product to be in all rows --- Engine/source/math/isa/avx/avx_intrinsics.h | 157 +++++++++++++++++- Engine/source/math/isa/avx2/avx2_intrinsics.h | 157 +++++++++++++++++- Engine/source/math/isa/sse2/sse2_intrinsics.h | 96 +++++++++-- .../source/math/isa/sse41/sse41_intrinsics.h | 81 ++++++++- 4 files changed, 456 insertions(+), 35 deletions(-) diff --git a/Engine/source/math/isa/avx/avx_intrinsics.h b/Engine/source/math/isa/avx/avx_intrinsics.h index 2dcdf513b..28a5fa4fa 100644 --- a/Engine/source/math/isa/avx/avx_intrinsics.h +++ b/Engine/source/math/isa/avx/avx_intrinsics.h @@ -136,7 +136,8 @@ namespace { f32x4 r = _mm_rcp_ps(b); f32x4 two = _mm_set1_ps(2.0f); - return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); } // Divide fast ( b = recip eg 1/b) @@ -144,14 +145,12 @@ namespace inline f32x4 v_rsqrt_nr(f32x4 x) { - f32x4 r = _mm_rsqrt_ps(x); - f32x4 half = _mm_set1_ps(0.5f); f32x4 three = _mm_set1_ps(3.0f); - r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); - - return _mm_mul_ps(r, half); + f32x4 r = _mm_rsqrt_ps(x); + f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r); + return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr)); } //------------------------------------------------------ @@ -161,13 +160,13 @@ namespace // full dot4 inline f32x4 v_dot4(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1 + return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into all lanes } // dot3 (ignores w) inline f32x4 v_dot3(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1 + return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into all lanes } // cross product xyz only. 
@@ -392,4 +391,146 @@ namespace f32x4 c0 = v_cross(r1, r2); return v_dot3(r0, c0); // splatted determinant } + + //--------------------------------------------------------- + // BATCH INTRINSICS + //--------------------------------------------------------- + typedef __m256 f32x8; + + struct vec4_batch8 + { + f32x8 x; + f32x8 y; + f32x8 z; + f32x8 w; + }; + + struct vec3_batch8 + { + f32x8 x; + f32x8 y; + f32x8 z; + }; + + struct vec4_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + f32x4 w; + }; + + struct vec3_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + }; + + inline vec3_batch8 load_vec3_batch8(const float* ptr) + { + // ptr points to x0 + // layout: x y z x y z ... + + vec3_batch8 r; + + r.x = _mm256_set_ps( + ptr[21], ptr[18], ptr[15], ptr[12], + ptr[9], ptr[6], ptr[3], ptr[0]); + + r.y = _mm256_set_ps( + ptr[22], ptr[19], ptr[16], ptr[13], + ptr[10], ptr[7], ptr[4], ptr[1]); + + r.z = _mm256_set_ps( + ptr[23], ptr[20], ptr[17], ptr[14], + ptr[11], ptr[8], ptr[5], ptr[2]); + + return r; + } + + inline vec3_batch4 load_vec3_batch4(const float* ptr) + { + vec3_batch4 r; + + r.x = _mm_set_ps( + ptr[9], ptr[6], ptr[3], ptr[0]); + + r.y = _mm_set_ps( + ptr[10], ptr[7], ptr[4], ptr[1]); + + r.z = _mm_set_ps( + ptr[11], ptr[8], ptr[5], ptr[2]); + + return r; + } + + // Store the result back out to a float array 8 at a time + inline void store_f32x8(float* out, f32x8 v) + { + _mm256_storeu_ps(out, v); + } + + // Store the result back out to a float array 4 at a time + inline void store_f32x4(float* out, f32x4 v) + { + _mm_storeu_ps(out, v); + } + + // Store the result back to a float3 array with size of 8 + inline void store_vec3_batch8(float* out, const vec3_batch8& v) + { + alignas(32) float xs[8]; + alignas(32) float ys[8]; + alignas(32) float zs[8]; + + _mm256_store_ps(xs, v.x); + _mm256_store_ps(ys, v.y); + _mm256_store_ps(zs, v.z); + + for (int i = 0; i < 8; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + // Store 
the result back to a float3 array with size of 4 + inline void store_vec3_batch4(float* out, const vec3_batch4& v) + { + alignas(16) float xs[8]; + alignas(16) float ys[8]; + alignas(16) float zs[8]; + + _mm_store_ps(xs, v.x); + _mm_store_ps(ys, v.y); + _mm_store_ps(zs, v.z); + + for (int i = 0; i < 4; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b) + { + f32x4 mulx = _mm_mul_ps(a.x, b.x); + f32x4 muly = _mm_mul_ps(a.y, b.y); + f32x4 mulz = _mm_mul_ps(a.z, b.z); + + return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); + } + + inline f32x8 vec3_batch8_dot(const vec3_batch8& a, const vec3_batch8& b) + { + f32x8 mulx = _mm256_mul_ps(a.x, b.x); + f32x8 muly = _mm256_mul_ps(a.y, b.y); + f32x8 mulz = _mm256_mul_ps(a.z, b.z); + + return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz); + } + } diff --git a/Engine/source/math/isa/avx2/avx2_intrinsics.h b/Engine/source/math/isa/avx2/avx2_intrinsics.h index e276e4c6e..bb367f198 100644 --- a/Engine/source/math/isa/avx2/avx2_intrinsics.h +++ b/Engine/source/math/isa/avx2/avx2_intrinsics.h @@ -136,7 +136,8 @@ namespace { f32x4 r = _mm_rcp_ps(b); f32x4 two = _mm_set1_ps(2.0f); - return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); } // Divide fast ( b = recip eg 1/b) @@ -144,14 +145,12 @@ namespace inline f32x4 v_rsqrt_nr(f32x4 x) { - f32x4 r = _mm_rsqrt_ps(x); - f32x4 half = _mm_set1_ps(0.5f); f32x4 three = _mm_set1_ps(3.0f); - r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); - - return _mm_mul_ps(r, half); + f32x4 r = _mm_rsqrt_ps(x); + f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r); + return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr)); } //------------------------------------------------------ @@ -161,13 +160,13 @@ namespace // full dot4 
inline f32x4 v_dot4(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1 + return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into all lanes } // dot3 (ignores w) inline f32x4 v_dot3(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1 + return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into all lanes } // cross product xyz only. @@ -392,4 +391,146 @@ namespace f32x4 c0 = v_cross(r1, r2); return v_dot3(r0, c0); // splatted determinant } + + //--------------------------------------------------------- + // BATCH INTRINSICS + //--------------------------------------------------------- + typedef __m256 f32x8; + + struct vec4_batch8 + { + f32x8 x; + f32x8 y; + f32x8 z; + f32x8 w; + }; + + struct vec3_batch8 + { + f32x8 x; + f32x8 y; + f32x8 z; + }; + + struct vec4_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + f32x4 w; + }; + + struct vec3_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + }; + + inline vec3_batch8 load_vec3_batch8(const float* ptr) + { + // ptr points to x0 + // layout: x y z x y z ... 
+ + vec3_batch8 r; + + r.x = _mm256_set_ps( + ptr[21], ptr[18], ptr[15], ptr[12], + ptr[9], ptr[6], ptr[3], ptr[0]); + + r.y = _mm256_set_ps( + ptr[22], ptr[19], ptr[16], ptr[13], + ptr[10], ptr[7], ptr[4], ptr[1]); + + r.z = _mm256_set_ps( + ptr[23], ptr[20], ptr[17], ptr[14], + ptr[11], ptr[8], ptr[5], ptr[2]); + + return r; + } + + inline vec3_batch4 load_vec3_batch4(const float* ptr) + { + vec3_batch4 r; + + r.x = _mm_set_ps( + ptr[9], ptr[6], ptr[3], ptr[0]); + + r.y = _mm_set_ps( + ptr[10], ptr[7], ptr[4], ptr[1]); + + r.z = _mm_set_ps( + ptr[11], ptr[8], ptr[5], ptr[2]); + + return r; + } + + // Store the result back out to a float array 8 at a time + inline void store_f32x8(float* out, f32x8 v) + { + _mm256_storeu_ps(out, v); + } + + // Store the result back out to a float array 4 at a time + inline void store_f32x4(float* out, f32x4 v) + { + _mm_storeu_ps(out, v); + } + + // Store the result back to a float3 array with size of 8 + inline void store_vec3_batch8(float* out, const vec3_batch8& v) + { + alignas(32) float xs[8]; + alignas(32) float ys[8]; + alignas(32) float zs[8]; + + _mm256_store_ps(xs, v.x); + _mm256_store_ps(ys, v.y); + _mm256_store_ps(zs, v.z); + + for (int i = 0; i < 8; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + // Store the result back to a float3 array with size of 4 + inline void store_vec3_batch4(float* out, const vec3_batch4& v) + { + alignas(16) float xs[8]; + alignas(16) float ys[8]; + alignas(16) float zs[8]; + + _mm_store_ps(xs, v.x); + _mm_store_ps(ys, v.y); + _mm_store_ps(zs, v.z); + + for (int i = 0; i < 4; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b) + { + f32x4 mulx = _mm_mul_ps(a.x, b.x); + f32x4 muly = _mm_mul_ps(a.y, b.y); + f32x4 mulz = _mm_mul_ps(a.z, b.z); + + return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); + } + + inline f32x8 
vec3_batch8_dot(const vec3_batch8& a, const vec3_batch8& b) + { + f32x8 mulx = _mm256_mul_ps(a.x, b.x); + f32x8 muly = _mm256_mul_ps(a.y, b.y); + f32x8 mulz = _mm256_mul_ps(a.z, b.z); + + return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz); + } } diff --git a/Engine/source/math/isa/sse2/sse2_intrinsics.h b/Engine/source/math/isa/sse2/sse2_intrinsics.h index 63243d2d3..0150a78a5 100644 --- a/Engine/source/math/isa/sse2/sse2_intrinsics.h +++ b/Engine/source/math/isa/sse2/sse2_intrinsics.h @@ -149,7 +149,7 @@ namespace return _mm_and_ps(v, mask); } - //------------------------------------------------------ + //------------------------------------------------------ // Fast recip //------------------------------------------------------ @@ -158,7 +158,8 @@ namespace { f32x4 r = _mm_rcp_ps(b); f32x4 two = _mm_set1_ps(2.0f); - return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); } // Divide fast ( b = recip eg 1/b) @@ -166,14 +167,12 @@ namespace inline f32x4 v_rsqrt_nr(f32x4 x) { - f32x4 r = _mm_rsqrt_ps(x); - f32x4 half = _mm_set1_ps(0.5f); f32x4 three = _mm_set1_ps(3.0f); - r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); - - return _mm_mul_ps(r, half); + f32x4 r = _mm_rsqrt_ps(x); + f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r); + return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr)); } //------------------------------------------------------ @@ -184,9 +183,9 @@ namespace inline f32x4 v_dot4(f32x4 a, f32x4 b) { f32x4 prod = _mm_mul_ps(a, b); // multiply element-wise - f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1)); + f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2)); prod = _mm_add_ps(prod, shuf); - shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2)); + shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1)); prod = _mm_add_ps(prod, shuf); return prod; 
// f32x4, all lanes = dot(a,b) } @@ -305,10 +304,11 @@ namespace f32x4 z = v_dot4(m.r2, v); f32x4 w = v_dot4(m.r3, v); - // Pack lowest lane of each into a vector - f32x4 xy = _mm_unpacklo_ps(x, y); - f32x4 zw = _mm_unpacklo_ps(z, w); - return _mm_movelh_ps(xy, zw); + // combine to a vector + return _mm_set_ps(_mm_cvtss_f32(w), + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); } inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) @@ -531,4 +531,74 @@ namespace f32x4 c0 = v_cross(r1, r2); return v_dot3(r0, c0); // splatted determinant } + + //--------------------------------------------------------- + // BATCH INTRINSICS + //--------------------------------------------------------- + + struct vec4_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + f32x4 w; + }; + + struct vec3_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + }; + + + inline vec3_batch4 load_vec3_batch4(const float* ptr) + { + vec3_batch4 r; + + r.x = _mm_set_ps( + ptr[9], ptr[6], ptr[3], ptr[0]); + + r.y = _mm_set_ps( + ptr[10], ptr[7], ptr[4], ptr[1]); + + r.z = _mm_set_ps( + ptr[11], ptr[8], ptr[5], ptr[2]); + + return r; + } + + // Store the result back out to a float array 4 at a time + inline void store_f32x4(float* out, f32x4 v) + { + _mm_storeu_ps(out, v); + } + + // Store the result back to a float3 array with size of 4 + inline void store_vec3_batch4(float* out, const vec3_batch4& v) + { + alignas(16) float xs[8]; + alignas(16) float ys[8]; + alignas(16) float zs[8]; + + _mm_store_ps(xs, v.x); + _mm_store_ps(ys, v.y); + _mm_store_ps(zs, v.z); + + for (int i = 0; i < 4; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b) + { + f32x4 mulx = _mm_mul_ps(a.x, b.x); + f32x4 muly = _mm_mul_ps(a.y, b.y); + f32x4 mulz = _mm_mul_ps(a.z, b.z); + + return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); + } } diff --git a/Engine/source/math/isa/sse41/sse41_intrinsics.h 
b/Engine/source/math/isa/sse41/sse41_intrinsics.h index 58d66ca6c..4134480aa 100644 --- a/Engine/source/math/isa/sse41/sse41_intrinsics.h +++ b/Engine/source/math/isa/sse41/sse41_intrinsics.h @@ -148,7 +148,8 @@ namespace { f32x4 r = _mm_rcp_ps(b); f32x4 two = _mm_set1_ps(2.0f); - return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); } // Divide fast ( b = recip eg 1/b) @@ -156,14 +157,12 @@ namespace inline f32x4 v_rsqrt_nr(f32x4 x) { - f32x4 r = _mm_rsqrt_ps(x); - f32x4 half = _mm_set1_ps(0.5f); f32x4 three = _mm_set1_ps(3.0f); - r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); - - return _mm_mul_ps(r, half); + f32x4 r = _mm_rsqrt_ps(x); + f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r); + return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr)); } //------------------------------------------------------ @@ -502,4 +501,74 @@ namespace f32x4 c0 = v_cross(r1, r2); return v_dot3(r0, c0); // splatted determinant } + + //--------------------------------------------------------- + // BATCH INTRINSICS + //--------------------------------------------------------- + + struct vec4_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + f32x4 w; + }; + + struct vec3_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + }; + + + inline vec3_batch4 load_vec3_batch4(const float* ptr) + { + vec3_batch4 r; + + r.x = _mm_set_ps( + ptr[9], ptr[6], ptr[3], ptr[0]); + + r.y = _mm_set_ps( + ptr[10], ptr[7], ptr[4], ptr[1]); + + r.z = _mm_set_ps( + ptr[11], ptr[8], ptr[5], ptr[2]); + + return r; + } + + // Store the result back out to a float array 4 at a time + inline void store_f32x4(float* out, f32x4 v) + { + _mm_storeu_ps(out, v); + } + + // Store the result back to a float3 array with size of 4 + inline void store_vec3_batch4(float* out, const vec3_batch4& v) + { + alignas(16) float xs[8]; + alignas(16) float ys[8]; + alignas(16) 
float zs[8]; + + _mm_store_ps(xs, v.x); + _mm_store_ps(ys, v.y); + _mm_store_ps(zs, v.z); + + for (int i = 0; i < 4; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b) + { + f32x4 mulx = _mm_mul_ps(a.x, b.x); + f32x4 muly = _mm_mul_ps(a.y, b.y); + f32x4 mulz = _mm_mul_ps(a.z, b.z); + + return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); + } }