Fix drift: it turns out we require the dot product to be broadcast into all lanes

This commit is contained in:
marauder2k7 2026-03-05 14:16:11 +00:00
parent 8c1acbd1da
commit ac6ec05690
4 changed files with 456 additions and 35 deletions

View file

@ -136,7 +136,8 @@ namespace
{
f32x4 r = _mm_rcp_ps(b);
f32x4 two = _mm_set1_ps(2.0f);
return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r)));
r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
}
// Divide fast ( b = recip eg 1/b)
@ -144,14 +145,12 @@ namespace
inline f32x4 v_rsqrt_nr(f32x4 x)
{
f32x4 r = _mm_rsqrt_ps(x);
f32x4 half = _mm_set1_ps(0.5f);
f32x4 three = _mm_set1_ps(3.0f);
r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r)));
return _mm_mul_ps(r, half);
f32x4 r = _mm_rsqrt_ps(x);
f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r);
return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr));
}
//------------------------------------------------------
@ -161,13 +160,13 @@ namespace
// full dot4
inline f32x4 v_dot4(f32x4 a, f32x4 b)
{
return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1
return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into all lanes
}
// dot3 (ignores w)
inline f32x4 v_dot3(f32x4 a, f32x4 b)
{
return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1
return _mm_dp_ps(a, b, 0x7F); // f32x4, xyz lanes into all lanes
}
// cross product xyz only.
@ -392,4 +391,146 @@ namespace
f32x4 c0 = v_cross(r1, r2);
return v_dot3(r0, c0); // splatted determinant
}
//---------------------------------------------------------
// BATCH INTRINSICS
//---------------------------------------------------------
// 8-wide float vector (AVX); f32x4 (__m128, SSE) is defined earlier in this file.
typedef __m256 f32x8;
// Structure-of-arrays batch of eight 4-component vectors:
// lane i of x/y/z/w holds the components of vector i.
struct vec4_batch8
{
f32x8 x;
f32x8 y;
f32x8 z;
f32x8 w;
};
// Structure-of-arrays batch of eight 3-component vectors.
struct vec3_batch8
{
f32x8 x;
f32x8 y;
f32x8 z;
};
// Structure-of-arrays batch of four 4-component vectors.
struct vec4_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
f32x4 w;
};
// Structure-of-arrays batch of four 3-component vectors.
struct vec3_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
};
// Gather eight interleaved xyz triplets (x0 y0 z0 x1 y1 z1 ...) into SoA form.
// _mm256_setr_ps takes lanes in ascending order, so the stride-3 reads are
// written lowest-lane first. Performs 24 scalar loads; 'ptr' needs no alignment.
inline vec3_batch8 load_vec3_batch8(const float* ptr)
{
vec3_batch8 out;
out.x = _mm256_setr_ps(
ptr[0], ptr[3], ptr[6], ptr[9],
ptr[12], ptr[15], ptr[18], ptr[21]);
out.y = _mm256_setr_ps(
ptr[1], ptr[4], ptr[7], ptr[10],
ptr[13], ptr[16], ptr[19], ptr[22]);
out.z = _mm256_setr_ps(
ptr[2], ptr[5], ptr[8], ptr[11],
ptr[14], ptr[17], ptr[20], ptr[23]);
return out;
}
// Gather four interleaved xyz triplets (x0 y0 z0 ... z3) into SoA form.
// _mm_setr_ps takes lanes in ascending order; 'ptr' needs no alignment.
inline vec3_batch4 load_vec3_batch4(const float* ptr)
{
vec3_batch4 out;
out.x = _mm_setr_ps(ptr[0], ptr[3], ptr[6], ptr[9]);
out.y = _mm_setr_ps(ptr[1], ptr[4], ptr[7], ptr[10]);
out.z = _mm_setr_ps(ptr[2], ptr[5], ptr[8], ptr[11]);
return out;
}
// Store the result back out to a float array 8 at a time.
// Unaligned store: 'out' may have any alignment; writes 32 bytes.
inline void store_f32x8(float* out, f32x8 v)
{
_mm256_storeu_ps(out, v);
}
// Store the result back out to a float array 4 at a time.
// Unaligned store: 'out' may have any alignment; writes 16 bytes.
inline void store_f32x4(float* out, f32x4 v)
{
_mm_storeu_ps(out, v);
}
// Store the result back to a float3 array with size of 8.
// Scatters the SoA lanes back into interleaved x y z triplets (24 floats).
// Spills each register to an aligned scratch buffer, then re-interleaves.
inline void store_vec3_batch8(float* out, const vec3_batch8& v)
{
alignas(32) float xs[8];
alignas(32) float ys[8];
alignas(32) float zs[8];
_mm256_store_ps(xs, v.x);
_mm256_store_ps(ys, v.y);
_mm256_store_ps(zs, v.z);
float* dst = out;
for (int lane = 0; lane < 8; ++lane, dst += 3)
{
dst[0] = xs[lane];
dst[1] = ys[lane];
dst[2] = zs[lane];
}
}
// Store the result back to a float3 array with size of 4.
// Scatters the SoA lanes back into interleaved x y z triplets (12 floats).
inline void store_vec3_batch4(float* out, const vec3_batch4& v)
{
// Scratch buffers need only 4 lanes each; the previous [8] was an
// oversized copy/paste from the batch8 variant. alignas(16) satisfies
// the alignment requirement of _mm_store_ps.
alignas(16) float xs[4];
alignas(16) float ys[4];
alignas(16) float zs[4];
_mm_store_ps(xs, v.x);
_mm_store_ps(ys, v.y);
_mm_store_ps(zs, v.z);
for (int i = 0; i < 4; ++i)
{
out[i * 3 + 0] = xs[i];
out[i * 3 + 1] = ys[i];
out[i * 3 + 2] = zs[i];
}
}
// Per-lane dot product of two batches of four vec3s:
// result lane i = a[i].x*b[i].x + a[i].y*b[i].y + a[i].z*b[i].z.
// Accumulation order matches (x + y) + z for bit-identical results.
inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b)
{
f32x4 acc = _mm_mul_ps(a.x, b.x);
acc = _mm_add_ps(acc, _mm_mul_ps(a.y, b.y));
acc = _mm_add_ps(acc, _mm_mul_ps(a.z, b.z));
return acc;
}
// Per-lane dot product of two batches of eight vec3s:
// result lane i = a[i].x*b[i].x + a[i].y*b[i].y + a[i].z*b[i].z.
// Accumulation order matches (x + y) + z for bit-identical results.
inline f32x8 vec3_batch8_dot(const vec3_batch8& a, const vec3_batch8& b)
{
f32x8 acc = _mm256_mul_ps(a.x, b.x);
acc = _mm256_add_ps(acc, _mm256_mul_ps(a.y, b.y));
acc = _mm256_add_ps(acc, _mm256_mul_ps(a.z, b.z));
return acc;
}
}

View file

@ -136,7 +136,8 @@ namespace
{
f32x4 r = _mm_rcp_ps(b);
f32x4 two = _mm_set1_ps(2.0f);
return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r)));
r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
}
// Divide fast ( b = recip eg 1/b)
@ -144,14 +145,12 @@ namespace
inline f32x4 v_rsqrt_nr(f32x4 x)
{
f32x4 r = _mm_rsqrt_ps(x);
f32x4 half = _mm_set1_ps(0.5f);
f32x4 three = _mm_set1_ps(3.0f);
r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r)));
return _mm_mul_ps(r, half);
f32x4 r = _mm_rsqrt_ps(x);
f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r);
return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr));
}
//------------------------------------------------------
@ -161,13 +160,13 @@ namespace
// full dot4
inline f32x4 v_dot4(f32x4 a, f32x4 b)
{
return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1
return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into all lanes
}
// dot3 (ignores w)
inline f32x4 v_dot3(f32x4 a, f32x4 b)
{
return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1
return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into all lanes
}
// cross product xyz only.
@ -392,4 +391,146 @@ namespace
f32x4 c0 = v_cross(r1, r2);
return v_dot3(r0, c0); // splatted determinant
}
//---------------------------------------------------------
// BATCH INTRINSICS
//---------------------------------------------------------
typedef __m256 f32x8;
struct vec4_batch8
{
f32x8 x;
f32x8 y;
f32x8 z;
f32x8 w;
};
struct vec3_batch8
{
f32x8 x;
f32x8 y;
f32x8 z;
};
struct vec4_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
f32x4 w;
};
struct vec3_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
};
inline vec3_batch8 load_vec3_batch8(const float* ptr)
{
// ptr points to x0
// layout: x y z x y z ...
vec3_batch8 r;
r.x = _mm256_set_ps(
ptr[21], ptr[18], ptr[15], ptr[12],
ptr[9], ptr[6], ptr[3], ptr[0]);
r.y = _mm256_set_ps(
ptr[22], ptr[19], ptr[16], ptr[13],
ptr[10], ptr[7], ptr[4], ptr[1]);
r.z = _mm256_set_ps(
ptr[23], ptr[20], ptr[17], ptr[14],
ptr[11], ptr[8], ptr[5], ptr[2]);
return r;
}
inline vec3_batch4 load_vec3_batch4(const float* ptr)
{
vec3_batch4 r;
r.x = _mm_set_ps(
ptr[9], ptr[6], ptr[3], ptr[0]);
r.y = _mm_set_ps(
ptr[10], ptr[7], ptr[4], ptr[1]);
r.z = _mm_set_ps(
ptr[11], ptr[8], ptr[5], ptr[2]);
return r;
}
// Store the result back out to a float array 8 at a time
inline void store_f32x8(float* out, f32x8 v)
{
_mm256_storeu_ps(out, v);
}
// Store the result back out to a float array 4 at a time
inline void store_f32x4(float* out, f32x4 v)
{
_mm_storeu_ps(out, v);
}
// Store the result back to a float3 array with size of 8
inline void store_vec3_batch8(float* out, const vec3_batch8& v)
{
alignas(32) float xs[8];
alignas(32) float ys[8];
alignas(32) float zs[8];
_mm256_store_ps(xs, v.x);
_mm256_store_ps(ys, v.y);
_mm256_store_ps(zs, v.z);
for (int i = 0; i < 8; ++i)
{
out[i * 3 + 0] = xs[i];
out[i * 3 + 1] = ys[i];
out[i * 3 + 2] = zs[i];
}
}
// Store the result back to a float3 array with size of 4.
// Scatters the SoA lanes back into interleaved x y z triplets (12 floats).
inline void store_vec3_batch4(float* out, const vec3_batch4& v)
{
// Scratch buffers need only 4 lanes each; the previous [8] was an
// oversized copy/paste from the batch8 variant. alignas(16) satisfies
// the alignment requirement of _mm_store_ps.
alignas(16) float xs[4];
alignas(16) float ys[4];
alignas(16) float zs[4];
_mm_store_ps(xs, v.x);
_mm_store_ps(ys, v.y);
_mm_store_ps(zs, v.z);
for (int i = 0; i < 4; ++i)
{
out[i * 3 + 0] = xs[i];
out[i * 3 + 1] = ys[i];
out[i * 3 + 2] = zs[i];
}
}
inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b)
{
f32x4 mulx = _mm_mul_ps(a.x, b.x);
f32x4 muly = _mm_mul_ps(a.y, b.y);
f32x4 mulz = _mm_mul_ps(a.z, b.z);
return _mm_add_ps(_mm_add_ps(mulx, muly), mulz);
}
inline f32x8 vec3_batch8_dot(const vec3_batch8& a, const vec3_batch8& b)
{
f32x8 mulx = _mm256_mul_ps(a.x, b.x);
f32x8 muly = _mm256_mul_ps(a.y, b.y);
f32x8 mulz = _mm256_mul_ps(a.z, b.z);
return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz);
}
}

View file

@ -149,7 +149,7 @@ namespace
return _mm_and_ps(v, mask);
}
//------------------------------------------------------
//------------------------------------------------------
// Fast recip
//------------------------------------------------------
@ -158,7 +158,8 @@ namespace
{
f32x4 r = _mm_rcp_ps(b);
f32x4 two = _mm_set1_ps(2.0f);
return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r)));
r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
}
// Divide fast ( b = recip eg 1/b)
@ -166,14 +167,12 @@ namespace
inline f32x4 v_rsqrt_nr(f32x4 x)
{
f32x4 r = _mm_rsqrt_ps(x);
f32x4 half = _mm_set1_ps(0.5f);
f32x4 three = _mm_set1_ps(3.0f);
r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r)));
return _mm_mul_ps(r, half);
f32x4 r = _mm_rsqrt_ps(x);
f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r);
return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr));
}
//------------------------------------------------------
@ -184,9 +183,9 @@ namespace
inline f32x4 v_dot4(f32x4 a, f32x4 b)
{
f32x4 prod = _mm_mul_ps(a, b); // multiply element-wise
f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1));
f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(0, 0, 3, 2));
prod = _mm_add_ps(prod, shuf);
shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2));
shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(0, 0, 0, 1));
prod = _mm_add_ps(prod, shuf);
return prod; // f32x4, all lanes = dot(a,b)
}
@ -305,10 +304,11 @@ namespace
f32x4 z = v_dot4(m.r2, v);
f32x4 w = v_dot4(m.r3, v);
// Pack lowest lane of each into a vector
f32x4 xy = _mm_unpacklo_ps(x, y);
f32x4 zw = _mm_unpacklo_ps(z, w);
return _mm_movelh_ps(xy, zw);
// combine to a vector
return _mm_set_ps(_mm_cvtss_f32(w),
_mm_cvtss_f32(z),
_mm_cvtss_f32(y),
_mm_cvtss_f32(x));
}
inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v)
@ -531,4 +531,74 @@ namespace
f32x4 c0 = v_cross(r1, r2);
return v_dot3(r0, c0); // splatted determinant
}
//---------------------------------------------------------
// BATCH INTRINSICS
//---------------------------------------------------------
struct vec4_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
f32x4 w;
};
struct vec3_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
};
inline vec3_batch4 load_vec3_batch4(const float* ptr)
{
vec3_batch4 r;
r.x = _mm_set_ps(
ptr[9], ptr[6], ptr[3], ptr[0]);
r.y = _mm_set_ps(
ptr[10], ptr[7], ptr[4], ptr[1]);
r.z = _mm_set_ps(
ptr[11], ptr[8], ptr[5], ptr[2]);
return r;
}
// Store the result back out to a float array 4 at a time
inline void store_f32x4(float* out, f32x4 v)
{
_mm_storeu_ps(out, v);
}
// Store the result back to a float3 array with size of 4.
// Scatters the SoA lanes back into interleaved x y z triplets (12 floats).
inline void store_vec3_batch4(float* out, const vec3_batch4& v)
{
// Scratch buffers need only 4 lanes each; the previous [8] was an
// oversized copy/paste from the batch8 variant. alignas(16) satisfies
// the alignment requirement of _mm_store_ps.
alignas(16) float xs[4];
alignas(16) float ys[4];
alignas(16) float zs[4];
_mm_store_ps(xs, v.x);
_mm_store_ps(ys, v.y);
_mm_store_ps(zs, v.z);
for (int i = 0; i < 4; ++i)
{
out[i * 3 + 0] = xs[i];
out[i * 3 + 1] = ys[i];
out[i * 3 + 2] = zs[i];
}
}
inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b)
{
f32x4 mulx = _mm_mul_ps(a.x, b.x);
f32x4 muly = _mm_mul_ps(a.y, b.y);
f32x4 mulz = _mm_mul_ps(a.z, b.z);
return _mm_add_ps(_mm_add_ps(mulx, muly), mulz);
}
}

View file

@ -148,7 +148,8 @@ namespace
{
f32x4 r = _mm_rcp_ps(b);
f32x4 two = _mm_set1_ps(2.0f);
return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r)));
r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
}
// Divide fast ( b = recip eg 1/b)
@ -156,14 +157,12 @@ namespace
inline f32x4 v_rsqrt_nr(f32x4 x)
{
f32x4 r = _mm_rsqrt_ps(x);
f32x4 half = _mm_set1_ps(0.5f);
f32x4 three = _mm_set1_ps(3.0f);
r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r)));
return _mm_mul_ps(r, half);
f32x4 r = _mm_rsqrt_ps(x);
f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r);
return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr));
}
//------------------------------------------------------
@ -502,4 +501,74 @@ namespace
f32x4 c0 = v_cross(r1, r2);
return v_dot3(r0, c0); // splatted determinant
}
//---------------------------------------------------------
// BATCH INTRINSICS
//---------------------------------------------------------
struct vec4_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
f32x4 w;
};
struct vec3_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
};
inline vec3_batch4 load_vec3_batch4(const float* ptr)
{
vec3_batch4 r;
r.x = _mm_set_ps(
ptr[9], ptr[6], ptr[3], ptr[0]);
r.y = _mm_set_ps(
ptr[10], ptr[7], ptr[4], ptr[1]);
r.z = _mm_set_ps(
ptr[11], ptr[8], ptr[5], ptr[2]);
return r;
}
// Store the result back out to a float array 4 at a time
inline void store_f32x4(float* out, f32x4 v)
{
_mm_storeu_ps(out, v);
}
// Store the result back to a float3 array with size of 4.
// Scatters the SoA lanes back into interleaved x y z triplets (12 floats).
inline void store_vec3_batch4(float* out, const vec3_batch4& v)
{
// Scratch buffers need only 4 lanes each; the previous [8] was an
// oversized copy/paste from the batch8 variant. alignas(16) satisfies
// the alignment requirement of _mm_store_ps.
alignas(16) float xs[4];
alignas(16) float ys[4];
alignas(16) float zs[4];
_mm_store_ps(xs, v.x);
_mm_store_ps(ys, v.y);
_mm_store_ps(zs, v.z);
for (int i = 0; i < 4; ++i)
{
out[i * 3 + 0] = xs[i];
out[i * 3 + 1] = ys[i];
out[i * 3 + 2] = zs[i];
}
}
inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b)
{
f32x4 mulx = _mm_mul_ps(a.x, b.x);
f32x4 muly = _mm_mul_ps(a.y, b.y);
f32x4 mulz = _mm_mul_ps(a.z, b.z);
return _mm_add_ps(_mm_add_ps(mulx, muly), mulz);
}
}