more intrinsics

add transform plane
added first batch function for mulp to intrinsics
This commit is contained in:
marauder2k7 2026-03-05 18:54:28 +00:00
parent ac6ec05690
commit add7f2a5d7
14 changed files with 710 additions and 113 deletions

View file

@ -32,6 +32,53 @@ namespace math_backend::mat44
m_store(m, ma);
}
// Transforms a plane (xyz = normal, w = distance) by the affine matrix `m`
// combined with a per-axis scale, writing the result to `plane_result`
// (xyz = renormalized normal, w = new distance).
inline void mat44_transform_plane_impl(const float* m, const float* scale, const float* plane, float* plane_result)
{
f32x4x4 M = m_load(m);
f32x4 plane_v = v_load(plane);
// Load the 3-component scale. NOTE(review): the w lane this leaves in
// scale_v matters below where the point is multiplied by it — confirm
// v_load3_vec fills w with 1 (a 0 there would zero the point's w and drop
// the matrix translation column in m_mul_vec4).
f32x4 scale_v = v_load3_vec(scale);
// Reciprocal of the scale (the _nr suffix suggests a Newton-Raphson
// refined rcp — TODO confirm); undefined for zero scale components.
f32x4 invScale = v_rcp_nr(scale_v);
// normal = plane.xyz
f32x4 normal = plane_v;
// apply Inv(s)
normal = v_mul(normal, invScale);
// multiply by Inv(Tr(m)) (only the rotation part matters)
f32x4 nx = v_mul(v_swizzle_singular_mask(normal, 0), M.r0);
f32x4 ny = v_mul(v_swizzle_singular_mask(normal, 1), M.r1);
f32x4 nz = v_mul(v_swizzle_singular_mask(normal, 2), M.r2);
normal = v_add(v_add(nx, ny), nz);
normal = v_normalize3(normal);
// compute point on plane
float d = v_extract0(v_swizzle_singular_mask(plane_v, 3));
// point = normal * -d lies on the plane; its w lane is -d*d at this point...
f32x4 point = v_mul(plane_v, v_set1(-d));
// ...so force w back to 1 before the full 4x4 multiply below.
point = v_preserve_w(point, v_set1(1.0f));
// apply scale
point = v_mul(point, scale_v);
// transform point by matrix
point = m_mul_vec4(M, point);
// compute new plane distance: d' = -(n' . p')
float newD = -v_extract0(v_dot3(point, normal));
alignas(16) float n[4];
v_store(n, normal);
plane_result[0] = n[0];
plane_result[1] = n[1];
plane_result[2] = n[2];
plane_result[3] = newD;
}
inline void mat44_get_scale_impl(const float* m, float* s)
{
f32x4x4 ma = m_load(m);
@ -341,4 +388,39 @@ namespace math_backend::mat44
m_store(m, mo);
}
//--------------------------------------------------
// MATRIX BATCH FUNCTIONS
//--------------------------------------------------
inline void mat44_batch_mul_pos3(const float* m, const float* points, size_t count, float* result)
{
size_t i = 0;
f32x4x4 ma = m_load(m);
// AVX has 8 lanes to play with
#if defined(MATH_SIMD_AVX2) || defined(MATH_SIMD_AVX)
// 8-wide AVX only
for (; i + 8 <= count; i += 8)
{
vec4_batch8 va = load_vec3_batch8(&points[i*3], 1.0f, false);
vec4_batch8 vr = m_mul_pos3_batch8(ma, va);
store_vec3_batch8(&result[i*3], vr);
}
#endif // MATH_SIMD_AVX2 || MATH_SIMD_AVX
// 4-wide
for (; i + 4 <= count; i += 4)
{
vec4_batch4 va = load_vec3_batch4(&points[i * 3], 1.0f, false);
vec4_batch4 vr = m_mul_pos3_batch4(ma, va);
store_vec3_batch4(&result[i * 3], vr);
}
for (; i < count; ++i)
{
size_t idx = i * 3;
mat44_mul_pos3_impl(m, &points[idx], &result[idx]);
}
}
} // namespace math_backend::mat44

View file

@ -3,6 +3,7 @@
#include "math/public/float3_dispatch.h"
#include "math/public/mat44_dispatch.h"
#include "math/mConstants.h"
#include "math/mMatrix.h"
#include <cmath> // for sqrtf, etc.
namespace math_backend::float4::dispatch
@ -375,6 +376,88 @@ namespace math_backend::mat44::dispatch
mresult[15]= a[12]*b[3]+ a[13]*b[7]+ a[14]*b[11]+ a[15]*b[15];
};
// Scalar fallback for transform_plane: transforms plane `p` by the matrix
// `m` combined with per-axis scale `s`, writing the resulting plane
// (normalized normal + distance) to `presult`.
gMat44.transform_plane = [](const F32* m, const F32* s, const F32* p, F32* presult) {
// We take in a matrix, a scale factor, and a plane equation. We want to output
// the resultant normal
// We have T = m*s
// To multiply the normal, we want Inv(Tr(m*s))
// Inv(Tr(ms)) = Inv(Tr(s) * Tr(m))
// = Inv(Tr(m)) * Inv(Tr(s))
//
// Inv(Tr(s)) = Inv(s) = [ 1/x 0 0 0]
// [ 0 1/y 0 0]
// [ 0 0 1/z 0]
// [ 0 0 0 1]
//
// Since m is an affine matrix,
// Tr(m) = [ [ ] 0 ]
// [ [ R ] 0 ]
// [ [ ] 0 ]
// [ [ x y z ] 1 ]
//
// Inv(Tr(m)) = [ [ -1 ] 0 ]
// [ [ R ] 0 ]
// [ [ ] 0 ]
// [ [ A B C ] 1 ]
// Where:
//
// P = (x, y, z)
// A = -(Row(0, r) * P);
// B = -(Row(1, r) * P);
// C = -(Row(2, r) * P);
// Build Inv(s): diagonal reciprocal-scale matrix (ctor arg `true`
// presumably means identity — TODO confirm).
// NOTE(review): divides by zero if any scale component is 0.
MatrixF invScale(true);
F32* pScaleElems = invScale;
pScaleElems[MatrixF::idx(0, 0)] = 1.0f / s[0];
pScaleElems[MatrixF::idx(1, 1)] = 1.0f / s[1];
pScaleElems[MatrixF::idx(2, 2)] = 1.0f / s[2];
const Point3F shear(m[MatrixF::idx(3, 0)], m[MatrixF::idx(3, 1)], m[MatrixF::idx(3, 2)]);
const Point3F row0(m[MatrixF::idx(0, 0)], m[MatrixF::idx(0, 1)], m[MatrixF::idx(0, 2)]);
const Point3F row1(m[MatrixF::idx(1, 0)], m[MatrixF::idx(1, 1)], m[MatrixF::idx(1, 2)]);
const Point3F row2(m[MatrixF::idx(2, 0)], m[MatrixF::idx(2, 1)], m[MatrixF::idx(2, 2)]);
const F32 A = -mDot(row0, shear);
const F32 B = -mDot(row1, shear);
const F32 C = -mDot(row2, shear);
// Assemble Inv(Tr(m)): transposed rotation block plus the A/B/C column.
MatrixF invTrMatrix(true);
F32* destMat = invTrMatrix;
destMat[MatrixF::idx(0, 0)] = m[MatrixF::idx(0, 0)];
destMat[MatrixF::idx(1, 0)] = m[MatrixF::idx(1, 0)];
destMat[MatrixF::idx(2, 0)] = m[MatrixF::idx(2, 0)];
destMat[MatrixF::idx(0, 1)] = m[MatrixF::idx(0, 1)];
destMat[MatrixF::idx(1, 1)] = m[MatrixF::idx(1, 1)];
destMat[MatrixF::idx(2, 1)] = m[MatrixF::idx(2, 1)];
destMat[MatrixF::idx(0, 2)] = m[MatrixF::idx(0, 2)];
destMat[MatrixF::idx(1, 2)] = m[MatrixF::idx(1, 2)];
destMat[MatrixF::idx(2, 2)] = m[MatrixF::idx(2, 2)];
destMat[MatrixF::idx(0, 3)] = A;
destMat[MatrixF::idx(1, 3)] = B;
destMat[MatrixF::idx(2, 3)] = C;
invTrMatrix.mul(invScale);
Point3F norm(p[0], p[1], p[2]);
// A point on the plane: n * -d (plane stored as n.x, n.y, n.z, d).
Point3F point = norm * -p[3];
// NOTE(review): mulP applies the full affine transform — including the
// A/B/C translation column built above — to the normal; the SIMD
// transform_plane impl uses only the rotation rows for the normal.
// Confirm which behavior is intended so the backends agree.
invTrMatrix.mulP(norm);
norm.normalize();
// Local copy of the raw matrix so the member mulP can be used
// (assumes MatrixF is exactly 16 packed F32s — TODO confirm).
MatrixF temp;
dMemcpy(temp, m, sizeof(F32) * 16);
point.x *= s[0];
point.y *= s[1];
point.z *= s[2];
temp.mulP(point);
// Rebuild the plane from the transformed point and normal
// (PlaneF presumably computes d = -(n . p)).
PlaneF resultPlane(point, norm);
presult[0] = resultPlane.x;
presult[1] = resultPlane.y;
presult[2] = resultPlane.z;
presult[3] = resultPlane.d;
};
gMat44.normalize = [](float* a) {
F32 col0[3], col1[3], col2[3];
// extract columns 0 and 1
@ -404,5 +487,14 @@ namespace math_backend::mat44::dispatch
a[10] = col2[2];
};
// Scalar reference implementation: transform each packed xyz point
// through the already-installed mul_pos3 entry, one point at a time.
gMat44.batch_mul_pos3 = [](const float* m, const float* pts, size_t count, float* result_ptrs) {
    for (size_t point = 0; point < count; ++point)
    {
        const size_t base = point * 3;
        gMat44.mul_pos3(m, pts + base, result_ptrs + base);
    }
};
}
}

View file

@ -57,7 +57,7 @@ namespace
// Shuffle helpers
//------------------------------------------------------
inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, const int x, const int y, const int z, const int w)
inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w)
{
f32x4 lo = v1;
f32x4 hi = v2;
@ -160,13 +160,13 @@ namespace
// full dot4
inline f32x4 v_dot4(f32x4 a, f32x4 b)
{
return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into lane 1
return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into all lanes
}
// dot3 (ignores w)
inline f32x4 v_dot3(f32x4 a, f32x4 b)
{
return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into lane 1
return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into all lanes
}
// cross product xyz only.
@ -182,7 +182,7 @@ namespace
inline f32x4 v_normalize3(f32x4 v)
{
const f32x4 zero = _mm_setzero_ps();
const f32x4 zero = _mm_setzero_ps();
const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0}
f32x4 dot = v_dot3(v, v);
@ -405,13 +405,6 @@ namespace
f32x8 w;
};
struct vec3_batch8
{
f32x8 x;
f32x8 y;
f32x8 z;
};
struct vec4_batch4
{
f32x4 x;
@ -420,47 +413,37 @@ namespace
f32x4 w;
};
struct vec3_batch4
inline vec4_batch8 load_vec3_batch8(const float* ptr, float w, bool fillW)
{
f32x4 x;
f32x4 y;
f32x4 z;
};
vec4_batch8 r;
inline vec3_batch8 load_vec3_batch8(const float* ptr)
{
// ptr points to x0
// layout: x y z x y z ...
r.x = _mm256_set_ps(ptr[0], ptr[3], ptr[6], ptr[9],
ptr[12], ptr[15], ptr[18], ptr[21]);
r.y = _mm256_set_ps(ptr[1], ptr[4], ptr[7], ptr[10],
ptr[13], ptr[16], ptr[19], ptr[22]);
r.z = _mm256_set_ps(ptr[2], ptr[5], ptr[8], ptr[11],
ptr[14], ptr[17], ptr[20], ptr[23]);
vec3_batch8 r;
r.x = _mm256_set_ps(
ptr[21], ptr[18], ptr[15], ptr[12],
ptr[9], ptr[6], ptr[3], ptr[0]);
r.y = _mm256_set_ps(
ptr[22], ptr[19], ptr[16], ptr[13],
ptr[10], ptr[7], ptr[4], ptr[1]);
r.z = _mm256_set_ps(
ptr[23], ptr[20], ptr[17], ptr[14],
ptr[11], ptr[8], ptr[5], ptr[2]);
if (fillW)
{
r.w = _mm256_set1_ps(w);
}
return r;
}
inline vec3_batch4 load_vec3_batch4(const float* ptr)
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
vec3_batch4 r;
vec4_batch4 r;
r.x = _mm_set_ps(
ptr[9], ptr[6], ptr[3], ptr[0]);
r.x = _mm_set_ps(ptr[0], ptr[3], ptr[6], ptr[9]);
r.y = _mm_set_ps(ptr[1], ptr[4], ptr[7], ptr[10]);
r.z = _mm_set_ps(ptr[2], ptr[5], ptr[8], ptr[11]);
r.y = _mm_set_ps(
ptr[10], ptr[7], ptr[4], ptr[1]);
r.z = _mm_set_ps(
ptr[11], ptr[8], ptr[5], ptr[2]);
if (fillW)
{
r.w = _mm_set1_ps(w);
}
return r;
}
@ -478,7 +461,7 @@ namespace
}
// Store the result back to a float3 array with size of 8
inline void store_vec3_batch8(float* out, const vec3_batch8& v)
inline void store_vec3_batch8(float* out, const vec4_batch8& v)
{
alignas(32) float xs[8];
alignas(32) float ys[8];
@ -497,7 +480,7 @@ namespace
}
// Store the result back to a float3 array with size of 4
inline void store_vec3_batch4(float* out, const vec3_batch4& v)
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
{
alignas(16) float xs[8];
alignas(16) float ys[8];
@ -515,7 +498,7 @@ namespace
}
}
inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b)
inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b)
{
f32x4 mulx = _mm_mul_ps(a.x, b.x);
f32x4 muly = _mm_mul_ps(a.y, b.y);
@ -524,7 +507,7 @@ namespace
return _mm_add_ps(_mm_add_ps(mulx, muly), mulz);
}
inline f32x8 vec3_batch8_dot(const vec3_batch8& a, const vec3_batch8& b)
inline f32x8 vec3_batch8_dot(const vec4_batch8& a, const vec4_batch8& b)
{
f32x8 mulx = _mm256_mul_ps(a.x, b.x);
f32x8 muly = _mm256_mul_ps(a.y, b.y);
@ -533,4 +516,110 @@ namespace
return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz);
}
// Batch-8 position transform (SoA): r.xyz = M * (x, y, z, 1) for 8 points.
// The w lanes of the result are intentionally left unset; the store
// helpers only consume x/y/z.
inline vec4_batch8 m_mul_pos3_batch8(const f32x4x4& m, const vec4_batch8& v)
{
    // Spill the matrix rows to scalars so individual elements can be
    // broadcast portably. The previous `m.r0.m128_f32[i]` indexing is an
    // MSVC-only union member of __m128 and does not compile on GCC/Clang.
    alignas(16) float row0[4];
    alignas(16) float row1[4];
    alignas(16) float row2[4];
    _mm_store_ps(row0, m.r0);
    _mm_store_ps(row1, m.r1);
    _mm_store_ps(row2, m.r2);
    const __m256 m00 = _mm256_set1_ps(row0[0]);
    const __m256 m01 = _mm256_set1_ps(row0[1]);
    const __m256 m02 = _mm256_set1_ps(row0[2]);
    const __m256 m03 = _mm256_set1_ps(row0[3]);
    const __m256 m10 = _mm256_set1_ps(row1[0]);
    const __m256 m11 = _mm256_set1_ps(row1[1]);
    const __m256 m12 = _mm256_set1_ps(row1[2]);
    const __m256 m13 = _mm256_set1_ps(row1[3]);
    const __m256 m20 = _mm256_set1_ps(row2[0]);
    const __m256 m21 = _mm256_set1_ps(row2[1]);
    const __m256 m22 = _mm256_set1_ps(row2[2]);
    const __m256 m23 = _mm256_set1_ps(row2[3]);
    vec4_batch8 r;
    // x' = m00*x + m01*y + m02*z + m03   (implicit w = 1)
    r.x = _mm256_add_ps(
        _mm256_add_ps(_mm256_mul_ps(v.x, m00), _mm256_mul_ps(v.y, m01)),
        _mm256_add_ps(_mm256_mul_ps(v.z, m02), m03));
    // y' = m10*x + m11*y + m12*z + m13
    r.y = _mm256_add_ps(
        _mm256_add_ps(_mm256_mul_ps(v.x, m10), _mm256_mul_ps(v.y, m11)),
        _mm256_add_ps(_mm256_mul_ps(v.z, m12), m13));
    // z' = m20*x + m21*y + m22*z + m23
    r.z = _mm256_add_ps(
        _mm256_add_ps(_mm256_mul_ps(v.x, m20), _mm256_mul_ps(v.y, m21)),
        _mm256_add_ps(_mm256_mul_ps(v.z, m22), m23));
    return r;
}
// Batch-4 position transform (SoA): r.xyz = M * (x, y, z, 1) for 4 points.
// The w lane of the result is intentionally left unset; the store helper
// only consumes x/y/z.
inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v)
{
    // Spill the matrix rows to scalars so individual elements can be
    // broadcast portably. The previous `m.r0.m128_f32[i]` indexing is an
    // MSVC-only union member of __m128 and does not compile on GCC/Clang.
    alignas(16) float row0[4];
    alignas(16) float row1[4];
    alignas(16) float row2[4];
    _mm_store_ps(row0, m.r0);
    _mm_store_ps(row1, m.r1);
    _mm_store_ps(row2, m.r2);
    const f32x4 m00 = _mm_set1_ps(row0[0]);
    const f32x4 m01 = _mm_set1_ps(row0[1]);
    const f32x4 m02 = _mm_set1_ps(row0[2]);
    const f32x4 m03 = _mm_set1_ps(row0[3]);
    const f32x4 m10 = _mm_set1_ps(row1[0]);
    const f32x4 m11 = _mm_set1_ps(row1[1]);
    const f32x4 m12 = _mm_set1_ps(row1[2]);
    const f32x4 m13 = _mm_set1_ps(row1[3]);
    const f32x4 m20 = _mm_set1_ps(row2[0]);
    const f32x4 m21 = _mm_set1_ps(row2[1]);
    const f32x4 m22 = _mm_set1_ps(row2[2]);
    const f32x4 m23 = _mm_set1_ps(row2[3]);
    vec4_batch4 r;
    // x' = m00*x + m01*y + m02*z + m03   (implicit w = 1)
    r.x = _mm_add_ps(
        _mm_add_ps(_mm_mul_ps(v.x, m00), _mm_mul_ps(v.y, m01)),
        _mm_add_ps(_mm_mul_ps(v.z, m02), m03));
    // y' = m10*x + m11*y + m12*z + m13
    r.y = _mm_add_ps(
        _mm_add_ps(_mm_mul_ps(v.x, m10), _mm_mul_ps(v.y, m11)),
        _mm_add_ps(_mm_mul_ps(v.z, m12), m13));
    // z' = m20*x + m21*y + m22*z + m23
    r.z = _mm_add_ps(
        _mm_add_ps(_mm_mul_ps(v.x, m20), _mm_mul_ps(v.y, m21)),
        _mm_add_ps(_mm_mul_ps(v.z, m22), m23));
    return r;
}
}

View file

@ -15,9 +15,12 @@ namespace math_backend::mat44::dispatch
gMat44.mul_pos3 = mat44_mul_pos3_impl;
gMat44.mul_vec3 = mat44_mul_vec3_impl;
gMat44.mul_float4 = mat44_mul_float4_impl;
gMat44.transform_plane = mat44_transform_plane_impl;
gMat44.scale = mat44_scale_impl;
gMat44.get_scale = mat44_get_scale_impl;
gMat44.normalize = mat44_normalize_impl;
gMat44.determinant = mat44_get_determinant;
gMat44.batch_mul_pos3 = mat44_batch_mul_pos3;
}
}

View file

@ -405,13 +405,6 @@ namespace
f32x8 w;
};
struct vec3_batch8
{
f32x8 x;
f32x8 y;
f32x8 z;
};
struct vec4_batch4
{
f32x4 x;
@ -420,47 +413,37 @@ namespace
f32x4 w;
};
struct vec3_batch4
inline vec4_batch8 load_vec3_batch8(const float* ptr, float w, bool fillW)
{
f32x4 x;
f32x4 y;
f32x4 z;
};
vec4_batch8 r;
inline vec3_batch8 load_vec3_batch8(const float* ptr)
{
// ptr points to x0
// layout: x y z x y z ...
r.x = _mm256_set_ps(ptr[0], ptr[3], ptr[6], ptr[9],
ptr[12], ptr[15], ptr[18], ptr[21]);
r.y = _mm256_set_ps(ptr[1], ptr[4], ptr[7], ptr[10],
ptr[13], ptr[16], ptr[19], ptr[22]);
r.z = _mm256_set_ps(ptr[2], ptr[5], ptr[8], ptr[11],
ptr[14], ptr[17], ptr[20], ptr[23]);
vec3_batch8 r;
r.x = _mm256_set_ps(
ptr[21], ptr[18], ptr[15], ptr[12],
ptr[9], ptr[6], ptr[3], ptr[0]);
r.y = _mm256_set_ps(
ptr[22], ptr[19], ptr[16], ptr[13],
ptr[10], ptr[7], ptr[4], ptr[1]);
r.z = _mm256_set_ps(
ptr[23], ptr[20], ptr[17], ptr[14],
ptr[11], ptr[8], ptr[5], ptr[2]);
if (fillW)
{
r.w = _mm256_set1_ps(w);
}
return r;
}
inline vec3_batch4 load_vec3_batch4(const float* ptr)
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
vec3_batch4 r;
vec4_batch4 r;
r.x = _mm_set_ps(
ptr[9], ptr[6], ptr[3], ptr[0]);
r.x = _mm_set_ps(ptr[0], ptr[3], ptr[6], ptr[9]);
r.y = _mm_set_ps(ptr[1], ptr[4], ptr[7], ptr[10]);
r.z = _mm_set_ps(ptr[2], ptr[5], ptr[8], ptr[11]);
r.y = _mm_set_ps(
ptr[10], ptr[7], ptr[4], ptr[1]);
r.z = _mm_set_ps(
ptr[11], ptr[8], ptr[5], ptr[2]);
if (fillW)
{
r.w = _mm_set1_ps(w);
}
return r;
}
@ -478,7 +461,7 @@ namespace
}
// Store the result back to a float3 array with size of 8
inline void store_vec3_batch8(float* out, const vec3_batch8& v)
inline void store_vec3_batch8(float* out, const vec4_batch8& v)
{
alignas(32) float xs[8];
alignas(32) float ys[8];
@ -497,7 +480,7 @@ namespace
}
// Store the result back to a float3 array with size of 4
inline void store_vec3_batch4(float* out, const vec3_batch4& v)
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
{
alignas(16) float xs[8];
alignas(16) float ys[8];
@ -515,7 +498,7 @@ namespace
}
}
inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b)
inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b)
{
f32x4 mulx = _mm_mul_ps(a.x, b.x);
f32x4 muly = _mm_mul_ps(a.y, b.y);
@ -524,7 +507,7 @@ namespace
return _mm_add_ps(_mm_add_ps(mulx, muly), mulz);
}
inline f32x8 vec3_batch8_dot(const vec3_batch8& a, const vec3_batch8& b)
inline f32x8 vec3_batch8_dot(const vec4_batch8& a, const vec4_batch8& b)
{
f32x8 mulx = _mm256_mul_ps(a.x, b.x);
f32x8 muly = _mm256_mul_ps(a.y, b.y);
@ -533,4 +516,110 @@ namespace
return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz);
}
// Batch-8 position transform (SoA): r.xyz = M * (x, y, z, 1) for 8 points.
// The w lanes of the result are intentionally left unset; the store
// helpers only consume x/y/z.
inline vec4_batch8 m_mul_pos3_batch8(const f32x4x4& m, const vec4_batch8& v)
{
    // Spill the matrix rows to scalars so individual elements can be
    // broadcast portably. The previous `m.r0.m128_f32[i]` indexing is an
    // MSVC-only union member of __m128 and does not compile on GCC/Clang.
    alignas(16) float row0[4];
    alignas(16) float row1[4];
    alignas(16) float row2[4];
    _mm_store_ps(row0, m.r0);
    _mm_store_ps(row1, m.r1);
    _mm_store_ps(row2, m.r2);
    const __m256 m00 = _mm256_set1_ps(row0[0]);
    const __m256 m01 = _mm256_set1_ps(row0[1]);
    const __m256 m02 = _mm256_set1_ps(row0[2]);
    const __m256 m03 = _mm256_set1_ps(row0[3]);
    const __m256 m10 = _mm256_set1_ps(row1[0]);
    const __m256 m11 = _mm256_set1_ps(row1[1]);
    const __m256 m12 = _mm256_set1_ps(row1[2]);
    const __m256 m13 = _mm256_set1_ps(row1[3]);
    const __m256 m20 = _mm256_set1_ps(row2[0]);
    const __m256 m21 = _mm256_set1_ps(row2[1]);
    const __m256 m22 = _mm256_set1_ps(row2[2]);
    const __m256 m23 = _mm256_set1_ps(row2[3]);
    vec4_batch8 r;
    // x' = m00*x + m01*y + m02*z + m03   (implicit w = 1)
    r.x = _mm256_add_ps(
        _mm256_add_ps(_mm256_mul_ps(v.x, m00), _mm256_mul_ps(v.y, m01)),
        _mm256_add_ps(_mm256_mul_ps(v.z, m02), m03));
    // y' = m10*x + m11*y + m12*z + m13
    r.y = _mm256_add_ps(
        _mm256_add_ps(_mm256_mul_ps(v.x, m10), _mm256_mul_ps(v.y, m11)),
        _mm256_add_ps(_mm256_mul_ps(v.z, m12), m13));
    // z' = m20*x + m21*y + m22*z + m23
    r.z = _mm256_add_ps(
        _mm256_add_ps(_mm256_mul_ps(v.x, m20), _mm256_mul_ps(v.y, m21)),
        _mm256_add_ps(_mm256_mul_ps(v.z, m22), m23));
    return r;
}
// Batch-4 position transform (SoA): r.xyz = M * (x, y, z, 1) for 4 points.
// The w lane of the result is intentionally left unset; the store helper
// only consumes x/y/z.
inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v)
{
    // Spill the matrix rows to scalars so individual elements can be
    // broadcast portably. The previous `m.r0.m128_f32[i]` indexing is an
    // MSVC-only union member of __m128 and does not compile on GCC/Clang.
    alignas(16) float row0[4];
    alignas(16) float row1[4];
    alignas(16) float row2[4];
    _mm_store_ps(row0, m.r0);
    _mm_store_ps(row1, m.r1);
    _mm_store_ps(row2, m.r2);
    const f32x4 m00 = _mm_set1_ps(row0[0]);
    const f32x4 m01 = _mm_set1_ps(row0[1]);
    const f32x4 m02 = _mm_set1_ps(row0[2]);
    const f32x4 m03 = _mm_set1_ps(row0[3]);
    const f32x4 m10 = _mm_set1_ps(row1[0]);
    const f32x4 m11 = _mm_set1_ps(row1[1]);
    const f32x4 m12 = _mm_set1_ps(row1[2]);
    const f32x4 m13 = _mm_set1_ps(row1[3]);
    const f32x4 m20 = _mm_set1_ps(row2[0]);
    const f32x4 m21 = _mm_set1_ps(row2[1]);
    const f32x4 m22 = _mm_set1_ps(row2[2]);
    const f32x4 m23 = _mm_set1_ps(row2[3]);
    vec4_batch4 r;
    // x' = m00*x + m01*y + m02*z + m03   (implicit w = 1)
    r.x = _mm_add_ps(
        _mm_add_ps(_mm_mul_ps(v.x, m00), _mm_mul_ps(v.y, m01)),
        _mm_add_ps(_mm_mul_ps(v.z, m02), m03));
    // y' = m10*x + m11*y + m12*z + m13
    r.y = _mm_add_ps(
        _mm_add_ps(_mm_mul_ps(v.x, m10), _mm_mul_ps(v.y, m11)),
        _mm_add_ps(_mm_mul_ps(v.z, m12), m13));
    // z' = m20*x + m21*y + m22*z + m23
    r.z = _mm_add_ps(
        _mm_add_ps(_mm_mul_ps(v.x, m20), _mm_mul_ps(v.y, m21)),
        _mm_add_ps(_mm_mul_ps(v.z, m22), m23));
    return r;
}
}

View file

@ -15,9 +15,11 @@ namespace math_backend::mat44::dispatch
gMat44.mul_pos3 = mat44_mul_pos3_impl;
gMat44.mul_vec3 = mat44_mul_vec3_impl;
gMat44.mul_float4 = mat44_mul_float4_impl;
gMat44.transform_plane = mat44_transform_plane_impl;
gMat44.scale = mat44_scale_impl;
gMat44.get_scale = mat44_get_scale_impl;
gMat44.normalize = mat44_normalize_impl;
gMat44.determinant = mat44_get_determinant;
gMat44.batch_mul_pos3 = mat44_batch_mul_pos3;
}
}

View file

@ -410,4 +410,95 @@ namespace
return m_transpose(adj);
}
struct vec4_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
f32x4 w;
};
// Deinterleave 4 packed xyz points into SoA registers; optionally fill w.
// vld3q_f32 places point i into lane i of each register, which matches
// store_vec3_batch4 (it writes lane i back to point i).
// BUG FIX: the previous brace initializer listed values high-lane-first
// (the _mm_set_ps convention), but NEON/GCC vector literals fill lanes
// low-to-high — so the four points came out in reversed order.
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
    const float32x4x3_t xyz = vld3q_f32(ptr);
    vec4_batch4 r;
    r.x = xyz.val[0];
    r.y = xyz.val[1];
    r.z = xyz.val[2];
    if (fillW)
    {
        r.w = vdupq_n_f32(w);
    }
    return r;
}
// Store 4 SoA points back to a packed xyz array (12 floats).
// Uses NEON's native interleaving store (vst3q_f32) instead of spilling
// each register to the stack and scattering with a scalar loop; lane i of
// each register becomes point i, same ordering as before.
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
{
    float32x4x3_t xyz;
    xyz.val[0] = v.x;
    xyz.val[1] = v.y;
    xyz.val[2] = v.z;
    vst3q_f32(out, xyz);
}
// Batch-4 position transform (SoA, NEON): r.xyz = M * (x, y, z, 1).
// The w lane of the result is intentionally left unset; the store helper
// only consumes x/y/z.
inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v)
{
    // Broadcast matrix elements with vgetq_lane_f32 + vdupq_n_f32.
    // BUG FIX: the previous `m.r0.m128_f32[i]` indexing is an MSVC/SSE
    // union member that does not exist on NEON's float32x4_t at all.
    const float32x4_t m00 = vdupq_n_f32(vgetq_lane_f32(m.r0, 0));
    const float32x4_t m01 = vdupq_n_f32(vgetq_lane_f32(m.r0, 1));
    const float32x4_t m02 = vdupq_n_f32(vgetq_lane_f32(m.r0, 2));
    const float32x4_t m03 = vdupq_n_f32(vgetq_lane_f32(m.r0, 3));
    const float32x4_t m10 = vdupq_n_f32(vgetq_lane_f32(m.r1, 0));
    const float32x4_t m11 = vdupq_n_f32(vgetq_lane_f32(m.r1, 1));
    const float32x4_t m12 = vdupq_n_f32(vgetq_lane_f32(m.r1, 2));
    const float32x4_t m13 = vdupq_n_f32(vgetq_lane_f32(m.r1, 3));
    const float32x4_t m20 = vdupq_n_f32(vgetq_lane_f32(m.r2, 0));
    const float32x4_t m21 = vdupq_n_f32(vgetq_lane_f32(m.r2, 1));
    const float32x4_t m22 = vdupq_n_f32(vgetq_lane_f32(m.r2, 2));
    const float32x4_t m23 = vdupq_n_f32(vgetq_lane_f32(m.r2, 3));
    vec4_batch4 r;
    // x' = m00*x + m01*y + m02*z + m03   (implicit w = 1)
    r.x = vaddq_f32(
        vaddq_f32(vmulq_f32(v.x, m00), vmulq_f32(v.y, m01)),
        vaddq_f32(vmulq_f32(v.z, m02), m03));
    // y' = m10*x + m11*y + m12*z + m13
    r.y = vaddq_f32(
        vaddq_f32(vmulq_f32(v.x, m10), vmulq_f32(v.y, m11)),
        vaddq_f32(vmulq_f32(v.z, m12), m13));
    // z' = m20*x + m21*y + m22*z + m23
    r.z = vaddq_f32(
        vaddq_f32(vmulq_f32(v.x, m20), vmulq_f32(v.y, m21)),
        vaddq_f32(vmulq_f32(v.z, m22), m23));
    return r;
}
}

View file

@ -15,9 +15,11 @@ namespace math_backend::mat44::dispatch
gMat44.mul_pos3 = mat44_mul_pos3_impl;
gMat44.mul_vec3 = mat44_mul_vec3_impl;
gMat44.mul_float4 = mat44_mul_float4_impl;
gMat44.transform_plane = mat44_transform_plane_impl;
gMat44.scale = mat44_scale_impl;
gMat44.get_scale = mat44_get_scale_impl;
gMat44.normalize = mat44_normalize_impl;
gMat44.determinant = mat44_get_determinant;
gMat44.batch_mul_pos3 = mat44_batch_mul_pos3;
}
}

View file

@ -544,17 +544,10 @@ namespace
f32x4 w;
};
struct vec3_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
};
inline vec3_batch4 load_vec3_batch4(const float* ptr)
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
vec3_batch4 r;
vec4_batch4 r;
r.x = _mm_set_ps(
ptr[9], ptr[6], ptr[3], ptr[0]);
@ -565,6 +558,11 @@ namespace
r.z = _mm_set_ps(
ptr[11], ptr[8], ptr[5], ptr[2]);
if (fillW)
{
r.w = _mm_set1_ps(w);
}
return r;
}
@ -575,7 +573,7 @@ namespace
}
// Store the result back to a float3 array with size of 4
inline void store_vec3_batch4(float* out, const vec3_batch4& v)
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
{
alignas(16) float xs[8];
alignas(16) float ys[8];
@ -593,7 +591,7 @@ namespace
}
}
inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b)
inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b)
{
f32x4 mulx = _mm_mul_ps(a.x, b.x);
f32x4 muly = _mm_mul_ps(a.y, b.y);
@ -601,4 +599,57 @@ namespace
return _mm_add_ps(_mm_add_ps(mulx, muly), mulz);
}
// Batch-4 position transform (SoA): r.xyz = M * (x, y, z, 1) for 4 points.
// The w lane of the result is intentionally left unset; the store helper
// only consumes x/y/z.
inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v)
{
    // Spill the matrix rows to scalars so individual elements can be
    // broadcast portably. The previous `m.r0.m128_f32[i]` indexing is an
    // MSVC-only union member of __m128 and does not compile on GCC/Clang.
    alignas(16) float row0[4];
    alignas(16) float row1[4];
    alignas(16) float row2[4];
    _mm_store_ps(row0, m.r0);
    _mm_store_ps(row1, m.r1);
    _mm_store_ps(row2, m.r2);
    const f32x4 m00 = _mm_set1_ps(row0[0]);
    const f32x4 m01 = _mm_set1_ps(row0[1]);
    const f32x4 m02 = _mm_set1_ps(row0[2]);
    const f32x4 m03 = _mm_set1_ps(row0[3]);
    const f32x4 m10 = _mm_set1_ps(row1[0]);
    const f32x4 m11 = _mm_set1_ps(row1[1]);
    const f32x4 m12 = _mm_set1_ps(row1[2]);
    const f32x4 m13 = _mm_set1_ps(row1[3]);
    const f32x4 m20 = _mm_set1_ps(row2[0]);
    const f32x4 m21 = _mm_set1_ps(row2[1]);
    const f32x4 m22 = _mm_set1_ps(row2[2]);
    const f32x4 m23 = _mm_set1_ps(row2[3]);
    vec4_batch4 r;
    // x' = m00*x + m01*y + m02*z + m03   (implicit w = 1)
    r.x = _mm_add_ps(
        _mm_add_ps(_mm_mul_ps(v.x, m00), _mm_mul_ps(v.y, m01)),
        _mm_add_ps(_mm_mul_ps(v.z, m02), m03));
    // y' = m10*x + m11*y + m12*z + m13
    r.y = _mm_add_ps(
        _mm_add_ps(_mm_mul_ps(v.x, m10), _mm_mul_ps(v.y, m11)),
        _mm_add_ps(_mm_mul_ps(v.z, m12), m13));
    // z' = m20*x + m21*y + m22*z + m23
    r.z = _mm_add_ps(
        _mm_add_ps(_mm_mul_ps(v.x, m20), _mm_mul_ps(v.y, m21)),
        _mm_add_ps(_mm_mul_ps(v.z, m22), m23));
    return r;
}
}

View file

@ -15,9 +15,11 @@ namespace math_backend::mat44::dispatch
gMat44.mul_pos3 = mat44_mul_pos3_impl;
gMat44.mul_vec3 = mat44_mul_vec3_impl;
gMat44.mul_float4 = mat44_mul_float4_impl;
gMat44.transform_plane = mat44_transform_plane_impl;
gMat44.scale = mat44_scale_impl;
gMat44.get_scale = mat44_get_scale_impl;
gMat44.normalize = mat44_normalize_impl;
gMat44.determinant = mat44_get_determinant;
gMat44.batch_mul_pos3 = mat44_batch_mul_pos3;
}
}

View file

@ -514,17 +514,10 @@ namespace
f32x4 w;
};
struct vec3_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
};
inline vec3_batch4 load_vec3_batch4(const float* ptr)
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
vec3_batch4 r;
vec4_batch4 r;
r.x = _mm_set_ps(
ptr[9], ptr[6], ptr[3], ptr[0]);
@ -535,6 +528,11 @@ namespace
r.z = _mm_set_ps(
ptr[11], ptr[8], ptr[5], ptr[2]);
if (fillW)
{
r.w = _mm_set1_ps(w);
}
return r;
}
@ -545,7 +543,7 @@ namespace
}
// Store the result back to a float3 array with size of 4
inline void store_vec3_batch4(float* out, const vec3_batch4& v)
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
{
alignas(16) float xs[8];
alignas(16) float ys[8];
@ -563,7 +561,7 @@ namespace
}
}
inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b)
inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b)
{
f32x4 mulx = _mm_mul_ps(a.x, b.x);
f32x4 muly = _mm_mul_ps(a.y, b.y);
@ -571,4 +569,57 @@ namespace
return _mm_add_ps(_mm_add_ps(mulx, muly), mulz);
}
// Batch-4 position transform (SoA): r.xyz = M * (x, y, z, 1) for 4 points.
// The w lane of the result is intentionally left unset; the store helper
// only consumes x/y/z.
inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v)
{
    // Spill the matrix rows to scalars so individual elements can be
    // broadcast portably. The previous `m.r0.m128_f32[i]` indexing is an
    // MSVC-only union member of __m128 and does not compile on GCC/Clang.
    alignas(16) float row0[4];
    alignas(16) float row1[4];
    alignas(16) float row2[4];
    _mm_store_ps(row0, m.r0);
    _mm_store_ps(row1, m.r1);
    _mm_store_ps(row2, m.r2);
    const f32x4 m00 = _mm_set1_ps(row0[0]);
    const f32x4 m01 = _mm_set1_ps(row0[1]);
    const f32x4 m02 = _mm_set1_ps(row0[2]);
    const f32x4 m03 = _mm_set1_ps(row0[3]);
    const f32x4 m10 = _mm_set1_ps(row1[0]);
    const f32x4 m11 = _mm_set1_ps(row1[1]);
    const f32x4 m12 = _mm_set1_ps(row1[2]);
    const f32x4 m13 = _mm_set1_ps(row1[3]);
    const f32x4 m20 = _mm_set1_ps(row2[0]);
    const f32x4 m21 = _mm_set1_ps(row2[1]);
    const f32x4 m22 = _mm_set1_ps(row2[2]);
    const f32x4 m23 = _mm_set1_ps(row2[3]);
    vec4_batch4 r;
    // x' = m00*x + m01*y + m02*z + m03   (implicit w = 1)
    r.x = _mm_add_ps(
        _mm_add_ps(_mm_mul_ps(v.x, m00), _mm_mul_ps(v.y, m01)),
        _mm_add_ps(_mm_mul_ps(v.z, m02), m03));
    // y' = m10*x + m11*y + m12*z + m13
    r.y = _mm_add_ps(
        _mm_add_ps(_mm_mul_ps(v.x, m10), _mm_mul_ps(v.y, m11)),
        _mm_add_ps(_mm_mul_ps(v.z, m12), m13));
    // z' = m20*x + m21*y + m22*z + m23
    r.z = _mm_add_ps(
        _mm_add_ps(_mm_mul_ps(v.x, m20), _mm_mul_ps(v.y, m21)),
        _mm_add_ps(_mm_mul_ps(v.z, m22), m23));
    return r;
}
}
}

View file

@ -231,6 +231,8 @@ public:
void mul( Point4F& p ) const; ///< M * p -> p (full [4x4] * [1x4])
void mulP( Point3F& p ) const; ///< M * p -> p (assume w = 1.0f)
void mulP( const Point3F &p, Point3F *d) const; ///< M * p -> d (assume w = 1.0f)
void batch_mulP(Point3F* points, size_t count) const;
void batch_mulP(const Point3F* points, size_t count, Point3F* out) const;
void mulV( VectorF& p ) const; ///< M * v -> v (assume w = 0.0f)
void mulV( const VectorF &p, Point3F *d) const; ///< M * v -> d (assume w = 0.0f)
@ -477,6 +479,40 @@ inline void MatrixF::mulP( const Point3F &p, Point3F *d) const
GetMat44().mul_pos3(*this, p, *d);
}
// In-place batch point transform: points[i] = M * points[i] (w assumed 1).
// NOTE(review): allocates an O(count) staging buffer on every call, and
// this is invoked per-frame from FrustumData::_update — consider a
// reusable scratch buffer, or confirm the backend tolerates in-place
// transforms (each SIMD batch is fully loaded before it is stored).
// NOTE(review): raw new[]/delete[] — leaks if batch_mul_pos3 throws.
// The reinterpret_casts assume Point3F is a tightly packed {x,y,z}
// float triple — TODO confirm.
inline void MatrixF::batch_mulP(Point3F* points, size_t count) const
{
// Allocate temporary buffer
Point3F* temp = new Point3F[count];
GetMat44().batch_mul_pos3(m, reinterpret_cast<const float*>(points), count, reinterpret_cast<float*>(temp));
// Copy the results back to out
for (size_t i = 0; i < count; ++i)
{
points[i] = temp[i];
}
// Free temporary buffer
delete[] temp;
}
// Batch point transform: out[i] = M * points[i] (w assumed 1).
// The dispatch backend already writes into a caller-supplied buffer, so
// the previous heap-allocated staging copy (new[] + element-wise copy +
// delete[]) was pure overhead — and leaked if the dispatch call threw.
// `points` and `out` must not partially overlap (use the in-place
// overload to transform a buffer onto itself).
// NOTE(review): the reinterpret_casts assume Point3F is a tightly packed
// {x,y,z} float triple — TODO confirm.
inline void MatrixF::batch_mulP(const Point3F* points, size_t count, Point3F* out) const
{
   GetMat44().batch_mul_pos3(m, reinterpret_cast<const float*>(points), count, reinterpret_cast<float*>(out));
}
inline void MatrixF::mulV( VectorF& v) const
{
// M * v -> v
@ -663,7 +699,9 @@ inline bool MatrixF::isNaN()
// Transform `plane` by `mat` combined with per-axis `scale`, writing the
// resulting plane to `result`. Routes through the runtime-dispatched
// mat44 backend (SIMD when available), replacing the legacy
// m_matF_x_scale_x_planeF path — the dead commented-out call has been
// removed.
inline void mTransformPlane(const MatrixF& mat, const Point3F& scale, const PlaneF& plane, PlaneF * result)
{
   GetMat44().transform_plane(mat, scale, plane, *result);
}
#else // !USE_TEMPLATE_MATRIX

View file

@ -16,9 +16,12 @@ namespace math_backend::mat44::dispatch
void (*mul_pos3)(const float* a, const float* b, float* r) = nullptr;
void (*mul_vec3)(const float* a, const float* b, float* r) = nullptr;
void (*mul_float4)(const float* a, const float* b, float* r) = nullptr;
void (*transform_plane)(const float* m, const float* s, const float* p, float* r) = nullptr;
float (*determinant)(const float*) = nullptr;
void (*scale)(float*, const float*) = nullptr;
void (*get_scale)(const float*, float*) = nullptr;
void (*batch_mul_pos3)(const float* m, const float* pts, size_t count, float* result_ptrs) = nullptr;
};
// Global dispatch table

View file

@ -381,8 +381,10 @@ void FrustumData::_update() const
// Transform the points into the desired culling space.
for( U32 i = 0; i < mPoints.size(); ++ i )
mTransform.mulP( mPoints[ i ] );
/*for( U32 i = 0; i < mPoints.size(); ++ i )
mTransform.mulP( mPoints[ i ] );*/
mTransform.batch_mulP(mPoints.address(), mPoints.size());
// Update the axis aligned bounding box from
// the newly transformed points.