Merge pull request #1680 from marauder2k9-torque/SIMD-Math-and-ISA-integration
Some checks failed
Linux Build / Ubuntu Latest GCC (push) Has been cancelled
MacOSX Build / MacOSX Latest Clang (push) Has been cancelled
Windows Build / Windows Latest MSVC (push) Has been cancelled

SIMD math and ISA integration
This commit is contained in:
Brian Roberts 2026-03-13 12:50:17 -05:00 committed by GitHub
commit 6700cfd993
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
49 changed files with 5232 additions and 212 deletions

View file

@@ -130,7 +130,7 @@ torqueAddSourceDirectories("windowManager" "windowManager/torque" "windowManager
torqueAddSourceDirectories("scene" "scene/culling" "scene/zones" "scene/mixin")
# Handle math
torqueAddSourceDirectories("math" "math/util")
torqueAddSourceDirectories("math" "math/util" "math/public" "math/impl") # note impl must skip the .inl files, never use them in engine code.
# Handle persistence
set(TORQUE_INCLUDE_DIRECTORIES ${TORQUE_INCLUDE_DIRECTORIES} "persistence/rapidjson")
@@ -496,6 +496,37 @@ else()
set_target_properties(${TORQUE_APP_NAME} PROPERTIES LINK_FLAGS "-Wl,-rpath,./")
endif()
string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" ARCH)
set(IS_X86 FALSE)
set(IS_ARM FALSE)
if(ARCH MATCHES "x86_64|amd64|i[3-6]86")
set(IS_X86 TRUE)
endif()
if(ARCH MATCHES "arm64|aarch64")
set(IS_ARM TRUE)
endif()
# The scalar backend is always available
add_math_backend(scalar MATH_SIMD_SCALAR)
message(STATUS "Processor: ${CMAKE_SYSTEM_PROCESSOR}")
message(STATUS "IS_X86=${IS_X86}")
message(STATUS "IS_ARM=${IS_ARM}")
# x86 family
if(IS_X86)
add_math_backend(sse2 MATH_SIMD_SSE2)
add_math_backend(sse41 MATH_SIMD_SSE41)
add_math_backend(avx MATH_SIMD_AVX)
add_math_backend(avx2 MATH_SIMD_AVX2)
endif()
# ARM family
if(IS_ARM)
add_math_backend(neon MATH_SIMD_NEON)
endif()
if(MSVC)
# Match projectGenerator naming for executables
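The backend list above pairs each ISA with a MATH_SIMD_* define. Judging from the per-ISA source files later in this diff, each add_math_backend target presumably compiles the shared, ISA-agnostic .inl bodies against a matching intrinsics header (the macro body itself is not part of this excerpt); a minimal sketch of that translation-unit pattern:

// built with AVX flags and -DMATH_SIMD_AVX (names as used later in this diff)
#include "avx_intrinsics.h" // supplies the v_* / m_* primitives for this ISA
#include "float3_impl.inl" // ISA-agnostic bodies compiled against those primitives
// an install_avx() function then points the gFloat3 table at these symbols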

View file

@@ -3406,7 +3406,7 @@ MatrixF MeshRoad::getNodeTransform( U32 idx )
mat.setColumn( 2, node.normal );
mat.setColumn( 3, node.point );
AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!");
AssertFatal(mat.determinant() != 0.0f, "no inverse!");
return mat;
}
@@ -3456,7 +3456,7 @@ void MeshRoad::calcSliceTransform( U32 idx, MatrixF &mat )
mat.setColumn( 2, slice.normal );
mat.setColumn( 3, slice.p1 );
AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!");
AssertFatal(mat.determinant() != 0.0f, "no inverse!");
}
F32 MeshRoad::getRoadLength() const

View file

@@ -2139,7 +2139,7 @@ MatrixF River::getNodeTransform( U32 idx ) const
mat.setColumn( 2, node.normal );
mat.setColumn( 3, node.point );
AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!");
AssertFatal( mat.determinant() != 0.0f, "no inverse!");
return mat;
}

View file

@@ -0,0 +1,122 @@
#pragma once
#include <cmath> // for sqrtf, etc.
#include "../mConstants.h"
// float3 backend: 3-component vectors loaded safely into 4 SIMD lanes (w padded)
namespace math_backend::float3
{
//----------------------------------------------------------
// Add two float3 vectors: r = a + b
inline void float3_add_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load3_vec(a);
f32x4 vb = v_load3_vec(b);
f32x4 vr = v_add(va, vb);
v_store3(r, vr);
}
// Subtract: r = a - b
inline void float3_sub_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load3_vec(a);
f32x4 vb = v_load3_vec(b);
f32x4 vr = v_sub(va, vb);
v_store3(r, vr);
}
// Multiply element-wise: r = a * b
inline void float3_mul_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load3_vec(a);
f32x4 vb = v_load3_vec(b);
f32x4 vr = v_mul(va, vb);
v_store3(r, vr);
}
// Multiply by scalar: r = a * s
inline void float3_mul_scalar_impl(const float* a, float s, float* r)
{
f32x4 va = v_load3_vec(a);
f32x4 vs = v_set1(s);
f32x4 vr = v_mul(va, vs);
v_store3(r, vr);
}
// Divide element-wise: r = a / b
inline void float3_div_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load3_vec(a);
f32x4 vb = v_load3_vec(b);
f32x4 vr = v_div(va, vb);
v_store3(r, vr);
}
// Divide by scalar: r = a / s
inline void float3_div_scalar_impl(const float* a, float s, float* r)
{
f32x4 va = v_load3_vec(a);
f32x4 vs = v_set1(s);
f32x4 vr = v_div(va, vs);
v_store3(r, vr);
}
// Dot product: returns scalar
inline float float3_dot_impl(const float* a, const float* b)
{
f32x4 va = v_load3_vec(a);
f32x4 vb = v_load3_vec(b);
f32x4 vdot = v_dot3(va, vb);
return v_extract0(vdot); // first lane is the sum of 3 elements
}
// Length squared
inline float float3_length_squared_impl(const float* a)
{
return float3_dot_impl(a, a);
}
// Length
inline float float3_length_impl(const float* a)
{
return std::sqrt(float3_length_squared_impl(a));
}
// Normalize in-place
inline void float3_normalize_impl(float* a)
{
f32x4 va = v_load3_vec(a);
f32x4 vr = v_normalize3(va);
v_store3(a, vr);
}
// Normalize in-place to magnitude r: a = normalize(a) * r
inline void float3_normalize_mag_impl(float* a, float r)
{
f32x4 va = v_load3_vec(a);
// invLen = r / sqrt(dot(a,a)) = r * rsqrt(dot(a,a))
f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot3(va, va)));
f32x4 vnorm = v_mul(va, invLen);
v_store3(a, vnorm);
}
// Linear interpolation: r = from + (to - from) * f
inline void float3_lerp_impl(const float* from, const float* to, float f, float* r)
{
f32x4 vfrom = v_load3_vec(from);
f32x4 vto = v_load3_vec(to);
f32x4 vf = v_set1(f);
f32x4 vr = v_add(vfrom, v_mul(vf, v_sub(vto, vfrom)));
v_store3(r, vr);
}
inline void float3_cross_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load3_vec(a);
f32x4 vb = v_load3_vec(b);
f32x4 vcross = v_cross(va, vb);
v_store3(r, vcross);
}
}
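For orientation, a small usage sketch of this layer, calling the impls directly (engine code is expected to reach them through the dispatch tables instead):

float a[3] = { 1.0f, 2.0f, 3.0f };
float b[3] = { 4.0f, 5.0f, 6.0f };
float r[3];
math_backend::float3::float3_add_impl(a, b, r); // r = {5, 7, 9}
float d = math_backend::float3::float3_dot_impl(a, b); // d = 4 + 10 + 18 = 32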

View file

@@ -0,0 +1,123 @@
#pragma once
#include <cmath> // for sqrtf, etc.
#include "../mConstants.h"
namespace math_backend::float4
{
//----------------------------------------------------------
// Add two float4 vectors: r = a + b
inline void float4_add_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load(a);
f32x4 vb = v_load(b);
f32x4 vr = v_add(va, vb);
v_store(r, vr);
}
// Subtract: r = a - b
inline void float4_sub_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load(a);
f32x4 vb = v_load(b);
f32x4 vr = v_sub(va, vb);
v_store(r, vr);
}
// Multiply element-wise: r = a * b
inline void float4_mul_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load(a);
f32x4 vb = v_load(b);
f32x4 vr = v_mul(va, vb);
v_store(r, vr);
}
// Multiply by scalar: r = a * s
inline void float4_mul_scalar_impl(const float* a, float s, float* r)
{
f32x4 va = v_load(a);
f32x4 vs = v_set1(s);
f32x4 vr = v_mul(va, vs);
v_store(r, vr);
}
// Divide element-wise: r = a / b
inline void float4_div_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load(a);
f32x4 vb = v_load(b);
f32x4 vr = v_div(va, vb);
v_store(r, vr);
}
// Divide by scalar: r = a / s
inline void float4_div_scalar_impl(const float* a, float s, float* r)
{
f32x4 va = v_load(a);
f32x4 vs = v_set1(s);
f32x4 vr = v_div(va, vs);
v_store(r, vr);
}
// Dot product: returns scalar
inline float float4_dot_impl(const float* a, const float* b)
{
f32x4 va = v_load(a);
f32x4 vb = v_load(b);
f32x4 vdot = v_dot4(va, vb); // calls ISA-specific implementation
return v_extract0(vdot);
}
// Length squared
inline float float4_length_squared_impl(const float* a)
{
return float4_dot_impl(a, a);
}
// Length
inline float float4_length_impl(const float* a)
{
return std::sqrt(float4_length_squared_impl(a));
}
// Normalize in-place
inline void float4_normalize_impl(float* a)
{
f32x4 va = v_load(a);
f32x4 invLen = v_rsqrt_nr(v_dot4(va, va)); // fully abstracted
f32x4 vnorm = v_mul(va, invLen);
v_store(a, vnorm);
}
// Normalize in-place to magnitude r: a = normalize(a) * r
inline void float4_normalize_mag_impl(float* a, float r)
{
f32x4 va = v_load(a);
// invLen = r / sqrt(dot(a,a)) = r * rsqrt(dot(a,a))
f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot4(va, va)));
f32x4 vnorm = v_mul(va, invLen);
v_store(a, vnorm);
}
// Linear interpolation: r = from + (to - from) * f
inline void float4_lerp_impl(const float* from, const float* to, float f, float* r)
{
f32x4 vfrom = v_load(from);
f32x4 vto = v_load(to);
f32x4 vf = v_set1(f);
f32x4 vr = v_add(vfrom, v_mul(vf, v_sub(vto, vfrom)));
v_store(r, vr);
}
inline void float4_cross_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load(a);
f32x4 vb = v_load(b);
f32x4 vcross = v_cross(va, vb);
v_store(r, vcross);
}
} // namespace math_backend::float4

View file

@@ -0,0 +1,424 @@
#pragma once
#include <cmath> // for sqrtf, etc.
#include "../mConstants.h"
namespace math_backend::mat44
{
//------------------------------------------------------------------
// Matrix Transpose
inline void mat44_transpose_impl(float* m)
{
f32x4x4 ma = m_load(m);
f32x4x4 mr = m_transpose(ma);
m_store(m, mr);
}
inline float mat44_get_determinant(const float* m)
{
f32x4x4 ma = m_load(m);
return v_extract0(m_determinant_affine(ma));
}
// Matrix Scale: Float3 (assume w = 1.0f)
inline void mat44_scale_impl(float* m, const float* s)
{
f32x4x4 ma = m_load(m);
f32x4 va = v_load3_pos(s);
ma.r0 = v_mul(ma.r0, va);
ma.r1 = v_mul(ma.r1, va);
ma.r2 = v_mul(ma.r2, va);
ma.r3 = v_mul(ma.r3, va);
m_store(m, ma);
}
inline void mat44_transform_plane_impl(const float* m, const float* scale, const float* plane, float* plane_result)
{
f32x4x4 M = m_load(m);
f32x4 plane_v = v_load(plane);
f32x4 scale_v = v_load3_vec(scale);
f32x4 invScale = v_rcp_nr(scale_v);
// normal = plane.xyz
f32x4 normal = plane_v;
// apply Inv(s)
normal = v_mul(normal, invScale);
// multiply by Inv(Tr(m)) (only the rotation part matters)
f32x4 nx = v_mul(v_swizzle_singular_mask(normal, 0), M.r0);
f32x4 ny = v_mul(v_swizzle_singular_mask(normal, 1), M.r1);
f32x4 nz = v_mul(v_swizzle_singular_mask(normal, 2), M.r2);
normal = v_add(v_add(nx, ny), nz);
normal = v_normalize3(normal);
// compute point on plane
float d = v_extract0(v_swizzle_singular_mask(plane_v, 3));
f32x4 point = v_mul(plane_v, v_set1(-d));
point = v_preserve_w(point, v_set1(1.0f));
// apply scale
point = v_mul(point, scale_v);
// transform point by matrix
point = m_mul_vec4(M, point);
// compute new plane distance
float newD = -v_extract0(v_dot3(point, normal));
alignas(16) float n[4];
v_store(n, normal);
plane_result[0] = n[0];
plane_result[1] = n[1];
plane_result[2] = n[2];
plane_result[3] = newD;
}
inline void mat44_get_scale_impl(const float* m, float* s)
{
f32x4x4 ma = m_load(m);
// squared lengths
f32x4 len2_x = v_dot3(ma.r0, ma.r0);
f32x4 len2_y = v_dot3(ma.r1, ma.r1);
f32x4 len2_z = v_dot3(ma.r2, ma.r2);
// extract and sqrt
s[0] = 1.0f / v_extract0(v_rsqrt_nr(len2_x));
s[1] = 1.0f / v_extract0(v_rsqrt_nr(len2_y));
s[2] = 1.0f / v_extract0(v_rsqrt_nr(len2_z));
}
// Matrix Scale Uniform: Float value (assume w = 1.0f)
inline void mat44_scale_uniform(float* m, float s)
{
f32x4x4 ma = m_load(m);
// (s, s, s, 1)
f32x4 scale = v_set(s, s, s, 1.0f);
// Scale only rotation rows (xyz part)
ma.r0 = v_mul(ma.r0, scale);
ma.r1 = v_mul(ma.r1, scale);
ma.r2 = v_mul(ma.r2, scale);
m_store(m, ma);
}
// Matrix Inverse
inline void mat44_inverse_impl(float* m)
{
f32x4x4 ma = m_load(m);
// Compute cofactors using cross products
f32x4x4 mTemp;
mTemp.r0 = v_cross(ma.r1, ma.r2);
mTemp.r1 = v_cross(ma.r2, ma.r0);
mTemp.r2 = v_cross(ma.r0, ma.r1);
// Determinant = dot(ma.r0, c0)
f32x4 det = v_dot3(ma.r0, mTemp.r0);
f32x4 invDet = v_rcp_nr(det);
// Scale cofactors
mTemp.r0 = v_mul(mTemp.r0, invDet);
mTemp.r1 = v_mul(mTemp.r1, invDet);
mTemp.r2 = v_mul(mTemp.r2, invDet);
// ---- Translation ----
// Load original translation
f32x4 T = v_set(m[3], m[7], m[11], 0.0f);
// Compute t' = -(inv(M) * t); before the transpose, mTemp's rows are the
// columns of inv(M), so inv(M)*t = Tx*mTemp.r0 + Ty*mTemp.r1 + Tz*mTemp.r2
f32x4 result = v_mul(mTemp.r0, v_swizzle_singular_mask(T, 0));
result = v_add(result, v_mul(mTemp.r1, v_swizzle_singular_mask(T, 1)));
result = v_add(result, v_mul(mTemp.r2, v_swizzle_singular_mask(T, 2)));
result = v_mul(result, v_set1(-1.0f));
// Store inverse 3x3 (transpose of cofactor matrix)
mTemp = m_transpose(mTemp);
mTemp.r3 = ma.r3;
m_store(m, mTemp);
// Store translation
m[3] = v_extract0(result);
m[7] = v_extract0(v_swizzle_singular_mask(result, 1));
m[11] = v_extract0(v_swizzle_singular_mask(result, 2));
}
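// Recap of the identity used above: with r0, r1, r2 the rows of the upper-left
// 3x3 block, det = dot(r0, cross(r1, r2)) and
// inv3x3 = (1/det) * transpose([ cross(r1,r2) ; cross(r2,r0) ; cross(r0,r1) ]),
// i.e. the scaled cross products are the columns of the inverse.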
// Matrix Inverse
inline void mat44_inverse_to_impl(const float* m, float* dst)
{
f32x4x4 ma = m_load(m);
// Compute cofactors using cross products
f32x4x4 mTemp;
mTemp.r0 = v_cross(ma.r1, ma.r2);
mTemp.r1 = v_cross(ma.r2, ma.r0);
mTemp.r2 = v_cross(ma.r0, ma.r1);
// Determinant = dot(ma.r0, c0)
f32x4 det = v_dot3(ma.r0, mTemp.r0);
f32x4 invDet = v_rcp_nr(det);
// Scale cofactors
mTemp.r0 = v_mul(mTemp.r0, invDet);
mTemp.r1 = v_mul(mTemp.r1, invDet);
mTemp.r2 = v_mul(mTemp.r2, invDet);
// ---- Translation ----
// Load original translation
f32x4 T = v_set(m[3], m[7], m[11], 0.0f);
// Compute t' = -(inv(M) * t); before the transpose, mTemp's rows are the
// columns of inv(M), so inv(M)*t = Tx*mTemp.r0 + Ty*mTemp.r1 + Tz*mTemp.r2
f32x4 result = v_mul(mTemp.r0, v_swizzle_singular_mask(T, 0));
result = v_add(result, v_mul(mTemp.r1, v_swizzle_singular_mask(T, 1)));
result = v_add(result, v_mul(mTemp.r2, v_swizzle_singular_mask(T, 2)));
result = v_mul(result, v_set1(-1.0f));
// Store inverse 3x3 (transpose of cofactor matrix)
mTemp = m_transpose(mTemp);
mTemp.r3 = ma.r3;
m_store(dst, mTemp);
// Store translation
dst[3] = v_extract0(result);
dst[7] = v_extract0(v_swizzle_singular_mask(result, 1));
dst[11] = v_extract0(v_swizzle_singular_mask(result, 2));
}
// Matrix Affine Inverse
inline void mat44_affine_inverse_impl(float* m)
{
f32x4x4 ma = m_load(m);
f32x4x4 mTemp = m_transpose(ma);
mTemp.r3 = v_set(0, 0, 0, 1);
// ---- Translation ----
// Load original translation
f32x4 T = v_set(m[3], m[7], m[11], 0.0f);
// Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2)
f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0));
result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1)));
result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2)));
result = v_mul(result, v_set1(-1.0f));
m_store(m, mTemp);
// Store translation
m[3] = v_extract0(result);
m[7] = v_extract0(v_swizzle_singular_mask(result, 1));
m[11] = v_extract0(v_swizzle_singular_mask(result, 2));
}
inline void mat44_normalize_impl(float* m)
{
// Load the matrix into SIMD registers
f32x4x4 mat = m_load(m);
// Transpose: now rows are columns
mat = m_transpose(mat);
// Extract columns (which are now rows)
f32x4 col0 = mat.r0;
f32x4 col1 = mat.r1;
// Rebuild orthonormal basis
f32x4 col2 = v_cross(col0, col1);
col1 = v_cross(col2, col0);
// Normalize columns
col0 = v_normalize3(col0);
col1 = v_normalize3(col1);
col2 = v_normalize3(col2);
// Write back directly into transposed matrix
mat.r0 = col0;
mat.r1 = col1;
mat.r2 = col2;
// Transpose back to row-major
mat = m_transpose(mat);
// Store back
m_store(m, mat);
}
// Matrix Multiply: a * b
inline void mat44_mul_mat44_impl(const float* a, const float* b, float* r)
{
f32x4x4 ma = m_load(a);
f32x4x4 mb = m_load(b);
f32x4x4 mr = m_mul(ma, mb);
m_store(r, mr);
}
// Vector Multiply: m * p (assume w = 1.0f)
inline void mat44_mul_pos3_impl(const float *m, const float *p, float* r)
{
f32x4x4 ma = m_load(m);
f32x4 va = v_load3_pos(p);
f32x4 vr = m_mul_vec4(ma, va);
v_store3(r, vr);
}
// Vector Multiply: m * v (assume w = 0.0f)
inline void mat44_mul_vec3_impl(const float* m, const float* v, float* r)
{
f32x4x4 ma = m_load(m);
f32x4 va = v_load3_vec(v);
f32x4 vr = m_mul_vec3(ma, va);
v_store3(r, vr);
}
// Vector Multiply: m * p (full [4x4] * [1x4])
inline void mat44_mul_float4_impl(const float* m, const float* p, float* r)
{
f32x4x4 ma = m_load(m);
f32x4 va = v_load(p);
f32x4 vr = m_mul_vec4(ma, va);
v_store(r, vr);
}
//--------------------------------------------------
// MATRIX ROTATION FUNCTIONS
//--------------------------------------------------
inline void mat44_rotation_x_impl(float* m, float angle)
{
float c = cosf(angle), s = sinf(angle);
f32x4x4 mr = m_identity();
mr.r1 = v_set(0, c, s, 0);
mr.r2 = v_set(0, -s, c, 0);
m_store(m, mr);
}
inline void mat44_rotation_y_impl(float* m, float angle)
{
float c = cosf(angle), s = sinf(angle);
f32x4x4 mr = m_identity();
mr.r0 = v_set(c, 0, -s, 0);
mr.r2 = v_set(s, 0, c, 0);
m_store(m, mr);
}
inline void mat44_rotation_z_impl(float* m, float angle)
{
float c = cosf(angle), s = sinf(angle);
f32x4x4 mr = m_identity();
mr.r0 = v_set(c, s, 0, 0);
mr.r1 = v_set(-s, c, 0, 0);
m_store(m, mr);
}
// Compose rotation from Euler angles (pitch=X, yaw=Y, roll=Z)
inline void mat44_rotation_euler_impl(float* m, float pitch, float yaw, float roll)
{
f32x4x4 rx, ry, rz;
mat44_rotation_x_impl((float*)&rx, pitch);
mat44_rotation_y_impl((float*)&ry, yaw);
mat44_rotation_z_impl((float*)&rz, roll);
f32x4x4 r = m_mul(rz, m_mul(ry, rx));
m_store(m, r);
}
inline void mat44_lookat_impl(float* m, const float* eye, const float* target, const float* up)
{
f32x4 vEye = v_load3_pos(eye);
f32x4 vTarget = v_load3_pos(target);
f32x4 vUp = v_load3_vec(up);
// Forward (z+)
f32x4 zaxis = v_normalize3(v_sub(vTarget, vEye));
// Right (x+)
f32x4 xaxis = v_normalize3(v_cross(vUp, zaxis));
// Up (y+)
f32x4 yaxis = v_cross(zaxis, xaxis);
// Compute translation components: -dot(axis, eye)
f32x4 t_x = v_mul(v_dot3(xaxis, vEye), v_set1(-1.0f));
f32x4 t_y = v_mul(v_dot3(yaxis, vEye), v_set1(-1.0f));
f32x4 t_z = v_mul(v_dot3(zaxis, vEye), v_set1(-1.0f));
f32x4x4 view;
view.r0 = v_insert_w(xaxis, t_x);
view.r1 = v_insert_w(yaxis, t_y);
view.r2 = v_insert_w(zaxis, t_z);
view.r3 = v_set(0, 0, 0, 1.0f);
m_store(m, view);
}
inline void mat44_perspective_impl(float* m, float fovY, float aspect, float znear, float zfar)
{
float f = 1.0f / tanf(fovY * 0.5f);
float nf = 1.0f / (znear - zfar);
f32x4x4 mp = m_zero();
mp.r0 = v_set(f / aspect, 0, 0, 0);
mp.r1 = v_set(0, f, 0, 0);
mp.r2 = v_set(0, 0, (zfar + znear) * nf, 2 * zfar * znear * nf);
mp.r3 = v_set(0, 0, -1, 0); // row-major projection
m_store(m, mp);
}
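// For reference, with f = 1/tan(fovY/2) and nf = 1/(znear - zfar) as above, the
// rows form the standard right-handed projection with clip z in [-1, 1]:
// [ f/aspect  0  0                0               ]
// [ 0         f  0                0               ]
// [ 0         0  (zfar+znear)*nf  2*zfar*znear*nf ]
// [ 0         0  -1               0               ]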
inline void mat44_orthographic_impl(float* m, float left, float right, float bottom, float top, float znear, float zfar)
{
f32x4x4 mo = m_zero();
mo.r0 = v_set(2.0f / (right - left), 0, 0, -(right + left) / (right - left));
mo.r1 = v_set(0, 2.0f / (top - bottom), 0, -(top + bottom) / (top - bottom));
mo.r2 = v_set(0, 0, -2.0f / (zfar - znear), -(zfar + znear) / (zfar - znear));
mo.r3 = v_set(0, 0, 0, 1.0f);
m_store(m, mo);
}
//--------------------------------------------------
// MATRIX BATCH FUNCTIONS
//--------------------------------------------------
inline void mat44_batch_mul_pos3(const float* m, const float* points, int count, float* result)
{
int i = 0;
// AVX has 8 lanes to play with
#if defined(MATH_SIMD_AVX2) || defined(MATH_SIMD_AVX)
// 8-wide AVX only
for (; i + 8 <= count; i += 8)
{
vec4_batch8 va = load_vec3_batch8(&points[i*3], 1.0f, false);
vec4_batch8 vr = m_mul_pos3_batch8(m, va);
store_vec3_batch8(&result[i*3], vr);
}
#endif // MATH_SIMD_AVX2 || MATH_SIMD_AVX
// 4-wide
for (; i + 4 <= count; i += 4)
{
vec4_batch4 va = load_vec3_batch4(&points[i * 3], 1.0f, false);
vec4_batch4 vr = m_mul_pos3_batch4(m, va);
store_vec3_batch4(&result[i * 3], vr);
}
for (; i < count; ++i)
{
size_t idx = i * 3;
mat44_mul_pos3_impl(m, &points[idx], &result[idx]);
}
}
} // namespace math_backend::mat44
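A hedged usage sketch of the batch path (the data here is illustrative): with 10 points, an AVX build takes points 0-7 through the 8-wide loop and the last two through the scalar tail:

float m[16] = { 1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1 }; // row-major transform (identity)
float pts[10 * 3] = {}; // packed xyz triples
float out[10 * 3];
math_backend::mat44::mat44_batch_mul_pos3(m, pts, 10, out);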

View file

@@ -0,0 +1,500 @@
#include "platform/platform.h"
#include "math/public/float4_dispatch.h"
#include "math/public/float3_dispatch.h"
#include "math/public/mat44_dispatch.h"
#include "math/mConstants.h"
#include "math/mMatrix.h"
#include <cmath> // for sqrtf, etc.
namespace math_backend::float4::dispatch
{
void install_scalar()
{
gFloat4.add = [](const float* a, const float* b, float* r) {
for (int i = 0; i < 4; i++) r[i] = a[i] + b[i];
};
gFloat4.sub = [](const float* a, const float* b, float* r) {
for (int i = 0; i < 4; i++) r[i] = a[i] - b[i];
};
gFloat4.mul = [](const float* a, const float* b, float* r) {
for (int i = 0; i < 4; i++) r[i] = a[i] * b[i];
};
gFloat4.mul_scalar = [](const float* a, float s, float* r) {
for (int i = 0; i < 4; i++) r[i] = a[i] * s;
};
gFloat4.div = [](const float* a, const float* b, float* r) {
for (int i = 0; i < 4; i++) r[i] = a[i] / b[i];
};
gFloat4.div_scalar = [](const float* a, float s, float* r) {
float denom = 1.0f / s;
for (int i = 0; i < 4; i++) r[i] = a[i] * denom;
};
gFloat4.dot = [](const float* a, const float* b) {
float sum = 0.f;
for (int i = 0; i < 4; i++) sum += a[i] * b[i];
return sum;
};
gFloat4.length = [](const float* a) {
float sum = 0.f;
for (int i = 0; i < 4; i++) sum += a[i] * a[i];
return std::sqrt(sum);
};
gFloat4.lengthSquared = [](const float* a) {
float sum = 0.f;
for (int i = 0; i < 4; i++) sum += a[i] * a[i];
return (sum);
};
gFloat4.normalize = [](float* a) {
float len = gFloat4.length(a);
if (len > POINT_EPSILON)
{
float denom = 1.0f / len;
for (int i = 0; i < 4; i++)
a[i] *= denom;
}
};
gFloat4.normalize_mag = [](float* a, float f) {
float len = gFloat4.length(a);
if (len > POINT_EPSILON)
{
float denom = f / len;
for (int i = 0; i < 4; i++) a[i] *= denom;
}
};
gFloat4.lerp = [](const float* from, const float* to, float f, float* r) {
for (int i = 0; i < 4; i++) r[i] = from[i] + (to[i] - from[i]) * f;
};
gFloat4.cross = [](const float* a, const float* b, float* r) {
const float ax = a[0];
const float ay = a[1];
const float az = a[2];
const float bx = b[0];
const float by = b[1];
const float bz = b[2];
r[0] = ay * bz - az * by;
r[1] = az * bx - ax * bz;
r[2] = ax * by - ay * bx;
};
}
}
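// gFloat4 is the module's function-pointer table, declared in
// math/public/float4_dispatch.h (not shown in this excerpt). A plausible shape,
// for orientation only:
// struct Float4Ops {
//     void (*add)(const float* a, const float* b, float* r);
//     float (*dot)(const float* a, const float* b);
//     void (*normalize)(float* a);
//     // ... one pointer per operation installed above
// };
// extern Float4Ops gFloat4;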
namespace math_backend::float3::dispatch
{
void install_scalar()
{
gFloat3.add = [](const float* a, const float* b, float* r) {
for (int i = 0; i < 3; i++) r[i] = a[i] + b[i];
};
gFloat3.sub = [](const float* a, const float* b, float* r) {
for (int i = 0; i < 3; i++) r[i] = a[i] - b[i];
};
gFloat3.mul = [](const float* a, const float* b, float* r) {
for (int i = 0; i < 3; i++) r[i] = a[i] * b[i];
};
gFloat3.mul_scalar = [](const float* a, float s, float* r) {
for (int i = 0; i < 3; i++) r[i] = a[i] * s;
};
gFloat3.div = [](const float* a, const float* b, float* r) {
for (int i = 0; i < 3; i++) r[i] = a[i] / b[i];
};
gFloat3.div_scalar = [](const float* a, float s, float* r) {
float denom = 1.0f / s;
for (int i = 0; i < 3; i++) r[i] = a[i] * denom;
};
gFloat3.dot = [](const float* a, const float* b) {
float sum = 0.f;
for (int i = 0; i < 3; i++) sum += a[i] * b[i];
return sum;
};
gFloat3.length = [](const float* a) {
float sum = 0.f;
for (int i = 0; i < 3; i++) sum += a[i] * a[i];
return std::sqrt(sum);
};
gFloat3.lengthSquared = [](const float* a) {
float sum = 0.f;
for (int i = 0; i < 3; i++) sum += a[i] * a[i];
return (sum);
};
gFloat3.normalize = [](float* a) {
float len = gFloat3.length(a);
if (len > POINT_EPSILON)
{
float denom = 1.0f / len;
for (int i = 0; i < 3; i++) a[i] *= denom;
}
};
gFloat3.normalize_mag = [](float* a, float f) {
float len = gFloat3.length(a);
if (len > POINT_EPSILON)
{
float denom = f / len;
for (int i = 0; i < 3; i++) a[i] *= denom;
}
};
gFloat3.lerp = [](const float* from, const float* to, float f, float* r) {
for (int i = 0; i < 3; i++) r[i] = from[i] + (to[i] - from[i]) * f;
};
gFloat3.cross = [](const float* a, const float* b, float* r) {
const float ax = a[0];
const float ay = a[1];
const float az = a[2];
const float bx = b[0];
const float by = b[1];
const float bz = b[2];
r[0] = ay * bz - az * by;
r[1] = az * bx - ax * bz;
r[2] = ax * by - ay * bx;
};
}
}
inline void swap(float& a, float& b)
{
float temp = a;
a = b;
b = temp;
}
namespace math_backend::mat44::dispatch
{
void install_scalar()
{
gMat44.transpose = [](float* a) {
swap(a[1], a[4]);
swap(a[2], a[8]);
swap(a[3], a[12]);
swap(a[6], a[9]);
swap(a[7], a[13]);
swap(a[11], a[14]);
};
gMat44.determinant = [](const float* m) {
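// determinant of the upper-left 3x3 block (sufficient for affine transforms)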
return m[0] * (m[5] * m[10] - m[6] * m[9]) +
m[4] * (m[2] * m[9] - m[1] * m[10]) +
m[8] * (m[1] * m[6] - m[2] * m[5]);
};
gMat44.mul_vec3 = [](const float* a, const float* b, float* r) {
#ifdef TORQUE_COMPILER_GCC
const F32 v0 = b[0], v1 = b[1], v2 = b[2];
const F32 m0 = a[0], m1 = a[1], m2 = a[2];
const F32 m4 = a[4], m5 = a[5], m6 = a[6];
const F32 m8 = a[8], m9 = a[9], m10 = a[10];
r[0] = m0 * v0 + m1 * v1 + m2 * v2;
r[1] = m4 * v0 + m5 * v1 + m6 * v2;
r[2] = m8 * v0 + m9 * v1 + m10 * v2;
#else
r[0] = a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
r[1] = a[4] * b[0] + a[5] * b[1] + a[6] * b[2];
r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2];
#endif
};
gMat44.inverse = [](float* m) {
// using Cramer's Rule to find the inverse:
// Minv = (1/det(M)) * adjoint(M)
float det = gMat44.determinant(m);
AssertFatal(det != 0.0f, "MatrixF::inverse: singular matrix, no inverse.");
float invDet = 1.0f / det;
float temp[16];
temp[0] = (m[5] * m[10] - m[6] * m[9]) * invDet;
temp[1] = (m[9] * m[2] - m[10] * m[1]) * invDet;
temp[2] = (m[1] * m[6] - m[2] * m[5]) * invDet;
temp[4] = (m[6] * m[8] - m[4] * m[10]) * invDet;
temp[5] = (m[10] * m[0] - m[8] * m[2]) * invDet;
temp[6] = (m[2] * m[4] - m[0] * m[6]) * invDet;
temp[8] = (m[4] * m[9] - m[5] * m[8]) * invDet;
temp[9] = (m[8] * m[1] - m[9] * m[0]) * invDet;
temp[10] = (m[0] * m[5] - m[1] * m[4]) * invDet;
m[0] = temp[0];
m[1] = temp[1];
m[2] = temp[2];
m[4] = temp[4];
m[5] = temp[5];
m[6] = temp[6];
m[8] = temp[8];
m[9] = temp[9];
m[10] = temp[10];
// invert the translation
temp[0] = -m[3];
temp[1] = -m[7];
temp[2] = -m[11];
gMat44.mul_vec3(m, temp, &temp[4]);
m[3] = temp[4];
m[7] = temp[5];
m[11] = temp[6];
};
gMat44.inverse_to = [](const float* m, float* d) {
// using Cramer's Rule to find the inverse:
// Minv = (1/det(M)) * adjoint(M)
F32 det = gMat44.determinant(m);
AssertFatal(det != 0.0f, "MatrixF::inverse: singular matrix, no inverse.");
F32 invDet = 1.0f / det;
d[0] = (m[5] * m[10] - m[6] * m[9]) * invDet;
d[1] = (m[9] * m[2] - m[10] * m[1]) * invDet;
d[2] = (m[1] * m[6] - m[2] * m[5]) * invDet;
d[4] = (m[6] * m[8] - m[4] * m[10]) * invDet;
d[5] = (m[10] * m[0] - m[8] * m[2]) * invDet;
d[6] = (m[2] * m[4] - m[0] * m[6]) * invDet;
d[8] = (m[4] * m[9] - m[5] * m[8]) * invDet;
d[9] = (m[8] * m[1] - m[9] * m[0]) * invDet;
d[10] = (m[0] * m[5] - m[1] * m[4]) * invDet;
// invert the translation
F32 temp[6];
temp[0] = -m[3];
temp[1] = -m[7];
temp[2] = -m[11];
gMat44.mul_vec3(d, temp, &temp[3]);
d[3] = temp[3];
d[7] = temp[4];
d[11] = temp[5];
d[12] = m[12];
d[13] = m[13];
d[14] = m[14];
d[15] = m[15];
};
gMat44.affine_inverse = [](float* a) {
F32 temp[16];
dMemcpy(temp, a, 16 * sizeof(F32));
// Transpose rotation
a[1] = temp[4];
a[4] = temp[1];
a[2] = temp[8];
a[8] = temp[2];
a[6] = temp[9];
a[9] = temp[6];
a[3] = -(temp[0] * temp[3] + temp[4] * temp[7] + temp[8] * temp[11]);
a[7] = -(temp[1] * temp[3] + temp[5] * temp[7] + temp[9] * temp[11]);
a[11] = -(temp[2] * temp[3] + temp[6] * temp[7] + temp[10] * temp[11]);
};
gMat44.scale = [](float* a, const float* s) {
// Note, doesn't allow scaling w...
a[0] *= s[0]; a[1] *= s[1]; a[2] *= s[2];
a[4] *= s[0]; a[5] *= s[1]; a[6] *= s[2];
a[8] *= s[0]; a[9] *= s[1]; a[10] *= s[2];
a[12] *= s[0]; a[13] *= s[1]; a[14] *= s[2];
};
gMat44.get_scale = [](const float* a, float* s) {
// column lengths of the 3x3 part give the scale factors
s[0] = std::sqrt(a[0] * a[0] + a[4] * a[4] + a[8] * a[8]);
s[1] = std::sqrt(a[1] * a[1] + a[5] * a[5] + a[9] * a[9]);
s[2] = std::sqrt(a[2] * a[2] + a[6] * a[6] + a[10] * a[10]);
};
gMat44.mul_float4 = [](const float* a, const float* b, float* r) {
AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!");
r[0] = a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
r[1] = a[4] * b[0] + a[5] * b[1] + a[6] * b[2] + a[7] * b[3];
r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2] + a[11] * b[3];
r[3] = a[12] * b[0] + a[13] * b[1] + a[14] * b[2] + a[15] * b[3];
};
gMat44.mul_pos3 = [](const float* a, const float* b, float* r) {
AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!");
r[0] = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3];
r[1] = a[4]*b[0] + a[5]*b[1] + a[6]*b[2] + a[7];
r[2] = a[8]*b[0] + a[9]*b[1] + a[10]*b[2] + a[11];
};
gMat44.mul_vec3 = [](const float* a, const float* b, float* r) {
AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!");
r[0] = a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
r[1] = a[4] * b[0] + a[5] * b[1] + a[6] * b[2];
r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2];
};
gMat44.mul_mat44 = [](const float* a, const float* b, float* mresult) {
mresult[0] = a[0]*b[0] + a[1]*b[4] + a[2]*b[8] + a[3]*b[12];
mresult[1] = a[0]*b[1] + a[1]*b[5] + a[2]*b[9] + a[3]*b[13];
mresult[2] = a[0]*b[2] + a[1]*b[6] + a[2]*b[10] + a[3]*b[14];
mresult[3] = a[0]*b[3] + a[1]*b[7] + a[2]*b[11] + a[3]*b[15];
mresult[4] = a[4]*b[0] + a[5]*b[4] + a[6]*b[8] + a[7]*b[12];
mresult[5] = a[4]*b[1] + a[5]*b[5] + a[6]*b[9] + a[7]*b[13];
mresult[6] = a[4]*b[2] + a[5]*b[6] + a[6]*b[10] + a[7]*b[14];
mresult[7] = a[4]*b[3] + a[5]*b[7] + a[6]*b[11] + a[7]*b[15];
mresult[8] = a[8]*b[0] + a[9]*b[4] + a[10]*b[8] + a[11]*b[12];
mresult[9] = a[8]*b[1] + a[9]*b[5] + a[10]*b[9] + a[11]*b[13];
mresult[10]= a[8]*b[2] + a[9]*b[6] + a[10]*b[10]+ a[11]*b[14];
mresult[11]= a[8]*b[3] + a[9]*b[7] + a[10]*b[11]+ a[11]*b[15];
mresult[12]= a[12]*b[0]+ a[13]*b[4]+ a[14]*b[8] + a[15]*b[12];
mresult[13]= a[12]*b[1]+ a[13]*b[5]+ a[14]*b[9] + a[15]*b[13];
mresult[14]= a[12]*b[2]+ a[13]*b[6]+ a[14]*b[10]+ a[15]*b[14];
mresult[15]= a[12]*b[3]+ a[13]*b[7]+ a[14]*b[11]+ a[15]*b[15];
};
gMat44.transform_plane = [](const F32* m, const F32* s, const F32* p, F32* presult) {
// We take in a matrix, a scale factor, and a plane equation. We want to output
// the resultant normal
// We have T = m*s
// To multiply the normal, we want Inv(Tr(m*s))
// Inv(Tr(ms)) = Inv(Tr(s) * Tr(m))
// = Inv(Tr(m)) * Inv(Tr(s))
//
// Inv(Tr(s)) = Inv(s) = diag(1/x, 1/y, 1/z, 1)
//
// Since m is an affine matrix, Tr(m) has R in its upper-left 3x3 block and
// (x y z 1) as its bottom row, so
// Inv(Tr(m)) = [ R^-1  0 ]
//              [ A B C 1 ]
// Where:
//
// P = (x, y, z)
// A = -(Row(0, r) * P);
// B = -(Row(1, r) * P);
// C = -(Row(2, r) * P);
MatrixF invScale(true);
F32* pScaleElems = invScale;
pScaleElems[MatrixF::idx(0, 0)] = 1.0f / s[0];
pScaleElems[MatrixF::idx(1, 1)] = 1.0f / s[1];
pScaleElems[MatrixF::idx(2, 2)] = 1.0f / s[2];
const Point3F shear(m[MatrixF::idx(3, 0)], m[MatrixF::idx(3, 1)], m[MatrixF::idx(3, 2)]);
const Point3F row0(m[MatrixF::idx(0, 0)], m[MatrixF::idx(0, 1)], m[MatrixF::idx(0, 2)]);
const Point3F row1(m[MatrixF::idx(1, 0)], m[MatrixF::idx(1, 1)], m[MatrixF::idx(1, 2)]);
const Point3F row2(m[MatrixF::idx(2, 0)], m[MatrixF::idx(2, 1)], m[MatrixF::idx(2, 2)]);
const F32 A = -mDot(row0, shear);
const F32 B = -mDot(row1, shear);
const F32 C = -mDot(row2, shear);
MatrixF invTrMatrix(true);
F32* destMat = invTrMatrix;
destMat[MatrixF::idx(0, 0)] = m[MatrixF::idx(0, 0)];
destMat[MatrixF::idx(1, 0)] = m[MatrixF::idx(1, 0)];
destMat[MatrixF::idx(2, 0)] = m[MatrixF::idx(2, 0)];
destMat[MatrixF::idx(0, 1)] = m[MatrixF::idx(0, 1)];
destMat[MatrixF::idx(1, 1)] = m[MatrixF::idx(1, 1)];
destMat[MatrixF::idx(2, 1)] = m[MatrixF::idx(2, 1)];
destMat[MatrixF::idx(0, 2)] = m[MatrixF::idx(0, 2)];
destMat[MatrixF::idx(1, 2)] = m[MatrixF::idx(1, 2)];
destMat[MatrixF::idx(2, 2)] = m[MatrixF::idx(2, 2)];
destMat[MatrixF::idx(0, 3)] = A;
destMat[MatrixF::idx(1, 3)] = B;
destMat[MatrixF::idx(2, 3)] = C;
invTrMatrix.mul(invScale);
Point3F norm(p[0], p[1], p[2]);
Point3F point = norm * -p[3];
invTrMatrix.mulP(norm);
norm.normalize();
MatrixF temp;
dMemcpy(temp, m, sizeof(F32) * 16);
point.x *= s[0];
point.y *= s[1];
point.z *= s[2];
temp.mulP(point);
PlaneF resultPlane(point, norm);
presult[0] = resultPlane.x;
presult[1] = resultPlane.y;
presult[2] = resultPlane.z;
presult[3] = resultPlane.d;
};
gMat44.normalize = [](float* a) {
F32 col0[3], col1[3], col2[3];
// extract columns 0 and 1
col0[0] = a[0];
col0[1] = a[4];
col0[2] = a[8];
col1[0] = a[1];
col1[1] = a[5];
col1[2] = a[9];
// rebuild the orthogonal basis: col2 = col0 x col1, then col1 = col2 x col0
math_backend::float3::dispatch::gFloat3.cross(col0, col1, col2);
math_backend::float3::dispatch::gFloat3.cross(col2, col0, col1);
math_backend::float3::dispatch::gFloat3.normalize(col0);
math_backend::float3::dispatch::gFloat3.normalize(col1);
math_backend::float3::dispatch::gFloat3.normalize(col2);
// store the normalized columns
a[0] = col0[0];
a[4] = col0[1];
a[8] = col0[2];
a[1] = col1[0];
a[5] = col1[1];
a[9] = col1[2];
a[2] = col2[0];
a[6] = col2[1];
a[10] = col2[2];
};
gMat44.batch_mul_pos3 = [](const float* m, const float* pts, int count, float* result_ptrs) {
int i = 0;
for (; i < count; i++)
{
int idx = i * 3;
gMat44.mul_pos3(m, &pts[idx], &result_ptrs[idx]);
}
};
}
}
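A plausible startup sequence, assuming backend selection happens once at runtime (the CPU-feature query below is hypothetical and not part of this diff):

void installMathBackend()
{
// scalar baseline first, so every table entry is valid
math_backend::float3::dispatch::install_scalar();
math_backend::float4::dispatch::install_scalar();
math_backend::mat44::dispatch::install_scalar();
#if defined(MATH_SIMD_AVX)
if (cpuSupportsAVX()) // hypothetical feature query
{
math_backend::float3::dispatch::install_avx();
math_backend::float4::dispatch::install_avx();
math_backend::mat44::dispatch::install_avx();
}
#endif
}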

View file

@@ -0,0 +1,625 @@
#pragma once
#include <immintrin.h> // AVX
namespace
{
typedef __m128 f32x4;
//------------------------------------------------------
// Load / Store
//------------------------------------------------------
// Load 4 floats from memory into a SIMD register
inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }
inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }
inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
inline f32x4 v_zero() { return _mm_setzero_ps(); }
inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }
//------------------------------------------------------
// Mask helpers
//------------------------------------------------------
inline f32x4 v_mask_xyz() { return _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)); } // all-ones bits in x,y,z and zero in w, for bitwise masking
inline f32x4 v_swizzle_mask(f32x4 v1, const int x, const int y, const int z, const int w)
{
__m128i idx = _mm_set_epi32(w, z, y, x);
return _mm_permutevar_ps(v1, idx);
}
inline f32x4 v_swizzle_singular_mask(f32x4 v1, const int x)
{
__m128i idx = _mm_set1_epi32(x);
return _mm_permutevar_ps(v1, idx);
}
inline f32x4 v_swizzle_lo(f32x4 v1)
{
return _mm_moveldup_ps(v1);
}
inline f32x4 v_swizzle_hi(f32x4 v1)
{
return _mm_movehdup_ps(v1);
}
inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
{
return _mm_blend_ps(newv, original, 0b1000);
}
//------------------------------------------------------
// Shuffle helpers
//------------------------------------------------------
inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w)
{
// result = (v1[x], v1[y], v2[z], v2[w])
f32x4 combined_lo = _mm_permutevar_ps(v1, _mm_set_epi32(y, x, y, x));
f32x4 combined_hi = _mm_permutevar_ps(v2, _mm_set_epi32(w, z, w, z));
return _mm_movelh_ps(combined_lo, combined_hi);
}
inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2)
{
return _mm_movelh_ps(v1, v2);
}
inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2)
{
return _mm_movehl_ps(v2, v1);
}
//------------------------------------------------------
// Float3 helpers (safe loading into 4 lanes)
//------------------------------------------------------
inline f32x4 v_load3_vec(const float* p) // w = 0
{
return _mm_set_ps(0.0f, p[2], p[1], p[0]);
}
inline f32x4 v_load3_pos(const float* p) // w = 1
{
return _mm_set_ps(1.0f, p[2], p[1], p[0]);
}
inline void v_store3(float* dst, f32x4 v)
{
alignas(16) float tmp[4]; // temp storage
_mm_store_ps(tmp, v); // store all 4 lanes
dst[0] = tmp[0];
dst[1] = tmp[1];
dst[2] = tmp[2];
}
inline f32x4 v_set(float x, float y, float z, float w)
{
return _mm_set_ps(w, z, y, x);
}
inline f32x4 v_insert_w(f32x4 v, f32x4 w)
{
return _mm_insert_ps(v, w, 0x30);
}
//------------------------------------------------------
// Simple Arithmetic
//------------------------------------------------------
// Element-wise multiply
inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
// Element-wise divide
inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
// Element-wise add
inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
// Element-wise subtract
inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
//------------------------------------------------------
// Fast recip
//------------------------------------------------------
// Fast reciprocal 1/b, refined with two Newton-Raphson steps: r' = 2r - b*r*r
inline f32x4 v_rcp_nr(f32x4 b)
{
f32x4 r = _mm_rcp_ps(b);
r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
}
// Fast divide: a * (1/b) using the refined reciprocal above
inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); }
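// Reciprocal square root with one Newton-Raphson refinement: r' = 0.5 * r * (3 - x*r*r)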
inline f32x4 v_rsqrt_nr(f32x4 x)
{
f32x4 half = _mm_set1_ps(0.5f);
f32x4 three = _mm_set1_ps(3.0f);
f32x4 r = _mm_rsqrt_ps(x);
f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r);
return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr));
}
//------------------------------------------------------
// Vector intrinsic functions
//------------------------------------------------------
// full dot4
inline f32x4 v_dot4(f32x4 a, f32x4 b)
{
return _mm_dp_ps(a, b, 0xFF); // multiply all 4 lanes, broadcast the sum to all lanes
}
// dot3 (ignores w)
inline f32x4 v_dot3(f32x4 a, f32x4 b)
{
return _mm_dp_ps(a, b, 0x7F); // multiply the x,y,z lanes, broadcast the sum to all lanes
}
// cross product xyz only.
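// Uses the identity a x b = ((a * b_yzx) - (a_yzx * b))_yzx, which needs only three shuffles.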
inline f32x4 v_cross(f32x4 a, f32x4 b)
{
f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b));
return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
}
inline f32x4 v_normalize3(f32x4 v)
{
const f32x4 zero = _mm_setzero_ps();
const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0}
f32x4 dot = v_dot3(v, v);
f32x4 inv = v_rsqrt_nr(dot);
f32x4 isZero = _mm_cmpeq_ps(dot, zero);
f32x4 norm = _mm_mul_ps(v, inv);
return _mm_blendv_ps(norm, fallback, isZero);
}
// adds all 4 lanes together.
inline f32x4 v_hadd4(f32x4 a)
{
// two SSE3 horizontal adds fold all 4 lanes into every lane
__m128 sum = _mm_hadd_ps(a, a);
return _mm_hadd_ps(sum, sum);
}
//------------------------------------------------------
// Matrix type (row-major 4x4)
//------------------------------------------------------
struct f32x4x4
{
f32x4 r0;
f32x4 r1;
f32x4 r2;
f32x4 r3;
};
//------------------------------------------------------
// Matrix Load / Store
//------------------------------------------------------
inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major)
{
f32x4x4 out;
out.r0 = v_load(m + 0);
out.r1 = v_load(m + 4);
out.r2 = v_load(m + 8);
out.r3 = v_load(m + 12);
return out;
}
inline void m_store(float* dst, const f32x4x4& m)
{
v_store(dst + 0, m.r0);
v_store(dst + 4, m.r1);
v_store(dst + 8, m.r2);
v_store(dst + 12, m.r3);
}
inline f32x4x4 m_identity()
{
f32x4x4 m;
m.r0 = _mm_set_ps(0, 0, 0, 1);
m.r1 = _mm_set_ps(0, 0, 1, 0);
m.r2 = _mm_set_ps(0, 1, 0, 0);
m.r3 = _mm_set_ps(1, 0, 0, 0);
return m;
}
inline f32x4x4 m_zero()
{
f32x4 z = v_zero();
return { z, z, z, z };
}
//------------------------------------------------------
// Matrix × Vector
//------------------------------------------------------
inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v)
{
f32x4 x = v_dot4(m.r0, v);
f32x4 y = v_dot4(m.r1, v);
f32x4 z = v_dot4(m.r2, v);
f32x4 w = v_dot4(m.r3, v);
// combine to a vector
return _mm_set_ps(_mm_cvtss_f32(w),
_mm_cvtss_f32(z),
_mm_cvtss_f32(y),
_mm_cvtss_f32(x));
}
inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v)
{
f32x4 x = v_dot3(m.r0, v);
f32x4 y = v_dot3(m.r1, v);
f32x4 z = v_dot3(m.r2, v);
// combine to a vector
return _mm_set_ps(0.0f,
_mm_cvtss_f32(z),
_mm_cvtss_f32(y),
_mm_cvtss_f32(x));
}
//------------------------------------------------------
// Matrix × Matrix
//------------------------------------------------------
inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b)
{
// Transpose B once for dot products
f32x4 b0 = b.r0;
f32x4 b1 = b.r1;
f32x4 b2 = b.r2;
f32x4 b3 = b.r3;
// Transpose (SSE2)
_MM_TRANSPOSE4_PS(b0, b1, b2, b3);
f32x4x4 C;
auto mul_row = [&](f32x4 arow)
{
f32x4 x = v_dot4(arow, b0);
f32x4 y = v_dot4(arow, b1);
f32x4 z = v_dot4(arow, b2);
f32x4 w = v_dot4(arow, b3);
f32x4 xy = _mm_unpacklo_ps(x, y);
f32x4 zw = _mm_unpacklo_ps(z, w);
return _mm_movelh_ps(xy, zw);
};
C.r0 = mul_row(a.r0);
C.r1 = mul_row(a.r1);
C.r2 = mul_row(a.r2);
C.r3 = mul_row(a.r3);
return C;
}
//------------------------------------------------------
// Transpose
//------------------------------------------------------
inline f32x4x4 m_transpose(const f32x4x4& m)
{
f32x4 r0 = m.r0;
f32x4 r1 = m.r1;
f32x4 r2 = m.r2;
f32x4 r3 = m.r3;
_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
return { r0, r1, r2, r3 };
}
inline f32x4x4 m_inverse_rigid(const f32x4x4& m)
{
f32x4x4 t = m_transpose(m);
// Extract translation
f32x4 T = v_set(
_mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))),
_mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))),
_mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))),
0.0f
);
f32x4 newT = m_mul_vec4(t, T);
newT = _mm_sub_ps(v_zero(), newT);
// place each component of -R^T t into the matching row's w lane
t.r0 = v_preserve_w(t.r0, v_swizzle_singular_mask(newT, 0));
t.r1 = v_preserve_w(t.r1, v_swizzle_singular_mask(newT, 1));
t.r2 = v_preserve_w(t.r2, v_swizzle_singular_mask(newT, 2));
t.r3 = v_set(0, 0, 0, 1);
return t;
}
inline f32x4 m_determinant(const f32x4x4& m)
{
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;
f32x4 c0 = v_cross(c, d);
f32x4 c1 = v_cross(d, b);
f32x4 c2 = v_cross(b, c);
f32x4 term0 = _mm_mul_ps(a, c0);
f32x4 term1 = _mm_mul_ps(a, c1);
f32x4 term2 = _mm_mul_ps(a, c2);
f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2));
return v_hadd4(det);
}
inline f32x4 m_determinant_affine(const f32x4x4& m)
{
f32x4 r0 = m.r0;
f32x4 r1 = m.r1;
f32x4 r2 = m.r2;
r0 = _mm_and_ps(r0, v_mask_xyz());
r1 = _mm_and_ps(r1, v_mask_xyz());
r2 = _mm_and_ps(r2, v_mask_xyz());
f32x4 c0 = v_cross(r1, r2);
return v_dot3(r0, c0); // splatted determinant
}
//---------------------------------------------------------
// BATCH INTRINSICS
//---------------------------------------------------------
typedef __m256 f32x8;
struct vec4_batch8
{
f32x8 x;
f32x8 y;
f32x8 z;
f32x8 w;
};
struct vec4_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
f32x4 w;
};
inline vec4_batch8 load_vec3_batch8(const float* ptr, float w, bool fillW)
{
// AoS -> SoA: for ptr = {x0,y0,z0, x1,y1,z1, ...}, lane i of r.x/r.y/r.z holds
// point i, matching the lane order store_vec3_batch8 writes back out
vec4_batch8 r;
r.x = _mm256_setr_ps(ptr[0], ptr[3], ptr[6], ptr[9],
ptr[12], ptr[15], ptr[18], ptr[21]);
r.y = _mm256_setr_ps(ptr[1], ptr[4], ptr[7], ptr[10],
ptr[13], ptr[16], ptr[19], ptr[22]);
r.z = _mm256_setr_ps(ptr[2], ptr[5], ptr[8], ptr[11],
ptr[14], ptr[17], ptr[20], ptr[23]);
if (fillW)
{
r.w = _mm256_set1_ps(w);
}
return r;
}
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
vec4_batch4 r;
r.x = _mm_setr_ps(ptr[0], ptr[3], ptr[6], ptr[9]);
r.y = _mm_setr_ps(ptr[1], ptr[4], ptr[7], ptr[10]);
r.z = _mm_setr_ps(ptr[2], ptr[5], ptr[8], ptr[11]);
if (fillW)
{
r.w = _mm_set1_ps(w);
}
return r;
}
// Store the result back out to a float array 8 at a time
inline void store_f32x8(float* out, f32x8 v)
{
_mm256_storeu_ps(out, v);
}
// Store the result back out to a float array 4 at a time
inline void store_f32x4(float* out, f32x4 v)
{
_mm_storeu_ps(out, v);
}
// Store the result back to a float3 array with size of 8
inline void store_vec3_batch8(float* out, const vec4_batch8& v)
{
alignas(32) float xs[8];
alignas(32) float ys[8];
alignas(32) float zs[8];
_mm256_store_ps(xs, v.x);
_mm256_store_ps(ys, v.y);
_mm256_store_ps(zs, v.z);
for (int i = 0; i < 8; ++i)
{
out[i * 3 + 0] = xs[i];
out[i * 3 + 1] = ys[i];
out[i * 3 + 2] = zs[i];
}
}
// Store the result back to a float3 array, 4 points at a time
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
{
alignas(16) float xs[4];
alignas(16) float ys[4];
alignas(16) float zs[4];
_mm_store_ps(xs, v.x);
_mm_store_ps(ys, v.y);
_mm_store_ps(zs, v.z);
for (int i = 0; i < 4; ++i)
{
out[i * 3 + 0] = xs[i];
out[i * 3 + 1] = ys[i];
out[i * 3 + 2] = zs[i];
}
}
inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b)
{
f32x4 mulx = _mm_mul_ps(a.x, b.x);
f32x4 muly = _mm_mul_ps(a.y, b.y);
f32x4 mulz = _mm_mul_ps(a.z, b.z);
return _mm_add_ps(_mm_add_ps(mulx, muly), mulz);
}
inline f32x8 vec3_batch8_dot(const vec4_batch8& a, const vec4_batch8& b)
{
f32x8 mulx = _mm256_mul_ps(a.x, b.x);
f32x8 muly = _mm256_mul_ps(a.y, b.y);
f32x8 mulz = _mm256_mul_ps(a.z, b.z);
return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz);
}
// Batch-8 pos3 transform (w assumed 1: the translation column is added).
inline vec4_batch8 m_mul_pos3_batch8(const float* m, const vec4_batch8& v)
{
vec4_batch8 r;
__m256 m00 = _mm256_set1_ps(m[0]);
__m256 m01 = _mm256_set1_ps(m[1]);
__m256 m02 = _mm256_set1_ps(m[2]);
__m256 m03 = _mm256_set1_ps(m[3]);
__m256 m10 = _mm256_set1_ps(m[4]);
__m256 m11 = _mm256_set1_ps(m[5]);
__m256 m12 = _mm256_set1_ps(m[6]);
__m256 m13 = _mm256_set1_ps(m[7]);
__m256 m20 = _mm256_set1_ps(m[8]);
__m256 m21 = _mm256_set1_ps(m[9]);
__m256 m22 = _mm256_set1_ps(m[10]);
__m256 m23 = _mm256_set1_ps(m[11]);
//
// row0 dot
//
r.x = _mm256_add_ps(
_mm256_add_ps(
_mm256_mul_ps(v.x, m00),
_mm256_mul_ps(v.y, m01)),
_mm256_add_ps(
_mm256_mul_ps(v.z, m02), m03));
//
// row1 dot
//
r.y = _mm256_add_ps(
_mm256_add_ps(
_mm256_mul_ps(v.x, m10),
_mm256_mul_ps(v.y, m11)),
_mm256_add_ps(
_mm256_mul_ps(v.z, m12), m13));
//
// row2 dot
//
r.z = _mm256_add_ps(
_mm256_add_ps(
_mm256_mul_ps(v.x, m20),
_mm256_mul_ps(v.y, m21)),
_mm256_add_ps(
_mm256_mul_ps(v.z, m22), m23));
return r;
}
// Batch-4 pos3 transform (w assumed 1: the translation column is added).
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
{
vec4_batch4 r;
f32x4 m00 = _mm_set1_ps(m[0]);
f32x4 m01 = _mm_set1_ps(m[1]);
f32x4 m02 = _mm_set1_ps(m[2]);
f32x4 m03 = _mm_set1_ps(m[3]);
f32x4 m10 = _mm_set1_ps(m[4]);
f32x4 m11 = _mm_set1_ps(m[5]);
f32x4 m12 = _mm_set1_ps(m[6]);
f32x4 m13 = _mm_set1_ps(m[7]);
f32x4 m20 = _mm_set1_ps(m[8]);
f32x4 m21 = _mm_set1_ps(m[9]);
f32x4 m22 = _mm_set1_ps(m[10]);
f32x4 m23 = _mm_set1_ps(m[11]);
//
// row0 dot
//
r.x = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m00),
_mm_mul_ps(v.y, m01)),
_mm_add_ps(
_mm_mul_ps(v.z, m02), m03));
//
// row1 dot
//
r.y = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m10),
_mm_mul_ps(v.y, m11)),
_mm_add_ps(
_mm_mul_ps(v.z, m12), m13));
//
// row2 dot
//
r.z = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m20),
_mm_mul_ps(v.y, m21)),
_mm_add_ps(
_mm_mul_ps(v.z, m22), m23));
return r;
}
}

View file

@@ -0,0 +1,26 @@
#include "avx_intrinsics.h"
#include "float3_dispatch.h"
#include <immintrin.h> // AVX/AVX2 intrinsics
#include "float3_impl.inl"
namespace math_backend::float3::dispatch
{
// Install AVX backend
void install_avx()
{
gFloat3.add = float3_add_impl;
gFloat3.sub = float3_sub_impl;
gFloat3.mul = float3_mul_impl;
gFloat3.mul_scalar = float3_mul_scalar_impl;
gFloat3.div = float3_div_impl;
gFloat3.div_scalar = float3_div_scalar_impl;
gFloat3.dot = float3_dot_impl;
gFloat3.length = float3_length_impl;
gFloat3.lengthSquared = float3_length_squared_impl;
gFloat3.normalize = float3_normalize_impl;
gFloat3.normalize_mag = float3_normalize_mag_impl;
gFloat3.lerp = float3_lerp_impl;
gFloat3.cross = float3_cross_impl;
}
}

View file

@@ -0,0 +1,25 @@
#include "avx_intrinsics.h"
#include "float4_dispatch.h"
#include "float4_impl.inl"
namespace math_backend::float4::dispatch
{
// Install AVX backend
void install_avx()
{
gFloat4.add = float4_add_impl;
gFloat4.sub = float4_sub_impl;
gFloat4.mul = float4_mul_impl;
gFloat4.mul_scalar = float4_mul_scalar_impl;
gFloat4.div = float4_div_impl;
gFloat4.div_scalar = float4_div_scalar_impl;
gFloat4.dot = float4_dot_impl;
gFloat4.length = float4_length_impl;
gFloat4.lengthSquared = float4_length_squared_impl;
gFloat4.normalize = float4_normalize_impl;
gFloat4.normalize_mag = float4_normalize_mag_impl;
gFloat4.lerp = float4_lerp_impl;
gFloat4.cross = float4_cross_impl;
}
}

View file

@@ -0,0 +1,26 @@
#include "avx_intrinsics.h"
#include "mat44_dispatch.h"
#include "mat44_impl.inl"
namespace math_backend::mat44::dispatch
{
void install_avx()
{
gMat44.transpose = mat44_transpose_impl;
gMat44.inverse = mat44_inverse_impl;
gMat44.inverse_to = mat44_inverse_to_impl;
gMat44.affine_inverse = mat44_affine_inverse_impl;
gMat44.mul_mat44 = mat44_mul_mat44_impl;
gMat44.mul_pos3 = mat44_mul_pos3_impl;
gMat44.mul_vec3 = mat44_mul_vec3_impl;
gMat44.mul_float4 = mat44_mul_float4_impl;
gMat44.transform_plane = mat44_transform_plane_impl;
gMat44.scale = mat44_scale_impl;
gMat44.get_scale = mat44_get_scale_impl;
gMat44.normalize = mat44_normalize_impl;
gMat44.determinant = mat44_get_determinant;
gMat44.batch_mul_pos3 = mat44_batch_mul_pos3;
}
}

View file

@@ -0,0 +1,625 @@
#pragma once
#include <immintrin.h> // AVX2
namespace
{
typedef __m128 f32x4;
//------------------------------------------------------
// Load / Store
//------------------------------------------------------
// Load 4 floats from memory into a SIMD register
inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }
inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }
inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
inline f32x4 v_zero() { return _mm_setzero_ps(); }
inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }
//------------------------------------------------------
// Mask helpers
//------------------------------------------------------
inline f32x4 v_mask_xyz() { return _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)); } // all-ones bits in x,y,z and zero in w, for bitwise masking
inline f32x4 v_swizzle_mask(f32x4 v1, const int x, const int y, const int z, const int w)
{
__m128i idx = _mm_set_epi32(w, z, y, x);
return _mm_permutevar_ps(v1, idx);
}
inline f32x4 v_swizzle_singular_mask(f32x4 v1, const int x)
{
__m128i idx = _mm_set1_epi32(x);
return _mm_permutevar_ps(v1, idx);
}
inline f32x4 v_swizzle_lo(f32x4 v1)
{
return _mm_moveldup_ps(v1);
}
inline f32x4 v_swizzle_hi(f32x4 v1)
{
return _mm_movehdup_ps(v1);
}
inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
{
return _mm_blend_ps(newv, original, 0b1000);
}
//------------------------------------------------------
// Shuffle helpers
//------------------------------------------------------
inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w)
{
// result = (v1[x], v1[y], v2[z], v2[w])
f32x4 combined_lo = _mm_permutevar_ps(v1, _mm_set_epi32(y, x, y, x));
f32x4 combined_hi = _mm_permutevar_ps(v2, _mm_set_epi32(w, z, w, z));
return _mm_movelh_ps(combined_lo, combined_hi);
}
inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2)
{
return _mm_movelh_ps(v1, v2);
}
inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2)
{
return _mm_movehl_ps(v2, v1);
}
//------------------------------------------------------
// Float3 helpers (safe loading into 4 lanes)
//------------------------------------------------------
inline f32x4 v_load3_vec(const float* p) // w = 0
{
return _mm_set_ps(0.0f, p[2], p[1], p[0]);
}
inline f32x4 v_load3_pos(const float* p) // w = 1
{
return _mm_set_ps(1.0f, p[2], p[1], p[0]);
}
inline void v_store3(float* dst, f32x4 v)
{
alignas(16) float tmp[4]; // temp storage
_mm_store_ps(tmp, v); // store all 4 lanes
dst[0] = tmp[0];
dst[1] = tmp[1];
dst[2] = tmp[2];
}
inline f32x4 v_set(float x, float y, float z, float w)
{
return _mm_set_ps(w, z, y, x);
}
inline f32x4 v_insert_w(f32x4 v, f32x4 w)
{
return _mm_insert_ps(v, w, 0x30);
}
//------------------------------------------------------
// Simple Arithmetic
//------------------------------------------------------
// Element-wise multiply
inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
// Element-wise divide
inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
// Element-wise add
inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
// Element-wise subtract
inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
//------------------------------------------------------
// Fast recip
//------------------------------------------------------
// Fast reciprocal 1/b, refined with two Newton-Raphson steps: r' = 2r - b*r*r
inline f32x4 v_rcp_nr(f32x4 b)
{
f32x4 r = _mm_rcp_ps(b);
r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
}
// Fast divide: a * (1/b) using the refined reciprocal above
inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); }
inline f32x4 v_rsqrt_nr(f32x4 x)
{
f32x4 half = _mm_set1_ps(0.5f);
f32x4 three = _mm_set1_ps(3.0f);
f32x4 r = _mm_rsqrt_ps(x);
f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r);
return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr));
}
//------------------------------------------------------
// Vector intrinsic functions
//------------------------------------------------------
// full dot4
inline f32x4 v_dot4(f32x4 a, f32x4 b)
{
return _mm_dp_ps(a, b, 0xFF); // multiply all 4 lanes, broadcast the sum to all lanes
}
// dot3 (ignores w)
inline f32x4 v_dot3(f32x4 a, f32x4 b)
{
return _mm_dp_ps(a, b, 0x7F); // multiply the x,y,z lanes, broadcast the sum to all lanes
}
// cross product xyz only.
inline f32x4 v_cross(f32x4 a, f32x4 b)
{
f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b));
return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
}
inline f32x4 v_normalize3(f32x4 v)
{
const f32x4 zero = _mm_setzero_ps();
const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0}
f32x4 dot = v_dot3(v, v);
f32x4 inv = v_rsqrt_nr(dot);
f32x4 isZero = _mm_cmpeq_ps(dot, zero);
f32x4 norm = _mm_mul_ps(v, inv);
return _mm_blendv_ps(norm, fallback, isZero);
}
// adds all 4 lanes together.
inline f32x4 v_hadd4(f32x4 a)
{
// two SSE3 horizontal adds fold all 4 lanes into every lane
__m128 sum = _mm_hadd_ps(a, a);
return _mm_hadd_ps(sum, sum);
}
//------------------------------------------------------
// Matrix type (row-major 4x4)
//------------------------------------------------------
struct f32x4x4
{
f32x4 r0;
f32x4 r1;
f32x4 r2;
f32x4 r3;
};
//------------------------------------------------------
// Matrix Load / Store
//------------------------------------------------------
inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major)
{
f32x4x4 out;
out.r0 = v_load(m + 0);
out.r1 = v_load(m + 4);
out.r2 = v_load(m + 8);
out.r3 = v_load(m + 12);
return out;
}
inline void m_store(float* dst, const f32x4x4& m)
{
v_store(dst + 0, m.r0);
v_store(dst + 4, m.r1);
v_store(dst + 8, m.r2);
v_store(dst + 12, m.r3);
}
inline f32x4x4 m_identity()
{
f32x4x4 m;
m.r0 = _mm_set_ps(0, 0, 0, 1);
m.r1 = _mm_set_ps(0, 0, 1, 0);
m.r2 = _mm_set_ps(0, 1, 0, 0);
m.r3 = _mm_set_ps(1, 0, 0, 0);
return m;
}
inline f32x4x4 m_zero()
{
f32x4 z = v_zero();
return { z, z, z, z };
}
//------------------------------------------------------
// Matrix × Vector
//------------------------------------------------------
inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v)
{
f32x4 x = v_dot4(m.r0, v);
f32x4 y = v_dot4(m.r1, v);
f32x4 z = v_dot4(m.r2, v);
f32x4 w = v_dot4(m.r3, v);
// combine to a vector
return _mm_set_ps(_mm_cvtss_f32(w),
_mm_cvtss_f32(z),
_mm_cvtss_f32(y),
_mm_cvtss_f32(x));
}
inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v)
{
f32x4 x = v_dot3(m.r0, v);
f32x4 y = v_dot3(m.r1, v);
f32x4 z = v_dot3(m.r2, v);
// combine to a vector
return _mm_set_ps(0.0f,
_mm_cvtss_f32(z),
_mm_cvtss_f32(y),
_mm_cvtss_f32(x));
}
//------------------------------------------------------
// Matrix × Matrix
//------------------------------------------------------
inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b)
{
// Transpose B once for dot products
f32x4 b0 = b.r0;
f32x4 b1 = b.r1;
f32x4 b2 = b.r2;
f32x4 b3 = b.r3;
// Transpose (SSE2)
_MM_TRANSPOSE4_PS(b0, b1, b2, b3);
f32x4x4 C;
auto mul_row = [&](f32x4 arow)
{
f32x4 x = v_dot4(arow, b0);
f32x4 y = v_dot4(arow, b1);
f32x4 z = v_dot4(arow, b2);
f32x4 w = v_dot4(arow, b3);
f32x4 xy = _mm_unpacklo_ps(x, y);
f32x4 zw = _mm_unpacklo_ps(z, w);
return _mm_movelh_ps(xy, zw);
};
C.r0 = mul_row(a.r0);
C.r1 = mul_row(a.r1);
C.r2 = mul_row(a.r2);
C.r3 = mul_row(a.r3);
return C;
}
//------------------------------------------------------
// Transpose
//------------------------------------------------------
inline f32x4x4 m_transpose(const f32x4x4& m)
{
f32x4 r0 = m.r0;
f32x4 r1 = m.r1;
f32x4 r2 = m.r2;
f32x4 r3 = m.r3;
_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
return { r0, r1, r2, r3 };
}
inline f32x4x4 m_inverse_rigid(const f32x4x4& m)
{
f32x4x4 t = m_transpose(m);
// Extract translation
f32x4 T = v_set(
_mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))),
_mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))),
_mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))),
0.0f
);
f32x4 newT = m_mul_vec4(t, T);
newT = _mm_sub_ps(v_zero(), newT);
// place each component of -R^T t into the matching row's w lane
t.r0 = v_preserve_w(t.r0, v_swizzle_singular_mask(newT, 0));
t.r1 = v_preserve_w(t.r1, v_swizzle_singular_mask(newT, 1));
t.r2 = v_preserve_w(t.r2, v_swizzle_singular_mask(newT, 2));
t.r3 = v_set(0, 0, 0, 1);
return t;
}
inline f32x4 m_determinant(const f32x4x4& m)
{
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;
f32x4 c0 = v_cross(c, d);
f32x4 c1 = v_cross(d, b);
f32x4 c2 = v_cross(b, c);
f32x4 term0 = _mm_mul_ps(a, c0);
f32x4 term1 = _mm_mul_ps(a, c1);
f32x4 term2 = _mm_mul_ps(a, c2);
f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2));
return v_hadd4(det);
}
inline f32x4 m_determinant_affine(const f32x4x4& m)
{
f32x4 r0 = m.r0;
f32x4 r1 = m.r1;
f32x4 r2 = m.r2;
r0 = _mm_and_ps(r0, v_mask_xyz());
r1 = _mm_and_ps(r1, v_mask_xyz());
r2 = _mm_and_ps(r2, v_mask_xyz());
f32x4 c0 = v_cross(r1, r2);
return v_dot3(r0, c0); // splatted determinant
}
//---------------------------------------------------------
// BATCH INTRINSICS
//---------------------------------------------------------
typedef __m256 f32x8;
struct vec4_batch8
{
f32x8 x;
f32x8 y;
f32x8 z;
f32x8 w;
};
struct vec4_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
f32x4 w;
};
inline vec4_batch8 load_vec3_batch8(const float* ptr, float w, bool fillW)
{
vec4_batch8 r;
// _mm256_set_ps takes arguments highest lane first; keep point i in lane i so
// store_vec3_batch8 writes the results back in the original order
r.x = _mm256_set_ps(ptr[21], ptr[18], ptr[15], ptr[12],
ptr[9], ptr[6], ptr[3], ptr[0]);
r.y = _mm256_set_ps(ptr[22], ptr[19], ptr[16], ptr[13],
ptr[10], ptr[7], ptr[4], ptr[1]);
r.z = _mm256_set_ps(ptr[23], ptr[20], ptr[17], ptr[14],
ptr[11], ptr[8], ptr[5], ptr[2]);
if (fillW)
{
r.w = _mm256_set1_ps(w);
}
return r;
}
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
vec4_batch4 r;
// _mm_set_ps takes arguments highest lane first; keep point i in lane i
// (matches the SSE2/SSE4.1 versions of this loader)
r.x = _mm_set_ps(ptr[9], ptr[6], ptr[3], ptr[0]);
r.y = _mm_set_ps(ptr[10], ptr[7], ptr[4], ptr[1]);
r.z = _mm_set_ps(ptr[11], ptr[8], ptr[5], ptr[2]);
if (fillW)
{
r.w = _mm_set1_ps(w);
}
return r;
}
// Store the result back out to a float array 8 at a time
inline void store_f32x8(float* out, f32x8 v)
{
_mm256_storeu_ps(out, v);
}
// Store the result back out to a float array 4 at a time
inline void store_f32x4(float* out, f32x4 v)
{
_mm_storeu_ps(out, v);
}
// Store the result back to a float3 array with size of 8
inline void store_vec3_batch8(float* out, const vec4_batch8& v)
{
alignas(32) float xs[8];
alignas(32) float ys[8];
alignas(32) float zs[8];
_mm256_store_ps(xs, v.x);
_mm256_store_ps(ys, v.y);
_mm256_store_ps(zs, v.z);
for (int i = 0; i < 8; ++i)
{
out[i * 3 + 0] = xs[i];
out[i * 3 + 1] = ys[i];
out[i * 3 + 2] = zs[i];
}
}
// Store the result back to a float3 array with size of 4
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
{
alignas(16) float xs[8];
alignas(16) float ys[8];
alignas(16) float zs[8];
_mm_store_ps(xs, v.x);
_mm_store_ps(ys, v.y);
_mm_store_ps(zs, v.z);
for (int i = 0; i < 4; ++i)
{
out[i * 3 + 0] = xs[i];
out[i * 3 + 1] = ys[i];
out[i * 3 + 2] = zs[i];
}
}
inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b)
{
f32x4 mulx = _mm_mul_ps(a.x, b.x);
f32x4 muly = _mm_mul_ps(a.y, b.y);
f32x4 mulz = _mm_mul_ps(a.z, b.z);
return _mm_add_ps(_mm_add_ps(mulx, muly), mulz);
}
inline f32x8 vec3_batch8_dot(const vec4_batch8& a, const vec4_batch8& b)
{
f32x8 mulx = _mm256_mul_ps(a.x, b.x);
f32x8 muly = _mm256_mul_ps(a.y, b.y);
f32x8 mulz = _mm256_mul_ps(a.z, b.z);
return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz);
}
// Batch-8 matrix * position transform (w assumed 1; uses rows 0-2 of the row-major matrix).
inline vec4_batch8 m_mul_pos3_batch8(const float* m, const vec4_batch8& v)
{
vec4_batch8 r;
__m256 m00 = _mm256_set1_ps(m[0]);
__m256 m01 = _mm256_set1_ps(m[1]);
__m256 m02 = _mm256_set1_ps(m[2]);
__m256 m03 = _mm256_set1_ps(m[3]);
__m256 m10 = _mm256_set1_ps(m[4]);
__m256 m11 = _mm256_set1_ps(m[5]);
__m256 m12 = _mm256_set1_ps(m[6]);
__m256 m13 = _mm256_set1_ps(m[7]);
__m256 m20 = _mm256_set1_ps(m[8]);
__m256 m21 = _mm256_set1_ps(m[9]);
__m256 m22 = _mm256_set1_ps(m[10]);
__m256 m23 = _mm256_set1_ps(m[11]);
//
// row0 dot
//
r.x = _mm256_add_ps(
_mm256_add_ps(
_mm256_mul_ps(v.x, m00),
_mm256_mul_ps(v.y, m01)),
_mm256_add_ps(
_mm256_mul_ps(v.z, m02), m03));
//
// row1 dot
//
r.y = _mm256_add_ps(
_mm256_add_ps(
_mm256_mul_ps(v.x, m10),
_mm256_mul_ps(v.y, m11)),
_mm256_add_ps(
_mm256_mul_ps(v.z, m12), m13));
//
// row2 dot
//
r.z = _mm256_add_ps(
_mm256_add_ps(
_mm256_mul_ps(v.x, m20),
_mm256_mul_ps(v.y, m21)),
_mm256_add_ps(
_mm256_mul_ps(v.z, m22), m23));
return r;
}
// Batch-4 matrix * position transform (w assumed 1; uses rows 0-2 of the row-major matrix).
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
{
vec4_batch4 r;
f32x4 m00 = _mm_set1_ps(m[0]);
f32x4 m01 = _mm_set1_ps(m[1]);
f32x4 m02 = _mm_set1_ps(m[2]);
f32x4 m03 = _mm_set1_ps(m[3]);
f32x4 m10 = _mm_set1_ps(m[4]);
f32x4 m11 = _mm_set1_ps(m[5]);
f32x4 m12 = _mm_set1_ps(m[6]);
f32x4 m13 = _mm_set1_ps(m[7]);
f32x4 m20 = _mm_set1_ps(m[8]);
f32x4 m21 = _mm_set1_ps(m[9]);
f32x4 m22 = _mm_set1_ps(m[10]);
f32x4 m23 = _mm_set1_ps(m[11]);
//
// row0 dot
//
r.x = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m00),
_mm_mul_ps(v.y, m01)),
_mm_add_ps(
_mm_mul_ps(v.z, m02), m03));
//
// row1 dot
//
r.y = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m10),
_mm_mul_ps(v.y, m11)),
_mm_add_ps(
_mm_mul_ps(v.z, m12), m13));
//
// row2 dot
//
r.z = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m20),
_mm_mul_ps(v.y, m21)),
_mm_add_ps(
_mm_mul_ps(v.z, m22), m23));
return r;
}
}
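A minimal usage sketch for the batch helpers above; the driver transform_points_avx is hypothetical, not part of this patch. It stages packed xyz positions into SoA registers, transforms eight points per iteration, and finishes the remainder with scalar code:
inline void transform_points_avx(const float* m /* 16 floats, row-major */,
const float* in, int count, float* out)
{
int i = 0;
for (; i + 8 <= count; i += 8)
{
vec4_batch8 p = load_vec3_batch8(in + i * 3, 1.0f, true); // SoA load, w = 1
vec4_batch8 r = m_mul_pos3_batch8(m, p); // rows 0-2 of m
store_vec3_batch8(out + i * 3, r); // back to packed xyz
}
for (; i < count; ++i) // scalar tail
{
const float* p = in + i * 3;
float* o = out + i * 3;
o[0] = m[0] * p[0] + m[1] * p[1] + m[2] * p[2] + m[3];
o[1] = m[4] * p[0] + m[5] * p[1] + m[6] * p[2] + m[7];
o[2] = m[8] * p[0] + m[9] * p[1] + m[10] * p[2] + m[11];
}
}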

View file

@ -0,0 +1,26 @@
#include "avx2_intrinsics.h"
#include "float3_dispatch.h"
#include <immintrin.h> // AVX/AVX2 intrinsics
#include "float3_impl.inl"
namespace math_backend::float3::dispatch
{
// Install AVX2 backend
void install_avx2()
{
gFloat3.add = float3_add_impl;
gFloat3.sub = float3_sub_impl;
gFloat3.mul = float3_mul_impl;
gFloat3.mul_scalar = float3_mul_scalar_impl;
gFloat3.div = float3_div_impl;
gFloat3.div_scalar = float3_div_scalar_impl;
gFloat3.dot = float3_dot_impl;
gFloat3.length = float3_length_impl;
gFloat3.lengthSquared = float3_length_squared_impl;
gFloat3.normalize = float3_normalize_impl;
gFloat3.normalize_mag = float3_normalize_mag_impl;
gFloat3.lerp = float3_lerp_impl;
gFloat3.cross = float3_cross_impl;
}
}
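For context, a sketch of the kind of table these installers populate. The real declarations live in float3_dispatch.h; the struct name below is an assumption, with field names taken from the assignments above and signatures from the _impl functions:
struct Float3OpsSketch
{
void (*add)(const float* a, const float* b, float* r);
void (*sub)(const float* a, const float* b, float* r);
void (*mul)(const float* a, const float* b, float* r);
// ...one function pointer per operation assigned above...
};
// A single process-wide table (gFloat3 in the real code) is rebound once at
// startup by whichever install_*() matches the best ISA the CPU reports.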

View file

@ -0,0 +1,25 @@
#include "avx2_intrinsics.h"
#include "float4_dispatch.h"
#include "float4_impl.inl"
namespace math_backend::float4::dispatch
{
// Install AVX2 backend
void install_avx2()
{
gFloat4.add = float4_add_impl;
gFloat4.sub = float4_sub_impl;
gFloat4.mul = float4_mul_impl;
gFloat4.mul_scalar = float4_mul_scalar_impl;
gFloat4.div = float4_div_impl;
gFloat4.div_scalar = float4_div_scalar_impl;
gFloat4.dot = float4_dot_impl;
gFloat4.length = float4_length_impl;
gFloat4.lengthSquared = float4_length_squared_impl;
gFloat4.normalize = float4_normalize_impl;
gFloat4.normalize_mag = float4_normalize_mag_impl;
gFloat4.lerp = float4_lerp_impl;
gFloat4.cross = float4_cross_impl;
}
}

View file

@ -0,0 +1,25 @@
#include "avx2_intrinsics.h"
#include "mat44_dispatch.h"
#include "mat44_impl.inl"
namespace math_backend::mat44::dispatch
{
void install_avx2()
{
gMat44.transpose = mat44_transpose_impl;
gMat44.inverse = mat44_inverse_impl;
gMat44.inverse_to = mat44_inverse_to_impl;
gMat44.affine_inverse = mat44_affine_inverse_impl;
gMat44.mul_mat44 = mat44_mul_mat44_impl;
gMat44.mul_pos3 = mat44_mul_pos3_impl;
gMat44.mul_vec3 = mat44_mul_vec3_impl;
gMat44.mul_float4 = mat44_mul_float4_impl;
gMat44.transform_plane = mat44_transform_plane_impl;
gMat44.scale = mat44_scale_impl;
gMat44.get_scale = mat44_get_scale_impl;
gMat44.normalize = mat44_normalize_impl;
gMat44.determinant = mat44_get_determinant;
gMat44.batch_mul_pos3 = mat44_batch_mul_pos3;
}
}

View file

@ -0,0 +1,25 @@
#include "neon_intrinsics.h"
#include "float3_dispatch.h"
#include "float3_impl.inl"
namespace math_backend::float3::dispatch
{
// Install NEON backend
void install_neon()
{
gFloat3.add = float3_add_impl;
gFloat3.sub = float3_sub_impl;
gFloat3.mul = float3_mul_impl;
gFloat3.mul_scalar = float3_mul_scalar_impl;
gFloat3.div = float3_div_impl;
gFloat3.div_scalar = float3_div_scalar_impl;
gFloat3.dot = float3_dot_impl;
gFloat3.length = float3_length_impl;
gFloat3.lengthSquared = float3_length_squared_impl;
gFloat3.normalize = float3_normalize_impl;
gFloat3.normalize_mag = float3_normalize_mag_impl;
gFloat3.lerp = float3_lerp_impl;
gFloat3.cross = float3_cross_impl;
}
}

View file

@ -0,0 +1,25 @@
#include "neon_intrinsics.h"
#include "float4_dispatch.h"
#include "float4_impl.inl"
namespace math_backend::float4::dispatch
{
// Install NEON backend
void install_neon()
{
gFloat4.add = float4_add_impl;
gFloat4.sub = float4_sub_impl;
gFloat4.mul = float4_mul_impl;
gFloat4.mul_scalar = float4_mul_scalar_impl;
gFloat4.div = float4_div_impl;
gFloat4.div_scalar = float4_div_scalar_impl;
gFloat4.dot = float4_dot_impl;
gFloat4.length = float4_length_impl;
gFloat4.lengthSquared = float4_length_squared_impl;
gFloat4.normalize = float4_normalize_impl;
gFloat4.normalize_mag = float4_normalize_mag_impl;
gFloat4.lerp = float4_lerp_impl;
gFloat4.cross = float4_cross_impl;
}
}

View file

@ -0,0 +1,23 @@
#include "neon_intrinsics.h"
#include "mat44_dispatch.h"
#include "mat44_impl.inl"
namespace math_backend::mat44::dispatch
{
void install_neon()
{
gMat44.transpose = mat44_transpose_impl;
gMat44.inverse = mat44_inverse_impl;
gMat44.inverse_to = mat44_inverse_to_impl;
gMat44.affine_inverse = mat44_affine_inverse_impl;
gMat44.mul_mat44 = mat44_mul_mat44_impl;
gMat44.mul_pos3 = mat44_mul_pos3_impl;
gMat44.mul_vec3 = mat44_mul_vec3_impl;
gMat44.mul_float4 = mat44_mul_float4_impl;
gMat44.scale = mat44_scale_impl;
gMat44.get_scale = mat44_get_scale_impl;
gMat44.normalize = mat44_normalize_impl;
gMat44.determinant = mat44_get_determinant;
}
}

View file

@ -0,0 +1,504 @@
#pragma once
#include <arm_neon.h>
namespace
{
typedef float32x4_t f32x4;
//------------------------------------------------------
// Load / Store
//------------------------------------------------------
inline f32x4 v_load(const float* p) { return vld1q_f32(p); }
inline void v_store(float* dst, f32x4 v) { vst1q_f32(dst, v); }
inline f32x4 v_set1(float s) { return vdupq_n_f32(s); }
inline f32x4 v_zero() { return vdupq_n_f32(0.0f); }
inline float v_extract0(f32x4 v) { return vgetq_lane_f32(v, 0); }
inline f32x4 v_set(float x, float y, float z, float w)
{
return { x, y, z, w }; // lane0=x, lane1=y, lane2=z, lane3=w
}
inline f32x4 v_insert_w(f32x4 v, f32x4 w)
{
// extract scalar value from w lane0
float w_val = vgetq_lane_f32(w, 0);
// broadcast w_val to all lanes (we only need lane3 later)
f32x4 w_broadcast = vdupq_n_f32(w_val);
// mask to select only lane3
uint32x4_t mask = {0, 0, 0, 0xFFFFFFFF};
// vbslq: if mask bit=1, take from first argument, else take from second
return vbslq_f32(mask, w_broadcast, v);
}
//------------------------------------------------------
// Mask helpers
//------------------------------------------------------
inline f32x4 v_mask_xyz()
{
uint32x4_t mask = {
0xFFFFFFFF,
0xFFFFFFFF,
0xFFFFFFFF,
0x00000000
};
return vreinterpretq_f32_u32(mask);
}
inline f32x4 v_and(f32x4 a, f32x4 b)
{
return vreinterpretq_f32_u32(
vandq_u32(
vreinterpretq_u32_f32(a),
vreinterpretq_u32_f32(b)
)
);
}
inline f32x4 v_swizzle_singular_mask(f32x4 v, int x)
{
// base byte index of the float lane
uint8x16_t base = vdupq_n_u8((uint8_t)(x * 4));
// byte offsets inside a float (0,1,2,3 repeated 4 times)
const uint8x16_t offsets = {
0,1,2,3, 0,1,2,3,
0,1,2,3, 0,1,2,3
};
uint8x16_t idx = vaddq_u8(base, offsets);
return vreinterpretq_f32_u8(
vqtbl1q_u8(
vreinterpretq_u8_f32(v),
idx
)
);
}
inline f32x4 v_swizzle_lo(f32x4 v)
{
float32x4x2_t t = vzipq_f32(v, v);
return t.val[0]; // [x, x, z, z]
}
inline f32x4 v_swizzle_hi(f32x4 v)
{
float32x4x2_t t = vzipq_f32(v, v);
return t.val[1]; // [y, y, w, w]
}
inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
{
// all-ones bits in lane 3 only; reinterpreting float 1.0f would give the
// partial bit pattern 0x3F800000 and corrupt the blended lane
uint32x4_t mask = {0, 0, 0, 0xFFFFFFFF};
return vbslq_f32(mask, original, newv); // lane 3 from original, xyz from newv
}
//------------------------------------------------------
// Float3 helpers
//------------------------------------------------------
inline f32x4 v_load3_vec(const float* p) // w = 0
{
float tmp[4] = { p[0], p[1], p[2], 0.0f };
return vld1q_f32(tmp);
}
inline f32x4 v_load3_pos(const float* p) // w = 1
{
float tmp[4] = { p[0], p[1], p[2], 1.0f };
return vld1q_f32(tmp);
}
inline void v_store3(float* dst, f32x4 v)
{
float tmp[4];
vst1q_f32(tmp, v);
dst[0] = tmp[0];
dst[1] = tmp[1];
dst[2] = tmp[2];
}
//------------------------------------------------------
// Simple Arithmetic
//------------------------------------------------------
inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); }
inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return vdivq_f32(a, b); } // only NEON64
inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); }
inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); }
//------------------------------------------------------
// Fast recip
//------------------------------------------------------
inline f32x4 v_rcp_nr(f32x4 b)
{
f32x4 r = vrecpeq_f32(b);
r = vmulq_f32(r, vrecpsq_f32(b, r)); // Newton-Raphson
r = vmulq_f32(r, vrecpsq_f32(b, r));
return r;
}
inline f32x4 v_div(f32x4 a, f32x4 b)
{
return vmulq_f32(a, v_rcp_nr(b));
}
inline f32x4 v_rsqrt_nr(f32x4 x)
{
f32x4 r = vrsqrteq_f32(x);
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r,r), x)); // refine
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r,r), x));
return r;
}
//------------------------------------------------------
// Vector intrinsic functions
//------------------------------------------------------
inline f32x4 v_dot4(f32x4 a, f32x4 b)
{
f32x4 mul = vmulq_f32(a, b);
float32x2_t sum2 = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul));
float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1);
return vdupq_n_f32(sum);
}
inline f32x4 v_dot3(f32x4 a, f32x4 b)
{
float32x4_t mask = {1.0f, 1.0f, 1.0f, 0.0f};
f32x4 mul = vmulq_f32(a, b);
mul = vmulq_f32(mul, mask);
float32x2_t sum2 = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul));
float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1);
return vdupq_n_f32(sum);
}
inline f32x4 v_cross(f32x4 a, f32x4 b)
{
f32x4 a_yzx = { vgetq_lane_f32(a,1),
vgetq_lane_f32(a,2),
vgetq_lane_f32(a,0),
0.0f };
f32x4 b_zxy = { vgetq_lane_f32(b,2),
vgetq_lane_f32(b,0),
vgetq_lane_f32(b,1),
0.0f };
f32x4 a_zxy = { vgetq_lane_f32(a,2),
vgetq_lane_f32(a,0),
vgetq_lane_f32(a,1),
0.0f };
f32x4 b_yzx = { vgetq_lane_f32(b,1),
vgetq_lane_f32(b,2),
vgetq_lane_f32(b,0),
0.0f };
return vsubq_f32(
vmulq_f32(a_yzx, b_zxy),
vmulq_f32(a_zxy, b_yzx)
);
}
inline f32x4 v_normalize3(f32x4 v)
{
const float32x4_t zero = vdupq_n_f32(0.0f);
const float32x4_t fallback = {0.0f, 0.0f, 1.0f, 0.0f};
f32x4 dot = v_dot3(v, v);
// dot == 0?
uint32x4_t isZero = vceqq_f32(dot, zero);
f32x4 inv = v_rsqrt_nr(dot);
f32x4 norm = vmulq_f32(v, inv);
// Select fallback when zero
return vbslq_f32(isZero, fallback, norm);
}
inline f32x4 v_hadd4(f32x4 a)
{
float32x2_t sum2 = vpadd_f32(vget_low_f32(a), vget_high_f32(a));
float sum = vget_lane_f32(sum2,0) + vget_lane_f32(sum2,1);
return vdupq_n_f32(sum);
}
//------------------------------------------------------
// Matrix type (row-major 4x4)
//------------------------------------------------------
struct f32x4x4
{
f32x4 r0;
f32x4 r1;
f32x4 r2;
f32x4 r3;
};
inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major)
{
f32x4x4 out;
out.r0 = v_load(m + 0);
out.r1 = v_load(m + 4);
out.r2 = v_load(m + 8);
out.r3 = v_load(m + 12);
return out;
}
inline void m_store(float* dst, const f32x4x4& m)
{
v_store(dst + 0, m.r0);
v_store(dst + 4, m.r1);
v_store(dst + 8, m.r2);
v_store(dst + 12, m.r3);
}
inline f32x4x4 m_identity()
{
f32x4x4 m;
m.r0 = {1,0,0,0};
m.r1 = {0,1,0,0};
m.r2 = {0,0,1,0};
m.r3 = {0,0,0,1};
return m;
}
inline f32x4x4 m_zero()
{
f32x4 z = v_zero();
return { z, z, z, z };
}
inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v)
{
f32x4 x = v_dot4(m.r0, v);
f32x4 y = v_dot4(m.r1, v);
f32x4 z = v_dot4(m.r2, v);
f32x4 w = v_dot4(m.r3, v);
return {
v_extract0(x),
v_extract0(y),
v_extract0(z),
v_extract0(w)
};
}
inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v)
{
f32x4 x = v_dot3(m.r0, v);
f32x4 y = v_dot3(m.r1, v);
f32x4 z = v_dot3(m.r2, v);
return {
v_extract0(x),
v_extract0(y),
v_extract0(z),
0.0f
};
}
inline f32x4x4 m_transpose(const f32x4x4& m)
{
float32x4x2_t t0 = vtrnq_f32(m.r0, m.r1);
float32x4x2_t t1 = vtrnq_f32(m.r2, m.r3);
float32x2_t a0 = vget_low_f32(t0.val[0]);
float32x2_t a1 = vget_high_f32(t0.val[0]);
float32x2_t a2 = vget_low_f32(t1.val[0]);
float32x2_t a3 = vget_high_f32(t1.val[0]);
float32x2_t b0 = vget_low_f32(t0.val[1]);
float32x2_t b1 = vget_high_f32(t0.val[1]);
float32x2_t b2 = vget_low_f32(t1.val[1]);
float32x2_t b3 = vget_high_f32(t1.val[1]);
f32x4x4 out;
out.r0 = vcombine_f32(a0, a2);
out.r1 = vcombine_f32(b0, b2);
out.r2 = vcombine_f32(a1, a3);
out.r3 = vcombine_f32(b1, b3);
return out;
}
inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b)
{
f32x4x4 bt = m_transpose(b);
auto mul_row = [&](f32x4 row)
{
f32x4 x = v_dot4(row, bt.r0);
f32x4 y = v_dot4(row, bt.r1);
f32x4 z = v_dot4(row, bt.r2);
f32x4 w = v_dot4(row, bt.r3);
return f32x4{
v_extract0(x),
v_extract0(y),
v_extract0(z),
v_extract0(w)
};
};
f32x4x4 C;
C.r0 = mul_row(a.r0);
C.r1 = mul_row(a.r1);
C.r2 = mul_row(a.r2);
C.r3 = mul_row(a.r3);
return C;
}
inline f32x4 m_determinant(const f32x4x4& m)
{
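// NOTE: v_cross ignores the w lanes, so this cofactor shortcut equals the full
// 4x4 determinant only when the bottom row is (0, 0, 0, 1), i.e. the engine's
// usual affine transforms; a general projective matrix needs a full expansion.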
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;
f32x4 c0 = v_cross(c, d);
f32x4 c1 = v_cross(d, b);
f32x4 c2 = v_cross(b, c);
f32x4 term0 = vmulq_f32(a, c0);
f32x4 term1 = vmulq_f32(a, c1);
f32x4 term2 = vmulq_f32(a, c2);
f32x4 det = vaddq_f32(term0, vaddq_f32(term1, term2));
return v_hadd4(det);
}
inline f32x4 m_determinant_affine(const f32x4x4& m)
{
f32x4 r0 = v_and(m.r0, v_mask_xyz());
f32x4 r1 = v_and(m.r1, v_mask_xyz());
f32x4 r2 = v_and(m.r2, v_mask_xyz());
f32x4 c0 = v_cross(r1, r2);
return v_dot3(r0, c0);
}
inline f32x4x4 m_inverse(const f32x4x4& m)
{
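// NOTE: this adjugate-via-cross shortcut treats each row as a 3-vector. For a
// standard transform whose bottom row d is (0, 0, 0, 1), cross(c, d) is zero
// and the computed det collapses to 0, so prefer the affine inverse path for
// such matrices.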
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;
f32x4 c0 = v_cross(b, c);
f32x4 c1 = v_cross(c, d);
f32x4 c2 = v_cross(d, a);
f32x4 c3 = v_cross(a, b);
f32x4 det = v_dot4(a, c1);
f32x4 invDet = v_rcp_nr(det);
f32x4x4 adj;
adj.r0 = vmulq_f32(c1, invDet);
adj.r1 = vmulq_f32(c2, invDet);
adj.r2 = vmulq_f32(c3, invDet);
adj.r3 = vmulq_f32(c0, invDet);
return m_transpose(adj);
}
struct vec4_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
f32x4 w;
};
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
vec4_batch4 r;
r.x = (f32x4){ ptr[0], ptr[3], ptr[6], ptr[9] };
r.y = (f32x4){ ptr[1], ptr[4], ptr[7], ptr[10] };
r.z = (f32x4){ ptr[2], ptr[5], ptr[8], ptr[11] };
if (fillW)
{
r.w = vdupq_n_f32(w);
}
return r;
}
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
{
alignas(16) float xs[4];
alignas(16) float ys[4];
alignas(16) float zs[4];
vst1q_f32(xs, v.x);
vst1q_f32(ys, v.y);
vst1q_f32(zs, v.z);
for (int i = 0; i < 4; ++i)
{
out[i * 3 + 0] = xs[i];
out[i * 3 + 1] = ys[i];
out[i * 3 + 2] = zs[i];
}
}
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
{
vec4_batch4 r;
float32x4_t m00 = vdupq_n_f32(m[0]);
float32x4_t m01 = vdupq_n_f32(m[1]);
float32x4_t m02 = vdupq_n_f32(m[2]);
float32x4_t m03 = vdupq_n_f32(m[3]);
float32x4_t m10 = vdupq_n_f32(m[4]);
float32x4_t m11 = vdupq_n_f32(m[5]);
float32x4_t m12 = vdupq_n_f32(m[6]);
float32x4_t m13 = vdupq_n_f32(m[7]);
float32x4_t m20 = vdupq_n_f32(m[8]);
float32x4_t m21 = vdupq_n_f32(m[9]);
float32x4_t m22 = vdupq_n_f32(m[10]);
float32x4_t m23 = vdupq_n_f32(m[11]);
// row0 dot
r.x = vaddq_f32(
vaddq_f32(
vmulq_f32(v.x, m00),
vmulq_f32(v.y, m01)),
vaddq_f32(
vmulq_f32(v.z, m02),
m03));
// row1 dot
r.y = vaddq_f32(
vaddq_f32(
vmulq_f32(v.x, m10),
vmulq_f32(v.y, m11)),
vaddq_f32(
vmulq_f32(v.z, m12),
m13));
// row2 dot
r.z = vaddq_f32(
vaddq_f32(
vmulq_f32(v.x, m20),
vmulq_f32(v.y, m21)),
vaddq_f32(
vmulq_f32(v.z, m22),
m23));
return r;
}
}

View file

@ -0,0 +1,26 @@
#include "sse2_intrinsics.h"
#include "float3_dispatch.h"
#include <emmintrin.h> // SSE2 intrinsics
#include "float3_impl.inl"
namespace math_backend::float3::dispatch
{
// Install SSE2 backend
void install_sse2()
{
gFloat3.add = float3_add_impl;
gFloat3.sub = float3_sub_impl;
gFloat3.mul = float3_mul_impl;
gFloat3.mul_scalar = float3_mul_scalar_impl;
gFloat3.div = float3_div_impl;
gFloat3.div_scalar = float3_div_scalar_impl;
gFloat3.dot = float3_dot_impl;
gFloat3.length = float3_length_impl;
gFloat3.lengthSquared = float3_length_squared_impl;
gFloat3.normalize = float3_normalize_impl;
gFloat3.normalize_mag = float3_normalize_mag_impl;
gFloat3.lerp = float3_lerp_impl;
gFloat3.cross = float3_cross_impl;
}
}

View file

@ -0,0 +1,26 @@
#include "sse2_intrinsics.h"
#include "float4_dispatch.h"
#include "float4_impl.inl"
namespace math_backend::float4::dispatch
{
// Install SSE2 backend
void install_sse2()
{
gFloat4.add = float4_add_impl;
gFloat4.sub = float4_sub_impl;
gFloat4.mul = float4_mul_impl;
gFloat4.mul_scalar = float4_mul_scalar_impl;
gFloat4.div = float4_div_impl;
gFloat4.div_scalar = float4_div_scalar_impl;
gFloat4.dot = float4_dot_impl;
gFloat4.length = float4_length_impl;
gFloat4.lengthSquared = float4_length_squared_impl;
gFloat4.normalize = float4_normalize_impl;
gFloat4.normalize_mag = float4_normalize_mag_impl;
gFloat4.lerp = float4_lerp_impl;
gFloat4.cross = float4_cross_impl;
}
}

View file

@ -0,0 +1,25 @@
#include "sse2_intrinsics.h"
#include "mat44_dispatch.h"
#include "mat44_impl.inl"
namespace math_backend::mat44::dispatch
{
void install_sse2()
{
gMat44.transpose = mat44_transpose_impl;
gMat44.inverse = mat44_inverse_impl;
gMat44.inverse_to = mat44_inverse_to_impl;
gMat44.affine_inverse = mat44_affine_inverse_impl;
gMat44.mul_mat44 = mat44_mul_mat44_impl;
gMat44.mul_pos3 = mat44_mul_pos3_impl;
gMat44.mul_vec3 = mat44_mul_vec3_impl;
gMat44.mul_float4 = mat44_mul_float4_impl;
gMat44.transform_plane = mat44_transform_plane_impl;
gMat44.scale = mat44_scale_impl;
gMat44.get_scale = mat44_get_scale_impl;
gMat44.normalize = mat44_normalize_impl;
gMat44.determinant = mat44_get_determinant;
gMat44.batch_mul_pos3 = mat44_batch_mul_pos3;
}
}

View file

@ -0,0 +1,655 @@
#pragma once
#include <emmintrin.h> // SSE2
#include <xmmintrin.h> // SSE
namespace
{
typedef __m128 f32x4;
//------------------------------------------------------
// Vector/Point Load / Store
//------------------------------------------------------
// Load 4 floats from memory into a SIMD register
inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }
inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }
inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
inline f32x4 v_zero() { return _mm_setzero_ps(); }
inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }
//------------------------------------------------------
// Float3 helpers (safe loading into 4 lanes)
//------------------------------------------------------
inline f32x4 v_load3_vec(const float* p) // w = 0
{
return _mm_set_ps(0.0f, p[2], p[1], p[0]);
}
inline f32x4 v_load3_pos(const float* p) // w = 1
{
return _mm_set_ps(1.0f, p[2], p[1], p[0]);
}
inline f32x4 v_set(float x, float y, float z, float w)
{
return _mm_set_ps(w, z, y, x);
}
inline f32x4 v_insert_w(f32x4 v, f32x4 w)
{
f32x4 mask = _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0));
return _mm_or_ps( _mm_and_ps(mask, w), _mm_andnot_ps(mask, v));
}
inline void v_store3(float* dst, f32x4 v)
{
alignas(16) float tmp[4]; // temp storage
_mm_store_ps(tmp, v); // store all 4 lanes
dst[0] = tmp[0];
dst[1] = tmp[1];
dst[2] = tmp[2];
}
//------------------------------------------------------
// Mask helpers
//------------------------------------------------------
inline f32x4 v_mask_xyz() { return _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)); }
inline f32x4 v_swizzle_mask(f32x4 v, int x, int y, int z, int w)
{
alignas(16) float tmp[4];
_mm_store_ps(tmp, v);
return _mm_set_ps(
tmp[w],
tmp[z],
tmp[y],
tmp[x]
);
}
inline f32x4 v_swizzle_singular_mask(f32x4 v, int x)
{
alignas(16) float tmp[4];
_mm_store_ps(tmp, v);
return _mm_set1_ps(tmp[x]);
}
inline f32x4 v_swizzle_lo(f32x4 v1)
{
return _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2,2,0,0));
}
inline f32x4 v_swizzle_hi(f32x4 v1)
{
return _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 3, 1, 1));
}
inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
{
f32x4 mask = _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)); // lane3 = 1
return _mm_or_ps(_mm_and_ps(mask, original), _mm_andnot_ps(mask, newv));
}
//------------------------------------------------------
// Shuffle helpers
//------------------------------------------------------
inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w)
{
alignas(16) float a[4], b[4];
_mm_store_ps(a, v1);
_mm_store_ps(b, v2);
return _mm_set_ps(
b[w],
b[z],
a[y],
a[x]
);
}
inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2)
{
return _mm_movelh_ps(v1, v2);
}
inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2)
{
return _mm_movehl_ps(v2, v1);
}
//------------------------------------------------------
// Simple Arithmetic
//------------------------------------------------------
// Element-wise multiply
inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
// Element-wise divide
inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
// Element-wise add
inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
// Element-wise subtract
inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
// Absolute value.
inline f32x4 v_abs(f32x4 v)
{
const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
return _mm_and_ps(v, mask);
}
//------------------------------------------------------
// Fast recip
//------------------------------------------------------
// Fast recip 1/b
inline f32x4 v_rcp_nr(f32x4 b)
{
f32x4 r = _mm_rcp_ps(b);
// two Newton-Raphson refinements: r' = 2r - b*r*r
r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
}
// Divide fast ( b = recip eg 1/b)
inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); }
inline f32x4 v_rsqrt_nr(f32x4 x)
{
f32x4 half = _mm_set1_ps(0.5f);
f32x4 three = _mm_set1_ps(3.0f);
f32x4 r = _mm_rsqrt_ps(x);
f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r);
return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr));
}
//------------------------------------------------------
// Vector intrinsic functions
//------------------------------------------------------
// full dot4
inline f32x4 v_dot4(f32x4 a, f32x4 b)
{
f32x4 prod = _mm_mul_ps(a, b); // multiply element-wise
// pairwise sums, then swap halves, so the result is splatted to every lane;
// callers such as m_inverse scale whole rows by rcp(det) and rely on this
f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1)); // [p1,p0,p3,p2]
prod = _mm_add_ps(prod, shuf); // [x+y, x+y, z+w, z+w]
shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2)); // swap halves
prod = _mm_add_ps(prod, shuf);
return prod; // f32x4, all lanes = dot(a,b)
}
// dot3 (ignores w)
inline f32x4 v_dot3(f32x4 a, f32x4 b)
{
f32x4 prod = _mm_mul_ps(a, b);
prod = _mm_and_ps(prod, v_mask_xyz()); // zero w
f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1));
prod = _mm_add_ps(prod, shuf);
shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2));
prod = _mm_add_ps(prod, shuf);
return prod; // f32x4, all lanes = dot(a,b)
}
// cross product xyz only.
inline f32x4 v_cross(f32x4 a, f32x4 b)
{
f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
f32x4 c = _mm_sub_ps( _mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b));
return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
}
inline f32x4 v_normalize3(f32x4 v)
{
const f32x4 zero = _mm_setzero_ps();
const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0}
f32x4 dot = v_dot3(v, v);
f32x4 inv = v_rsqrt_nr(dot);
f32x4 isZero = _mm_cmpeq_ps(dot, zero);
f32x4 norm = _mm_mul_ps(v, inv);
// vbsl equivalent
f32x4 result = _mm_or_ps(
_mm_and_ps(isZero, fallback),
_mm_andnot_ps(isZero, norm)
);
return result;
}
// adds all 4 lanes together.
inline f32x4 v_hadd4(f32x4 a)
{
// sum all 4 lanes in SSE2
__m128 shuf = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); // swap pairs
__m128 sums = _mm_add_ps(a, shuf);
shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2));
return _mm_add_ps(sums, shuf);
}
//------------------------------------------------------
// Matrix type (row-major 4x4)
//------------------------------------------------------
struct f32x4x4
{
f32x4 r0;
f32x4 r1;
f32x4 r2;
f32x4 r3;
};
//------------------------------------------------------
// Matrix Load / Store
//------------------------------------------------------
inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major)
{
f32x4x4 out;
out.r0 = v_load(m + 0);
out.r1 = v_load(m + 4);
out.r2 = v_load(m + 8);
out.r3 = v_load(m + 12);
return out;
}
inline void m_store(float* dst, const f32x4x4& m)
{
v_store(dst + 0, m.r0);
v_store(dst + 4, m.r1);
v_store(dst + 8, m.r2);
v_store(dst + 12, m.r3);
}
inline f32x4x4 m_identity()
{
f32x4x4 m;
m.r0 = _mm_set_ps(0, 0, 0, 1);
m.r1 = _mm_set_ps(0, 0, 1, 0);
m.r2 = _mm_set_ps(0, 1, 0, 0);
m.r3 = _mm_set_ps(1, 0, 0, 0);
return m;
}
inline f32x4x4 m_zero()
{
f32x4 z = v_zero();
return { z, z, z, z };
}
//------------------------------------------------------
// Matrix × Vector
//------------------------------------------------------
inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v)
{
f32x4 x = v_dot4(m.r0, v);
f32x4 y = v_dot4(m.r1, v);
f32x4 z = v_dot4(m.r2, v);
f32x4 w = v_dot4(m.r3, v);
// combine to a vector
return _mm_set_ps(_mm_cvtss_f32(w),
_mm_cvtss_f32(z),
_mm_cvtss_f32(y),
_mm_cvtss_f32(x));
}
inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v)
{
f32x4 x = v_dot3(m.r0, v);
f32x4 y = v_dot3(m.r1, v);
f32x4 z = v_dot3(m.r2, v);
// combine to a vector
return _mm_set_ps(0.0f,
_mm_cvtss_f32(z),
_mm_cvtss_f32(y),
_mm_cvtss_f32(x));
}
//------------------------------------------------------
// Matrix × Matrix
//------------------------------------------------------
inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b)
{
// Transpose B once for dot products
f32x4 b0 = b.r0;
f32x4 b1 = b.r1;
f32x4 b2 = b.r2;
f32x4 b3 = b.r3;
// Transpose (SSE2)
_MM_TRANSPOSE4_PS(b0, b1, b2, b3);
f32x4x4 C;
auto mul_row = [&](f32x4 arow)
{
f32x4 x = v_dot4(arow, b0);
f32x4 y = v_dot4(arow, b1);
f32x4 z = v_dot4(arow, b2);
f32x4 w = v_dot4(arow, b3);
f32x4 xy = _mm_unpacklo_ps(x, y);
f32x4 zw = _mm_unpacklo_ps(z, w);
return _mm_movelh_ps(xy, zw);
};
C.r0 = mul_row(a.r0);
C.r1 = mul_row(a.r1);
C.r2 = mul_row(a.r2);
C.r3 = mul_row(a.r3);
return C;
}
//------------------------------------------------------
// Transpose
//------------------------------------------------------
inline f32x4x4 m_transpose(const f32x4x4& m)
{
f32x4 r0 = m.r0;
f32x4 r1 = m.r1;
f32x4 r2 = m.r2;
f32x4 r3 = m.r3;
_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
return { r0, r1, r2, r3 };
}
inline void m_set3x3_transpose(float* dst, f32x4 c0, f32x4 c1, f32x4 c2)
{
dst[0] = v_extract0(c0); // c0.x
dst[1] = v_extract0(c1); // c1.x
dst[2] = v_extract0(c2); // c2.x
dst[4] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 1))); // c0.y
dst[5] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 1))); // c1.y
dst[6] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 1))); // c2.y
dst[8] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 2))); // c0.z
dst[9] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 2))); // c1.z
dst[10] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 2))); // c2.z
}
inline f32x4x4 m_inverse(const f32x4x4& m)
{
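// NOTE: this adjugate-via-cross shortcut treats each row as a 3-vector. For a
// standard transform whose bottom row d is (0, 0, 0, 1), cross(c, d) is zero
// and the computed det collapses to 0; prefer m_inverse_rigid / m_inverse_affine
// for such matrices.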
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;
f32x4 c0 = v_cross(b, c);
f32x4 c1 = v_cross(c, d);
f32x4 c2 = v_cross(d, a);
f32x4 c3 = v_cross(a, b);
f32x4 det = v_dot4(a, c1);
f32x4 invDet = v_rcp_nr(det);
f32x4x4 adj;
adj.r0 = _mm_mul_ps(c1, invDet);
adj.r1 = _mm_mul_ps(c2, invDet);
adj.r2 = _mm_mul_ps(c3, invDet);
adj.r3 = _mm_mul_ps(c0, invDet);
return m_transpose(adj);
}
inline f32x4x4 m_inverse_rigid(const f32x4x4& m)
{
f32x4x4 t = m_transpose(m);
// Extract translation
f32x4 T = v_set(
_mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))),
_mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))),
_mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))),
0.0f
);
f32x4 newT = m_mul_vec4(t, T);
newT = _mm_sub_ps(v_zero(), newT);
// Each row needs its own component of -R^T * T in w; v_preserve_w pulls lane 3,
// so splat the wanted lane first (passing newT directly would write newT.w into every row)
t.r0 = v_preserve_w(t.r0, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(0, 0, 0, 0))); // w = newT.x
t.r1 = v_preserve_w(t.r1, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(1, 1, 1, 1))); // w = newT.y
t.r2 = v_preserve_w(t.r2, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(2, 2, 2, 2))); // w = newT.z
t.r3 = v_set(0, 0, 0, 1);
return t;
}
inline f32x4x4 m_inverse_affine(const f32x4x4& m)
{
// Extract upper 3x3 rows
f32x4 r0 = m.r0;
f32x4 r1 = m.r1;
f32x4 r2 = m.r2;
// Zero translation
r0 = _mm_and_ps(r0, v_mask_xyz());
r1 = _mm_and_ps(r1, v_mask_xyz());
r2 = _mm_and_ps(r2, v_mask_xyz());
// Compute cofactors via cross products
f32x4 c0 = v_cross(r1, r2);
f32x4 c1 = v_cross(r2, r0);
f32x4 c2 = v_cross(r0, r1);
// Determinant
f32x4 det = v_dot3(r0, c0);
// Reciprocal determinant
f32x4 invDet = v_rcp_nr(det);
// Inverse 3x3 (transpose of cofactor matrix)
f32x4 i0 = _mm_mul_ps(c0, invDet);
f32x4 i1 = _mm_mul_ps(c1, invDet);
f32x4 i2 = _mm_mul_ps(c2, invDet);
// Transpose 3x3
f32x4 t0 = i0;
f32x4 t1 = i1;
f32x4 t2 = i2;
f32x4 t3 = _mm_setzero_ps();
_MM_TRANSPOSE4_PS(t0, t1, t2, t3);
// Extract translation
alignas(16) float tmp[4];
tmp[0] = _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3)));
tmp[1] = _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3)));
tmp[2] = _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3)));
tmp[3] = 0.0f;
f32x4 T = v_load(tmp);
// New translation = -R' * T
f32x4 newT = m_mul_vec4({ t0, t1, t2, t3 }, T);
newT = _mm_sub_ps(v_zero(), newT);
// Assemble final matrix
f32x4x4 out;
// Splat each component of the new translation before blending into lane 3
out.r0 = v_preserve_w(t0, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(0, 0, 0, 0))); // w = newT.x
out.r1 = v_preserve_w(t1, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(1, 1, 1, 1))); // w = newT.y
out.r2 = v_preserve_w(t2, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(2, 2, 2, 2))); // w = newT.z
out.r3 = _mm_set_ps(1, 0, 0, 0);
return out;
}
inline f32x4 m_determinant(const f32x4x4& m)
{
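// NOTE: v_cross ignores the w lanes, so this cofactor shortcut equals the full
// 4x4 determinant only when the bottom row is (0, 0, 0, 1), i.e. the engine's
// usual affine transforms; a general projective matrix needs a full expansion.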
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;
f32x4 c0 = v_cross(c, d);
f32x4 c1 = v_cross(d, b);
f32x4 c2 = v_cross(b, c);
f32x4 term0 = _mm_mul_ps(a, c0);
f32x4 term1 = _mm_mul_ps(a, c1);
f32x4 term2 = _mm_mul_ps(a, c2);
f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2));
return v_hadd4(det);
}
inline f32x4 m_determinant_affine(const f32x4x4& m)
{
f32x4 r0 = m.r0;
f32x4 r1 = m.r1;
f32x4 r2 = m.r2;
r0 = _mm_and_ps(r0, v_mask_xyz());
r1 = _mm_and_ps(r1, v_mask_xyz());
r2 = _mm_and_ps(r2, v_mask_xyz());
f32x4 c0 = v_cross(r1, r2);
return v_dot3(r0, c0); // splatted determinant
}
//---------------------------------------------------------
// BATCH INTRINSICS
//---------------------------------------------------------
struct vec4_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
f32x4 w;
};
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
vec4_batch4 r;
r.x = _mm_set_ps(
ptr[9], ptr[6], ptr[3], ptr[0]);
r.y = _mm_set_ps(
ptr[10], ptr[7], ptr[4], ptr[1]);
r.z = _mm_set_ps(
ptr[11], ptr[8], ptr[5], ptr[2]);
if (fillW)
{
r.w = _mm_set1_ps(w);
}
return r;
}
// Store the result back out to a float array 4 at a time
inline void store_f32x4(float* out, f32x4 v)
{
_mm_storeu_ps(out, v);
}
// Store the result back to a float3 array with size of 4
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
{
alignas(16) float xs[4];
alignas(16) float ys[4];
alignas(16) float zs[4];
_mm_store_ps(xs, v.x);
_mm_store_ps(ys, v.y);
_mm_store_ps(zs, v.z);
for (int i = 0; i < 4; ++i)
{
out[i * 3 + 0] = xs[i];
out[i * 3 + 1] = ys[i];
out[i * 3 + 2] = zs[i];
}
}
inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b)
{
f32x4 mulx = _mm_mul_ps(a.x, b.x);
f32x4 muly = _mm_mul_ps(a.y, b.y);
f32x4 mulz = _mm_mul_ps(a.z, b.z);
return _mm_add_ps(_mm_add_ps(mulx, muly), mulz);
}
// Batch-4 matrix * position transform (w assumed 1; uses rows 0-2 of the row-major matrix).
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
{
vec4_batch4 r;
f32x4 m00 = _mm_set1_ps(m[0]);
f32x4 m01 = _mm_set1_ps(m[1]);
f32x4 m02 = _mm_set1_ps(m[2]);
f32x4 m03 = _mm_set1_ps(m[3]);
f32x4 m10 = _mm_set1_ps(m[4]);
f32x4 m11 = _mm_set1_ps(m[5]);
f32x4 m12 = _mm_set1_ps(m[6]);
f32x4 m13 = _mm_set1_ps(m[7]);
f32x4 m20 = _mm_set1_ps(m[8]);
f32x4 m21 = _mm_set1_ps(m[9]);
f32x4 m22 = _mm_set1_ps(m[10]);
f32x4 m23 = _mm_set1_ps(m[11]);
//
// row0 dot
//
r.x = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m00),
_mm_mul_ps(v.y, m01)),
_mm_add_ps(
_mm_mul_ps(v.z, m02), m03));
//
// row1 dot
//
r.y = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m10),
_mm_mul_ps(v.y, m11)),
_mm_add_ps(
_mm_mul_ps(v.z, m12), m13));
//
// row2 dot
//
r.z = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m20),
_mm_mul_ps(v.y, m21)),
_mm_add_ps(
_mm_mul_ps(v.z, m22), m23));
return r;
}
}
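As a usage sketch (mat44_concat is a hypothetical helper, not part of this patch), composing two row-major matrices with the helpers above is a load/multiply/store round trip:
inline void mat44_concat(const float* a16, const float* b16, float* out16)
{
f32x4x4 A = m_load(a16); // 16 floats each, row-major
f32x4x4 B = m_load(b16);
m_store(out16, m_mul(A, B)); // out = A * B
}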

View file

@ -0,0 +1,26 @@
#include "sse41_intrinsics.h"
#include "float3_dispatch.h"
#include <smmintrin.h> // SSE41 intrinsics
#include "float3_impl.inl"
namespace math_backend::float3::dispatch
{
// Install SSE41 backend
void install_sse41()
{
gFloat3.add = float3_add_impl;
gFloat3.sub = float3_sub_impl;
gFloat3.mul = float3_mul_impl;
gFloat3.mul_scalar = float3_mul_scalar_impl;
gFloat3.div = float3_div_impl;
gFloat3.div_scalar = float3_div_scalar_impl;
gFloat3.dot = float3_dot_impl;
gFloat3.length = float3_length_impl;
gFloat3.lengthSquared = float3_length_squared_impl;
gFloat3.normalize = float3_normalize_impl;
gFloat3.normalize_mag = float3_normalize_mag_impl;
gFloat3.lerp = float3_lerp_impl;
gFloat3.cross = float3_cross_impl;
}
}

View file

@ -0,0 +1,25 @@
#include "sse41_intrinsics.h"
#include "float4_dispatch.h"
#include "float4_impl.inl"
namespace math_backend::float4::dispatch
{
// Install SSE41 backend
void install_sse41()
{
gFloat4.add = float4_add_impl;
gFloat4.sub = float4_sub_impl;
gFloat4.mul = float4_mul_impl;
gFloat4.mul_scalar = float4_mul_scalar_impl;
gFloat4.div = float4_div_impl;
gFloat4.div_scalar = float4_div_scalar_impl;
gFloat4.dot = float4_dot_impl;
gFloat4.length = float4_length_impl;
gFloat4.lengthSquared = float4_length_squared_impl;
gFloat4.normalize = float4_normalize_impl;
gFloat4.normalize_mag = float4_normalize_mag_impl;
gFloat4.lerp = float4_lerp_impl;
gFloat4.cross = float4_cross_impl;
}
}

View file

@ -0,0 +1,25 @@
#include "sse41_intrinsics.h"
#include "mat44_dispatch.h"
#include "mat44_impl.inl"
namespace math_backend::mat44::dispatch
{
void install_sse41()
{
gMat44.transpose = mat44_transpose_impl;
gMat44.inverse = mat44_inverse_impl;
gMat44.inverse_to = mat44_inverse_to_impl;
gMat44.affine_inverse = mat44_affine_inverse_impl;
gMat44.mul_mat44 = mat44_mul_mat44_impl;
gMat44.mul_pos3 = mat44_mul_pos3_impl;
gMat44.mul_vec3 = mat44_mul_vec3_impl;
gMat44.mul_float4 = mat44_mul_float4_impl;
gMat44.transform_plane = mat44_transform_plane_impl;
gMat44.scale = mat44_scale_impl;
gMat44.get_scale = mat44_get_scale_impl;
gMat44.normalize = mat44_normalize_impl;
gMat44.determinant = mat44_get_determinant;
gMat44.batch_mul_pos3 = mat44_batch_mul_pos3;
}
}

View file

@ -0,0 +1,625 @@
#pragma once
#include <smmintrin.h> // SSE4.1
namespace
{
typedef __m128 f32x4;
//------------------------------------------------------
// Load / Store
//------------------------------------------------------
// Load 4 floats from memory into a SIMD register
inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }
inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }
inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
inline f32x4 v_zero() { return _mm_setzero_ps(); }
inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }
//------------------------------------------------------
// Mask helpers
//------------------------------------------------------
// all-ones bits in the x/y/z lanes; this feeds _mm_and_ps, so it must be a bit
// mask (blending the float values 1.0f would AND rows with 0x3F800000)
inline f32x4 v_mask_xyz() { return _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)); }
inline f32x4 v_swizzle_mask(f32x4 v, int x, int y, int z, int w)
{
alignas(16) float tmp[4];
_mm_store_ps(tmp, v);
return _mm_set_ps(
tmp[w],
tmp[z],
tmp[y],
tmp[x]
);
}
inline f32x4 v_swizzle_singular_mask(f32x4 v, int x)
{
alignas(16) float tmp[4];
_mm_store_ps(tmp, v);
return _mm_set1_ps(tmp[x]);
}
inline f32x4 v_swizzle_lo(f32x4 v1)
{
return _mm_moveldup_ps(v1);
}
inline f32x4 v_swizzle_hi(f32x4 v1)
{
return _mm_movehdup_ps(v1);
}
inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
{
return _mm_blend_ps(newv, original, 0b1000);
}
//------------------------------------------------------
// Shuffle helpers
//------------------------------------------------------
inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w)
{
alignas(16) float a[4], b[4];
_mm_store_ps(a, v1);
_mm_store_ps(b, v2);
return _mm_set_ps(
b[w],
b[z],
a[y],
a[x]
);
}
inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2)
{
return _mm_movelh_ps(v1, v2);
}
inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2)
{
return _mm_movehl_ps(v2, v1);
}
//------------------------------------------------------
// Float3 helpers (safe loading into 4 lanes)
//------------------------------------------------------
inline f32x4 v_load3_vec(const float* p) // w = 0
{
return _mm_set_ps(0.0f, p[2], p[1], p[0]);
}
inline f32x4 v_load3_pos(const float* p) // w = 1
{
return _mm_set_ps(1.0f, p[2], p[1], p[0]);
}
inline void v_store3(float* dst, f32x4 v)
{
alignas(16) float tmp[4]; // temp storage
_mm_store_ps(tmp, v); // store all 4 lanes
dst[0] = tmp[0];
dst[1] = tmp[1];
dst[2] = tmp[2];
}
inline f32x4 v_set(float x, float y, float z, float w)
{
return _mm_set_ps(w, z, y, x);
}
inline f32x4 v_insert_w(f32x4 v, f32x4 w)
{
return _mm_insert_ps(v, w, 0x30);
}
//------------------------------------------------------
// Simple Arithmetic
//------------------------------------------------------
// Element-wise multiply
inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
// Element-wise divide
inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
// Element-wise add
inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
// Element-wise subtract
inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
//------------------------------------------------------
// Fast recip
//------------------------------------------------------
// Fast recip 1/b
inline f32x4 v_rcp_nr(f32x4 b)
{
f32x4 r = _mm_rcp_ps(b);
// two Newton-Raphson refinements: r' = 2r - b*r*r
r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
}
// Divide fast ( b = recip eg 1/b)
inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); }
inline f32x4 v_rsqrt_nr(f32x4 x)
{
f32x4 half = _mm_set1_ps(0.5f);
f32x4 three = _mm_set1_ps(3.0f);
f32x4 r = _mm_rsqrt_ps(x);
f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r);
return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr));
}
//------------------------------------------------------
// Vector intrinsic functions
//------------------------------------------------------
// full dot4
inline f32x4 v_dot4(f32x4 a, f32x4 b)
{
return _mm_dp_ps(a, b, 0xFF); // dot over all 4 lanes, result broadcast to every lane
}
// dot3 (ignores w)
inline f32x4 v_dot3(f32x4 a, f32x4 b)
{
return _mm_dp_ps(a, b, 0x7F); // dot over the x/y/z lanes, result broadcast to every lane
}
// cross product xyz only.
inline f32x4 v_cross(f32x4 a, f32x4 b)
{
f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b));
return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
}
inline f32x4 v_normalize3(f32x4 v)
{
const f32x4 zero = _mm_setzero_ps();
const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0}
f32x4 dot = v_dot3(v, v);
f32x4 inv = v_rsqrt_nr(dot);
f32x4 isZero = _mm_cmpeq_ps(dot, zero);
f32x4 norm = _mm_mul_ps(v, inv);
return _mm_blendv_ps(norm, fallback, isZero);
}
// adds all 4 lanes together.
inline f32x4 v_hadd4(f32x4 a)
{
// sum all 4 lanes (SSE3 haddps, available on SSE4.1 targets)
__m128 sum = _mm_hadd_ps(a, a);
return _mm_hadd_ps(sum, sum);
}
//------------------------------------------------------
// Matrix type (row-major 4x4)
//------------------------------------------------------
struct f32x4x4
{
f32x4 r0;
f32x4 r1;
f32x4 r2;
f32x4 r3;
};
//------------------------------------------------------
// Matrix Load / Store
//------------------------------------------------------
inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major)
{
f32x4x4 out;
out.r0 = v_load(m + 0);
out.r1 = v_load(m + 4);
out.r2 = v_load(m + 8);
out.r3 = v_load(m + 12);
return out;
}
inline void m_store(float* dst, const f32x4x4& m)
{
_mm_storeu_ps(dst + 0, m.r0);
_mm_storeu_ps(dst + 4, m.r1);
_mm_storeu_ps(dst + 8, m.r2);
_mm_storeu_ps(dst + 12, m.r3);
}
inline f32x4x4 m_identity()
{
f32x4x4 m;
m.r0 = _mm_set_ps(0, 0, 0, 1);
m.r1 = _mm_set_ps(0, 0, 1, 0);
m.r2 = _mm_set_ps(0, 1, 0, 0);
m.r3 = _mm_set_ps(1, 0, 0, 0);
return m;
}
inline f32x4x4 m_zero()
{
f32x4 z = v_zero();
return { z, z, z, z };
}
//------------------------------------------------------
// Matrix × Vector
//------------------------------------------------------
inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v)
{
f32x4 x = v_dot4(m.r0, v);
f32x4 y = v_dot4(m.r1, v);
f32x4 z = v_dot4(m.r2, v);
f32x4 w = v_dot4(m.r3, v);
// combine to a vector
return _mm_set_ps(_mm_cvtss_f32(w),
_mm_cvtss_f32(z),
_mm_cvtss_f32(y),
_mm_cvtss_f32(x));
}
inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v)
{
f32x4 x = v_dot3(m.r0, v);
f32x4 y = v_dot3(m.r1, v);
f32x4 z = v_dot3(m.r2, v);
// combine to a vector
return _mm_set_ps(0.0f,
_mm_cvtss_f32(z),
_mm_cvtss_f32(y),
_mm_cvtss_f32(x));
}
//------------------------------------------------------
// Matrix × Matrix
//------------------------------------------------------
inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b)
{
// Transpose B once for dot products
f32x4 b0 = b.r0;
f32x4 b1 = b.r1;
f32x4 b2 = b.r2;
f32x4 b3 = b.r3;
// Transpose (SSE2)
_MM_TRANSPOSE4_PS(b0, b1, b2, b3);
f32x4x4 C;
auto mul_row = [&](f32x4 arow)
{
f32x4 x = v_dot4(arow, b0);
f32x4 y = v_dot4(arow, b1);
f32x4 z = v_dot4(arow, b2);
f32x4 w = v_dot4(arow, b3);
f32x4 xy = _mm_unpacklo_ps(x, y);
f32x4 zw = _mm_unpacklo_ps(z, w);
return _mm_movelh_ps(xy, zw);
};
C.r0 = mul_row(a.r0);
C.r1 = mul_row(a.r1);
C.r2 = mul_row(a.r2);
C.r3 = mul_row(a.r3);
return C;
}
//------------------------------------------------------
// Transpose
//------------------------------------------------------
inline f32x4x4 m_transpose(const f32x4x4& m)
{
f32x4 r0 = m.r0;
f32x4 r1 = m.r1;
f32x4 r2 = m.r2;
f32x4 r3 = m.r3;
_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
return { r0, r1, r2, r3 };
}
inline void m_set3x3_transpose(float* dst, f32x4 c0, f32x4 c1, f32x4 c2)
{
dst[0] = v_extract0(c0); // c0.x
dst[1] = v_extract0(c1); // c1.x
dst[2] = v_extract0(c2); // c2.x
dst[4] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 1))); // c0.y
dst[5] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 1))); // c1.y
dst[6] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 1))); // c2.y
dst[8] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 2))); // c0.z
dst[9] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 2))); // c1.z
dst[10] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 2))); // c2.z
}
inline f32x4x4 m_inverse(const f32x4x4& m)
{
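// NOTE: this adjugate-via-cross shortcut treats each row as a 3-vector. For a
// standard transform whose bottom row d is (0, 0, 0, 1), cross(c, d) is zero
// and the computed det collapses to 0; prefer m_inverse_rigid / m_inverse_affine
// for such matrices.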
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;
f32x4 c0 = v_cross(b, c);
f32x4 c1 = v_cross(c, d);
f32x4 c2 = v_cross(d, a);
f32x4 c3 = v_cross(a, b);
f32x4 det = v_dot4(a, c1);
f32x4 invDet = v_rcp_nr(det);
f32x4x4 adj;
adj.r0 = _mm_mul_ps(c1, invDet);
adj.r1 = _mm_mul_ps(c2, invDet);
adj.r2 = _mm_mul_ps(c3, invDet);
adj.r3 = _mm_mul_ps(c0, invDet);
return m_transpose(adj);
}
inline f32x4x4 m_inverse_rigid(const f32x4x4& m)
{
f32x4x4 t = m_transpose(m);
// Extract translation
f32x4 T = v_set(
_mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))),
_mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))),
_mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))),
0.0f
);
f32x4 newT = m_mul_vec4(t, T);
newT = _mm_sub_ps(v_zero(), newT);
// Each row needs its own component of -R^T * T in w; v_preserve_w pulls lane 3,
// so splat the wanted lane first (passing newT directly would write newT.w into every row)
t.r0 = v_preserve_w(t.r0, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(0, 0, 0, 0))); // w = newT.x
t.r1 = v_preserve_w(t.r1, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(1, 1, 1, 1))); // w = newT.y
t.r2 = v_preserve_w(t.r2, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(2, 2, 2, 2))); // w = newT.z
t.r3 = v_set(0, 0, 0, 1);
return t;
}
inline f32x4x4 m_inverse_affine(const f32x4x4& m)
{
// Extract upper 3x3 rows
f32x4 r0 = m.r0;
f32x4 r1 = m.r1;
f32x4 r2 = m.r2;
// Zero translation
r0 = _mm_and_ps(r0, v_mask_xyz());
r1 = _mm_and_ps(r1, v_mask_xyz());
r2 = _mm_and_ps(r2, v_mask_xyz());
// Compute cofactors via cross products
f32x4 c0 = v_cross(r1, r2);
f32x4 c1 = v_cross(r2, r0);
f32x4 c2 = v_cross(r0, r1);
// Determinant
f32x4 det = v_dot3(r0, c0);
// Reciprocal determinant
f32x4 invDet = v_rcp_nr(det);
// Inverse 3x3 (transpose of cofactor matrix)
f32x4 i0 = _mm_mul_ps(c0, invDet);
f32x4 i1 = _mm_mul_ps(c1, invDet);
f32x4 i2 = _mm_mul_ps(c2, invDet);
// Transpose 3x3
f32x4 t0 = i0;
f32x4 t1 = i1;
f32x4 t2 = i2;
f32x4 t3 = _mm_setzero_ps();
_MM_TRANSPOSE4_PS(t0, t1, t2, t3);
// Extract translation
alignas(16) float tmp[4];
tmp[0] = _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3)));
tmp[1] = _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3)));
tmp[2] = _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3)));
tmp[3] = 0.0f;
f32x4 T = v_load(tmp);
// New translation = -R' * T
f32x4 newT = m_mul_vec4({ t0, t1, t2, t3 }, T);
newT = _mm_sub_ps(v_zero(), newT);
// Assemble final matrix
f32x4x4 out;
// Splat each component of the new translation before blending into lane 3
out.r0 = v_preserve_w(t0, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(0, 0, 0, 0))); // w = newT.x
out.r1 = v_preserve_w(t1, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(1, 1, 1, 1))); // w = newT.y
out.r2 = v_preserve_w(t2, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(2, 2, 2, 2))); // w = newT.z
out.r3 = _mm_set_ps(1, 0, 0, 0);
return out;
}
inline f32x4 m_determinant(const f32x4x4& m)
{
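// NOTE: v_cross ignores the w lanes, so this cofactor shortcut equals the full
// 4x4 determinant only when the bottom row is (0, 0, 0, 1), i.e. the engine's
// usual affine transforms; a general projective matrix needs a full expansion.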
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;
f32x4 c0 = v_cross(c, d);
f32x4 c1 = v_cross(d, b);
f32x4 c2 = v_cross(b, c);
f32x4 term0 = _mm_mul_ps(a, c0);
f32x4 term1 = _mm_mul_ps(a, c1);
f32x4 term2 = _mm_mul_ps(a, c2);
f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2));
return v_hadd4(det);
}
inline f32x4 m_determinant_affine(const f32x4x4& m)
{
f32x4 r0 = m.r0;
f32x4 r1 = m.r1;
f32x4 r2 = m.r2;
r0 = _mm_and_ps(r0, v_mask_xyz());
r1 = _mm_and_ps(r1, v_mask_xyz());
r2 = _mm_and_ps(r2, v_mask_xyz());
f32x4 c0 = v_cross(r1, r2);
return v_dot3(r0, c0); // splatted determinant
}
//---------------------------------------------------------
// BATCH INTRINSICS
//---------------------------------------------------------
struct vec4_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
f32x4 w;
};
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
vec4_batch4 r;
r.x = _mm_set_ps(
ptr[9], ptr[6], ptr[3], ptr[0]);
r.y = _mm_set_ps(
ptr[10], ptr[7], ptr[4], ptr[1]);
r.z = _mm_set_ps(
ptr[11], ptr[8], ptr[5], ptr[2]);
if (fillW)
{
r.w = _mm_set1_ps(w);
}
return r;
}
// Store the result back out to a float array 4 at a time
inline void store_f32x4(float* out, f32x4 v)
{
_mm_storeu_ps(out, v);
}
// Store the result back to a float3 array with size of 4
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
{
alignas(16) float xs[4];
alignas(16) float ys[4];
alignas(16) float zs[4];
_mm_store_ps(xs, v.x);
_mm_store_ps(ys, v.y);
_mm_store_ps(zs, v.z);
for (int i = 0; i < 4; ++i)
{
out[i * 3 + 0] = xs[i];
out[i * 3 + 1] = ys[i];
out[i * 3 + 2] = zs[i];
}
}
inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b)
{
f32x4 mulx = _mm_mul_ps(a.x, b.x);
f32x4 muly = _mm_mul_ps(a.y, b.y);
f32x4 mulz = _mm_mul_ps(a.z, b.z);
return _mm_add_ps(_mm_add_ps(mulx, muly), mulz);
}
// Batch-4 matrix * position transform (w assumed 1; uses rows 0-2 of the row-major matrix).
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
{
vec4_batch4 r;
f32x4 m00 = _mm_set1_ps(m[0]);
f32x4 m01 = _mm_set1_ps(m[1]);
f32x4 m02 = _mm_set1_ps(m[2]);
f32x4 m03 = _mm_set1_ps(m[3]);
f32x4 m10 = _mm_set1_ps(m[4]);
f32x4 m11 = _mm_set1_ps(m[5]);
f32x4 m12 = _mm_set1_ps(m[6]);
f32x4 m13 = _mm_set1_ps(m[7]);
f32x4 m20 = _mm_set1_ps(m[8]);
f32x4 m21 = _mm_set1_ps(m[9]);
f32x4 m22 = _mm_set1_ps(m[10]);
f32x4 m23 = _mm_set1_ps(m[11]);
//
// row0 dot
//
r.x = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m00),
_mm_mul_ps(v.y, m01)),
_mm_add_ps(
_mm_mul_ps(v.z, m02), m03));
//
// row1 dot
//
r.y = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m10),
_mm_mul_ps(v.y, m11)),
_mm_add_ps(
_mm_mul_ps(v.z, m12), m13));
//
// row2 dot
//
r.z = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m20),
_mm_mul_ps(v.y, m21)),
_mm_add_ps(
_mm_mul_ps(v.z, m22), m23));
return r;
}
}

View file

@ -151,7 +151,7 @@ inline void m_matF_x_vectorF(const F32 *m, const F32 *v, F32 *vresult)
inline bool mIsEqual( F32 a, F32 b, const F32 epsilon = __EQUAL_CONST_F )
{
F32 diff = a - b;
return diff > -epsilon && diff < epsilon;
}
inline bool mIsZero(const F32 val, const F32 epsilon = __EQUAL_CONST_F )
@ -207,16 +207,16 @@ inline F32 mFmod(const F32 val, const F32 mod)
return fmod(val, mod);
}
inline S32 mRound(const F32 val)
{
return (S32)floor(val + 0.5f);
}
inline F32 mRound(const F32 val, const S32 n)
{
S32 place = (S32) pow(10.0f, n);
return mFloor((val*place)+0.5)/place;
}
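// e.g. mRound(3.14159f, 2): place = 100, mFloor(314.659f) / 100 = 3.14f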
inline F32 mRoundF(const F32 val, const F32 step)
@ -261,15 +261,15 @@ inline F32 mClampF(F32 val, F32 low, F32 high)
inline S32 mWrap(S32 val, S32 low, S32 high)
{
int len = high - low;
return low + (val >= 0 ? val % len : -val % len ? len - (-val % len) : 0);
}
inline F32 mWrapF(F32 val, F32 low, F32 high)
{
F32 t = fmod(val - low, high - low);
return t < 0 ? t + high : t + low;
}
/// Template function for doing a linear interpolation between any two
@ -497,7 +497,7 @@ inline F64 mRadToDeg(F64 r)
inline bool mIsNaN_F( const F32 x )
{
// If x is a floating point variable, then (x != x) will be TRUE if x has the value NaN.
// This is only going to work if the compiler is IEEE 748 compliant.
//
// Tested and working on VC2k5

View file

@ -152,8 +152,8 @@ static void m_point3F_interpolate_C(const F32 *from, const F32 *to, F32 factor,
#ifdef TORQUE_COMPILER_GCC
// remove possibility of aliases
const F32 inverse = 1.0f - factor;
const F32 from0 = from[0], from1 = from[1], from2 = from[2];
const F32 to0 = to[0], to1 = to[1], to2 = to[2];
result[0] = from0 * inverse + to0 * factor;
result[1] = from1 * inverse + to1 * factor;
@ -182,8 +182,8 @@ static void m_point3D_interpolate_C(const F64 *from, const F64 *to, F64 factor,
#ifdef TORQUE_COMPILER_GCC
// remove possibility of aliases
const F64 inverse = 1.0f - factor;
const F64 from0 = from[0], from1 = from[1], from2 = from[2];
const F64 to0 = to[0], to1 = to[1], to2 = to[2];
result[0] = from0 * inverse + to0 * factor;
result[1] = from1 * inverse + to1 * factor;
@ -436,8 +436,8 @@ static void affineInvertTo(const F32 * m, F32 * out)
F32 invDet = 1.0f / (m[idx(0,0)] * d1 + m[idx(0,1)] * d2 + m[idx(0,2)] * d3);
F32 m00 = m[idx(0,0)] * invDet;
F32 m01 = m[idx(0,1)] * invDet;
F32 m02 = m[idx(0,2)] * invDet;
F32 * result = out;
@ -950,4 +950,3 @@ void mInstallLibrary_C()
m_matF_x_scale_x_planeF = m_matF_x_scale_x_planeF_C;
m_matF_x_box3F = m_matF_x_box3F_C;
}

View file

@ -50,13 +50,16 @@
#ifndef _CONSOLE_H_
#include "console/console.h"
#endif
#ifndef _MATH_BACKEND_H_
#include "math/public/math_backend.h"
#endif
#ifndef USE_TEMPLATE_MATRIX
/// 4x4 Matrix Class
///
/// This runs at F32 precision.
using namespace math_backend::mat44::dispatch;
class MatrixF
{
friend class MatrixFEngineExport;
@ -128,7 +131,7 @@ public:
EulerF toEuler() const;
F32 determinant() const {
return m_matF_determinant(*this);
return GetMat44().determinant(*this);
}
/// Compute the inverse of the matrix.
@ -228,6 +231,8 @@ public:
void mul( Point4F& p ) const; ///< M * p -> p (full [4x4] * [1x4])
void mulP( Point3F& p ) const; ///< M * p -> p (assume w = 1.0f)
void mulP( const Point3F &p, Point3F *d) const; ///< M * p -> d (assume w = 1.0f)
void batch_mulP(Point3F* points, U32 count) const;
void batch_mulP(const Point3F* points, U32 count, Point3F* out) const;
void mulV( VectorF& p ) const; ///< M * v -> v (assume w = 0.0f)
void mulV( const VectorF &p, Point3F *d) const; ///< M * v -> d (assume w = 0.0f)
@ -372,71 +377,66 @@ inline MatrixF& MatrixF::identity()
inline MatrixF& MatrixF::inverse()
{
m_matF_inverse(m);
GetMat44().inverse(m);
return (*this);
}
inline void MatrixF::invertTo( MatrixF *out )
{
m_matF_invert_to(m,*out);
GetMat44().inverse_to(*this, *out);
}
inline MatrixF& MatrixF::affineInverse()
{
// AssertFatal(isAffine() == true, "Error, this matrix is not an affine transform");
m_matF_affineInverse(m);
GetMat44().affine_inverse(m);
return (*this);
}
inline MatrixF& MatrixF::transpose()
{
m_matF_transpose(m);
GetMat44().transpose(m);
return (*this);
}
inline MatrixF& MatrixF::scale(const Point3F& p)
{
m_matF_scale(m,p);
GetMat44().scale(m, p);
return *this;
}
inline Point3F MatrixF::getScale() const
{
Point3F scale;
scale.x = mSqrt(m[0]*m[0] + m[4] * m[4] + m[8] * m[8]);
scale.y = mSqrt(m[1]*m[1] + m[5] * m[5] + m[9] * m[9]);
scale.z = mSqrt(m[2]*m[2] + m[6] * m[6] + m[10] * m[10]);
GetMat44().get_scale(m, scale);
return scale;
}
inline void MatrixF::normalize()
{
m_matF_normalize(m);
GetMat44().normalize(m);
}
inline MatrixF& MatrixF::mul( const MatrixF &a )
{ // M * a -> M
AssertFatal(&a != this, "MatrixF::mul - a.mul(a) is invalid!");
MatrixF tempThis(*this);
m_matF_x_matF(tempThis, a, *this);
GetMat44().mul_mat44(tempThis, a, *this);
return (*this);
}
inline MatrixF& MatrixF::mulL( const MatrixF &a )
{ // a * M -> M
AssertFatal(&a != this, "MatrixF::mulL - a.mul(a) is invalid!");
MatrixF tempThis(*this);
m_matF_x_matF(a, tempThis, *this);
GetMat44().mul_mat44(a, tempThis, *this);
return (*this);
}
inline MatrixF& MatrixF::mul( const MatrixF &a, const MatrixF &b )
{ // a * b -> M
AssertFatal((&a != this) && (&b != this), "MatrixF::mul - a.mul(a, b) a.mul(b, a) a.mul(a, a) is invalid!");
m_matF_x_matF(a, b, *this);
GetMat44().mul_mat44(a, b, *this);
return (*this);
}
@ -461,7 +461,7 @@ inline MatrixF& MatrixF::mul(const MatrixF &a, const F32 b)
inline void MatrixF::mul( Point4F& p ) const
{
Point4F temp;
m_matF_x_point4F(*this, &p.x, &temp.x);
GetMat44().mul_float4(*this, p, temp);
p = temp;
}
@ -469,28 +469,62 @@ inline void MatrixF::mulP( Point3F& p) const
{
// M * p -> d
Point3F d;
m_matF_x_point3F(*this, &p.x, &d.x);
GetMat44().mul_pos3(*this, p, d);
p = d;
}
inline void MatrixF::mulP( const Point3F &p, Point3F *d) const
{
// M * p -> d
m_matF_x_point3F(*this, &p.x, &d->x);
GetMat44().mul_pos3(*this, p, *d);
}
inline void MatrixF::batch_mulP(Point3F* points, U32 count) const
{
// In-place transform: the backend kernel may not support aliased
// source/destination buffers, so stage the results in a temporary.
Point3F* temp = new Point3F[count];
GetMat44().batch_mul_pos3(m, reinterpret_cast<const float*>(points), count, reinterpret_cast<float*>(temp));
for (U32 i = 0; i < count; ++i)
points[i] = temp[i];
delete[] temp;
}
inline void MatrixF::batch_mulP(const Point3F* points, U32 count, Point3F* out) const
{
// The caller supplies a distinct destination buffer: let the backend
// write straight into it, no temporary or copy-back needed.
GetMat44().batch_mul_pos3(m, reinterpret_cast<const float*>(points), count, reinterpret_cast<float*>(out));
}
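A hedged usage sketch, not part of this commit: a hypothetical caller showing how the batched overload replaces a per-point loop with a single call through the dispatch table.

// Hypothetical caller, for illustration only.
void transformVerts(const MatrixF& objToWorld, Point3F* verts, U32 count)
{
   objToWorld.batch_mulP(verts, count); // in-place, w assumed 1.0
}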
inline void MatrixF::mulV( VectorF& v) const
{
// M * v -> v
VectorF temp;
m_matF_x_vectorF(*this, &v.x, &temp.x);
GetMat44().mul_vec3(*this, v, temp);
v = temp;
}
inline void MatrixF::mulV( const VectorF &v, Point3F *d) const
{
// M * v -> d
m_matF_x_vectorF(*this, &v.x, &d->x);
GetMat44().mul_vec3(*this, v, *d);
}
inline void MatrixF::mul(Box3F& b) const
@ -634,14 +668,14 @@ inline MatrixF operator * ( const MatrixF &m1, const MatrixF &m2 )
{
// temp = m1 * m2
MatrixF temp;
m_matF_x_matF(m1, m2, temp);
GetMat44().mul_mat44(m1, m2, temp);
return temp;
}
inline MatrixF& MatrixF::operator *= ( const MatrixF &m1 )
{
MatrixF tempThis(*this);
m_matF_x_matF(tempThis, m1, *this);
GetMat44().mul_mat44(tempThis, m1, *this);
return (*this);
}
@ -665,7 +699,9 @@ inline bool MatrixF::isNaN()
inline void mTransformPlane(const MatrixF& mat, const Point3F& scale, const PlaneF& plane, PlaneF * result)
{
m_matF_x_scale_x_planeF(mat, &scale.x, &plane.x, &result->x);
GetMat44().transform_plane(mat, scale, plane, *result);
}
#else // !USE_TEMPLATE_MATRIX

View file

@ -29,6 +29,11 @@
#ifndef _MPOINT2_H_
#include "math/mPoint2.h"
#endif
#ifndef _MATH_BACKEND_H_
#include "math/public/math_backend.h"
#endif
#include <array>
//------------------------------------------------------------------------------
/// 3D integer point
@ -97,6 +102,7 @@ public:
class Point3D;
//------------------------------------------------------------------------------
using namespace math_backend::float3::dispatch;
class Point3F
{
//-------------------------------------- Public data
@ -497,7 +503,8 @@ inline void Point3F::setMax(const Point3F& _test)
inline void Point3F::interpolate(const Point3F& _from, const Point3F& _to, F32 _factor)
{
AssertFatal(_factor >= 0.0f && _factor <= 1.0f, "Out of bound interpolation factor");
m_point3F_interpolate( _from, _to, _factor, *this);
GetFloat3().lerp(_from, _to, _factor, *this);
}
inline void Point3F::zero()
@ -585,31 +592,27 @@ inline void Point3F::neg()
inline void Point3F::convolve(const Point3F& c)
{
x *= c.x;
y *= c.y;
z *= c.z;
GetFloat3().mul(*this, c, *this);
}
inline void Point3F::convolveInverse(const Point3F& c)
{
x /= c.x;
y /= c.y;
z /= c.z;
GetFloat3().div(*this, c, *this);
}
inline F32 Point3F::lenSquared() const
{
return (x * x) + (y * y) + (z * z);
return GetFloat3().lengthSquared(*this);
}
inline F32 Point3F::len() const
{
return mSqrt(x*x + y*y + z*z);
return GetFloat3().length(*this);
}
inline void Point3F::normalize()
{
m_point3F_normalize(*this);
GetFloat3().normalize(*this);
}
inline F32 Point3F::magnitudeSafe() const
@ -626,18 +629,13 @@ inline F32 Point3F::magnitudeSafe() const
inline void Point3F::normalizeSafe()
{
F32 vmag = magnitudeSafe();
if( vmag > POINT_EPSILON )
{
*this *= F32(1.0 / vmag);
}
if (vmag > POINT_EPSILON) // keep the zero-length guard; the backend normalize is not guaranteed safe for degenerate vectors
GetFloat3().normalize(*this);
}
inline void Point3F::normalize(F32 val)
{
m_point3F_normalize_f(*this, val);
GetFloat3().normalize_mag(*this, val);
}
inline bool Point3F::operator==(const Point3F& _test) const
@ -652,52 +650,49 @@ inline bool Point3F::operator!=(const Point3F& _test) const
inline Point3F Point3F::operator+(const Point3F& _add) const
{
return Point3F(x + _add.x, y + _add.y, z + _add.z);
Point3F temp;
GetFloat3().add(*this, _add, temp);
return temp;
}
inline Point3F Point3F::operator-(const Point3F& _rSub) const
{
return Point3F(x - _rSub.x, y - _rSub.y, z - _rSub.z);
Point3F temp;
GetFloat3().sub(*this, _rSub, temp);
return temp;
}
inline Point3F& Point3F::operator+=(const Point3F& _add)
{
x += _add.x;
y += _add.y;
z += _add.z;
GetFloat3().add(*this, _add, *this);
return *this;
}
inline Point3F& Point3F::operator-=(const Point3F& _rSub)
{
x -= _rSub.x;
y -= _rSub.y;
z -= _rSub.z;
GetFloat3().sub(*this, _rSub, *this);
return *this;
}
inline Point3F Point3F::operator*(F32 _mul) const
{
return Point3F(x * _mul, y * _mul, z * _mul);
Point3F temp;
GetFloat3().mul_scalar(*this, _mul, temp);
return temp;
}
inline Point3F Point3F::operator/(F32 _div) const
{
AssertFatal(_div != 0.0f, "Error, div by zero attempted");
F32 inv = 1.0f / _div;
return Point3F(x * inv, y * inv, z * inv);
Point3F temp;
GetFloat3().div_scalar(*this, _div, temp);
return temp;
}
inline Point3F& Point3F::operator*=(F32 _mul)
{
x *= _mul;
y *= _mul;
z *= _mul;
GetFloat3().mul_scalar(*this, _mul, *this);
return *this;
}
@ -705,39 +700,35 @@ inline Point3F& Point3F::operator/=(F32 _div)
{
AssertFatal(_div != 0.0f, "Error, div by zero attempted");
F32 inv = 1.0f / _div;
x *= inv;
y *= inv;
z *= inv;
GetFloat3().div_scalar(*this, _div, *this);
return *this;
}
inline Point3F Point3F::operator*(const Point3F &_vec) const
{
return Point3F(x * _vec.x, y * _vec.y, z * _vec.z);
Point3F temp;
GetFloat3().mul(*this, _vec, temp);
return temp;
}
inline Point3F& Point3F::operator*=(const Point3F &_vec)
{
x *= _vec.x;
y *= _vec.y;
z *= _vec.z;
GetFloat3().mul(*this, _vec, *this);
return *this;
}
inline Point3F Point3F::operator/(const Point3F &_vec) const
{
AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted");
return Point3F(x / _vec.x, y / _vec.y, z / _vec.z);
Point3F temp;
GetFloat3().div(*this, _vec, temp);
return temp;
}
inline Point3F& Point3F::operator/=(const Point3F &_vec)
{
AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted");
x /= _vec.x;
y /= _vec.y;
z /= _vec.z;
GetFloat3().div(*this, _vec, *this);
return *this;
}
@ -989,7 +980,9 @@ inline Point3I operator*(S32 mul, const Point3I& multiplicand)
inline Point3F operator*(F32 mul, const Point3F& multiplicand)
{
return multiplicand * mul;
Point3F temp;
GetFloat3().mul_scalar(multiplicand, mul, temp);
return temp;
}
inline Point3D operator*(F64 mul, const Point3D& multiplicand)
@ -999,7 +992,7 @@ inline Point3D operator*(F64 mul, const Point3D& multiplicand)
inline F32 mDot(const Point3F &p1, const Point3F &p2)
{
return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z);
return GetFloat3().dot(p1, p2);
}
inline F64 mDot(const Point3D &p1, const Point3D &p2)
@ -1009,9 +1002,7 @@ inline F64 mDot(const Point3D &p1, const Point3D &p2)
inline void mCross(const Point3F &a, const Point3F &b, Point3F *res)
{
res->x = (a.y * b.z) - (a.z * b.y);
res->y = (a.z * b.x) - (a.x * b.z);
res->z = (a.x * b.y) - (a.y * b.x);
GetFloat3().cross(a, b, *res);
}
inline void mCross(const Point3D &a, const Point3D &b, Point3D *res)
@ -1024,7 +1015,7 @@ inline void mCross(const Point3D &a, const Point3D &b, Point3D *res)
inline Point3F mCross(const Point3F &a, const Point3F &b)
{
Point3F r;
mCross( a, b, &r );
GetFloat3().cross(a, b, r);
return r;
}

View file

@ -26,10 +26,12 @@
#ifndef _MMATHFN_H_
#include "math/mMathFn.h"
#endif
#ifndef _MPOINT3_H_
#include "math/mPoint3.h"
#endif
#ifndef _MATH_BACKEND_H_
#include "math/public/math_backend.h"
#endif
//------------------------------------------------------------------------------
@ -61,6 +63,8 @@ class Point4I
/// Uses F32 internally.
///
/// Useful for representing quaternions and other 4d beasties.
using namespace math_backend::float4::dispatch;
class Point4F
{
//-------------------------------------- Public data
@ -152,15 +156,12 @@ inline void Point4F::set(F32 _x, F32 _y, F32 _z, F32 _w)
inline F32 Point4F::len() const
{
return mSqrt(x*x + y*y + z*z + w*w);
return GetFloat4().length(*this);
}
inline void Point4F::interpolate(const Point4F& _from, const Point4F& _to, F32 _factor)
{
x = (_from.x * (1.0f - _factor)) + (_to.x * _factor);
y = (_from.y * (1.0f - _factor)) + (_to.y * _factor);
z = (_from.z * (1.0f - _factor)) + (_to.z * _factor);
w = (_from.w * (1.0f - _factor)) + (_to.w * _factor);
GetFloat4().lerp(_from, _to, _factor, *this);
}
inline void Point4F::zero()
@ -193,55 +194,55 @@ inline Point4F& Point4F::operator/=(F32 scalar)
if (mIsZero(scalar))
return *this;
F32 denom = 1 / scalar;
x *= denom;
y *= denom;
z *= denom;
w *= denom;
GetFloat4().div_scalar(*this, scalar, *this);
return *this;
}
inline Point4F Point4F::operator+(const Point4F& _add) const
{
return Point4F( x + _add.x, y + _add.y, z + _add.z, w + _add.w );
Point4F res;
GetFloat4().add(*this, _add, res);
return res;
}
inline Point4F& Point4F::operator+=(const Point4F& _add)
{
x += _add.x;
y += _add.y;
z += _add.z;
w += _add.w;
GetFloat4().add(*this, _add, *this);
return *this;
}
inline Point4F Point4F::operator-(const Point4F& _rSub) const
{
return Point4F( x - _rSub.x, y - _rSub.y, z - _rSub.z, w - _rSub.w );
Point4F res;
GetFloat4().sub(*this, _rSub, res);
return res;
}
inline Point4F Point4F::operator*(const Point4F &_vec) const
{
return Point4F(x * _vec.x, y * _vec.y, z * _vec.z, w * _vec.w);
Point4F res;
GetFloat4().mul(*this, _vec, res);
return res;
}
inline Point4F Point4F::operator*(F32 _mul) const
{
return Point4F(x * _mul, y * _mul, z * _mul, w * _mul);
Point4F res;
GetFloat4().mul_scalar(*this, _mul, res);
return res;
}
inline Point4F Point4F::operator /(F32 t) const
{
F32 f = 1.0f / t;
return Point4F( x * f, y * f, z * f, w * f );
Point4F res;
GetFloat4().div_scalar(*this, t, res);
return res;
}
inline F32 mDot(const Point4F &p1, const Point4F &p2)
{
return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z + p1.w*p2.w);
return GetFloat4().dot(p1, p2);
}
//------------------------------------------------------------------------------

View file

@ -0,0 +1,39 @@
#pragma once
#ifndef _FLOAT3_DISPATCH_H_
#define _FLOAT3_DISPATCH_H_
#include <cstdint>
namespace math_backend::float3::dispatch
{
struct Float3Funcs
{
void (*add)(const float*, const float*, float*) = nullptr;
void (*sub)(const float*, const float*, float*) = nullptr;
void (*mul)(const float*, const float*, float*) = nullptr;
void (*mul_scalar)(const float*, float, float*) = nullptr;
void (*div)(const float*, const float*, float*) = nullptr;
void (*div_scalar)(const float*, float, float*) = nullptr;
float (*dot)(const float*, const float*) = nullptr;
float (*length)(const float*) = nullptr;
float (*lengthSquared)(const float*) = nullptr;
void (*normalize)(float*) = nullptr;
void (*normalize_mag)(float*, float) = nullptr;
void (*lerp)(const float*, const float*, float, float*) = nullptr;
void (*cross)(const float*, const float*, float*) = nullptr;
};
// Global dispatch table
extern Float3Funcs gFloat3;
// Backend installers (defined in ISA libraries)
void install_scalar();
void install_sse2();
void install_sse41();
void install_avx();
void install_avx2();
void install_neon();
}
#endif // !_FLOAT3_DISPATCH_H_
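For readers new to the dispatch-table pattern: each ISA library fills the function pointers with its own kernels. A minimal sketch of what the scalar installer might look like follows; the kernel names are illustrative assumptions, not the commit's actual impl code, and the real installer assigns every pointer in the table.

// Sketch only; kernel names are hypothetical.
#include "math/public/float3_dispatch.h"
#include <cmath>

namespace math_backend::float3::dispatch
{
   namespace
   {
      void scalar_add(const float* a, const float* b, float* r)
      {
         r[0] = a[0] + b[0]; r[1] = a[1] + b[1]; r[2] = a[2] + b[2];
      }
      float scalar_dot(const float* a, const float* b)
      {
         return a[0]*b[0] + a[1]*b[1] + a[2]*b[2];
      }
      void scalar_normalize(float* v)
      {
         float len = std::sqrt(scalar_dot(v, v));
         if (len > 0.0f) { v[0] /= len; v[1] /= len; v[2] /= len; }
      }
   }

   void install_scalar()
   {
      gFloat3.add       = &scalar_add;
      gFloat3.dot       = &scalar_dot;
      gFloat3.normalize = &scalar_normalize;
      // ...remaining pointers are assigned the same way...
   }
}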

View file

@ -0,0 +1,39 @@
#pragma once
#ifndef _FLOAT4_DISPATCH_H_
#define _FLOAT4_DISPATCH_H_
#include <cstdint>
namespace math_backend::float4::dispatch
{
struct Float4Funcs
{
void (*add)(const float*, const float*, float*) = nullptr;
void (*sub)(const float*, const float*, float*) = nullptr;
void (*mul)(const float*, const float*, float*) = nullptr;
void (*mul_scalar)(const float*, float, float*) = nullptr;
void (*div)(const float*, const float*, float*) = nullptr;
void (*div_scalar)(const float*, float, float*) = nullptr;
float (*dot)(const float*, const float*) = nullptr;
float (*length)(const float*) = nullptr;
float (*lengthSquared)(const float*) = nullptr;
void (*normalize)(float*) = nullptr;
void (*normalize_mag)(float*, float) = nullptr;
void (*lerp)(const float*, const float*, float, float*) = nullptr;
void (*cross)(const float*, const float*, float*) = nullptr;
};
// Global dispatch table
extern Float4Funcs gFloat4;
// Backend installers (defined in ISA libraries)
void install_scalar();
void install_sse2();
void install_sse41();
void install_avx();
void install_avx2();
void install_neon();
}
#endif // !_FLOAT4_DISPATCH_H_
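As a companion sketch, here is roughly what an SSE2 kernel behind the four-lane table could look like using raw intrinsics. This is an assumption about the impl style, not the commit's actual code; unaligned loads are used because Point4F carries no alignment guarantee.

#include <emmintrin.h> // SSE2

// Hypothetical SSE2 kernel: r = a + b over four float lanes.
static void sse2_add4(const float* a, const float* b, float* r)
{
   __m128 va = _mm_loadu_ps(a); // unaligned: Point4F is not alignas(16)
   __m128 vb = _mm_loadu_ps(b);
   _mm_storeu_ps(r, _mm_add_ps(va, vb));
}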

View file

@ -0,0 +1,39 @@
#pragma once
#ifndef _MAT44_DISPATCH_H_
#define _MAT44_DISPATCH_H_
namespace math_backend::mat44::dispatch
{
struct Mat44Funcs
{
void (*transpose)(float*) = nullptr;
void (*inverse)(float*) = nullptr;
void (*inverse_to)(const float*, float*) = nullptr;
void (*affine_inverse)(float*) = nullptr;
void (*normalize)(float*) = nullptr;
void (*mul_mat44)(const float* a, const float* b, float* r) = nullptr;
void (*mul_pos3)(const float* a, const float* b, float* r) = nullptr;
void (*mul_vec3)(const float* a, const float* b, float* r) = nullptr;
void (*mul_float4)(const float* a, const float* b, float* r) = nullptr;
void (*transform_plane)(const float* m, const float* s, const float* p, float* r) = nullptr;
float (*determinant)(const float*) = nullptr;
void (*scale)(float*, const float*) = nullptr;
void (*get_scale)(const float*, float*) = nullptr;
void (*batch_mul_pos3)(const float* m, const float* pts, int count, float* result) = nullptr;
};
// Global dispatch table
extern Mat44Funcs gMat44;
// Backend installers (defined in ISA libraries)
void install_scalar();
void install_sse2();
void install_sse41();
void install_avx();
void install_avx2();
void install_neon();
}
#endif // !_MAT44_DISPATCH_H_
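A scalar reference for the heaviest entry in the table, mul_mat44, assuming MatrixF's row-major storage (m[row * 4 + col]). The no-aliasing requirement on r matches how MatrixF::mul stages a temporary before calling through the table. Sketch only, not the commit's impl code.

// Hypothetical scalar kernel: r = a * b, row-major 4x4; r must not alias a or b.
static void scalar_mul_mat44(const float* a, const float* b, float* r)
{
   for (int row = 0; row < 4; ++row)
      for (int col = 0; col < 4; ++col)
      {
         float sum = 0.0f;
         for (int k = 0; k < 4; ++k)
            sum += a[row * 4 + k] * b[k * 4 + col];
         r[row * 4 + col] = sum;
      }
}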

View file

@ -0,0 +1,103 @@
#include "math/public/math_backend.h"
namespace math_backend::float4::dispatch
{
// Single definition of the global dispatch table
Float4Funcs gFloat4{};
}
namespace math_backend::float3::dispatch
{
// Single definition of the global dispatch table
Float3Funcs gFloat3{};
}
namespace math_backend::mat44::dispatch
{
Mat44Funcs gMat44{};
}
namespace math_backend
{
// Use an anonymous namespace for the static initializer
namespace {
struct ScalarInitializer
{
ScalarInitializer()
{
// Install scalar defaults immediately for all types
float4::dispatch::install_scalar();
float3::dispatch::install_scalar();
mat44::dispatch::install_scalar();
}
};
// Static instance ensures constructor runs before main()
ScalarInitializer g_scalarInitializer;
}
}
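Both bootstrap paths exist because C++ gives no ordering guarantee between static initializers in different translation units: another file-scope constructor may call into the math types before g_scalarInitializer runs, and the call_once fallback in the getters covers exactly that window.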
math_backend::backend math_backend::choose_backend(U32 cpu_flags)
{
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86)
if (cpu_flags & CPU_PROP_AVX2) return backend::avx2;
if (cpu_flags & CPU_PROP_AVX) return backend::avx;
if (cpu_flags & CPU_PROP_SSE4_1) return backend::sse41;
if (cpu_flags & CPU_PROP_SSE2) return backend::sse2;
#elif defined(__aarch64__) || defined(_M_ARM64) || defined(__ARM_NEON)
if (cpu_flags & CPU_PROP_NEON) return backend::neon;
#endif
return backend::scalar;
}
void math_backend::install_from_cpu_flags(U32 cpu_flags)
{
g_backend = choose_backend(cpu_flags);
switch (g_backend)
{
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86)
case backend::avx2:
float4::dispatch::install_avx2();
float3::dispatch::install_avx2();
mat44::dispatch::install_avx2();
break;
case backend::avx:
float4::dispatch::install_avx();
float3::dispatch::install_avx();
mat44::dispatch::install_avx();
break;
case backend::sse41:
float4::dispatch::install_sse41();
float3::dispatch::install_sse41();
mat44::dispatch::install_sse41();
break;
case backend::sse2:
float4::dispatch::install_sse2();
float3::dispatch::install_sse2();
mat44::dispatch::install_sse2();
break;
#elif defined(__aarch64__) || defined(_M_ARM64) || defined(__ARM_NEON)
case backend::neon:
float4::dispatch::install_neon();
float3::dispatch::install_neon();
mat44::dispatch::install_neon();
break;
#endif
default:
float4::dispatch::install_scalar();
float3::dispatch::install_scalar();
mat44::dispatch::install_scalar();
break;
}
}

View file

@ -0,0 +1,95 @@
#pragma once
#ifndef _MATH_BACKEND_H_
#define _MATH_BACKEND_H_
#ifndef _MCONSTANTS_H_
#include "math/mConstants.h"
#endif
#ifndef _PLATFORMASSERT_H_
#include "platform/platformAssert.h"
#endif
#ifndef _FLOAT4_DISPATCH_H_
#include "math/public/float4_dispatch.h"
#endif
#ifndef _FLOAT3_DISPATCH_H_
#include "math/public/float3_dispatch.h"
#endif
#ifndef _MAT44_DISPATCH_H_
#include "math/public/mat44_dispatch.h"
#endif
namespace math_backend
{
enum class backend
{
scalar,
sse2,
sse41,
avx,
avx2,
neon
};
inline backend g_backend = backend::scalar; // inline (C++17): one shared definition across translation units, not a per-TU static copy
backend choose_backend(U32 cpu_flags);
void install_from_cpu_flags(U32 cpu_flags);
}
#include <mutex>
namespace math_backend::float4::dispatch
{
//--------------------------------------------------
// Thread-safe getter, ensures scalar installed if empty
//--------------------------------------------------
inline Float4Funcs& GetFloat4()
{
if (gFloat4.mul == nullptr)
{
static std::once_flag once;
std::call_once(once, []{
install_scalar();
});
}
return gFloat4;
}
}
namespace math_backend::float3::dispatch
{
//--------------------------------------------------
// Thread-safe getter, ensures scalar installed if empty
//--------------------------------------------------
inline Float3Funcs& GetFloat3()
{
if (gFloat3.mul == nullptr)
{
static std::once_flag once;
std::call_once(once, []{
install_scalar();
});
}
return gFloat3;
}
}
namespace math_backend::mat44::dispatch
{
//--------------------------------------------------
// Thread-safe getter, ensures scalar installed if empty
//--------------------------------------------------
inline Mat44Funcs& GetMat44()
{
if (gMat44.mul_mat44 == nullptr)
{
static std::once_flag once;
std::call_once(once, []{
install_scalar();
});
}
return gMat44;
}
}
#endif // !_MATH_BACKEND_H_
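A hypothetical illustration of why the fallback matters: a file-scope constant that normalizes during static initialization still gets correct (scalar) results, even if Math::init() has not yet installed an ISA backend.

// Runs before main(); GetFloat3() lazily installs the scalar table.
static const Point3F gDiagonal = []
{
   Point3F v(1.0f, 1.0f, 1.0f);
   v.normalize(); // dispatches through gFloat3
   return v;
}();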

View file

@ -381,8 +381,10 @@ void FrustumData::_update() const
// Transform the points into the desired culling space.
for( U32 i = 0; i < mPoints.size(); ++ i )
mTransform.mulP( mPoints[ i ] );
mTransform.batch_mulP(mPoints.address(), mPoints.size());
// Update the axis aligned bounding box from
// the newly transformed points.

View file

@ -75,10 +75,12 @@ enum ProcessorProperties
CPU_PROP_SSE4_1 = (1<<7), ///< Supports SSE4_1 instruction set extension.
CPU_PROP_SSE4_2 = (1<<8), ///< Supports SSE4_2 instruction set extension.
CPU_PROP_AVX = (1<<9), ///< Supports AVX256 instruction set extension.
CPU_PROP_MP = (1<<10), ///< This is a multi-processor system.
CPU_PROP_LE = (1<<11), ///< This processor is LITTLE ENDIAN.
CPU_PROP_64bit = (1<<12), ///< This processor is 64-bit capable
CPU_PROP_NEON = (1<<13), ///< Supports the Arm Neon instruction set extension.
CPU_PROP_AVX2 = (1<<10), ///< Supports AVX2 instruction set extension.
CPU_PROP_AVX512 = (1<<11), ///< Supports AVX512 instruction set extension.
CPU_PROP_MP = (1<<12), ///< This is a multi-processor system.
CPU_PROP_LE = (1<<13), ///< This processor is LITTLE ENDIAN.
CPU_PROP_64bit = (1<<14), ///< This processor is 64-bit capable
CPU_PROP_NEON = (1<<15), ///< Supports the Arm Neon instruction set extension.
};
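Note that inserting CPU_PROP_AVX2 and CPU_PROP_AVX512 renumbers CPU_PROP_MP and everything after it, so anything that hard-coded the old bit values rather than the enum names needs a rebuild. Symbolic checks are unaffected:

// Unaffected by the renumbering, since it uses the enum symbolically.
if (Platform::SystemInfo.processor.properties & CPU_PROP_AVX2)
   Con::printf("AVX2 path available");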
/// Processor info manager.

View file

@ -25,6 +25,7 @@
#import "math/mMath.h"
#import "core/strings/stringFunctions.h"
#include "console/engineAPI.h"
#include "math/public/math_backend.h"
extern void mInstallLibrary_C();
@ -107,14 +108,10 @@ void Math::init(U32 properties)
Con::printf("Math Init:");
Con::printf(" Installing Standard C extensions");
mInstallLibrary_C();
#ifdef TORQUE_CPU_X86
if( properties & CPU_PROP_SSE )
{
Con::printf( " Installing SSE extensions" );
}
#endif
Con::printf( " Installing ISA extensions" );
math_backend::install_from_cpu_flags(properties);
Con::printf(" ");
}

View file

@ -27,6 +27,7 @@
#include "math/mMath.h"
#include "core/strings/stringFunctions.h"
#include "console/engineAPI.h"
#include "math/public/math_backend.h"
extern void mInstallLibrary_C();
extern void mInstallLibrary_ASM();
@ -90,29 +91,14 @@ void Math::init(U32 properties)
Con::printf(" Installing Standard C extensions");
mInstallLibrary_C();
Con::printf(" Installing ISA extensions");
math_backend::install_from_cpu_flags(properties);
#if defined(TORQUE_CPU_X32) || defined(TORQUE_CPU_X64)
Con::printf(" Installing Assembly extensions");
mInstallLibrary_ASM();
#endif
if (properties & CPU_PROP_FPU)
{
Con::printf(" Installing FPU extensions");
}
if (properties & CPU_PROP_MMX)
{
Con::printf(" Installing MMX extensions");
}
#if !defined(__MWERKS__) || (__MWERKS__ >= 0x2400)
if (properties & CPU_PROP_SSE)
{
Con::printf(" Installing SSE extensions");
}
#endif //mwerks>2.4
Con::printf(" ");
}

View file

@ -72,13 +72,15 @@ enum CpuFlags
BIT_XSAVE_RESTORE = BIT(27),
BIT_AVX = BIT(28),
BIT_AVX2 = BIT(5), // CPUID.(EAX=7,ECX=0):EBX bit 5
BIT_AVX512F = BIT(16) // CPUID.(EAX=7,ECX=0):EBX bit 16
};
static void detectCpuFeatures(Platform::SystemInfo_struct::Processor &processor)
{
S32 cpuInfo[4];
__cpuid(cpuInfo, 1);
//U32 eax = cpuInfo[0]; // eax
int nIds = cpuInfo[0];
U32 edx = cpuInfo[3]; // edx
U32 ecx = cpuInfo[2]; // ecx
@ -90,6 +92,18 @@ static void detectCpuFeatures(Platform::SystemInfo_struct::Processor &processor)
processor.properties |= (ecx & BIT_SSE4_1) ? CPU_PROP_SSE4_1 : 0;
processor.properties |= (ecx & BIT_SSE4_2) ? CPU_PROP_SSE4_2 : 0;
if (nIds >= 7)
{
int ext[4];
__cpuidex(ext, 7, 0);
if (ext[1] & (BIT_AVX2)) // EBX bit 5
processor.properties |= CPU_PROP_AVX2;
if (ext[1] & (BIT_AVX512F)) // AVX-512 Foundation
processor.properties |= CPU_PROP_AVX512;
}
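One hedged caveat, not addressed in this commit: like AVX, AVX-512 requires OS support for the wider ZMM register state, which is visible through XCR0. A possible follow-up hardening, using the MSVC _xgetbv intrinsic alongside the __cpuidex call already used here:

// Sketch: gate CPU_PROP_AVX512 on OS state support as well.
// XCR0 bits: SSE(1), AVX(2), opmask(5), ZMM_Hi256(6), Hi16_ZMM(7) -> 0xE6.
if ((ecx & BIT_XSAVE_RESTORE) && (ext[1] & BIT_AVX512F))
{
   unsigned long long xcr0 = _xgetbv(0);
   if ((xcr0 & 0xE6) == 0xE6)
      processor.properties |= CPU_PROP_AVX512;
}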
// AVX detection requires that xsaverestore is supported
if (ecx & BIT_XSAVE_RESTORE && ecx & BIT_AVX)
{
@ -176,6 +190,10 @@ void Processor::init()
Con::printf(" SSE4.2 detected" );
if (Platform::SystemInfo.processor.properties & CPU_PROP_AVX)
Con::printf(" AVX detected");
if (Platform::SystemInfo.processor.properties & CPU_PROP_AVX2)
Con::printf(" AVX2 detected");
if (Platform::SystemInfo.processor.properties & CPU_PROP_AVX512)
Con::printf(" AVX512 detected");
if (Platform::SystemInfo.processor.properties & CPU_PROP_MP)
Con::printf(" MultiCore CPU detected [%i cores, %i logical]", Platform::SystemInfo.processor.numPhysicalProcessors, Platform::SystemInfo.processor.numLogicalProcessors);

View file

@ -25,7 +25,7 @@
#include "console/engineAPI.h"
#include "math/mMath.h"
#include "math/public/math_backend.h"
extern void mInstallLibrary_C();
extern void mInstallLibrary_ASM();
@ -98,24 +98,12 @@ void Math::init(U32 properties)
Con::printf(" Installing Standard C extensions");
mInstallLibrary_C();
Con::printf(" Installing ISA extensions");
math_backend::install_from_cpu_flags(properties);
Con::printf(" Installing Assembly extensions");
mInstallLibrary_ASM();
if (properties & CPU_PROP_FPU)
{
Con::printf(" Installing FPU extensions");
}
if (properties & CPU_PROP_MMX)
{
Con::printf(" Installing MMX extensions");
}
if (properties & CPU_PROP_SSE)
{
Con::printf(" Installing SSE extensions");
}
Con::printf(" ");
}

View file

@ -250,7 +250,7 @@ MatrixF AssimpAppNode::getNodeTransform(F32 time)
// Check for inverted node coordinate spaces => can happen when modelers
// use the 'mirror' tool in their 3d app. Shows up as negative <scale>
// transforms in the collada model.
if (m_matF_determinant(nodeTransform) < 0.0f)
if (nodeTransform.determinant() < 0.0f)
{
// Mark this node as inverted so we can mirror mesh geometry, then
// de-invert the transform matrix

View file

@ -1099,7 +1099,7 @@ void ColladaAppMesh::lookupSkinData()
// Inverted node coordinate spaces (negative scale factor) are corrected
// in ColladaAppNode::getNodeTransform, so need to apply the same operation
// here to match
if (m_matF_determinant(invBind) < 0.0f)
if (invBind.determinant() < 0.0f)
initialTransforms[iJoint].scale(Point3F(1, 1, -1));
initialTransforms[iJoint].mul(invBind);

View file

@ -192,7 +192,7 @@ MatrixF ColladaAppNode::getNodeTransform(F32 time)
// Check for inverted node coordinate spaces => can happen when modelers
// use the 'mirror' tool in their 3d app. Shows up as negative <scale>
// transforms in the collada model.
if (m_matF_determinant(nodeTransform) < 0.0f)
if (nodeTransform.determinant() < 0.0f)
{
// Mark this node as inverted so we can mirror mesh geometry, then
// de-invert the transform matrix

View file

@ -36,6 +36,8 @@ void FPSTracker::reset()
{
fpsNext = (F32)Platform::getRealMilliseconds()/1000.0f + mUpdateInterval;
fpsAccumTime = 0.0f;
fpsAccumVirtualTime = 0.0f;
fpsRealLast = 0.0f;
fpsReal = 0.0f;
fpsRealMin = 0.000001f; // Avoid division by zero.
@ -51,42 +53,60 @@ void FPSTracker::update()
F32 realSeconds = (F32)Platform::getRealMilliseconds()/1000.0f;
F32 virtualSeconds = (F32)Platform::getVirtualMilliseconds()/1000.0f;
fpsFrames++;
if (fpsFrames > 1)
if (fpsRealLast == 0.0f)
{
fpsReal = fpsReal*(1.0-alpha) + (realSeconds-fpsRealLast)*alpha;
fpsVirtual = fpsVirtual*(1.0-alpha) + (virtualSeconds-fpsVirtualLast)*alpha;
if( fpsFrames > 10 ) // Wait a few frames before updating these.
{
// Update min/max. This is a bit counter-intuitive, as the comparisons are
// inversed because these are all one-over-x values.
if( fpsReal > fpsRealMin )
fpsRealMin = fpsReal;
if( fpsReal < fpsRealMax )
fpsRealMax = fpsReal;
}
fpsRealLast = realSeconds;
fpsVirtualLast = virtualSeconds;
return;
}
fpsRealLast = realSeconds;
F32 frameTimeReal = realSeconds - fpsRealLast;
F32 frameTimeVirtual = virtualSeconds - fpsVirtualLast;
fpsRealLast = realSeconds;
fpsVirtualLast = virtualSeconds;
// update variables every few frames
F32 update = fpsRealLast - fpsNext;
if (update > 0.5f)
{
F32 delta = realSeconds - fpsNext;
Con::setVariable( "fps::frameDelta",avar("%g", delta));
Con::setVariable( "fps::real", avar( "%4.1f", 1.0f / fpsReal ) );
Con::setVariable( "fps::realMin", avar( "%4.1f", 1.0f / fpsRealMin ) );
Con::setVariable( "fps::realMax", avar( "%4.1f", 1.0f / fpsRealMax ) );
Con::setVariable( "fps::virtual", avar( "%4.1f", 1.0f / fpsVirtual ) );
// Accumulate for windowed FPS calculation
fpsFrames++;
fpsAccumTime += frameTimeReal;
fpsAccumVirtualTime += frameTimeVirtual;
if (update > mUpdateInterval)
fpsNext = fpsRealLast + mUpdateInterval;
else
fpsNext += mUpdateInterval;
// Only update console values at interval
if (realSeconds >= fpsNext)
{
fpsReal = 0.0f;
fpsVirtual = 0.0f;
if (fpsAccumTime > 0.0f)
fpsReal = fpsFrames / fpsAccumTime;
if (fpsAccumVirtualTime > 0.0f)
fpsVirtual = fpsFrames / fpsAccumVirtualTime;
// Update min/max (these are FPS now). fpsRealMin starts at a tiny
// epsilon in reset(), so seed it from the first window rather than
// comparing against that sentinel.
if (fpsReal > 0.0f)
{
if (fpsRealMin <= 0.000001f || fpsReal < fpsRealMin)
fpsRealMin = fpsReal;
if (fpsReal > fpsRealMax)
fpsRealMax = fpsReal;
}
// frameDelta = actual accumulated real time over window
Con::setVariable("fps::frameTimeMs", avar("%4.3f", (fpsAccumTime / fpsFrames) * 1000.0f));
Con::setVariable("fps::frameDelta", avar("%g", fpsAccumTime));
Con::setVariable("fps::real", avar("%4.1f", fpsReal));
Con::setVariable("fps::realMin", avar("%4.1f", fpsRealMin));
Con::setVariable("fps::realMax", avar("%4.1f", fpsRealMax));
Con::setVariable("fps::virtual", avar("%4.1f", fpsVirtual));
// Reset window
fpsFrames = 0;
fpsAccumTime = 0.0f;
fpsAccumVirtualTime = 0.0f;
fpsNext = realSeconds + mUpdateInterval;
}
}
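Worked example of the windowed average: 30 frames accumulated over 0.5 s of real time reports fps::real = 60.0 and fps::frameTimeMs = 16.667, then the window resets.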

View file

@ -27,6 +27,8 @@
struct FPSTracker
{
F32 fpsAccumTime;
F32 fpsAccumVirtualTime;
F32 fpsRealLast;
F32 fpsReal;
F32 fpsRealMin;
@ -48,4 +50,4 @@ struct FPSTracker
extern FPSTracker gFPS;
#endif
#endif