diff --git a/Engine/source/environment/meshRoad.cpp b/Engine/source/environment/meshRoad.cpp index 0b10f4a29..ea09c4352 100644 --- a/Engine/source/environment/meshRoad.cpp +++ b/Engine/source/environment/meshRoad.cpp @@ -3406,7 +3406,7 @@ MatrixF MeshRoad::getNodeTransform( U32 idx ) mat.setColumn( 2, node.normal ); mat.setColumn( 3, node.point ); - AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!"); + AssertFatal(mat.determinant() != 0.0f, "no inverse!"); return mat; } @@ -3456,7 +3456,7 @@ void MeshRoad::calcSliceTransform( U32 idx, MatrixF &mat ) mat.setColumn( 2, slice.normal ); mat.setColumn( 3, slice.p1 ); - AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!"); + AssertFatal(mat.determinant() != 0.0f, "no inverse!"); } F32 MeshRoad::getRoadLength() const diff --git a/Engine/source/environment/river.cpp b/Engine/source/environment/river.cpp index acefb66b3..a3456cf17 100644 --- a/Engine/source/environment/river.cpp +++ b/Engine/source/environment/river.cpp @@ -2139,7 +2139,7 @@ MatrixF River::getNodeTransform( U32 idx ) const mat.setColumn( 2, node.normal ); mat.setColumn( 3, node.point ); - AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!"); + AssertFatal( mat.determinant() != 0.0f, "no inverse!"); return mat; } diff --git a/Engine/source/math/impl/float3_impl.inl b/Engine/source/math/impl/float3_impl.inl index 87b325faf..216fc99ba 100644 --- a/Engine/source/math/impl/float3_impl.inl +++ b/Engine/source/math/impl/float3_impl.inl @@ -99,7 +99,7 @@ namespace math_backend::float3 f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot3(va, va))); f32x4 vnorm = v_mul(va, invLen); - v_store(a, vnorm); + v_store3(a, vnorm); } // Linear interpolation: r = from + (to - from) * f diff --git a/Engine/source/math/impl/mat44_impl.inl b/Engine/source/math/impl/mat44_impl.inl new file mode 100644 index 000000000..a9635cff0 --- /dev/null +++ b/Engine/source/math/impl/mat44_impl.inl @@ -0,0 +1,312 @@ +#pragma once +#include // for sqrtf, etc. 
+#include "../mConstants.h" + +namespace math_backend::mat44 +{ + //------------------------------------------------------------------ + // Matrix Transpose + inline void mat44_transpose_impl(float* m) + { + f32x4x4 ma = m_load(m); + f32x4x4 mr = m_transpose(ma); + m_store(m, mr); + } + + inline float mat44_get_determinant(const float* m) + { + f32x4x4 ma = m_load(m); + return v_extract0(m_determinant_affine(ma)); + } + + // Matrix Scale: Float3 (assume w = 1.0f) + inline void mat44_scale_impl(float* m, const float* s) + { + f32x4x4 ma = m_load(m); + f32x4 va = v_load3_pos(s); + + ma.r0 = v_mul(ma.r0, va); + ma.r1 = v_mul(ma.r1, va); + ma.r2 = v_mul(ma.r2, va); + ma.r3 = v_mul(ma.r3, va); + m_store(m, ma); + } + + inline void mat44_get_scale_impl(const float* m, float* s) + { + f32x4x4 ma = m_load(m); + + // squared lengths + f32x4 len2_x = v_dot3(ma.r0, ma.r0); + f32x4 len2_y = v_dot3(ma.r1, ma.r1); + f32x4 len2_z = v_dot3(ma.r2, ma.r2); + + // extract and sqrt + s[0] = 1.0f / v_extract0(v_rsqrt_nr(len2_x)); + s[1] = 1.0f / v_extract0(v_rsqrt_nr(len2_y)); + s[2] = 1.0f / v_extract0(v_rsqrt_nr(len2_z)); + } + + // Matrix Scale Uniform: Float value (assume w = 1.0f) + inline void mat44_scale_uniform(float* m, float s) + { + f32x4x4 ma = m_load(m); + + // (s, s, s, 1) + f32x4 scale = v_set(s, s, s, 1.0f); + + // Scale only rotation rows (xyz part) + ma.r0 = v_mul(ma.r0, scale); + ma.r1 = v_mul(ma.r1, scale); + ma.r2 = v_mul(ma.r2, scale); + m_store(m, ma); + } + + // Matrix Inverse + inline void mat44_inverse_impl(float* m) + { + f32x4x4 ma = m_load(m); + + // Compute cofactors using cross products + f32x4x4 mTemp; + mTemp.r0 = v_cross(ma.r1, ma.r2); + mTemp.r1 = v_cross(ma.r2, ma.r0); + mTemp.r2 = v_cross(ma.r0, ma.r1); + + // Determinant = dot(ma.r0, c0) + f32x4 det = v_dot3(ma.r0, mTemp.r0); + f32x4 invDet = v_rcp_nr(det); + + // Scale cofactors + mTemp.r0 = v_mul(mTemp.r0, invDet); + mTemp.r1 = v_mul(mTemp.r1, invDet); + mTemp.r2 = v_mul(mTemp.r2, invDet); + + // 
Store inverse 3x3 (transpose of cofactor matrix) + + mTemp = m_transpose(mTemp); + mTemp.r3 = v_set(0, 0, 0, 1.0f); + + // ---- Translation ---- + + // Load original translation + f32x4 T = v_set(m[3], m[7], m[11], 0.0f); + + // Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2) + f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0)); + result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1))); + result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2))); + result = v_mul(result, v_set1(-1.0f)); + + m_store(m, mTemp); + + // Store translation + m[3] = v_extract0(result); + m[7] = v_extract0(v_swizzle_singular_mask(result, 1)); + m[11] = v_extract0(v_swizzle_singular_mask(result, 2)); + } + + // Matrix Affine Inverse + inline void mat44_affine_inverse_impl(float* m) + { + f32x4x4 ma = m_load(m); + + f32x4x4 mTemp = m_transpose(ma); + mTemp.r3 = v_set(0, 0, 0, 1); + + // ---- Translation ---- + // Load original translation + f32x4 T = v_set(m[3], m[7], m[11], 0.0f); + + // Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2) + f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0)); + result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1))); + result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2))); + result = v_mul(result, v_set1(-1.0f)); + + m_store(m, mTemp); + + // Store translation + m[3] = v_extract0(result); + m[7] = v_extract0(v_swizzle_singular_mask(result, 1)); + m[11] = v_extract0(v_swizzle_singular_mask(result, 2)); + } + + inline void mat44_normalize_impl(float* a) + { + // Load the matrix + f32x4x4 m = m_load(a); + + // Extract axes (rows 0-2), zero out w using v_mask_xyz + f32x4 xaxis = v_mul(m.r0, v_mask_xyz()); + f32x4 yaxis = v_mul(m.r1, v_mask_xyz()); + f32x4 zaxis = v_mul(m.r2, v_mask_xyz()); + + xaxis = v_normalize3(xaxis); + + float dotXY = v_extract0(v_hadd4(v_mul(xaxis, yaxis))); + f32x4 projYonX = v_mul(v_set1(dotXY), xaxis); + yaxis = v_normalize3(v_sub(yaxis, projYonX)); + + zaxis = v_cross(xaxis, yaxis); + + 
// Store normalized axes back (preserve translation w) + m.r0 = v_preserve_w(xaxis, m.r0); + m.r1 = v_preserve_w(yaxis, m.r1); + m.r2 = v_preserve_w(zaxis, m.r2); + + // Store back to memory + m_store(a, m); + } + + // Matrix Multiply: a * b + inline void mat44_mul_mat44_impl(const float* a, const float* b, float* r) + { + f32x4x4 ma = m_load(a); + f32x4x4 mb = m_load(b); + + f32x4x4 mr = m_mul(ma, mb); + m_store(r, mr); + } + + // Vector Multiply: m * p (assume w = 1.0f) + inline void mat44_mul_pos3_impl(const float *m, const float *p, float* r) + { + f32x4x4 ma = m_load(m); + f32x4 va = v_load3_pos(p); + f32x4 vr = m_mul_vec4(ma, va); + v_store3(r, vr); + } + + // Vector Multiply: m * v (assume w = 0.0f) + inline void mat44_mul_vec3_impl(const float* m, const float* v, float* r) + { + f32x4x4 ma = m_load(m); + f32x4 va = v_load3_vec(v); + f32x4 vr = m_mul_vec4(ma, va); + v_store3(r, vr); + } + + // Vector Multiply: m * p (full [4x4] * [1x4]) + inline void mat44_mul_float4_impl(const float* m, const float* p, float* r) + { + f32x4x4 ma = m_load(m); + f32x4 va = v_load(p); + f32x4 vr = m_mul_vec4(ma, va); + v_store(r, vr); + } + + //-------------------------------------------------- + // MATRIX ROTATION FUNCTIONS + //-------------------------------------------------- + + inline void mat44_rotation_x_impl(float* m, float angle) + { + float c = cosf(angle), s = sinf(angle); + f32x4x4 mr = m_identity(); + mr.r1 = v_set(0, c, s, 0); + mr.r2 = v_set(0, -s, c, 0); + m_store(m, mr); + } + + inline void mat44_rotation_y_impl(float* m, float angle) + { + float c = cosf(angle), s = sinf(angle); + f32x4x4 mr = m_identity(); + mr.r0 = v_set(c, 0, -s, 0); + mr.r2 = v_set(s, 0, c, 0); + m_store(m, mr); + } + + inline void mat44_rotation_z_impl(float* m, float angle) + { + float c = cosf(angle), s = sinf(angle); + f32x4x4 mr = m_identity(); + mr.r0 = v_set(c, s, 0, 0); + mr.r1 = v_set(-s, c, 0, 0); + m_store(m, mr); + } + + // Compose rotation from Euler angles (pitch=X, yaw=Y, 
roll=Z) + inline void mat44_rotation_euler_impl(float* m, float pitch, float yaw, float roll) + { + f32x4x4 rx, ry, rz; + mat44_rotation_x_impl((float*)&rx, pitch); + mat44_rotation_y_impl((float*)&ry, yaw); + mat44_rotation_z_impl((float*)&rz, roll); + + f32x4x4 r = m_mul(rz, m_mul(ry, rx)); + m_store(m, r); + } + + inline void mat44_trs_impl(float* m, const float* t, const float* r_euler, const float* s) + { + f32x4x4 mr; + mat44_rotation_euler_impl((float*)&mr, r_euler[0], r_euler[1], r_euler[2]); + + f32x4 vs = v_load3_vec(s); // scale xyz + mr.r0 = v_mul(mr.r0, vs); + mr.r1 = v_mul(mr.r1, vs); + mr.r2 = v_mul(mr.r2, vs); + + mr.r0 = v_insert_w(mr.r0, _mm_set_ss(t[0])); + mr.r1 = v_insert_w(mr.r1, _mm_set_ss(t[1])); + mr.r2 = v_insert_w(mr.r2, _mm_set_ss(t[2])); + mr.r3 = v_set(0, 0, 0, 1.0f); + + m_store(m, mr); + } + + inline void mat44_lookat_impl(float* m, const float* eye, const float* target, const float* up) + { + f32x4 vEye = v_load3_pos(eye); + f32x4 vTarget = v_load3_pos(target); + f32x4 vUp = v_load3_vec(up); + + // Forward (z+) + f32x4 zaxis = v_normalize3(v_sub(vTarget, vEye)); + + // Right (x+) + f32x4 xaxis = v_normalize3(v_cross(vUp, zaxis)); + + // Up (y+) + f32x4 yaxis = v_cross(zaxis, xaxis); + + // Compute translation components: -dot(axis, eye) + f32x4 t_x = v_mul(v_dot3(xaxis, vEye), v_set1(-1.0f)); + f32x4 t_y = v_mul(v_dot3(yaxis, vEye), v_set1(-1.0f)); + f32x4 t_z = v_mul(v_dot3(zaxis, vEye), v_set1(-1.0f)); + + f32x4x4 view; + view.r0 = v_insert_w(xaxis, t_x); + view.r1 = v_insert_w(yaxis, t_y); + view.r2 = v_insert_w(zaxis, t_z); + view.r3 = v_set(0, 0, 0, 1.0f); + + m_store(m, view); + } + + inline void mat44_perspective_impl(float* m, float fovY, float aspect, float znear, float zfar) + { + float f = 1.0f / tanf(fovY * 0.5f); + float nf = 1.0f / (znear - zfar); + + f32x4x4 mp = m_zero(); + mp.r0 = v_set(f / aspect, 0, 0, 0); + mp.r1 = v_set(0, f, 0, 0); + mp.r2 = v_set(0, 0, (zfar + znear) * nf, 2 * zfar * znear * nf); + mp.r3 = 
v_set(0, 0, -1, 0); // row-major projection + m_store(m, mp); + } + + inline void mat44_orthographic_impl(float* m, float left, float right, float bottom, float top, float znear, float zfar) + { + f32x4x4 mo = m_zero(); + mo.r0 = v_set(2.0f / (right - left), 0, 0, -(right + left) / (right - left)); + mo.r1 = v_set(0, 2.0f / (top - bottom), 0, -(top + bottom) / (top - bottom)); + mo.r2 = v_set(0, 0, -2.0f / (zfar - znear), -(zfar + znear) / (zfar - znear)); + mo.r3 = v_set(0, 0, 0, 1.0f); + m_store(m, mo); + } + +} // namespace math_backend::mat44 diff --git a/Engine/source/math/impl/math_c.cpp b/Engine/source/math/impl/math_c.cpp index 7d8a317ba..3fe449a1b 100644 --- a/Engine/source/math/impl/math_c.cpp +++ b/Engine/source/math/impl/math_c.cpp @@ -1,3 +1,4 @@ +#include "platform/platform.h" #include "math/public/float4_dispatch.h" #include "math/public/float3_dispatch.h" #include "math/public/mat44_dispatch.h" @@ -196,6 +197,89 @@ namespace math_backend::mat44::dispatch swap(a[11], a[14]); }; + gMat44.determinant = [](const float* m) { + return m[0] * (m[5] * m[10] - m[6] * m[9]) + + m[4] * (m[2] * m[9] - m[1] * m[10]) + + m[8] * (m[1] * m[6] - m[2] * m[5]); + }; + + gMat44.mul_vec3 = [](const float* a, const float* b, float* r) { +#ifdef TORQUE_COMPILER_GCC + const F32 v0 = b[0], v1 = b[1], v2 = b[2]; + const F32 m0 = a[0], m1 = a[1], m2 = a[2]; + const F32 m4 = a[4], m5 = a[5], m6 = a[6]; + const F32 m8 = a[8], m9 = a[9], m10 = a[10]; + + r[0] = m0 * v0 + m1 * v1 + m2 * v2; + r[1] = m4 * v0 + m5 * v1 + m6 * v2; + r[2] = m8 * v0 + m9 * v1 + m10 * v2; +#else + r[0] = a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; + r[1] = a[4] * b[0] + a[5] * b[1] + a[6] * b[2]; + r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2]; +#endif + + }; + + gMat44.inverse = [](float* m) { + // using Cramers Rule find the Inverse + // Minv = (1/det(M)) * adjoint(M) + float det = gMat44.determinant(m); + AssertFatal(det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); + float invDet = 
1.0f / det; + float temp[16]; + temp[0] = (m[5] * m[10] - m[6] * m[9]) * invDet; + temp[1] = (m[9] * m[2] - m[10] * m[1]) * invDet; + temp[2] = (m[1] * m[6] - m[2] * m[5]) * invDet; + + temp[4] = (m[6] * m[8] - m[4] * m[10]) * invDet; + temp[5] = (m[10] * m[0] - m[8] * m[2]) * invDet; + temp[6] = (m[2] * m[4] - m[0] * m[6]) * invDet; + + temp[8] = (m[4] * m[9] - m[5] * m[8]) * invDet; + temp[9] = (m[8] * m[1] - m[9] * m[0]) * invDet; + temp[10] = (m[0] * m[5] - m[1] * m[4]) * invDet; + + m[0] = temp[0]; + m[1] = temp[1]; + m[2] = temp[2]; + + m[4] = temp[4]; + m[5] = temp[5]; + m[6] = temp[6]; + + m[8] = temp[8]; + m[9] = temp[9]; + m[10] = temp[10]; + + // invert the translation + temp[0] = -m[3]; + temp[1] = -m[7]; + temp[2] = -m[11]; + gMat44.mul_vec3(m, temp, &temp[4]); + m[3] = temp[4]; + m[7] = temp[5]; + m[11] = temp[6]; + + }; + + gMat44.affine_inverse = [](float* a) { + F32 temp[16]; + dMemcpy(temp, a, 16 * sizeof(F32)); + + // Transpose rotation + a[1] = temp[4]; + a[4] = temp[1]; + a[2] = temp[8]; + a[8] = temp[2]; + a[6] = temp[9]; + a[9] = temp[6]; + + a[3] = -(temp[0] * temp[3] + temp[4] * temp[7] + temp[8] * temp[11]); + a[7] = -(temp[1] * temp[3] + temp[5] * temp[7] + temp[9] * temp[11]); + a[11] = -(temp[2] * temp[3] + temp[6] * temp[7] + temp[10] * temp[11]); + }; + gMat44.scale = [](float* a, const float* s) { // Note, doesn't allow scaling w... @@ -204,5 +288,86 @@ namespace math_backend::mat44::dispatch a[8] *= s[0]; a[9] *= s[1]; a[10] *= s[2]; a[12] *= s[0]; a[13] *= s[1]; a[14] *= s[2]; }; + + gMat44.get_scale = [](const float* a, float* s) { + // Note, doesn't allow scaling w... 
+ s[0] = sqrt(a[0] * a[0] + a[4] * a[4] + a[8] * a[8]); + s[1] = sqrt(a[1] * a[1] + a[5] * a[5] + a[9] * a[9]); + s[2] = sqrt(a[2] * a[2] + a[6] * a[6] + a[10] * a[10]); + }; + + gMat44.mul_float4 = [](const float* a, const float* b, float* r) { + AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!"); + r[0] = a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]; + r[1] = a[4] * b[0] + a[5] * b[1] + a[6] * b[2] + a[7] * b[3]; + r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2] + a[11] * b[3]; + r[2] = a[12] * b[0] + a[13] * b[1] + a[14] * b[2] + a[15] * b[3]; + }; + + gMat44.mul_pos3 = [](const float* a, const float* b, float* r) { + AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!"); + r[0] = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]; + r[1] = a[4]*b[0] + a[5]*b[1] + a[6]*b[2] + a[7]; + r[2] = a[8]*b[0] + a[9]*b[1] + a[10]*b[2] + a[11]; + }; + + gMat44.mul_vec3 = [](const float* a, const float* b, float* r) { + AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!"); + r[0] = a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; + r[1] = a[4] * b[0] + a[5] * b[1] + a[6] * b[2]; + r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2]; + }; + + gMat44.mul_mat44 = [](const float* a, const float* b, float* r) { + r[0] = a[0] * b[0] + a[1] * b[4] + a[2] * b[8] + a[3] * b[12]; + r[1] = a[0] * b[1] + a[1] * b[5] + a[2] * b[9] + a[3] * b[13]; + r[2] = a[0] * b[2] + a[1] * b[6] + a[2] * b[10] + a[3] * b[14]; + r[3] = a[0] * b[3] + a[1] * b[7] + a[2] * b[11] + a[3] * b[15]; + + r[4] = a[4] * b[0] + a[5] * b[4] + a[6] * b[8] + a[7] * b[12]; + r[5] = a[4] * b[1] + a[5] * b[5] + a[6] * b[9] + a[7] * b[13]; + r[6] = a[4] * b[2] + a[5] * b[6] + a[6] * b[10] + a[7] * b[14]; + r[7] = a[4] * b[3] + a[5] * b[7] + a[6] * b[11] + a[7] * b[15]; + + r[8] = a[8] * b[0] + a[9] * b[4] + a[10] * b[8] + a[11] * b[12]; + r[9] = a[8] * b[1] + a[9] * b[5] + a[10] * b[9] + a[11] * b[13]; + r[10] = a[8] * b[2] + a[9] * b[6] + a[10] * b[10] + a[11] 
* b[14]; + r[11] = a[8] * b[3] + a[9] * b[7] + a[10] * b[11] + a[11] * b[15]; + + r[12] = a[12] * b[0] + a[13] * b[4] + a[14] * b[8] + a[15] * b[12]; + r[13] = a[12] * b[1] + a[13] * b[5] + a[14] * b[9] + a[15] * b[13]; + r[14] = a[12] * b[2] + a[13] * b[6] + a[14] * b[10] + a[15] * b[14]; + r[15] = a[12] * b[3] + a[13] * b[7] + a[14] * b[11] + a[15] * b[15]; + }; + + gMat44.normalize = [](float* a) { + F32 col0[3], col1[3], col2[3]; + // extract columns 0 and 1 + col0[0] = a[0]; + col0[1] = a[4]; + col0[2] = a[8]; + + col1[0] = a[1]; + col1[1] = a[5]; + col1[2] = a[9]; + + math_backend::float3::dispatch::gFloat3.normalize(col0); + math_backend::float3::dispatch::gFloat3.normalize(col1); + math_backend::float3::dispatch::gFloat3.normalize(col2); + + // store the normalized columns + a[0] = col0[0]; + a[4] = col0[1]; + a[8] = col0[2]; + + a[1] = col1[0]; + a[5] = col1[1]; + a[9] = col1[2]; + + a[2] = col2[0]; + a[6] = col2[1]; + a[10] = col2[2]; + + }; } } diff --git a/Engine/source/math/isa/avx/avx_intrinsics.h b/Engine/source/math/isa/avx/avx_intrinsics.h index 0340d84a2..a67f900b4 100644 --- a/Engine/source/math/isa/avx/avx_intrinsics.h +++ b/Engine/source/math/isa/avx/avx_intrinsics.h @@ -1,5 +1,5 @@ #pragma once -#include // AVX/AVX2 intrinsics +#include // AVX namespace { @@ -26,11 +26,58 @@ namespace inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); } + inline f32x4 v_swizzle_mask(f32x4 v1, const int x, const int y, const int z, const int w) + { + __m128i idx = _mm_set_epi32(w, z, y, x); + return _mm_permutevar_ps(v1, idx); + } + + inline f32x4 v_swizzle_singular_mask(f32x4 v1, const int x) + { + __m128i idx = _mm_set1_epi32(x); + return _mm_permutevar_ps(v1, idx); + } + + inline f32x4 v_swizzle_lo(f32x4 v1) + { + return _mm_moveldup_ps(v1); + } + + inline f32x4 v_swizzle_hi(f32x4 v1) + { + return _mm_movehdup_ps(v1); + } + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) { return _mm_blend_ps(newv, original, 
0b1000); } + //------------------------------------------------------ + // Shuffle helpers + //------------------------------------------------------ + + inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, const int x, const int y, const int z, const int w) + { + f32x4 lo = v1; + f32x4 hi = v2; + + f32x4 combined_lo = _mm_permutevar_ps(lo, _mm_set_epi32(y, y, x, x)); + f32x4 combined_hi = _mm_permutevar_ps(hi, _mm_set_epi32(w, w, z, z)); + + return _mm_movelh_ps(combined_lo, combined_hi); + } + + inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2) + { + return _mm_movelh_ps(v1, v2); + } + + inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2) + { + return _mm_movehl_ps(v2, v1); + } + //------------------------------------------------------ // Float3 helpers (safe loading into 4 lanes) //------------------------------------------------------ @@ -54,6 +101,16 @@ namespace dst[2] = tmp[2]; } + inline f32x4 v_set(float x, float y, float z, float w) + { + return _mm_set_ps(w, z, y, x); + } + + inline f32x4 v_insert_w(f32x4 v, f32x4 w) + { + return _mm_insert_ps(v, w, 0x30); + } + //------------------------------------------------------ // Simple Arithmatic //------------------------------------------------------ @@ -70,7 +127,7 @@ namespace // Element-wise subtract inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - //------------------------------------------------------ + //------------------------------------------------------ // Fast recip //------------------------------------------------------ @@ -137,4 +194,195 @@ namespace __m128 sum = _mm_hadd_ps(a, a); return _mm_hadd_ps(sum, sum); } + + //------------------------------------------------------ + // Matrix type (row-major 4x4) + //------------------------------------------------------ + + struct f32x4x4 + { + f32x4 r0; + f32x4 r1; + f32x4 r2; + f32x4 r3; + }; + + //------------------------------------------------------ + // Matrix Load / Store + //------------------------------------------------------ + + 
inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major) + { + f32x4x4 out; + out.r0 = v_load(m + 0); + out.r1 = v_load(m + 4); + out.r2 = v_load(m + 8); + out.r3 = v_load(m + 12); + return out; + } + + inline void m_store(float* dst, const f32x4x4& m) + { + v_store(dst + 0, m.r0); + v_store(dst + 4, m.r1); + v_store(dst + 8, m.r2); + v_store(dst + 12, m.r3); + } + + inline f32x4x4 m_identity() + { + f32x4x4 m; + m.r0 = _mm_set_ps(0, 0, 0, 1); + m.r1 = _mm_set_ps(0, 0, 1, 0); + m.r2 = _mm_set_ps(0, 1, 0, 0); + m.r3 = _mm_set_ps(1, 0, 0, 0); + return m; + } + + inline f32x4x4 m_zero() + { + f32x4 z = v_zero(); + return { z, z, z, z }; + } + + //------------------------------------------------------ + // Matrix × Vector + //------------------------------------------------------ + + inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot4(m.r0, v); + f32x4 y = v_dot4(m.r1, v); + f32x4 z = v_dot4(m.r2, v); + f32x4 w = v_dot4(m.r3, v); + + // combine to a vector + return _mm_set_ps(_mm_cvtss_f32(w), + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot3(m.r0, v); + f32x4 y = v_dot3(m.r1, v); + f32x4 z = v_dot3(m.r2, v); + + // combine to a vector + return _mm_set_ps(0.0f, + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + //------------------------------------------------------ + // Matrix × Matrix + //------------------------------------------------------ + + inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b) + { + // Transpose B once for dot products + f32x4 b0 = b.r0; + f32x4 b1 = b.r1; + f32x4 b2 = b.r2; + f32x4 b3 = b.r3; + + // Transpose (SSE2) + _MM_TRANSPOSE4_PS(b0, b1, b2, b3); + + f32x4x4 C; + + auto mul_row = [&](f32x4 arow) + { + f32x4 x = v_dot4(arow, b0); + f32x4 y = v_dot4(arow, b1); + f32x4 z = v_dot4(arow, b2); + f32x4 w = v_dot4(arow, b3); + + f32x4 xy = _mm_unpacklo_ps(x, y); + f32x4 zw = _mm_unpacklo_ps(z, w); 
+ return _mm_movelh_ps(xy, zw); + }; + + C.r0 = mul_row(a.r0); + C.r1 = mul_row(a.r1); + C.r2 = mul_row(a.r2); + C.r3 = mul_row(a.r3); + + return C; + } + + //------------------------------------------------------ + // Transpose + //------------------------------------------------------ + + inline f32x4x4 m_transpose(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + f32x4 r3 = m.r3; + + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + + return { r0, r1, r2, r3 }; + } + + inline f32x4x4 m_inverse_rigid(const f32x4x4& m) + { + f32x4x4 t = m_transpose(m); + + // Extract translation + f32x4 T = v_set( + _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))), + 0.0f + ); + + f32x4 newT = m_mul_vec4(t, T); + newT = _mm_sub_ps(v_zero(), newT); + + t.r0 = v_preserve_w(t.r0, newT); + t.r1 = v_preserve_w(t.r1, newT); + t.r2 = v_preserve_w(t.r2, newT); + t.r3 = v_set(0, 0, 0, 1); + + return t; + } + + inline f32x4 m_determinant(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(c, d); + f32x4 c1 = v_cross(d, b); + f32x4 c2 = v_cross(b, c); + + f32x4 term0 = _mm_mul_ps(a, c0); + f32x4 term1 = _mm_mul_ps(a, c1); + f32x4 term2 = _mm_mul_ps(a, c2); + + f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2)); + + return v_hadd4(det); + } + + inline f32x4 m_determinant_affine(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); + + f32x4 c0 = v_cross(r1, r2); + return v_dot3(r0, c0); // splatted determinant + } } diff --git a/Engine/source/math/isa/avx/mat44.cpp b/Engine/source/math/isa/avx/mat44.cpp new file mode 100644 index 000000000..6634b652c --- /dev/null +++ b/Engine/source/math/isa/avx/mat44.cpp @@ -0,0 
+1,22 @@ +#include "avx_intrinsics.h" +#include "mat44_dispatch.h" + +#include "mat44_impl.inl" + +namespace math_backend::mat44::dispatch +{ + void install_avx() + { + gMat44.transpose = mat44_transpose_impl; + gMat44.inverse = mat44_inverse_impl; + gMat44.affine_inverse = mat44_affine_inverse_impl; + gMat44.mul_mat44 = mat44_mul_mat44_impl; + gMat44.mul_pos3 = mat44_mul_pos3_impl; + gMat44.mul_vec3 = mat44_mul_vec3_impl; + gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.scale = mat44_scale_impl; + gMat44.get_scale = mat44_get_scale_impl; + gMat44.normalize = mat44_normalize_impl; + gMat44.determinant = mat44_get_determinant; + } +} diff --git a/Engine/source/math/isa/avx2/avx2_intrinsics.h b/Engine/source/math/isa/avx2/avx2_intrinsics.h index 0340d84a2..67f0df04a 100644 --- a/Engine/source/math/isa/avx2/avx2_intrinsics.h +++ b/Engine/source/math/isa/avx2/avx2_intrinsics.h @@ -1,5 +1,5 @@ #pragma once -#include // AVX/AVX2 intrinsics +#include // AVX2 namespace { @@ -26,11 +26,58 @@ namespace inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); } + inline f32x4 v_swizzle_mask(f32x4 v1,const int x, const int y, const int z, const int w) + { + __m128i idx = _mm_set_epi32(w, z, y, x); + return _mm_permutevar_ps(v1, idx); + } + + inline f32x4 v_swizzle_singular_mask(f32x4 v1, const int x) + { + __m128i idx = _mm_set1_epi32(x); + return _mm_permutevar_ps(v1, idx); + } + + inline f32x4 v_swizzle_lo(f32x4 v1) + { + return _mm_moveldup_ps(v1); + } + + inline f32x4 v_swizzle_hi(f32x4 v1) + { + return _mm_movehdup_ps(v1); + } + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) { return _mm_blend_ps(newv, original, 0b1000); } + //------------------------------------------------------ + // Shuffle helpers + //------------------------------------------------------ + + inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w) + { + f32x4 lo = v1; + f32x4 hi = v2; + + f32x4 combined_lo = _mm_permutevar_ps(lo, 
_mm_set_epi32(y, y, x, x)); + f32x4 combined_hi = _mm_permutevar_ps(hi, _mm_set_epi32(w, w, z, z)); + + return _mm_movelh_ps(combined_lo, combined_hi); + } + + inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2) + { + return _mm_movelh_ps(v1, v2); + } + + inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2) + { + return _mm_movehl_ps(v2, v1); + } + //------------------------------------------------------ // Float3 helpers (safe loading into 4 lanes) //------------------------------------------------------ @@ -54,6 +101,16 @@ namespace dst[2] = tmp[2]; } + inline f32x4 v_set(float x, float y, float z, float w) + { + return _mm_set_ps(w, z, y, x); + } + + inline f32x4 v_insert_w(f32x4 v, f32x4 w) + { + return _mm_insert_ps(v, w, 0x30); + } + //------------------------------------------------------ // Simple Arithmatic //------------------------------------------------------ @@ -70,7 +127,7 @@ namespace // Element-wise subtract inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - //------------------------------------------------------ + //------------------------------------------------------ // Fast recip //------------------------------------------------------ @@ -137,4 +194,195 @@ namespace __m128 sum = _mm_hadd_ps(a, a); return _mm_hadd_ps(sum, sum); } + + //------------------------------------------------------ + // Matrix type (row-major 4x4) + //------------------------------------------------------ + + struct f32x4x4 + { + f32x4 r0; + f32x4 r1; + f32x4 r2; + f32x4 r3; + }; + + //------------------------------------------------------ + // Matrix Load / Store + //------------------------------------------------------ + + inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major) + { + f32x4x4 out; + out.r0 = v_load(m + 0); + out.r1 = v_load(m + 4); + out.r2 = v_load(m + 8); + out.r3 = v_load(m + 12); + return out; + } + + inline void m_store(float* dst, const f32x4x4& m) + { + v_store(dst + 0, m.r0); + v_store(dst + 4, m.r1); + v_store(dst + 
8, m.r2); + v_store(dst + 12, m.r3); + } + + inline f32x4x4 m_identity() + { + f32x4x4 m; + m.r0 = _mm_set_ps(0, 0, 0, 1); + m.r1 = _mm_set_ps(0, 0, 1, 0); + m.r2 = _mm_set_ps(0, 1, 0, 0); + m.r3 = _mm_set_ps(1, 0, 0, 0); + return m; + } + + inline f32x4x4 m_zero() + { + f32x4 z = v_zero(); + return { z, z, z, z }; + } + + //------------------------------------------------------ + // Matrix × Vector + //------------------------------------------------------ + + inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot4(m.r0, v); + f32x4 y = v_dot4(m.r1, v); + f32x4 z = v_dot4(m.r2, v); + f32x4 w = v_dot4(m.r3, v); + + // combine to a vector + return _mm_set_ps(_mm_cvtss_f32(w), + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot3(m.r0, v); + f32x4 y = v_dot3(m.r1, v); + f32x4 z = v_dot3(m.r2, v); + + // combine to a vector + return _mm_set_ps(0.0f, + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + //------------------------------------------------------ + // Matrix × Matrix + //------------------------------------------------------ + + inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b) + { + // Transpose B once for dot products + f32x4 b0 = b.r0; + f32x4 b1 = b.r1; + f32x4 b2 = b.r2; + f32x4 b3 = b.r3; + + // Transpose (SSE2) + _MM_TRANSPOSE4_PS(b0, b1, b2, b3); + + f32x4x4 C; + + auto mul_row = [&](f32x4 arow) + { + f32x4 x = v_dot4(arow, b0); + f32x4 y = v_dot4(arow, b1); + f32x4 z = v_dot4(arow, b2); + f32x4 w = v_dot4(arow, b3); + + f32x4 xy = _mm_unpacklo_ps(x, y); + f32x4 zw = _mm_unpacklo_ps(z, w); + return _mm_movelh_ps(xy, zw); + }; + + C.r0 = mul_row(a.r0); + C.r1 = mul_row(a.r1); + C.r2 = mul_row(a.r2); + C.r3 = mul_row(a.r3); + + return C; + } + + //------------------------------------------------------ + // Transpose + //------------------------------------------------------ + + inline f32x4x4 m_transpose(const f32x4x4& m) + 
{ + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + f32x4 r3 = m.r3; + + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + + return { r0, r1, r2, r3 }; + } + + inline f32x4x4 m_inverse_rigid(const f32x4x4& m) + { + f32x4x4 t = m_transpose(m); + + // Extract translation + f32x4 T = v_set( + _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))), + 0.0f + ); + + f32x4 newT = m_mul_vec4(t, T); + newT = _mm_sub_ps(v_zero(), newT); + + t.r0 = v_preserve_w(t.r0, newT); + t.r1 = v_preserve_w(t.r1, newT); + t.r2 = v_preserve_w(t.r2, newT); + t.r3 = v_set(0, 0, 0, 1); + + return t; + } + + inline f32x4 m_determinant(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(c, d); + f32x4 c1 = v_cross(d, b); + f32x4 c2 = v_cross(b, c); + + f32x4 term0 = _mm_mul_ps(a, c0); + f32x4 term1 = _mm_mul_ps(a, c1); + f32x4 term2 = _mm_mul_ps(a, c2); + + f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2)); + + return v_hadd4(det); + } + + inline f32x4 m_determinant_affine(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); + + f32x4 c0 = v_cross(r1, r2); + return v_dot3(r0, c0); // splatted determinant + } } diff --git a/Engine/source/math/isa/avx2/mat44.cpp b/Engine/source/math/isa/avx2/mat44.cpp new file mode 100644 index 000000000..11e842174 --- /dev/null +++ b/Engine/source/math/isa/avx2/mat44.cpp @@ -0,0 +1,22 @@ +#include "avx2_intrinsics.h" +#include "mat44_dispatch.h" + +#include "mat44_impl.inl" + +namespace math_backend::mat44::dispatch +{ + void install_avx2() + { + gMat44.transpose = mat44_transpose_impl; + gMat44.inverse = mat44_inverse_impl; + gMat44.affine_inverse = mat44_affine_inverse_impl; + gMat44.mul_mat44 = 
mat44_mul_mat44_impl; + gMat44.mul_pos3 = mat44_mul_pos3_impl; + gMat44.mul_vec3 = mat44_mul_vec3_impl; + gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.scale = mat44_scale_impl; + gMat44.get_scale = mat44_get_scale_impl; + gMat44.normalize = mat44_normalize_impl; + gMat44.determinant = mat44_get_determinant; + } +} diff --git a/Engine/source/math/isa/sse2/mat44.cpp b/Engine/source/math/isa/sse2/mat44.cpp new file mode 100644 index 000000000..6738ee391 --- /dev/null +++ b/Engine/source/math/isa/sse2/mat44.cpp @@ -0,0 +1,22 @@ +#include "sse2_intrinsics.h" +#include "mat44_dispatch.h" + +#include "mat44_impl.inl" + +namespace math_backend::mat44::dispatch +{ + void install_sse2() + { + gMat44.transpose = mat44_transpose_impl; + gMat44.inverse = mat44_inverse_impl; + gMat44.affine_inverse = mat44_affine_inverse_impl; + gMat44.mul_mat44 = mat44_mul_mat44_impl; + gMat44.mul_pos3 = mat44_mul_pos3_impl; + gMat44.mul_vec3 = mat44_mul_vec3_impl; + gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.scale = mat44_scale_impl; + gMat44.get_scale = mat44_get_scale_impl; + gMat44.normalize = mat44_normalize_impl; + gMat44.determinant = mat44_get_determinant; + } +} diff --git a/Engine/source/math/isa/sse2/sse2_intrinsics.h b/Engine/source/math/isa/sse2/sse2_intrinsics.h index 85b09ebb0..71a95c1b7 100644 --- a/Engine/source/math/isa/sse2/sse2_intrinsics.h +++ b/Engine/source/math/isa/sse2/sse2_intrinsics.h @@ -7,7 +7,7 @@ namespace typedef __m128 f32x4; //------------------------------------------------------ - // Load / Store + // Vector/Point Load / Store //------------------------------------------------------ // Load 4 floats from memory into a SIMD register @@ -35,6 +35,17 @@ namespace return _mm_set_ps(1.0f, p[2], p[1], p[0]); } + inline f32x4 v_set(float x, float y, float z, float w) + { + return _mm_set_ps(w, z, y, x); + } + + inline f32x4 v_insert_w(f32x4 v, f32x4 w) + { + f32x4 mask = _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)); + return _mm_or_ps( 
_mm_and_ps(mask, _mm_shuffle_ps(w, w, _MM_SHUFFLE(0, 0, 0, 0))), _mm_andnot_ps(mask, v)); // broadcast w.x into lane 3 to match the SSE4.1 _mm_insert_ps(v, w, 0x30) variant
+ inline f32x4 v_abs(f32x4 v) + { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(v, mask); + } + //------------------------------------------------------ // Fast recip //------------------------------------------------------ @@ -153,4 +229,292 @@ namespace shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)); return _mm_add_ps(sums, shuf); } + + //------------------------------------------------------ + // Matrix type (row-major 4x4) + //------------------------------------------------------ + + struct f32x4x4 + { + f32x4 r0; + f32x4 r1; + f32x4 r2; + f32x4 r3; + }; + + //------------------------------------------------------ + // Matrix Load / Store + //------------------------------------------------------ + + inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major) + { + f32x4x4 out; + out.r0 = v_load(m + 0); + out.r1 = v_load(m + 4); + out.r2 = v_load(m + 8); + out.r3 = v_load(m + 12); + return out; + } + + inline void m_store(float* dst, const f32x4x4& m) + { + v_store(dst + 0, m.r0); + v_store(dst + 4, m.r1); + v_store(dst + 8, m.r2); + v_store(dst + 12, m.r3); + } + + inline f32x4x4 m_identity() + { + f32x4x4 m; + m.r0 = _mm_set_ps(0, 0, 0, 1); + m.r1 = _mm_set_ps(0, 0, 1, 0); + m.r2 = _mm_set_ps(0, 1, 0, 0); + m.r3 = _mm_set_ps(1, 0, 0, 0); + return m; + } + + inline f32x4x4 m_zero() + { + f32x4 z = v_zero(); + return { z, z, z, z }; + } + + //------------------------------------------------------ + // Matrix × Vector + //------------------------------------------------------ + + inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot4(m.r0, v); + f32x4 y = v_dot4(m.r1, v); + f32x4 z = v_dot4(m.r2, v); + f32x4 w = v_dot4(m.r3, v); + + // Pack lowest lane of each into a vector + f32x4 xy = _mm_unpacklo_ps(x, y); + f32x4 zw = _mm_unpacklo_ps(z, w); + return _mm_movelh_ps(xy, zw); + } + + inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot3(m.r0, v); + f32x4 y = 
v_dot3(m.r1, v); + f32x4 z = v_dot3(m.r2, v); + + // combine to a vector + return _mm_set_ps(0.0f, + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + //------------------------------------------------------ + // Matrix × Matrix + //------------------------------------------------------ + + inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b) + { + // Transpose B once for dot products + f32x4 b0 = b.r0; + f32x4 b1 = b.r1; + f32x4 b2 = b.r2; + f32x4 b3 = b.r3; + + // Transpose (SSE2) + _MM_TRANSPOSE4_PS(b0, b1, b2, b3); + + f32x4x4 C; + + auto mul_row = [&](f32x4 arow) + { + f32x4 x = v_dot4(arow, b0); + f32x4 y = v_dot4(arow, b1); + f32x4 z = v_dot4(arow, b2); + f32x4 w = v_dot4(arow, b3); + + f32x4 xy = _mm_unpacklo_ps(x, y); + f32x4 zw = _mm_unpacklo_ps(z, w); + return _mm_movelh_ps(xy, zw); + }; + + C.r0 = mul_row(a.r0); + C.r1 = mul_row(a.r1); + C.r2 = mul_row(a.r2); + C.r3 = mul_row(a.r3); + + return C; + } + + //------------------------------------------------------ + // Transpose + //------------------------------------------------------ + + inline f32x4x4 m_transpose(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + f32x4 r3 = m.r3; + + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + + return { r0, r1, r2, r3 }; + } + + inline void m_set3x3_transpose(float* dst, f32x4 c0, f32x4 c1, f32x4 c2) + { + dst[0] = v_extract0(c0); // c0.x + dst[1] = v_extract0(c1); // c1.x + dst[2] = v_extract0(c2); // c2.x + + dst[4] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 1))); // c0.y + dst[5] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 1))); // c1.y + dst[6] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 1))); // c2.y + + dst[8] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 2))); // c0.z + dst[9] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 2))); // c1.z + dst[10] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 2))); // c2.z + } + + inline f32x4x4 
m_inverse(const f32x4x4& m) // NOTE(review): det = v_dot4(a, cross(c,d)) collapses to 0 when the bottom row is (0,0,0,1) — confirm the intended input layout before wiring this into dispatch
3, 3, 3))); + tmp[1] = _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[2] = _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[3] = 0.0f; + + f32x4 T = v_load(tmp); + + // New translation = -R' * T + f32x4 newT = m_mul_vec4({ t0, t1, t2, t3 }, T); + newT = _mm_sub_ps(v_zero(), newT); + + // Assemble final matrix + f32x4x4 out; + out.r0 = v_preserve_w(t0, newT); + out.r1 = v_preserve_w(t1, newT); + out.r2 = v_preserve_w(t2, newT); + out.r3 = _mm_set_ps(1, 0, 0, 0); + + return out; + } + + inline f32x4 m_determinant(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(c, d); + f32x4 c1 = v_cross(d, b); + f32x4 c2 = v_cross(b, c); + + f32x4 term0 = _mm_mul_ps(a, c0); + f32x4 term1 = _mm_mul_ps(a, c1); + f32x4 term2 = _mm_mul_ps(a, c2); + + f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2)); + + return v_hadd4(det); + } + + inline f32x4 m_determinant_affine(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); + + f32x4 c0 = v_cross(r1, r2); + return v_dot3(r0, c0); // splatted determinant + } } diff --git a/Engine/source/math/isa/sse41/mat44.cpp b/Engine/source/math/isa/sse41/mat44.cpp new file mode 100644 index 000000000..94bc5e4b2 --- /dev/null +++ b/Engine/source/math/isa/sse41/mat44.cpp @@ -0,0 +1,22 @@ +#include "sse41_intrinsics.h" +#include "mat44_dispatch.h" + +#include "mat44_impl.inl" + +namespace math_backend::mat44::dispatch +{ + void install_sse41() + { + gMat44.transpose = mat44_transpose_impl; + gMat44.inverse = mat44_inverse_impl; + gMat44.affine_inverse = mat44_affine_inverse_impl; + gMat44.mul_mat44 = mat44_mul_mat44_impl; + gMat44.mul_pos3 = mat44_mul_pos3_impl; + gMat44.mul_vec3 = mat44_mul_vec3_impl; + gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.scale = mat44_scale_impl; + 
gMat44.get_scale = mat44_get_scale_impl; + gMat44.normalize = mat44_normalize_impl; + gMat44.determinant = mat44_get_determinant; + } +} diff --git a/Engine/source/math/isa/sse41/sse41_intrinsics.h b/Engine/source/math/isa/sse41/sse41_intrinsics.h index 047cb44ee..2ea63e6b5 100644 --- a/Engine/source/math/isa/sse41/sse41_intrinsics.h +++ b/Engine/source/math/isa/sse41/sse41_intrinsics.h @@ -26,11 +26,70 @@ namespace inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); } + inline f32x4 v_swizzle_mask(f32x4 v, int x, int y, int z, int w) + { + alignas(16) float tmp[4]; + _mm_store_ps(tmp, v); + + return _mm_set_ps( + tmp[w], + tmp[z], + tmp[y], + tmp[x] + ); + } + + inline f32x4 v_swizzle_singular_mask(f32x4 v, int x) + { + alignas(16) float tmp[4]; + _mm_store_ps(tmp, v); + return _mm_set1_ps(tmp[x]); + } + + inline f32x4 v_swizzle_lo(f32x4 v1) + { + return _mm_moveldup_ps(v1); + } + + inline f32x4 v_swizzle_hi(f32x4 v1) + { + return _mm_movehdup_ps(v1); + } + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) { return _mm_blend_ps(newv, original, 0b1000); } + //------------------------------------------------------ + // Shuffle helpers + //------------------------------------------------------ + + inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w) + { + alignas(16) float a[4], b[4]; + _mm_store_ps(a, v1); + _mm_store_ps(b, v2); + + return _mm_set_ps( + b[w], + b[z], + a[y], + a[x] + ); + } + + inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2) + { + return _mm_movelh_ps(v1, v2); + } + + inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2) + { + return _mm_movehl_ps(v2, v1); + } + + //------------------------------------------------------ // Float3 helpers (safe loading into 4 lanes) //------------------------------------------------------ @@ -54,6 +113,16 @@ namespace dst[2] = tmp[2]; } + inline f32x4 v_set(float x, float y, float z, float w) + { + return _mm_set_ps(w, z, y, x); + } + + inline f32x4 
v_insert_w(f32x4 v, f32x4 w) + { + return _mm_insert_ps(v, w, 0x30); + } + //------------------------------------------------------ // Simple Arithmatic //------------------------------------------------------ @@ -70,7 +139,7 @@ namespace // Element-wise subtract inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - //------------------------------------------------------ + //------------------------------------------------------ // Fast recip //------------------------------------------------------ @@ -104,13 +173,13 @@ namespace // full dot4 inline f32x4 v_dot4(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1 + return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into lane 1 } // dot3 (ignores w) inline f32x4 v_dot3(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1 + return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into lane 1 } // cross product xyz only. @@ -137,4 +206,293 @@ namespace __m128 sum = _mm_hadd_ps(a, a); return _mm_hadd_ps(sum, sum); } + + //------------------------------------------------------ + // Matrix type (row-major 4x4) + //------------------------------------------------------ + + struct f32x4x4 + { + f32x4 r0; + f32x4 r1; + f32x4 r2; + f32x4 r3; + }; + + //------------------------------------------------------ + // Matrix Load / Store + //------------------------------------------------------ + + inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major) + { + f32x4x4 out; + out.r0 = v_load(m + 0); + out.r1 = v_load(m + 4); + out.r2 = v_load(m + 8); + out.r3 = v_load(m + 12); + return out; + } + + inline void m_store(float* dst, const f32x4x4& m) + { + _mm_storeu_ps(dst + 0, m.r0); + _mm_storeu_ps(dst + 4, m.r1); + _mm_storeu_ps(dst + 8, m.r2); + _mm_storeu_ps(dst + 12, m.r3); + } + + inline f32x4x4 m_identity() + { + f32x4x4 m; + m.r0 = _mm_set_ps(0, 0, 0, 1); + m.r1 = _mm_set_ps(0, 0, 1, 0); + m.r2 = _mm_set_ps(0, 1, 0, 0); + m.r3 = 
_mm_set_ps(1, 0, 0, 0); + return m; + } + + inline f32x4x4 m_zero() + { + f32x4 z = v_zero(); + return { z, z, z, z }; + } + + //------------------------------------------------------ + // Matrix × Vector + //------------------------------------------------------ + + inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot4(m.r0, v); + f32x4 y = v_dot4(m.r1, v); + f32x4 z = v_dot4(m.r2, v); + f32x4 w = v_dot4(m.r3, v); + + // combine to a vector + return _mm_set_ps(_mm_cvtss_f32(w), + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot3(m.r0, v); + f32x4 y = v_dot3(m.r1, v); + f32x4 z = v_dot3(m.r2, v); + + // combine to a vector + return _mm_set_ps(0.0f, + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + //------------------------------------------------------ + // Matrix × Matrix + //------------------------------------------------------ + + inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b) + { + // Transpose B once for dot products + f32x4 b0 = b.r0; + f32x4 b1 = b.r1; + f32x4 b2 = b.r2; + f32x4 b3 = b.r3; + + // Transpose (SSE2) + _MM_TRANSPOSE4_PS(b0, b1, b2, b3); + + f32x4x4 C; + + auto mul_row = [&](f32x4 arow) + { + f32x4 x = v_dot4(arow, b0); + f32x4 y = v_dot4(arow, b1); + f32x4 z = v_dot4(arow, b2); + f32x4 w = v_dot4(arow, b3); + + f32x4 xy = _mm_unpacklo_ps(x, y); + f32x4 zw = _mm_unpacklo_ps(z, w); + return _mm_movelh_ps(xy, zw); + }; + + C.r0 = mul_row(a.r0); + C.r1 = mul_row(a.r1); + C.r2 = mul_row(a.r2); + C.r3 = mul_row(a.r3); + + return C; + } + + //------------------------------------------------------ + // Transpose + //------------------------------------------------------ + + inline f32x4x4 m_transpose(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + f32x4 r3 = m.r3; + + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + + return { r0, r1, r2, r3 }; + } + + inline void m_set3x3_transpose(float* 
dst, f32x4 c0, f32x4 c1, f32x4 c2) + { + dst[0] = v_extract0(c0); // c0.x + dst[1] = v_extract0(c1); // c1.x + dst[2] = v_extract0(c2); // c2.x + + dst[4] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 1))); // c0.y + dst[5] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 1))); // c1.y + dst[6] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 1))); // c2.y + + dst[8] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 2))); // c0.z + dst[9] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 2))); // c1.z + dst[10] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 2))); // c2.z + } + + inline f32x4x4 m_inverse(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(b, c); + f32x4 c1 = v_cross(c, d); + f32x4 c2 = v_cross(d, a); + f32x4 c3 = v_cross(a, b); + + f32x4 det = v_dot4(a, c1); + f32x4 invDet = v_rcp_nr(det); + + f32x4x4 adj; + adj.r0 = _mm_mul_ps(c1, invDet); + adj.r1 = _mm_mul_ps(c2, invDet); + adj.r2 = _mm_mul_ps(c3, invDet); + adj.r3 = _mm_mul_ps(c0, invDet); + + return m_transpose(adj); + } + + inline f32x4x4 m_inverse_rigid(const f32x4x4& m) + { + f32x4x4 t = m_transpose(m); + + // Extract translation + f32x4 T = v_set( + _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))), + 0.0f + ); + + f32x4 newT = m_mul_vec4(t, T); + newT = _mm_sub_ps(v_zero(), newT); + + t.r0 = v_preserve_w(t.r0, newT); + t.r1 = v_preserve_w(t.r1, newT); + t.r2 = v_preserve_w(t.r2, newT); + t.r3 = v_set(0, 0, 0, 1); + + return t; + } + + inline f32x4x4 m_inverse_affine(const f32x4x4& m) + { + // Extract upper 3x3 rows + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + // Zero translation + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); 
+ + // Compute cofactors via cross products + f32x4 c0 = v_cross(r1, r2); + f32x4 c1 = v_cross(r2, r0); + f32x4 c2 = v_cross(r0, r1); + + // Determinant + f32x4 det = v_dot3(r0, c0); + + // Reciprocal determinant + f32x4 invDet = v_rcp_nr(det); + + // Inverse 3x3 (transpose of cofactor matrix) + f32x4 i0 = _mm_mul_ps(c0, invDet); + f32x4 i1 = _mm_mul_ps(c1, invDet); + f32x4 i2 = _mm_mul_ps(c2, invDet); + + // Transpose 3x3 + f32x4 t0 = i0; + f32x4 t1 = i1; + f32x4 t2 = i2; + f32x4 t3 = _mm_setzero_ps(); + + _MM_TRANSPOSE4_PS(t0, t1, t2, t3); + + // Extract translation + alignas(16) float tmp[4]; + tmp[0] = _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[1] = _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[2] = _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[3] = 0.0f; + + f32x4 T = v_load(tmp); + + // New translation = -R' * T + f32x4 newT = m_mul_vec4({ t0, t1, t2, t3 }, T); + newT = _mm_sub_ps(v_zero(), newT); + + // Assemble final matrix + f32x4x4 out; + out.r0 = v_preserve_w(t0, newT); + out.r1 = v_preserve_w(t1, newT); + out.r2 = v_preserve_w(t2, newT); + out.r3 = _mm_set_ps(1, 0, 0, 0); + + return out; + } + + inline f32x4 m_determinant(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(c, d); + f32x4 c1 = v_cross(d, b); + f32x4 c2 = v_cross(b, c); + + f32x4 term0 = _mm_mul_ps(a, c0); + f32x4 term1 = _mm_mul_ps(a, c1); + f32x4 term2 = _mm_mul_ps(a, c2); + + f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2)); + + return v_hadd4(det); + } + + inline f32x4 m_determinant_affine(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); + + f32x4 c0 = v_cross(r1, r2); + return v_dot3(r0, c0); // splatted determinant + } } diff --git a/Engine/source/math/mMathFn.h 
b/Engine/source/math/mMathFn.h index 9f84b37df..e9e7d3840 100644 --- a/Engine/source/math/mMathFn.h +++ b/Engine/source/math/mMathFn.h @@ -76,13 +76,11 @@ extern void (*m_quatF_set_matF)( F32 x, F32 y, F32 z, F32 w, F32* m ); extern void (*m_matF_set_euler)(const F32 *e, F32 *result); extern void (*m_matF_set_euler_point)(const F32 *e, const F32 *p, F32 *result); extern void (*m_matF_identity)(F32 *m); -extern void (*m_matF_inverse)(F32 *m); -extern void (*m_matF_invert_to)(const F32 *m, F32 *d); -extern void (*m_matF_affineInverse)(F32 *m); +extern void (*m_matF_invert_to)(const F32* m, F32* d); extern void (*m_matF_transpose)(F32 *m); extern void (*m_matF_scale)(F32 *m,const F32* p); extern void (*m_matF_normalize)(F32 *m); -extern F32 (*m_matF_determinant)(const F32 *m); +extern F32(*m_matF_determinant)(const F32* m); extern void (*m_matF_x_matF)(const F32 *a, const F32 *b, F32 *mresult); extern void (*m_matF_x_matF_aligned)(const F32 *a, const F32 *b, F32 *mresult); // extern void (*m_matF_x_point3F)(const F32 *m, const F32 *p, F32 *presult); diff --git a/Engine/source/math/mMath_C.cpp b/Engine/source/math/mMath_C.cpp index 7fac832f9..ad5aed0db 100644 --- a/Engine/source/math/mMath_C.cpp +++ b/Engine/source/math/mMath_C.cpp @@ -464,71 +464,26 @@ static void affineInvertTo(const F32 * m, F32 * out) } #endif -//-------------------------------------- -static void m_matF_inverse_C(F32 *m) +static void m_matF_invert_to_C(const F32* m, F32* d) { // using Cramers Rule find the Inverse // Minv = (1/det(M)) * adjoint(M) - F32 det = m_matF_determinant( m ); - AssertFatal( det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); + F32 det = m_matF_determinant(m); + AssertFatal(det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); - F32 invDet = 1.0f/det; - F32 temp[16]; + F32 invDet = 1.0f / det; - temp[0] = (m[5] * m[10]- m[6] * m[9]) * invDet; - temp[1] = (m[9] * m[2] - m[10]* m[1]) * invDet; - temp[2] = (m[1] * m[6] - m[2] * m[5]) * invDet; - - 
temp[4] = (m[6] * m[8] - m[4] * m[10])* invDet; - temp[5] = (m[10]* m[0] - m[8] * m[2]) * invDet; - temp[6] = (m[2] * m[4] - m[0] * m[6]) * invDet; - - temp[8] = (m[4] * m[9] - m[5] * m[8]) * invDet; - temp[9] = (m[8] * m[1] - m[9] * m[0]) * invDet; - temp[10]= (m[0] * m[5] - m[1] * m[4]) * invDet; - - m[0] = temp[0]; - m[1] = temp[1]; - m[2] = temp[2]; - - m[4] = temp[4]; - m[5] = temp[5]; - m[6] = temp[6]; - - m[8] = temp[8]; - m[9] = temp[9]; - m[10] = temp[10]; - - // invert the translation - temp[0] = -m[3]; - temp[1] = -m[7]; - temp[2] = -m[11]; - m_matF_x_vectorF(m, temp, &temp[4]); - m[3] = temp[4]; - m[7] = temp[5]; - m[11]= temp[6]; -} - -static void m_matF_invert_to_C(const F32 *m, F32 *d) -{ - // using Cramers Rule find the Inverse - // Minv = (1/det(M)) * adjoint(M) - F32 det = m_matF_determinant( m ); - AssertFatal( det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); - - F32 invDet = 1.0f/det; - - d[0] = (m[5] * m[10]- m[6] * m[9]) * invDet; - d[1] = (m[9] * m[2] - m[10]* m[1]) * invDet; + d[0] = (m[5] * m[10] - m[6] * m[9]) * invDet; + d[1] = (m[9] * m[2] - m[10] * m[1]) * invDet; d[2] = (m[1] * m[6] - m[2] * m[5]) * invDet; - d[4] = (m[6] * m[8] - m[4] * m[10])* invDet; - d[5] = (m[10]* m[0] - m[8] * m[2]) * invDet; + d[4] = (m[6] * m[8] - m[4] * m[10]) * invDet; + d[5] = (m[10] * m[0] - m[8] * m[2]) * invDet; d[6] = (m[2] * m[4] - m[0] * m[6]) * invDet; d[8] = (m[4] * m[9] - m[5] * m[8]) * invDet; d[9] = (m[8] * m[1] - m[9] * m[0]) * invDet; - d[10]= (m[0] * m[5] - m[1] * m[4]) * invDet; + d[10] = (m[0] * m[5] - m[1] * m[4]) * invDet; // invert the translation F32 temp[6]; @@ -538,33 +493,13 @@ static void m_matF_invert_to_C(const F32 *m, F32 *d) m_matF_x_vectorF(d, temp, &temp[3]); d[3] = temp[3]; d[7] = temp[4]; - d[11]= temp[5]; - d[ 12 ] = m[ 12 ]; - d[ 13 ] = m[ 13 ]; - d[ 14 ] = m[ 14 ]; - d[ 15 ] = m[ 15 ]; + d[11] = temp[5]; + d[12] = m[12]; + d[13] = m[13]; + d[14] = m[14]; + d[15] = m[15]; } 
-//-------------------------------------- -static void m_matF_affineInverse_C(F32 *m) -{ - // Matrix class checks to make sure this is an affine transform before calling - // this function, so we can proceed assuming it is... - F32 temp[16]; - dMemcpy(temp, m, 16 * sizeof(F32)); - - // Transpose rotation - m[1] = temp[4]; - m[4] = temp[1]; - m[2] = temp[8]; - m[8] = temp[2]; - m[6] = temp[9]; - m[9] = temp[6]; - - m[3] = -(temp[0]*temp[3] + temp[4]*temp[7] + temp[8]*temp[11]); - m[7] = -(temp[1]*temp[3] + temp[5]*temp[7] + temp[9]*temp[11]); - m[11] = -(temp[2]*temp[3] + temp[6]*temp[7] + temp[10]*temp[11]); -} inline void swap(F32 &a, F32 &b) { @@ -633,11 +568,11 @@ static void m_matF_normalize_C(F32 *m) //-------------------------------------- -static F32 m_matF_determinant_C(const F32 *m) +static F32 m_matF_determinant_C(const F32* m) { - return m[0] * (m[5] * m[10] - m[6] * m[9]) + - m[4] * (m[2] * m[9] - m[1] * m[10]) + - m[8] * (m[1] * m[6] - m[2] * m[5]) ; + return m[0] * (m[5] * m[10] - m[6] * m[9]) + + m[4] * (m[2] * m[9] - m[1] * m[10]) + + m[8] * (m[1] * m[6] - m[2] * m[5]); } @@ -890,13 +825,11 @@ void (*m_quatF_set_matF)( F32 x, F32 y, F32 z, F32 w, F32* m ) = m_quatF_set_mat void (*m_matF_set_euler)(const F32 *e, F32 *result) = m_matF_set_euler_C; void (*m_matF_set_euler_point)(const F32 *e, const F32 *p, F32 *result) = m_matF_set_euler_point_C; void (*m_matF_identity)(F32 *m) = m_matF_identity_C; -void (*m_matF_inverse)(F32 *m) = m_matF_inverse_C; -void (*m_matF_affineInverse)(F32 *m) = m_matF_affineInverse_C; -void (*m_matF_invert_to)(const F32 *m, F32 *d) = m_matF_invert_to_C; +void (*m_matF_invert_to)(const F32* m, F32* d) = m_matF_invert_to_C; void (*m_matF_transpose)(F32 *m) = m_matF_transpose_C; void (*m_matF_scale)(F32 *m,const F32* p) = m_matF_scale_C; void (*m_matF_normalize)(F32 *m) = m_matF_normalize_C; -F32 (*m_matF_determinant)(const F32 *m) = m_matF_determinant_C; +F32(*m_matF_determinant)(const F32* m) = m_matF_determinant_C; void 
(*m_matF_x_matF)(const F32 *a, const F32 *b, F32 *mresult) = default_matF_x_matF_C; void (*m_matF_x_matF_aligned)(const F32 *a, const F32 *b, F32 *mresult) = default_matF_x_matF_C; // void (*m_matF_x_point3F)(const F32 *m, const F32 *p, F32 *presult) = m_matF_x_point3F_C; @@ -935,13 +868,11 @@ void mInstallLibrary_C() m_matF_set_euler = m_matF_set_euler_C; m_matF_set_euler_point = m_matF_set_euler_point_C; m_matF_identity = m_matF_identity_C; - m_matF_inverse = m_matF_inverse_C; - m_matF_affineInverse = m_matF_affineInverse_C; - m_matF_invert_to = m_matF_invert_to_C; + m_matF_invert_to = m_matF_invert_to_C; m_matF_transpose = m_matF_transpose_C; m_matF_scale = m_matF_scale_C; m_matF_normalize = m_matF_normalize_C; - m_matF_determinant = m_matF_determinant_C; + m_matF_determinant = m_matF_determinant_C; m_matF_x_matF = default_matF_x_matF_C; m_matF_x_matF_aligned = default_matF_x_matF_C; // m_matF_x_point3F = m_matF_x_point3F_C; diff --git a/Engine/source/math/mMatrix.h b/Engine/source/math/mMatrix.h index 763a62b93..f42aaa5e4 100644 --- a/Engine/source/math/mMatrix.h +++ b/Engine/source/math/mMatrix.h @@ -50,13 +50,16 @@ #ifndef _CONSOLE_H_ #include "console/console.h" #endif +#ifndef _MATH_BACKEND_H_ +#include "math/public/math_backend.h" +#endif #ifndef USE_TEMPLATE_MATRIX /// 4x4 Matrix Class /// /// This runs at F32 precision. - +using math_backend::mat44::dispatch::gMat44; class MatrixF { friend class MatrixFEngineExport; @@ -128,7 +131,7 @@ public: EulerF toEuler() const; F32 determinant() const { - return m_matF_determinant(*this); + return gMat44.determinant(*this); } /// Compute the inverse of the matrix. 
@@ -372,71 +375,68 @@ inline MatrixF& MatrixF::identity() inline MatrixF& MatrixF::inverse() { - m_matF_inverse(m); + gMat44.inverse(m); return (*this); } inline void MatrixF::invertTo( MatrixF *out ) { - m_matF_invert_to(m,*out); + *out = *this; + gMat44.inverse(*out); } inline MatrixF& MatrixF::affineInverse() { -// AssertFatal(isAffine() == true, "Error, this matrix is not an affine transform"); - m_matF_affineInverse(m); + gMat44.affine_inverse(m); return (*this); } inline MatrixF& MatrixF::transpose() { - m_matF_transpose(m); + gMat44.transpose(m); return (*this); } inline MatrixF& MatrixF::scale(const Point3F& p) { - m_matF_scale(m,p); + gMat44.scale(m, p); return *this; } inline Point3F MatrixF::getScale() const { Point3F scale; - scale.x = mSqrt(m[0]*m[0] + m[4] * m[4] + m[8] * m[8]); - scale.y = mSqrt(m[1]*m[1] + m[5] * m[5] + m[9] * m[9]); - scale.z = mSqrt(m[2]*m[2] + m[6] * m[6] + m[10] * m[10]); + gMat44.get_scale(m, scale); return scale; } inline void MatrixF::normalize() { - m_matF_normalize(m); } inline MatrixF& MatrixF::mul( const MatrixF &a ) { // M * a -> M AssertFatal(&a != this, "MatrixF::mul - a.mul(a) is invalid!"); - MatrixF tempThis(*this); - m_matF_x_matF(tempThis, a, *this); + MatrixF tempThis(*this); + gMat44.mul_mat44(tempThis, a, *this); return (*this); } inline MatrixF& MatrixF::mulL( const MatrixF &a ) { // a * M -> M AssertFatal(&a != this, "MatrixF::mulL - a.mul(a) is invalid!"); - MatrixF tempThis(*this); - m_matF_x_matF(a, tempThis, *this); + MatrixF tempThis(*this); + gMat44.mul_mat44(a, tempThis, *this); return (*this); } inline MatrixF& MatrixF::mul( const MatrixF &a, const MatrixF &b ) { // a * b -> M AssertFatal((&a != this) && (&b != this), "MatrixF::mul - a.mul(a, b) a.mul(b, a) a.mul(a, a) is invalid!"); - - m_matF_x_matF(a, b, *this); + gMat44.mul_mat44(a, b, *this); return (*this); } @@ -461,7 +461,7 @@ inline MatrixF& MatrixF::mul(const MatrixF &a, const F32 b) inline void MatrixF::mul( Point4F& p ) const { Point4F temp; -
m_matF_x_point4F(*this, &p.x, &temp.x); + gMat44.mul_float4(*this, p, temp); p = temp; } @@ -469,28 +469,28 @@ inline void MatrixF::mulP( Point3F& p) const { // M * p -> d Point3F d; - m_matF_x_point3F(*this, &p.x, &d.x); + gMat44.mul_pos3(*this, p, d); p = d; } inline void MatrixF::mulP( const Point3F &p, Point3F *d) const { // M * p -> d - m_matF_x_point3F(*this, &p.x, &d->x); + gMat44.mul_pos3(*this, p, *d); } inline void MatrixF::mulV( VectorF& v) const { // M * v -> v VectorF temp; - m_matF_x_vectorF(*this, &v.x, &temp.x); + gMat44.mul_vec3(*this, v, temp); v = temp; } inline void MatrixF::mulV( const VectorF &v, Point3F *d) const { // M * v -> d - m_matF_x_vectorF(*this, &v.x, &d->x); + gMat44.mul_vec3(*this, v, *d); } inline void MatrixF::mul(Box3F& b) const @@ -634,14 +634,14 @@ inline MatrixF operator * ( const MatrixF &m1, const MatrixF &m2 ) { // temp = m1 * m2 MatrixF temp; - m_matF_x_matF(m1, m2, temp); + gMat44.mul_mat44(m1, m2, temp); return temp; } inline MatrixF& MatrixF::operator *= ( const MatrixF &m1 ) { MatrixF tempThis(*this); - m_matF_x_matF(tempThis, m1, *this); + gMat44.mul_mat44(tempThis, m1, *this); return (*this); } diff --git a/Engine/source/math/public/mat44_dispatch.h b/Engine/source/math/public/mat44_dispatch.h index 61b488861..b1b78ef13 100644 --- a/Engine/source/math/public/mat44_dispatch.h +++ b/Engine/source/math/public/mat44_dispatch.h @@ -8,7 +8,16 @@ namespace math_backend::mat44::dispatch struct Mat44Funcs { void (*transpose)(float*) = nullptr; + void (*inverse)(float*) = nullptr; + void (*affine_inverse)(float*) = nullptr; + void (*normalize)(float*) = nullptr; + void (*mul_mat44)(const float* a, const float* b, float* r) = nullptr; + void (*mul_pos3)(const float* a, const float* b, float* r) = nullptr; + void (*mul_vec3)(const float* a, const float* b, float* r) = nullptr; + void (*mul_float4)(const float* a, const float* b, float* r) = nullptr; + float (*determinant)(const float*) = nullptr; void (*scale)(float*, const 
float*) = nullptr; + void (*get_scale)(const float*, float*) = nullptr; }; // Global dispatch table diff --git a/Engine/source/math/public/math_backend.cpp b/Engine/source/math/public/math_backend.cpp index 9b5e5daed..81e00b3c7 100644 --- a/Engine/source/math/public/math_backend.cpp +++ b/Engine/source/math/public/math_backend.cpp @@ -18,6 +18,26 @@ namespace math_backend::mat44::dispatch Mat44Funcs gMat44{}; } +namespace math_backend +{ + // Use an anonymous namespace for the static initializer + namespace { + struct ScalarInitializer + { + ScalarInitializer() + { + // Install scalar defaults immediately for all types + float4::dispatch::install_scalar(); + float3::dispatch::install_scalar(); + mat44::dispatch::install_scalar(); + } + }; + + // Static instance ensures constructor runs before main() + ScalarInitializer g_scalarInitializer; + } +} + math_backend::backend math_backend::choose_backend(U32 cpu_flags) { @@ -47,21 +67,25 @@ void math_backend::install_from_cpu_flags(uint32_t cpu_flags) case backend::avx2: float4::dispatch::install_avx2(); float3::dispatch::install_avx2(); + mat44::dispatch::install_avx2(); break; case backend::avx: float4::dispatch::install_avx(); float3::dispatch::install_avx(); + mat44::dispatch::install_avx(); break; case backend::sse41: float4::dispatch::install_sse41(); float3::dispatch::install_sse41(); + mat44::dispatch::install_sse41(); break; case backend::sse2: float4::dispatch::install_sse2(); float3::dispatch::install_sse2(); + mat44::dispatch::install_sse2(); break; #elif defined(__aarch64__) || defined(__ARM_NEON) case backend::neon: diff --git a/Engine/source/ts/assimp/assimpAppNode.cpp b/Engine/source/ts/assimp/assimpAppNode.cpp index ff2e1ffd5..af448f940 100644 --- a/Engine/source/ts/assimp/assimpAppNode.cpp +++ b/Engine/source/ts/assimp/assimpAppNode.cpp @@ -250,7 +250,7 @@ MatrixF AssimpAppNode::getNodeTransform(F32 time) // Check for inverted node coordinate spaces => can happen when modelers // use the 'mirror' tool 
in their 3d app. Shows up as negative // transforms in the collada model. - if (m_matF_determinant(nodeTransform) < 0.0f) + if (nodeTransform.determinant() < 0.0f) { // Mark this node as inverted so we can mirror mesh geometry, then // de-invert the transform matrix diff --git a/Engine/source/ts/collada/colladaAppMesh.cpp b/Engine/source/ts/collada/colladaAppMesh.cpp index a944e2eef..4c4470654 100644 --- a/Engine/source/ts/collada/colladaAppMesh.cpp +++ b/Engine/source/ts/collada/colladaAppMesh.cpp @@ -1099,7 +1099,7 @@ void ColladaAppMesh::lookupSkinData() // Inverted node coordinate spaces (negative scale factor) are corrected // in ColladaAppNode::getNodeTransform, so need to apply the same operation // here to match - if (m_matF_determinant(invBind) < 0.0f) + if (invBind.determinant() < 0.0f) initialTransforms[iJoint].scale(Point3F(1, 1, -1)); initialTransforms[iJoint].mul(invBind); diff --git a/Engine/source/ts/collada/colladaAppNode.cpp b/Engine/source/ts/collada/colladaAppNode.cpp index 2ffc10fba..0cd8258c8 100644 --- a/Engine/source/ts/collada/colladaAppNode.cpp +++ b/Engine/source/ts/collada/colladaAppNode.cpp @@ -192,7 +192,7 @@ MatrixF ColladaAppNode::getNodeTransform(F32 time) // Check for inverted node coordinate spaces => can happen when modelers // use the 'mirror' tool in their 3d app. Shows up as negative // transforms in the collada model. - if (m_matF_determinant(nodeTransform) < 0.0f) + if (nodeTransform.determinant() < 0.0f) { // Mark this node as inverted so we can mirror mesh geometry, then // de-invert the transform matrix