diff --git a/Engine/source/CMakeLists.txt b/Engine/source/CMakeLists.txt
index 746bae1e7..d5f7aeaad 100644
--- a/Engine/source/CMakeLists.txt
+++ b/Engine/source/CMakeLists.txt
@@ -503,12 +503,17 @@ set(IS_ARM FALSE)
 
 if(ARCH MATCHES "x86_64|amd64|i[3-6]86")
     set(IS_X86 TRUE)
-elseif(ARCH MATCHES "arm64|aarch64")
+endif()
+
+if(ARCH MATCHES "arm64|aarch64|arm")
     set(IS_ARM TRUE)
 endif()
 
 # always available
 add_math_backend(scalar  MATH_SIMD_SCALAR)
+message(STATUS "Processor: ${CMAKE_SYSTEM_PROCESSOR}")
+message(STATUS "IS_X86=${IS_X86}")
+message(STATUS "IS_ARM=${IS_ARM}")
 
 # x86 family
 if(IS_X86)
diff --git a/Engine/source/gfx/gfxDrawUtil.cpp b/Engine/source/gfx/gfxDrawUtil.cpp
index 16b886a55..3dce4b3af 100644
--- a/Engine/source/gfx/gfxDrawUtil.cpp
+++ b/Engine/source/gfx/gfxDrawUtil.cpp
@@ -853,7 +853,11 @@ void GFXDrawUtil::drawThickLine(F32 x1, F32 y1, F32 z1, F32 x2, F32 y2, F32 z2,
 // 3D World Draw Misc
 //-----------------------------------------------------------------------------
 
-static SphereMesh gSphere;
+SphereMesh& getSphere()
+{
+   static SphereMesh instance;
+   return instance;
+}
 
 void GFXDrawUtil::drawSphere( const GFXStateBlockDesc &desc, F32 radius, const Point3F &pos, const ColorI &color, bool drawTop, bool drawBottom, const MatrixF *xfm )
 {
@@ -868,7 +872,7 @@ void GFXDrawUtil::drawSphere( const GFXStateBlockDesc &desc, F32 radius, const P
    GFX->pushWorldMatrix();
    GFX->multWorld(mat);
 
-   const SphereMesh::TriangleMesh * sphereMesh = gSphere.getMesh(2);
+   const SphereMesh::TriangleMesh * sphereMesh = getSphere().getMesh(2);
    S32 numPoly = sphereMesh->numPoly;
    S32 totalPoly = 0;
    GFXVertexBufferHandle<GFXVertexPCT> verts(mDevice, numPoly*3, GFXBufferTypeVolatile);
diff --git a/Engine/source/math/impl/float3_impl.inl b/Engine/source/math/impl/float3_impl.inl
new file mode 100644
index 000000000..87b325faf
--- /dev/null
+++ b/Engine/source/math/impl/float3_impl.inl
@@ -0,0 +1,123 @@
+#pragma once
+#include <cmath>   // for sqrtf, etc.
+#include "../mConstants.h"
+
+// Safely loads a float3 -> simd 4 lane backend
+namespace math_backend::float3
+{
+   //----------------------------------------------------------
+   // Add two float4 vectors: r = a + b
+   inline void float3_add_impl(const float* a, const float* b, float* r)
+   {
+      f32x4 va = v_load3_vec(a);
+      f32x4 vb = v_load3_vec(b);
+      f32x4 vr = v_add(va, vb);
+      v_store3(r, vr);
+   }
+
+   // Subtract: r = a - b
+   inline void float3_sub_impl(const float* a, const float* b, float* r)
+   {
+      f32x4 va = v_load3_vec(a);
+      f32x4 vb = v_load3_vec(b);
+      f32x4 vr = v_sub(va, vb);
+      v_store3(r, vr);
+   }
+
+   // Multiply element-wise: r = a * b
+   inline void float3_mul_impl(const float* a, const float* b, float* r)
+   {
+      f32x4 va = v_load3_vec(a);
+      f32x4 vb = v_load3_vec(b);
+      f32x4 vr = v_mul(va, vb);
+      v_store3(r, vr);
+   }
+
+   // Multiply by scalar: r = a * s
+   inline void float3_mul_scalar_impl(const float* a, float s, float* r)
+   {
+      f32x4 va = v_load3_vec(a);
+      f32x4 vs = v_set1(s);
+      f32x4 vr = v_mul(va, vs);
+      v_store3(r, vr);
+   }
+
+   // Divide element-wise: r = a / b
+   inline void float3_div_impl(const float* a, const float* b, float* r)
+   {
+      f32x4 va = v_load3_vec(a);
+      f32x4 vb = v_load3_vec(b);
+      f32x4 vr = v_div(va, vb);
+      v_store3(r, vr);
+   }
+
+   // Divide by scalar: r = a / s
+   inline void float3_div_scalar_impl(const float* a, float s, float* r)
+   {
+      f32x4 va = v_load3_vec(a);
+      f32x4 vs = v_set1(s);
+      f32x4 vr = v_div(va, vs);
+      v_store3(r, vr);
+   }
+
+   // Dot product: returns scalar
+   inline float float3_dot_impl(const float* a, const float* b)
+   {
+      f32x4 va = v_load3_vec(a);
+      f32x4 vb = v_load3_vec(b);
+      f32x4 vdot = v_dot3(va, vb);
+      return v_extract0(vdot); // first lane is the sum of 3 elements
+   }
+
+   // Length squared
+   inline float float3_length_squared_impl(const float* a)
+   {
+      return float3_dot_impl(a, a);
+   }
+
+   // Length
+   inline float float3_length_impl(const float* a)
+   {
+      return std::sqrt(float3_length_squared_impl(a));
+   }
+
+   // Normalize in-place
+   inline void float3_normalize_impl(float* a)
+   {
+      f32x4 va = v_load3_vec(a);
+      f32x4 invLen = v_rsqrt_nr(v_dot3(va, va)); // fully abstracted
+      f32x4 vnorm = v_mul(va, invLen);
+      v_store3(a, vnorm);
+   }
+
+   // Normalize with magnitude: r = normalize(a) * r
+   inline void float3_normalize_mag_impl(float* a, float r)
+   {
+      f32x4 va = v_load3_vec(a);
+
+      // invLen = r / sqrt(dot(a,a)) = r * rsqrt(dot(a,a))
+      f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot3(va, va)));
+
+      f32x4 vnorm = v_mul(va, invLen);
+      v_store(a, vnorm);
+   }
+
+   // Linear interpolation: r = from + (to - from) * f
+   inline void float3_lerp_impl(const float* from, const float* to, float f, float* r)
+   {
+      f32x4 vfrom = v_load3_vec(from);
+      f32x4 vto = v_load3_vec(to);
+      f32x4 vf = v_set1(f);
+      f32x4 vr = v_add(vfrom, v_mul(vf, v_sub(vto, vfrom)));
+      v_store3(r, vr);
+   }
+
+   inline void float3_cross_impl(const float* a, const float* b, float* r)
+   {
+      f32x4 va = v_load3_vec(a);
+      f32x4 vb = v_load3_vec(b);
+      f32x4 vcross = v_cross(va, vb);
+      v_store3(r, vcross);
+   }
+
+}
diff --git a/Engine/source/math/impl/float4_c.cpp b/Engine/source/math/impl/float4_c.cpp
deleted file mode 100644
index b3d6559f9..000000000
--- a/Engine/source/math/impl/float4_c.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-#include "math/public/float4_dispatch.h"
-#include "math/mConstants.h"
-#include <math.h>
-
-namespace math_backend::float4::dispatch
-{
-   void install_scalar()
-   {
-      gFloat4.add = [](const float* a, const float* b, float* r) {
-         for (int i = 0; i < 4; i++) r[i] = a[i] + b[i];
-      };
-
-      gFloat4.sub = [](const float* a, const float* b, float* r) {
-         for (int i = 0; i < 4; i++) r[i] = a[i] - b[i];
-      };
-
-      gFloat4.mul = [](const float* a, const float* b, float* r) {
-         for (int i = 0; i < 4; i++) r[i] = a[i] * b[i];
-      };
-
-      gFloat4.mul_scalar = [](const float* a, float s, float* r) {
-         for (int i = 0; i < 4; i++) r[i] = a[i] * s;
-      };
-
-      gFloat4.div = [](const float* a, const float* b, float* r) {
-         for (int i = 0; i < 4; i++) r[i] = a[i] / b[i];
-      };
-
-      gFloat4.div_scalar = [](const float* a, float s, float* r) {
-         for (int i = 0; i < 4; i++) r[i] = a[i] / s;
-      };
-
-      gFloat4.dot = [](const float* a, const float* b) {
-         float sum = 0.f;
-         for (int i = 0; i < 4; i++) sum += a[i] * b[i];
-         return sum;
-      };
-
-      gFloat4.length = [](const float* a) {
-         float sum = 0.f;
-         for (int i = 0; i < 4; i++) sum += a[i] * a[i];
-         return sqrtf(sum);
-      };
-
-      gFloat4.lengthSquared = [](const float* a) {
-         float sum = 0.f;
-         for (int i = 0; i < 4; i++) sum += a[i] * a[i];
-         return (sum);
-      };
-
-      gFloat4.normalize = [](float* a) {
-         float len = gFloat4.length(a);
-         if (len > POINT_EPSILON) for (int i = 0; i < 4; i++) a[i] /= len;
-      };
-
-      gFloat4.lerp = [](const float* from, const float* to, float f, float* r) {
-         for (int i = 0; i < 4; i++) r[i] = from[i] + (to[i] - from[i]) * f;
-      };
-   }
-}
diff --git a/Engine/source/math/impl/float4_impl.inl b/Engine/source/math/impl/float4_impl.inl
index cb61ed4fc..9371ae7c3 100644
--- a/Engine/source/math/impl/float4_impl.inl
+++ b/Engine/source/math/impl/float4_impl.inl
@@ -65,8 +65,8 @@ namespace math_backend::float4
    {
       f32x4 va = v_load(a);
       f32x4 vb = v_load(b);
-      f32x4 vmul = v_mul(va, vb);
-      return v_hadd4(vmul);
+      f32x4 vdot = v_dot4(va, vb);   // calls ISA-specific implementation
+      return v_extract0(vdot);
    }
 
    // Length squared
@@ -84,21 +84,22 @@ namespace math_backend::float4
    // Normalize in-place
    inline void float4_normalize_impl(float* a)
    {
-      float len = float4_length_impl(a);
-      if (len > POINT_EPSILON) // safe threshold
-      {
-         float4_mul_scalar_impl(a, 1.0f / len, a);
-      }
+      f32x4 va = v_load(a);
+      f32x4 invLen = v_rsqrt_nr(v_dot4(va, va)); // fully abstracted
+      f32x4 vnorm = v_mul(va, invLen);
+      v_store(a, vnorm);
    }
 
    // Normalize with magnitude: r = normalize(a) * r
    inline void float4_normalize_mag_impl(float* a, float r)
    {
-      float len = float4_length_impl(a);
-      if (len > POINT_EPSILON)
-      {
-         float4_mul_scalar_impl(a, r / len, a);
-      }
+      f32x4 va = v_load(a);
+
+      // invLen = r / sqrt(dot(a,a)) = r * rsqrt(dot(a,a))
+      f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot4(va, va)));
+
+      f32x4 vnorm = v_mul(va, invLen);
+      v_store(a, vnorm);
    }
 
    // Linear interpolation: r = from + (to - from) * f
@@ -111,4 +112,12 @@ namespace math_backend::float4
       v_store(r, vr);
    }
 
+   inline void float4_cross_impl(const float* a, const float* b, float* r)
+   {
+      f32x4 va = v_load(a);
+      f32x4 vb = v_load(b);
+      f32x4 vcross = v_cross(va, vb);
+      v_store(r, vcross);
+   }
+
 } // namespace math_backend::float4
diff --git a/Engine/source/math/impl/math_c.cpp b/Engine/source/math/impl/math_c.cpp
new file mode 100644
index 000000000..7d8a317ba
--- /dev/null
+++ b/Engine/source/math/impl/math_c.cpp
@@ -0,0 +1,208 @@
+#include "math/public/float4_dispatch.h"
+#include "math/public/float3_dispatch.h"
+#include "math/public/mat44_dispatch.h"
+#include "math/mConstants.h"
+#include <cmath>   // for sqrtf, etc.
+
+namespace math_backend::float4::dispatch
+{
+   void install_scalar()
+   {
+      gFloat4.add = [](const float* a, const float* b, float* r) {
+         for (int i = 0; i < 4; i++) r[i] = a[i] + b[i];
+      };
+
+      gFloat4.sub = [](const float* a, const float* b, float* r) {
+         for (int i = 0; i < 4; i++) r[i] = a[i] - b[i];
+      };
+
+      gFloat4.mul = [](const float* a, const float* b, float* r) {
+         for (int i = 0; i < 4; i++) r[i] = a[i] * b[i];
+      };
+
+      gFloat4.mul_scalar = [](const float* a, float s, float* r) {
+         for (int i = 0; i < 4; i++) r[i] = a[i] * s;
+      };
+
+      gFloat4.div = [](const float* a, const float* b, float* r) {
+         for (int i = 0; i < 4; i++) r[i] = a[i] / b[i];
+      };
+
+      gFloat4.div_scalar = [](const float* a, float s, float* r) {
+         float denom = 1.0f / s;
+         for (int i = 0; i < 4; i++) r[i] = a[i] * denom;
+      };
+
+      gFloat4.dot = [](const float* a, const float* b) {
+         float sum = 0.f;
+         for (int i = 0; i < 4; i++) sum += a[i] * b[i];
+         return sum;
+      };
+
+      gFloat4.length = [](const float* a) {
+         float sum = 0.f;
+         for (int i = 0; i < 4; i++) sum += a[i] * a[i];
+         return std::sqrt(sum);
+      };
+
+      gFloat4.lengthSquared = [](const float* a) {
+         float sum = 0.f;
+         for (int i = 0; i < 4; i++) sum += a[i] * a[i];
+         return (sum);
+      };
+
+      gFloat4.normalize = [](float* a) {
+         float len = gFloat4.length(a);
+         if (len > POINT_EPSILON)
+         {
+            float denom = 1.0f / len;
+            for (int i = 0; i < 4; i++)
+               a[i] *= denom;
+         }
+      };
+
+      gFloat4.normalize_mag = [](float* a, float f) {
+         float len = gFloat4.length(a);
+         if (len > POINT_EPSILON)
+         {
+            float denom = f / len;
+            for (int i = 0; i < 4; i++) a[i] *= denom;
+         }
+      };
+
+      gFloat4.lerp = [](const float* from, const float* to, float f, float* r) {
+         for (int i = 0; i < 4; i++) r[i] = from[i] + (to[i] - from[i]) * f;
+      };
+
+      gFloat4.cross = [](const float* a, const float* b, float* r) {
+         const float ax = a[0];
+         const float ay = a[1];
+         const float az = a[2];
+
+         const float bx = b[0];
+         const float by = b[1];
+         const float bz = b[2];
+
+         r[0] = ay * bz - az * by;
+         r[1] = az * bx - ax * bz;
+         r[2] = ax * by - ay * bx;
+      };
+   }
+}
+
+namespace math_backend::float3::dispatch
+{
+   void install_scalar()
+   {
+      gFloat3.add = [](const float* a, const float* b, float* r) {
+         for (int i = 0; i < 3; i++) r[i] = a[i] + b[i];
+      };
+
+      gFloat3.sub = [](const float* a, const float* b, float* r) {
+         for (int i = 0; i < 3; i++) r[i] = a[i] - b[i];
+      };
+
+      gFloat3.mul = [](const float* a, const float* b, float* r) {
+         for (int i = 0; i < 3; i++) r[i] = a[i] * b[i];
+      };
+
+      gFloat3.mul_scalar = [](const float* a, float s, float* r) {
+         for (int i = 0; i < 3; i++) r[i] = a[i] * s;
+      };
+
+      gFloat3.div = [](const float* a, const float* b, float* r) {
+         for (int i = 0; i < 3; i++) r[i] = a[i] / b[i];
+      };
+
+      gFloat3.div_scalar = [](const float* a, float s, float* r) {
+         float denom = 1.0f / s;
+         for (int i = 0; i < 3; i++) r[i] = a[i] * denom;
+      };
+
+      gFloat3.dot = [](const float* a, const float* b) {
+         float sum = 0.f;
+         for (int i = 0; i < 3; i++) sum += a[i] * b[i];
+         return sum;
+      };
+
+      gFloat3.length = [](const float* a) {
+         float sum = 0.f;
+         for (int i = 0; i < 3; i++) sum += a[i] * a[i];
+         return std::sqrt(sum);
+      };
+
+      gFloat3.lengthSquared = [](const float* a) {
+         float sum = 0.f;
+         for (int i = 0; i < 3; i++) sum += a[i] * a[i];
+         return (sum);
+      };
+
+      gFloat3.normalize = [](float* a) {
+         float len = gFloat3.length(a);
+         if (len > POINT_EPSILON)
+         {
+            float denom = 1.0 / len;
+            for (int i = 0; i < 3; i++) a[i] *= denom;
+         }
+      };
+
+      gFloat3.normalize_mag = [](float* a, float f) {
+         float len = gFloat3.length(a);
+         if (len > POINT_EPSILON)
+         {
+            float denom = f / len;
+            for (int i = 0; i < 3; i++) a[i] *= denom;
+         }
+      };
+
+      gFloat3.lerp = [](const float* from, const float* to, float f, float* r) {
+         for (int i = 0; i < 3; i++) r[i] = from[i] + (to[i] - from[i]) * f;
+      };
+
+      gFloat3.cross = [](const float* a, const float* b, float* r) {
+         const float ax = a[0];
+         const float ay = a[1];
+         const float az = a[2];
+
+         const float bx = b[0];
+         const float by = b[1];
+         const float bz = b[2];
+
+         r[0] = ay * bz - az * by;
+         r[1] = az * bx - ax * bz;
+         r[2] = ax * by - ay * bx;
+      };
+   }
+}
+
+inline void swap(float& a, float& b)
+{
+   float temp = a;
+   a = b;
+   b = temp;
+}
+
+
+namespace math_backend::mat44::dispatch
+{
+   void install_scalar()
+   {
+      gMat44.transpose = [](float* a) {
+         swap(a[1], a[4]);
+         swap(a[2], a[8]);
+         swap(a[3], a[12]);
+         swap(a[6], a[9]);
+         swap(a[7], a[13]);
+         swap(a[11], a[14]);
+      };
+
+      gMat44.scale = [](float* a, const float* s) {
+         // Note, doesn't allow scaling w...
+
+         a[0] *= s[0];  a[1] *= s[1];  a[2] *= s[2];
+         a[4] *= s[0];  a[5] *= s[1];  a[6] *= s[2];
+         a[8] *= s[0];  a[9] *= s[1];  a[10] *= s[2];
+         a[12] *= s[0];  a[13] *= s[1];  a[14] *= s[2];
+      };
+   }
+}
diff --git a/Engine/source/math/isa/avx/avx_intrinsics.h b/Engine/source/math/isa/avx/avx_intrinsics.h
new file mode 100644
index 000000000..0340d84a2
--- /dev/null
+++ b/Engine/source/math/isa/avx/avx_intrinsics.h
@@ -0,0 +1,140 @@
+#pragma once
+#include <immintrin.h> // AVX/AVX2 intrinsics
+
+namespace
+{
+   typedef __m128 f32x4;
+
+   //------------------------------------------------------
+   // Load / Store
+   //------------------------------------------------------
+
+   // Load 4 floats from memory into a SIMD register
+   inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }
+
+   inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }
+
+   inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
+
+   inline f32x4 v_zero() { return _mm_setzero_ps(); }
+
+   inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }
+
+   //------------------------------------------------------
+   // Mask helpers
+   //------------------------------------------------------
+
+   inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); }
+
+   inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
+   {
+      return _mm_blend_ps(newv, original, 0b1000);
+   }
+
+   //------------------------------------------------------
+   // Float3 helpers (safe loading into 4 lanes)
+   //------------------------------------------------------
+
+   inline f32x4 v_load3_vec(const float* p) // w = 0
+   {
+      return _mm_set_ps(0.0f, p[2], p[1], p[0]);
+   }
+
+   inline f32x4 v_load3_pos(const float* p) // w = 1
+   {
+      return _mm_set_ps(1.0f, p[2], p[1], p[0]);
+   }
+
+   inline void v_store3(float* dst, f32x4 v)
+   {
+      alignas(16) float tmp[4];   // temp storage
+      _mm_store_ps(tmp, v);        // store all 4 lanes
+      dst[0] = tmp[0];
+      dst[1] = tmp[1];
+      dst[2] = tmp[2];
+   }
+
+   //------------------------------------------------------
+   // Simple Arithmatic
+   //------------------------------------------------------
+
+   // Element-wise multiply
+   inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
+
+   // Element-wise divide
+   inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
+
+   // Element-wise add
+   inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
+
+   // Element-wise subtract
+   inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
+
+   //------------------------------------------------------
+   // Fast recip
+   //------------------------------------------------------
+
+   // Fast recip 1/b
+   inline f32x4 v_rcp_nr(f32x4 b)
+   {
+      f32x4 r = _mm_rcp_ps(b);
+      f32x4 two = _mm_set1_ps(2.0f);
+      return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r)));
+   }
+
+   // Divide fast ( b = recip eg 1/b)
+   inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); }
+
+   inline f32x4 v_rsqrt_nr(f32x4 x)
+   {
+      f32x4 r = _mm_rsqrt_ps(x);
+
+      f32x4 half = _mm_set1_ps(0.5f);
+      f32x4 three = _mm_set1_ps(3.0f);
+
+      r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r)));
+
+      return _mm_mul_ps(r, half);
+   }
+
+   //------------------------------------------------------
+   // Vector intrinsic functions
+   //------------------------------------------------------
+
+   // full dot4
+   inline f32x4 v_dot4(f32x4 a, f32x4 b)
+   {
+      return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1
+   }
+
+   // dot3 (ignores w)
+   inline f32x4 v_dot3(f32x4 a, f32x4 b)
+   {
+      return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1
+   }
+
+   // cross product xyz only.
+   inline f32x4 v_cross(f32x4 a, f32x4 b)
+   {
+      f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
+      f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
+
+      f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b));
+
+      return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
+   }
+
+   inline f32x4 v_normalize3(f32x4 v)
+   {
+      f32x4 inv = v_rsqrt_nr(v_dot3(v, v));
+      return _mm_mul_ps(v, inv);
+   }
+
+   // adds all 4 lanes together.
+   inline f32x4 v_hadd4(f32x4 a)
+   {
+      // sum all 4 lanes in SSE41
+      __m128 sum = _mm_hadd_ps(a, a);
+      return _mm_hadd_ps(sum, sum);
+   }
+}
diff --git a/Engine/source/math/isa/avx/float3.cpp b/Engine/source/math/isa/avx/float3.cpp
new file mode 100644
index 000000000..7829f292c
--- /dev/null
+++ b/Engine/source/math/isa/avx/float3.cpp
@@ -0,0 +1,26 @@
+#include "avx_intrinsics.h"
+#include "float3_dispatch.h"
+#include <immintrin.h> // AVX/AVX2 intrinsics
+
+#include "float3_impl.inl"
+
+namespace math_backend::float3::dispatch
+{
+   // Install AVX backend
+   void install_avx()
+   {
+      gFloat3.add             = float3_add_impl;
+      gFloat3.sub             = float3_sub_impl;
+      gFloat3.mul             = float3_mul_impl;
+      gFloat3.mul_scalar      = float3_mul_scalar_impl;
+      gFloat3.div             = float3_div_impl;
+      gFloat3.div_scalar      = float3_div_scalar_impl;
+      gFloat3.dot             = float3_dot_impl;
+      gFloat3.length          = float3_length_impl;
+      gFloat3.lengthSquared   = float3_length_squared_impl;
+      gFloat3.normalize       = float3_normalize_impl;
+      gFloat3.normalize_mag   = float3_normalize_mag_impl;
+      gFloat3.lerp            = float3_lerp_impl;
+      gFloat3.cross           = float3_cross_impl;
+   }
+}
diff --git a/Engine/source/math/isa/avx/float4.cpp b/Engine/source/math/isa/avx/float4.cpp
index 1e23fb8b1..cfdab0908 100644
--- a/Engine/source/math/isa/avx/float4.cpp
+++ b/Engine/source/math/isa/avx/float4.cpp
@@ -1,49 +1,5 @@
-
+#include "avx_intrinsics.h"
 #include "float4_dispatch.h"
-#include <immintrin.h> // AVX/AVX2 intrinsics
-
-namespace
-{
-   typedef __m128 f32x4;
-
-   // Load 4 floats from memory into a SIMD register
-   inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }
-
-   // Store 4 floats from SIMD register back to memory
-   inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }
-
-   // Broadcast a single float across all 4 lanes
-   inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
-
-   // Element-wise multiply
-   inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
-
-   // Element-wise divide
-   inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
-
-   // Element-wise add
-   inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
-
-   // Element-wise subtract
-   inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
-
-   // Horizontal sum of all 4 elements (for dot product, length, etc.)
-   inline float v_hadd4(f32x4 a)
-   {
-      __m128 t1 = _mm_hadd_ps(a, a);  // sums pairs: [a0+a1, a2+a3, ...]
-      __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3
-      return _mm_cvtss_f32(t2);       // extract first element
-   }
-
-   // specialized dot product for AVX
-   float float4_dot_avx(const float* a, const float* b)
-   {
-      f32x4 va = _mm_loadu_ps(a);
-      f32x4 vb = _mm_loadu_ps(b);
-      __m128 dp = _mm_dp_ps(va, vb, 0xF1); // multiply all 4, sum all 4, lowest lane
-      return _mm_cvtss_f32(dp);
-   }
-}
 
 #include "float4_impl.inl"
 
@@ -52,16 +8,18 @@ namespace math_backend::float4::dispatch
    // Install AVX backend
    void install_avx()
    {
-      gFloat4.add = float4_add_impl;
-      gFloat4.sub = float4_sub_impl;
-      gFloat4.mul = float4_mul_impl;
-      gFloat4.mul_scalar = float4_mul_scalar_impl;
-      gFloat4.div = float4_div_impl;
-      gFloat4.div_scalar = float4_div_scalar_impl;
-      gFloat4.dot = float4_dot_avx;
-      gFloat4.length = float4_length_impl;
+      gFloat4.add          = float4_add_impl;
+      gFloat4.sub          = float4_sub_impl;
+      gFloat4.mul          = float4_mul_impl;
+      gFloat4.mul_scalar   = float4_mul_scalar_impl;
+      gFloat4.div          = float4_div_impl;
+      gFloat4.div_scalar   = float4_div_scalar_impl;
+      gFloat4.dot          = float4_dot_impl;
+      gFloat4.length       = float4_length_impl;
       gFloat4.lengthSquared = float4_length_squared_impl;
-      gFloat4.normalize = float4_normalize_impl;
-      gFloat4.lerp = float4_lerp_impl;
+      gFloat4.normalize    = float4_normalize_impl;
+      gFloat4.normalize_mag = float4_normalize_mag_impl;
+      gFloat4.lerp         = float4_lerp_impl;
+      gFloat4.cross        = float4_cross_impl;
    }
 }
diff --git a/Engine/source/math/isa/avx2/avx2_intrinsics.h b/Engine/source/math/isa/avx2/avx2_intrinsics.h
new file mode 100644
index 000000000..0340d84a2
--- /dev/null
+++ b/Engine/source/math/isa/avx2/avx2_intrinsics.h
@@ -0,0 +1,140 @@
+#pragma once
+#include <immintrin.h> // AVX/AVX2 intrinsics
+
+namespace
+{
+   typedef __m128 f32x4;
+
+   //------------------------------------------------------
+   // Load / Store
+   //------------------------------------------------------
+
+   // Load 4 floats from memory into a SIMD register
+   inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }
+
+   inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }
+
+   inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
+
+   inline f32x4 v_zero() { return _mm_setzero_ps(); }
+
+   inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }
+
+   //------------------------------------------------------
+   // Mask helpers
+   //------------------------------------------------------
+
+   inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); }
+
+   inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
+   {
+      return _mm_blend_ps(newv, original, 0b1000);
+   }
+
+   //------------------------------------------------------
+   // Float3 helpers (safe loading into 4 lanes)
+   //------------------------------------------------------
+
+   inline f32x4 v_load3_vec(const float* p) // w = 0
+   {
+      return _mm_set_ps(0.0f, p[2], p[1], p[0]);
+   }
+
+   inline f32x4 v_load3_pos(const float* p) // w = 1
+   {
+      return _mm_set_ps(1.0f, p[2], p[1], p[0]);
+   }
+
+   inline void v_store3(float* dst, f32x4 v)
+   {
+      alignas(16) float tmp[4];   // temp storage
+      _mm_store_ps(tmp, v);        // store all 4 lanes
+      dst[0] = tmp[0];
+      dst[1] = tmp[1];
+      dst[2] = tmp[2];
+   }
+
+   //------------------------------------------------------
+   // Simple Arithmatic
+   //------------------------------------------------------
+
+   // Element-wise multiply
+   inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
+
+   // Element-wise divide
+   inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
+
+   // Element-wise add
+   inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
+
+   // Element-wise subtract
+   inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
+
+   //------------------------------------------------------
+   // Fast recip
+   //------------------------------------------------------
+
+   // Fast recip 1/b
+   inline f32x4 v_rcp_nr(f32x4 b)
+   {
+      f32x4 r = _mm_rcp_ps(b);
+      f32x4 two = _mm_set1_ps(2.0f);
+      return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r)));
+   }
+
+   // Divide fast ( b = recip eg 1/b)
+   inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); }
+
+   inline f32x4 v_rsqrt_nr(f32x4 x)
+   {
+      f32x4 r = _mm_rsqrt_ps(x);
+
+      f32x4 half = _mm_set1_ps(0.5f);
+      f32x4 three = _mm_set1_ps(3.0f);
+
+      r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r)));
+
+      return _mm_mul_ps(r, half);
+   }
+
+   //------------------------------------------------------
+   // Vector intrinsic functions
+   //------------------------------------------------------
+
+   // full dot4
+   inline f32x4 v_dot4(f32x4 a, f32x4 b)
+   {
+      return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1
+   }
+
+   // dot3 (ignores w)
+   inline f32x4 v_dot3(f32x4 a, f32x4 b)
+   {
+      return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1
+   }
+
+   // cross product xyz only.
+   inline f32x4 v_cross(f32x4 a, f32x4 b)
+   {
+      f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
+      f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
+
+      f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b));
+
+      return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
+   }
+
+   inline f32x4 v_normalize3(f32x4 v)
+   {
+      f32x4 inv = v_rsqrt_nr(v_dot3(v, v));
+      return _mm_mul_ps(v, inv);
+   }
+
+   // adds all 4 lanes together.
+   inline f32x4 v_hadd4(f32x4 a)
+   {
+      // sum all 4 lanes in SSE41
+      __m128 sum = _mm_hadd_ps(a, a);
+      return _mm_hadd_ps(sum, sum);
+   }
+}
diff --git a/Engine/source/math/isa/avx2/float3.cpp b/Engine/source/math/isa/avx2/float3.cpp
new file mode 100644
index 000000000..fd0d485f6
--- /dev/null
+++ b/Engine/source/math/isa/avx2/float3.cpp
@@ -0,0 +1,26 @@
+#include "avx2_intrinsics.h"
+#include "float3_dispatch.h"
+#include <immintrin.h> // AVX/AVX2 intrinsics
+
+#include "float3_impl.inl"
+
+namespace math_backend::float3::dispatch
+{
+   // Install AVX2 backend
+   void install_avx2()
+   {
+      gFloat3.add             = float3_add_impl;
+      gFloat3.sub             = float3_sub_impl;
+      gFloat3.mul             = float3_mul_impl;
+      gFloat3.mul_scalar      = float3_mul_scalar_impl;
+      gFloat3.div             = float3_div_impl;
+      gFloat3.div_scalar      = float3_div_scalar_impl;
+      gFloat3.dot             = float3_dot_impl;
+      gFloat3.length          = float3_length_impl;
+      gFloat3.lengthSquared   = float3_length_squared_impl;
+      gFloat3.normalize       = float3_normalize_impl;
+      gFloat3.normalize_mag   = float3_normalize_mag_impl;
+      gFloat3.lerp            = float3_lerp_impl;
+      gFloat3.cross           = float3_cross_impl;
+   }
+}
diff --git a/Engine/source/math/isa/avx2/float4.cpp b/Engine/source/math/isa/avx2/float4.cpp
index 439d0e2d0..85228caaf 100644
--- a/Engine/source/math/isa/avx2/float4.cpp
+++ b/Engine/source/math/isa/avx2/float4.cpp
@@ -1,49 +1,5 @@
-
+#include "avx2_intrinsics.h"
 #include "float4_dispatch.h"
-#include <immintrin.h> // AVX/AVX2 intrinsics
-
-namespace
-{
-   typedef __m128 f32x4;
-
-   // Load 4 floats from memory into a SIMD register
-   inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }
-
-   // Store 4 floats from SIMD register back to memory
-   inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }
-
-   // Broadcast a single float across all 4 lanes
-   inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
-
-   // Element-wise multiply
-   inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
-
-   // Element-wise divide
-   inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
-
-   // Element-wise add
-   inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
-
-   // Element-wise subtract
-   inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
-
-   // Horizontal sum of all 4 elements (for dot product, length, etc.)
-   inline float v_hadd4(f32x4 a)
-   {
-      __m128 t1 = _mm_hadd_ps(a, a);  // sums pairs: [a0+a1, a2+a3, ...]
-      __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3
-      return _mm_cvtss_f32(t2);       // extract first element
-   }
-
-   // specialized dot product for AVX
-   float float4_dot_avx(const float* a, const float* b)
-   {
-      f32x4 va = _mm_loadu_ps(a);
-      f32x4 vb = _mm_loadu_ps(b);
-      __m128 dp = _mm_dp_ps(va, vb, 0xF1); // multiply all 4, sum all 4, lowest lane
-      return _mm_cvtss_f32(dp);
-   }
-}
 
 #include "float4_impl.inl"
 
@@ -58,10 +14,12 @@ namespace math_backend::float4::dispatch
       gFloat4.mul_scalar   = float4_mul_scalar_impl;
       gFloat4.div          = float4_div_impl;
       gFloat4.div_scalar   = float4_div_scalar_impl;
-      gFloat4.dot          = float4_dot_avx;
+      gFloat4.dot          = float4_dot_impl;
       gFloat4.length       = float4_length_impl;
       gFloat4.lengthSquared = float4_length_squared_impl;
       gFloat4.normalize    = float4_normalize_impl;
+      gFloat4.normalize_mag = float4_normalize_mag_impl;
       gFloat4.lerp         = float4_lerp_impl;
+      gFloat4.cross        = float4_cross_impl;
    }
 }
diff --git a/Engine/source/math/isa/neon/float3.cpp b/Engine/source/math/isa/neon/float3.cpp
new file mode 100644
index 000000000..904787932
--- /dev/null
+++ b/Engine/source/math/isa/neon/float3.cpp
@@ -0,0 +1,25 @@
+#include "neon_intrinsics.h"
+#include "float3_dispatch.h"
+
+#include "float3_impl.inl"
+
+namespace math_backend::float3::dispatch
+{
+    // Install NEON backend
+    void install_neon()
+    {
+        gFloat3.add = float3_add_impl;
+        gFloat3.sub = float3_sub_impl;
+        gFloat3.mul = float3_mul_impl;
+        gFloat3.mul_scalar = float3_mul_scalar_impl;
+        gFloat3.div = float3_div_impl;
+        gFloat3.div_scalar = float3_div_scalar_impl;
+        gFloat3.dot = float3_dot_impl;
+        gFloat3.length = float3_length_impl;
+        gFloat3.lengthSquared = float3_length_squared_impl;
+        gFloat3.normalize = float3_normalize_impl;
+        gFloat3.normalize_mag = float3_normalize_mag_impl;
+        gFloat3.lerp = float3_lerp_impl;
+        gFloat3.cross = float3_cross_impl;
+    }
+}
diff --git a/Engine/source/math/isa/neon/float4.cpp b/Engine/source/math/isa/neon/float4.cpp
index 6258db743..6996f2a76 100644
--- a/Engine/source/math/isa/neon/float4.cpp
+++ b/Engine/source/math/isa/neon/float4.cpp
@@ -1,50 +1,25 @@
+#include "neon_intrinsics.h"
 #include "float4_dispatch.h"
-#include <arm_neon.h>
 
-namespace
-{
-    typedef float32x4_t f32x4;
-
-    inline f32x4 v_load(const float* p)        { return vld1q_f32(p); }
-    inline void  v_store(float* dst, f32x4 v)  { vst1q_f32(dst, v); }
-    inline f32x4 v_set1(float s)               { return vdupq_n_f32(s); }
-
-    inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); }
-    inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); }
-    inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); }
-
-    // AArch64 native divide
-    inline f32x4 v_div(f32x4 a, f32x4 b)
-    {
-        return vdivq_f32(a, b);
-    }
-
-    inline float v_hadd4(f32x4 a)
-    {
-        float32x2_t low  = vget_low_f32(a);
-        float32x2_t high = vget_high_f32(a);
-        float32x2_t sum  = vadd_f32(low, high);
-        sum = vpadd_f32(sum, sum);
-        return vget_lane_f32(sum, 0);
-    }
-}
-
-#include "../../impl/float4_impl.inl"
+#include "float4_impl.inl"
 
 namespace math_backend::float4::dispatch
 {
+    // Install NEON64 backend
     void install_neon()
     {
-        gFloat4.add           = float4_add_impl;
-        gFloat4.sub           = float4_sub_impl;
-        gFloat4.mul           = float4_mul_impl;
-        gFloat4.mul_scalar    = float4_mul_scalar_impl;
-        gFloat4.div           = float4_div_impl;
-        gFloat4.div_scalar    = float4_div_scalar_impl;
-        gFloat4.dot           = float4_dot_impl;
-        gFloat4.length        = float4_length_impl;
+        gFloat4.add          = float4_add_impl;
+        gFloat4.sub          = float4_sub_impl;
+        gFloat4.mul          = float4_mul_impl;
+        gFloat4.mul_scalar   = float4_mul_scalar_impl;
+        gFloat4.div          = float4_div_impl;
+        gFloat4.div_scalar   = float4_div_scalar_impl;
+        gFloat4.dot          = float4_dot_impl;
+        gFloat4.length       = float4_length_impl;
         gFloat4.lengthSquared = float4_length_squared_impl;
-        gFloat4.normalize     = float4_normalize_impl;
-        gFloat4.lerp          = float4_lerp_impl;
+        gFloat4.normalize    = float4_normalize_impl;
+        gFloat4.normalize_mag = float4_normalize_mag_impl;
+        gFloat4.lerp         = float4_lerp_impl;
+        gFloat4.cross        = float4_cross_impl;
     }
 }
diff --git a/Engine/source/math/isa/neon/neon_intrinsics.h b/Engine/source/math/isa/neon/neon_intrinsics.h
new file mode 100644
index 000000000..3476fab1b
--- /dev/null
+++ b/Engine/source/math/isa/neon/neon_intrinsics.h
@@ -0,0 +1,130 @@
+#pragma once
+#include <arm_neon.h>
+
+namespace
+{
+   typedef float32x4_t f32x4;
+
+   //------------------------------------------------------
+   // Load / Store
+   //------------------------------------------------------
+   inline f32x4 v_load(const float* p) { return vld1q_f32(p); }
+   inline void v_store(float* dst, f32x4 v) { vst1q_f32(dst, v); }
+   inline f32x4 v_set1(float s) { return vdupq_n_f32(s); }
+   inline f32x4 v_zero() { return vdupq_n_f32(0.0f); }
+   inline float v_extract0(f32x4 v) { return vgetq_lane_f32(v, 0); }
+
+   //------------------------------------------------------
+   // Mask helpers
+   //------------------------------------------------------
+   inline f32x4 v_mask_xyz()
+   {
+      // equivalent to [1,1,1,0]
+      float32x4_t mask = {1.0f, 1.0f, 1.0f, 0.0f};
+      return mask;
+   }
+
+   inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
+   {
+      float32x4_t mask = {0.0f, 0.0f, 0.0f, 1.0f};
+      return vbslq_f32(vreinterpretq_u32_f32(mask), original, newv);
+   }
+
+   //------------------------------------------------------
+   // Float3 helpers
+   //------------------------------------------------------
+   inline f32x4 v_load3_vec(const float* p) // w = 0
+   {
+      float tmp[4] = { p[0], p[1], p[2], 0.0f };
+      return vld1q_f32(tmp);
+   }
+
+   inline f32x4 v_load3_pos(const float* p) // w = 1
+   {
+      float tmp[4] = { p[0], p[1], p[2], 1.0f };
+      return vld1q_f32(tmp);
+   }
+
+   inline void v_store3(float* dst, f32x4 v)
+   {
+      float tmp[4];
+      vst1q_f32(tmp, v);
+      dst[0] = tmp[0];
+      dst[1] = tmp[1];
+      dst[2] = tmp[2];
+   }
+
+   //------------------------------------------------------
+   // Simple Arithmetic
+   //------------------------------------------------------
+   inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); }
+   inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return vdivq_f32(a, b); } // only NEON64
+   inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); }
+   inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); }
+
+   //------------------------------------------------------
+   // Fast recip
+   //------------------------------------------------------
+   inline f32x4 v_rcp_nr(f32x4 b)
+   {
+      f32x4 r = vrecpeq_f32(b);
+      r = vmulq_f32(r, vrecpsq_f32(b, r)); // Newton-Raphson
+      r = vmulq_f32(r, vrecpsq_f32(b, r));
+      return r;
+   }
+
+   inline f32x4 v_div(f32x4 a, f32x4 b)
+   {
+      return vmulq_f32(a, v_rcp_nr(b));
+   }
+
+   inline f32x4 v_rsqrt_nr(f32x4 x)
+   {
+      f32x4 r = vrsqrteq_f32(x);
+      r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r,r), x)); // refine
+      r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r,r), x));
+      return r;
+   }
+
+   //------------------------------------------------------
+   // Vector intrinsic functions
+   //------------------------------------------------------
+   inline f32x4 v_dot4(f32x4 a, f32x4 b)
+   {
+      f32x4 mul = vmulq_f32(a, b);
+      float32x2_t sum2 = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul));
+      float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1);
+      return vdupq_n_f32(sum);
+   }
+
+   inline f32x4 v_dot3(f32x4 a, f32x4 b)
+   {
+      float32x4_t mask = {1.0f, 1.0f, 1.0f, 0.0f};
+      f32x4 mul = vmulq_f32(a, b);
+      mul = vmulq_f32(mul, mask);
+      float32x2_t sum2 = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul));
+      float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1);
+      return vdupq_n_f32(sum);
+   }
+
+   inline f32x4 v_cross(f32x4 a, f32x4 b)
+   {
+      float32x4_t a_yzx = { vgetq_lane_f32(a,1), vgetq_lane_f32(a,2), vgetq_lane_f32(a,0), 0 };
+      float32x4_t b_yzx = { vgetq_lane_f32(b,1), vgetq_lane_f32(b,2), vgetq_lane_f32(b,0), 0 };
+      float32x4_t c = vsubq_f32(vmulq_f32(a, b_yzx), vmulq_f32(a_yzx, b));
+      return (float32x4_t){ vgetq_lane_f32(c,2), vgetq_lane_f32(c,0), vgetq_lane_f32(c,1), 0 };
+   }
+
+   inline f32x4 v_normalize3(f32x4 v)
+   {
+      f32x4 inv = v_rsqrt_nr(v_dot3(v,v));
+      return vmulq_f32(v, inv);
+   }
+
+   inline f32x4 v_hadd4(f32x4 a)
+   {
+      float32x2_t sum2 = vpadd_f32(vget_low_f32(a), vget_high_f32(a));
+      float sum = vget_lane_f32(sum2,0) + vget_lane_f32(sum2,1);
+      return vdupq_n_f32(sum);
+   }
+}
diff --git a/Engine/source/math/isa/sse2/float3.cpp b/Engine/source/math/isa/sse2/float3.cpp
new file mode 100644
index 000000000..bc822e8fc
--- /dev/null
+++ b/Engine/source/math/isa/sse2/float3.cpp
@@ -0,0 +1,26 @@
+#include "sse2_intrinsics.h"
+#include "float3_dispatch.h"
+#include <emmintrin.h> // SSE2 intrinsics
+
+#include "float3_impl.inl"
+
+namespace math_backend::float3::dispatch
+{
+   // Install SSE2 backend
+   void install_sse2()
+   {
+      gFloat3.add = float3_add_impl;
+      gFloat3.sub = float3_sub_impl;
+      gFloat3.mul = float3_mul_impl;
+      gFloat3.mul_scalar = float3_mul_scalar_impl;
+      gFloat3.div = float3_div_impl;
+      gFloat3.div_scalar = float3_div_scalar_impl;
+      gFloat3.dot = float3_dot_impl;
+      gFloat3.length = float3_length_impl;
+      gFloat3.lengthSquared = float3_length_squared_impl;
+      gFloat3.normalize = float3_normalize_impl;
+      gFloat3.normalize_mag = float3_normalize_mag_impl;
+      gFloat3.lerp = float3_lerp_impl;
+      gFloat3.cross = float3_cross_impl;
+   }
+}
diff --git a/Engine/source/math/isa/sse2/float4.cpp b/Engine/source/math/isa/sse2/float4.cpp
index 00850560a..aa986474b 100644
--- a/Engine/source/math/isa/sse2/float4.cpp
+++ b/Engine/source/math/isa/sse2/float4.cpp
@@ -1,42 +1,8 @@
+#include "sse2_intrinsics.h"
 #include "float4_dispatch.h"
-#include <emmintrin.h> // SSE2 intrinsics
-namespace
-{
-   typedef __m128 f32x4;
 
-   // Load 4 floats from memory into a SIMD register
-   inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }
+#include "float4_impl.inl"
 
-   // Store 4 floats from SIMD register back to memory
-   inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }
-
-   // Broadcast a single float across all 4 lanes
-   inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
-
-   // Element-wise multiply
-   inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
-
-   // Element-wise divide
-   inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
-
-   // Element-wise add
-   inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
-
-   // Element-wise subtract
-   inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
-
-   // Horizontal sum of all 4 elements (for dot product, length, etc.)
-   inline float v_hadd4(f32x4 a)
-   {
-      __m128 shuf = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));   // swap pairs
-      __m128 sums = _mm_add_ps(a, shuf);                             // sums: [a0+a1 a1+a0 a2+a3 a3+a2]
-      shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2));    // move high pair to low
-      sums = _mm_add_ps(sums, shuf);                                 // total sum in lower float
-      return _mm_cvtss_f32(sums);
-   }
-}
-
-#include "../../impl/float4_impl.inl"
 
 namespace math_backend::float4::dispatch
 {
@@ -53,6 +19,8 @@ namespace math_backend::float4::dispatch
       gFloat4.length       = float4_length_impl;
       gFloat4.lengthSquared = float4_length_squared_impl;
       gFloat4.normalize    = float4_normalize_impl;
+      gFloat4.normalize_mag = float4_normalize_mag_impl;
       gFloat4.lerp         = float4_lerp_impl;
+      gFloat4.cross        = float4_cross_impl;
    }
 }
diff --git a/Engine/source/math/isa/sse2/sse2_intrinsics.h b/Engine/source/math/isa/sse2/sse2_intrinsics.h
new file mode 100644
index 000000000..85b09ebb0
--- /dev/null
+++ b/Engine/source/math/isa/sse2/sse2_intrinsics.h
@@ -0,0 +1,156 @@
+#pragma once
+#include <emmintrin.h>  // SSE2
+#include <xmmintrin.h>  // SSE
+
+namespace
+{
+   typedef __m128 f32x4;
+
+   //------------------------------------------------------
+   // Load / Store
+   //------------------------------------------------------
+
+   // Load 4 floats from memory into a SIMD register
+   inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }
+
+   inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }
+
+   inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
+
+   inline f32x4 v_zero() { return _mm_setzero_ps(); }
+
+   inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }
+
+   //------------------------------------------------------
+   // Float3 helpers (safe loading into 4 lanes)
+   //------------------------------------------------------
+
+   inline f32x4 v_load3_vec(const float* p) // w = 0
+   {
+      return _mm_set_ps(0.0f, p[2], p[1], p[0]);
+   }
+
+   inline f32x4 v_load3_pos(const float* p) // w = 1
+   {
+      return _mm_set_ps(1.0f, p[2], p[1], p[0]);
+   }
+
+   inline void v_store3(float* dst, f32x4 v)
+   {
+      alignas(16) float tmp[4];   // temp storage
+      _mm_store_ps(tmp, v);        // store all 4 lanes
+      dst[0] = tmp[0];
+      dst[1] = tmp[1];
+      dst[2] = tmp[2];
+   }
+
+   //------------------------------------------------------
+   // Mask helpers
+   //------------------------------------------------------
+
+   inline f32x4 v_mask_xyz() { return _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)); }
+
+   inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
+   {
+      f32x4 mask = _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0));
+      return _mm_or_ps(_mm_and_ps(mask, original), _mm_andnot_ps(mask, newv));
+   }
+
+
+   //------------------------------------------------------
+   // Simple Arithmatic
+   //------------------------------------------------------
+
+   // Element-wise multiply
+   inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
+
+   // Element-wise divide
+   inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
+
+   // Element-wise add
+   inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
+
+   // Element-wise subtract
+   inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
+
+   //------------------------------------------------------
+   // Fast recip
+   //------------------------------------------------------
+
+   // Fast recip 1/b
+   inline f32x4 v_rcp_nr(f32x4 b)
+   {
+      f32x4 r = _mm_rcp_ps(b);
+      f32x4 two = _mm_set1_ps(2.0f);
+      return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r)));
+   }
+
+   // Divide fast ( b = recip eg 1/b)
+   inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); }
+
+   inline f32x4 v_rsqrt_nr(f32x4 x)
+   {
+      f32x4 r = _mm_rsqrt_ps(x);
+
+      f32x4 half = _mm_set1_ps(0.5f);
+      f32x4 three = _mm_set1_ps(3.0f);
+
+      r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r)));
+
+      return _mm_mul_ps(r, half);
+   }
+
+   //------------------------------------------------------
+   // Vector intrinsic functions
+   //------------------------------------------------------
+
+   // full dot4
+   inline f32x4 v_dot4(f32x4 a, f32x4 b)
+   {
+      f32x4 prod = _mm_mul_ps(a, b);           // multiply element-wise
+      f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1));
+      prod = _mm_add_ps(prod, shuf);
+      shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2));
+      prod = _mm_add_ps(prod, shuf);
+      return prod;                             // f32x4, all lanes = dot(a,b)
+   }
+
+   // dot3 (ignores w)
+   inline f32x4 v_dot3(f32x4 a, f32x4 b)
+   {
+      f32x4 prod = _mm_mul_ps(a, b);
+      prod = _mm_and_ps(prod, v_mask_xyz());    // zero w
+      f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1));
+      prod = _mm_add_ps(prod, shuf);
+      shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2));
+      prod = _mm_add_ps(prod, shuf);
+      return prod;                             // f32x4, all lanes = dot(a,b)
+   }
+
+   // cross product xyz only.
+   inline f32x4 v_cross(f32x4 a, f32x4 b)
+   {
+      f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
+      f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
+
+      f32x4 c = _mm_sub_ps( _mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b));
+
+      return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
+   }
+
+   inline f32x4 v_normalize3(f32x4 v)
+   {
+      f32x4 inv = v_rsqrt_nr(v_dot3(v, v));
+      return _mm_mul_ps(v, inv);
+   }
+
+   // adds all 4 lanes together.
+   inline f32x4 v_hadd4(f32x4 a)
+   {
+      // sum all 4 lanes in SSE2
+      __m128 shuf = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); // swap pairs
+      __m128 sums = _mm_add_ps(a, shuf);
+      shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2));
+      return _mm_add_ps(sums, shuf);
+   }
+}
diff --git a/Engine/source/math/isa/sse41/float3.cpp b/Engine/source/math/isa/sse41/float3.cpp
new file mode 100644
index 000000000..adf1a3a3d
--- /dev/null
+++ b/Engine/source/math/isa/sse41/float3.cpp
@@ -0,0 +1,26 @@
+#include "sse41_intrinsics.h"
+#include "float3_dispatch.h"
+#include <smmintrin.h> // SSE41 intrinsics
+
+#include "float3_impl.inl"
+
+namespace math_backend::float3::dispatch
+{
+   // Install SSE41 backend
+   void install_sse41()
+   {
+      gFloat3.add = float3_add_impl;
+      gFloat3.sub = float3_sub_impl;
+      gFloat3.mul = float3_mul_impl;
+      gFloat3.mul_scalar = float3_mul_scalar_impl;
+      gFloat3.div = float3_div_impl;
+      gFloat3.div_scalar = float3_div_scalar_impl;
+      gFloat3.dot = float3_dot_impl;
+      gFloat3.length = float3_length_impl;
+      gFloat3.lengthSquared = float3_length_squared_impl;
+      gFloat3.normalize = float3_normalize_impl;
+      gFloat3.normalize_mag = float3_normalize_mag_impl;
+      gFloat3.lerp = float3_lerp_impl;
+      gFloat3.cross = float3_cross_impl;
+   }
+}
diff --git a/Engine/source/math/isa/sse41/float4.cpp b/Engine/source/math/isa/sse41/float4.cpp
index 80127acb9..e9ca8aae2 100644
--- a/Engine/source/math/isa/sse41/float4.cpp
+++ b/Engine/source/math/isa/sse41/float4.cpp
@@ -1,49 +1,5 @@
-
+#include "sse41_intrinsics.h"
 #include "float4_dispatch.h"
-#include <smmintrin.h> // SSE41 intrinsics
-
-namespace
-{
-   typedef __m128 f32x4;
-
-   // Load 4 floats from memory into a SIMD register
-   inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }
-
-   // Store 4 floats from SIMD register back to memory
-   inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }
-
-   // Broadcast a single float across all 4 lanes
-   inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
-
-   // Element-wise multiply
-   inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
-
-   // Element-wise divide
-   inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
-
-   // Element-wise add
-   inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
-
-   // Element-wise subtract
-   inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
-
-   // Horizontal sum of all 4 elements (for dot product, length, etc.)
-   inline float v_hadd4(f32x4 a)
-   {
-      __m128 t1 = _mm_hadd_ps(a, a);  // sums pairs: [a0+a1, a2+a3, ...]
-      __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3
-      return _mm_cvtss_f32(t2);       // extract first element
-   }
-
-   // specialized dot product for SSE4.1
-   float float4_dot_sse41(const float* a, const float* b)
-   {
-      f32x4 va = _mm_loadu_ps(a);
-      f32x4 vb = _mm_loadu_ps(b);
-      __m128 dp = _mm_dp_ps(va, vb, 0xF1); // multiply all 4, sum all 4, lowest lane
-      return _mm_cvtss_f32(dp);
-   }
-}
 
 #include "float4_impl.inl"
 
@@ -58,10 +14,12 @@ namespace math_backend::float4::dispatch
       gFloat4.mul_scalar   = float4_mul_scalar_impl;
       gFloat4.div          = float4_div_impl;
       gFloat4.div_scalar   = float4_div_scalar_impl;
-      gFloat4.dot          = float4_dot_sse41;
+      gFloat4.dot          = float4_dot_impl;
       gFloat4.length       = float4_length_impl;
       gFloat4.lengthSquared = float4_length_squared_impl;
       gFloat4.normalize    = float4_normalize_impl;
+      gFloat4.normalize_mag = float4_normalize_mag_impl;
       gFloat4.lerp         = float4_lerp_impl;
+      gFloat4.cross        = float4_cross_impl;
    }
 }
diff --git a/Engine/source/math/isa/sse41/sse41_intrinsics.h b/Engine/source/math/isa/sse41/sse41_intrinsics.h
new file mode 100644
index 000000000..047cb44ee
--- /dev/null
+++ b/Engine/source/math/isa/sse41/sse41_intrinsics.h
@@ -0,0 +1,140 @@
+#pragma once
+#include <smmintrin.h>  // SSE4.1
+
+namespace
+{
+   typedef __m128 f32x4;
+
+   //------------------------------------------------------
+   // Load / Store
+   //------------------------------------------------------
+
+   // Load 4 floats from memory into a SIMD register
+   inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }
+
+   inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }
+
+   inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
+
+   inline f32x4 v_zero() { return _mm_setzero_ps(); }
+
+   inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }
+
+   //------------------------------------------------------
+   // Mask helpers
+   //------------------------------------------------------
+
+   inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); }
+
+   inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
+   {
+      return _mm_blend_ps(newv, original, 0b1000);
+   }
+
+   //------------------------------------------------------
+   // Float3 helpers (safe loading into 4 lanes)
+   //------------------------------------------------------
+
+   inline f32x4 v_load3_vec(const float* p) // w = 0
+   {
+      return _mm_set_ps(0.0f, p[2], p[1], p[0]);
+   }
+
+   inline f32x4 v_load3_pos(const float* p) // w = 1
+   {
+      return _mm_set_ps(1.0f, p[2], p[1], p[0]);
+   }
+
+   inline void v_store3(float* dst, f32x4 v)
+   {
+      alignas(16) float tmp[4];   // temp storage
+      _mm_store_ps(tmp, v);        // store all 4 lanes
+      dst[0] = tmp[0];
+      dst[1] = tmp[1];
+      dst[2] = tmp[2];
+   }
+
+   //------------------------------------------------------
+   // Simple Arithmatic
+   //------------------------------------------------------
+
+   // Element-wise multiply
+   inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
+
+   // Element-wise divide
+   inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
+
+   // Element-wise add
+   inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
+
+   // Element-wise subtract
+   inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
+
+   //------------------------------------------------------
+   // Fast recip
+   //------------------------------------------------------
+
+   // Fast recip 1/b
+   inline f32x4 v_rcp_nr(f32x4 b)
+   {
+      f32x4 r = _mm_rcp_ps(b);
+      f32x4 two = _mm_set1_ps(2.0f);
+      return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r)));
+   }
+
+   // Divide fast ( b = recip eg 1/b)
+   inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); }
+
+   inline f32x4 v_rsqrt_nr(f32x4 x)
+   {
+      f32x4 r = _mm_rsqrt_ps(x);
+
+      f32x4 half = _mm_set1_ps(0.5f);
+      f32x4 three = _mm_set1_ps(3.0f);
+
+      r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r)));
+
+      return _mm_mul_ps(r, half);
+   }
+
+   //------------------------------------------------------
+   // Vector intrinsic functions
+   //------------------------------------------------------
+
+   // full dot4
+   inline f32x4 v_dot4(f32x4 a, f32x4 b)
+   {
+      return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1
+   }
+
+   // dot3 (ignores w)
+   inline f32x4 v_dot3(f32x4 a, f32x4 b)
+   {
+      return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1
+   }
+
+   // cross product xyz only.
+   inline f32x4 v_cross(f32x4 a, f32x4 b)
+   {
+      f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
+      f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
+
+      f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b));
+
+      return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
+   }
+
+   inline f32x4 v_normalize3(f32x4 v)
+   {
+      f32x4 inv = v_rsqrt_nr(v_dot3(v, v));
+      return _mm_mul_ps(v, inv);
+   }
+
+   // adds all 4 lanes together.
+   inline f32x4 v_hadd4(f32x4 a)
+   {
+      // sum all 4 lanes in SSE41
+      __m128 sum = _mm_hadd_ps(a, a);
+      return _mm_hadd_ps(sum, sum);
+   }
+}
diff --git a/Engine/source/math/mPoint3.h b/Engine/source/math/mPoint3.h
index c1bb72923..8e329f3a1 100644
--- a/Engine/source/math/mPoint3.h
+++ b/Engine/source/math/mPoint3.h
@@ -29,6 +29,11 @@
 #ifndef _MPOINT2_H_
 #include "math/mPoint2.h"
 #endif
+#ifndef _MATH_BACKEND_H_
+#include "math/public/math_backend.h"
+#endif
+
+#include <array>
 
 //------------------------------------------------------------------------------
 /// 3D integer point
@@ -97,6 +102,7 @@ public:
 class Point3D;
 
 //------------------------------------------------------------------------------
+using math_backend::float3::dispatch::gFloat3;
 class Point3F
 {
    //-------------------------------------- Public data
@@ -497,7 +503,8 @@ inline void Point3F::setMax(const Point3F& _test)
 inline void Point3F::interpolate(const Point3F& _from, const Point3F& _to, F32 _factor)
 {
    AssertFatal(_factor >= 0.0f && _factor <= 1.0f, "Out of bound interpolation factor");
-   m_point3F_interpolate( _from, _to, _factor, *this);
+
+   gFloat3.lerp(_from, _to, _factor, *this);
 }
 
 inline void Point3F::zero()
@@ -599,17 +606,17 @@ inline void Point3F::convolveInverse(const Point3F& c)
 
 inline F32 Point3F::lenSquared() const
 {
-   return (x * x) + (y * y) + (z * z);
+   return gFloat3.lengthSquared(*this);
 }
 
 inline F32 Point3F::len() const
 {
-   return mSqrt(x*x + y*y + z*z);
+   return gFloat3.length(*this);
 }
 
 inline void Point3F::normalize()
 {
-   m_point3F_normalize(*this);
+   gFloat3.normalize(*this);
 }
 
 inline F32 Point3F::magnitudeSafe() const
@@ -626,18 +633,13 @@ inline F32 Point3F::magnitudeSafe() const
 
 inline void Point3F::normalizeSafe()
 {
-   F32 vmag = magnitudeSafe();
-
-   if( vmag > POINT_EPSILON )
-   {
-      *this *= F32(1.0 / vmag);
-   }
+   gFloat3.normalize(*this);
 }
 
 
 inline void Point3F::normalize(F32 val)
 {
-   m_point3F_normalize_f(*this, val);
+   gFloat3.normalize_mag(*this, val);
 }
 
 inline bool Point3F::operator==(const Point3F& _test) const
@@ -652,52 +654,49 @@ inline bool Point3F::operator!=(const Point3F& _test) const
 
 inline Point3F Point3F::operator+(const Point3F& _add) const
 {
-   return Point3F(x + _add.x, y + _add.y,  z + _add.z);
+   Point3F temp;
+   gFloat3.add(*this, _add, temp);
+   return temp;
 }
 
 inline Point3F Point3F::operator-(const Point3F& _rSub) const
 {
-   return Point3F(x - _rSub.x, y - _rSub.y, z - _rSub.z);
+   Point3F temp;
+   gFloat3.sub(*this, _rSub, temp);
+   return temp;
 }
 
 inline Point3F& Point3F::operator+=(const Point3F& _add)
 {
-   x += _add.x;
-   y += _add.y;
-   z += _add.z;
-
+   gFloat3.add(*this, _add, *this);
    return *this;
 }
 
 inline Point3F& Point3F::operator-=(const Point3F& _rSub)
 {
-   x -= _rSub.x;
-   y -= _rSub.y;
-   z -= _rSub.z;
-
+   gFloat3.sub(*this, _rSub, *this);
    return *this;
 }
 
 inline Point3F Point3F::operator*(F32 _mul) const
 {
-   return Point3F(x * _mul, y * _mul, z * _mul);
+   Point3F temp;
+   gFloat3.mul_scalar(*this, _mul, temp);
+   return temp;
 }
 
 inline Point3F Point3F::operator/(F32 _div) const
 {
    AssertFatal(_div != 0.0f, "Error, div by zero attempted");
 
-   F32 inv = 1.0f / _div;
-
-   return Point3F(x * inv, y * inv, z * inv);
+   Point3F temp;
+   gFloat3.div_scalar(*this, _div, temp);
+   return temp;
 }
 
 inline Point3F& Point3F::operator*=(F32 _mul)
 {
-   x *= _mul;
-   y *= _mul;
-   z *= _mul;
-
+   gFloat3.mul_scalar(*this, _mul, *this);
    return *this;
 }
 
@@ -705,39 +704,35 @@ inline Point3F& Point3F::operator/=(F32 _div)
 {
    AssertFatal(_div != 0.0f, "Error, div by zero attempted");
 
-   F32 inv = 1.0f / _div;
-   x *= inv;
-   y *= inv;
-   z *= inv;
-
+   gFloat3.div_scalar(*this, _div, *this);
    return *this;
 }
 
 inline Point3F Point3F::operator*(const Point3F &_vec) const
 {
-   return Point3F(x * _vec.x, y * _vec.y, z * _vec.z);
+   Point3F temp;
+   gFloat3.mul(*this, _vec, temp);
+   return temp;
 }
 
 inline Point3F& Point3F::operator*=(const Point3F &_vec)
 {
-   x *= _vec.x;
-   y *= _vec.y;
-   z *= _vec.z;
+   gFloat3.mul(*this, _vec, *this);
    return *this;
 }
 
 inline Point3F Point3F::operator/(const Point3F &_vec) const
 {
    AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted");
-   return Point3F(x / _vec.x, y / _vec.y, z / _vec.z);
+   Point3F temp;
+   gFloat3.div(*this, _vec, temp);
+   return temp;
 }
 
 inline Point3F& Point3F::operator/=(const Point3F &_vec)
 {
    AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted");
-   x /= _vec.x;
-   y /= _vec.y;
-   z /= _vec.z;
+   gFloat3.div(*this, _vec, *this);
    return *this;
 }
 
@@ -989,7 +984,9 @@ inline Point3I operator*(S32 mul, const Point3I& multiplicand)
 
 inline Point3F operator*(F32 mul, const Point3F& multiplicand)
 {
-   return multiplicand * mul;
+   Point3F temp;
+   gFloat3.mul_scalar(multiplicand, mul, temp);
+   return temp;
 }
 
 inline Point3D operator*(F64 mul, const Point3D& multiplicand)
@@ -999,7 +996,7 @@ inline Point3D operator*(F64 mul, const Point3D& multiplicand)
 
 inline F32 mDot(const Point3F &p1, const Point3F &p2)
 {
-   return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z);
+   return gFloat3.dot(p1, p2);
 }
 
 inline F64 mDot(const Point3D &p1, const Point3D &p2)
@@ -1009,9 +1006,7 @@ inline F64 mDot(const Point3D &p1, const Point3D &p2)
 
 inline void mCross(const Point3F &a, const Point3F &b, Point3F *res)
 {
-   res->x = (a.y * b.z) - (a.z * b.y);
-   res->y = (a.z * b.x) - (a.x * b.z);
-   res->z = (a.x * b.y) - (a.y * b.x);
+   gFloat3.cross(a, b, *res);
 }
 
 inline void mCross(const Point3D &a, const Point3D &b, Point3D *res)
@@ -1024,7 +1019,7 @@ inline void mCross(const Point3D &a, const Point3D &b, Point3D *res)
 inline Point3F mCross(const Point3F &a, const Point3F &b)
 {
    Point3F r;
-   mCross( a, b, &r );
+   gFloat3.cross(a, b, r);
    return r;
 }
 
diff --git a/Engine/source/math/mPoint4.h b/Engine/source/math/mPoint4.h
index 8ae173009..0f715f983 100644
--- a/Engine/source/math/mPoint4.h
+++ b/Engine/source/math/mPoint4.h
@@ -26,10 +26,12 @@
 #ifndef _MMATHFN_H_
 #include "math/mMathFn.h"
 #endif
-
 #ifndef _MPOINT3_H_
 #include "math/mPoint3.h"
 #endif
+#ifndef _MATH_BACKEND_H_
+#include "math/public/math_backend.h"
+#endif
 
 
 //------------------------------------------------------------------------------
@@ -61,6 +63,8 @@ class Point4I
 /// Uses F32 internally.
 ///
 /// Useful for representing quaternions and other 4d beasties.
+using math_backend::float4::dispatch::gFloat4;
+
 class Point4F
 {
    //-------------------------------------- Public data
@@ -152,15 +156,12 @@ inline void Point4F::set(F32 _x, F32 _y, F32 _z, F32 _w)
 
 inline F32 Point4F::len() const
 {
-   return mSqrt(x*x + y*y + z*z + w*w);
+   return gFloat4.length(*this);
 }
 
 inline void Point4F::interpolate(const Point4F& _from, const Point4F& _to, F32 _factor)
 {
-   x = (_from.x * (1.0f - _factor)) + (_to.x * _factor);
-   y = (_from.y * (1.0f - _factor)) + (_to.y * _factor);
-   z = (_from.z * (1.0f - _factor)) + (_to.z * _factor);
-   w = (_from.w * (1.0f - _factor)) + (_to.w * _factor);
+   gFloat4.lerp(_from, _to, _factor, *this);
 }
 
 inline void Point4F::zero()
@@ -193,55 +194,55 @@ inline Point4F& Point4F::operator/=(F32 scalar)
    if (mIsZero(scalar))
       return *this;
 
-   F32 denom = 1 / scalar;
-
-   x *= denom;
-   y *= denom;
-   z *= denom;
-   w *= denom;
+   gFloat4.div_scalar(*this, scalar, *this);
 
    return *this;
 }
 
 inline Point4F Point4F::operator+(const Point4F& _add) const
 {
-   return Point4F( x + _add.x, y + _add.y, z + _add.z, w + _add.w );
+   Point4F res;
+   gFloat4.add(*this, _add, res);
+   return res;
 }
 
 inline Point4F& Point4F::operator+=(const Point4F& _add)
 {
-   x += _add.x;
-   y += _add.y;
-   z += _add.z;
-   w += _add.w;
-
+   gFloat4.add(*this, _add, *this);
    return *this;
 }
 
 inline Point4F Point4F::operator-(const Point4F& _rSub) const
 {
-   return Point4F( x - _rSub.x, y - _rSub.y, z - _rSub.z, w - _rSub.w );
+   Point4F res;
+   gFloat4.sub(*this, _rSub, res);
+   return res;
 }
 
 inline Point4F Point4F::operator*(const Point4F &_vec) const
 {
-   return Point4F(x * _vec.x, y * _vec.y, z * _vec.z, w * _vec.w);
+   Point4F res;
+   gFloat4.mul(*this, _vec, res);
+   return res;
 }
 
 inline Point4F Point4F::operator*(F32 _mul) const
 {
-   return Point4F(x * _mul, y * _mul, z * _mul, w * _mul);
+   Point4F res;
+   gFloat4.mul_scalar(*this, _mul, res);
+   return res;
 }
 
 inline Point4F Point4F::operator /(F32 t) const
 {
-   F32 f = 1.0f / t;
-   return Point4F( x * f, y * f, z * f, w * f );
+   Point4F res;
+   gFloat4.div_scalar(*this, t, res);
+   return res;
 }
 
 inline F32 mDot(const Point4F &p1, const Point4F &p2)
 {
-   return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z + p1.w*p2.w);
+   return  gFloat4.dot(p1, p2);
 }
 
 //------------------------------------------------------------------------------
diff --git a/Engine/source/math/public/float3_dispatch.h b/Engine/source/math/public/float3_dispatch.h
new file mode 100644
index 000000000..e4279cb84
--- /dev/null
+++ b/Engine/source/math/public/float3_dispatch.h
@@ -0,0 +1,39 @@
+#pragma once
+#ifndef _FLOAT3_DISPATCH_H_
+#define _FLOAT3_DISPATCH_H_
+
+
+#include <cstdint>
+
+namespace math_backend::float3::dispatch
+{
+   struct Float3Funcs
+   {
+      void (*add)(const float*, const float*, float*) = nullptr;
+      void (*sub)(const float*, const float*, float*) = nullptr;
+      void (*mul)(const float*, const float*, float*) = nullptr;
+      void (*mul_scalar)(const float*, float, float*) = nullptr;
+      void (*div)(const float*, const float*, float*) = nullptr;
+      void (*div_scalar)(const float*, float, float*) = nullptr;
+      float (*dot)(const float*, const float*) = nullptr;
+      float (*length)(const float*) = nullptr;
+      float (*lengthSquared)(const float*) = nullptr;
+      void (*normalize)(float*) = nullptr;
+      void (*normalize_mag)(float*, float) = nullptr;
+      void (*lerp)(const float*, const float*, float, float*) = nullptr;
+      void (*cross)(const float*, const float*, float*) = nullptr;
+   };
+
+   // Global dispatch table
+   extern Float3Funcs gFloat3;
+
+   // Backend installers (defined in ISA libraries)
+   void install_scalar();
+   void install_sse2();
+   void install_sse41();
+   void install_avx();
+   void install_avx2();
+   void install_neon();
+}
+
+#endif // !_FLOAT4_DISPATCH_H_
diff --git a/Engine/source/math/public/float4_dispatch.cpp b/Engine/source/math/public/float4_dispatch.cpp
deleted file mode 100644
index 810eb0e46..000000000
--- a/Engine/source/math/public/float4_dispatch.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "math/public/float4_dispatch.h"
-
-namespace math_backend::float4::dispatch
-{
-   // Single definition of the global dispatch table
-   Float4Funcs gFloat4{};
-}
diff --git a/Engine/source/math/public/float4_dispatch.h b/Engine/source/math/public/float4_dispatch.h
index 319b1893f..6f26114ce 100644
--- a/Engine/source/math/public/float4_dispatch.h
+++ b/Engine/source/math/public/float4_dispatch.h
@@ -19,7 +19,9 @@ namespace math_backend::float4::dispatch
       float (*length)(const float*) = nullptr;
       float (*lengthSquared)(const float*) = nullptr;
       void (*normalize)(float*) = nullptr;
+      void (*normalize_mag)(float*, float) = nullptr;
       void (*lerp)(const float*, const float*, float, float*) = nullptr;
+      void (*cross)(const float*, const float*, float*) = nullptr;
    };
 
    // Global dispatch table
@@ -32,9 +34,6 @@ namespace math_backend::float4::dispatch
    void install_avx();
    void install_avx2();
    void install_neon();
-
-   // Centralized installer (engine calls this once)
-   void install_preferred();
 }
 
 #endif // !_FLOAT4_DISPATCH_H_
diff --git a/Engine/source/math/public/mat44_dispatch.h b/Engine/source/math/public/mat44_dispatch.h
new file mode 100644
index 000000000..61b488861
--- /dev/null
+++ b/Engine/source/math/public/mat44_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+#ifndef _MAT44_DISPATCH_H_
+#define _MAT44_DISPATCH_H_
+
+
+namespace math_backend::mat44::dispatch
+{
+   struct Mat44Funcs
+   {
+      void (*transpose)(float*) = nullptr;
+      void (*scale)(float*, const float*) = nullptr;
+   };
+
+   // Global dispatch table
+   extern Mat44Funcs gMat44;
+
+   // Backend installers (defined in ISA libraries)
+   void install_scalar();
+   void install_sse2();
+   void install_sse41();
+   void install_avx();
+   void install_avx2();
+   void install_neon();
+}
+
+#endif // !_MAT44_DISPATCH_H_
diff --git a/Engine/source/math/public/math_backend.cpp b/Engine/source/math/public/math_backend.cpp
index 7998924ee..9b5e5daed 100644
--- a/Engine/source/math/public/math_backend.cpp
+++ b/Engine/source/math/public/math_backend.cpp
@@ -1,6 +1,24 @@
 #pragma once
 #include "math/public/math_backend.h"
 
+namespace math_backend::float4::dispatch
+{
+   // Single definition of the global dispatch table
+   Float4Funcs gFloat4{};
+}
+
+namespace math_backend::float3::dispatch
+{
+   // Single definition of the global dispatch table
+   Float3Funcs gFloat3{};
+}
+
+namespace math_backend::mat44::dispatch
+{
+   Mat44Funcs gMat44{};
+}
+
+
 math_backend::backend math_backend::choose_backend(U32 cpu_flags)
 {
 #if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86)
@@ -12,7 +30,7 @@ math_backend::backend math_backend::choose_backend(U32 cpu_flags)
 
 #elif defined(__aarch64__) || defined(__ARM_NEON)
 
-   if (cpu_flags & CPU_NEON) return backend::neon;
+   if (cpu_flags & CPU_PROP_NEON) return backend::neon;
 
 #endif
    return backend::scalar;
@@ -25,28 +43,36 @@ void math_backend::install_from_cpu_flags(uint32_t cpu_flags)
 
       switch (g_backend)
       {
+#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86)
       case backend::avx2:
          float4::dispatch::install_avx2();
+         float3::dispatch::install_avx2();
          break;
 
       case backend::avx:
-         //float4::dispatch::install_avx();
+         float4::dispatch::install_avx();
+         float3::dispatch::install_avx();
          break;
 
       case backend::sse41:
          float4::dispatch::install_sse41();
+         float3::dispatch::install_sse41();
          break;
 
       case backend::sse2:
          float4::dispatch::install_sse2();
+         float3::dispatch::install_sse2();
          break;
-
+#elif defined(__aarch64__) || defined(__ARM_NEON)
       case backend::neon:
          float4::dispatch::install_neon();
+         float3::dispatch::install_neon();
          break;
-
+#endif
       default:
          float4::dispatch::install_scalar();
+         float3::dispatch::install_scalar();
+         mat44::dispatch::install_scalar();
          break;
       }
    }
diff --git a/Engine/source/math/public/math_backend.h b/Engine/source/math/public/math_backend.h
index 40476e7f0..0a3127f81 100644
--- a/Engine/source/math/public/math_backend.h
+++ b/Engine/source/math/public/math_backend.h
@@ -1,4 +1,7 @@
 #pragma once
+#ifndef _MATH_BACKEND_H_
+#define _MATH_BACKEND_H_
+
 #ifndef _MCONSTANTS_H_
 #include "math/mConstants.h"
 #endif
@@ -8,6 +11,12 @@
 #ifndef _FLOAT4_DISPATCH_H_
 #include "math/public/float4_dispatch.h"
 #endif
+#ifndef _FLOAT3_DISPATCH_H_
+#include "math/public/float3_dispatch.h"
+#endif
+#ifndef _MAT44_DISPATCH_H_
+#include "math/public/mat44_dispatch.h"
+#endif
 
 namespace math_backend
 {
@@ -25,3 +34,5 @@ namespace math_backend
    backend choose_backend(U32 cpu_flags);
    void install_from_cpu_flags(uint32_t cpu_flags);
 }
+
+#endif // !_MATH_BACKEND_H_
diff --git a/Engine/source/platform/platform.h b/Engine/source/platform/platform.h
index 1326c4692..0d377f23f 100644
--- a/Engine/source/platform/platform.h
+++ b/Engine/source/platform/platform.h
@@ -76,7 +76,7 @@ enum ProcessorProperties
    CPU_PROP_SSE4_2    = (1<<8),  ///< Supports SSE4_2 instruction set extension.
    CPU_PROP_AVX       = (1<<9), ///< Supports AVX256 instruction set extension.
    CPU_PROP_AVX2      = (1<<10), ///< Supports AVX256 instruction set extension.
-   CPU_PROP_AVX512    = (1<<11), ///< Supports AVX256 instruction set extension.
+   CPU_PROP_AVX512    = (1<<11), ///< Supports AVX512 instruction set extension.
    CPU_PROP_MP        = (1<<12), ///< This is a multi-processor system.
    CPU_PROP_LE        = (1<<13), ///< This processor is LITTLE ENDIAN.
    CPU_PROP_64bit     = (1<<14), ///< This processor is 64-bit capable
diff --git a/Engine/source/platformMac/macMath.mm b/Engine/source/platformMac/macMath.mm
index 4feefb277..c275bdd17 100644
--- a/Engine/source/platformMac/macMath.mm
+++ b/Engine/source/platformMac/macMath.mm
@@ -25,6 +25,7 @@
 #import "math/mMath.h"
 #import "core/strings/stringFunctions.h"
 #include "console/engineAPI.h"
+#include "math/public/math_backend.h"
 
 extern void mInstallLibrary_C();
 
@@ -107,7 +108,9 @@ void Math::init(U32 properties)
 
    Con::printf("Math Init:");
    Con::printf("   Installing Standard C extensions");
-   mInstallLibrary_C();   
+   mInstallLibrary_C();
+   
+   math_backend::install_from_cpu_flags(properties);
 
    #ifdef TORQUE_CPU_X86
    if( properties & CPU_PROP_SSE )
diff --git a/Engine/source/platformPOSIX/POSIXMath.cpp b/Engine/source/platformPOSIX/POSIXMath.cpp
index 8f21329a3..751df294e 100644
--- a/Engine/source/platformPOSIX/POSIXMath.cpp
+++ b/Engine/source/platformPOSIX/POSIXMath.cpp
@@ -27,6 +27,7 @@
 #include "math/mMath.h"
 #include "core/strings/stringFunctions.h"
 #include "console/engineAPI.h"
+#include "math/public/math_backend.h"
 
 extern void mInstallLibrary_C();
 extern void mInstallLibrary_ASM();
@@ -90,6 +91,8 @@ void Math::init(U32 properties)
    Con::printf("   Installing Standard C extensions");
    mInstallLibrary_C();
 
+   math_backend::install_from_cpu_flags(properties);
+
 #if defined(TORQUE_CPU_X32) || defined(TORQUE_CPU_X64)
    Con::printf("   Installing Assembly extensions");
    mInstallLibrary_ASM();
diff --git a/Engine/source/platformWin32/winMath.cpp b/Engine/source/platformWin32/winMath.cpp
index 44b215301..36a9d8c58 100644
--- a/Engine/source/platformWin32/winMath.cpp
+++ b/Engine/source/platformWin32/winMath.cpp
@@ -25,7 +25,7 @@
 #include "console/engineAPI.h"
 
 #include "math/mMath.h"
-
+#include "math/public/math_backend.h"
 
 extern void mInstallLibrary_C();
 extern void mInstallLibrary_ASM();
@@ -98,6 +98,8 @@ void Math::init(U32 properties)
    Con::printf("   Installing Standard C extensions");
    mInstallLibrary_C();
 
+   math_backend::install_from_cpu_flags(properties);
+
    Con::printf("   Installing Assembly extensions");
    mInstallLibrary_ASM();
 
diff --git a/Engine/source/util/fpsTracker.cpp b/Engine/source/util/fpsTracker.cpp
index 021d4a4f5..830000bc8 100644
--- a/Engine/source/util/fpsTracker.cpp
+++ b/Engine/source/util/fpsTracker.cpp
@@ -36,6 +36,8 @@ void FPSTracker::reset()
 {
    fpsNext         = (F32)Platform::getRealMilliseconds()/1000.0f + mUpdateInterval;
 
+   fpsAccumTime      = 0.0f;
+   fpsAccumVirtualTime = 0.0f;
    fpsRealLast       = 0.0f;
    fpsReal           = 0.0f;
    fpsRealMin        = 0.000001f; // Avoid division by zero.
@@ -51,42 +53,60 @@ void FPSTracker::update()
    F32 realSeconds    = (F32)Platform::getRealMilliseconds()/1000.0f;
    F32 virtualSeconds = (F32)Platform::getVirtualMilliseconds()/1000.0f;
 
-   fpsFrames++;
-   if (fpsFrames > 1)
+   if (fpsRealLast == 0.0f)
    {
-      fpsReal    = fpsReal*(1.0-alpha) + (realSeconds-fpsRealLast)*alpha;
-      fpsVirtual = fpsVirtual*(1.0-alpha) + (virtualSeconds-fpsVirtualLast)*alpha;
-
-      if( fpsFrames > 10 ) // Wait a few frames before updating these.
-      {
-         // Update min/max.  This is a bit counter-intuitive, as the comparisons are
-         // inversed because these are all one-over-x values.
-
-         if( fpsReal > fpsRealMin )
-            fpsRealMin = fpsReal;
-         if( fpsReal < fpsRealMax )
-            fpsRealMax = fpsReal;
-      }
+      fpsRealLast = realSeconds;
+      fpsVirtualLast = virtualSeconds;
+      return;
    }
 
-   fpsRealLast    = realSeconds;
+   F32 frameTimeReal = realSeconds - fpsRealLast;
+   F32 frameTimeVirtual = virtualSeconds - fpsVirtualLast;
+
+   fpsRealLast = realSeconds;
    fpsVirtualLast = virtualSeconds;
 
-   // update variables every few frames
-   F32 update = fpsRealLast - fpsNext;
-   if (update > 0.5f)
-   {
-      F32 delta = realSeconds - fpsNext;
-      Con::setVariable( "fps::frameDelta",avar("%g", delta));
-      Con::setVariable( "fps::real",      avar( "%4.1f", 1.0f / fpsReal ) );
-      Con::setVariable( "fps::realMin",   avar( "%4.1f", 1.0f / fpsRealMin ) );
-      Con::setVariable( "fps::realMax",   avar( "%4.1f", 1.0f / fpsRealMax ) );
-      Con::setVariable( "fps::virtual",   avar( "%4.1f", 1.0f / fpsVirtual ) );
+   // Accumulate for windowed FPS calculation
+   fpsFrames++;
+   fpsAccumTime += frameTimeReal;
+   fpsAccumVirtualTime += frameTimeVirtual;
 
-      if (update > mUpdateInterval)
-         fpsNext  = fpsRealLast + mUpdateInterval;
-      else
-         fpsNext += mUpdateInterval;
+   // Only update console values at interval
+   if (realSeconds >= fpsNext)
+   {
+      fpsReal = 0.0f;
+      fpsVirtual = 0.0f;
+
+      if (fpsAccumTime > 0.0f)
+         fpsReal = fpsFrames / fpsAccumTime;
+
+      if (fpsAccumVirtualTime > 0.0f)
+         fpsVirtual = fpsFrames / fpsAccumVirtualTime;
+
+      // Update min/max correctly (these are FPS now)
+      if (fpsReal > 0.0f)
+      {
+         if (fpsReal < fpsRealMin)
+            fpsRealMin = fpsReal;
+
+         if (fpsReal > fpsRealMax)
+            fpsRealMax = fpsReal;
+      }
+
+      // frameDelta = actual accumulated real time over window
+      Con::setVariable("fps::frameTimeMs",   avar("%4.3f", (fpsAccumTime / fpsFrames) * 1000.0f));
+      Con::setVariable("fps::frameDelta",    avar("%g", fpsAccumTime));
+      Con::setVariable("fps::real",          avar("%4.1f", fpsReal));
+      Con::setVariable("fps::realMin",       avar("%4.1f", fpsRealMin));
+      Con::setVariable("fps::realMax",       avar("%4.1f", fpsRealMax));
+      Con::setVariable("fps::virtual",       avar("%4.1f", fpsVirtual));
+
+      // Reset window
+      fpsFrames = 0;
+      fpsAccumTime = 0.0f;
+      fpsAccumVirtualTime = 0.0f;
+
+      fpsNext = realSeconds + mUpdateInterval;
    }
 }
 
diff --git a/Engine/source/util/fpsTracker.h b/Engine/source/util/fpsTracker.h
index e5973ae68..a29fbbecf 100644
--- a/Engine/source/util/fpsTracker.h
+++ b/Engine/source/util/fpsTracker.h
@@ -27,6 +27,8 @@
 
 struct FPSTracker
 {
+   F32 fpsAccumTime;
+   F32 fpsAccumVirtualTime;
    F32 fpsRealLast;
    F32 fpsReal;
    F32 fpsRealMin;
@@ -48,4 +50,4 @@ struct FPSTracker
 
 extern FPSTracker gFPS;
 
-#endif
\ No newline at end of file
+#endif
diff --git a/Tools/CMake/torque_macros.cmake b/Tools/CMake/torque_macros.cmake
index 5a7928b38..f799e8c39 100644
--- a/Tools/CMake/torque_macros.cmake
+++ b/Tools/CMake/torque_macros.cmake
@@ -136,7 +136,7 @@ macro(addFramework framework)
 endmacro()
 
 function(add_math_backend name compile_defs)
-    file(GLOB_RECURSE SRC CONFIGURE_DEPENDS "math/isa/${name}/*.cpp")
+    file(GLOB_RECURSE SRC CONFIGURE_DEPENDS "math/isa/${name}/*.cpp" "math/isa/${name}/*.h")
 
     if(NOT SRC)
         return()
@@ -144,6 +144,7 @@ function(add_math_backend name compile_defs)
 
     add_library(math_${name} OBJECT ${SRC})
 
+    message(STATUS "adding math library for isa ${name}")
     target_include_directories(math_${name} PUBLIC
         "math/public"
         "math/impl"