From 67f12311d4b9218e3bb366a6df2eeb1846f4f402 Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Thu, 26 Feb 2026 16:45:13 +0000 Subject: [PATCH] ISA backends float3 and float4 - cleanup history squash working for both neon32 and neon64 Update math_backend.cpp further sse simd additions avx2 float3 added added normalize_magnitude added divide fast to float3 may copy to float4 move static spheremesh to drawSphere (initialize on first use) so platform has a chance to load the math backend all float3 and float4 functions and isas completed all options of float3 and float4 functions in isas and math_c neon still to be done but that will be on mac. Update math_backend.cpp mac isa neon update added float3 restructured the classes to look more like the final version of the x86 classes linux required changes Update build-macos-clang.yml Update build-macos-clang.yml Revert "Update build-macos-clang.yml" This reverts commit 29dfc567f40f20d2400a9967a35bbdb823182e2d. Revert "Update build-macos-clang.yml" This reverts commit 2abad2b4ca4de717c5f4278708f289dd1bb22561. Update CMakeLists.txt fix macs stupid build remove god awful rolling average from frame time tracker.... use intrinsic headers instead each isa implementation now uses a header for that isa's intrinsic functions these are then used in the impl files. This will make it easier for matrix functions when those are implemented. 
fixed comment saying 256 when it should be 512 for avx512 consolidated initializers for function tables Update neon_intrinsics.h fixes for some neon intrinsics no idea if this is the best way to do these but they work at least v_cross is especially messy at the moment we basically just do it as a c math function need to look into getting this done correctly --- Engine/source/CMakeLists.txt | 7 +- Engine/source/gfx/gfxDrawUtil.cpp | 8 +- Engine/source/math/impl/float3_impl.inl | 123 +++++++++++ Engine/source/math/impl/float4_c.cpp | 60 ----- Engine/source/math/impl/float4_impl.inl | 33 ++- Engine/source/math/impl/math_c.cpp | 208 ++++++++++++++++++ Engine/source/math/isa/avx/avx_intrinsics.h | 140 ++++++++++++ Engine/source/math/isa/avx/float3.cpp | 26 +++ Engine/source/math/isa/avx/float4.cpp | 68 ++---- Engine/source/math/isa/avx2/avx2_intrinsics.h | 140 ++++++++++++ Engine/source/math/isa/avx2/float3.cpp | 26 +++ Engine/source/math/isa/avx2/float4.cpp | 50 +---- Engine/source/math/isa/neon/float3.cpp | 25 +++ Engine/source/math/isa/neon/float4.cpp | 55 ++--- Engine/source/math/isa/neon/neon_intrinsics.h | 130 +++++++++++ Engine/source/math/isa/sse2/float3.cpp | 26 +++ Engine/source/math/isa/sse2/float4.cpp | 40 +--- Engine/source/math/isa/sse2/sse2_intrinsics.h | 156 +++++++++++++ Engine/source/math/isa/sse41/float3.cpp | 26 +++ Engine/source/math/isa/sse41/float4.cpp | 50 +---- .../source/math/isa/sse41/sse41_intrinsics.h | 140 ++++++++++++ Engine/source/math/mPoint3.h | 91 ++++---- Engine/source/math/mPoint4.h | 49 +++-- Engine/source/math/public/float3_dispatch.h | 39 ++++ Engine/source/math/public/float4_dispatch.cpp | 7 - Engine/source/math/public/float4_dispatch.h | 5 +- Engine/source/math/public/mat44_dispatch.h | 26 +++ Engine/source/math/public/math_backend.cpp | 34 ++- Engine/source/math/public/math_backend.h | 11 + Engine/source/platform/platform.h | 2 +- Engine/source/platformMac/macMath.mm | 5 +- Engine/source/platformPOSIX/POSIXMath.cpp | 3 + 
Engine/source/platformWin32/winMath.cpp | 4 +- Engine/source/util/fpsTracker.cpp | 80 ++++--- Engine/source/util/fpsTracker.h | 4 +- Tools/CMake/torque_macros.cmake | 3 +- 36 files changed, 1481 insertions(+), 419 deletions(-) create mode 100644 Engine/source/math/impl/float3_impl.inl delete mode 100644 Engine/source/math/impl/float4_c.cpp create mode 100644 Engine/source/math/impl/math_c.cpp create mode 100644 Engine/source/math/isa/avx/avx_intrinsics.h create mode 100644 Engine/source/math/isa/avx/float3.cpp create mode 100644 Engine/source/math/isa/avx2/avx2_intrinsics.h create mode 100644 Engine/source/math/isa/avx2/float3.cpp create mode 100644 Engine/source/math/isa/neon/float3.cpp create mode 100644 Engine/source/math/isa/neon/neon_intrinsics.h create mode 100644 Engine/source/math/isa/sse2/float3.cpp create mode 100644 Engine/source/math/isa/sse2/sse2_intrinsics.h create mode 100644 Engine/source/math/isa/sse41/float3.cpp create mode 100644 Engine/source/math/isa/sse41/sse41_intrinsics.h create mode 100644 Engine/source/math/public/float3_dispatch.h delete mode 100644 Engine/source/math/public/float4_dispatch.cpp create mode 100644 Engine/source/math/public/mat44_dispatch.h diff --git a/Engine/source/CMakeLists.txt b/Engine/source/CMakeLists.txt index 746bae1e7..d5f7aeaad 100644 --- a/Engine/source/CMakeLists.txt +++ b/Engine/source/CMakeLists.txt @@ -503,12 +503,17 @@ set(IS_ARM FALSE) if(ARCH MATCHES "x86_64|amd64|i[3-6]86") set(IS_X86 TRUE) -elseif(ARCH MATCHES "arm64|aarch64") +endif() + +if(ARCH MATCHES "arm64|aarch64|arm") set(IS_ARM TRUE) endif() # always available add_math_backend(scalar MATH_SIMD_SCALAR) +message(STATUS "Processor: ${CMAKE_SYSTEM_PROCESSOR}") +message(STATUS "IS_X86=${IS_X86}") +message(STATUS "IS_ARM=${IS_ARM}") # x86 family if(IS_X86) diff --git a/Engine/source/gfx/gfxDrawUtil.cpp b/Engine/source/gfx/gfxDrawUtil.cpp index 16b886a55..3dce4b3af 100644 --- a/Engine/source/gfx/gfxDrawUtil.cpp +++ b/Engine/source/gfx/gfxDrawUtil.cpp 
@@ -853,7 +853,11 @@ void GFXDrawUtil::drawThickLine(F32 x1, F32 y1, F32 z1, F32 x2, F32 y2, F32 z2, // 3D World Draw Misc //----------------------------------------------------------------------------- -static SphereMesh gSphere; +SphereMesh& getSphere() +{ + static SphereMesh instance; + return instance; +} void GFXDrawUtil::drawSphere( const GFXStateBlockDesc &desc, F32 radius, const Point3F &pos, const ColorI &color, bool drawTop, bool drawBottom, const MatrixF *xfm ) { @@ -868,7 +872,7 @@ void GFXDrawUtil::drawSphere( const GFXStateBlockDesc &desc, F32 radius, const P GFX->pushWorldMatrix(); GFX->multWorld(mat); - const SphereMesh::TriangleMesh * sphereMesh = gSphere.getMesh(2); + const SphereMesh::TriangleMesh * sphereMesh = getSphere().getMesh(2); S32 numPoly = sphereMesh->numPoly; S32 totalPoly = 0; GFXVertexBufferHandle verts(mDevice, numPoly*3, GFXBufferTypeVolatile); diff --git a/Engine/source/math/impl/float3_impl.inl b/Engine/source/math/impl/float3_impl.inl new file mode 100644 index 000000000..87b325faf --- /dev/null +++ b/Engine/source/math/impl/float3_impl.inl @@ -0,0 +1,123 @@ +#pragma once +#include // for sqrtf, etc. 
+#include "../mConstants.h" + +// Safely loads a float3 -> simd 4 lane backend +namespace math_backend::float3 +{ + //---------------------------------------------------------- + // Add two float4 vectors: r = a + b + inline void float3_add_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vr = v_add(va, vb); + v_store3(r, vr); + } + + // Subtract: r = a - b + inline void float3_sub_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vr = v_sub(va, vb); + v_store3(r, vr); + } + + // Multiply element-wise: r = a * b + inline void float3_mul_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vr = v_mul(va, vb); + v_store3(r, vr); + } + + // Multiply by scalar: r = a * s + inline void float3_mul_scalar_impl(const float* a, float s, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vs = v_set1(s); + f32x4 vr = v_mul(va, vs); + v_store3(r, vr); + } + + // Divide element-wise: r = a / b + inline void float3_div_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vr = v_div(va, vb); + v_store3(r, vr); + } + + // Divide by scalar: r = a / s + inline void float3_div_scalar_impl(const float* a, float s, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vs = v_set1(s); + f32x4 vr = v_div(va, vs); + v_store3(r, vr); + } + + // Dot product: returns scalar + inline float float3_dot_impl(const float* a, const float* b) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vdot = v_dot3(va, vb); + return v_extract0(vdot); // first lane is the sum of 3 elements + } + + // Length squared + inline float float3_length_squared_impl(const float* a) + { + return float3_dot_impl(a, a); + } + + // Length + inline float float3_length_impl(const float* a) + { + return std::sqrt(float3_length_squared_impl(a)); + } + 
+ // Normalize in-place + inline void float3_normalize_impl(float* a) + { + f32x4 va = v_load3_vec(a); + f32x4 invLen = v_rsqrt_nr(v_dot3(va, va)); // fully abstracted + f32x4 vnorm = v_mul(va, invLen); + v_store3(a, vnorm); + } + + // Normalize with magnitude: a = normalize(a) * r + inline void float3_normalize_mag_impl(float* a, float r) + { + f32x4 va = v_load3_vec(a); + + // invLen = r / sqrt(dot(a,a)) = r * rsqrt(dot(a,a)) + f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot3(va, va))); + + f32x4 vnorm = v_mul(va, invLen); + v_store3(a, vnorm); + } + + // Linear interpolation: r = from + (to - from) * f + inline void float3_lerp_impl(const float* from, const float* to, float f, float* r) + { + f32x4 vfrom = v_load3_vec(from); + f32x4 vto = v_load3_vec(to); + f32x4 vf = v_set1(f); + f32x4 vr = v_add(vfrom, v_mul(vf, v_sub(vto, vfrom))); + v_store3(r, vr); + } + + inline void float3_cross_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vcross = v_cross(va, vb); + v_store3(r, vcross); + } + +} diff --git a/Engine/source/math/impl/float4_c.cpp b/Engine/source/math/impl/float4_c.cpp deleted file mode 100644 index b3d6559f9..000000000 --- a/Engine/source/math/impl/float4_c.cpp +++ /dev/null @@ -1,60 +0,0 @@ -#include "math/public/float4_dispatch.h" -#include "math/mConstants.h" -#include - -namespace math_backend::float4::dispatch -{ - void install_scalar() - { - gFloat4.add = [](const float* a, const float* b, float* r) { - for (int i = 0; i < 4; i++) r[i] = a[i] + b[i]; - }; - - gFloat4.sub = [](const float* a, const float* b, float* r) { - for (int i = 0; i < 4; i++) r[i] = a[i] - b[i]; - }; - - gFloat4.mul = [](const float* a, const float* b, float* r) { - for (int i = 0; i < 4; i++) r[i] = a[i] * b[i]; - }; - - gFloat4.mul_scalar = [](const float* a, float s, float* r) { - for (int i = 0; i < 4; i++) r[i] = a[i] * s; - }; - - gFloat4.div = [](const float* a, const float* b, float* r) { - for (int 
i = 0; i < 4; i++) r[i] = a[i] / b[i]; - }; - - gFloat4.div_scalar = [](const float* a, float s, float* r) { - for (int i = 0; i < 4; i++) r[i] = a[i] / s; - }; - - gFloat4.dot = [](const float* a, const float* b) { - float sum = 0.f; - for (int i = 0; i < 4; i++) sum += a[i] * b[i]; - return sum; - }; - - gFloat4.length = [](const float* a) { - float sum = 0.f; - for (int i = 0; i < 4; i++) sum += a[i] * a[i]; - return sqrtf(sum); - }; - - gFloat4.lengthSquared = [](const float* a) { - float sum = 0.f; - for (int i = 0; i < 4; i++) sum += a[i] * a[i]; - return (sum); - }; - - gFloat4.normalize = [](float* a) { - float len = gFloat4.length(a); - if (len > POINT_EPSILON) for (int i = 0; i < 4; i++) a[i] /= len; - }; - - gFloat4.lerp = [](const float* from, const float* to, float f, float* r) { - for (int i = 0; i < 4; i++) r[i] = from[i] + (to[i] - from[i]) * f; - }; - } -} diff --git a/Engine/source/math/impl/float4_impl.inl b/Engine/source/math/impl/float4_impl.inl index cb61ed4fc..9371ae7c3 100644 --- a/Engine/source/math/impl/float4_impl.inl +++ b/Engine/source/math/impl/float4_impl.inl @@ -65,8 +65,8 @@ namespace math_backend::float4 { f32x4 va = v_load(a); f32x4 vb = v_load(b); - f32x4 vmul = v_mul(va, vb); - return v_hadd4(vmul); + f32x4 vdot = v_dot4(va, vb); // calls ISA-specific implementation + return v_extract0(vdot); } // Length squared @@ -84,21 +84,22 @@ namespace math_backend::float4 // Normalize in-place inline void float4_normalize_impl(float* a) { - float len = float4_length_impl(a); - if (len > POINT_EPSILON) // safe threshold - { - float4_mul_scalar_impl(a, 1.0f / len, a); - } + f32x4 va = v_load(a); + f32x4 invLen = v_rsqrt_nr(v_dot4(va, va)); // fully abstracted + f32x4 vnorm = v_mul(va, invLen); + v_store(a, vnorm); } // Normalize with magnitude: r = normalize(a) * r inline void float4_normalize_mag_impl(float* a, float r) { - float len = float4_length_impl(a); - if (len > POINT_EPSILON) - { - float4_mul_scalar_impl(a, r / len, a); - } + 
f32x4 va = v_load(a); + + // invLen = r / sqrt(dot(a,a)) = r * rsqrt(dot(a,a)) + f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot4(va, va))); + + f32x4 vnorm = v_mul(va, invLen); + v_store(a, vnorm); } // Linear interpolation: r = from + (to - from) * f @@ -111,4 +112,12 @@ namespace math_backend::float4 v_store(r, vr); } + inline void float4_cross_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load(a); + f32x4 vb = v_load(b); + f32x4 vcross = v_cross(va, vb); + v_store(r, vcross); + } + } // namespace math_backend::float4 diff --git a/Engine/source/math/impl/math_c.cpp b/Engine/source/math/impl/math_c.cpp new file mode 100644 index 000000000..7d8a317ba --- /dev/null +++ b/Engine/source/math/impl/math_c.cpp @@ -0,0 +1,208 @@ +#include "math/public/float4_dispatch.h" +#include "math/public/float3_dispatch.h" +#include "math/public/mat44_dispatch.h" +#include "math/mConstants.h" +#include // for sqrtf, etc. + +namespace math_backend::float4::dispatch +{ + void install_scalar() + { + gFloat4.add = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] + b[i]; + }; + + gFloat4.sub = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] - b[i]; + }; + + gFloat4.mul = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] * b[i]; + }; + + gFloat4.mul_scalar = [](const float* a, float s, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] * s; + }; + + gFloat4.div = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] / b[i]; + }; + + gFloat4.div_scalar = [](const float* a, float s, float* r) { + float denom = 1.0f / s; + for (int i = 0; i < 4; i++) r[i] = a[i] * denom; + }; + + gFloat4.dot = [](const float* a, const float* b) { + float sum = 0.f; + for (int i = 0; i < 4; i++) sum += a[i] * b[i]; + return sum; + }; + + gFloat4.length = [](const float* a) { + float sum = 0.f; + for (int i = 0; i < 4; i++) sum += 
a[i] * a[i]; + return std::sqrt(sum); + }; + + gFloat4.lengthSquared = [](const float* a) { + float sum = 0.f; + for (int i = 0; i < 4; i++) sum += a[i] * a[i]; + return (sum); + }; + + gFloat4.normalize = [](float* a) { + float len = gFloat4.length(a); + if (len > POINT_EPSILON) + { + float denom = 1.0f / len; + for (int i = 0; i < 4; i++) + a[i] *= denom; + } + }; + + gFloat4.normalize_mag = [](float* a, float f) { + float len = gFloat4.length(a); + if (len > POINT_EPSILON) + { + float denom = f / len; + for (int i = 0; i < 4; i++) a[i] *= denom; + } + }; + + gFloat4.lerp = [](const float* from, const float* to, float f, float* r) { + for (int i = 0; i < 4; i++) r[i] = from[i] + (to[i] - from[i]) * f; + }; + + gFloat4.cross = [](const float* a, const float* b, float* r) { + const float ax = a[0]; + const float ay = a[1]; + const float az = a[2]; + + const float bx = b[0]; + const float by = b[1]; + const float bz = b[2]; + + r[0] = ay * bz - az * by; + r[1] = az * bx - ax * bz; + r[2] = ax * by - ay * bx; + }; + } +} + +namespace math_backend::float3::dispatch +{ + void install_scalar() + { + gFloat3.add = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 3; i++) r[i] = a[i] + b[i]; + }; + + gFloat3.sub = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 3; i++) r[i] = a[i] - b[i]; + }; + + gFloat3.mul = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 3; i++) r[i] = a[i] * b[i]; + }; + + gFloat3.mul_scalar = [](const float* a, float s, float* r) { + for (int i = 0; i < 3; i++) r[i] = a[i] * s; + }; + + gFloat3.div = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 3; i++) r[i] = a[i] / b[i]; + }; + + gFloat3.div_scalar = [](const float* a, float s, float* r) { + float denom = 1.0f / s; + for (int i = 0; i < 3; i++) r[i] = a[i] * denom; + }; + + gFloat3.dot = [](const float* a, const float* b) { + float sum = 0.f; + for (int i = 0; i < 3; i++) sum += a[i] * b[i]; + return sum; + }; + + 
gFloat3.length = [](const float* a) { + float sum = 0.f; + for (int i = 0; i < 3; i++) sum += a[i] * a[i]; + return std::sqrt(sum); + }; + + gFloat3.lengthSquared = [](const float* a) { + float sum = 0.f; + for (int i = 0; i < 3; i++) sum += a[i] * a[i]; + return (sum); + }; + + gFloat3.normalize = [](float* a) { + float len = gFloat3.length(a); + if (len > POINT_EPSILON) + { + float denom = 1.0f / len; + for (int i = 0; i < 3; i++) a[i] *= denom; + } + }; + + gFloat3.normalize_mag = [](float* a, float f) { + float len = gFloat3.length(a); + if (len > POINT_EPSILON) + { + float denom = f / len; + for (int i = 0; i < 3; i++) a[i] *= denom; + } + }; + + gFloat3.lerp = [](const float* from, const float* to, float f, float* r) { + for (int i = 0; i < 3; i++) r[i] = from[i] + (to[i] - from[i]) * f; + }; + + gFloat3.cross = [](const float* a, const float* b, float* r) { + const float ax = a[0]; + const float ay = a[1]; + const float az = a[2]; + + const float bx = b[0]; + const float by = b[1]; + const float bz = b[2]; + + r[0] = ay * bz - az * by; + r[1] = az * bx - ax * bz; + r[2] = ax * by - ay * bx; + }; + } +} + +inline void swap(float& a, float& b) +{ + float temp = a; + a = b; + b = temp; +} + + +namespace math_backend::mat44::dispatch +{ + void install_scalar() + { + gMat44.transpose = [](float* a) { + swap(a[1], a[4]); + swap(a[2], a[8]); + swap(a[3], a[12]); + swap(a[6], a[9]); + swap(a[7], a[13]); + swap(a[11], a[14]); + }; + + gMat44.scale = [](float* a, const float* s) { + // Note, doesn't allow scaling w... 
+ + a[0] *= s[0]; a[1] *= s[1]; a[2] *= s[2]; + a[4] *= s[0]; a[5] *= s[1]; a[6] *= s[2]; + a[8] *= s[0]; a[9] *= s[1]; a[10] *= s[2]; + a[12] *= s[0]; a[13] *= s[1]; a[14] *= s[2]; + }; + } +} diff --git a/Engine/source/math/isa/avx/avx_intrinsics.h b/Engine/source/math/isa/avx/avx_intrinsics.h new file mode 100644 index 000000000..0340d84a2 --- /dev/null +++ b/Engine/source/math/isa/avx/avx_intrinsics.h @@ -0,0 +1,140 @@ +#pragma once +#include // AVX/AVX2 intrinsics + +namespace +{ + typedef __m128 f32x4; + + //------------------------------------------------------ + // Load / Store + //------------------------------------------------------ + + // Load 4 floats from memory into a SIMD register + inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + inline f32x4 v_zero() { return _mm_setzero_ps(); } + + inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); } + + //------------------------------------------------------ + // Mask helpers + //------------------------------------------------------ + + inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); } + + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) + { + return _mm_blend_ps(newv, original, 0b1000); + } + + //------------------------------------------------------ + // Float3 helpers (safe loading into 4 lanes) + //------------------------------------------------------ + + inline f32x4 v_load3_vec(const float* p) // w = 0 + { + return _mm_set_ps(0.0f, p[2], p[1], p[0]); + } + + inline f32x4 v_load3_pos(const float* p) // w = 1 + { + return _mm_set_ps(1.0f, p[2], p[1], p[0]); + } + + inline void v_store3(float* dst, f32x4 v) + { + alignas(16) float tmp[4]; // temp storage + _mm_store_ps(tmp, v); // store all 4 lanes + dst[0] = tmp[0]; + dst[1] = tmp[1]; + dst[2] = tmp[2]; + } + + 
//------------------------------------------------------ + // Simple Arithmetic + //------------------------------------------------------ + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + //------------------------------------------------------ + // Fast recip + //------------------------------------------------------ + + // Fast recip 1/b + inline f32x4 v_rcp_nr(f32x4 b) + { + f32x4 r = _mm_rcp_ps(b); + f32x4 two = _mm_set1_ps(2.0f); + return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + } + + // Divide fast ( b = recip eg 1/b) + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); } + + inline f32x4 v_rsqrt_nr(f32x4 x) + { + f32x4 r = _mm_rsqrt_ps(x); + + f32x4 half = _mm_set1_ps(0.5f); + f32x4 three = _mm_set1_ps(3.0f); + + r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); + + return _mm_mul_ps(r, half); + } + + //------------------------------------------------------ + // Vector intrinsic functions + //------------------------------------------------------ + + // full dot4 + inline f32x4 v_dot4(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0xF1); // multiply all 4 lanes, sum broadcast to lane 0 + } + + // dot3 (ignores w) + inline f32x4 v_dot3(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0x71); // multiply first 3 lanes (xyz), sum broadcast to lane 0 + } + + // cross product xyz only. 
+ inline f32x4 v_cross(f32x4 a, f32x4 b) + { + f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); + f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); + + f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b)); + + return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1)); + } + + inline f32x4 v_normalize3(f32x4 v) + { + f32x4 inv = v_rsqrt_nr(v_dot3(v, v)); + return _mm_mul_ps(v, inv); + } + + // adds all 4 lanes together. + inline f32x4 v_hadd4(f32x4 a) + { + // sum all 4 lanes in SSE41 + __m128 sum = _mm_hadd_ps(a, a); + return _mm_hadd_ps(sum, sum); + } +} diff --git a/Engine/source/math/isa/avx/float3.cpp b/Engine/source/math/isa/avx/float3.cpp new file mode 100644 index 000000000..7829f292c --- /dev/null +++ b/Engine/source/math/isa/avx/float3.cpp @@ -0,0 +1,26 @@ +#include "avx_intrinsics.h" +#include "float3_dispatch.h" +#include // AVX/AVX2 intrinsics + +#include "float3_impl.inl" + +namespace math_backend::float3::dispatch +{ + // Install AVX backend + void install_avx() + { + gFloat3.add = float3_add_impl; + gFloat3.sub = float3_sub_impl; + gFloat3.mul = float3_mul_impl; + gFloat3.mul_scalar = float3_mul_scalar_impl; + gFloat3.div = float3_div_impl; + gFloat3.div_scalar = float3_div_scalar_impl; + gFloat3.dot = float3_dot_impl; + gFloat3.length = float3_length_impl; + gFloat3.lengthSquared = float3_length_squared_impl; + gFloat3.normalize = float3_normalize_impl; + gFloat3.normalize_mag = float3_normalize_mag_impl; + gFloat3.lerp = float3_lerp_impl; + gFloat3.cross = float3_cross_impl; + } +} diff --git a/Engine/source/math/isa/avx/float4.cpp b/Engine/source/math/isa/avx/float4.cpp index 1e23fb8b1..cfdab0908 100644 --- a/Engine/source/math/isa/avx/float4.cpp +++ b/Engine/source/math/isa/avx/float4.cpp @@ -1,49 +1,5 @@ - +#include "avx_intrinsics.h" #include "float4_dispatch.h" -#include // AVX/AVX2 intrinsics - -namespace -{ - typedef __m128 f32x4; - - // Load 4 floats from memory into a SIMD register - inline f32x4 v_load(const 
float* p) { return _mm_loadu_ps(p); } - - // Store 4 floats from SIMD register back to memory - inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } - - // Broadcast a single float across all 4 lanes - inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } - - // Element-wise multiply - inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } - - // Element-wise divide - inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } - - // Element-wise add - inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } - - // Element-wise subtract - inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - - // Horizontal sum of all 4 elements (for dot product, length, etc.) - inline float v_hadd4(f32x4 a) - { - __m128 t1 = _mm_hadd_ps(a, a); // sums pairs: [a0+a1, a2+a3, ...] - __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3 - return _mm_cvtss_f32(t2); // extract first element - } - - // specialized dot product for AVX - float float4_dot_avx(const float* a, const float* b) - { - f32x4 va = _mm_loadu_ps(a); - f32x4 vb = _mm_loadu_ps(b); - __m128 dp = _mm_dp_ps(va, vb, 0xF1); // multiply all 4, sum all 4, lowest lane - return _mm_cvtss_f32(dp); - } -} #include "float4_impl.inl" @@ -52,16 +8,18 @@ namespace math_backend::float4::dispatch // Install AVX backend void install_avx() { - gFloat4.add = float4_add_impl; - gFloat4.sub = float4_sub_impl; - gFloat4.mul = float4_mul_impl; - gFloat4.mul_scalar = float4_mul_scalar_impl; - gFloat4.div = float4_div_impl; - gFloat4.div_scalar = float4_div_scalar_impl; - gFloat4.dot = float4_dot_avx; - gFloat4.length = float4_length_impl; + gFloat4.add = float4_add_impl; + gFloat4.sub = float4_sub_impl; + gFloat4.mul = float4_mul_impl; + gFloat4.mul_scalar = float4_mul_scalar_impl; + gFloat4.div = float4_div_impl; + gFloat4.div_scalar = float4_div_scalar_impl; + gFloat4.dot = float4_dot_impl; + gFloat4.length = float4_length_impl; gFloat4.lengthSquared = 
float4_length_squared_impl; - gFloat4.normalize = float4_normalize_impl; - gFloat4.lerp = float4_lerp_impl; + gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; + gFloat4.lerp = float4_lerp_impl; + gFloat4.cross = float4_cross_impl; } } diff --git a/Engine/source/math/isa/avx2/avx2_intrinsics.h b/Engine/source/math/isa/avx2/avx2_intrinsics.h new file mode 100644 index 000000000..0340d84a2 --- /dev/null +++ b/Engine/source/math/isa/avx2/avx2_intrinsics.h @@ -0,0 +1,140 @@ +#pragma once +#include // AVX/AVX2 intrinsics + +namespace +{ + typedef __m128 f32x4; + + //------------------------------------------------------ + // Load / Store + //------------------------------------------------------ + + // Load 4 floats from memory into a SIMD register + inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + inline f32x4 v_zero() { return _mm_setzero_ps(); } + + inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); } + + //------------------------------------------------------ + // Mask helpers + //------------------------------------------------------ + + inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); } + + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) + { + return _mm_blend_ps(newv, original, 0b1000); + } + + //------------------------------------------------------ + // Float3 helpers (safe loading into 4 lanes) + //------------------------------------------------------ + + inline f32x4 v_load3_vec(const float* p) // w = 0 + { + return _mm_set_ps(0.0f, p[2], p[1], p[0]); + } + + inline f32x4 v_load3_pos(const float* p) // w = 1 + { + return _mm_set_ps(1.0f, p[2], p[1], p[0]); + } + + inline void v_store3(float* dst, f32x4 v) + { + alignas(16) float tmp[4]; // temp storage + _mm_store_ps(tmp, v); // store all 4 lanes + 
dst[0] = tmp[0]; + dst[1] = tmp[1]; + dst[2] = tmp[2]; + } + + //------------------------------------------------------ + // Simple Arithmetic + //------------------------------------------------------ + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + //------------------------------------------------------ + // Fast recip + //------------------------------------------------------ + + // Fast recip 1/b + inline f32x4 v_rcp_nr(f32x4 b) + { + f32x4 r = _mm_rcp_ps(b); + f32x4 two = _mm_set1_ps(2.0f); + return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + } + + // Divide fast ( b = recip eg 1/b) + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); } + + inline f32x4 v_rsqrt_nr(f32x4 x) + { + f32x4 r = _mm_rsqrt_ps(x); + + f32x4 half = _mm_set1_ps(0.5f); + f32x4 three = _mm_set1_ps(3.0f); + + r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); + + return _mm_mul_ps(r, half); + } + + //------------------------------------------------------ + // Vector intrinsic functions + //------------------------------------------------------ + + // full dot4 + inline f32x4 v_dot4(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0xF1); // multiply all 4 lanes, sum broadcast to lane 0 + } + + // dot3 (ignores w) + inline f32x4 v_dot3(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0x71); // multiply first 3 lanes (xyz), sum broadcast to lane 0 + } + + // cross product xyz only. 
+ inline f32x4 v_cross(f32x4 a, f32x4 b) + { + f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); + f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); + + f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b)); + + return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1)); + } + + inline f32x4 v_normalize3(f32x4 v) + { + f32x4 inv = v_rsqrt_nr(v_dot3(v, v)); + return _mm_mul_ps(v, inv); + } + + // adds all 4 lanes together. + inline f32x4 v_hadd4(f32x4 a) + { + // sum all 4 lanes in SSE41 + __m128 sum = _mm_hadd_ps(a, a); + return _mm_hadd_ps(sum, sum); + } +} diff --git a/Engine/source/math/isa/avx2/float3.cpp b/Engine/source/math/isa/avx2/float3.cpp new file mode 100644 index 000000000..fd0d485f6 --- /dev/null +++ b/Engine/source/math/isa/avx2/float3.cpp @@ -0,0 +1,26 @@ +#include "avx2_intrinsics.h" +#include "float3_dispatch.h" +#include // AVX/AVX2 intrinsics + +#include "float3_impl.inl" + +namespace math_backend::float3::dispatch +{ + // Install AVX2 backend + void install_avx2() + { + gFloat3.add = float3_add_impl; + gFloat3.sub = float3_sub_impl; + gFloat3.mul = float3_mul_impl; + gFloat3.mul_scalar = float3_mul_scalar_impl; + gFloat3.div = float3_div_impl; + gFloat3.div_scalar = float3_div_scalar_impl; + gFloat3.dot = float3_dot_impl; + gFloat3.length = float3_length_impl; + gFloat3.lengthSquared = float3_length_squared_impl; + gFloat3.normalize = float3_normalize_impl; + gFloat3.normalize_mag = float3_normalize_mag_impl; + gFloat3.lerp = float3_lerp_impl; + gFloat3.cross = float3_cross_impl; + } +} diff --git a/Engine/source/math/isa/avx2/float4.cpp b/Engine/source/math/isa/avx2/float4.cpp index 439d0e2d0..85228caaf 100644 --- a/Engine/source/math/isa/avx2/float4.cpp +++ b/Engine/source/math/isa/avx2/float4.cpp @@ -1,49 +1,5 @@ - +#include "avx2_intrinsics.h" #include "float4_dispatch.h" -#include // AVX/AVX2 intrinsics - -namespace -{ - typedef __m128 f32x4; - - // Load 4 floats from memory into a SIMD register - inline f32x4 
v_load(const float* p) { return _mm_loadu_ps(p); } - - // Store 4 floats from SIMD register back to memory - inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } - - // Broadcast a single float across all 4 lanes - inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } - - // Element-wise multiply - inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } - - // Element-wise divide - inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } - - // Element-wise add - inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } - - // Element-wise subtract - inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - - // Horizontal sum of all 4 elements (for dot product, length, etc.) - inline float v_hadd4(f32x4 a) - { - __m128 t1 = _mm_hadd_ps(a, a); // sums pairs: [a0+a1, a2+a3, ...] - __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3 - return _mm_cvtss_f32(t2); // extract first element - } - - // specialized dot product for AVX - float float4_dot_avx(const float* a, const float* b) - { - f32x4 va = _mm_loadu_ps(a); - f32x4 vb = _mm_loadu_ps(b); - __m128 dp = _mm_dp_ps(va, vb, 0xF1); // multiply all 4, sum all 4, lowest lane - return _mm_cvtss_f32(dp); - } -} #include "float4_impl.inl" @@ -58,10 +14,12 @@ namespace math_backend::float4::dispatch gFloat4.mul_scalar = float4_mul_scalar_impl; gFloat4.div = float4_div_impl; gFloat4.div_scalar = float4_div_scalar_impl; - gFloat4.dot = float4_dot_avx; + gFloat4.dot = float4_dot_impl; gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; gFloat4.lerp = float4_lerp_impl; + gFloat4.cross = float4_cross_impl; } } diff --git a/Engine/source/math/isa/neon/float3.cpp b/Engine/source/math/isa/neon/float3.cpp new file mode 100644 index 000000000..904787932 --- /dev/null +++ b/Engine/source/math/isa/neon/float3.cpp @@ -0,0 +1,25 
@@ +#include "neon_intrinsics.h" +#include "float3_dispatch.h" + +#include "float3_impl.inl" + +namespace math_backend::float3::dispatch +{ + // Install NEON backend + void install_neon() + { + gFloat3.add = float3_add_impl; + gFloat3.sub = float3_sub_impl; + gFloat3.mul = float3_mul_impl; + gFloat3.mul_scalar = float3_mul_scalar_impl; + gFloat3.div = float3_div_impl; + gFloat3.div_scalar = float3_div_scalar_impl; + gFloat3.dot = float3_dot_impl; + gFloat3.length = float3_length_impl; + gFloat3.lengthSquared = float3_length_squared_impl; + gFloat3.normalize = float3_normalize_impl; + gFloat3.normalize_mag = float3_normalize_mag_impl; + gFloat3.lerp = float3_lerp_impl; + gFloat3.cross = float3_cross_impl; + } +} diff --git a/Engine/source/math/isa/neon/float4.cpp b/Engine/source/math/isa/neon/float4.cpp index 6258db743..6996f2a76 100644 --- a/Engine/source/math/isa/neon/float4.cpp +++ b/Engine/source/math/isa/neon/float4.cpp @@ -1,50 +1,25 @@ +#include "neon_intrinsics.h" #include "float4_dispatch.h" -#include -namespace -{ - typedef float32x4_t f32x4; - - inline f32x4 v_load(const float* p) { return vld1q_f32(p); } - inline void v_store(float* dst, f32x4 v) { vst1q_f32(dst, v); } - inline f32x4 v_set1(float s) { return vdupq_n_f32(s); } - - inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); } - inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); } - inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); } - - // AArch64 native divide - inline f32x4 v_div(f32x4 a, f32x4 b) - { - return vdivq_f32(a, b); - } - - inline float v_hadd4(f32x4 a) - { - float32x2_t low = vget_low_f32(a); - float32x2_t high = vget_high_f32(a); - float32x2_t sum = vadd_f32(low, high); - sum = vpadd_f32(sum, sum); - return vget_lane_f32(sum, 0); - } -} - -#include "../../impl/float4_impl.inl" +#include "float4_impl.inl" namespace math_backend::float4::dispatch { + // Install NEON64 backend void install_neon() { - gFloat4.add = float4_add_impl; - gFloat4.sub = 
float4_sub_impl; - gFloat4.mul = float4_mul_impl; - gFloat4.mul_scalar = float4_mul_scalar_impl; - gFloat4.div = float4_div_impl; - gFloat4.div_scalar = float4_div_scalar_impl; - gFloat4.dot = float4_dot_impl; - gFloat4.length = float4_length_impl; + gFloat4.add = float4_add_impl; + gFloat4.sub = float4_sub_impl; + gFloat4.mul = float4_mul_impl; + gFloat4.mul_scalar = float4_mul_scalar_impl; + gFloat4.div = float4_div_impl; + gFloat4.div_scalar = float4_div_scalar_impl; + gFloat4.dot = float4_dot_impl; + gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; - gFloat4.normalize = float4_normalize_impl; - gFloat4.lerp = float4_lerp_impl; + gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; + gFloat4.lerp = float4_lerp_impl; + gFloat4.cross = float4_cross_impl; } } diff --git a/Engine/source/math/isa/neon/neon_intrinsics.h b/Engine/source/math/isa/neon/neon_intrinsics.h new file mode 100644 index 000000000..3476fab1b --- /dev/null +++ b/Engine/source/math/isa/neon/neon_intrinsics.h @@ -0,0 +1,130 @@ +#pragma once +#include <arm_neon.h> + +namespace +{ + typedef float32x4_t f32x4; + + //------------------------------------------------------ + // Load / Store + //------------------------------------------------------ + inline f32x4 v_load(const float* p) { return vld1q_f32(p); } + inline void v_store(float* dst, f32x4 v) { vst1q_f32(dst, v); } + inline f32x4 v_set1(float s) { return vdupq_n_f32(s); } + inline f32x4 v_zero() { return vdupq_n_f32(0.0f); } + inline float v_extract0(f32x4 v) { return vgetq_lane_f32(v, 0); } + + //------------------------------------------------------ + // Mask helpers + //------------------------------------------------------ + inline f32x4 v_mask_xyz() + { + // equivalent to [1,1,1,0] + float32x4_t mask = {1.0f, 1.0f, 1.0f, 0.0f}; + return mask; + } + + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) + { + uint32x4_t mask = {0u, 0u, 0u, 0xFFFFFFFFu}; // all-ones bit mask in w lane + return 
vbslq_f32(mask, original, newv); + } + + //------------------------------------------------------ + // Float3 helpers + //------------------------------------------------------ + inline f32x4 v_load3_vec(const float* p) // w = 0 + { + float tmp[4] = { p[0], p[1], p[2], 0.0f }; + return vld1q_f32(tmp); + } + + inline f32x4 v_load3_pos(const float* p) // w = 1 + { + float tmp[4] = { p[0], p[1], p[2], 1.0f }; + return vld1q_f32(tmp); + } + + inline void v_store3(float* dst, f32x4 v) + { + float tmp[4]; + vst1q_f32(tmp, v); + dst[0] = tmp[0]; + dst[1] = tmp[1]; + dst[2] = tmp[2]; + } + + //------------------------------------------------------ + // Simple Arithmetic + //------------------------------------------------------ + inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); } + inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return vdivq_f32(a, b); } // only NEON64 + inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); } + inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); } + + //------------------------------------------------------ + // Fast recip + //------------------------------------------------------ + inline f32x4 v_rcp_nr(f32x4 b) + { + f32x4 r = vrecpeq_f32(b); + r = vmulq_f32(r, vrecpsq_f32(b, r)); // Newton-Raphson + r = vmulq_f32(r, vrecpsq_f32(b, r)); + return r; + } + + inline f32x4 v_div(f32x4 a, f32x4 b) + { + return vmulq_f32(a, v_rcp_nr(b)); + } + + inline f32x4 v_rsqrt_nr(f32x4 x) + { + f32x4 r = vrsqrteq_f32(x); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r,r), x)); // refine + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r,r), x)); + return r; + } + + //------------------------------------------------------ + // Vector intrinsic functions + //------------------------------------------------------ + inline f32x4 v_dot4(f32x4 a, f32x4 b) + { + f32x4 mul = vmulq_f32(a, b); + float32x2_t sum2 = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul)); + float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 
1); + return vdupq_n_f32(sum); + } + + inline f32x4 v_dot3(f32x4 a, f32x4 b) + { + float32x4_t mask = {1.0f, 1.0f, 1.0f, 0.0f}; + f32x4 mul = vmulq_f32(a, b); + mul = vmulq_f32(mul, mask); + float32x2_t sum2 = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul)); + float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1); + return vdupq_n_f32(sum); + } + + inline f32x4 v_cross(f32x4 a, f32x4 b) + { + float32x4_t a_yzx = { vgetq_lane_f32(a,1), vgetq_lane_f32(a,2), vgetq_lane_f32(a,0), 0 }; + float32x4_t b_yzx = { vgetq_lane_f32(b,1), vgetq_lane_f32(b,2), vgetq_lane_f32(b,0), 0 }; + float32x4_t c = vsubq_f32(vmulq_f32(a, b_yzx), vmulq_f32(a_yzx, b)); + return (float32x4_t){ vgetq_lane_f32(c,1), vgetq_lane_f32(c,2), vgetq_lane_f32(c,0), 0 }; // c = (z,x,y): unshuffle lanes (1,2,0) to match SSE _MM_SHUFFLE(3,0,2,1) + } + + inline f32x4 v_normalize3(f32x4 v) + { + f32x4 inv = v_rsqrt_nr(v_dot3(v,v)); + return vmulq_f32(v, inv); + } + + inline f32x4 v_hadd4(f32x4 a) + { + float32x2_t sum2 = vpadd_f32(vget_low_f32(a), vget_high_f32(a)); + float sum = vget_lane_f32(sum2,0) + vget_lane_f32(sum2,1); + return vdupq_n_f32(sum); + } +} diff --git a/Engine/source/math/isa/sse2/float3.cpp b/Engine/source/math/isa/sse2/float3.cpp new file mode 100644 index 000000000..bc822e8fc --- /dev/null +++ b/Engine/source/math/isa/sse2/float3.cpp @@ -0,0 +1,26 @@ +#include "sse2_intrinsics.h" +#include "float3_dispatch.h" +#include <emmintrin.h> // SSE2 intrinsics + +#include "float3_impl.inl" + +namespace math_backend::float3::dispatch +{ + // Install SSE2 backend + void install_sse2() + { + gFloat3.add = float3_add_impl; + gFloat3.sub = float3_sub_impl; + gFloat3.mul = float3_mul_impl; + gFloat3.mul_scalar = float3_mul_scalar_impl; + gFloat3.div = float3_div_impl; + gFloat3.div_scalar = float3_div_scalar_impl; + gFloat3.dot = float3_dot_impl; + gFloat3.length = float3_length_impl; + gFloat3.lengthSquared = float3_length_squared_impl; + gFloat3.normalize = float3_normalize_impl; + gFloat3.normalize_mag = float3_normalize_mag_impl; + gFloat3.lerp = float3_lerp_impl; + 
gFloat3.cross = float3_cross_impl; + } +} diff --git a/Engine/source/math/isa/sse2/float4.cpp b/Engine/source/math/isa/sse2/float4.cpp index 00850560a..aa986474b 100644 --- a/Engine/source/math/isa/sse2/float4.cpp +++ b/Engine/source/math/isa/sse2/float4.cpp @@ -1,42 +1,8 @@ +#include "sse2_intrinsics.h" #include "float4_dispatch.h" -#include // SSE2 intrinsics -namespace -{ - typedef __m128 f32x4; - // Load 4 floats from memory into a SIMD register - inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } +#include "float4_impl.inl" - // Store 4 floats from SIMD register back to memory - inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } - - // Broadcast a single float across all 4 lanes - inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } - - // Element-wise multiply - inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } - - // Element-wise divide - inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } - - // Element-wise add - inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } - - // Element-wise subtract - inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - - // Horizontal sum of all 4 elements (for dot product, length, etc.) 
- inline float v_hadd4(f32x4 a) - { - __m128 shuf = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); // swap pairs - __m128 sums = _mm_add_ps(a, shuf); // sums: [a0+a1 a1+a0 a2+a3 a3+a2] - shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)); // move high pair to low - sums = _mm_add_ps(sums, shuf); // total sum in lower float - return _mm_cvtss_f32(sums); - } -} - -#include "../../impl/float4_impl.inl" namespace math_backend::float4::dispatch { @@ -53,6 +19,8 @@ namespace math_backend::float4::dispatch gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; gFloat4.lerp = float4_lerp_impl; + gFloat4.cross = float4_cross_impl; } } diff --git a/Engine/source/math/isa/sse2/sse2_intrinsics.h b/Engine/source/math/isa/sse2/sse2_intrinsics.h new file mode 100644 index 000000000..85b09ebb0 --- /dev/null +++ b/Engine/source/math/isa/sse2/sse2_intrinsics.h @@ -0,0 +1,156 @@ +#pragma once +#include <emmintrin.h> // SSE2 +#include <xmmintrin.h> // SSE + +namespace +{ + typedef __m128 f32x4; + + //------------------------------------------------------ + // Load / Store + //------------------------------------------------------ + + // Load 4 floats from memory into a SIMD register + inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + inline f32x4 v_zero() { return _mm_setzero_ps(); } + + inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); } + + //------------------------------------------------------ + // Float3 helpers (safe loading into 4 lanes) + //------------------------------------------------------ + + inline f32x4 v_load3_vec(const float* p) // w = 0 + { + return _mm_set_ps(0.0f, p[2], p[1], p[0]); + } + + inline f32x4 v_load3_pos(const float* p) // w = 1 + { + return _mm_set_ps(1.0f, p[2], p[1], p[0]); + } + + 
inline void v_store3(float* dst, f32x4 v) + { + alignas(16) float tmp[4]; // temp storage + _mm_store_ps(tmp, v); // store all 4 lanes + dst[0] = tmp[0]; + dst[1] = tmp[1]; + dst[2] = tmp[2]; + } + + //------------------------------------------------------ + // Mask helpers + //------------------------------------------------------ + + inline f32x4 v_mask_xyz() { return _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)); } + + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) + { + f32x4 mask = _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)); + return _mm_or_ps(_mm_and_ps(mask, original), _mm_andnot_ps(mask, newv)); + } + + + //------------------------------------------------------ + // Simple Arithmatic + //------------------------------------------------------ + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + //------------------------------------------------------ + // Fast recip + //------------------------------------------------------ + + // Fast recip 1/b + inline f32x4 v_rcp_nr(f32x4 b) + { + f32x4 r = _mm_rcp_ps(b); + f32x4 two = _mm_set1_ps(2.0f); + return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + } + + // Divide fast ( b = recip eg 1/b) + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); } + + inline f32x4 v_rsqrt_nr(f32x4 x) + { + f32x4 r = _mm_rsqrt_ps(x); + + f32x4 half = _mm_set1_ps(0.5f); + f32x4 three = _mm_set1_ps(3.0f); + + r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); + + return _mm_mul_ps(r, half); + } + + //------------------------------------------------------ + // Vector intrinsic functions + //------------------------------------------------------ + + 
// full dot4 + inline f32x4 v_dot4(f32x4 a, f32x4 b) + { + f32x4 prod = _mm_mul_ps(a, b); // multiply element-wise + f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1)); + prod = _mm_add_ps(prod, shuf); + shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2)); + prod = _mm_add_ps(prod, shuf); + return prod; // f32x4, all lanes = dot(a,b) + } + + // dot3 (ignores w) + inline f32x4 v_dot3(f32x4 a, f32x4 b) + { + f32x4 prod = _mm_mul_ps(a, b); + prod = _mm_and_ps(prod, v_mask_xyz()); // zero w + f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1)); + prod = _mm_add_ps(prod, shuf); + shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2)); + prod = _mm_add_ps(prod, shuf); + return prod; // f32x4, all lanes = dot(a,b) + } + + // cross product xyz only. + inline f32x4 v_cross(f32x4 a, f32x4 b) + { + f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); + f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); + + f32x4 c = _mm_sub_ps( _mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b)); + + return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1)); + } + + inline f32x4 v_normalize3(f32x4 v) + { + f32x4 inv = v_rsqrt_nr(v_dot3(v, v)); + return _mm_mul_ps(v, inv); + } + + // adds all 4 lanes together. 
+ inline f32x4 v_hadd4(f32x4 a) + { + // sum all 4 lanes in SSE2 + __m128 shuf = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); // swap pairs + __m128 sums = _mm_add_ps(a, shuf); + shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)); + return _mm_add_ps(sums, shuf); + } +} diff --git a/Engine/source/math/isa/sse41/float3.cpp b/Engine/source/math/isa/sse41/float3.cpp new file mode 100644 index 000000000..adf1a3a3d --- /dev/null +++ b/Engine/source/math/isa/sse41/float3.cpp @@ -0,0 +1,26 @@ +#include "sse41_intrinsics.h" +#include "float3_dispatch.h" +#include // SSE41 intrinsics + +#include "float3_impl.inl" + +namespace math_backend::float3::dispatch +{ + // Install SSE41 backend + void install_sse41() + { + gFloat3.add = float3_add_impl; + gFloat3.sub = float3_sub_impl; + gFloat3.mul = float3_mul_impl; + gFloat3.mul_scalar = float3_mul_scalar_impl; + gFloat3.div = float3_div_impl; + gFloat3.div_scalar = float3_div_scalar_impl; + gFloat3.dot = float3_dot_impl; + gFloat3.length = float3_length_impl; + gFloat3.lengthSquared = float3_length_squared_impl; + gFloat3.normalize = float3_normalize_impl; + gFloat3.normalize_mag = float3_normalize_mag_impl; + gFloat3.lerp = float3_lerp_impl; + gFloat3.cross = float3_cross_impl; + } +} diff --git a/Engine/source/math/isa/sse41/float4.cpp b/Engine/source/math/isa/sse41/float4.cpp index 80127acb9..e9ca8aae2 100644 --- a/Engine/source/math/isa/sse41/float4.cpp +++ b/Engine/source/math/isa/sse41/float4.cpp @@ -1,49 +1,5 @@ - +#include "sse41_intrinsics.h" #include "float4_dispatch.h" -#include // SSE41 intrinsics - -namespace -{ - typedef __m128 f32x4; - - // Load 4 floats from memory into a SIMD register - inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } - - // Store 4 floats from SIMD register back to memory - inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } - - // Broadcast a single float across all 4 lanes - inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } - - // Element-wise 
multiply - inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } - - // Element-wise divide - inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } - - // Element-wise add - inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } - - // Element-wise subtract - inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - - // Horizontal sum of all 4 elements (for dot product, length, etc.) - inline float v_hadd4(f32x4 a) - { - __m128 t1 = _mm_hadd_ps(a, a); // sums pairs: [a0+a1, a2+a3, ...] - __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3 - return _mm_cvtss_f32(t2); // extract first element - } - - // specialized dot product for SSE4.1 - float float4_dot_sse41(const float* a, const float* b) - { - f32x4 va = _mm_loadu_ps(a); - f32x4 vb = _mm_loadu_ps(b); - __m128 dp = _mm_dp_ps(va, vb, 0xF1); // multiply all 4, sum all 4, lowest lane - return _mm_cvtss_f32(dp); - } -} #include "float4_impl.inl" @@ -58,10 +14,12 @@ namespace math_backend::float4::dispatch gFloat4.mul_scalar = float4_mul_scalar_impl; gFloat4.div = float4_div_impl; gFloat4.div_scalar = float4_div_scalar_impl; - gFloat4.dot = float4_dot_sse41; + gFloat4.dot = float4_dot_impl; gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; gFloat4.lerp = float4_lerp_impl; + gFloat4.cross = float4_cross_impl; } } diff --git a/Engine/source/math/isa/sse41/sse41_intrinsics.h b/Engine/source/math/isa/sse41/sse41_intrinsics.h new file mode 100644 index 000000000..047cb44ee --- /dev/null +++ b/Engine/source/math/isa/sse41/sse41_intrinsics.h @@ -0,0 +1,140 @@ +#pragma once +#include <smmintrin.h> // SSE4.1 + +namespace +{ + typedef __m128 f32x4; + + //------------------------------------------------------ + // Load / Store + //------------------------------------------------------ + + // Load 4 floats from memory into a SIMD register + 
inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + inline f32x4 v_zero() { return _mm_setzero_ps(); } + + inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); } + + //------------------------------------------------------ + // Mask helpers + //------------------------------------------------------ + + inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); } + + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) + { + return _mm_blend_ps(newv, original, 0b1000); + } + + //------------------------------------------------------ + // Float3 helpers (safe loading into 4 lanes) + //------------------------------------------------------ + + inline f32x4 v_load3_vec(const float* p) // w = 0 + { + return _mm_set_ps(0.0f, p[2], p[1], p[0]); + } + + inline f32x4 v_load3_pos(const float* p) // w = 1 + { + return _mm_set_ps(1.0f, p[2], p[1], p[0]); + } + + inline void v_store3(float* dst, f32x4 v) + { + alignas(16) float tmp[4]; // temp storage + _mm_store_ps(tmp, v); // store all 4 lanes + dst[0] = tmp[0]; + dst[1] = tmp[1]; + dst[2] = tmp[2]; + } + + //------------------------------------------------------ + // Simple Arithmatic + //------------------------------------------------------ + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + //------------------------------------------------------ + // Fast recip + //------------------------------------------------------ + + // Fast recip 1/b + inline f32x4 v_rcp_nr(f32x4 b) + { + f32x4 r = 
_mm_rcp_ps(b); + f32x4 two = _mm_set1_ps(2.0f); + return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + } + + // Divide fast ( b = recip eg 1/b) + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); } + + inline f32x4 v_rsqrt_nr(f32x4 x) + { + f32x4 r = _mm_rsqrt_ps(x); + + f32x4 half = _mm_set1_ps(0.5f); + f32x4 three = _mm_set1_ps(3.0f); + + r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); + + return _mm_mul_ps(r, half); + } + + //------------------------------------------------------ + // Vector intrinsic functions + //------------------------------------------------------ + + // full dot4 + inline f32x4 v_dot4(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1 + } + + // dot3 (ignores w) + inline f32x4 v_dot3(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1 + } + + // cross product xyz only. + inline f32x4 v_cross(f32x4 a, f32x4 b) + { + f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); + f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); + + f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b)); + + return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1)); + } + + inline f32x4 v_normalize3(f32x4 v) + { + f32x4 inv = v_rsqrt_nr(v_dot3(v, v)); + return _mm_mul_ps(v, inv); + } + + // adds all 4 lanes together. 
+ inline f32x4 v_hadd4(f32x4 a) + { + // sum all 4 lanes in SSE41 + __m128 sum = _mm_hadd_ps(a, a); + return _mm_hadd_ps(sum, sum); + } +} diff --git a/Engine/source/math/mPoint3.h b/Engine/source/math/mPoint3.h index c1bb72923..8e329f3a1 100644 --- a/Engine/source/math/mPoint3.h +++ b/Engine/source/math/mPoint3.h @@ -29,6 +29,11 @@ #ifndef _MPOINT2_H_ #include "math/mPoint2.h" #endif +#ifndef _MATH_BACKEND_H_ +#include "math/public/math_backend.h" +#endif + +#include //------------------------------------------------------------------------------ /// 3D integer point @@ -97,6 +102,7 @@ public: class Point3D; //------------------------------------------------------------------------------ +using math_backend::float3::dispatch::gFloat3; class Point3F { //-------------------------------------- Public data @@ -497,7 +503,8 @@ inline void Point3F::setMax(const Point3F& _test) inline void Point3F::interpolate(const Point3F& _from, const Point3F& _to, F32 _factor) { AssertFatal(_factor >= 0.0f && _factor <= 1.0f, "Out of bound interpolation factor"); - m_point3F_interpolate( _from, _to, _factor, *this); + + gFloat3.lerp(_from, _to, _factor, *this); } inline void Point3F::zero() @@ -599,17 +606,17 @@ inline void Point3F::convolveInverse(const Point3F& c) inline F32 Point3F::lenSquared() const { - return (x * x) + (y * y) + (z * z); + return gFloat3.lengthSquared(*this); } inline F32 Point3F::len() const { - return mSqrt(x*x + y*y + z*z); + return gFloat3.length(*this); } inline void Point3F::normalize() { - m_point3F_normalize(*this); + gFloat3.normalize(*this); } inline F32 Point3F::magnitudeSafe() const @@ -626,18 +633,13 @@ inline F32 Point3F::magnitudeSafe() const inline void Point3F::normalizeSafe() { - F32 vmag = magnitudeSafe(); - - if( vmag > POINT_EPSILON ) - { - *this *= F32(1.0 / vmag); - } + gFloat3.normalize(*this); } inline void Point3F::normalize(F32 val) { - m_point3F_normalize_f(*this, val); + gFloat3.normalize_mag(*this, val); } inline bool 
Point3F::operator==(const Point3F& _test) const @@ -652,52 +654,49 @@ inline bool Point3F::operator!=(const Point3F& _test) const inline Point3F Point3F::operator+(const Point3F& _add) const { - return Point3F(x + _add.x, y + _add.y, z + _add.z); + Point3F temp; + gFloat3.add(*this, _add, temp); + return temp; } inline Point3F Point3F::operator-(const Point3F& _rSub) const { - return Point3F(x - _rSub.x, y - _rSub.y, z - _rSub.z); + Point3F temp; + gFloat3.sub(*this, _rSub, temp); + return temp; } inline Point3F& Point3F::operator+=(const Point3F& _add) { - x += _add.x; - y += _add.y; - z += _add.z; - + gFloat3.add(*this, _add, *this); return *this; } inline Point3F& Point3F::operator-=(const Point3F& _rSub) { - x -= _rSub.x; - y -= _rSub.y; - z -= _rSub.z; - + gFloat3.sub(*this, _rSub, *this); return *this; } inline Point3F Point3F::operator*(F32 _mul) const { - return Point3F(x * _mul, y * _mul, z * _mul); + Point3F temp; + gFloat3.mul_scalar(*this, _mul, temp); + return temp; } inline Point3F Point3F::operator/(F32 _div) const { AssertFatal(_div != 0.0f, "Error, div by zero attempted"); - F32 inv = 1.0f / _div; - - return Point3F(x * inv, y * inv, z * inv); + Point3F temp; + gFloat3.div_scalar(*this, _div, temp); + return temp; } inline Point3F& Point3F::operator*=(F32 _mul) { - x *= _mul; - y *= _mul; - z *= _mul; - + gFloat3.mul_scalar(*this, _mul, *this); return *this; } @@ -705,39 +704,35 @@ inline Point3F& Point3F::operator/=(F32 _div) { AssertFatal(_div != 0.0f, "Error, div by zero attempted"); - F32 inv = 1.0f / _div; - x *= inv; - y *= inv; - z *= inv; - + gFloat3.div_scalar(*this, _div, *this); return *this; } inline Point3F Point3F::operator*(const Point3F &_vec) const { - return Point3F(x * _vec.x, y * _vec.y, z * _vec.z); + Point3F temp; + gFloat3.mul(*this, _vec, temp); + return temp; } inline Point3F& Point3F::operator*=(const Point3F &_vec) { - x *= _vec.x; - y *= _vec.y; - z *= _vec.z; + gFloat3.mul(*this, _vec, *this); return *this; } inline 
Point3F Point3F::operator/(const Point3F &_vec) const { AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted"); - return Point3F(x / _vec.x, y / _vec.y, z / _vec.z); + Point3F temp; + gFloat3.div(*this, _vec, temp); + return temp; } inline Point3F& Point3F::operator/=(const Point3F &_vec) { AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted"); - x /= _vec.x; - y /= _vec.y; - z /= _vec.z; + gFloat3.div(*this, _vec, *this); return *this; } @@ -989,7 +984,9 @@ inline Point3I operator*(S32 mul, const Point3I& multiplicand) inline Point3F operator*(F32 mul, const Point3F& multiplicand) { - return multiplicand * mul; + Point3F temp; + gFloat3.mul_scalar(multiplicand, mul, temp); + return temp; } inline Point3D operator*(F64 mul, const Point3D& multiplicand) @@ -999,7 +996,7 @@ inline Point3D operator*(F64 mul, const Point3D& multiplicand) inline F32 mDot(const Point3F &p1, const Point3F &p2) { - return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z); + return gFloat3.dot(p1, p2); } inline F64 mDot(const Point3D &p1, const Point3D &p2) @@ -1009,9 +1006,7 @@ inline F64 mDot(const Point3D &p1, const Point3D &p2) inline void mCross(const Point3F &a, const Point3F &b, Point3F *res) { - res->x = (a.y * b.z) - (a.z * b.y); - res->y = (a.z * b.x) - (a.x * b.z); - res->z = (a.x * b.y) - (a.y * b.x); + gFloat3.cross(a, b, *res); } inline void mCross(const Point3D &a, const Point3D &b, Point3D *res) @@ -1024,7 +1019,7 @@ inline void mCross(const Point3D &a, const Point3D &b, Point3D *res) inline Point3F mCross(const Point3F &a, const Point3F &b) { Point3F r; - mCross( a, b, &r ); + gFloat3.cross(a, b, r); return r; } diff --git a/Engine/source/math/mPoint4.h b/Engine/source/math/mPoint4.h index 8ae173009..0f715f983 100644 --- a/Engine/source/math/mPoint4.h +++ b/Engine/source/math/mPoint4.h @@ -26,10 +26,12 @@ #ifndef _MMATHFN_H_ #include "math/mMathFn.h" #endif - #ifndef _MPOINT3_H_ #include "math/mPoint3.h" 
#endif +#ifndef _MATH_BACKEND_H_ +#include "math/public/math_backend.h" +#endif //------------------------------------------------------------------------------ @@ -61,6 +63,8 @@ class Point4I /// Uses F32 internally. /// /// Useful for representing quaternions and other 4d beasties. +using math_backend::float4::dispatch::gFloat4; + class Point4F { //-------------------------------------- Public data @@ -152,15 +156,12 @@ inline void Point4F::set(F32 _x, F32 _y, F32 _z, F32 _w) inline F32 Point4F::len() const { - return mSqrt(x*x + y*y + z*z + w*w); + return gFloat4.length(*this); } inline void Point4F::interpolate(const Point4F& _from, const Point4F& _to, F32 _factor) { - x = (_from.x * (1.0f - _factor)) + (_to.x * _factor); - y = (_from.y * (1.0f - _factor)) + (_to.y * _factor); - z = (_from.z * (1.0f - _factor)) + (_to.z * _factor); - w = (_from.w * (1.0f - _factor)) + (_to.w * _factor); + gFloat4.lerp(_from, _to, _factor, *this); } inline void Point4F::zero() @@ -193,55 +194,55 @@ inline Point4F& Point4F::operator/=(F32 scalar) if (mIsZero(scalar)) return *this; - F32 denom = 1 / scalar; - - x *= denom; - y *= denom; - z *= denom; - w *= denom; + gFloat4.div_scalar(*this, scalar, *this); return *this; } inline Point4F Point4F::operator+(const Point4F& _add) const { - return Point4F( x + _add.x, y + _add.y, z + _add.z, w + _add.w ); + Point4F res; + gFloat4.add(*this, _add, res); + return res; } inline Point4F& Point4F::operator+=(const Point4F& _add) { - x += _add.x; - y += _add.y; - z += _add.z; - w += _add.w; - + gFloat4.add(*this, _add, *this); return *this; } inline Point4F Point4F::operator-(const Point4F& _rSub) const { - return Point4F( x - _rSub.x, y - _rSub.y, z - _rSub.z, w - _rSub.w ); + Point4F res; + gFloat4.sub(*this, _rSub, res); + return res; } inline Point4F Point4F::operator*(const Point4F &_vec) const { - return Point4F(x * _vec.x, y * _vec.y, z * _vec.z, w * _vec.w); + Point4F res; + gFloat4.mul(*this, _vec, res); + return res; } inline 
Point4F Point4F::operator*(F32 _mul) const { - return Point4F(x * _mul, y * _mul, z * _mul, w * _mul); + Point4F res; + gFloat4.mul_scalar(*this, _mul, res); + return res; } inline Point4F Point4F::operator /(F32 t) const { - F32 f = 1.0f / t; - return Point4F( x * f, y * f, z * f, w * f ); + Point4F res; + gFloat4.div_scalar(*this, t, res); + return res; } inline F32 mDot(const Point4F &p1, const Point4F &p2) { - return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z + p1.w*p2.w); + return gFloat4.dot(p1, p2); } //------------------------------------------------------------------------------ diff --git a/Engine/source/math/public/float3_dispatch.h b/Engine/source/math/public/float3_dispatch.h new file mode 100644 index 000000000..e4279cb84 --- /dev/null +++ b/Engine/source/math/public/float3_dispatch.h @@ -0,0 +1,39 @@ +#pragma once +#ifndef _FLOAT3_DISPATCH_H_ +#define _FLOAT3_DISPATCH_H_ + + +#include + +namespace math_backend::float3::dispatch +{ + struct Float3Funcs + { + void (*add)(const float*, const float*, float*) = nullptr; + void (*sub)(const float*, const float*, float*) = nullptr; + void (*mul)(const float*, const float*, float*) = nullptr; + void (*mul_scalar)(const float*, float, float*) = nullptr; + void (*div)(const float*, const float*, float*) = nullptr; + void (*div_scalar)(const float*, float, float*) = nullptr; + float (*dot)(const float*, const float*) = nullptr; + float (*length)(const float*) = nullptr; + float (*lengthSquared)(const float*) = nullptr; + void (*normalize)(float*) = nullptr; + void (*normalize_mag)(float*, float) = nullptr; + void (*lerp)(const float*, const float*, float, float*) = nullptr; + void (*cross)(const float*, const float*, float*) = nullptr; + }; + + // Global dispatch table + extern Float3Funcs gFloat3; + + // Backend installers (defined in ISA libraries) + void install_scalar(); + void install_sse2(); + void install_sse41(); + void install_avx(); + void install_avx2(); + void install_neon(); +} + +#endif // 
!_FLOAT3_DISPATCH_H_ diff --git a/Engine/source/math/public/float4_dispatch.cpp b/Engine/source/math/public/float4_dispatch.cpp deleted file mode 100644 index 810eb0e46..000000000 --- a/Engine/source/math/public/float4_dispatch.cpp +++ /dev/null @@ -1,7 +0,0 @@ -#include "math/public/float4_dispatch.h" - -namespace math_backend::float4::dispatch -{ - // Single definition of the global dispatch table - Float4Funcs gFloat4{}; -} diff --git a/Engine/source/math/public/float4_dispatch.h b/Engine/source/math/public/float4_dispatch.h index 319b1893f..6f26114ce 100644 --- a/Engine/source/math/public/float4_dispatch.h +++ b/Engine/source/math/public/float4_dispatch.h @@ -19,7 +19,9 @@ namespace math_backend::float4::dispatch float (*length)(const float*) = nullptr; float (*lengthSquared)(const float*) = nullptr; void (*normalize)(float*) = nullptr; + void (*normalize_mag)(float*, float) = nullptr; void (*lerp)(const float*, const float*, float, float*) = nullptr; + void (*cross)(const float*, const float*, float*) = nullptr; }; // Global dispatch table @@ -32,9 +34,6 @@ namespace math_backend::float4::dispatch void install_avx(); void install_avx2(); void install_neon(); - - // Centralized installer (engine calls this once) - void install_preferred(); } #endif // !_FLOAT4_DISPATCH_H_ diff --git a/Engine/source/math/public/mat44_dispatch.h b/Engine/source/math/public/mat44_dispatch.h new file mode 100644 index 000000000..61b488861 --- /dev/null +++ b/Engine/source/math/public/mat44_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +#ifndef _MAT44_DISPATCH_H_ +#define _MAT44_DISPATCH_H_ + + +namespace math_backend::mat44::dispatch +{ + struct Mat44Funcs + { + void (*transpose)(float*) = nullptr; + void (*scale)(float*, const float*) = nullptr; + }; + + // Global dispatch table + extern Mat44Funcs gMat44; + + // Backend installers (defined in ISA libraries) + void install_scalar(); + void install_sse2(); + void install_sse41(); + void install_avx(); + void install_avx2(); + void 
install_neon(); +} + +#endif // !_MAT44_DISPATCH_H_ diff --git a/Engine/source/math/public/math_backend.cpp b/Engine/source/math/public/math_backend.cpp index 7998924ee..9b5e5daed 100644 --- a/Engine/source/math/public/math_backend.cpp +++ b/Engine/source/math/public/math_backend.cpp @@ -1,6 +1,24 @@ #pragma once #include "math/public/math_backend.h" +namespace math_backend::float4::dispatch +{ + // Single definition of the global dispatch table + Float4Funcs gFloat4{}; +} + +namespace math_backend::float3::dispatch +{ + // Single definition of the global dispatch table + Float3Funcs gFloat3{}; +} + +namespace math_backend::mat44::dispatch +{ + Mat44Funcs gMat44{}; +} + + math_backend::backend math_backend::choose_backend(U32 cpu_flags) { #if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) @@ -12,7 +30,7 @@ math_backend::backend math_backend::choose_backend(U32 cpu_flags) #elif defined(__aarch64__) || defined(__ARM_NEON) - if (cpu_flags & CPU_NEON) return backend::neon; + if (cpu_flags & CPU_PROP_NEON) return backend::neon; #endif return backend::scalar; @@ -25,28 +43,36 @@ void math_backend::install_from_cpu_flags(uint32_t cpu_flags) switch (g_backend) { +#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) case backend::avx2: float4::dispatch::install_avx2(); + float3::dispatch::install_avx2(); break; case backend::avx: - //float4::dispatch::install_avx(); + float4::dispatch::install_avx(); + float3::dispatch::install_avx(); break; case backend::sse41: float4::dispatch::install_sse41(); + float3::dispatch::install_sse41(); break; case backend::sse2: float4::dispatch::install_sse2(); + float3::dispatch::install_sse2(); break; - +#elif defined(__aarch64__) || defined(__ARM_NEON) case backend::neon: float4::dispatch::install_neon(); + float3::dispatch::install_neon(); break; - +#endif default: float4::dispatch::install_scalar(); + float3::dispatch::install_scalar(); + mat44::dispatch::install_scalar(); break; } } diff --git 
a/Engine/source/math/public/math_backend.h b/Engine/source/math/public/math_backend.h index 40476e7f0..0a3127f81 100644 --- a/Engine/source/math/public/math_backend.h +++ b/Engine/source/math/public/math_backend.h @@ -1,4 +1,7 @@ #pragma once +#ifndef _MATH_BACKEND_H_ +#define _MATH_BACKEND_H_ + #ifndef _MCONSTANTS_H_ #include "math/mConstants.h" #endif @@ -8,6 +11,12 @@ #ifndef _FLOAT4_DISPATCH_H_ #include "math/public/float4_dispatch.h" #endif +#ifndef _FLOAT3_DISPATCH_H_ +#include "math/public/float3_dispatch.h" +#endif +#ifndef _MAT44_DISPATCH_H_ +#include "math/public/mat44_dispatch.h" +#endif namespace math_backend { @@ -25,3 +34,5 @@ namespace math_backend backend choose_backend(U32 cpu_flags); void install_from_cpu_flags(uint32_t cpu_flags); } + +#endif // !_MATH_BACKEND_H_ diff --git a/Engine/source/platform/platform.h b/Engine/source/platform/platform.h index 1326c4692..0d377f23f 100644 --- a/Engine/source/platform/platform.h +++ b/Engine/source/platform/platform.h @@ -76,7 +76,7 @@ enum ProcessorProperties CPU_PROP_SSE4_2 = (1<<8), ///< Supports SSE4_2 instruction set extension. CPU_PROP_AVX = (1<<9), ///< Supports AVX256 instruction set extension. CPU_PROP_AVX2 = (1<<10), ///< Supports AVX256 instruction set extension. - CPU_PROP_AVX512 = (1<<11), ///< Supports AVX256 instruction set extension. + CPU_PROP_AVX512 = (1<<11), ///< Supports AVX512 instruction set extension. CPU_PROP_MP = (1<<12), ///< This is a multi-processor system. CPU_PROP_LE = (1<<13), ///< This processor is LITTLE ENDIAN. 
CPU_PROP_64bit = (1<<14), ///< This processor is 64-bit capable diff --git a/Engine/source/platformMac/macMath.mm b/Engine/source/platformMac/macMath.mm index 4feefb277..c275bdd17 100644 --- a/Engine/source/platformMac/macMath.mm +++ b/Engine/source/platformMac/macMath.mm @@ -25,6 +25,7 @@ #import "math/mMath.h" #import "core/strings/stringFunctions.h" #include "console/engineAPI.h" +#include "math/public/math_backend.h" extern void mInstallLibrary_C(); @@ -107,7 +108,9 @@ void Math::init(U32 properties) Con::printf("Math Init:"); Con::printf(" Installing Standard C extensions"); - mInstallLibrary_C(); + mInstallLibrary_C(); + + math_backend::install_from_cpu_flags(properties); #ifdef TORQUE_CPU_X86 if( properties & CPU_PROP_SSE ) diff --git a/Engine/source/platformPOSIX/POSIXMath.cpp b/Engine/source/platformPOSIX/POSIXMath.cpp index 8f21329a3..751df294e 100644 --- a/Engine/source/platformPOSIX/POSIXMath.cpp +++ b/Engine/source/platformPOSIX/POSIXMath.cpp @@ -27,6 +27,7 @@ #include "math/mMath.h" #include "core/strings/stringFunctions.h" #include "console/engineAPI.h" +#include "math/public/math_backend.h" extern void mInstallLibrary_C(); extern void mInstallLibrary_ASM(); @@ -90,6 +91,8 @@ void Math::init(U32 properties) Con::printf(" Installing Standard C extensions"); mInstallLibrary_C(); + math_backend::install_from_cpu_flags(properties); + #if defined(TORQUE_CPU_X32) || defined(TORQUE_CPU_X64) Con::printf(" Installing Assembly extensions"); mInstallLibrary_ASM(); diff --git a/Engine/source/platformWin32/winMath.cpp b/Engine/source/platformWin32/winMath.cpp index 44b215301..36a9d8c58 100644 --- a/Engine/source/platformWin32/winMath.cpp +++ b/Engine/source/platformWin32/winMath.cpp @@ -25,7 +25,7 @@ #include "console/engineAPI.h" #include "math/mMath.h" - +#include "math/public/math_backend.h" extern void mInstallLibrary_C(); extern void mInstallLibrary_ASM(); @@ -98,6 +98,8 @@ void Math::init(U32 properties) Con::printf(" Installing Standard C extensions"); 
mInstallLibrary_C(); + math_backend::install_from_cpu_flags(properties); + Con::printf(" Installing Assembly extensions"); mInstallLibrary_ASM(); diff --git a/Engine/source/util/fpsTracker.cpp b/Engine/source/util/fpsTracker.cpp index 021d4a4f5..830000bc8 100644 --- a/Engine/source/util/fpsTracker.cpp +++ b/Engine/source/util/fpsTracker.cpp @@ -36,6 +36,8 @@ void FPSTracker::reset() { fpsNext = (F32)Platform::getRealMilliseconds()/1000.0f + mUpdateInterval; + fpsAccumTime = 0.0f; + fpsAccumVirtualTime = 0.0f; fpsRealLast = 0.0f; fpsReal = 0.0f; fpsRealMin = 0.000001f; // Avoid division by zero. @@ -51,42 +53,60 @@ void FPSTracker::update() F32 realSeconds = (F32)Platform::getRealMilliseconds()/1000.0f; F32 virtualSeconds = (F32)Platform::getVirtualMilliseconds()/1000.0f; - fpsFrames++; - if (fpsFrames > 1) + if (fpsRealLast == 0.0f) { - fpsReal = fpsReal*(1.0-alpha) + (realSeconds-fpsRealLast)*alpha; - fpsVirtual = fpsVirtual*(1.0-alpha) + (virtualSeconds-fpsVirtualLast)*alpha; - - if( fpsFrames > 10 ) // Wait a few frames before updating these. - { - // Update min/max. This is a bit counter-intuitive, as the comparisons are - // inversed because these are all one-over-x values. 
- - if( fpsReal > fpsRealMin ) - fpsRealMin = fpsReal; - if( fpsReal < fpsRealMax ) - fpsRealMax = fpsReal; - } + fpsRealLast = realSeconds; + fpsVirtualLast = virtualSeconds; + return; } - fpsRealLast = realSeconds; + F32 frameTimeReal = realSeconds - fpsRealLast; + F32 frameTimeVirtual = virtualSeconds - fpsVirtualLast; + + fpsRealLast = realSeconds; fpsVirtualLast = virtualSeconds; - // update variables every few frames - F32 update = fpsRealLast - fpsNext; - if (update > 0.5f) - { - F32 delta = realSeconds - fpsNext; - Con::setVariable( "fps::frameDelta",avar("%g", delta)); - Con::setVariable( "fps::real", avar( "%4.1f", 1.0f / fpsReal ) ); - Con::setVariable( "fps::realMin", avar( "%4.1f", 1.0f / fpsRealMin ) ); - Con::setVariable( "fps::realMax", avar( "%4.1f", 1.0f / fpsRealMax ) ); - Con::setVariable( "fps::virtual", avar( "%4.1f", 1.0f / fpsVirtual ) ); + // Accumulate for windowed FPS calculation + fpsFrames++; + fpsAccumTime += frameTimeReal; + fpsAccumVirtualTime += frameTimeVirtual; - if (update > mUpdateInterval) - fpsNext = fpsRealLast + mUpdateInterval; - else - fpsNext += mUpdateInterval; + // Only update console values at interval + if (realSeconds >= fpsNext) + { + fpsReal = 0.0f; + fpsVirtual = 0.0f; + + if (fpsAccumTime > 0.0f) + fpsReal = fpsFrames / fpsAccumTime; + + if (fpsAccumVirtualTime > 0.0f) + fpsVirtual = fpsFrames / fpsAccumVirtualTime; + + // Update min/max correctly (these are FPS now) + if (fpsReal > 0.0f) + { + if (fpsReal < fpsRealMin) + fpsRealMin = fpsReal; + + if (fpsReal > fpsRealMax) + fpsRealMax = fpsReal; + } + + // frameDelta = actual accumulated real time over window + Con::setVariable("fps::frameTimeMs", avar("%4.3f", (fpsAccumTime / fpsFrames) * 1000.0f)); + Con::setVariable("fps::frameDelta", avar("%g", fpsAccumTime)); + Con::setVariable("fps::real", avar("%4.1f", fpsReal)); + Con::setVariable("fps::realMin", avar("%4.1f", fpsRealMin)); + Con::setVariable("fps::realMax", avar("%4.1f", fpsRealMax)); + 
Con::setVariable("fps::virtual", avar("%4.1f", fpsVirtual)); + + // Reset window + fpsFrames = 0; + fpsAccumTime = 0.0f; + fpsAccumVirtualTime = 0.0f; + + fpsNext = realSeconds + mUpdateInterval; } } diff --git a/Engine/source/util/fpsTracker.h b/Engine/source/util/fpsTracker.h index e5973ae68..a29fbbecf 100644 --- a/Engine/source/util/fpsTracker.h +++ b/Engine/source/util/fpsTracker.h @@ -27,6 +27,8 @@ struct FPSTracker { + F32 fpsAccumTime; + F32 fpsAccumVirtualTime; F32 fpsRealLast; F32 fpsReal; F32 fpsRealMin; @@ -48,4 +50,4 @@ struct FPSTracker extern FPSTracker gFPS; -#endif \ No newline at end of file +#endif diff --git a/Tools/CMake/torque_macros.cmake b/Tools/CMake/torque_macros.cmake index 5a7928b38..f799e8c39 100644 --- a/Tools/CMake/torque_macros.cmake +++ b/Tools/CMake/torque_macros.cmake @@ -136,7 +136,7 @@ macro(addFramework framework) endmacro() function(add_math_backend name compile_defs) - file(GLOB_RECURSE SRC CONFIGURE_DEPENDS "math/isa/${name}/*.cpp") + file(GLOB_RECURSE SRC CONFIGURE_DEPENDS "math/isa/${name}/*.cpp" "math/isa/${name}/*.h") if(NOT SRC) return() @@ -144,6 +144,7 @@ function(add_math_backend name compile_defs) add_library(math_${name} OBJECT ${SRC}) + message(STATUS "adding math library for isa ${name}") target_include_directories(math_${name} PUBLIC "math/public" "math/impl"