From b6ea96f367d791af72fc0f68c2f47a59388e7db0 Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Tue, 24 Feb 2026 15:03:22 +0000 Subject: [PATCH 01/23] cpu info check find avx2 and avx512 --- Engine/source/platform/platform.h | 10 ++++++---- Engine/source/platformWin32/winCPUInfo.cpp | 20 +++++++++++++++++++- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/Engine/source/platform/platform.h b/Engine/source/platform/platform.h index acb929a9f..1326c4692 100644 --- a/Engine/source/platform/platform.h +++ b/Engine/source/platform/platform.h @@ -75,10 +75,12 @@ enum ProcessorProperties CPU_PROP_SSE4_1 = (1<<7), ///< Supports SSE4_1 instruction set extension. CPU_PROP_SSE4_2 = (1<<8), ///< Supports SSE4_2 instruction set extension. CPU_PROP_AVX = (1<<9), ///< Supports AVX256 instruction set extension. - CPU_PROP_MP = (1<<10), ///< This is a multi-processor system. - CPU_PROP_LE = (1<<11), ///< This processor is LITTLE ENDIAN. - CPU_PROP_64bit = (1<<12), ///< This processor is 64-bit capable - CPU_PROP_NEON = (1<<13), ///< Supports the Arm Neon instruction set extension. + CPU_PROP_AVX2 = (1<<10), ///< Supports AVX256 instruction set extension. + CPU_PROP_AVX512 = (1<<11), ///< Supports AVX256 instruction set extension. + CPU_PROP_MP = (1<<12), ///< This is a multi-processor system. + CPU_PROP_LE = (1<<13), ///< This processor is LITTLE ENDIAN. + CPU_PROP_64bit = (1<<14), ///< This processor is 64-bit capable + CPU_PROP_NEON = (1<<15), ///< Supports the Arm Neon instruction set extension. }; /// Processor info manager. 
diff --git a/Engine/source/platformWin32/winCPUInfo.cpp b/Engine/source/platformWin32/winCPUInfo.cpp index c8e05f44f..3bd183f44 100644 --- a/Engine/source/platformWin32/winCPUInfo.cpp +++ b/Engine/source/platformWin32/winCPUInfo.cpp @@ -72,13 +72,15 @@ enum CpuFlags BIT_XSAVE_RESTORE = BIT(27), BIT_AVX = BIT(28), + BIT_AVX2 = BIT(5), + BIT_AVX512F = BIT(16) }; static void detectCpuFeatures(Platform::SystemInfo_struct::Processor &processor) { S32 cpuInfo[4]; __cpuid(cpuInfo, 1); - //U32 eax = cpuInfo[0]; // eax + int nIds = cpuInfo[0]; U32 edx = cpuInfo[3]; // edx U32 ecx = cpuInfo[2]; // ecx @@ -90,6 +92,18 @@ static void detectCpuFeatures(Platform::SystemInfo_struct::Processor &processor) processor.properties |= (ecx & BIT_SSE4_1) ? CPU_PROP_SSE4_1 : 0; processor.properties |= (ecx & BIT_SSE4_2) ? CPU_PROP_SSE4_2 : 0; + if (nIds >= 7) + { + int ext[4]; + __cpuidex(ext, 7, 0); + + if (ext[1] & (BIT_AVX2)) // EBX bit 5 + processor.properties |= CPU_PROP_AVX2; + + if (ext[1] & (BIT_AVX512F)) // AVX-512 Foundation + processor.properties |= CPU_PROP_AVX512; + } + // AVX detection requires that xsaverestore is supported if (ecx & BIT_XSAVE_RESTORE && ecx & BIT_AVX) { @@ -176,6 +190,10 @@ void Processor::init() Con::printf(" SSE4.2 detected" ); if (Platform::SystemInfo.processor.properties & CPU_PROP_AVX) Con::printf(" AVX detected"); + if (Platform::SystemInfo.processor.properties & CPU_PROP_AVX2) + Con::printf(" AVX2 detected"); + if (Platform::SystemInfo.processor.properties & CPU_PROP_AVX512) + Con::printf(" AVX512 detected"); if (Platform::SystemInfo.processor.properties & CPU_PROP_MP) Con::printf(" MultiCore CPU detected [%i cores, %i logical]", Platform::SystemInfo.processor.numPhysicalProcessors, Platform::SystemInfo.processor.numLogicalProcessors); From e9fdffc2dd53f11c9285bb8300522ebff93a4513 Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Thu, 26 Feb 2026 14:57:16 +0000 Subject: [PATCH 02/23] math backend setup setup libraries for different simd isa's add 
float4 functions for c sse2 and avx2 (placeholder file for neon to be implemented on mac) --- Engine/source/CMakeLists.txt | 11 +- Engine/source/math/impl/float4_c.cpp | 60 ++++++++++ Engine/source/math/impl/float4_impl.inl | 113 ++++++++++++++++++ Engine/source/math/isa/avx2/float4.cpp | 58 +++++++++ Engine/source/math/isa/neon/float4.cpp | 0 Engine/source/math/isa/sse2/float4.cpp | 58 +++++++++ Engine/source/math/public/float4_dispatch.cpp | 7 ++ Engine/source/math/public/float4_dispatch.h | 34 ++++++ Tools/CMake/torque_macros.cmake | 47 +++++++- 9 files changed, 386 insertions(+), 2 deletions(-) create mode 100644 Engine/source/math/impl/float4_c.cpp create mode 100644 Engine/source/math/impl/float4_impl.inl create mode 100644 Engine/source/math/isa/avx2/float4.cpp create mode 100644 Engine/source/math/isa/neon/float4.cpp create mode 100644 Engine/source/math/isa/sse2/float4.cpp create mode 100644 Engine/source/math/public/float4_dispatch.cpp create mode 100644 Engine/source/math/public/float4_dispatch.h diff --git a/Engine/source/CMakeLists.txt b/Engine/source/CMakeLists.txt index 029ef89ec..fd225b579 100644 --- a/Engine/source/CMakeLists.txt +++ b/Engine/source/CMakeLists.txt @@ -130,7 +130,7 @@ torqueAddSourceDirectories("windowManager" "windowManager/torque" "windowManager torqueAddSourceDirectories("scene" "scene/culling" "scene/zones" "scene/mixin") # Handle math -torqueAddSourceDirectories("math" "math/util") +torqueAddSourceDirectories("math" "math/util" "math/public" "math/impl") # note impl must skip the .inl files, never use them in engine code. 
# Handle persistence set(TORQUE_INCLUDE_DIRECTORIES ${TORQUE_INCLUDE_DIRECTORIES} "persistence/rapidjson") @@ -496,6 +496,15 @@ else() set_target_properties(${TORQUE_APP_NAME} PROPERTIES LINK_FLAGS "-Wl,-rpath,./") endif() +add_math_backend(scalar MATH_SIMD_SCALAR) +add_math_backend(sse2 MATH_SIMD_SSE2) +add_math_backend(sse41 MATH_SIMD_SSE41) +add_math_backend(avx MATH_SIMD_AVX) +add_math_backend(avx2 MATH_SIMD_AVX2) +# Only on ARM +if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64") + add_math_backend(neon MATH_SIMD_NEON) +endif() if(MSVC) # Match projectGenerator naming for executables diff --git a/Engine/source/math/impl/float4_c.cpp b/Engine/source/math/impl/float4_c.cpp new file mode 100644 index 000000000..b3d6559f9 --- /dev/null +++ b/Engine/source/math/impl/float4_c.cpp @@ -0,0 +1,60 @@ +#include "math/public/float4_dispatch.h" +#include "math/mConstants.h" +#include + +namespace math_backend::float4::dispatch +{ + void install_scalar() + { + gFloat4.add = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] + b[i]; + }; + + gFloat4.sub = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] - b[i]; + }; + + gFloat4.mul = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] * b[i]; + }; + + gFloat4.mul_scalar = [](const float* a, float s, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] * s; + }; + + gFloat4.div = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] / b[i]; + }; + + gFloat4.div_scalar = [](const float* a, float s, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] / s; + }; + + gFloat4.dot = [](const float* a, const float* b) { + float sum = 0.f; + for (int i = 0; i < 4; i++) sum += a[i] * b[i]; + return sum; + }; + + gFloat4.length = [](const float* a) { + float sum = 0.f; + for (int i = 0; i < 4; i++) sum += a[i] * a[i]; + return sqrtf(sum); + }; + + gFloat4.lengthSquared = [](const float* 
a) { + float sum = 0.f; + for (int i = 0; i < 4; i++) sum += a[i] * a[i]; + return (sum); + }; + + gFloat4.normalize = [](float* a) { + float len = gFloat4.length(a); + if (len > POINT_EPSILON) for (int i = 0; i < 4; i++) a[i] /= len; + }; + + gFloat4.lerp = [](const float* from, const float* to, float f, float* r) { + for (int i = 0; i < 4; i++) r[i] = from[i] + (to[i] - from[i]) * f; + }; + } +} diff --git a/Engine/source/math/impl/float4_impl.inl b/Engine/source/math/impl/float4_impl.inl new file mode 100644 index 000000000..4871d3161 --- /dev/null +++ b/Engine/source/math/impl/float4_impl.inl @@ -0,0 +1,113 @@ +#pragma once +#include // for sqrtf, etc. + +namespace math_backend::float4 +{ + + //---------------------------------------------------------- + // Add two float4 vectors: r = a + b + inline void float4_add_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load(a); + f32x4 vb = v_load(b); + f32x4 vr = v_add(va, vb); + v_store(r, vr); + } + + // Subtract: r = a - b + inline void float4_sub_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load(a); + f32x4 vb = v_load(b); + f32x4 vr = v_sub(va, vb); + v_store(r, vr); + } + + // Multiply element-wise: r = a * b + inline void float4_mul_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load(a); + f32x4 vb = v_load(b); + f32x4 vr = v_mul(va, vb); + v_store(r, vr); + } + + // Multiply by scalar: r = a * s + inline void float4_mul_scalar_impl(const float* a, float s, float* r) + { + f32x4 va = v_load(a); + f32x4 vs = v_set1(s); + f32x4 vr = v_mul(va, vs); + v_store(r, vr); + } + + // Divide element-wise: r = a / b + inline void float4_div_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load(a); + f32x4 vb = v_load(b); + f32x4 vr = _mm_div_ps(va, vb); + v_store(r, vr); + } + + // Divide by scalar: r = a / s + inline void float4_div_scalar_impl(const float* a, float s, float* r) + { + f32x4 va = v_load(a); + f32x4 vs = v_set1(s); + f32x4 vr = 
v_div(va, vs); + v_store(r, vr); + } + + // Dot product: returns scalar + inline float float4_dot_impl(const float* a, const float* b) + { + f32x4 va = v_load(a); + f32x4 vb = v_load(b); + f32x4 vmul = v_mul(va, vb); + return v_hadd4(vmul); + } + + // Length squared + inline float float4_length_squared_impl(const float* a) + { + return float4_dot_impl(a, a); + } + + // Length + inline float float4_length_impl(const float* a) + { + return std::sqrt(float4_length_squared_impl(a)); + } + + // Normalize in-place + inline void float4_normalize_impl(float* a) + { + float len = float4_length_impl(a); + if (len > 1e-6f) // safe threshold + { + float4_mul_scalar_impl(a, 1.0f / len, a); + } + } + + // Normalize with magnitude: r = normalize(a) * r + inline void float4_normalize_mag_impl(float* a, float r) + { + float len = float4_length_impl(a); + if (len > 1e-6f) + { + float4_mul_scalar_impl(a, r / len, a); + } + } + + // Linear interpolation: r = from + (to - from) * f + inline void float4_lerp_impl(const float* from, const float* to, float f, float* r) + { + f32x4 vfrom = v_load(from); + f32x4 vto = v_load(to); + f32x4 vf = v_set1(f); + f32x4 vr = v_add(vfrom, v_mul(vf, v_sub(vto, vfrom))); + v_store(r, vr); + } + +} // namespace math_backend::float4 diff --git a/Engine/source/math/isa/avx2/float4.cpp b/Engine/source/math/isa/avx2/float4.cpp new file mode 100644 index 000000000..c6e2ffa99 --- /dev/null +++ b/Engine/source/math/isa/avx2/float4.cpp @@ -0,0 +1,58 @@ + +#include "float4_dispatch.h" +#include // AVX/AVX2 intrinsics + +namespace +{ + typedef __m128 f32x4; + + // Load 4 floats from memory into a SIMD register + inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + // Store 4 floats from SIMD register back to memory + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + // Broadcast a single float across all 4 lanes + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, 
f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + // Horizontal sum of all 4 elements (for dot product, length, etc.) + inline float v_hadd4(f32x4 a) + { + __m128 t1 = _mm_hadd_ps(a, a); // sums pairs: [a0+a1, a2+a3, ...] + __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3 + return _mm_cvtss_f32(t2); // extract first element + } +} + +#include "float4_impl.inl" + +namespace math_backend::float4::dispatch +{ + // Install AVX2 backend + void install_avx2() + { + gFloat4.add = float4_add_impl; + gFloat4.sub = float4_sub_impl; + gFloat4.mul = float4_mul_impl; + gFloat4.mul_scalar = float4_mul_scalar_impl; + gFloat4.div = float4_div_impl; + gFloat4.div_scalar = float4_div_scalar_impl; + gFloat4.dot = float4_dot_impl; + gFloat4.length = float4_length_impl; + gFloat4.lengthSquared = float4_length_squared_impl; + gFloat4.normalize = float4_normalize_impl; + gFloat4.lerp = float4_lerp_impl; + } +} diff --git a/Engine/source/math/isa/neon/float4.cpp b/Engine/source/math/isa/neon/float4.cpp new file mode 100644 index 000000000..e69de29bb diff --git a/Engine/source/math/isa/sse2/float4.cpp b/Engine/source/math/isa/sse2/float4.cpp new file mode 100644 index 000000000..3b9e80e28 --- /dev/null +++ b/Engine/source/math/isa/sse2/float4.cpp @@ -0,0 +1,58 @@ +#include "float4_dispatch.h" +#include // SSE2 intrinsics +namespace +{ + typedef __m128 f32x4; + + // Load 4 floats from memory into a SIMD register + inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + // Store 4 floats from SIMD register back to memory + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + // Broadcast a single float across all 4 lanes + inline f32x4 v_set1(float s) { return 
_mm_set1_ps(s); } + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + // Horizontal sum of all 4 elements (for dot product, length, etc.) + inline float v_hadd4(f32x4 a) + { + __m128 shuf = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); // swap pairs + __m128 sums = _mm_add_ps(a, shuf); // sums: [a0+a1 a1+a0 a2+a3 a3+a2] + shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)); // move high pair to low + sums = _mm_add_ps(sums, shuf); // total sum in lower float + return _mm_cvtss_f32(sums); + } +} + +#include "../../impl/float4_impl.inl" + +namespace math_backend::float4::dispatch +{ + // Install AVX2 backend + void install_sse2() + { + gFloat4.add = float4_add_impl; + gFloat4.sub = float4_sub_impl; + gFloat4.mul = float4_mul_impl; + gFloat4.mul_scalar = float4_mul_scalar_impl; + gFloat4.div = float4_div_impl; + gFloat4.div_scalar = float4_div_scalar_impl; + gFloat4.dot = float4_dot_impl; + gFloat4.length = float4_length_impl; + gFloat4.lengthSquared = float4_length_squared_impl; + gFloat4.normalize = float4_normalize_impl; + gFloat4.lerp = float4_lerp_impl; + } +} diff --git a/Engine/source/math/public/float4_dispatch.cpp b/Engine/source/math/public/float4_dispatch.cpp new file mode 100644 index 000000000..810eb0e46 --- /dev/null +++ b/Engine/source/math/public/float4_dispatch.cpp @@ -0,0 +1,7 @@ +#include "math/public/float4_dispatch.h" + +namespace math_backend::float4::dispatch +{ + // Single definition of the global dispatch table + Float4Funcs gFloat4{}; +} diff --git a/Engine/source/math/public/float4_dispatch.h b/Engine/source/math/public/float4_dispatch.h new file mode 100644 index 000000000..68f9a6520 --- /dev/null +++ 
b/Engine/source/math/public/float4_dispatch.h @@ -0,0 +1,34 @@ +#pragma once +#include + +namespace math_backend::float4::dispatch +{ + struct Float4Funcs + { + void (*add)(const float*, const float*, float*) = nullptr; + void (*sub)(const float*, const float*, float*) = nullptr; + void (*mul)(const float*, const float*, float*) = nullptr; + void (*mul_scalar)(const float*, float, float*) = nullptr; + void (*div)(const float*, const float*, float*) = nullptr; + void (*div_scalar)(const float*, float, float*) = nullptr; + float (*dot)(const float*, const float*) = nullptr; + float (*length)(const float*) = nullptr; + float (*lengthSquared)(const float*) = nullptr; + void (*normalize)(float*) = nullptr; + void (*lerp)(const float*, const float*, float, float*) = nullptr; + }; + + // Global dispatch table + extern Float4Funcs gFloat4; + + // Backend installers (defined in ISA libraries) + void install_scalar(); + void install_sse2(); + void install_sse41(); + void install_avx(); + void install_avx2(); + void install_neon(); + + // Centralized installer (engine calls this once) + void install_preferred(); +} diff --git a/Tools/CMake/torque_macros.cmake b/Tools/CMake/torque_macros.cmake index 48bb6896e..23c334780 100644 --- a/Tools/CMake/torque_macros.cmake +++ b/Tools/CMake/torque_macros.cmake @@ -133,4 +133,49 @@ macro(addFramework framework) find_library(_${framework}_FRAMEWORK_PATH ${framework} PATHS /System/Library/Frameworks /Library/Frameworks) set(TORQUE_LINK_FRAMEWORKS ${TORQUE_LINK_FRAMEWORKS} "${_${framework}_FRAMEWORK_PATH}") endif() -endmacro() \ No newline at end of file +endmacro() + +function(add_math_backend name compile_defs) + file(GLOB_RECURSE SRC CONFIGURE_DEPENDS "math/isa/${name}/*.cpp") + + if(NOT SRC) + return() + endif() + + add_library(math_${name} OBJECT ${SRC}) + + target_include_directories(math_${name} PUBLIC + "math/public" + "math/impl" + "math/isa/${name}" + ) + + target_compile_definitions(math_${name} PRIVATE ${compile_defs}) + + # 
ISA flags + if(MSVC) + if(name STREQUAL "sse2" OR name STREQUAL "sse41") + target_compile_options(math_${name} PRIVATE /arch:SSE2) + elseif(name STREQUAL "avx") + target_compile_options(math_${name} PRIVATE /arch:AVX) + elseif(name STREQUAL "avx2") + target_compile_options(math_${name} PRIVATE /arch:AVX2) + endif() + else() + if(name STREQUAL "sse2") + target_compile_options(math_${name} PRIVATE -msse2) + elseif(name STREQUAL "sse41") + target_compile_options(math_${name} PRIVATE -msse4.1) + elseif(name STREQUAL "avx") + target_compile_options(math_${name} PRIVATE -mavx) + elseif(name STREQUAL "avx2") + target_compile_options(math_${name} PRIVATE -mavx2 -mfma) + elseif(name STREQUAL "neon") + target_compile_options(math_${name} PRIVATE -march=armv8-a+simd) + endif() + endif() + + # Inject objects into engine + target_sources(${TORQUE_APP_NAME} PRIVATE $) + set_target_properties(math_${name} PROPERTIES FOLDER "Libraries/Math") +endfunction() \ No newline at end of file From 4e7fdd167bc4be52f961a192241e181d2566b7bb Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Thu, 26 Feb 2026 15:35:38 +0000 Subject: [PATCH 03/23] Update CMakeLists.txt fix mac build --- Engine/source/CMakeLists.txt | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/Engine/source/CMakeLists.txt b/Engine/source/CMakeLists.txt index fd225b579..746bae1e7 100644 --- a/Engine/source/CMakeLists.txt +++ b/Engine/source/CMakeLists.txt @@ -496,14 +496,31 @@ else() set_target_properties(${TORQUE_APP_NAME} PROPERTIES LINK_FLAGS "-Wl,-rpath,./") endif() +string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" ARCH) + +set(IS_X86 FALSE) +set(IS_ARM FALSE) + +if(ARCH MATCHES "x86_64|amd64|i[3-6]86") + set(IS_X86 TRUE) +elseif(ARCH MATCHES "arm64|aarch64") + set(IS_ARM TRUE) +endif() + +# always available add_math_backend(scalar MATH_SIMD_SCALAR) -add_math_backend(sse2 MATH_SIMD_SSE2) -add_math_backend(sse41 MATH_SIMD_SSE41) -add_math_backend(avx MATH_SIMD_AVX) 
-add_math_backend(avx2 MATH_SIMD_AVX2) -# Only on ARM -if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64") - add_math_backend(neon MATH_SIMD_NEON) + +# x86 family +if(IS_X86) + add_math_backend(sse2 MATH_SIMD_SSE2) + add_math_backend(sse41 MATH_SIMD_SSE41) + add_math_backend(avx MATH_SIMD_AVX) + add_math_backend(avx2 MATH_SIMD_AVX2) +endif() + +# ARM family +if(IS_ARM) + add_math_backend(neon MATH_SIMD_NEON) endif() if(MSVC) From 9ebcee420f277d22f19fc170a6a216d526b5409d Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Thu, 26 Feb 2026 16:40:01 +0000 Subject: [PATCH 04/23] moar changes mac implementation had _mm_div (x86 intrinsic) --- Engine/source/math/impl/float4_impl.inl | 7 ++- Engine/source/math/isa/avx/float4.cpp | 67 +++++++++++++++++++++ Engine/source/math/isa/avx2/float4.cpp | 11 +++- Engine/source/math/isa/sse2/float4.cpp | 22 +++---- Engine/source/math/isa/sse41/float4.cpp | 67 +++++++++++++++++++++ Engine/source/math/public/float4_dispatch.h | 6 ++ Engine/source/math/public/math_backend.cpp | 53 ++++++++++++++++ Engine/source/math/public/math_backend.h | 27 +++++++++ Tools/CMake/torque_macros.cmake | 4 +- 9 files changed, 248 insertions(+), 16 deletions(-) create mode 100644 Engine/source/math/isa/avx/float4.cpp create mode 100644 Engine/source/math/isa/sse41/float4.cpp create mode 100644 Engine/source/math/public/math_backend.cpp create mode 100644 Engine/source/math/public/math_backend.h diff --git a/Engine/source/math/impl/float4_impl.inl b/Engine/source/math/impl/float4_impl.inl index 4871d3161..cb61ed4fc 100644 --- a/Engine/source/math/impl/float4_impl.inl +++ b/Engine/source/math/impl/float4_impl.inl @@ -1,5 +1,6 @@ #pragma once #include // for sqrtf, etc. 
+#include "../mConstants.h" namespace math_backend::float4 { @@ -46,7 +47,7 @@ namespace math_backend::float4 { f32x4 va = v_load(a); f32x4 vb = v_load(b); - f32x4 vr = _mm_div_ps(va, vb); + f32x4 vr = v_div(va, vb); v_store(r, vr); } @@ -84,7 +85,7 @@ namespace math_backend::float4 inline void float4_normalize_impl(float* a) { float len = float4_length_impl(a); - if (len > 1e-6f) // safe threshold + if (len > POINT_EPSILON) // safe threshold { float4_mul_scalar_impl(a, 1.0f / len, a); } @@ -94,7 +95,7 @@ namespace math_backend::float4 inline void float4_normalize_mag_impl(float* a, float r) { float len = float4_length_impl(a); - if (len > 1e-6f) + if (len > POINT_EPSILON) { float4_mul_scalar_impl(a, r / len, a); } diff --git a/Engine/source/math/isa/avx/float4.cpp b/Engine/source/math/isa/avx/float4.cpp new file mode 100644 index 000000000..1e23fb8b1 --- /dev/null +++ b/Engine/source/math/isa/avx/float4.cpp @@ -0,0 +1,67 @@ + +#include "float4_dispatch.h" +#include // AVX/AVX2 intrinsics + +namespace +{ + typedef __m128 f32x4; + + // Load 4 floats from memory into a SIMD register + inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + // Store 4 floats from SIMD register back to memory + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + // Broadcast a single float across all 4 lanes + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + // Horizontal sum of all 4 elements (for dot product, length, etc.) + inline float v_hadd4(f32x4 a) + { + __m128 t1 = _mm_hadd_ps(a, a); // sums pairs: [a0+a1, a2+a3, ...] 
+ __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3 + return _mm_cvtss_f32(t2); // extract first element + } + + // specialized dot product for AVX + float float4_dot_avx(const float* a, const float* b) + { + f32x4 va = _mm_loadu_ps(a); + f32x4 vb = _mm_loadu_ps(b); + __m128 dp = _mm_dp_ps(va, vb, 0xF1); // multiply all 4, sum all 4, lowest lane + return _mm_cvtss_f32(dp); + } +} + +#include "float4_impl.inl" + +namespace math_backend::float4::dispatch +{ + // Install AVX backend + void install_avx() + { + gFloat4.add = float4_add_impl; + gFloat4.sub = float4_sub_impl; + gFloat4.mul = float4_mul_impl; + gFloat4.mul_scalar = float4_mul_scalar_impl; + gFloat4.div = float4_div_impl; + gFloat4.div_scalar = float4_div_scalar_impl; + gFloat4.dot = float4_dot_avx; + gFloat4.length = float4_length_impl; + gFloat4.lengthSquared = float4_length_squared_impl; + gFloat4.normalize = float4_normalize_impl; + gFloat4.lerp = float4_lerp_impl; + } +} diff --git a/Engine/source/math/isa/avx2/float4.cpp b/Engine/source/math/isa/avx2/float4.cpp index c6e2ffa99..439d0e2d0 100644 --- a/Engine/source/math/isa/avx2/float4.cpp +++ b/Engine/source/math/isa/avx2/float4.cpp @@ -34,6 +34,15 @@ namespace __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3 return _mm_cvtss_f32(t2); // extract first element } + + // specialized dot product for AVX + float float4_dot_avx(const float* a, const float* b) + { + f32x4 va = _mm_loadu_ps(a); + f32x4 vb = _mm_loadu_ps(b); + __m128 dp = _mm_dp_ps(va, vb, 0xF1); // multiply all 4, sum all 4, lowest lane + return _mm_cvtss_f32(dp); + } } #include "float4_impl.inl" @@ -49,7 +58,7 @@ namespace math_backend::float4::dispatch gFloat4.mul_scalar = float4_mul_scalar_impl; gFloat4.div = float4_div_impl; gFloat4.div_scalar = float4_div_scalar_impl; - gFloat4.dot = float4_dot_impl; + gFloat4.dot = float4_dot_avx; gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; 
gFloat4.normalize = float4_normalize_impl; diff --git a/Engine/source/math/isa/sse2/float4.cpp b/Engine/source/math/isa/sse2/float4.cpp index 3b9e80e28..00850560a 100644 --- a/Engine/source/math/isa/sse2/float4.cpp +++ b/Engine/source/math/isa/sse2/float4.cpp @@ -40,19 +40,19 @@ namespace namespace math_backend::float4::dispatch { - // Install AVX2 backend + // Install SSE2 backend void install_sse2() { - gFloat4.add = float4_add_impl; - gFloat4.sub = float4_sub_impl; - gFloat4.mul = float4_mul_impl; - gFloat4.mul_scalar = float4_mul_scalar_impl; - gFloat4.div = float4_div_impl; - gFloat4.div_scalar = float4_div_scalar_impl; - gFloat4.dot = float4_dot_impl; - gFloat4.length = float4_length_impl; + gFloat4.add = float4_add_impl; + gFloat4.sub = float4_sub_impl; + gFloat4.mul = float4_mul_impl; + gFloat4.mul_scalar = float4_mul_scalar_impl; + gFloat4.div = float4_div_impl; + gFloat4.div_scalar = float4_div_scalar_impl; + gFloat4.dot = float4_dot_impl; + gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; - gFloat4.normalize = float4_normalize_impl; - gFloat4.lerp = float4_lerp_impl; + gFloat4.normalize = float4_normalize_impl; + gFloat4.lerp = float4_lerp_impl; } } diff --git a/Engine/source/math/isa/sse41/float4.cpp b/Engine/source/math/isa/sse41/float4.cpp new file mode 100644 index 000000000..80127acb9 --- /dev/null +++ b/Engine/source/math/isa/sse41/float4.cpp @@ -0,0 +1,67 @@ + +#include "float4_dispatch.h" +#include // SSE41 intrinsics + +namespace +{ + typedef __m128 f32x4; + + // Load 4 floats from memory into a SIMD register + inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + // Store 4 floats from SIMD register back to memory + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + // Broadcast a single float across all 4 lanes + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // 
Element-wise divide + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + // Horizontal sum of all 4 elements (for dot product, length, etc.) + inline float v_hadd4(f32x4 a) + { + __m128 t1 = _mm_hadd_ps(a, a); // sums pairs: [a0+a1, a2+a3, ...] + __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3 + return _mm_cvtss_f32(t2); // extract first element + } + + // specialized dot product for SSE4.1 + float float4_dot_sse41(const float* a, const float* b) + { + f32x4 va = _mm_loadu_ps(a); + f32x4 vb = _mm_loadu_ps(b); + __m128 dp = _mm_dp_ps(va, vb, 0xF1); // multiply all 4, sum all 4, lowest lane + return _mm_cvtss_f32(dp); + } +} + +#include "float4_impl.inl" + +namespace math_backend::float4::dispatch +{ + // Install SSE41 backend + void install_sse41() + { + gFloat4.add = float4_add_impl; + gFloat4.sub = float4_sub_impl; + gFloat4.mul = float4_mul_impl; + gFloat4.mul_scalar = float4_mul_scalar_impl; + gFloat4.div = float4_div_impl; + gFloat4.div_scalar = float4_div_scalar_impl; + gFloat4.dot = float4_dot_sse41; + gFloat4.length = float4_length_impl; + gFloat4.lengthSquared = float4_length_squared_impl; + gFloat4.normalize = float4_normalize_impl; + gFloat4.lerp = float4_lerp_impl; + } +} diff --git a/Engine/source/math/public/float4_dispatch.h b/Engine/source/math/public/float4_dispatch.h index 68f9a6520..319b1893f 100644 --- a/Engine/source/math/public/float4_dispatch.h +++ b/Engine/source/math/public/float4_dispatch.h @@ -1,4 +1,8 @@ #pragma once +#ifndef _FLOAT4_DISPATCH_H_ +#define _FLOAT4_DISPATCH_H_ + + #include namespace math_backend::float4::dispatch @@ -32,3 +36,5 @@ namespace math_backend::float4::dispatch // Centralized installer (engine calls this once) void install_preferred(); } + +#endif // !_FLOAT4_DISPATCH_H_ diff --git 
a/Engine/source/math/public/math_backend.cpp b/Engine/source/math/public/math_backend.cpp new file mode 100644 index 000000000..7998924ee --- /dev/null +++ b/Engine/source/math/public/math_backend.cpp @@ -0,0 +1,53 @@ +#pragma once +#include "math/public/math_backend.h" + +math_backend::backend math_backend::choose_backend(U32 cpu_flags) +{ +#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) + + if (cpu_flags & CPU_PROP_AVX2) return backend::avx2; + if (cpu_flags & CPU_PROP_AVX) return backend::avx; + if (cpu_flags & CPU_PROP_SSE4_1) return backend::sse41; + if (cpu_flags & CPU_PROP_SSE2) return backend::sse2; + +#elif defined(__aarch64__) || defined(__ARM_NEON) + + if (cpu_flags & CPU_NEON) return backend::neon; + +#endif + return backend::scalar; +} + +void math_backend::install_from_cpu_flags(uint32_t cpu_flags) +{ + { + g_backend = choose_backend(cpu_flags); + + switch (g_backend) + { + case backend::avx2: + float4::dispatch::install_avx2(); + break; + + case backend::avx: + //float4::dispatch::install_avx(); + break; + + case backend::sse41: + float4::dispatch::install_sse41(); + break; + + case backend::sse2: + float4::dispatch::install_sse2(); + break; + + case backend::neon: + float4::dispatch::install_neon(); + break; + + default: + float4::dispatch::install_scalar(); + break; + } + } +} diff --git a/Engine/source/math/public/math_backend.h b/Engine/source/math/public/math_backend.h new file mode 100644 index 000000000..40476e7f0 --- /dev/null +++ b/Engine/source/math/public/math_backend.h @@ -0,0 +1,27 @@ +#pragma once +#ifndef _MCONSTANTS_H_ +#include "math/mConstants.h" +#endif +#ifndef _PLATFORMASSERT_H_ +#include "platform/platformAssert.h" +#endif +#ifndef _FLOAT4_DISPATCH_H_ +#include "math/public/float4_dispatch.h" +#endif + +namespace math_backend +{ + enum class backend + { + scalar, + sse2, + sse41, + avx, + avx2, + neon + }; + + static backend g_backend = backend::scalar; + backend choose_backend(U32 cpu_flags); + void 
install_from_cpu_flags(uint32_t cpu_flags); +} diff --git a/Tools/CMake/torque_macros.cmake b/Tools/CMake/torque_macros.cmake index 23c334780..5a7928b38 100644 --- a/Tools/CMake/torque_macros.cmake +++ b/Tools/CMake/torque_macros.cmake @@ -154,7 +154,9 @@ function(add_math_backend name compile_defs) # ISA flags if(MSVC) - if(name STREQUAL "sse2" OR name STREQUAL "sse41") + if(name STREQUAL "sse2") + target_compile_options(math_${name} PRIVATE /arch:SSE2) + elseif(name STREQUAL "sse41") target_compile_options(math_${name} PRIVATE /arch:SSE2) elseif(name STREQUAL "avx") target_compile_options(math_${name} PRIVATE /arch:AVX) From 73ed502ac9fdf8e7b7bf7cf9f59f532f3dd351eb Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Thu, 26 Feb 2026 16:40:49 +0000 Subject: [PATCH 05/23] neon float4 note: 64bit only --- Engine/source/math/isa/neon/float4.cpp | 50 ++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/Engine/source/math/isa/neon/float4.cpp b/Engine/source/math/isa/neon/float4.cpp index e69de29bb..6258db743 100644 --- a/Engine/source/math/isa/neon/float4.cpp +++ b/Engine/source/math/isa/neon/float4.cpp @@ -0,0 +1,50 @@ +#include "float4_dispatch.h" +#include + +namespace +{ + typedef float32x4_t f32x4; + + inline f32x4 v_load(const float* p) { return vld1q_f32(p); } + inline void v_store(float* dst, f32x4 v) { vst1q_f32(dst, v); } + inline f32x4 v_set1(float s) { return vdupq_n_f32(s); } + + inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); } + inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); } + inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); } + + // AArch64 native divide + inline f32x4 v_div(f32x4 a, f32x4 b) + { + return vdivq_f32(a, b); + } + + inline float v_hadd4(f32x4 a) + { + float32x2_t low = vget_low_f32(a); + float32x2_t high = vget_high_f32(a); + float32x2_t sum = vadd_f32(low, high); + sum = vpadd_f32(sum, sum); + return vget_lane_f32(sum, 0); + } +} + +#include "../../impl/float4_impl.inl" + 
+namespace math_backend::float4::dispatch +{ + void install_neon() + { + gFloat4.add = float4_add_impl; + gFloat4.sub = float4_sub_impl; + gFloat4.mul = float4_mul_impl; + gFloat4.mul_scalar = float4_mul_scalar_impl; + gFloat4.div = float4_div_impl; + gFloat4.div_scalar = float4_div_scalar_impl; + gFloat4.dot = float4_dot_impl; + gFloat4.length = float4_length_impl; + gFloat4.lengthSquared = float4_length_squared_impl; + gFloat4.normalize = float4_normalize_impl; + gFloat4.lerp = float4_lerp_impl; + } +} From 67f12311d4b9218e3bb366a6df2eeb1846f4f402 Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Thu, 26 Feb 2026 16:45:13 +0000 Subject: [PATCH 06/23] ISA backends float3 and float4 - cleanup history squash working for both neon32 and neon64 Update math_backend.cpp further sse simd additions avx2 float3 added added normalize_magnitude added divide fast to float3 may copy to float4 move static spheremesh to drawSphere (initialize on first use) so platform has a chance to load the math backend all float3 and float4 functions and isas completed all options of float3 and float4 functions in isas and math_c neon still to be done but that will be on mac. Update math_backend.cpp mac isa neon update added float3 restructured the classes to look more like the final version of the x86 classes linux required changes Update build-macos-clang.yml Update build-macos-clang.yml Revert "Update build-macos-clang.yml" This reverts commit 29dfc567f40f20d2400a9967a35bbdb823182e2d. Revert "Update build-macos-clang.yml" This reverts commit 2abad2b4ca4de717c5f4278708f289dd1bb22561. Update CMakeLists.txt fix macs stupid build remove god awful rolling average from frame time tracker.... use intrinsic headers instead each isa implementation now uses a header for that isa's intrinsic functions these are then used in the impl files. This will make it easier for matrix functions when those are implemented. 
fixed comment saying 256 when it should be 512 for avx512 consolidated initializers for function tables Update neon_intrinsics.h fixes for some neon intrinsics no idea if this is the best way to do these but they work at least v_cross is especially messy at the moment we basically just do it as a c math function need to look into getting this done correctly --- Engine/source/CMakeLists.txt | 7 +- Engine/source/gfx/gfxDrawUtil.cpp | 8 +- Engine/source/math/impl/float3_impl.inl | 123 +++++++++++ Engine/source/math/impl/float4_c.cpp | 60 ----- Engine/source/math/impl/float4_impl.inl | 33 ++- Engine/source/math/impl/math_c.cpp | 208 ++++++++++++++++++ Engine/source/math/isa/avx/avx_intrinsics.h | 140 ++++++++++++ Engine/source/math/isa/avx/float3.cpp | 26 +++ Engine/source/math/isa/avx/float4.cpp | 68 ++---- Engine/source/math/isa/avx2/avx2_intrinsics.h | 140 ++++++++++++ Engine/source/math/isa/avx2/float3.cpp | 26 +++ Engine/source/math/isa/avx2/float4.cpp | 50 +---- Engine/source/math/isa/neon/float3.cpp | 25 +++ Engine/source/math/isa/neon/float4.cpp | 55 ++--- Engine/source/math/isa/neon/neon_intrinsics.h | 130 +++++++++++ Engine/source/math/isa/sse2/float3.cpp | 26 +++ Engine/source/math/isa/sse2/float4.cpp | 40 +--- Engine/source/math/isa/sse2/sse2_intrinsics.h | 156 +++++++++++++ Engine/source/math/isa/sse41/float3.cpp | 26 +++ Engine/source/math/isa/sse41/float4.cpp | 50 +---- .../source/math/isa/sse41/sse41_intrinsics.h | 140 ++++++++++++ Engine/source/math/mPoint3.h | 91 ++++---- Engine/source/math/mPoint4.h | 49 +++-- Engine/source/math/public/float3_dispatch.h | 39 ++++ Engine/source/math/public/float4_dispatch.cpp | 7 - Engine/source/math/public/float4_dispatch.h | 5 +- Engine/source/math/public/mat44_dispatch.h | 26 +++ Engine/source/math/public/math_backend.cpp | 34 ++- Engine/source/math/public/math_backend.h | 11 + Engine/source/platform/platform.h | 2 +- Engine/source/platformMac/macMath.mm | 5 +- Engine/source/platformPOSIX/POSIXMath.cpp | 3 + 
Engine/source/platformWin32/winMath.cpp | 4 +- Engine/source/util/fpsTracker.cpp | 80 ++++--- Engine/source/util/fpsTracker.h | 4 +- Tools/CMake/torque_macros.cmake | 3 +- 36 files changed, 1481 insertions(+), 419 deletions(-) create mode 100644 Engine/source/math/impl/float3_impl.inl delete mode 100644 Engine/source/math/impl/float4_c.cpp create mode 100644 Engine/source/math/impl/math_c.cpp create mode 100644 Engine/source/math/isa/avx/avx_intrinsics.h create mode 100644 Engine/source/math/isa/avx/float3.cpp create mode 100644 Engine/source/math/isa/avx2/avx2_intrinsics.h create mode 100644 Engine/source/math/isa/avx2/float3.cpp create mode 100644 Engine/source/math/isa/neon/float3.cpp create mode 100644 Engine/source/math/isa/neon/neon_intrinsics.h create mode 100644 Engine/source/math/isa/sse2/float3.cpp create mode 100644 Engine/source/math/isa/sse2/sse2_intrinsics.h create mode 100644 Engine/source/math/isa/sse41/float3.cpp create mode 100644 Engine/source/math/isa/sse41/sse41_intrinsics.h create mode 100644 Engine/source/math/public/float3_dispatch.h delete mode 100644 Engine/source/math/public/float4_dispatch.cpp create mode 100644 Engine/source/math/public/mat44_dispatch.h diff --git a/Engine/source/CMakeLists.txt b/Engine/source/CMakeLists.txt index 746bae1e7..d5f7aeaad 100644 --- a/Engine/source/CMakeLists.txt +++ b/Engine/source/CMakeLists.txt @@ -503,12 +503,17 @@ set(IS_ARM FALSE) if(ARCH MATCHES "x86_64|amd64|i[3-6]86") set(IS_X86 TRUE) -elseif(ARCH MATCHES "arm64|aarch64") +endif() + +if(ARCH MATCHES "arm64|aarch64|arm") set(IS_ARM TRUE) endif() # always available add_math_backend(scalar MATH_SIMD_SCALAR) +message(STATUS "Processor: ${CMAKE_SYSTEM_PROCESSOR}") +message(STATUS "IS_X86=${IS_X86}") +message(STATUS "IS_ARM=${IS_ARM}") # x86 family if(IS_X86) diff --git a/Engine/source/gfx/gfxDrawUtil.cpp b/Engine/source/gfx/gfxDrawUtil.cpp index 16b886a55..3dce4b3af 100644 --- a/Engine/source/gfx/gfxDrawUtil.cpp +++ b/Engine/source/gfx/gfxDrawUtil.cpp 
@@ -853,7 +853,11 @@ void GFXDrawUtil::drawThickLine(F32 x1, F32 y1, F32 z1, F32 x2, F32 y2, F32 z2, // 3D World Draw Misc //----------------------------------------------------------------------------- -static SphereMesh gSphere; +SphereMesh& getSphere() +{ + static SphereMesh instance; + return instance; +} void GFXDrawUtil::drawSphere( const GFXStateBlockDesc &desc, F32 radius, const Point3F &pos, const ColorI &color, bool drawTop, bool drawBottom, const MatrixF *xfm ) { @@ -868,7 +872,7 @@ void GFXDrawUtil::drawSphere( const GFXStateBlockDesc &desc, F32 radius, const P GFX->pushWorldMatrix(); GFX->multWorld(mat); - const SphereMesh::TriangleMesh * sphereMesh = gSphere.getMesh(2); + const SphereMesh::TriangleMesh * sphereMesh = getSphere().getMesh(2); S32 numPoly = sphereMesh->numPoly; S32 totalPoly = 0; GFXVertexBufferHandle verts(mDevice, numPoly*3, GFXBufferTypeVolatile); diff --git a/Engine/source/math/impl/float3_impl.inl b/Engine/source/math/impl/float3_impl.inl new file mode 100644 index 000000000..87b325faf --- /dev/null +++ b/Engine/source/math/impl/float3_impl.inl @@ -0,0 +1,123 @@ +#pragma once +#include // for sqrtf, etc. 
+#include "../mConstants.h" + +// Safely loads a float3 -> simd 4 lane backend +namespace math_backend::float3 +{ + //---------------------------------------------------------- + // Add two float4 vectors: r = a + b + inline void float3_add_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vr = v_add(va, vb); + v_store3(r, vr); + } + + // Subtract: r = a - b + inline void float3_sub_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vr = v_sub(va, vb); + v_store3(r, vr); + } + + // Multiply element-wise: r = a * b + inline void float3_mul_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vr = v_mul(va, vb); + v_store3(r, vr); + } + + // Multiply by scalar: r = a * s + inline void float3_mul_scalar_impl(const float* a, float s, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vs = v_set1(s); + f32x4 vr = v_mul(va, vs); + v_store3(r, vr); + } + + // Divide element-wise: r = a / b + inline void float3_div_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vr = v_div(va, vb); + v_store3(r, vr); + } + + // Divide by scalar: r = a / s + inline void float3_div_scalar_impl(const float* a, float s, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vs = v_set1(s); + f32x4 vr = v_div(va, vs); + v_store3(r, vr); + } + + // Dot product: returns scalar + inline float float3_dot_impl(const float* a, const float* b) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vdot = v_dot3(va, vb); + return v_extract0(vdot); // first lane is the sum of 3 elements + } + + // Length squared + inline float float3_length_squared_impl(const float* a) + { + return float3_dot_impl(a, a); + } + + // Length + inline float float3_length_impl(const float* a) + { + return std::sqrt(float3_length_squared_impl(a)); + } + 
+ // Normalize in-place + inline void float3_normalize_impl(float* a) + { + f32x4 va = v_load3_vec(a); + f32x4 invLen = v_rsqrt_nr(v_dot3(va, va)); // fully abstracted + f32x4 vnorm = v_mul(va, invLen); + v_store3(a, vnorm); + } + + // Normalize with magnitude: r = normalize(a) * r + inline void float3_normalize_mag_impl(float* a, float r) + { + f32x4 va = v_load3_vec(a); + + // invLen = r / sqrt(dot(a,a)) = r * rsqrt(dot(a,a)) + f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot3(va, va))); + + f32x4 vnorm = v_mul(va, invLen); + v_store(a, vnorm); + } + + // Linear interpolation: r = from + (to - from) * f + inline void float3_lerp_impl(const float* from, const float* to, float f, float* r) + { + f32x4 vfrom = v_load3_vec(from); + f32x4 vto = v_load3_vec(to); + f32x4 vf = v_set1(f); + f32x4 vr = v_add(vfrom, v_mul(vf, v_sub(vto, vfrom))); + v_store3(r, vr); + } + + inline void float3_cross_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vcross = v_cross(va, vb); + v_store3(r, vcross); + } + +} diff --git a/Engine/source/math/impl/float4_c.cpp b/Engine/source/math/impl/float4_c.cpp deleted file mode 100644 index b3d6559f9..000000000 --- a/Engine/source/math/impl/float4_c.cpp +++ /dev/null @@ -1,60 +0,0 @@ -#include "math/public/float4_dispatch.h" -#include "math/mConstants.h" -#include - -namespace math_backend::float4::dispatch -{ - void install_scalar() - { - gFloat4.add = [](const float* a, const float* b, float* r) { - for (int i = 0; i < 4; i++) r[i] = a[i] + b[i]; - }; - - gFloat4.sub = [](const float* a, const float* b, float* r) { - for (int i = 0; i < 4; i++) r[i] = a[i] - b[i]; - }; - - gFloat4.mul = [](const float* a, const float* b, float* r) { - for (int i = 0; i < 4; i++) r[i] = a[i] * b[i]; - }; - - gFloat4.mul_scalar = [](const float* a, float s, float* r) { - for (int i = 0; i < 4; i++) r[i] = a[i] * s; - }; - - gFloat4.div = [](const float* a, const float* b, float* r) { - for (int 
i = 0; i < 4; i++) r[i] = a[i] / b[i]; - }; - - gFloat4.div_scalar = [](const float* a, float s, float* r) { - for (int i = 0; i < 4; i++) r[i] = a[i] / s; - }; - - gFloat4.dot = [](const float* a, const float* b) { - float sum = 0.f; - for (int i = 0; i < 4; i++) sum += a[i] * b[i]; - return sum; - }; - - gFloat4.length = [](const float* a) { - float sum = 0.f; - for (int i = 0; i < 4; i++) sum += a[i] * a[i]; - return sqrtf(sum); - }; - - gFloat4.lengthSquared = [](const float* a) { - float sum = 0.f; - for (int i = 0; i < 4; i++) sum += a[i] * a[i]; - return (sum); - }; - - gFloat4.normalize = [](float* a) { - float len = gFloat4.length(a); - if (len > POINT_EPSILON) for (int i = 0; i < 4; i++) a[i] /= len; - }; - - gFloat4.lerp = [](const float* from, const float* to, float f, float* r) { - for (int i = 0; i < 4; i++) r[i] = from[i] + (to[i] - from[i]) * f; - }; - } -} diff --git a/Engine/source/math/impl/float4_impl.inl b/Engine/source/math/impl/float4_impl.inl index cb61ed4fc..9371ae7c3 100644 --- a/Engine/source/math/impl/float4_impl.inl +++ b/Engine/source/math/impl/float4_impl.inl @@ -65,8 +65,8 @@ namespace math_backend::float4 { f32x4 va = v_load(a); f32x4 vb = v_load(b); - f32x4 vmul = v_mul(va, vb); - return v_hadd4(vmul); + f32x4 vdot = v_dot4(va, vb); // calls ISA-specific implementation + return v_extract0(vdot); } // Length squared @@ -84,21 +84,22 @@ namespace math_backend::float4 // Normalize in-place inline void float4_normalize_impl(float* a) { - float len = float4_length_impl(a); - if (len > POINT_EPSILON) // safe threshold - { - float4_mul_scalar_impl(a, 1.0f / len, a); - } + f32x4 va = v_load(a); + f32x4 invLen = v_rsqrt_nr(v_dot4(va, va)); // fully abstracted + f32x4 vnorm = v_mul(va, invLen); + v_store(a, vnorm); } // Normalize with magnitude: r = normalize(a) * r inline void float4_normalize_mag_impl(float* a, float r) { - float len = float4_length_impl(a); - if (len > POINT_EPSILON) - { - float4_mul_scalar_impl(a, r / len, a); - } + 
f32x4 va = v_load(a); + + // invLen = r / sqrt(dot(a,a)) = r * rsqrt(dot(a,a)) + f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot4(va, va))); + + f32x4 vnorm = v_mul(va, invLen); + v_store(a, vnorm); } // Linear interpolation: r = from + (to - from) * f @@ -111,4 +112,12 @@ namespace math_backend::float4 v_store(r, vr); } + inline void float4_cross_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load(a); + f32x4 vb = v_load(b); + f32x4 vcross = v_cross(va, vb); + v_store(r, vcross); + } + } // namespace math_backend::float4 diff --git a/Engine/source/math/impl/math_c.cpp b/Engine/source/math/impl/math_c.cpp new file mode 100644 index 000000000..7d8a317ba --- /dev/null +++ b/Engine/source/math/impl/math_c.cpp @@ -0,0 +1,208 @@ +#include "math/public/float4_dispatch.h" +#include "math/public/float3_dispatch.h" +#include "math/public/mat44_dispatch.h" +#include "math/mConstants.h" +#include // for sqrtf, etc. + +namespace math_backend::float4::dispatch +{ + void install_scalar() + { + gFloat4.add = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] + b[i]; + }; + + gFloat4.sub = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] - b[i]; + }; + + gFloat4.mul = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] * b[i]; + }; + + gFloat4.mul_scalar = [](const float* a, float s, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] * s; + }; + + gFloat4.div = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] / b[i]; + }; + + gFloat4.div_scalar = [](const float* a, float s, float* r) { + float denom = 1.0f / s; + for (int i = 0; i < 4; i++) r[i] = a[i] * denom; + }; + + gFloat4.dot = [](const float* a, const float* b) { + float sum = 0.f; + for (int i = 0; i < 4; i++) sum += a[i] * b[i]; + return sum; + }; + + gFloat4.length = [](const float* a) { + float sum = 0.f; + for (int i = 0; i < 4; i++) sum += 
a[i] * a[i]; + return std::sqrt(sum); + }; + + gFloat4.lengthSquared = [](const float* a) { + float sum = 0.f; + for (int i = 0; i < 4; i++) sum += a[i] * a[i]; + return (sum); + }; + + gFloat4.normalize = [](float* a) { + float len = gFloat4.length(a); + if (len > POINT_EPSILON) + { + float denom = 1.0f / len; + for (int i = 0; i < 4; i++) + a[i] *= denom; + } + }; + + gFloat4.normalize_mag = [](float* a, float f) { + float len = gFloat4.length(a); + if (len > POINT_EPSILON) + { + float denom = f / len; + for (int i = 0; i < 4; i++) a[i] *= denom; + } + }; + + gFloat4.lerp = [](const float* from, const float* to, float f, float* r) { + for (int i = 0; i < 4; i++) r[i] = from[i] + (to[i] - from[i]) * f; + }; + + gFloat4.cross = [](const float* a, const float* b, float* r) { + const float ax = a[0]; + const float ay = a[1]; + const float az = a[2]; + + const float bx = b[0]; + const float by = b[1]; + const float bz = b[2]; + + r[0] = ay * bz - az * by; + r[1] = az * bx - ax * bz; + r[2] = ax * by - ay * bx; + }; + } +} + +namespace math_backend::float3::dispatch +{ + void install_scalar() + { + gFloat3.add = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 3; i++) r[i] = a[i] + b[i]; + }; + + gFloat3.sub = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 3; i++) r[i] = a[i] - b[i]; + }; + + gFloat3.mul = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 3; i++) r[i] = a[i] * b[i]; + }; + + gFloat3.mul_scalar = [](const float* a, float s, float* r) { + for (int i = 0; i < 3; i++) r[i] = a[i] * s; + }; + + gFloat3.div = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 3; i++) r[i] = a[i] / b[i]; + }; + + gFloat3.div_scalar = [](const float* a, float s, float* r) { + float denom = 1.0f / s; + for (int i = 0; i < 3; i++) r[i] = a[i] * denom; + }; + + gFloat3.dot = [](const float* a, const float* b) { + float sum = 0.f; + for (int i = 0; i < 3; i++) sum += a[i] * b[i]; + return sum; + }; + + 
gFloat3.length = [](const float* a) { + float sum = 0.f; + for (int i = 0; i < 3; i++) sum += a[i] * a[i]; + return std::sqrt(sum); + }; + + gFloat3.lengthSquared = [](const float* a) { + float sum = 0.f; + for (int i = 0; i < 3; i++) sum += a[i] * a[i]; + return (sum); + }; + + gFloat3.normalize = [](float* a) { + float len = gFloat3.length(a); + if (len > POINT_EPSILON) + { + float denom = 1.0 / len; + for (int i = 0; i < 3; i++) a[i] *= denom; + } + }; + + gFloat3.normalize_mag = [](float* a, float f) { + float len = gFloat3.length(a); + if (len > POINT_EPSILON) + { + float denom = f / len; + for (int i = 0; i < 3; i++) a[i] *= denom; + } + }; + + gFloat3.lerp = [](const float* from, const float* to, float f, float* r) { + for (int i = 0; i < 3; i++) r[i] = from[i] + (to[i] - from[i]) * f; + }; + + gFloat3.cross = [](const float* a, const float* b, float* r) { + const float ax = a[0]; + const float ay = a[1]; + const float az = a[2]; + + const float bx = b[0]; + const float by = b[1]; + const float bz = b[2]; + + r[0] = ay * bz - az * by; + r[1] = az * bx - ax * bz; + r[2] = ax * by - ay * bx; + }; + } +} + +inline void swap(float& a, float& b) +{ + float temp = a; + a = b; + b = temp; +} + + +namespace math_backend::mat44::dispatch +{ + void install_scalar() + { + gMat44.transpose = [](float* a) { + swap(a[1], a[4]); + swap(a[2], a[8]); + swap(a[3], a[12]); + swap(a[6], a[9]); + swap(a[7], a[13]); + swap(a[11], a[14]); + }; + + gMat44.scale = [](float* a, const float* s) { + // Note, doesn't allow scaling w... 
+ + a[0] *= s[0]; a[1] *= s[1]; a[2] *= s[2]; + a[4] *= s[0]; a[5] *= s[1]; a[6] *= s[2]; + a[8] *= s[0]; a[9] *= s[1]; a[10] *= s[2]; + a[12] *= s[0]; a[13] *= s[1]; a[14] *= s[2]; + }; + } +} diff --git a/Engine/source/math/isa/avx/avx_intrinsics.h b/Engine/source/math/isa/avx/avx_intrinsics.h new file mode 100644 index 000000000..0340d84a2 --- /dev/null +++ b/Engine/source/math/isa/avx/avx_intrinsics.h @@ -0,0 +1,140 @@ +#pragma once +#include // AVX/AVX2 intrinsics + +namespace +{ + typedef __m128 f32x4; + + //------------------------------------------------------ + // Load / Store + //------------------------------------------------------ + + // Load 4 floats from memory into a SIMD register + inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + inline f32x4 v_zero() { return _mm_setzero_ps(); } + + inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); } + + //------------------------------------------------------ + // Mask helpers + //------------------------------------------------------ + + inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); } + + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) + { + return _mm_blend_ps(newv, original, 0b1000); + } + + //------------------------------------------------------ + // Float3 helpers (safe loading into 4 lanes) + //------------------------------------------------------ + + inline f32x4 v_load3_vec(const float* p) // w = 0 + { + return _mm_set_ps(0.0f, p[2], p[1], p[0]); + } + + inline f32x4 v_load3_pos(const float* p) // w = 1 + { + return _mm_set_ps(1.0f, p[2], p[1], p[0]); + } + + inline void v_store3(float* dst, f32x4 v) + { + alignas(16) float tmp[4]; // temp storage + _mm_store_ps(tmp, v); // store all 4 lanes + dst[0] = tmp[0]; + dst[1] = tmp[1]; + dst[2] = tmp[2]; + } + + 
//------------------------------------------------------ + // Simple Arithmatic + //------------------------------------------------------ + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + //------------------------------------------------------ + // Fast recip + //------------------------------------------------------ + + // Fast recip 1/b + inline f32x4 v_rcp_nr(f32x4 b) + { + f32x4 r = _mm_rcp_ps(b); + f32x4 two = _mm_set1_ps(2.0f); + return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + } + + // Divide fast ( b = recip eg 1/b) + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); } + + inline f32x4 v_rsqrt_nr(f32x4 x) + { + f32x4 r = _mm_rsqrt_ps(x); + + f32x4 half = _mm_set1_ps(0.5f); + f32x4 three = _mm_set1_ps(3.0f); + + r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); + + return _mm_mul_ps(r, half); + } + + //------------------------------------------------------ + // Vector intrinsic functions + //------------------------------------------------------ + + // full dot4 + inline f32x4 v_dot4(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1 + } + + // dot3 (ignores w) + inline f32x4 v_dot3(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1 + } + + // cross product xyz only. 
+ inline f32x4 v_cross(f32x4 a, f32x4 b) + { + f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); + f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); + + f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b)); + + return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1)); + } + + inline f32x4 v_normalize3(f32x4 v) + { + f32x4 inv = v_rsqrt_nr(v_dot3(v, v)); + return _mm_mul_ps(v, inv); + } + + // adds all 4 lanes together. + inline f32x4 v_hadd4(f32x4 a) + { + // sum all 4 lanes in SSE41 + __m128 sum = _mm_hadd_ps(a, a); + return _mm_hadd_ps(sum, sum); + } +} diff --git a/Engine/source/math/isa/avx/float3.cpp b/Engine/source/math/isa/avx/float3.cpp new file mode 100644 index 000000000..7829f292c --- /dev/null +++ b/Engine/source/math/isa/avx/float3.cpp @@ -0,0 +1,26 @@ +#include "avx_intrinsics.h" +#include "float3_dispatch.h" +#include // AVX/AVX2 intrinsics + +#include "float3_impl.inl" + +namespace math_backend::float3::dispatch +{ + // Install AVX backend + void install_avx() + { + gFloat3.add = float3_add_impl; + gFloat3.sub = float3_sub_impl; + gFloat3.mul = float3_mul_impl; + gFloat3.mul_scalar = float3_mul_scalar_impl; + gFloat3.div = float3_div_impl; + gFloat3.div_scalar = float3_div_scalar_impl; + gFloat3.dot = float3_dot_impl; + gFloat3.length = float3_length_impl; + gFloat3.lengthSquared = float3_length_squared_impl; + gFloat3.normalize = float3_normalize_impl; + gFloat3.normalize_mag = float3_normalize_mag_impl; + gFloat3.lerp = float3_lerp_impl; + gFloat3.cross = float3_cross_impl; + } +} diff --git a/Engine/source/math/isa/avx/float4.cpp b/Engine/source/math/isa/avx/float4.cpp index 1e23fb8b1..cfdab0908 100644 --- a/Engine/source/math/isa/avx/float4.cpp +++ b/Engine/source/math/isa/avx/float4.cpp @@ -1,49 +1,5 @@ - +#include "avx_intrinsics.h" #include "float4_dispatch.h" -#include // AVX/AVX2 intrinsics - -namespace -{ - typedef __m128 f32x4; - - // Load 4 floats from memory into a SIMD register - inline f32x4 v_load(const 
float* p) { return _mm_loadu_ps(p); } - - // Store 4 floats from SIMD register back to memory - inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } - - // Broadcast a single float across all 4 lanes - inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } - - // Element-wise multiply - inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } - - // Element-wise divide - inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } - - // Element-wise add - inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } - - // Element-wise subtract - inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - - // Horizontal sum of all 4 elements (for dot product, length, etc.) - inline float v_hadd4(f32x4 a) - { - __m128 t1 = _mm_hadd_ps(a, a); // sums pairs: [a0+a1, a2+a3, ...] - __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3 - return _mm_cvtss_f32(t2); // extract first element - } - - // specialized dot product for AVX - float float4_dot_avx(const float* a, const float* b) - { - f32x4 va = _mm_loadu_ps(a); - f32x4 vb = _mm_loadu_ps(b); - __m128 dp = _mm_dp_ps(va, vb, 0xF1); // multiply all 4, sum all 4, lowest lane - return _mm_cvtss_f32(dp); - } -} #include "float4_impl.inl" @@ -52,16 +8,18 @@ namespace math_backend::float4::dispatch // Install AVX backend void install_avx() { - gFloat4.add = float4_add_impl; - gFloat4.sub = float4_sub_impl; - gFloat4.mul = float4_mul_impl; - gFloat4.mul_scalar = float4_mul_scalar_impl; - gFloat4.div = float4_div_impl; - gFloat4.div_scalar = float4_div_scalar_impl; - gFloat4.dot = float4_dot_avx; - gFloat4.length = float4_length_impl; + gFloat4.add = float4_add_impl; + gFloat4.sub = float4_sub_impl; + gFloat4.mul = float4_mul_impl; + gFloat4.mul_scalar = float4_mul_scalar_impl; + gFloat4.div = float4_div_impl; + gFloat4.div_scalar = float4_div_scalar_impl; + gFloat4.dot = float4_dot_impl; + gFloat4.length = float4_length_impl; gFloat4.lengthSquared = 
float4_length_squared_impl; - gFloat4.normalize = float4_normalize_impl; - gFloat4.lerp = float4_lerp_impl; + gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; + gFloat4.lerp = float4_lerp_impl; + gFloat4.cross = float4_cross_impl; } } diff --git a/Engine/source/math/isa/avx2/avx2_intrinsics.h b/Engine/source/math/isa/avx2/avx2_intrinsics.h new file mode 100644 index 000000000..0340d84a2 --- /dev/null +++ b/Engine/source/math/isa/avx2/avx2_intrinsics.h @@ -0,0 +1,140 @@ +#pragma once +#include // AVX/AVX2 intrinsics + +namespace +{ + typedef __m128 f32x4; + + //------------------------------------------------------ + // Load / Store + //------------------------------------------------------ + + // Load 4 floats from memory into a SIMD register + inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + inline f32x4 v_zero() { return _mm_setzero_ps(); } + + inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); } + + //------------------------------------------------------ + // Mask helpers + //------------------------------------------------------ + + inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); } + + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) + { + return _mm_blend_ps(newv, original, 0b1000); + } + + //------------------------------------------------------ + // Float3 helpers (safe loading into 4 lanes) + //------------------------------------------------------ + + inline f32x4 v_load3_vec(const float* p) // w = 0 + { + return _mm_set_ps(0.0f, p[2], p[1], p[0]); + } + + inline f32x4 v_load3_pos(const float* p) // w = 1 + { + return _mm_set_ps(1.0f, p[2], p[1], p[0]); + } + + inline void v_store3(float* dst, f32x4 v) + { + alignas(16) float tmp[4]; // temp storage + _mm_store_ps(tmp, v); // store all 4 lanes + 
dst[0] = tmp[0]; + dst[1] = tmp[1]; + dst[2] = tmp[2]; + } + + //------------------------------------------------------ + // Simple Arithmatic + //------------------------------------------------------ + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + //------------------------------------------------------ + // Fast recip + //------------------------------------------------------ + + // Fast recip 1/b + inline f32x4 v_rcp_nr(f32x4 b) + { + f32x4 r = _mm_rcp_ps(b); + f32x4 two = _mm_set1_ps(2.0f); + return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + } + + // Divide fast ( b = recip eg 1/b) + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); } + + inline f32x4 v_rsqrt_nr(f32x4 x) + { + f32x4 r = _mm_rsqrt_ps(x); + + f32x4 half = _mm_set1_ps(0.5f); + f32x4 three = _mm_set1_ps(3.0f); + + r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); + + return _mm_mul_ps(r, half); + } + + //------------------------------------------------------ + // Vector intrinsic functions + //------------------------------------------------------ + + // full dot4 + inline f32x4 v_dot4(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1 + } + + // dot3 (ignores w) + inline f32x4 v_dot3(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1 + } + + // cross product xyz only. 
+ inline f32x4 v_cross(f32x4 a, f32x4 b) + { + f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); + f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); + + f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b)); + + return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1)); + } + + inline f32x4 v_normalize3(f32x4 v) + { + f32x4 inv = v_rsqrt_nr(v_dot3(v, v)); + return _mm_mul_ps(v, inv); + } + + // adds all 4 lanes together. + inline f32x4 v_hadd4(f32x4 a) + { + // sum all 4 lanes in SSE41 + __m128 sum = _mm_hadd_ps(a, a); + return _mm_hadd_ps(sum, sum); + } +} diff --git a/Engine/source/math/isa/avx2/float3.cpp b/Engine/source/math/isa/avx2/float3.cpp new file mode 100644 index 000000000..fd0d485f6 --- /dev/null +++ b/Engine/source/math/isa/avx2/float3.cpp @@ -0,0 +1,26 @@ +#include "avx2_intrinsics.h" +#include "float3_dispatch.h" +#include // AVX/AVX2 intrinsics + +#include "float3_impl.inl" + +namespace math_backend::float3::dispatch +{ + // Install AVX2 backend + void install_avx2() + { + gFloat3.add = float3_add_impl; + gFloat3.sub = float3_sub_impl; + gFloat3.mul = float3_mul_impl; + gFloat3.mul_scalar = float3_mul_scalar_impl; + gFloat3.div = float3_div_impl; + gFloat3.div_scalar = float3_div_scalar_impl; + gFloat3.dot = float3_dot_impl; + gFloat3.length = float3_length_impl; + gFloat3.lengthSquared = float3_length_squared_impl; + gFloat3.normalize = float3_normalize_impl; + gFloat3.normalize_mag = float3_normalize_mag_impl; + gFloat3.lerp = float3_lerp_impl; + gFloat3.cross = float3_cross_impl; + } +} diff --git a/Engine/source/math/isa/avx2/float4.cpp b/Engine/source/math/isa/avx2/float4.cpp index 439d0e2d0..85228caaf 100644 --- a/Engine/source/math/isa/avx2/float4.cpp +++ b/Engine/source/math/isa/avx2/float4.cpp @@ -1,49 +1,5 @@ - +#include "avx2_intrinsics.h" #include "float4_dispatch.h" -#include // AVX/AVX2 intrinsics - -namespace -{ - typedef __m128 f32x4; - - // Load 4 floats from memory into a SIMD register - inline f32x4 
v_load(const float* p) { return _mm_loadu_ps(p); } - - // Store 4 floats from SIMD register back to memory - inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } - - // Broadcast a single float across all 4 lanes - inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } - - // Element-wise multiply - inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } - - // Element-wise divide - inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } - - // Element-wise add - inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } - - // Element-wise subtract - inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - - // Horizontal sum of all 4 elements (for dot product, length, etc.) - inline float v_hadd4(f32x4 a) - { - __m128 t1 = _mm_hadd_ps(a, a); // sums pairs: [a0+a1, a2+a3, ...] - __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3 - return _mm_cvtss_f32(t2); // extract first element - } - - // specialized dot product for AVX - float float4_dot_avx(const float* a, const float* b) - { - f32x4 va = _mm_loadu_ps(a); - f32x4 vb = _mm_loadu_ps(b); - __m128 dp = _mm_dp_ps(va, vb, 0xF1); // multiply all 4, sum all 4, lowest lane - return _mm_cvtss_f32(dp); - } -} #include "float4_impl.inl" @@ -58,10 +14,12 @@ namespace math_backend::float4::dispatch gFloat4.mul_scalar = float4_mul_scalar_impl; gFloat4.div = float4_div_impl; gFloat4.div_scalar = float4_div_scalar_impl; - gFloat4.dot = float4_dot_avx; + gFloat4.dot = float4_dot_impl; gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; gFloat4.lerp = float4_lerp_impl; + gFloat4.cross = float4_cross_impl; } } diff --git a/Engine/source/math/isa/neon/float3.cpp b/Engine/source/math/isa/neon/float3.cpp new file mode 100644 index 000000000..904787932 --- /dev/null +++ b/Engine/source/math/isa/neon/float3.cpp @@ -0,0 +1,25 
@@ +#include "neon_intrinsics.h" +#include "float3_dispatch.h" + +#include "float3_impl.inl" + +namespace math_backend::float3::dispatch +{ + // Install NEON backend + void install_neon() + { + gFloat3.add = float3_add_impl; + gFloat3.sub = float3_sub_impl; + gFloat3.mul = float3_mul_impl; + gFloat3.mul_scalar = float3_mul_scalar_impl; + gFloat3.div = float3_div_impl; + gFloat3.div_scalar = float3_div_scalar_impl; + gFloat3.dot = float3_dot_impl; + gFloat3.length = float3_length_impl; + gFloat3.lengthSquared = float3_length_squared_impl; + gFloat3.normalize = float3_normalize_impl; + gFloat3.normalize_mag = float3_normalize_mag_impl; + gFloat3.lerp = float3_lerp_impl; + gFloat3.cross = float3_cross_impl; + } +} diff --git a/Engine/source/math/isa/neon/float4.cpp b/Engine/source/math/isa/neon/float4.cpp index 6258db743..6996f2a76 100644 --- a/Engine/source/math/isa/neon/float4.cpp +++ b/Engine/source/math/isa/neon/float4.cpp @@ -1,50 +1,25 @@ +#include "neon_intrinsics.h" #include "float4_dispatch.h" -#include -namespace -{ - typedef float32x4_t f32x4; - - inline f32x4 v_load(const float* p) { return vld1q_f32(p); } - inline void v_store(float* dst, f32x4 v) { vst1q_f32(dst, v); } - inline f32x4 v_set1(float s) { return vdupq_n_f32(s); } - - inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); } - inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); } - inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); } - - // AArch64 native divide - inline f32x4 v_div(f32x4 a, f32x4 b) - { - return vdivq_f32(a, b); - } - - inline float v_hadd4(f32x4 a) - { - float32x2_t low = vget_low_f32(a); - float32x2_t high = vget_high_f32(a); - float32x2_t sum = vadd_f32(low, high); - sum = vpadd_f32(sum, sum); - return vget_lane_f32(sum, 0); - } -} - -#include "../../impl/float4_impl.inl" +#include "float4_impl.inl" namespace math_backend::float4::dispatch { + // Install NEON64 backend void install_neon() { - gFloat4.add = float4_add_impl; - gFloat4.sub = 
float4_sub_impl; - gFloat4.mul = float4_mul_impl; - gFloat4.mul_scalar = float4_mul_scalar_impl; - gFloat4.div = float4_div_impl; - gFloat4.div_scalar = float4_div_scalar_impl; - gFloat4.dot = float4_dot_impl; - gFloat4.length = float4_length_impl; + gFloat4.add = float4_add_impl; + gFloat4.sub = float4_sub_impl; + gFloat4.mul = float4_mul_impl; + gFloat4.mul_scalar = float4_mul_scalar_impl; + gFloat4.div = float4_div_impl; + gFloat4.div_scalar = float4_div_scalar_impl; + gFloat4.dot = float4_dot_impl; + gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; - gFloat4.normalize = float4_normalize_impl; - gFloat4.lerp = float4_lerp_impl; + gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; + gFloat4.lerp = float4_lerp_impl; + gFloat4.cross = float4_cross_impl; } } diff --git a/Engine/source/math/isa/neon/neon_intrinsics.h b/Engine/source/math/isa/neon/neon_intrinsics.h new file mode 100644 index 000000000..3476fab1b --- /dev/null +++ b/Engine/source/math/isa/neon/neon_intrinsics.h @@ -0,0 +1,130 @@ +#pragma once +#include + +namespace +{ + typedef float32x4_t f32x4; + + //------------------------------------------------------ + // Load / Store + //------------------------------------------------------ + inline f32x4 v_load(const float* p) { return vld1q_f32(p); } + inline void v_store(float* dst, f32x4 v) { vst1q_f32(dst, v); } + inline f32x4 v_set1(float s) { return vdupq_n_f32(s); } + inline f32x4 v_zero() { return vdupq_n_f32(0.0f); } + inline float v_extract0(f32x4 v) { return vgetq_lane_f32(v, 0); } + + //------------------------------------------------------ + // Mask helpers + //------------------------------------------------------ + inline f32x4 v_mask_xyz() + { + // equivalent to [1,1,1,0] + float32x4_t mask = {1.0f, 1.0f, 1.0f, 0.0f}; + return mask; + } + + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) + { + float32x4_t mask = {0.0f, 0.0f, 0.0f, 1.0f}; + return 
vbslq_f32(vceqq_f32(mask, vdupq_n_f32(1.0f)), original, newv); // BUGFIX: vbsl selects per *bit*; reinterpreting 1.0f gives 0x3F800000, which mixed bits of both inputs. vceqq builds the required all-ones w-lane mask.
+   }
+
+   //------------------------------------------------------
+   // Float3 helpers
+   //------------------------------------------------------
+   inline f32x4 v_load3_vec(const float* p) // w = 0
+   {
+      float tmp[4] = { p[0], p[1], p[2], 0.0f };
+      return vld1q_f32(tmp);
+   }
+
+   inline f32x4 v_load3_pos(const float* p) // w = 1
+   {
+      float tmp[4] = { p[0], p[1], p[2], 1.0f };
+      return vld1q_f32(tmp);
+   }
+
+   inline void v_store3(float* dst, f32x4 v)
+   {
+      float tmp[4];
+      vst1q_f32(tmp, v);
+      dst[0] = tmp[0];
+      dst[1] = tmp[1];
+      dst[2] = tmp[2];
+   }
+
+   //------------------------------------------------------
+   // Simple Arithmetic
+   //------------------------------------------------------
+   inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); }
+   inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return vdivq_f32(a, b); } // only NEON64
+   inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); }
+   inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); }
+
+   //------------------------------------------------------
+   // Fast recip
+   //------------------------------------------------------
+   inline f32x4 v_rcp_nr(f32x4 b)
+   {
+      f32x4 r = vrecpeq_f32(b);
+      r = vmulq_f32(r, vrecpsq_f32(b, r)); // Newton-Raphson
+      r = vmulq_f32(r, vrecpsq_f32(b, r));
+      return r;
+   }
+
+   inline f32x4 v_div(f32x4 a, f32x4 b)
+   {
+      return vmulq_f32(a, v_rcp_nr(b));
+   }
+
+   inline f32x4 v_rsqrt_nr(f32x4 x)
+   {
+      f32x4 r = vrsqrteq_f32(x);
+      r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r,r), x)); // refine
+      r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r,r), x));
+      return r;
+   }
+
+   //------------------------------------------------------
+   // Vector intrinsic functions
+   //------------------------------------------------------
+   inline f32x4 v_dot4(f32x4 a, f32x4 b)
+   {
+      f32x4 mul = vmulq_f32(a, b);
+      float32x2_t sum2 = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul));
+      float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 
1);
+      return vdupq_n_f32(sum);
+   }
+
+   inline f32x4 v_dot3(f32x4 a, f32x4 b)
+   {
+      float32x4_t mask = {1.0f, 1.0f, 1.0f, 0.0f};
+      f32x4 mul = vmulq_f32(a, b);
+      mul = vmulq_f32(mul, mask);
+      float32x2_t sum2 = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul));
+      float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1);
+      return vdupq_n_f32(sum);
+   }
+
+   inline f32x4 v_cross(f32x4 a, f32x4 b)
+   {
+      float32x4_t a_yzx = { vgetq_lane_f32(a,1), vgetq_lane_f32(a,2), vgetq_lane_f32(a,0), 0 };
+      float32x4_t b_yzx = { vgetq_lane_f32(b,1), vgetq_lane_f32(b,2), vgetq_lane_f32(b,0), 0 };
+      float32x4_t c = vsubq_f32(vmulq_f32(a, b_yzx), vmulq_f32(a_yzx, b));
+      return (float32x4_t){ vgetq_lane_f32(c,1), vgetq_lane_f32(c,2), vgetq_lane_f32(c,0), 0 }; // BUGFIX: c holds (cross.z, cross.x, cross.y) so the final rotate is (c1,c2,c0), matching the SSE _MM_SHUFFLE(3,0,2,1) path; (c2,c0,c1) rotated the wrong way.
+   }
+
+   inline f32x4 v_normalize3(f32x4 v)
+   {
+      f32x4 inv = v_rsqrt_nr(v_dot3(v,v));
+      return vmulq_f32(v, inv);
+   }
+
+   inline f32x4 v_hadd4(f32x4 a)
+   {
+      float32x2_t sum2 = vpadd_f32(vget_low_f32(a), vget_high_f32(a));
+      float sum = vget_lane_f32(sum2,0) + vget_lane_f32(sum2,1);
+      return vdupq_n_f32(sum);
+   }
+}
diff --git a/Engine/source/math/isa/sse2/float3.cpp b/Engine/source/math/isa/sse2/float3.cpp
new file mode 100644
index 000000000..bc822e8fc
--- /dev/null
+++ b/Engine/source/math/isa/sse2/float3.cpp
@@ -0,0 +1,26 @@
+#include "sse2_intrinsics.h"
+#include "float3_dispatch.h"
+#include <emmintrin.h> // SSE2 intrinsics
+
+#include "float3_impl.inl"
+
+namespace math_backend::float3::dispatch
+{
+   // Install SSE2 backend
+   void install_sse2()
+   {
+      gFloat3.add = float3_add_impl;
+      gFloat3.sub = float3_sub_impl;
+      gFloat3.mul = float3_mul_impl;
+      gFloat3.mul_scalar = float3_mul_scalar_impl;
+      gFloat3.div = float3_div_impl;
+      gFloat3.div_scalar = float3_div_scalar_impl;
+      gFloat3.dot = float3_dot_impl;
+      gFloat3.length = float3_length_impl;
+      gFloat3.lengthSquared = float3_length_squared_impl;
+      gFloat3.normalize = float3_normalize_impl;
+      gFloat3.normalize_mag = float3_normalize_mag_impl;
+      gFloat3.lerp = float3_lerp_impl;
+      
gFloat3.cross = float3_cross_impl; + } +} diff --git a/Engine/source/math/isa/sse2/float4.cpp b/Engine/source/math/isa/sse2/float4.cpp index 00850560a..aa986474b 100644 --- a/Engine/source/math/isa/sse2/float4.cpp +++ b/Engine/source/math/isa/sse2/float4.cpp @@ -1,42 +1,8 @@ +#include "sse2_intrinsics.h" #include "float4_dispatch.h" -#include // SSE2 intrinsics -namespace -{ - typedef __m128 f32x4; - // Load 4 floats from memory into a SIMD register - inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } +#include "float4_impl.inl" - // Store 4 floats from SIMD register back to memory - inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } - - // Broadcast a single float across all 4 lanes - inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } - - // Element-wise multiply - inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } - - // Element-wise divide - inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } - - // Element-wise add - inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } - - // Element-wise subtract - inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - - // Horizontal sum of all 4 elements (for dot product, length, etc.) 
- inline float v_hadd4(f32x4 a) - { - __m128 shuf = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); // swap pairs - __m128 sums = _mm_add_ps(a, shuf); // sums: [a0+a1 a1+a0 a2+a3 a3+a2] - shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)); // move high pair to low - sums = _mm_add_ps(sums, shuf); // total sum in lower float - return _mm_cvtss_f32(sums); - } -} - -#include "../../impl/float4_impl.inl" namespace math_backend::float4::dispatch { @@ -53,6 +19,8 @@ namespace math_backend::float4::dispatch gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; gFloat4.lerp = float4_lerp_impl; + gFloat4.cross = float4_cross_impl; } } diff --git a/Engine/source/math/isa/sse2/sse2_intrinsics.h b/Engine/source/math/isa/sse2/sse2_intrinsics.h new file mode 100644 index 000000000..85b09ebb0 --- /dev/null +++ b/Engine/source/math/isa/sse2/sse2_intrinsics.h @@ -0,0 +1,156 @@ +#pragma once +#include // SSE2 +#include // SSE + +namespace +{ + typedef __m128 f32x4; + + //------------------------------------------------------ + // Load / Store + //------------------------------------------------------ + + // Load 4 floats from memory into a SIMD register + inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + inline f32x4 v_zero() { return _mm_setzero_ps(); } + + inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); } + + //------------------------------------------------------ + // Float3 helpers (safe loading into 4 lanes) + //------------------------------------------------------ + + inline f32x4 v_load3_vec(const float* p) // w = 0 + { + return _mm_set_ps(0.0f, p[2], p[1], p[0]); + } + + inline f32x4 v_load3_pos(const float* p) // w = 1 + { + return _mm_set_ps(1.0f, p[2], p[1], p[0]); + } + + 
inline void v_store3(float* dst, f32x4 v) + { + alignas(16) float tmp[4]; // temp storage + _mm_store_ps(tmp, v); // store all 4 lanes + dst[0] = tmp[0]; + dst[1] = tmp[1]; + dst[2] = tmp[2]; + } + + //------------------------------------------------------ + // Mask helpers + //------------------------------------------------------ + + inline f32x4 v_mask_xyz() { return _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)); } + + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) + { + f32x4 mask = _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)); + return _mm_or_ps(_mm_and_ps(mask, original), _mm_andnot_ps(mask, newv)); + } + + + //------------------------------------------------------ + // Simple Arithmatic + //------------------------------------------------------ + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + //------------------------------------------------------ + // Fast recip + //------------------------------------------------------ + + // Fast recip 1/b + inline f32x4 v_rcp_nr(f32x4 b) + { + f32x4 r = _mm_rcp_ps(b); + f32x4 two = _mm_set1_ps(2.0f); + return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + } + + // Divide fast ( b = recip eg 1/b) + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); } + + inline f32x4 v_rsqrt_nr(f32x4 x) + { + f32x4 r = _mm_rsqrt_ps(x); + + f32x4 half = _mm_set1_ps(0.5f); + f32x4 three = _mm_set1_ps(3.0f); + + r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); + + return _mm_mul_ps(r, half); + } + + //------------------------------------------------------ + // Vector intrinsic functions + //------------------------------------------------------ + + 
// full dot4 + inline f32x4 v_dot4(f32x4 a, f32x4 b) + { + f32x4 prod = _mm_mul_ps(a, b); // multiply element-wise + f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1)); + prod = _mm_add_ps(prod, shuf); + shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2)); + prod = _mm_add_ps(prod, shuf); + return prod; // f32x4, all lanes = dot(a,b) + } + + // dot3 (ignores w) + inline f32x4 v_dot3(f32x4 a, f32x4 b) + { + f32x4 prod = _mm_mul_ps(a, b); + prod = _mm_and_ps(prod, v_mask_xyz()); // zero w + f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1)); + prod = _mm_add_ps(prod, shuf); + shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2)); + prod = _mm_add_ps(prod, shuf); + return prod; // f32x4, all lanes = dot(a,b) + } + + // cross product xyz only. + inline f32x4 v_cross(f32x4 a, f32x4 b) + { + f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); + f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); + + f32x4 c = _mm_sub_ps( _mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b)); + + return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1)); + } + + inline f32x4 v_normalize3(f32x4 v) + { + f32x4 inv = v_rsqrt_nr(v_dot3(v, v)); + return _mm_mul_ps(v, inv); + } + + // adds all 4 lanes together. 
+ inline f32x4 v_hadd4(f32x4 a) + { + // sum all 4 lanes in SSE2 + __m128 shuf = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); // swap pairs + __m128 sums = _mm_add_ps(a, shuf); + shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)); + return _mm_add_ps(sums, shuf); + } +} diff --git a/Engine/source/math/isa/sse41/float3.cpp b/Engine/source/math/isa/sse41/float3.cpp new file mode 100644 index 000000000..adf1a3a3d --- /dev/null +++ b/Engine/source/math/isa/sse41/float3.cpp @@ -0,0 +1,26 @@ +#include "sse41_intrinsics.h" +#include "float3_dispatch.h" +#include // SSE41 intrinsics + +#include "float3_impl.inl" + +namespace math_backend::float3::dispatch +{ + // Install SSE41 backend + void install_sse41() + { + gFloat3.add = float3_add_impl; + gFloat3.sub = float3_sub_impl; + gFloat3.mul = float3_mul_impl; + gFloat3.mul_scalar = float3_mul_scalar_impl; + gFloat3.div = float3_div_impl; + gFloat3.div_scalar = float3_div_scalar_impl; + gFloat3.dot = float3_dot_impl; + gFloat3.length = float3_length_impl; + gFloat3.lengthSquared = float3_length_squared_impl; + gFloat3.normalize = float3_normalize_impl; + gFloat3.normalize_mag = float3_normalize_mag_impl; + gFloat3.lerp = float3_lerp_impl; + gFloat3.cross = float3_cross_impl; + } +} diff --git a/Engine/source/math/isa/sse41/float4.cpp b/Engine/source/math/isa/sse41/float4.cpp index 80127acb9..e9ca8aae2 100644 --- a/Engine/source/math/isa/sse41/float4.cpp +++ b/Engine/source/math/isa/sse41/float4.cpp @@ -1,49 +1,5 @@ - +#include "sse41_intrinsics.h" #include "float4_dispatch.h" -#include // SSE41 intrinsics - -namespace -{ - typedef __m128 f32x4; - - // Load 4 floats from memory into a SIMD register - inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } - - // Store 4 floats from SIMD register back to memory - inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } - - // Broadcast a single float across all 4 lanes - inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } - - // Element-wise 
multiply - inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } - - // Element-wise divide - inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } - - // Element-wise add - inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } - - // Element-wise subtract - inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - - // Horizontal sum of all 4 elements (for dot product, length, etc.) - inline float v_hadd4(f32x4 a) - { - __m128 t1 = _mm_hadd_ps(a, a); // sums pairs: [a0+a1, a2+a3, ...] - __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3 - return _mm_cvtss_f32(t2); // extract first element - } - - // specialized dot product for SSE4.1 - float float4_dot_sse41(const float* a, const float* b) - { - f32x4 va = _mm_loadu_ps(a); - f32x4 vb = _mm_loadu_ps(b); - __m128 dp = _mm_dp_ps(va, vb, 0xF1); // multiply all 4, sum all 4, lowest lane - return _mm_cvtss_f32(dp); - } -} #include "float4_impl.inl" @@ -58,10 +14,12 @@ namespace math_backend::float4::dispatch gFloat4.mul_scalar = float4_mul_scalar_impl; gFloat4.div = float4_div_impl; gFloat4.div_scalar = float4_div_scalar_impl; - gFloat4.dot = float4_dot_sse41; + gFloat4.dot = float4_dot_impl; gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; gFloat4.lerp = float4_lerp_impl; + gFloat4.cross = float4_cross_impl; } } diff --git a/Engine/source/math/isa/sse41/sse41_intrinsics.h b/Engine/source/math/isa/sse41/sse41_intrinsics.h new file mode 100644 index 000000000..047cb44ee --- /dev/null +++ b/Engine/source/math/isa/sse41/sse41_intrinsics.h @@ -0,0 +1,140 @@ +#pragma once +#include // SSE4.1 + +namespace +{ + typedef __m128 f32x4; + + //------------------------------------------------------ + // Load / Store + //------------------------------------------------------ + + // Load 4 floats from memory into a SIMD register + 
inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + inline f32x4 v_zero() { return _mm_setzero_ps(); } + + inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); } + + //------------------------------------------------------ + // Mask helpers + //------------------------------------------------------ + + inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); } + + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) + { + return _mm_blend_ps(newv, original, 0b1000); + } + + //------------------------------------------------------ + // Float3 helpers (safe loading into 4 lanes) + //------------------------------------------------------ + + inline f32x4 v_load3_vec(const float* p) // w = 0 + { + return _mm_set_ps(0.0f, p[2], p[1], p[0]); + } + + inline f32x4 v_load3_pos(const float* p) // w = 1 + { + return _mm_set_ps(1.0f, p[2], p[1], p[0]); + } + + inline void v_store3(float* dst, f32x4 v) + { + alignas(16) float tmp[4]; // temp storage + _mm_store_ps(tmp, v); // store all 4 lanes + dst[0] = tmp[0]; + dst[1] = tmp[1]; + dst[2] = tmp[2]; + } + + //------------------------------------------------------ + // Simple Arithmatic + //------------------------------------------------------ + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + //------------------------------------------------------ + // Fast recip + //------------------------------------------------------ + + // Fast recip 1/b + inline f32x4 v_rcp_nr(f32x4 b) + { + f32x4 r = 
_mm_rcp_ps(b); + f32x4 two = _mm_set1_ps(2.0f); + return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + } + + // Divide fast ( b = recip eg 1/b) + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); } + + inline f32x4 v_rsqrt_nr(f32x4 x) + { + f32x4 r = _mm_rsqrt_ps(x); + + f32x4 half = _mm_set1_ps(0.5f); + f32x4 three = _mm_set1_ps(3.0f); + + r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); + + return _mm_mul_ps(r, half); + } + + //------------------------------------------------------ + // Vector intrinsic functions + //------------------------------------------------------ + + // full dot4 + inline f32x4 v_dot4(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1 + } + + // dot3 (ignores w) + inline f32x4 v_dot3(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1 + } + + // cross product xyz only. + inline f32x4 v_cross(f32x4 a, f32x4 b) + { + f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); + f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); + + f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b)); + + return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1)); + } + + inline f32x4 v_normalize3(f32x4 v) + { + f32x4 inv = v_rsqrt_nr(v_dot3(v, v)); + return _mm_mul_ps(v, inv); + } + + // adds all 4 lanes together. 
+ inline f32x4 v_hadd4(f32x4 a) + { + // sum all 4 lanes in SSE41 + __m128 sum = _mm_hadd_ps(a, a); + return _mm_hadd_ps(sum, sum); + } +} diff --git a/Engine/source/math/mPoint3.h b/Engine/source/math/mPoint3.h index c1bb72923..8e329f3a1 100644 --- a/Engine/source/math/mPoint3.h +++ b/Engine/source/math/mPoint3.h @@ -29,6 +29,11 @@ #ifndef _MPOINT2_H_ #include "math/mPoint2.h" #endif +#ifndef _MATH_BACKEND_H_ +#include "math/public/math_backend.h" +#endif + +#include //------------------------------------------------------------------------------ /// 3D integer point @@ -97,6 +102,7 @@ public: class Point3D; //------------------------------------------------------------------------------ +using math_backend::float3::dispatch::gFloat3; class Point3F { //-------------------------------------- Public data @@ -497,7 +503,8 @@ inline void Point3F::setMax(const Point3F& _test) inline void Point3F::interpolate(const Point3F& _from, const Point3F& _to, F32 _factor) { AssertFatal(_factor >= 0.0f && _factor <= 1.0f, "Out of bound interpolation factor"); - m_point3F_interpolate( _from, _to, _factor, *this); + + gFloat3.lerp(_from, _to, _factor, *this); } inline void Point3F::zero() @@ -599,17 +606,17 @@ inline void Point3F::convolveInverse(const Point3F& c) inline F32 Point3F::lenSquared() const { - return (x * x) + (y * y) + (z * z); + return gFloat3.lengthSquared(*this); } inline F32 Point3F::len() const { - return mSqrt(x*x + y*y + z*z); + return gFloat3.length(*this); } inline void Point3F::normalize() { - m_point3F_normalize(*this); + gFloat3.normalize(*this); } inline F32 Point3F::magnitudeSafe() const @@ -626,18 +633,13 @@ inline F32 Point3F::magnitudeSafe() const inline void Point3F::normalizeSafe() { - F32 vmag = magnitudeSafe(); - - if( vmag > POINT_EPSILON ) - { - *this *= F32(1.0 / vmag); - } + gFloat3.normalize(*this); } inline void Point3F::normalize(F32 val) { - m_point3F_normalize_f(*this, val); + gFloat3.normalize_mag(*this, val); } inline bool 
Point3F::operator==(const Point3F& _test) const @@ -652,52 +654,49 @@ inline bool Point3F::operator!=(const Point3F& _test) const inline Point3F Point3F::operator+(const Point3F& _add) const { - return Point3F(x + _add.x, y + _add.y, z + _add.z); + Point3F temp; + gFloat3.add(*this, _add, temp); + return temp; } inline Point3F Point3F::operator-(const Point3F& _rSub) const { - return Point3F(x - _rSub.x, y - _rSub.y, z - _rSub.z); + Point3F temp; + gFloat3.sub(*this, _rSub, temp); + return temp; } inline Point3F& Point3F::operator+=(const Point3F& _add) { - x += _add.x; - y += _add.y; - z += _add.z; - + gFloat3.add(*this, _add, *this); return *this; } inline Point3F& Point3F::operator-=(const Point3F& _rSub) { - x -= _rSub.x; - y -= _rSub.y; - z -= _rSub.z; - + gFloat3.sub(*this, _rSub, *this); return *this; } inline Point3F Point3F::operator*(F32 _mul) const { - return Point3F(x * _mul, y * _mul, z * _mul); + Point3F temp; + gFloat3.mul_scalar(*this, _mul, temp); + return temp; } inline Point3F Point3F::operator/(F32 _div) const { AssertFatal(_div != 0.0f, "Error, div by zero attempted"); - F32 inv = 1.0f / _div; - - return Point3F(x * inv, y * inv, z * inv); + Point3F temp; + gFloat3.div_scalar(*this, _div, temp); + return temp; } inline Point3F& Point3F::operator*=(F32 _mul) { - x *= _mul; - y *= _mul; - z *= _mul; - + gFloat3.mul_scalar(*this, _mul, *this); return *this; } @@ -705,39 +704,35 @@ inline Point3F& Point3F::operator/=(F32 _div) { AssertFatal(_div != 0.0f, "Error, div by zero attempted"); - F32 inv = 1.0f / _div; - x *= inv; - y *= inv; - z *= inv; - + gFloat3.div_scalar(*this, _div, *this); return *this; } inline Point3F Point3F::operator*(const Point3F &_vec) const { - return Point3F(x * _vec.x, y * _vec.y, z * _vec.z); + Point3F temp; + gFloat3.mul(*this, _vec, temp); + return temp; } inline Point3F& Point3F::operator*=(const Point3F &_vec) { - x *= _vec.x; - y *= _vec.y; - z *= _vec.z; + gFloat3.mul(*this, _vec, *this); return *this; } inline 
Point3F Point3F::operator/(const Point3F &_vec) const { AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted"); - return Point3F(x / _vec.x, y / _vec.y, z / _vec.z); + Point3F temp; + gFloat3.div(*this, _vec, temp); + return temp; } inline Point3F& Point3F::operator/=(const Point3F &_vec) { AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted"); - x /= _vec.x; - y /= _vec.y; - z /= _vec.z; + gFloat3.div(*this, _vec, *this); return *this; } @@ -989,7 +984,9 @@ inline Point3I operator*(S32 mul, const Point3I& multiplicand) inline Point3F operator*(F32 mul, const Point3F& multiplicand) { - return multiplicand * mul; + Point3F temp; + gFloat3.mul_scalar(multiplicand, mul, temp); + return temp; } inline Point3D operator*(F64 mul, const Point3D& multiplicand) @@ -999,7 +996,7 @@ inline Point3D operator*(F64 mul, const Point3D& multiplicand) inline F32 mDot(const Point3F &p1, const Point3F &p2) { - return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z); + return gFloat3.dot(p1, p2); } inline F64 mDot(const Point3D &p1, const Point3D &p2) @@ -1009,9 +1006,7 @@ inline F64 mDot(const Point3D &p1, const Point3D &p2) inline void mCross(const Point3F &a, const Point3F &b, Point3F *res) { - res->x = (a.y * b.z) - (a.z * b.y); - res->y = (a.z * b.x) - (a.x * b.z); - res->z = (a.x * b.y) - (a.y * b.x); + gFloat3.cross(a, b, *res); } inline void mCross(const Point3D &a, const Point3D &b, Point3D *res) @@ -1024,7 +1019,7 @@ inline void mCross(const Point3D &a, const Point3D &b, Point3D *res) inline Point3F mCross(const Point3F &a, const Point3F &b) { Point3F r; - mCross( a, b, &r ); + gFloat3.cross(a, b, r); return r; } diff --git a/Engine/source/math/mPoint4.h b/Engine/source/math/mPoint4.h index 8ae173009..0f715f983 100644 --- a/Engine/source/math/mPoint4.h +++ b/Engine/source/math/mPoint4.h @@ -26,10 +26,12 @@ #ifndef _MMATHFN_H_ #include "math/mMathFn.h" #endif - #ifndef _MPOINT3_H_ #include "math/mPoint3.h" 
#endif +#ifndef _MATH_BACKEND_H_ +#include "math/public/math_backend.h" +#endif //------------------------------------------------------------------------------ @@ -61,6 +63,8 @@ class Point4I /// Uses F32 internally. /// /// Useful for representing quaternions and other 4d beasties. +using math_backend::float4::dispatch::gFloat4; + class Point4F { //-------------------------------------- Public data @@ -152,15 +156,12 @@ inline void Point4F::set(F32 _x, F32 _y, F32 _z, F32 _w) inline F32 Point4F::len() const { - return mSqrt(x*x + y*y + z*z + w*w); + return gFloat4.length(*this); } inline void Point4F::interpolate(const Point4F& _from, const Point4F& _to, F32 _factor) { - x = (_from.x * (1.0f - _factor)) + (_to.x * _factor); - y = (_from.y * (1.0f - _factor)) + (_to.y * _factor); - z = (_from.z * (1.0f - _factor)) + (_to.z * _factor); - w = (_from.w * (1.0f - _factor)) + (_to.w * _factor); + gFloat4.lerp(_from, _to, _factor, *this); } inline void Point4F::zero() @@ -193,55 +194,55 @@ inline Point4F& Point4F::operator/=(F32 scalar) if (mIsZero(scalar)) return *this; - F32 denom = 1 / scalar; - - x *= denom; - y *= denom; - z *= denom; - w *= denom; + gFloat4.div_scalar(*this, scalar, *this); return *this; } inline Point4F Point4F::operator+(const Point4F& _add) const { - return Point4F( x + _add.x, y + _add.y, z + _add.z, w + _add.w ); + Point4F res; + gFloat4.add(*this, _add, res); + return res; } inline Point4F& Point4F::operator+=(const Point4F& _add) { - x += _add.x; - y += _add.y; - z += _add.z; - w += _add.w; - + gFloat4.add(*this, _add, *this); return *this; } inline Point4F Point4F::operator-(const Point4F& _rSub) const { - return Point4F( x - _rSub.x, y - _rSub.y, z - _rSub.z, w - _rSub.w ); + Point4F res; + gFloat4.sub(*this, _rSub, res); + return res; } inline Point4F Point4F::operator*(const Point4F &_vec) const { - return Point4F(x * _vec.x, y * _vec.y, z * _vec.z, w * _vec.w); + Point4F res; + gFloat4.mul(*this, _vec, res); + return res; } inline 
Point4F Point4F::operator*(F32 _mul) const { - return Point4F(x * _mul, y * _mul, z * _mul, w * _mul); + Point4F res; + gFloat4.mul_scalar(*this, _mul, res); + return res; } inline Point4F Point4F::operator /(F32 t) const { - F32 f = 1.0f / t; - return Point4F( x * f, y * f, z * f, w * f ); + Point4F res; + gFloat4.div_scalar(*this, t, res); + return res; } inline F32 mDot(const Point4F &p1, const Point4F &p2) { - return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z + p1.w*p2.w); + return gFloat4.dot(p1, p2); } //------------------------------------------------------------------------------ diff --git a/Engine/source/math/public/float3_dispatch.h b/Engine/source/math/public/float3_dispatch.h new file mode 100644 index 000000000..e4279cb84 --- /dev/null +++ b/Engine/source/math/public/float3_dispatch.h @@ -0,0 +1,39 @@ +#pragma once +#ifndef _FLOAT3_DISPATCH_H_ +#define _FLOAT3_DISPATCH_H_ + + +#include + +namespace math_backend::float3::dispatch +{ + struct Float3Funcs + { + void (*add)(const float*, const float*, float*) = nullptr; + void (*sub)(const float*, const float*, float*) = nullptr; + void (*mul)(const float*, const float*, float*) = nullptr; + void (*mul_scalar)(const float*, float, float*) = nullptr; + void (*div)(const float*, const float*, float*) = nullptr; + void (*div_scalar)(const float*, float, float*) = nullptr; + float (*dot)(const float*, const float*) = nullptr; + float (*length)(const float*) = nullptr; + float (*lengthSquared)(const float*) = nullptr; + void (*normalize)(float*) = nullptr; + void (*normalize_mag)(float*, float) = nullptr; + void (*lerp)(const float*, const float*, float, float*) = nullptr; + void (*cross)(const float*, const float*, float*) = nullptr; + }; + + // Global dispatch table + extern Float3Funcs gFloat3; + + // Backend installers (defined in ISA libraries) + void install_scalar(); + void install_sse2(); + void install_sse41(); + void install_avx(); + void install_avx2(); + void install_neon(); +} + +#endif // 
!_FLOAT3_DISPATCH_H_
diff --git a/Engine/source/math/public/float4_dispatch.cpp b/Engine/source/math/public/float4_dispatch.cpp
deleted file mode 100644
index 810eb0e46..000000000
--- a/Engine/source/math/public/float4_dispatch.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "math/public/float4_dispatch.h"
-
-namespace math_backend::float4::dispatch
-{
-   // Single definition of the global dispatch table
-   Float4Funcs gFloat4{};
-}
diff --git a/Engine/source/math/public/float4_dispatch.h b/Engine/source/math/public/float4_dispatch.h
index 319b1893f..6f26114ce 100644
--- a/Engine/source/math/public/float4_dispatch.h
+++ b/Engine/source/math/public/float4_dispatch.h
@@ -19,7 +19,9 @@ namespace math_backend::float4::dispatch
       float (*length)(const float*) = nullptr;
       float (*lengthSquared)(const float*) = nullptr;
       void (*normalize)(float*) = nullptr;
+      void (*normalize_mag)(float*, float) = nullptr;
       void (*lerp)(const float*, const float*, float, float*) = nullptr;
+      void (*cross)(const float*, const float*, float*) = nullptr;
    };
 
    // Global dispatch table
@@ -32,9 +34,6 @@ namespace math_backend::float4::dispatch
    void install_avx();
    void install_avx2();
    void install_neon();
-
-   // Centralized installer (engine calls this once)
-   void install_preferred();
 }
 
 #endif // !_FLOAT4_DISPATCH_H_
diff --git a/Engine/source/math/public/mat44_dispatch.h b/Engine/source/math/public/mat44_dispatch.h
new file mode 100644
index 000000000..61b488861
--- /dev/null
+++ b/Engine/source/math/public/mat44_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+#ifndef _MAT44_DISPATCH_H_
+#define _MAT44_DISPATCH_H_
+
+
+namespace math_backend::mat44::dispatch
+{
+   struct Mat44Funcs
+   {
+      void (*transpose)(float*) = nullptr;
+      void (*scale)(float*, const float*) = nullptr;
+   };
+
+   // Global dispatch table
+   extern Mat44Funcs gMat44;
+
+   // Backend installers (defined in ISA libraries)
+   void install_scalar();
+   void install_sse2();
+   void install_sse41();
+   void install_avx();
+   void install_avx2();
+   void 
install_neon(); +} + +#endif // !_MAT44_DISPATCH_H_ diff --git a/Engine/source/math/public/math_backend.cpp b/Engine/source/math/public/math_backend.cpp index 7998924ee..9b5e5daed 100644 --- a/Engine/source/math/public/math_backend.cpp +++ b/Engine/source/math/public/math_backend.cpp @@ -1,6 +1,24 @@ #pragma once #include "math/public/math_backend.h" +namespace math_backend::float4::dispatch +{ + // Single definition of the global dispatch table + Float4Funcs gFloat4{}; +} + +namespace math_backend::float3::dispatch +{ + // Single definition of the global dispatch table + Float3Funcs gFloat3{}; +} + +namespace math_backend::mat44::dispatch +{ + Mat44Funcs gMat44{}; +} + + math_backend::backend math_backend::choose_backend(U32 cpu_flags) { #if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) @@ -12,7 +30,7 @@ math_backend::backend math_backend::choose_backend(U32 cpu_flags) #elif defined(__aarch64__) || defined(__ARM_NEON) - if (cpu_flags & CPU_NEON) return backend::neon; + if (cpu_flags & CPU_PROP_NEON) return backend::neon; #endif return backend::scalar; @@ -25,28 +43,36 @@ void math_backend::install_from_cpu_flags(uint32_t cpu_flags) switch (g_backend) { +#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) case backend::avx2: float4::dispatch::install_avx2(); + float3::dispatch::install_avx2(); break; case backend::avx: - //float4::dispatch::install_avx(); + float4::dispatch::install_avx(); + float3::dispatch::install_avx(); break; case backend::sse41: float4::dispatch::install_sse41(); + float3::dispatch::install_sse41(); break; case backend::sse2: float4::dispatch::install_sse2(); + float3::dispatch::install_sse2(); break; - +#elif defined(__aarch64__) || defined(__ARM_NEON) case backend::neon: float4::dispatch::install_neon(); + float3::dispatch::install_neon(); break; - +#endif default: float4::dispatch::install_scalar(); + float3::dispatch::install_scalar(); + mat44::dispatch::install_scalar(); break; } } diff --git 
a/Engine/source/math/public/math_backend.h b/Engine/source/math/public/math_backend.h index 40476e7f0..0a3127f81 100644 --- a/Engine/source/math/public/math_backend.h +++ b/Engine/source/math/public/math_backend.h @@ -1,4 +1,7 @@ #pragma once +#ifndef _MATH_BACKEND_H_ +#define _MATH_BACKEND_H_ + #ifndef _MCONSTANTS_H_ #include "math/mConstants.h" #endif @@ -8,6 +11,12 @@ #ifndef _FLOAT4_DISPATCH_H_ #include "math/public/float4_dispatch.h" #endif +#ifndef _FLOAT3_DISPATCH_H_ +#include "math/public/float3_dispatch.h" +#endif +#ifndef _MAT44_DISPATCH_H_ +#include "math/public/mat44_dispatch.h" +#endif namespace math_backend { @@ -25,3 +34,5 @@ namespace math_backend backend choose_backend(U32 cpu_flags); void install_from_cpu_flags(uint32_t cpu_flags); } + +#endif // !_MATH_BACKEND_H_ diff --git a/Engine/source/platform/platform.h b/Engine/source/platform/platform.h index 1326c4692..0d377f23f 100644 --- a/Engine/source/platform/platform.h +++ b/Engine/source/platform/platform.h @@ -76,7 +76,7 @@ enum ProcessorProperties CPU_PROP_SSE4_2 = (1<<8), ///< Supports SSE4_2 instruction set extension. CPU_PROP_AVX = (1<<9), ///< Supports AVX256 instruction set extension. CPU_PROP_AVX2 = (1<<10), ///< Supports AVX256 instruction set extension. - CPU_PROP_AVX512 = (1<<11), ///< Supports AVX256 instruction set extension. + CPU_PROP_AVX512 = (1<<11), ///< Supports AVX512 instruction set extension. CPU_PROP_MP = (1<<12), ///< This is a multi-processor system. CPU_PROP_LE = (1<<13), ///< This processor is LITTLE ENDIAN. 
CPU_PROP_64bit = (1<<14), ///< This processor is 64-bit capable diff --git a/Engine/source/platformMac/macMath.mm b/Engine/source/platformMac/macMath.mm index 4feefb277..c275bdd17 100644 --- a/Engine/source/platformMac/macMath.mm +++ b/Engine/source/platformMac/macMath.mm @@ -25,6 +25,7 @@ #import "math/mMath.h" #import "core/strings/stringFunctions.h" #include "console/engineAPI.h" +#include "math/public/math_backend.h" extern void mInstallLibrary_C(); @@ -107,7 +108,9 @@ void Math::init(U32 properties) Con::printf("Math Init:"); Con::printf(" Installing Standard C extensions"); - mInstallLibrary_C(); + mInstallLibrary_C(); + + math_backend::install_from_cpu_flags(properties); #ifdef TORQUE_CPU_X86 if( properties & CPU_PROP_SSE ) diff --git a/Engine/source/platformPOSIX/POSIXMath.cpp b/Engine/source/platformPOSIX/POSIXMath.cpp index 8f21329a3..751df294e 100644 --- a/Engine/source/platformPOSIX/POSIXMath.cpp +++ b/Engine/source/platformPOSIX/POSIXMath.cpp @@ -27,6 +27,7 @@ #include "math/mMath.h" #include "core/strings/stringFunctions.h" #include "console/engineAPI.h" +#include "math/public/math_backend.h" extern void mInstallLibrary_C(); extern void mInstallLibrary_ASM(); @@ -90,6 +91,8 @@ void Math::init(U32 properties) Con::printf(" Installing Standard C extensions"); mInstallLibrary_C(); + math_backend::install_from_cpu_flags(properties); + #if defined(TORQUE_CPU_X32) || defined(TORQUE_CPU_X64) Con::printf(" Installing Assembly extensions"); mInstallLibrary_ASM(); diff --git a/Engine/source/platformWin32/winMath.cpp b/Engine/source/platformWin32/winMath.cpp index 44b215301..36a9d8c58 100644 --- a/Engine/source/platformWin32/winMath.cpp +++ b/Engine/source/platformWin32/winMath.cpp @@ -25,7 +25,7 @@ #include "console/engineAPI.h" #include "math/mMath.h" - +#include "math/public/math_backend.h" extern void mInstallLibrary_C(); extern void mInstallLibrary_ASM(); @@ -98,6 +98,8 @@ void Math::init(U32 properties) Con::printf(" Installing Standard C extensions"); 
mInstallLibrary_C(); + math_backend::install_from_cpu_flags(properties); + Con::printf(" Installing Assembly extensions"); mInstallLibrary_ASM(); diff --git a/Engine/source/util/fpsTracker.cpp b/Engine/source/util/fpsTracker.cpp index 021d4a4f5..830000bc8 100644 --- a/Engine/source/util/fpsTracker.cpp +++ b/Engine/source/util/fpsTracker.cpp @@ -36,6 +36,8 @@ void FPSTracker::reset() { fpsNext = (F32)Platform::getRealMilliseconds()/1000.0f + mUpdateInterval; + fpsAccumTime = 0.0f; + fpsAccumVirtualTime = 0.0f; fpsRealLast = 0.0f; fpsReal = 0.0f; fpsRealMin = 0.000001f; // Avoid division by zero. @@ -51,42 +53,60 @@ void FPSTracker::update() F32 realSeconds = (F32)Platform::getRealMilliseconds()/1000.0f; F32 virtualSeconds = (F32)Platform::getVirtualMilliseconds()/1000.0f; - fpsFrames++; - if (fpsFrames > 1) + if (fpsRealLast == 0.0f) { - fpsReal = fpsReal*(1.0-alpha) + (realSeconds-fpsRealLast)*alpha; - fpsVirtual = fpsVirtual*(1.0-alpha) + (virtualSeconds-fpsVirtualLast)*alpha; - - if( fpsFrames > 10 ) // Wait a few frames before updating these. - { - // Update min/max. This is a bit counter-intuitive, as the comparisons are - // inversed because these are all one-over-x values. 
- - if( fpsReal > fpsRealMin ) - fpsRealMin = fpsReal; - if( fpsReal < fpsRealMax ) - fpsRealMax = fpsReal; - } + fpsRealLast = realSeconds; + fpsVirtualLast = virtualSeconds; + return; } - fpsRealLast = realSeconds; + F32 frameTimeReal = realSeconds - fpsRealLast; + F32 frameTimeVirtual = virtualSeconds - fpsVirtualLast; + + fpsRealLast = realSeconds; fpsVirtualLast = virtualSeconds; - // update variables every few frames - F32 update = fpsRealLast - fpsNext; - if (update > 0.5f) - { - F32 delta = realSeconds - fpsNext; - Con::setVariable( "fps::frameDelta",avar("%g", delta)); - Con::setVariable( "fps::real", avar( "%4.1f", 1.0f / fpsReal ) ); - Con::setVariable( "fps::realMin", avar( "%4.1f", 1.0f / fpsRealMin ) ); - Con::setVariable( "fps::realMax", avar( "%4.1f", 1.0f / fpsRealMax ) ); - Con::setVariable( "fps::virtual", avar( "%4.1f", 1.0f / fpsVirtual ) ); + // Accumulate for windowed FPS calculation + fpsFrames++; + fpsAccumTime += frameTimeReal; + fpsAccumVirtualTime += frameTimeVirtual; - if (update > mUpdateInterval) - fpsNext = fpsRealLast + mUpdateInterval; - else - fpsNext += mUpdateInterval; + // Only update console values at interval + if (realSeconds >= fpsNext) + { + fpsReal = 0.0f; + fpsVirtual = 0.0f; + + if (fpsAccumTime > 0.0f) + fpsReal = fpsFrames / fpsAccumTime; + + if (fpsAccumVirtualTime > 0.0f) + fpsVirtual = fpsFrames / fpsAccumVirtualTime; + + // Update min/max correctly (these are FPS now) + if (fpsReal > 0.0f) + { + if (fpsReal < fpsRealMin) + fpsRealMin = fpsReal; + + if (fpsReal > fpsRealMax) + fpsRealMax = fpsReal; + } + + // frameDelta = actual accumulated real time over window + Con::setVariable("fps::frameTimeMs", avar("%4.3f", (fpsAccumTime / fpsFrames) * 1000.0f)); + Con::setVariable("fps::frameDelta", avar("%g", fpsAccumTime)); + Con::setVariable("fps::real", avar("%4.1f", fpsReal)); + Con::setVariable("fps::realMin", avar("%4.1f", fpsRealMin)); + Con::setVariable("fps::realMax", avar("%4.1f", fpsRealMax)); + 
Con::setVariable("fps::virtual", avar("%4.1f", fpsVirtual)); + + // Reset window + fpsFrames = 0; + fpsAccumTime = 0.0f; + fpsAccumVirtualTime = 0.0f; + + fpsNext = realSeconds + mUpdateInterval; } } diff --git a/Engine/source/util/fpsTracker.h b/Engine/source/util/fpsTracker.h index e5973ae68..a29fbbecf 100644 --- a/Engine/source/util/fpsTracker.h +++ b/Engine/source/util/fpsTracker.h @@ -27,6 +27,8 @@ struct FPSTracker { + F32 fpsAccumTime; + F32 fpsAccumVirtualTime; F32 fpsRealLast; F32 fpsReal; F32 fpsRealMin; @@ -48,4 +50,4 @@ struct FPSTracker extern FPSTracker gFPS; -#endif \ No newline at end of file +#endif diff --git a/Tools/CMake/torque_macros.cmake b/Tools/CMake/torque_macros.cmake index 5a7928b38..f799e8c39 100644 --- a/Tools/CMake/torque_macros.cmake +++ b/Tools/CMake/torque_macros.cmake @@ -136,7 +136,7 @@ macro(addFramework framework) endmacro() function(add_math_backend name compile_defs) - file(GLOB_RECURSE SRC CONFIGURE_DEPENDS "math/isa/${name}/*.cpp") + file(GLOB_RECURSE SRC CONFIGURE_DEPENDS "math/isa/${name}/*.cpp" "math/isa/${name}/*.h") if(NOT SRC) return() @@ -144,6 +144,7 @@ function(add_math_backend name compile_defs) add_library(math_${name} OBJECT ${SRC}) + message(STATUS "adding math library for isa ${name}") target_include_directories(math_${name} PUBLIC "math/public" "math/impl" From bc3145bc55719432db9439d42d1042916a418e74 Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Tue, 3 Mar 2026 19:09:00 +0000 Subject: [PATCH 07/23] matrix functions most matrix functions are converted over, no benefit to converting over the project/ortho because they would be scalar anyway but may need to move them regardless. 
--- Engine/source/environment/meshRoad.cpp | 4 +- Engine/source/environment/river.cpp | 2 +- Engine/source/math/impl/float3_impl.inl | 2 +- Engine/source/math/impl/mat44_impl.inl | 312 +++++++++++++++ Engine/source/math/impl/math_c.cpp | 165 ++++++++ Engine/source/math/isa/avx/avx_intrinsics.h | 252 +++++++++++- Engine/source/math/isa/avx/mat44.cpp | 22 ++ Engine/source/math/isa/avx2/avx2_intrinsics.h | 252 +++++++++++- Engine/source/math/isa/avx2/mat44.cpp | 22 ++ Engine/source/math/isa/sse2/mat44.cpp | 22 ++ Engine/source/math/isa/sse2/sse2_intrinsics.h | 368 +++++++++++++++++- Engine/source/math/isa/sse41/mat44.cpp | 22 ++ .../source/math/isa/sse41/sse41_intrinsics.h | 364 ++++++++++++++++- Engine/source/math/mMathFn.h | 6 +- Engine/source/math/mMath_C.cpp | 113 ++---- Engine/source/math/mMatrix.h | 50 +-- Engine/source/math/public/mat44_dispatch.h | 9 + Engine/source/math/public/math_backend.cpp | 24 ++ Engine/source/ts/assimp/assimpAppNode.cpp | 2 +- Engine/source/ts/collada/colladaAppMesh.cpp | 2 +- Engine/source/ts/collada/colladaAppNode.cpp | 2 +- 21 files changed, 1881 insertions(+), 136 deletions(-) create mode 100644 Engine/source/math/impl/mat44_impl.inl create mode 100644 Engine/source/math/isa/avx/mat44.cpp create mode 100644 Engine/source/math/isa/avx2/mat44.cpp create mode 100644 Engine/source/math/isa/sse2/mat44.cpp create mode 100644 Engine/source/math/isa/sse41/mat44.cpp diff --git a/Engine/source/environment/meshRoad.cpp b/Engine/source/environment/meshRoad.cpp index 0b10f4a29..ea09c4352 100644 --- a/Engine/source/environment/meshRoad.cpp +++ b/Engine/source/environment/meshRoad.cpp @@ -3406,7 +3406,7 @@ MatrixF MeshRoad::getNodeTransform( U32 idx ) mat.setColumn( 2, node.normal ); mat.setColumn( 3, node.point ); - AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!"); + AssertFatal(mat.determinant() != 0.0f, "no inverse!"); return mat; } @@ -3456,7 +3456,7 @@ void MeshRoad::calcSliceTransform( U32 idx, MatrixF &mat ) mat.setColumn( 2, 
slice.normal ); mat.setColumn( 3, slice.p1 ); - AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!"); + AssertFatal(mat.determinant() != 0.0f, "no inverse!"); } F32 MeshRoad::getRoadLength() const diff --git a/Engine/source/environment/river.cpp b/Engine/source/environment/river.cpp index acefb66b3..a3456cf17 100644 --- a/Engine/source/environment/river.cpp +++ b/Engine/source/environment/river.cpp @@ -2139,7 +2139,7 @@ MatrixF River::getNodeTransform( U32 idx ) const mat.setColumn( 2, node.normal ); mat.setColumn( 3, node.point ); - AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!"); + AssertFatal( mat.determinant() != 0.0f, "no inverse!"); return mat; } diff --git a/Engine/source/math/impl/float3_impl.inl b/Engine/source/math/impl/float3_impl.inl index 87b325faf..216fc99ba 100644 --- a/Engine/source/math/impl/float3_impl.inl +++ b/Engine/source/math/impl/float3_impl.inl @@ -99,7 +99,7 @@ namespace math_backend::float3 f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot3(va, va))); f32x4 vnorm = v_mul(va, invLen); - v_store(a, vnorm); + v_store3(a, vnorm); } // Linear interpolation: r = from + (to - from) * f diff --git a/Engine/source/math/impl/mat44_impl.inl b/Engine/source/math/impl/mat44_impl.inl new file mode 100644 index 000000000..a9635cff0 --- /dev/null +++ b/Engine/source/math/impl/mat44_impl.inl @@ -0,0 +1,312 @@ +#pragma once +#include // for sqrtf, etc. 
+#include "../mConstants.h" + +namespace math_backend::mat44 +{ + //------------------------------------------------------------------ + // Matrix Transpose + inline void mat44_transpose_impl(float* m) + { + f32x4x4 ma = m_load(m); + f32x4x4 mr = m_transpose(ma); + m_store(m, mr); + } + + inline float mat44_get_determinant(const float* m) + { + f32x4x4 ma = m_load(m); + return v_extract0(m_determinant_affine(ma)); + } + + // Matrix Scale: Float3 (assume w = 1.0f) + inline void mat44_scale_impl(float* m, const float* s) + { + f32x4x4 ma = m_load(m); + f32x4 va = v_load3_pos(s); + + ma.r0 = v_mul(ma.r0, va); + ma.r1 = v_mul(ma.r1, va); + ma.r2 = v_mul(ma.r2, va); + ma.r3 = v_mul(ma.r3, va); + m_store(m, ma); + } + + inline void mat44_get_scale_impl(const float* m, float* s) + { + f32x4x4 ma = m_load(m); + + // squared lengths + f32x4 len2_x = v_dot3(ma.r0, ma.r0); + f32x4 len2_y = v_dot3(ma.r1, ma.r1); + f32x4 len2_z = v_dot3(ma.r2, ma.r2); + + // extract and sqrt + s[0] = 1.0f / v_extract0(v_rsqrt_nr(len2_x)); + s[1] = 1.0f / v_extract0(v_rsqrt_nr(len2_y)); + s[2] = 1.0f / v_extract0(v_rsqrt_nr(len2_z)); + } + + // Matrix Scale Uniform: Float value (assume w = 1.0f) + inline void mat44_scale_uniform(float* m, float s) + { + f32x4x4 ma = m_load(m); + + // (s, s, s, 1) + f32x4 scale = v_set(s, s, s, 1.0f); + + // Scale only rotation rows (xyz part) + ma.r0 = v_mul(ma.r0, scale); + ma.r1 = v_mul(ma.r1, scale); + ma.r2 = v_mul(ma.r2, scale); + m_store(m, ma); + } + + // Matrix Inverse + inline void mat44_inverse_impl(float* m) + { + f32x4x4 ma = m_load(m); + + // Compute cofactors using cross products + f32x4x4 mTemp; + mTemp.r0 = v_cross(ma.r1, ma.r2); + mTemp.r1 = v_cross(ma.r2, ma.r0); + mTemp.r2 = v_cross(ma.r0, ma.r1); + + // Determinant = dot(ma.r0, c0) + f32x4 det = v_dot3(ma.r0, mTemp.r0); + f32x4 invDet = v_rcp_nr(det); + + // Scale cofactors + mTemp.r0 = v_mul(mTemp.r0, invDet); + mTemp.r1 = v_mul(mTemp.r1, invDet); + mTemp.r2 = v_mul(mTemp.r2, invDet); + + // 
Store inverse 3x3 (transpose of cofactor matrix) + + mTemp = m_transpose(mTemp); + mTemp.r3 = v_set(0, 0, 0, 1.0f); + + // ---- Translation ---- + + // Load original translation + f32x4 T = v_set(m[3], m[7], m[11], 0.0f); + + // Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2) + f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0)); + result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1))); + result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2))); + result = v_mul(result, v_set1(-1.0f)); + + m_store(m, mTemp); + + // Store translation + m[3] = v_extract0(result); + m[7] = v_extract0(v_swizzle_singular_mask(result, 1)); + m[11] = v_extract0(v_swizzle_singular_mask(result, 2)); + } + + // Matrix Affine Inverse + inline void mat44_affine_inverse_impl(float* m) + { + f32x4x4 ma = m_load(m); + + f32x4x4 mTemp = m_transpose(ma); + mTemp.r3 = v_set(0, 0, 0, 1); + + // ---- Translation ---- + // Load original translation + f32x4 T = v_set(m[3], m[7], m[11], 0.0f); + + // Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2) + f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0)); + result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1))); + result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2))); + result = v_mul(result, v_set1(-1.0f)); + + m_store(m, mTemp); + + // Store translation + m[3] = v_extract0(result); + m[7] = v_extract0(v_swizzle_singular_mask(result, 1)); + m[11] = v_extract0(v_swizzle_singular_mask(result, 2)); + } + + inline void mat44_normalize_impl(float* a) + { + // Load the matrix + f32x4x4 m = m_load(a); + + // Extract axes (rows 0-2), zero out w using v_mask_xyz + f32x4 xaxis = v_mul(m.r0, v_mask_xyz()); + f32x4 yaxis = v_mul(m.r1, v_mask_xyz()); + f32x4 zaxis = v_mul(m.r2, v_mask_xyz()); + + xaxis = v_normalize3(xaxis); + + float dotXY = v_extract0(v_hadd4(v_mul(xaxis, yaxis))); + f32x4 projYonX = v_mul(v_set1(dotXY), xaxis); + yaxis = v_normalize3(v_sub(yaxis, projYonX)); + + zaxis = v_cross(xaxis, yaxis); + + 
// Store normalized axes back (preserve translation w) + m.r0 = v_preserve_w(xaxis, m.r0); + m.r1 = v_preserve_w(yaxis, m.r1); + m.r2 = v_preserve_w(zaxis, m.r2); + + // Store back to memory + m_store(a, m); + } + + // Matrix Multiply: a * b + inline void mat44_mul_mat44_impl(const float* a, const float* b, float* r) + { + f32x4x4 ma = m_load(a); + f32x4x4 mb = m_load(b); + + f32x4x4 mr = m_mul(ma, mb); + m_store(r, mr); + } + + // Vector Multiply: m * p (assume w = 1.0f) + inline void mat44_mul_pos3_impl(const float *m, const float *p, float* r) + { + f32x4x4 ma = m_load(m); + f32x4 va = v_load3_pos(p); + f32x4 vr = m_mul_vec4(ma, va); + v_store3(r, vr); + } + + // Vector Multiply: m * v (assume w = 0.0f) + inline void mat44_mul_vec3_impl(const float* m, const float* v, float* r) + { + f32x4x4 ma = m_load(m); + f32x4 va = v_load3_vec(v); + f32x4 vr = m_mul_vec4(ma, va); + v_store3(r, vr); + } + + // Vector Multiply: m * p (full [4x4] * [1x4]) + inline void mat44_mul_float4_impl(const float* m, const float* p, float* r) + { + f32x4x4 ma = m_load(m); + f32x4 va = v_load(p); + f32x4 vr = m_mul_vec4(ma, va); + v_store(r, vr); + } + + //-------------------------------------------------- + // MATRIX ROTATION FUNCTIONS + //-------------------------------------------------- + + inline void mat44_rotation_x_impl(float* m, float angle) + { + float c = cosf(angle), s = sinf(angle); + f32x4x4 mr = m_identity(); + mr.r1 = v_set(0, c, s, 0); + mr.r2 = v_set(0, -s, c, 0); + m_store(m, mr); + } + + inline void mat44_rotation_y_impl(float* m, float angle) + { + float c = cosf(angle), s = sinf(angle); + f32x4x4 mr = m_identity(); + mr.r0 = v_set(c, 0, -s, 0); + mr.r2 = v_set(s, 0, c, 0); + m_store(m, mr); + } + + inline void mat44_rotation_z_impl(float* m, float angle) + { + float c = cosf(angle), s = sinf(angle); + f32x4x4 mr = m_identity(); + mr.r0 = v_set(c, s, 0, 0); + mr.r1 = v_set(-s, c, 0, 0); + m_store(m, mr); + } + + // Compose rotation from Euler angles (pitch=X, yaw=Y, 
roll=Z) + inline void mat44_rotation_euler_impl(float* m, float pitch, float yaw, float roll) + { + f32x4x4 rx, ry, rz; + mat44_rotation_x_impl((float*)&rx, pitch); + mat44_rotation_y_impl((float*)&ry, yaw); + mat44_rotation_z_impl((float*)&rz, roll); + + f32x4x4 r = m_mul(rz, m_mul(ry, rx)); + m_store(m, r); + } + + inline void mat44_trs_impl(float* m, const float* t, const float* r_euler, const float* s) + { + f32x4x4 mr; + mat44_rotation_euler_impl((float*)&mr, r_euler[0], r_euler[1], r_euler[2]); + + f32x4 vs = v_load3_vec(s); // scale xyz + mr.r0 = v_mul(mr.r0, vs); + mr.r1 = v_mul(mr.r1, vs); + mr.r2 = v_mul(mr.r2, vs); + + mr.r0 = v_insert_w(mr.r0, _mm_set_ss(t[0])); + mr.r1 = v_insert_w(mr.r1, _mm_set_ss(t[1])); + mr.r2 = v_insert_w(mr.r2, _mm_set_ss(t[2])); + mr.r3 = v_set(0, 0, 0, 1.0f); + + m_store(m, mr); + } + + inline void mat44_lookat_impl(float* m, const float* eye, const float* target, const float* up) + { + f32x4 vEye = v_load3_pos(eye); + f32x4 vTarget = v_load3_pos(target); + f32x4 vUp = v_load3_vec(up); + + // Forward (z+) + f32x4 zaxis = v_normalize3(v_sub(vTarget, vEye)); + + // Right (x+) + f32x4 xaxis = v_normalize3(v_cross(vUp, zaxis)); + + // Up (y+) + f32x4 yaxis = v_cross(zaxis, xaxis); + + // Compute translation components: -dot(axis, eye) + f32x4 t_x = v_mul(v_dot3(xaxis, vEye), v_set1(-1.0f)); + f32x4 t_y = v_mul(v_dot3(yaxis, vEye), v_set1(-1.0f)); + f32x4 t_z = v_mul(v_dot3(zaxis, vEye), v_set1(-1.0f)); + + f32x4x4 view; + view.r0 = v_insert_w(xaxis, t_x); + view.r1 = v_insert_w(yaxis, t_y); + view.r2 = v_insert_w(zaxis, t_z); + view.r3 = v_set(0, 0, 0, 1.0f); + + m_store(m, view); + } + + inline void mat44_perspective_impl(float* m, float fovY, float aspect, float znear, float zfar) + { + float f = 1.0f / tanf(fovY * 0.5f); + float nf = 1.0f / (znear - zfar); + + f32x4x4 mp = m_zero(); + mp.r0 = v_set(f / aspect, 0, 0, 0); + mp.r1 = v_set(0, f, 0, 0); + mp.r2 = v_set(0, 0, (zfar + znear) * nf, 2 * zfar * znear * nf); + mp.r3 = 
v_set(0, 0, -1, 0); // row-major projection + m_store(m, mp); + } + + inline void mat44_orthographic_impl(float* m, float left, float right, float bottom, float top, float znear, float zfar) + { + f32x4x4 mo = m_zero(); + mo.r0 = v_set(2.0f / (right - left), 0, 0, -(right + left) / (right - left)); + mo.r1 = v_set(0, 2.0f / (top - bottom), 0, -(top + bottom) / (top - bottom)); + mo.r2 = v_set(0, 0, -2.0f / (zfar - znear), -(zfar + znear) / (zfar - znear)); + mo.r3 = v_set(0, 0, 0, 1.0f); + m_store(m, mo); + } + +} // namespace math_backend::mat44 diff --git a/Engine/source/math/impl/math_c.cpp b/Engine/source/math/impl/math_c.cpp index 7d8a317ba..3fe449a1b 100644 --- a/Engine/source/math/impl/math_c.cpp +++ b/Engine/source/math/impl/math_c.cpp @@ -1,3 +1,4 @@ +#include "platform/platform.h" #include "math/public/float4_dispatch.h" #include "math/public/float3_dispatch.h" #include "math/public/mat44_dispatch.h" @@ -196,6 +197,89 @@ namespace math_backend::mat44::dispatch swap(a[11], a[14]); }; + gMat44.determinant = [](const float* m) { + return m[0] * (m[5] * m[10] - m[6] * m[9]) + + m[4] * (m[2] * m[9] - m[1] * m[10]) + + m[8] * (m[1] * m[6] - m[2] * m[5]); + }; + + gMat44.mul_vec3 = [](const float* a, const float* b, float* r) { +#ifdef TORQUE_COMPILER_GCC + const F32 v0 = b[0], v1 = b[1], v2 = b[2]; + const F32 m0 = a[0], m1 = a[1], m2 = a[2]; + const F32 m4 = a[4], m5 = a[5], m6 = a[6]; + const F32 m8 = a[8], m9 = a[9], m10 = a[10]; + + r[0] = m0 * v0 + m1 * v1 + m2 * v2; + r[1] = m4 * v0 + m5 * v1 + m6 * v2; + r[2] = m8 * v0 + m9 * v1 + m10 * v2; +#else + r[0] = a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; + r[1] = a[4] * b[0] + a[5] * b[1] + a[6] * b[2]; + r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2]; +#endif + + }; + + gMat44.inverse = [](float* m) { + // using Cramers Rule find the Inverse + // Minv = (1/det(M)) * adjoint(M) + float det = gMat44.determinant(m); + AssertFatal(det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); + float invDet = 
1.0f / det; + float temp[16]; + temp[0] = (m[5] * m[10] - m[6] * m[9]) * invDet; + temp[1] = (m[9] * m[2] - m[10] * m[1]) * invDet; + temp[2] = (m[1] * m[6] - m[2] * m[5]) * invDet; + + temp[4] = (m[6] * m[8] - m[4] * m[10]) * invDet; + temp[5] = (m[10] * m[0] - m[8] * m[2]) * invDet; + temp[6] = (m[2] * m[4] - m[0] * m[6]) * invDet; + + temp[8] = (m[4] * m[9] - m[5] * m[8]) * invDet; + temp[9] = (m[8] * m[1] - m[9] * m[0]) * invDet; + temp[10] = (m[0] * m[5] - m[1] * m[4]) * invDet; + + m[0] = temp[0]; + m[1] = temp[1]; + m[2] = temp[2]; + + m[4] = temp[4]; + m[5] = temp[5]; + m[6] = temp[6]; + + m[8] = temp[8]; + m[9] = temp[9]; + m[10] = temp[10]; + + // invert the translation + temp[0] = -m[3]; + temp[1] = -m[7]; + temp[2] = -m[11]; + gMat44.mul_vec3(m, temp, &temp[4]); + m[3] = temp[4]; + m[7] = temp[5]; + m[11] = temp[6]; + + }; + + gMat44.affine_inverse = [](float* a) { + F32 temp[16]; + dMemcpy(temp, a, 16 * sizeof(F32)); + + // Transpose rotation + a[1] = temp[4]; + a[4] = temp[1]; + a[2] = temp[8]; + a[8] = temp[2]; + a[6] = temp[9]; + a[9] = temp[6]; + + a[3] = -(temp[0] * temp[3] + temp[4] * temp[7] + temp[8] * temp[11]); + a[7] = -(temp[1] * temp[3] + temp[5] * temp[7] + temp[9] * temp[11]); + a[11] = -(temp[2] * temp[3] + temp[6] * temp[7] + temp[10] * temp[11]); + }; + gMat44.scale = [](float* a, const float* s) { // Note, doesn't allow scaling w... @@ -204,5 +288,86 @@ namespace math_backend::mat44::dispatch a[8] *= s[0]; a[9] *= s[1]; a[10] *= s[2]; a[12] *= s[0]; a[13] *= s[1]; a[14] *= s[2]; }; + + gMat44.get_scale = [](const float* a, float* s) { + // Note, doesn't allow scaling w... 
+         s[0] = sqrtf(a[0] * a[0] + a[4] * a[4] + a[8] * a[8]);
+         s[1] = sqrtf(a[1] * a[1] + a[5] * a[5] + a[9] * a[9]);
+         s[2] = sqrtf(a[2] * a[2] + a[6] * a[6] + a[10] * a[10]);
+      };
+
+      gMat44.mul_float4 = [](const float* a, const float* b, float* r) {
+         AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!");
+         r[0] = a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
+         r[1] = a[4] * b[0] + a[5] * b[1] + a[6] * b[2] + a[7] * b[3];
+         r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2] + a[11] * b[3];
+         r[3] = a[12] * b[0] + a[13] * b[1] + a[14] * b[2] + a[15] * b[3];
+      };
+
+      gMat44.mul_pos3 = [](const float* a, const float* b, float* r) {
+         AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!");
+         r[0] = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3];
+         r[1] = a[4]*b[0] + a[5]*b[1] + a[6]*b[2] + a[7];
+         r[2] = a[8]*b[0] + a[9]*b[1] + a[10]*b[2] + a[11];
+      };
+
+      gMat44.mul_vec3 = [](const float* a, const float* b, float* r) { // NOTE(review): re-assigns mul_vec3; the TORQUE_COMPILER_GCC variant installed above is dead code — keep only one.
+         AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!");
+         r[0] = a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
+         r[1] = a[4] * b[0] + a[5] * b[1] + a[6] * b[2];
+         r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2];
+      };
+
+      gMat44.mul_mat44 = [](const float* a, const float* b, float* r) {
+         r[0] = a[0] * b[0] + a[1] * b[4] + a[2] * b[8] + a[3] * b[12];
+         r[1] = a[0] * b[1] + a[1] * b[5] + a[2] * b[9] + a[3] * b[13];
+         r[2] = a[0] * b[2] + a[1] * b[6] + a[2] * b[10] + a[3] * b[14];
+         r[3] = a[0] * b[3] + a[1] * b[7] + a[2] * b[11] + a[3] * b[15];
+
+         r[4] = a[4] * b[0] + a[5] * b[4] + a[6] * b[8] + a[7] * b[12];
+         r[5] = a[4] * b[1] + a[5] * b[5] + a[6] * b[9] + a[7] * b[13];
+         r[6] = a[4] * b[2] + a[5] * b[6] + a[6] * b[10] + a[7] * b[14];
+         r[7] = a[4] * b[3] + a[5] * b[7] + a[6] * b[11] + a[7] * b[15];
+
+         r[8] = a[8] * b[0] + a[9] * b[4] + a[10] * b[8] + a[11] * b[12];
+         r[9] = a[8] * b[1] + a[9] * b[5] + a[10] * b[9] + a[11] * b[13];
+         r[10] = a[8] * b[2] + a[9] * b[6] + a[10] * b[10] + a[11] 
* b[14];
+      r[11] = a[8] * b[3] + a[9] * b[7] + a[10] * b[11] + a[11] * b[15];
+
+      r[12] = a[12] * b[0] + a[13] * b[4] + a[14] * b[8] + a[15] * b[12];
+      r[13] = a[12] * b[1] + a[13] * b[5] + a[14] * b[9] + a[15] * b[13];
+      r[14] = a[12] * b[2] + a[13] * b[6] + a[14] * b[10] + a[15] * b[14];
+      r[15] = a[12] * b[3] + a[13] * b[7] + a[14] * b[11] + a[15] * b[15];
+   };
+
+   // Re-orthonormalize: normalize the first three columns of the row-major 4x4 in place.
+   gMat44.normalize = [](float* a) {
+      F32 col0[3], col1[3], col2[3];
+      // extract columns 0, 1 and 2
+      col0[0] = a[0];
+      col0[1] = a[4];
+      col0[2] = a[8];
+
+      col1[0] = a[1];
+      col1[1] = a[5];
+      col1[2] = a[9];
+
+      // Fix: col2 was normalized and written back without ever being read from
+      // the matrix, so column 2 picked up uninitialized stack data.
+      col2[0] = a[2];
+      col2[1] = a[6];
+      col2[2] = a[10];
+
+      math_backend::float3::dispatch::gFloat3.normalize(col0);
+      math_backend::float3::dispatch::gFloat3.normalize(col1);
+      math_backend::float3::dispatch::gFloat3.normalize(col2);
+
+      // store the normalized columns
+      a[0] = col0[0];
+      a[4] = col0[1];
+      a[8] = col0[2];
+
+      a[1] = col1[0];
+      a[5] = col1[1];
+      a[9] = col1[2];
+
+      a[2] = col2[0];
+      a[6] = col2[1];
+      a[10] = col2[2];
+
+   };
}
}
diff --git a/Engine/source/math/isa/avx/avx_intrinsics.h b/Engine/source/math/isa/avx/avx_intrinsics.h
index 0340d84a2..a67f900b4 100644
--- a/Engine/source/math/isa/avx/avx_intrinsics.h
+++ b/Engine/source/math/isa/avx/avx_intrinsics.h
@@ -1,5 +1,5 @@
 #pragma once
-#include <immintrin.h> // AVX/AVX2 intrinsics
+#include <immintrin.h> // AVX
 namespace
 {
@@ -26,11 +26,58 @@ namespace
    inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); }
+   inline f32x4 v_swizzle_mask(f32x4 v1, const int x, const int y, const int z, const int w)
+   {
+      __m128i idx = _mm_set_epi32(w, z, y, x);
+      return _mm_permutevar_ps(v1, idx);
+   }
+
+   inline f32x4 v_swizzle_singular_mask(f32x4 v1, const int x)
+   {
+      __m128i idx = _mm_set1_epi32(x);
+      return _mm_permutevar_ps(v1, idx);
+   }
+
+   inline f32x4 v_swizzle_lo(f32x4 v1)
+   {
+      return _mm_moveldup_ps(v1);
+   }
+
+   inline f32x4 v_swizzle_hi(f32x4 v1)
+   {
+      return _mm_movehdup_ps(v1);
+   }
+   inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
+   {
+      return _mm_blend_ps(newv, original,
0b1000); } + //------------------------------------------------------ + // Shuffle helpers + //------------------------------------------------------ + + inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, const int x, const int y, const int z, const int w) + { + f32x4 lo = v1; + f32x4 hi = v2; + + f32x4 combined_lo = _mm_permutevar_ps(lo, _mm_set_epi32(y, y, x, x)); + f32x4 combined_hi = _mm_permutevar_ps(hi, _mm_set_epi32(w, w, z, z)); + + return _mm_movelh_ps(combined_lo, combined_hi); + } + + inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2) + { + return _mm_movelh_ps(v1, v2); + } + + inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2) + { + return _mm_movehl_ps(v2, v1); + } + //------------------------------------------------------ // Float3 helpers (safe loading into 4 lanes) //------------------------------------------------------ @@ -54,6 +101,16 @@ namespace dst[2] = tmp[2]; } + inline f32x4 v_set(float x, float y, float z, float w) + { + return _mm_set_ps(w, z, y, x); + } + + inline f32x4 v_insert_w(f32x4 v, f32x4 w) + { + return _mm_insert_ps(v, w, 0x30); + } + //------------------------------------------------------ // Simple Arithmatic //------------------------------------------------------ @@ -70,7 +127,7 @@ namespace // Element-wise subtract inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - //------------------------------------------------------ + //------------------------------------------------------ // Fast recip //------------------------------------------------------ @@ -137,4 +194,195 @@ namespace __m128 sum = _mm_hadd_ps(a, a); return _mm_hadd_ps(sum, sum); } + + //------------------------------------------------------ + // Matrix type (row-major 4x4) + //------------------------------------------------------ + + struct f32x4x4 + { + f32x4 r0; + f32x4 r1; + f32x4 r2; + f32x4 r3; + }; + + //------------------------------------------------------ + // Matrix Load / Store + //------------------------------------------------------ + + 
inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major) + { + f32x4x4 out; + out.r0 = v_load(m + 0); + out.r1 = v_load(m + 4); + out.r2 = v_load(m + 8); + out.r3 = v_load(m + 12); + return out; + } + + inline void m_store(float* dst, const f32x4x4& m) + { + v_store(dst + 0, m.r0); + v_store(dst + 4, m.r1); + v_store(dst + 8, m.r2); + v_store(dst + 12, m.r3); + } + + inline f32x4x4 m_identity() + { + f32x4x4 m; + m.r0 = _mm_set_ps(0, 0, 0, 1); + m.r1 = _mm_set_ps(0, 0, 1, 0); + m.r2 = _mm_set_ps(0, 1, 0, 0); + m.r3 = _mm_set_ps(1, 0, 0, 0); + return m; + } + + inline f32x4x4 m_zero() + { + f32x4 z = v_zero(); + return { z, z, z, z }; + } + + //------------------------------------------------------ + // Matrix × Vector + //------------------------------------------------------ + + inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot4(m.r0, v); + f32x4 y = v_dot4(m.r1, v); + f32x4 z = v_dot4(m.r2, v); + f32x4 w = v_dot4(m.r3, v); + + // combine to a vector + return _mm_set_ps(_mm_cvtss_f32(w), + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot3(m.r0, v); + f32x4 y = v_dot3(m.r1, v); + f32x4 z = v_dot3(m.r2, v); + + // combine to a vector + return _mm_set_ps(0.0f, + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + //------------------------------------------------------ + // Matrix × Matrix + //------------------------------------------------------ + + inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b) + { + // Transpose B once for dot products + f32x4 b0 = b.r0; + f32x4 b1 = b.r1; + f32x4 b2 = b.r2; + f32x4 b3 = b.r3; + + // Transpose (SSE2) + _MM_TRANSPOSE4_PS(b0, b1, b2, b3); + + f32x4x4 C; + + auto mul_row = [&](f32x4 arow) + { + f32x4 x = v_dot4(arow, b0); + f32x4 y = v_dot4(arow, b1); + f32x4 z = v_dot4(arow, b2); + f32x4 w = v_dot4(arow, b3); + + f32x4 xy = _mm_unpacklo_ps(x, y); + f32x4 zw = _mm_unpacklo_ps(z, w); 
+ return _mm_movelh_ps(xy, zw); + }; + + C.r0 = mul_row(a.r0); + C.r1 = mul_row(a.r1); + C.r2 = mul_row(a.r2); + C.r3 = mul_row(a.r3); + + return C; + } + + //------------------------------------------------------ + // Transpose + //------------------------------------------------------ + + inline f32x4x4 m_transpose(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + f32x4 r3 = m.r3; + + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + + return { r0, r1, r2, r3 }; + } + + inline f32x4x4 m_inverse_rigid(const f32x4x4& m) + { + f32x4x4 t = m_transpose(m); + + // Extract translation + f32x4 T = v_set( + _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))), + 0.0f + ); + + f32x4 newT = m_mul_vec4(t, T); + newT = _mm_sub_ps(v_zero(), newT); + + t.r0 = v_preserve_w(t.r0, newT); + t.r1 = v_preserve_w(t.r1, newT); + t.r2 = v_preserve_w(t.r2, newT); + t.r3 = v_set(0, 0, 0, 1); + + return t; + } + + inline f32x4 m_determinant(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(c, d); + f32x4 c1 = v_cross(d, b); + f32x4 c2 = v_cross(b, c); + + f32x4 term0 = _mm_mul_ps(a, c0); + f32x4 term1 = _mm_mul_ps(a, c1); + f32x4 term2 = _mm_mul_ps(a, c2); + + f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2)); + + return v_hadd4(det); + } + + inline f32x4 m_determinant_affine(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); + + f32x4 c0 = v_cross(r1, r2); + return v_dot3(r0, c0); // splatted determinant + } } diff --git a/Engine/source/math/isa/avx/mat44.cpp b/Engine/source/math/isa/avx/mat44.cpp new file mode 100644 index 000000000..6634b652c --- /dev/null +++ b/Engine/source/math/isa/avx/mat44.cpp @@ -0,0 
+1,22 @@ +#include "avx_intrinsics.h" +#include "mat44_dispatch.h" + +#include "mat44_impl.inl" + +namespace math_backend::mat44::dispatch +{ + void install_avx() + { + gMat44.transpose = mat44_transpose_impl; + gMat44.inverse = mat44_inverse_impl; + gMat44.affine_inverse = mat44_affine_inverse_impl; + gMat44.mul_mat44 = mat44_mul_mat44_impl; + gMat44.mul_pos3 = mat44_mul_pos3_impl; + gMat44.mul_vec3 = mat44_mul_vec3_impl; + gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.scale = mat44_scale_impl; + gMat44.get_scale = mat44_get_scale_impl; + gMat44.normalize = mat44_normalize_impl; + gMat44.determinant = mat44_get_determinant; + } +} diff --git a/Engine/source/math/isa/avx2/avx2_intrinsics.h b/Engine/source/math/isa/avx2/avx2_intrinsics.h index 0340d84a2..67f0df04a 100644 --- a/Engine/source/math/isa/avx2/avx2_intrinsics.h +++ b/Engine/source/math/isa/avx2/avx2_intrinsics.h @@ -1,5 +1,5 @@ #pragma once -#include // AVX/AVX2 intrinsics +#include // AVX2 namespace { @@ -26,11 +26,58 @@ namespace inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); } + inline f32x4 v_swizzle_mask(f32x4 v1,const int x, const int y, const int z, const int w) + { + __m128i idx = _mm_set_epi32(w, z, y, x); + return _mm_permutevar_ps(v1, idx); + } + + inline f32x4 v_swizzle_singular_mask(f32x4 v1, const int x) + { + __m128i idx = _mm_set1_epi32(x); + return _mm_permutevar_ps(v1, idx); + } + + inline f32x4 v_swizzle_lo(f32x4 v1) + { + return _mm_moveldup_ps(v1); + } + + inline f32x4 v_swizzle_hi(f32x4 v1) + { + return _mm_movehdup_ps(v1); + } + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) { return _mm_blend_ps(newv, original, 0b1000); } + //------------------------------------------------------ + // Shuffle helpers + //------------------------------------------------------ + + inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w) + { + f32x4 lo = v1; + f32x4 hi = v2; + + f32x4 combined_lo = _mm_permutevar_ps(lo, 
_mm_set_epi32(y, y, x, x)); + f32x4 combined_hi = _mm_permutevar_ps(hi, _mm_set_epi32(w, w, z, z)); + + return _mm_movelh_ps(combined_lo, combined_hi); + } + + inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2) + { + return _mm_movelh_ps(v1, v2); + } + + inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2) + { + return _mm_movehl_ps(v2, v1); + } + //------------------------------------------------------ // Float3 helpers (safe loading into 4 lanes) //------------------------------------------------------ @@ -54,6 +101,16 @@ namespace dst[2] = tmp[2]; } + inline f32x4 v_set(float x, float y, float z, float w) + { + return _mm_set_ps(w, z, y, x); + } + + inline f32x4 v_insert_w(f32x4 v, f32x4 w) + { + return _mm_insert_ps(v, w, 0x30); + } + //------------------------------------------------------ // Simple Arithmatic //------------------------------------------------------ @@ -70,7 +127,7 @@ namespace // Element-wise subtract inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - //------------------------------------------------------ + //------------------------------------------------------ // Fast recip //------------------------------------------------------ @@ -137,4 +194,195 @@ namespace __m128 sum = _mm_hadd_ps(a, a); return _mm_hadd_ps(sum, sum); } + + //------------------------------------------------------ + // Matrix type (row-major 4x4) + //------------------------------------------------------ + + struct f32x4x4 + { + f32x4 r0; + f32x4 r1; + f32x4 r2; + f32x4 r3; + }; + + //------------------------------------------------------ + // Matrix Load / Store + //------------------------------------------------------ + + inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major) + { + f32x4x4 out; + out.r0 = v_load(m + 0); + out.r1 = v_load(m + 4); + out.r2 = v_load(m + 8); + out.r3 = v_load(m + 12); + return out; + } + + inline void m_store(float* dst, const f32x4x4& m) + { + v_store(dst + 0, m.r0); + v_store(dst + 4, m.r1); + v_store(dst + 
8, m.r2); + v_store(dst + 12, m.r3); + } + + inline f32x4x4 m_identity() + { + f32x4x4 m; + m.r0 = _mm_set_ps(0, 0, 0, 1); + m.r1 = _mm_set_ps(0, 0, 1, 0); + m.r2 = _mm_set_ps(0, 1, 0, 0); + m.r3 = _mm_set_ps(1, 0, 0, 0); + return m; + } + + inline f32x4x4 m_zero() + { + f32x4 z = v_zero(); + return { z, z, z, z }; + } + + //------------------------------------------------------ + // Matrix × Vector + //------------------------------------------------------ + + inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot4(m.r0, v); + f32x4 y = v_dot4(m.r1, v); + f32x4 z = v_dot4(m.r2, v); + f32x4 w = v_dot4(m.r3, v); + + // combine to a vector + return _mm_set_ps(_mm_cvtss_f32(w), + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot3(m.r0, v); + f32x4 y = v_dot3(m.r1, v); + f32x4 z = v_dot3(m.r2, v); + + // combine to a vector + return _mm_set_ps(0.0f, + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + //------------------------------------------------------ + // Matrix × Matrix + //------------------------------------------------------ + + inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b) + { + // Transpose B once for dot products + f32x4 b0 = b.r0; + f32x4 b1 = b.r1; + f32x4 b2 = b.r2; + f32x4 b3 = b.r3; + + // Transpose (SSE2) + _MM_TRANSPOSE4_PS(b0, b1, b2, b3); + + f32x4x4 C; + + auto mul_row = [&](f32x4 arow) + { + f32x4 x = v_dot4(arow, b0); + f32x4 y = v_dot4(arow, b1); + f32x4 z = v_dot4(arow, b2); + f32x4 w = v_dot4(arow, b3); + + f32x4 xy = _mm_unpacklo_ps(x, y); + f32x4 zw = _mm_unpacklo_ps(z, w); + return _mm_movelh_ps(xy, zw); + }; + + C.r0 = mul_row(a.r0); + C.r1 = mul_row(a.r1); + C.r2 = mul_row(a.r2); + C.r3 = mul_row(a.r3); + + return C; + } + + //------------------------------------------------------ + // Transpose + //------------------------------------------------------ + + inline f32x4x4 m_transpose(const f32x4x4& m) + 
{ + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + f32x4 r3 = m.r3; + + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + + return { r0, r1, r2, r3 }; + } + + inline f32x4x4 m_inverse_rigid(const f32x4x4& m) + { + f32x4x4 t = m_transpose(m); + + // Extract translation + f32x4 T = v_set( + _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))), + 0.0f + ); + + f32x4 newT = m_mul_vec4(t, T); + newT = _mm_sub_ps(v_zero(), newT); + + t.r0 = v_preserve_w(t.r0, newT); + t.r1 = v_preserve_w(t.r1, newT); + t.r2 = v_preserve_w(t.r2, newT); + t.r3 = v_set(0, 0, 0, 1); + + return t; + } + + inline f32x4 m_determinant(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(c, d); + f32x4 c1 = v_cross(d, b); + f32x4 c2 = v_cross(b, c); + + f32x4 term0 = _mm_mul_ps(a, c0); + f32x4 term1 = _mm_mul_ps(a, c1); + f32x4 term2 = _mm_mul_ps(a, c2); + + f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2)); + + return v_hadd4(det); + } + + inline f32x4 m_determinant_affine(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); + + f32x4 c0 = v_cross(r1, r2); + return v_dot3(r0, c0); // splatted determinant + } } diff --git a/Engine/source/math/isa/avx2/mat44.cpp b/Engine/source/math/isa/avx2/mat44.cpp new file mode 100644 index 000000000..11e842174 --- /dev/null +++ b/Engine/source/math/isa/avx2/mat44.cpp @@ -0,0 +1,22 @@ +#include "avx2_intrinsics.h" +#include "mat44_dispatch.h" + +#include "mat44_impl.inl" + +namespace math_backend::mat44::dispatch +{ + void install_avx2() + { + gMat44.transpose = mat44_transpose_impl; + gMat44.inverse = mat44_inverse_impl; + gMat44.affine_inverse = mat44_affine_inverse_impl; + gMat44.mul_mat44 = 
mat44_mul_mat44_impl; + gMat44.mul_pos3 = mat44_mul_pos3_impl; + gMat44.mul_vec3 = mat44_mul_vec3_impl; + gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.scale = mat44_scale_impl; + gMat44.get_scale = mat44_get_scale_impl; + gMat44.normalize = mat44_normalize_impl; + gMat44.determinant = mat44_get_determinant; + } +} diff --git a/Engine/source/math/isa/sse2/mat44.cpp b/Engine/source/math/isa/sse2/mat44.cpp new file mode 100644 index 000000000..6738ee391 --- /dev/null +++ b/Engine/source/math/isa/sse2/mat44.cpp @@ -0,0 +1,22 @@ +#include "sse2_intrinsics.h" +#include "mat44_dispatch.h" + +#include "mat44_impl.inl" + +namespace math_backend::mat44::dispatch +{ + void install_sse2() + { + gMat44.transpose = mat44_transpose_impl; + gMat44.inverse = mat44_inverse_impl; + gMat44.affine_inverse = mat44_affine_inverse_impl; + gMat44.mul_mat44 = mat44_mul_mat44_impl; + gMat44.mul_pos3 = mat44_mul_pos3_impl; + gMat44.mul_vec3 = mat44_mul_vec3_impl; + gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.scale = mat44_scale_impl; + gMat44.get_scale = mat44_get_scale_impl; + gMat44.normalize = mat44_normalize_impl; + gMat44.determinant = mat44_get_determinant; + } +} diff --git a/Engine/source/math/isa/sse2/sse2_intrinsics.h b/Engine/source/math/isa/sse2/sse2_intrinsics.h index 85b09ebb0..71a95c1b7 100644 --- a/Engine/source/math/isa/sse2/sse2_intrinsics.h +++ b/Engine/source/math/isa/sse2/sse2_intrinsics.h @@ -7,7 +7,7 @@ namespace typedef __m128 f32x4; //------------------------------------------------------ - // Load / Store + // Vector/Point Load / Store //------------------------------------------------------ // Load 4 floats from memory into a SIMD register @@ -35,6 +35,17 @@ namespace return _mm_set_ps(1.0f, p[2], p[1], p[0]); } + inline f32x4 v_set(float x, float y, float z, float w) + { + return _mm_set_ps(w, z, y, x); + } + + inline f32x4 v_insert_w(f32x4 v, f32x4 w) + { + f32x4 mask = _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)); + return _mm_or_ps( 
_mm_and_ps(mask, w), _mm_andnot_ps(mask, v)); + } + inline void v_store3(float* dst, f32x4 v) { alignas(16) float tmp[4]; // temp storage @@ -50,12 +61,70 @@ namespace inline f32x4 v_mask_xyz() { return _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)); } + inline f32x4 v_swizzle_mask(f32x4 v, int x, int y, int z, int w) + { + alignas(16) float tmp[4]; + _mm_store_ps(tmp, v); + + return _mm_set_ps( + tmp[w], + tmp[z], + tmp[y], + tmp[x] + ); + } + + inline f32x4 v_swizzle_singular_mask(f32x4 v, int x) + { + alignas(16) float tmp[4]; + _mm_store_ps(tmp, v); + return _mm_set1_ps(tmp[x]); + } + + inline f32x4 v_swizzle_lo(f32x4 v1) + { + return _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2,2,0,0)); + } + + inline f32x4 v_swizzle_hi(f32x4 v1) + { + return _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 3, 1, 1)); + } + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) { - f32x4 mask = _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)); + f32x4 mask = _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)); // lane3 = 1 return _mm_or_ps(_mm_and_ps(mask, original), _mm_andnot_ps(mask, newv)); } + //------------------------------------------------------ + // Shuffle helpers + //------------------------------------------------------ + + inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w) + { + alignas(16) float a[4], b[4]; + _mm_store_ps(a, v1); + _mm_store_ps(b, v2); + + return _mm_set_ps( + b[w], + b[z], + a[y], + a[x] + ); + } + + inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2) + { + return _mm_movelh_ps(v1, v2); + } + + inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2) + { + return _mm_movehl_ps(v2, v1); + } + //------------------------------------------------------ // Simple Arithmatic @@ -73,6 +142,13 @@ namespace // Element-wise subtract inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + // Absolute value. 
+ inline f32x4 v_abs(f32x4 v) + { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(v, mask); + } + //------------------------------------------------------ // Fast recip //------------------------------------------------------ @@ -153,4 +229,292 @@ namespace shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)); return _mm_add_ps(sums, shuf); } + + //------------------------------------------------------ + // Matrix type (row-major 4x4) + //------------------------------------------------------ + + struct f32x4x4 + { + f32x4 r0; + f32x4 r1; + f32x4 r2; + f32x4 r3; + }; + + //------------------------------------------------------ + // Matrix Load / Store + //------------------------------------------------------ + + inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major) + { + f32x4x4 out; + out.r0 = v_load(m + 0); + out.r1 = v_load(m + 4); + out.r2 = v_load(m + 8); + out.r3 = v_load(m + 12); + return out; + } + + inline void m_store(float* dst, const f32x4x4& m) + { + v_store(dst + 0, m.r0); + v_store(dst + 4, m.r1); + v_store(dst + 8, m.r2); + v_store(dst + 12, m.r3); + } + + inline f32x4x4 m_identity() + { + f32x4x4 m; + m.r0 = _mm_set_ps(0, 0, 0, 1); + m.r1 = _mm_set_ps(0, 0, 1, 0); + m.r2 = _mm_set_ps(0, 1, 0, 0); + m.r3 = _mm_set_ps(1, 0, 0, 0); + return m; + } + + inline f32x4x4 m_zero() + { + f32x4 z = v_zero(); + return { z, z, z, z }; + } + + //------------------------------------------------------ + // Matrix × Vector + //------------------------------------------------------ + + inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot4(m.r0, v); + f32x4 y = v_dot4(m.r1, v); + f32x4 z = v_dot4(m.r2, v); + f32x4 w = v_dot4(m.r3, v); + + // Pack lowest lane of each into a vector + f32x4 xy = _mm_unpacklo_ps(x, y); + f32x4 zw = _mm_unpacklo_ps(z, w); + return _mm_movelh_ps(xy, zw); + } + + inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot3(m.r0, v); + f32x4 y = 
v_dot3(m.r1, v); + f32x4 z = v_dot3(m.r2, v); + + // combine to a vector + return _mm_set_ps(0.0f, + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + //------------------------------------------------------ + // Matrix × Matrix + //------------------------------------------------------ + + inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b) + { + // Transpose B once for dot products + f32x4 b0 = b.r0; + f32x4 b1 = b.r1; + f32x4 b2 = b.r2; + f32x4 b3 = b.r3; + + // Transpose (SSE2) + _MM_TRANSPOSE4_PS(b0, b1, b2, b3); + + f32x4x4 C; + + auto mul_row = [&](f32x4 arow) + { + f32x4 x = v_dot4(arow, b0); + f32x4 y = v_dot4(arow, b1); + f32x4 z = v_dot4(arow, b2); + f32x4 w = v_dot4(arow, b3); + + f32x4 xy = _mm_unpacklo_ps(x, y); + f32x4 zw = _mm_unpacklo_ps(z, w); + return _mm_movelh_ps(xy, zw); + }; + + C.r0 = mul_row(a.r0); + C.r1 = mul_row(a.r1); + C.r2 = mul_row(a.r2); + C.r3 = mul_row(a.r3); + + return C; + } + + //------------------------------------------------------ + // Transpose + //------------------------------------------------------ + + inline f32x4x4 m_transpose(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + f32x4 r3 = m.r3; + + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + + return { r0, r1, r2, r3 }; + } + + inline void m_set3x3_transpose(float* dst, f32x4 c0, f32x4 c1, f32x4 c2) + { + dst[0] = v_extract0(c0); // c0.x + dst[1] = v_extract0(c1); // c1.x + dst[2] = v_extract0(c2); // c2.x + + dst[4] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 1))); // c0.y + dst[5] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 1))); // c1.y + dst[6] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 1))); // c2.y + + dst[8] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 2))); // c0.z + dst[9] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 2))); // c1.z + dst[10] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 2))); // c2.z + } + + inline f32x4x4 
m_inverse(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(b, c); + f32x4 c1 = v_cross(c, d); + f32x4 c2 = v_cross(d, a); + f32x4 c3 = v_cross(a, b); + + f32x4 det = v_dot4(a, c1); + f32x4 invDet = v_rcp_nr(det); + + f32x4x4 adj; + adj.r0 = _mm_mul_ps(c1, invDet); + adj.r1 = _mm_mul_ps(c2, invDet); + adj.r2 = _mm_mul_ps(c3, invDet); + adj.r3 = _mm_mul_ps(c0, invDet); + + return m_transpose(adj); + } + + inline f32x4x4 m_inverse_rigid(const f32x4x4& m) + { + f32x4x4 t = m_transpose(m); + + // Extract translation + f32x4 T = v_set( + _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))), + 0.0f + ); + + f32x4 newT = m_mul_vec4(t, T); + newT = _mm_sub_ps(v_zero(), newT); + + t.r0 = v_preserve_w(t.r0, newT); + t.r1 = v_preserve_w(t.r1, newT); + t.r2 = v_preserve_w(t.r2, newT); + t.r3 = v_set(0, 0, 0, 1); + + return t; + } + + inline f32x4x4 m_inverse_affine(const f32x4x4& m) + { + // Extract upper 3x3 rows + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + // Zero translation + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); + + // Compute cofactors via cross products + f32x4 c0 = v_cross(r1, r2); + f32x4 c1 = v_cross(r2, r0); + f32x4 c2 = v_cross(r0, r1); + + // Determinant + f32x4 det = v_dot3(r0, c0); + + // Reciprocal determinant + f32x4 invDet = v_rcp_nr(det); + + // Inverse 3x3 (transpose of cofactor matrix) + f32x4 i0 = _mm_mul_ps(c0, invDet); + f32x4 i1 = _mm_mul_ps(c1, invDet); + f32x4 i2 = _mm_mul_ps(c2, invDet); + + // Transpose 3x3 + f32x4 t0 = i0; + f32x4 t1 = i1; + f32x4 t2 = i2; + f32x4 t3 = _mm_setzero_ps(); + + _MM_TRANSPOSE4_PS(t0, t1, t2, t3); + + // Extract translation + alignas(16) float tmp[4]; + tmp[0] = _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 
3, 3, 3))); + tmp[1] = _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[2] = _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[3] = 0.0f; + + f32x4 T = v_load(tmp); + + // New translation = -R' * T + f32x4 newT = m_mul_vec4({ t0, t1, t2, t3 }, T); + newT = _mm_sub_ps(v_zero(), newT); + + // Assemble final matrix + f32x4x4 out; + out.r0 = v_preserve_w(t0, newT); + out.r1 = v_preserve_w(t1, newT); + out.r2 = v_preserve_w(t2, newT); + out.r3 = _mm_set_ps(1, 0, 0, 0); + + return out; + } + + inline f32x4 m_determinant(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(c, d); + f32x4 c1 = v_cross(d, b); + f32x4 c2 = v_cross(b, c); + + f32x4 term0 = _mm_mul_ps(a, c0); + f32x4 term1 = _mm_mul_ps(a, c1); + f32x4 term2 = _mm_mul_ps(a, c2); + + f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2)); + + return v_hadd4(det); + } + + inline f32x4 m_determinant_affine(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); + + f32x4 c0 = v_cross(r1, r2); + return v_dot3(r0, c0); // splatted determinant + } } diff --git a/Engine/source/math/isa/sse41/mat44.cpp b/Engine/source/math/isa/sse41/mat44.cpp new file mode 100644 index 000000000..94bc5e4b2 --- /dev/null +++ b/Engine/source/math/isa/sse41/mat44.cpp @@ -0,0 +1,22 @@ +#include "sse41_intrinsics.h" +#include "mat44_dispatch.h" + +#include "mat44_impl.inl" + +namespace math_backend::mat44::dispatch +{ + void install_sse41() + { + gMat44.transpose = mat44_transpose_impl; + gMat44.inverse = mat44_inverse_impl; + gMat44.affine_inverse = mat44_affine_inverse_impl; + gMat44.mul_mat44 = mat44_mul_mat44_impl; + gMat44.mul_pos3 = mat44_mul_pos3_impl; + gMat44.mul_vec3 = mat44_mul_vec3_impl; + gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.scale = mat44_scale_impl; + 
gMat44.get_scale = mat44_get_scale_impl; + gMat44.normalize = mat44_normalize_impl; + gMat44.determinant = mat44_get_determinant; + } +} diff --git a/Engine/source/math/isa/sse41/sse41_intrinsics.h b/Engine/source/math/isa/sse41/sse41_intrinsics.h index 047cb44ee..2ea63e6b5 100644 --- a/Engine/source/math/isa/sse41/sse41_intrinsics.h +++ b/Engine/source/math/isa/sse41/sse41_intrinsics.h @@ -26,11 +26,70 @@ namespace inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); } + inline f32x4 v_swizzle_mask(f32x4 v, int x, int y, int z, int w) + { + alignas(16) float tmp[4]; + _mm_store_ps(tmp, v); + + return _mm_set_ps( + tmp[w], + tmp[z], + tmp[y], + tmp[x] + ); + } + + inline f32x4 v_swizzle_singular_mask(f32x4 v, int x) + { + alignas(16) float tmp[4]; + _mm_store_ps(tmp, v); + return _mm_set1_ps(tmp[x]); + } + + inline f32x4 v_swizzle_lo(f32x4 v1) + { + return _mm_moveldup_ps(v1); + } + + inline f32x4 v_swizzle_hi(f32x4 v1) + { + return _mm_movehdup_ps(v1); + } + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) { return _mm_blend_ps(newv, original, 0b1000); } + //------------------------------------------------------ + // Shuffle helpers + //------------------------------------------------------ + + inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w) + { + alignas(16) float a[4], b[4]; + _mm_store_ps(a, v1); + _mm_store_ps(b, v2); + + return _mm_set_ps( + b[w], + b[z], + a[y], + a[x] + ); + } + + inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2) + { + return _mm_movelh_ps(v1, v2); + } + + inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2) + { + return _mm_movehl_ps(v2, v1); + } + + //------------------------------------------------------ // Float3 helpers (safe loading into 4 lanes) //------------------------------------------------------ @@ -54,6 +113,16 @@ namespace dst[2] = tmp[2]; } + inline f32x4 v_set(float x, float y, float z, float w) + { + return _mm_set_ps(w, z, y, x); + } + + inline f32x4 
v_insert_w(f32x4 v, f32x4 w) + { + return _mm_insert_ps(v, w, 0x30); + } + //------------------------------------------------------ // Simple Arithmatic //------------------------------------------------------ @@ -70,7 +139,7 @@ namespace // Element-wise subtract inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - //------------------------------------------------------ + //------------------------------------------------------ // Fast recip //------------------------------------------------------ @@ -104,13 +173,13 @@ namespace // full dot4 inline f32x4 v_dot4(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1 + return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into lane 1 } // dot3 (ignores w) inline f32x4 v_dot3(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1 + return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into lane 1 } // cross product xyz only. @@ -137,4 +206,293 @@ namespace __m128 sum = _mm_hadd_ps(a, a); return _mm_hadd_ps(sum, sum); } + + //------------------------------------------------------ + // Matrix type (row-major 4x4) + //------------------------------------------------------ + + struct f32x4x4 + { + f32x4 r0; + f32x4 r1; + f32x4 r2; + f32x4 r3; + }; + + //------------------------------------------------------ + // Matrix Load / Store + //------------------------------------------------------ + + inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major) + { + f32x4x4 out; + out.r0 = v_load(m + 0); + out.r1 = v_load(m + 4); + out.r2 = v_load(m + 8); + out.r3 = v_load(m + 12); + return out; + } + + inline void m_store(float* dst, const f32x4x4& m) + { + _mm_storeu_ps(dst + 0, m.r0); + _mm_storeu_ps(dst + 4, m.r1); + _mm_storeu_ps(dst + 8, m.r2); + _mm_storeu_ps(dst + 12, m.r3); + } + + inline f32x4x4 m_identity() + { + f32x4x4 m; + m.r0 = _mm_set_ps(0, 0, 0, 1); + m.r1 = _mm_set_ps(0, 0, 1, 0); + m.r2 = _mm_set_ps(0, 1, 0, 0); + m.r3 = 
_mm_set_ps(1, 0, 0, 0); + return m; + } + + inline f32x4x4 m_zero() + { + f32x4 z = v_zero(); + return { z, z, z, z }; + } + + //------------------------------------------------------ + // Matrix × Vector + //------------------------------------------------------ + + inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot4(m.r0, v); + f32x4 y = v_dot4(m.r1, v); + f32x4 z = v_dot4(m.r2, v); + f32x4 w = v_dot4(m.r3, v); + + // combine to a vector + return _mm_set_ps(_mm_cvtss_f32(w), + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot3(m.r0, v); + f32x4 y = v_dot3(m.r1, v); + f32x4 z = v_dot3(m.r2, v); + + // combine to a vector + return _mm_set_ps(0.0f, + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + //------------------------------------------------------ + // Matrix × Matrix + //------------------------------------------------------ + + inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b) + { + // Transpose B once for dot products + f32x4 b0 = b.r0; + f32x4 b1 = b.r1; + f32x4 b2 = b.r2; + f32x4 b3 = b.r3; + + // Transpose (SSE2) + _MM_TRANSPOSE4_PS(b0, b1, b2, b3); + + f32x4x4 C; + + auto mul_row = [&](f32x4 arow) + { + f32x4 x = v_dot4(arow, b0); + f32x4 y = v_dot4(arow, b1); + f32x4 z = v_dot4(arow, b2); + f32x4 w = v_dot4(arow, b3); + + f32x4 xy = _mm_unpacklo_ps(x, y); + f32x4 zw = _mm_unpacklo_ps(z, w); + return _mm_movelh_ps(xy, zw); + }; + + C.r0 = mul_row(a.r0); + C.r1 = mul_row(a.r1); + C.r2 = mul_row(a.r2); + C.r3 = mul_row(a.r3); + + return C; + } + + //------------------------------------------------------ + // Transpose + //------------------------------------------------------ + + inline f32x4x4 m_transpose(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + f32x4 r3 = m.r3; + + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + + return { r0, r1, r2, r3 }; + } + + inline void m_set3x3_transpose(float* 
dst, f32x4 c0, f32x4 c1, f32x4 c2) + { + dst[0] = v_extract0(c0); // c0.x + dst[1] = v_extract0(c1); // c1.x + dst[2] = v_extract0(c2); // c2.x + + dst[4] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 1))); // c0.y + dst[5] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 1))); // c1.y + dst[6] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 1))); // c2.y + + dst[8] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 2))); // c0.z + dst[9] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 2))); // c1.z + dst[10] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 2))); // c2.z + } + + inline f32x4x4 m_inverse(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(b, c); + f32x4 c1 = v_cross(c, d); + f32x4 c2 = v_cross(d, a); + f32x4 c3 = v_cross(a, b); + + f32x4 det = v_dot4(a, c1); + f32x4 invDet = v_rcp_nr(det); + + f32x4x4 adj; + adj.r0 = _mm_mul_ps(c1, invDet); + adj.r1 = _mm_mul_ps(c2, invDet); + adj.r2 = _mm_mul_ps(c3, invDet); + adj.r3 = _mm_mul_ps(c0, invDet); + + return m_transpose(adj); + } + + inline f32x4x4 m_inverse_rigid(const f32x4x4& m) + { + f32x4x4 t = m_transpose(m); + + // Extract translation + f32x4 T = v_set( + _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))), + 0.0f + ); + + f32x4 newT = m_mul_vec4(t, T); + newT = _mm_sub_ps(v_zero(), newT); + + t.r0 = v_preserve_w(t.r0, newT); + t.r1 = v_preserve_w(t.r1, newT); + t.r2 = v_preserve_w(t.r2, newT); + t.r3 = v_set(0, 0, 0, 1); + + return t; + } + + inline f32x4x4 m_inverse_affine(const f32x4x4& m) + { + // Extract upper 3x3 rows + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + // Zero translation + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); 
+ + // Compute cofactors via cross products + f32x4 c0 = v_cross(r1, r2); + f32x4 c1 = v_cross(r2, r0); + f32x4 c2 = v_cross(r0, r1); + + // Determinant + f32x4 det = v_dot3(r0, c0); + + // Reciprocal determinant + f32x4 invDet = v_rcp_nr(det); + + // Inverse 3x3 (transpose of cofactor matrix) + f32x4 i0 = _mm_mul_ps(c0, invDet); + f32x4 i1 = _mm_mul_ps(c1, invDet); + f32x4 i2 = _mm_mul_ps(c2, invDet); + + // Transpose 3x3 + f32x4 t0 = i0; + f32x4 t1 = i1; + f32x4 t2 = i2; + f32x4 t3 = _mm_setzero_ps(); + + _MM_TRANSPOSE4_PS(t0, t1, t2, t3); + + // Extract translation + alignas(16) float tmp[4]; + tmp[0] = _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[1] = _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[2] = _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[3] = 0.0f; + + f32x4 T = v_load(tmp); + + // New translation = -R' * T + f32x4 newT = m_mul_vec4({ t0, t1, t2, t3 }, T); + newT = _mm_sub_ps(v_zero(), newT); + + // Assemble final matrix + f32x4x4 out; + out.r0 = v_preserve_w(t0, newT); + out.r1 = v_preserve_w(t1, newT); + out.r2 = v_preserve_w(t2, newT); + out.r3 = _mm_set_ps(1, 0, 0, 0); + + return out; + } + + inline f32x4 m_determinant(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(c, d); + f32x4 c1 = v_cross(d, b); + f32x4 c2 = v_cross(b, c); + + f32x4 term0 = _mm_mul_ps(a, c0); + f32x4 term1 = _mm_mul_ps(a, c1); + f32x4 term2 = _mm_mul_ps(a, c2); + + f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2)); + + return v_hadd4(det); + } + + inline f32x4 m_determinant_affine(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); + + f32x4 c0 = v_cross(r1, r2); + return v_dot3(r0, c0); // splatted determinant + } } diff --git a/Engine/source/math/mMathFn.h 
b/Engine/source/math/mMathFn.h index 9f84b37df..e9e7d3840 100644 --- a/Engine/source/math/mMathFn.h +++ b/Engine/source/math/mMathFn.h @@ -76,13 +76,11 @@ extern void (*m_quatF_set_matF)( F32 x, F32 y, F32 z, F32 w, F32* m ); extern void (*m_matF_set_euler)(const F32 *e, F32 *result); extern void (*m_matF_set_euler_point)(const F32 *e, const F32 *p, F32 *result); extern void (*m_matF_identity)(F32 *m); -extern void (*m_matF_inverse)(F32 *m); -extern void (*m_matF_invert_to)(const F32 *m, F32 *d); -extern void (*m_matF_affineInverse)(F32 *m); +extern void (*m_matF_invert_to)(const F32* m, F32* d); extern void (*m_matF_transpose)(F32 *m); extern void (*m_matF_scale)(F32 *m,const F32* p); extern void (*m_matF_normalize)(F32 *m); -extern F32 (*m_matF_determinant)(const F32 *m); +extern F32(*m_matF_determinant)(const F32* m); extern void (*m_matF_x_matF)(const F32 *a, const F32 *b, F32 *mresult); extern void (*m_matF_x_matF_aligned)(const F32 *a, const F32 *b, F32 *mresult); // extern void (*m_matF_x_point3F)(const F32 *m, const F32 *p, F32 *presult); diff --git a/Engine/source/math/mMath_C.cpp b/Engine/source/math/mMath_C.cpp index 7fac832f9..ad5aed0db 100644 --- a/Engine/source/math/mMath_C.cpp +++ b/Engine/source/math/mMath_C.cpp @@ -464,71 +464,26 @@ static void affineInvertTo(const F32 * m, F32 * out) } #endif -//-------------------------------------- -static void m_matF_inverse_C(F32 *m) +static void m_matF_invert_to_C(const F32* m, F32* d) { // using Cramers Rule find the Inverse // Minv = (1/det(M)) * adjoint(M) - F32 det = m_matF_determinant( m ); - AssertFatal( det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); + F32 det = m_matF_determinant(m); + AssertFatal(det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); - F32 invDet = 1.0f/det; - F32 temp[16]; + F32 invDet = 1.0f / det; - temp[0] = (m[5] * m[10]- m[6] * m[9]) * invDet; - temp[1] = (m[9] * m[2] - m[10]* m[1]) * invDet; - temp[2] = (m[1] * m[6] - m[2] * m[5]) * invDet; - - 
temp[4] = (m[6] * m[8] - m[4] * m[10])* invDet; - temp[5] = (m[10]* m[0] - m[8] * m[2]) * invDet; - temp[6] = (m[2] * m[4] - m[0] * m[6]) * invDet; - - temp[8] = (m[4] * m[9] - m[5] * m[8]) * invDet; - temp[9] = (m[8] * m[1] - m[9] * m[0]) * invDet; - temp[10]= (m[0] * m[5] - m[1] * m[4]) * invDet; - - m[0] = temp[0]; - m[1] = temp[1]; - m[2] = temp[2]; - - m[4] = temp[4]; - m[5] = temp[5]; - m[6] = temp[6]; - - m[8] = temp[8]; - m[9] = temp[9]; - m[10] = temp[10]; - - // invert the translation - temp[0] = -m[3]; - temp[1] = -m[7]; - temp[2] = -m[11]; - m_matF_x_vectorF(m, temp, &temp[4]); - m[3] = temp[4]; - m[7] = temp[5]; - m[11]= temp[6]; -} - -static void m_matF_invert_to_C(const F32 *m, F32 *d) -{ - // using Cramers Rule find the Inverse - // Minv = (1/det(M)) * adjoint(M) - F32 det = m_matF_determinant( m ); - AssertFatal( det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); - - F32 invDet = 1.0f/det; - - d[0] = (m[5] * m[10]- m[6] * m[9]) * invDet; - d[1] = (m[9] * m[2] - m[10]* m[1]) * invDet; + d[0] = (m[5] * m[10] - m[6] * m[9]) * invDet; + d[1] = (m[9] * m[2] - m[10] * m[1]) * invDet; d[2] = (m[1] * m[6] - m[2] * m[5]) * invDet; - d[4] = (m[6] * m[8] - m[4] * m[10])* invDet; - d[5] = (m[10]* m[0] - m[8] * m[2]) * invDet; + d[4] = (m[6] * m[8] - m[4] * m[10]) * invDet; + d[5] = (m[10] * m[0] - m[8] * m[2]) * invDet; d[6] = (m[2] * m[4] - m[0] * m[6]) * invDet; d[8] = (m[4] * m[9] - m[5] * m[8]) * invDet; d[9] = (m[8] * m[1] - m[9] * m[0]) * invDet; - d[10]= (m[0] * m[5] - m[1] * m[4]) * invDet; + d[10] = (m[0] * m[5] - m[1] * m[4]) * invDet; // invert the translation F32 temp[6]; @@ -538,33 +493,13 @@ static void m_matF_invert_to_C(const F32 *m, F32 *d) m_matF_x_vectorF(d, temp, &temp[3]); d[3] = temp[3]; d[7] = temp[4]; - d[11]= temp[5]; - d[ 12 ] = m[ 12 ]; - d[ 13 ] = m[ 13 ]; - d[ 14 ] = m[ 14 ]; - d[ 15 ] = m[ 15 ]; + d[11] = temp[5]; + d[12] = m[12]; + d[13] = m[13]; + d[14] = m[14]; + d[15] = m[15]; } 
-//-------------------------------------- -static void m_matF_affineInverse_C(F32 *m) -{ - // Matrix class checks to make sure this is an affine transform before calling - // this function, so we can proceed assuming it is... - F32 temp[16]; - dMemcpy(temp, m, 16 * sizeof(F32)); - - // Transpose rotation - m[1] = temp[4]; - m[4] = temp[1]; - m[2] = temp[8]; - m[8] = temp[2]; - m[6] = temp[9]; - m[9] = temp[6]; - - m[3] = -(temp[0]*temp[3] + temp[4]*temp[7] + temp[8]*temp[11]); - m[7] = -(temp[1]*temp[3] + temp[5]*temp[7] + temp[9]*temp[11]); - m[11] = -(temp[2]*temp[3] + temp[6]*temp[7] + temp[10]*temp[11]); -} inline void swap(F32 &a, F32 &b) { @@ -633,11 +568,11 @@ static void m_matF_normalize_C(F32 *m) //-------------------------------------- -static F32 m_matF_determinant_C(const F32 *m) +static F32 m_matF_determinant_C(const F32* m) { - return m[0] * (m[5] * m[10] - m[6] * m[9]) + - m[4] * (m[2] * m[9] - m[1] * m[10]) + - m[8] * (m[1] * m[6] - m[2] * m[5]) ; + return m[0] * (m[5] * m[10] - m[6] * m[9]) + + m[4] * (m[2] * m[9] - m[1] * m[10]) + + m[8] * (m[1] * m[6] - m[2] * m[5]); } @@ -890,13 +825,11 @@ void (*m_quatF_set_matF)( F32 x, F32 y, F32 z, F32 w, F32* m ) = m_quatF_set_mat void (*m_matF_set_euler)(const F32 *e, F32 *result) = m_matF_set_euler_C; void (*m_matF_set_euler_point)(const F32 *e, const F32 *p, F32 *result) = m_matF_set_euler_point_C; void (*m_matF_identity)(F32 *m) = m_matF_identity_C; -void (*m_matF_inverse)(F32 *m) = m_matF_inverse_C; -void (*m_matF_affineInverse)(F32 *m) = m_matF_affineInverse_C; -void (*m_matF_invert_to)(const F32 *m, F32 *d) = m_matF_invert_to_C; +void (*m_matF_invert_to)(const F32* m, F32* d) = m_matF_invert_to_C; void (*m_matF_transpose)(F32 *m) = m_matF_transpose_C; void (*m_matF_scale)(F32 *m,const F32* p) = m_matF_scale_C; void (*m_matF_normalize)(F32 *m) = m_matF_normalize_C; -F32 (*m_matF_determinant)(const F32 *m) = m_matF_determinant_C; +F32(*m_matF_determinant)(const F32* m) = m_matF_determinant_C; void 
(*m_matF_x_matF)(const F32 *a, const F32 *b, F32 *mresult) = default_matF_x_matF_C; void (*m_matF_x_matF_aligned)(const F32 *a, const F32 *b, F32 *mresult) = default_matF_x_matF_C; // void (*m_matF_x_point3F)(const F32 *m, const F32 *p, F32 *presult) = m_matF_x_point3F_C; @@ -935,13 +868,11 @@ void mInstallLibrary_C() m_matF_set_euler = m_matF_set_euler_C; m_matF_set_euler_point = m_matF_set_euler_point_C; m_matF_identity = m_matF_identity_C; - m_matF_inverse = m_matF_inverse_C; - m_matF_affineInverse = m_matF_affineInverse_C; - m_matF_invert_to = m_matF_invert_to_C; + m_matF_invert_to = m_matF_invert_to_C; m_matF_transpose = m_matF_transpose_C; m_matF_scale = m_matF_scale_C; m_matF_normalize = m_matF_normalize_C; - m_matF_determinant = m_matF_determinant_C; + m_matF_determinant = m_matF_determinant_C; m_matF_x_matF = default_matF_x_matF_C; m_matF_x_matF_aligned = default_matF_x_matF_C; // m_matF_x_point3F = m_matF_x_point3F_C; diff --git a/Engine/source/math/mMatrix.h b/Engine/source/math/mMatrix.h index 763a62b93..f42aaa5e4 100644 --- a/Engine/source/math/mMatrix.h +++ b/Engine/source/math/mMatrix.h @@ -50,13 +50,16 @@ #ifndef _CONSOLE_H_ #include "console/console.h" #endif +#ifndef _MATH_BACKEND_H_ +#include "math/public/math_backend.h" +#endif #ifndef USE_TEMPLATE_MATRIX /// 4x4 Matrix Class /// /// This runs at F32 precision. - +using math_backend::mat44::dispatch::gMat44; class MatrixF { friend class MatrixFEngineExport; @@ -128,7 +131,7 @@ public: EulerF toEuler() const; F32 determinant() const { - return m_matF_determinant(*this); + return gMat44.determinant(*this); } /// Compute the inverse of the matrix. 
@@ -372,71 +375,68 @@ inline MatrixF& MatrixF::identity()
 
 inline MatrixF& MatrixF::inverse()
 {
-   m_matF_inverse(m);
+   gMat44.inverse(m);
    return (*this);
 }
 
 inline void MatrixF::invertTo( MatrixF *out )
 {
-   m_matF_invert_to(m,*out);
+   // Copy this matrix into the caller's output, then invert in place.
+   // (Assigning "out = this" would only rebind the local pointer and
+   // leave the caller's matrix untouched while mutating *this.)
+   *out = *this;
+   gMat44.inverse(*out);
 }
 
 inline MatrixF& MatrixF::affineInverse()
 {
-// AssertFatal(isAffine() == true, "Error, this matrix is not an affine transform");
-   m_matF_affineInverse(m);
+   gMat44.affine_inverse(m);
    return (*this);
 }
 
 inline MatrixF& MatrixF::transpose()
 {
-   m_matF_transpose(m);
+   gMat44.transpose(m);
    return (*this);
 }
 
 inline MatrixF& MatrixF::scale(const Point3F& p)
 {
-   m_matF_scale(m,p);
+   gMat44.scale(m, p);
    return *this;
 }
 
 inline Point3F MatrixF::getScale() const
 {
    Point3F scale;
-   scale.x = mSqrt(m[0]*m[0] + m[4] * m[4] + m[8] * m[8]);
-   scale.y = mSqrt(m[1]*m[1] + m[5] * m[5] + m[9] * m[9]);
-   scale.z = mSqrt(m[2]*m[2] + m[6] * m[6] + m[10] * m[10]);
+   gMat44.get_scale(m, scale);
    return scale;
 }
 
 inline void MatrixF::normalize()
 {
-   m_matF_normalize(m);
+   gMat44.normalize(m);
 }
 
 inline MatrixF& MatrixF::mul( const MatrixF &a )
 {
    // M * a -> M
    AssertFatal(&a != this, "MatrixF::mul - a.mul(a) is invalid!");
    MatrixF tempThis(*this);
-   m_matF_x_matF(tempThis, a, *this);
+   // Keep tempThis as the left operand and write into *this; the backend
+   // is not guaranteed to support result aliasing an input operand.
+   gMat44.mul_mat44(tempThis, a, *this);
    return (*this);
 }
 
 inline MatrixF& MatrixF::mulL( const MatrixF &a )
 {
    // a * M -> M
    AssertFatal(&a != this, "MatrixF::mulL - a.mul(a) is invalid!");
    MatrixF tempThis(*this);
-   m_matF_x_matF(a, tempThis, *this);
+   gMat44.mul_mat44(a, tempThis, *this);
    return (*this);
 }
 
 inline MatrixF& MatrixF::mul( const MatrixF &a, const MatrixF &b )
 {
    // a * b -> M
    AssertFatal((&a != this) && (&b != this), "MatrixF::mul - a.mul(a, b) a.mul(b, a) a.mul(a, a) is invalid!");
-
-   m_matF_x_matF(a, b, *this);
+   gMat44.mul_mat44(a, b, *this);
    return (*this);
 }
 
@@ -461,7 +461,7 @@ inline MatrixF& MatrixF::mul(const MatrixF &a, const F32 b)
 inline void MatrixF::mul( Point4F& p ) const
 {
    Point4F temp;
-
m_matF_x_point4F(*this, &p.x, &temp.x); + gMat44.mul_float4(*this, p, temp); p = temp; } @@ -469,28 +469,28 @@ inline void MatrixF::mulP( Point3F& p) const { // M * p -> d Point3F d; - m_matF_x_point3F(*this, &p.x, &d.x); + gMat44.mul_pos3(*this, p, d); p = d; } inline void MatrixF::mulP( const Point3F &p, Point3F *d) const { // M * p -> d - m_matF_x_point3F(*this, &p.x, &d->x); + gMat44.mul_pos3(*this, p, *d); } inline void MatrixF::mulV( VectorF& v) const { // M * v -> v VectorF temp; - m_matF_x_vectorF(*this, &v.x, &temp.x); + gMat44.mul_vec3(*this, v, temp); v = temp; } inline void MatrixF::mulV( const VectorF &v, Point3F *d) const { // M * v -> d - m_matF_x_vectorF(*this, &v.x, &d->x); + gMat44.mul_vec3(*this, v, *d); } inline void MatrixF::mul(Box3F& b) const @@ -634,14 +634,14 @@ inline MatrixF operator * ( const MatrixF &m1, const MatrixF &m2 ) { // temp = m1 * m2 MatrixF temp; - m_matF_x_matF(m1, m2, temp); + gMat44.mul_mat44(m1, m2, temp); return temp; } inline MatrixF& MatrixF::operator *= ( const MatrixF &m1 ) { MatrixF tempThis(*this); - m_matF_x_matF(tempThis, m1, *this); + gMat44.mul_mat44(tempThis, m1, *this); return (*this); } diff --git a/Engine/source/math/public/mat44_dispatch.h b/Engine/source/math/public/mat44_dispatch.h index 61b488861..b1b78ef13 100644 --- a/Engine/source/math/public/mat44_dispatch.h +++ b/Engine/source/math/public/mat44_dispatch.h @@ -8,7 +8,16 @@ namespace math_backend::mat44::dispatch struct Mat44Funcs { void (*transpose)(float*) = nullptr; + void (*inverse)(float*) = nullptr; + void (*affine_inverse)(float*) = nullptr; + void (*normalize)(float*) = nullptr; + void (*mul_mat44)(const float* a, const float* b, float* r) = nullptr; + void (*mul_pos3)(const float* a, const float* b, float* r) = nullptr; + void (*mul_vec3)(const float* a, const float* b, float* r) = nullptr; + void (*mul_float4)(const float* a, const float* b, float* r) = nullptr; + float (*determinant)(const float*) = nullptr; void (*scale)(float*, const 
float*) = nullptr; + void (*get_scale)(const float*, float*) = nullptr; }; // Global dispatch table diff --git a/Engine/source/math/public/math_backend.cpp b/Engine/source/math/public/math_backend.cpp index 9b5e5daed..81e00b3c7 100644 --- a/Engine/source/math/public/math_backend.cpp +++ b/Engine/source/math/public/math_backend.cpp @@ -18,6 +18,26 @@ namespace math_backend::mat44::dispatch Mat44Funcs gMat44{}; } +namespace math_backend +{ + // Use an anonymous namespace for the static initializer + namespace { + struct ScalarInitializer + { + ScalarInitializer() + { + // Install scalar defaults immediately for all types + float4::dispatch::install_scalar(); + float3::dispatch::install_scalar(); + mat44::dispatch::install_scalar(); + } + }; + + // Static instance ensures constructor runs before main() + ScalarInitializer g_scalarInitializer; + } +} + math_backend::backend math_backend::choose_backend(U32 cpu_flags) { @@ -47,21 +67,25 @@ void math_backend::install_from_cpu_flags(uint32_t cpu_flags) case backend::avx2: float4::dispatch::install_avx2(); float3::dispatch::install_avx2(); + mat44::dispatch::install_avx2(); break; case backend::avx: float4::dispatch::install_avx(); float3::dispatch::install_avx(); + mat44::dispatch::install_avx(); break; case backend::sse41: float4::dispatch::install_sse41(); float3::dispatch::install_sse41(); + mat44::dispatch::install_sse41(); break; case backend::sse2: float4::dispatch::install_sse2(); float3::dispatch::install_sse2(); + mat44::dispatch::install_sse2(); break; #elif defined(__aarch64__) || defined(__ARM_NEON) case backend::neon: diff --git a/Engine/source/ts/assimp/assimpAppNode.cpp b/Engine/source/ts/assimp/assimpAppNode.cpp index ff2e1ffd5..af448f940 100644 --- a/Engine/source/ts/assimp/assimpAppNode.cpp +++ b/Engine/source/ts/assimp/assimpAppNode.cpp @@ -250,7 +250,7 @@ MatrixF AssimpAppNode::getNodeTransform(F32 time) // Check for inverted node coordinate spaces => can happen when modelers // use the 'mirror' tool 
in their 3d app. Shows up as negative // transforms in the collada model. - if (m_matF_determinant(nodeTransform) < 0.0f) + if (nodeTransform.determinant() < 0.0f) { // Mark this node as inverted so we can mirror mesh geometry, then // de-invert the transform matrix diff --git a/Engine/source/ts/collada/colladaAppMesh.cpp b/Engine/source/ts/collada/colladaAppMesh.cpp index a944e2eef..4c4470654 100644 --- a/Engine/source/ts/collada/colladaAppMesh.cpp +++ b/Engine/source/ts/collada/colladaAppMesh.cpp @@ -1099,7 +1099,7 @@ void ColladaAppMesh::lookupSkinData() // Inverted node coordinate spaces (negative scale factor) are corrected // in ColladaAppNode::getNodeTransform, so need to apply the same operation // here to match - if (m_matF_determinant(invBind) < 0.0f) + if (invBind.determinant() < 0.0f) initialTransforms[iJoint].scale(Point3F(1, 1, -1)); initialTransforms[iJoint].mul(invBind); diff --git a/Engine/source/ts/collada/colladaAppNode.cpp b/Engine/source/ts/collada/colladaAppNode.cpp index 2ffc10fba..0cd8258c8 100644 --- a/Engine/source/ts/collada/colladaAppNode.cpp +++ b/Engine/source/ts/collada/colladaAppNode.cpp @@ -192,7 +192,7 @@ MatrixF ColladaAppNode::getNodeTransform(F32 time) // Check for inverted node coordinate spaces => can happen when modelers // use the 'mirror' tool in their 3d app. Shows up as negative // transforms in the collada model. 
- if (m_matF_determinant(nodeTransform) < 0.0f) + if (nodeTransform.determinant() < 0.0f) { // Mark this node as inverted so we can mirror mesh geometry, then // de-invert the transform matrix From bb1478a8c3435297fe653deb7fb02c3d07c9bc25 Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Tue, 3 Mar 2026 19:59:32 +0000 Subject: [PATCH 08/23] add neon mat for mac --- Engine/source/math/isa/neon/mat44.cpp | 22 ++++++++++++++++++++++ Engine/source/math/public/math_backend.cpp | 1 + 2 files changed, 23 insertions(+) create mode 100644 Engine/source/math/isa/neon/mat44.cpp diff --git a/Engine/source/math/isa/neon/mat44.cpp b/Engine/source/math/isa/neon/mat44.cpp new file mode 100644 index 000000000..579cfe097 --- /dev/null +++ b/Engine/source/math/isa/neon/mat44.cpp @@ -0,0 +1,22 @@ +#include "neon_intrinsics.h" +#include "mat44_dispatch.h" + +#include "mat44_impl.inl" + +namespace math_backend::mat44::dispatch +{ + void install_neon() + { + gMat44.transpose = mat44_transpose_impl; + gMat44.inverse = mat44_inverse_impl; + gMat44.affine_inverse = mat44_affine_inverse_impl; + gMat44.mul_mat44 = mat44_mul_mat44_impl; + gMat44.mul_pos3 = mat44_mul_pos3_impl; + gMat44.mul_vec3 = mat44_mul_vec3_impl; + gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.scale = mat44_scale_impl; + gMat44.get_scale = mat44_get_scale_impl; + gMat44.normalize = mat44_normalize_impl; + gMat44.determinant = mat44_get_determinant; + } +} diff --git a/Engine/source/math/public/math_backend.cpp b/Engine/source/math/public/math_backend.cpp index 81e00b3c7..f8a16c3ed 100644 --- a/Engine/source/math/public/math_backend.cpp +++ b/Engine/source/math/public/math_backend.cpp @@ -91,6 +91,7 @@ void math_backend::install_from_cpu_flags(uint32_t cpu_flags) case backend::neon: float4::dispatch::install_neon(); float3::dispatch::install_neon(); + mat44::dispatch::install_neon(); break; #endif default: From 0ba8d948fb67a23ee0ad716f24c0a91124a50fdf Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Wed, 4 Mar 2026 
08:41:57 +0000 Subject: [PATCH 09/23] neon implementation removed some x86 intrinsic functions that were in the mat44_impl file reinstated some mMath_C functions and mMathFn ptrs trying to diagnose an issue. Had to come up with a different way to initialize the scalar table if the isa tables are not initialized yet. Mac did not like the static initialization. Had to change neon over to using explicit masks for shifting, cross product was failing during bakes and matrix calculations --- Engine/source/math/impl/mat44_impl.inl | 18 -- Engine/source/math/impl/math_c.cpp | 34 +-- Engine/source/math/isa/neon/neon_intrinsics.h | 286 +++++++++++++++++- Engine/source/math/mMathFn.h | 34 ++- Engine/source/math/mMath_C.cpp | 126 ++++++-- Engine/source/math/mMatrix.h | 39 ++- Engine/source/math/mPoint3.h | 46 +-- Engine/source/math/mPoint4.h | 22 +- Engine/source/math/public/math_backend.cpp | 1 - Engine/source/math/public/math_backend.h | 57 ++++ 10 files changed, 521 insertions(+), 142 deletions(-) diff --git a/Engine/source/math/impl/mat44_impl.inl b/Engine/source/math/impl/mat44_impl.inl index a9635cff0..c27daeb45 100644 --- a/Engine/source/math/impl/mat44_impl.inl +++ b/Engine/source/math/impl/mat44_impl.inl @@ -239,24 +239,6 @@ namespace math_backend::mat44 m_store(m, r); } - inline void mat44_trs_impl(float* m, const float* t, const float* r_euler, const float* s) - { - f32x4x4 mr; - mat44_rotation_euler_impl((float*)&mr, r_euler[0], r_euler[1], r_euler[2]); - - f32x4 vs = v_load3_vec(s); // scale xyz - mr.r0 = v_mul(mr.r0, vs); - mr.r1 = v_mul(mr.r1, vs); - mr.r2 = v_mul(mr.r2, vs); - - mr.r0 = v_insert_w(mr.r0, _mm_set_ss(t[0])); - mr.r1 = v_insert_w(mr.r1, _mm_set_ss(t[1])); - mr.r2 = v_insert_w(mr.r2, _mm_set_ss(t[2])); - mr.r3 = v_set(0, 0, 0, 1.0f); - - m_store(m, mr); - } - inline void mat44_lookat_impl(float* m, const float* eye, const float* target, const float* up) { f32x4 vEye = v_load3_pos(eye); diff --git a/Engine/source/math/impl/math_c.cpp 
b/Engine/source/math/impl/math_c.cpp index 3fe449a1b..01640ca1f 100644 --- a/Engine/source/math/impl/math_c.cpp +++ b/Engine/source/math/impl/math_c.cpp @@ -318,26 +318,26 @@ namespace math_backend::mat44::dispatch r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2]; }; - gMat44.mul_mat44 = [](const float* a, const float* b, float* r) { - r[0] = a[0] * b[0] + a[1] * b[4] + a[2] * b[8] + a[3] * b[12]; - r[1] = a[0] * b[1] + a[1] * b[5] + a[2] * b[9] + a[3] * b[13]; - r[2] = a[0] * b[2] + a[1] * b[6] + a[2] * b[10] + a[3] * b[14]; - r[3] = a[0] * b[3] + a[1] * b[7] + a[2] * b[11] + a[3] * b[15]; + gMat44.mul_mat44 = [](const float* a, const float* b, float* mresult) { + mresult[0] = a[0]*b[0] + a[1]*b[4] + a[2]*b[8] + a[3]*b[12]; + mresult[1] = a[0]*b[1] + a[1]*b[5] + a[2]*b[9] + a[3]*b[13]; + mresult[2] = a[0]*b[2] + a[1]*b[6] + a[2]*b[10] + a[3]*b[14]; + mresult[3] = a[0]*b[3] + a[1]*b[7] + a[2]*b[11] + a[3]*b[15]; - r[4] = a[4] * b[0] + a[5] * b[4] + a[6] * b[8] + a[7] * b[12]; - r[5] = a[4] * b[1] + a[5] * b[5] + a[6] * b[9] + a[7] * b[13]; - r[6] = a[4] * b[2] + a[5] * b[6] + a[6] * b[10] + a[7] * b[14]; - r[7] = a[4] * b[3] + a[5] * b[7] + a[6] * b[11] + a[7] * b[15]; + mresult[4] = a[4]*b[0] + a[5]*b[4] + a[6]*b[8] + a[7]*b[12]; + mresult[5] = a[4]*b[1] + a[5]*b[5] + a[6]*b[9] + a[7]*b[13]; + mresult[6] = a[4]*b[2] + a[5]*b[6] + a[6]*b[10] + a[7]*b[14]; + mresult[7] = a[4]*b[3] + a[5]*b[7] + a[6]*b[11] + a[7]*b[15]; - r[8] = a[8] * b[0] + a[9] * b[4] + a[10] * b[8] + a[11] * b[12]; - r[9] = a[8] * b[1] + a[9] * b[5] + a[10] * b[9] + a[11] * b[13]; - r[10] = a[8] * b[2] + a[9] * b[6] + a[10] * b[10] + a[11] * b[14]; - r[11] = a[8] * b[3] + a[9] * b[7] + a[10] * b[11] + a[11] * b[15]; + mresult[8] = a[8]*b[0] + a[9]*b[4] + a[10]*b[8] + a[11]*b[12]; + mresult[9] = a[8]*b[1] + a[9]*b[5] + a[10]*b[9] + a[11]*b[13]; + mresult[10]= a[8]*b[2] + a[9]*b[6] + a[10]*b[10]+ a[11]*b[14]; + mresult[11]= a[8]*b[3] + a[9]*b[7] + a[10]*b[11]+ a[11]*b[15]; - r[12] = a[12] * b[0] + 
a[13] * b[4] + a[14] * b[8] + a[15] * b[12]; - r[13] = a[12] * b[1] + a[13] * b[5] + a[14] * b[9] + a[15] * b[13]; - r[14] = a[12] * b[2] + a[13] * b[6] + a[14] * b[10] + a[15] * b[14]; - r[15] = a[12] * b[3] + a[13] * b[7] + a[14] * b[11] + a[15] * b[15]; + mresult[12]= a[12]*b[0]+ a[13]*b[4]+ a[14]*b[8] + a[15]*b[12]; + mresult[13]= a[12]*b[1]+ a[13]*b[5]+ a[14]*b[9] + a[15]*b[13]; + mresult[14]= a[12]*b[2]+ a[13]*b[6]+ a[14]*b[10]+ a[15]*b[14]; + mresult[15]= a[12]*b[3]+ a[13]*b[7]+ a[14]*b[11]+ a[15]*b[15]; }; gMat44.normalize = [](float* a) { diff --git a/Engine/source/math/isa/neon/neon_intrinsics.h b/Engine/source/math/isa/neon/neon_intrinsics.h index 3476fab1b..e6e8ef123 100644 --- a/Engine/source/math/isa/neon/neon_intrinsics.h +++ b/Engine/source/math/isa/neon/neon_intrinsics.h @@ -14,14 +14,82 @@ namespace inline f32x4 v_zero() { return vdupq_n_f32(0.0f); } inline float v_extract0(f32x4 v) { return vgetq_lane_f32(v, 0); } + inline f32x4 v_set(float x, float y, float z, float w) + { + return { x, y, z, w }; // lane0=x, lane1=y, lane2=z, lane3=w + } + + inline f32x4 v_insert_w(f32x4 v, f32x4 w) + { + // extract scalar value from w lane0 + float w_val = vgetq_lane_f32(w, 0); + + // broadcast w_val to all lanes (we only need lane3 later) + f32x4 w_broadcast = vdupq_n_f32(w_val); + + // mask to select only lane3 + uint32x4_t mask = {0, 0, 0, 0xFFFFFFFF}; + + // vbslq: if mask bit=1, take from first argument, else take from second + return vbslq_f32(mask, w_broadcast, v); + } + //------------------------------------------------------ // Mask helpers //------------------------------------------------------ inline f32x4 v_mask_xyz() { - // equivalent to [1,1,1,0] - float32x4_t mask = {1.0f, 1.0f, 1.0f, 0.0f}; - return mask; + uint32x4_t mask = { + 0xFFFFFFFF, + 0xFFFFFFFF, + 0xFFFFFFFF, + 0x00000000 + }; + + return vreinterpretq_f32_u32(mask); + } + + inline f32x4 v_and(f32x4 a, f32x4 b) + { + return vreinterpretq_f32_u32( + vandq_u32( + 
vreinterpretq_u32_f32(a), + vreinterpretq_u32_f32(b) + ) + ); + } + + inline f32x4 v_swizzle_singular_mask(f32x4 v, int x) + { + // base byte index of the float lane + uint8x16_t base = vdupq_n_u8((uint8_t)(x * 4)); + + // byte offsets inside a float (0,1,2,3 repeated 4 times) + const uint8x16_t offsets = { + 0,1,2,3, 0,1,2,3, + 0,1,2,3, 0,1,2,3 + }; + + uint8x16_t idx = vaddq_u8(base, offsets); + + return vreinterpretq_f32_u8( + vqtbl1q_u8( + vreinterpretq_u8_f32(v), + idx + ) + ); + } + + inline f32x4 v_swizzle_lo(f32x4 v) + { + float32x4x2_t t = vzipq_f32(v, v); + return t.val[0]; // [x, x, z, z] + } + + inline f32x4 v_swizzle_hi(f32x4 v) + { + float32x4x2_t t = vzipq_f32(v, v); + return t.val[1]; // [y, y, w, w] } inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) @@ -109,10 +177,30 @@ namespace inline f32x4 v_cross(f32x4 a, f32x4 b) { - float32x4_t a_yzx = { vgetq_lane_f32(a,1), vgetq_lane_f32(a,2), vgetq_lane_f32(a,0), 0 }; - float32x4_t b_yzx = { vgetq_lane_f32(b,1), vgetq_lane_f32(b,2), vgetq_lane_f32(b,0), 0 }; - float32x4_t c = vsubq_f32(vmulq_f32(a, b_yzx), vmulq_f32(a_yzx, b)); - return (float32x4_t){ vgetq_lane_f32(c,2), vgetq_lane_f32(c,0), vgetq_lane_f32(c,1), 0 }; + f32x4 a_yzx = { vgetq_lane_f32(a,1), + vgetq_lane_f32(a,2), + vgetq_lane_f32(a,0), + 0.0f }; + + f32x4 b_zxy = { vgetq_lane_f32(b,2), + vgetq_lane_f32(b,0), + vgetq_lane_f32(b,1), + 0.0f }; + + f32x4 a_zxy = { vgetq_lane_f32(a,2), + vgetq_lane_f32(a,0), + vgetq_lane_f32(a,1), + 0.0f }; + + f32x4 b_yzx = { vgetq_lane_f32(b,1), + vgetq_lane_f32(b,2), + vgetq_lane_f32(b,0), + 0.0f }; + + return vsubq_f32( + vmulq_f32(a_yzx, b_zxy), + vmulq_f32(a_zxy, b_yzx) + ); } inline f32x4 v_normalize3(f32x4 v) @@ -127,4 +215,188 @@ namespace float sum = vget_lane_f32(sum2,0) + vget_lane_f32(sum2,1); return vdupq_n_f32(sum); } + + + //------------------------------------------------------ + // Matrix type (row-major 4x4) + //------------------------------------------------------ + + struct f32x4x4 + 
{ + f32x4 r0; + f32x4 r1; + f32x4 r2; + f32x4 r3; + }; + + inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major) + { + f32x4x4 out; + out.r0 = v_load(m + 0); + out.r1 = v_load(m + 4); + out.r2 = v_load(m + 8); + out.r3 = v_load(m + 12); + return out; + } + + inline void m_store(float* dst, const f32x4x4& m) + { + v_store(dst + 0, m.r0); + v_store(dst + 4, m.r1); + v_store(dst + 8, m.r2); + v_store(dst + 12, m.r3); + } + + inline f32x4x4 m_identity() + { + f32x4x4 m; + m.r0 = {1,0,0,0}; + m.r1 = {0,1,0,0}; + m.r2 = {0,0,1,0}; + m.r3 = {0,0,0,1}; + return m; + } + + inline f32x4x4 m_zero() + { + f32x4 z = v_zero(); + return { z, z, z, z }; + } + + inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot4(m.r0, v); + f32x4 y = v_dot4(m.r1, v); + f32x4 z = v_dot4(m.r2, v); + f32x4 w = v_dot4(m.r3, v); + + return { + v_extract0(x), + v_extract0(y), + v_extract0(z), + v_extract0(w) + }; + } + + inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot3(m.r0, v); + f32x4 y = v_dot3(m.r1, v); + f32x4 z = v_dot3(m.r2, v); + + return { + v_extract0(x), + v_extract0(y), + v_extract0(z), + 0.0f + }; + } + + inline f32x4x4 m_transpose(const f32x4x4& m) + { + float32x4x2_t t0 = vtrnq_f32(m.r0, m.r1); + float32x4x2_t t1 = vtrnq_f32(m.r2, m.r3); + + float32x2_t a0 = vget_low_f32(t0.val[0]); + float32x2_t a1 = vget_high_f32(t0.val[0]); + float32x2_t a2 = vget_low_f32(t1.val[0]); + float32x2_t a3 = vget_high_f32(t1.val[0]); + + float32x2_t b0 = vget_low_f32(t0.val[1]); + float32x2_t b1 = vget_high_f32(t0.val[1]); + float32x2_t b2 = vget_low_f32(t1.val[1]); + float32x2_t b3 = vget_high_f32(t1.val[1]); + + f32x4x4 out; + + out.r0 = vcombine_f32(a0, a2); + out.r1 = vcombine_f32(b0, b2); + out.r2 = vcombine_f32(a1, a3); + out.r3 = vcombine_f32(b1, b3); + + return out; + } + + inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b) + { + f32x4x4 bt = m_transpose(b); + + auto mul_row = [&](f32x4 row) + { + f32x4 x = v_dot4(row, bt.r0); + 
f32x4 y = v_dot4(row, bt.r1); + f32x4 z = v_dot4(row, bt.r2); + f32x4 w = v_dot4(row, bt.r3); + + return f32x4{ + v_extract0(x), + v_extract0(y), + v_extract0(z), + v_extract0(w) + }; + }; + + f32x4x4 C; + C.r0 = mul_row(a.r0); + C.r1 = mul_row(a.r1); + C.r2 = mul_row(a.r2); + C.r3 = mul_row(a.r3); + + return C; + } + + inline f32x4 m_determinant(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(c, d); + f32x4 c1 = v_cross(d, b); + f32x4 c2 = v_cross(b, c); + + f32x4 term0 = vmulq_f32(a, c0); + f32x4 term1 = vmulq_f32(a, c1); + f32x4 term2 = vmulq_f32(a, c2); + + f32x4 det = vaddq_f32(term0, vaddq_f32(term1, term2)); + + return v_hadd4(det); + } + + inline f32x4 m_determinant_affine(const f32x4x4& m) + { + f32x4 r0 = v_and(m.r0, v_mask_xyz()); + f32x4 r1 = v_and(m.r1, v_mask_xyz()); + f32x4 r2 = v_and(m.r2, v_mask_xyz()); + + f32x4 c0 = v_cross(r1, r2); + return v_dot3(r0, c0); + } + + inline f32x4x4 m_inverse(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(b, c); + f32x4 c1 = v_cross(c, d); + f32x4 c2 = v_cross(d, a); + f32x4 c3 = v_cross(a, b); + + f32x4 det = v_dot4(a, c1); + f32x4 invDet = v_rcp_nr(det); + + f32x4x4 adj; + adj.r0 = vmulq_f32(c1, invDet); + adj.r1 = vmulq_f32(c2, invDet); + adj.r2 = vmulq_f32(c3, invDet); + adj.r3 = vmulq_f32(c0, invDet); + + return m_transpose(adj); + } + } diff --git a/Engine/source/math/mMathFn.h b/Engine/source/math/mMathFn.h index e9e7d3840..241a9a393 100644 --- a/Engine/source/math/mMathFn.h +++ b/Engine/source/math/mMathFn.h @@ -76,11 +76,13 @@ extern void (*m_quatF_set_matF)( F32 x, F32 y, F32 z, F32 w, F32* m ); extern void (*m_matF_set_euler)(const F32 *e, F32 *result); extern void (*m_matF_set_euler_point)(const F32 *e, const F32 *p, F32 *result); extern void (*m_matF_identity)(F32 *m); -extern void (*m_matF_invert_to)(const F32* m, F32* d); +extern void (*m_matF_inverse)(F32 *m); 
+extern void (*m_matF_invert_to)(const F32 *m, F32 *d); +extern void (*m_matF_affineInverse)(F32 *m); extern void (*m_matF_transpose)(F32 *m); extern void (*m_matF_scale)(F32 *m,const F32* p); extern void (*m_matF_normalize)(F32 *m); -extern F32(*m_matF_determinant)(const F32* m); +extern F32 (*m_matF_determinant)(const F32 *m); extern void (*m_matF_x_matF)(const F32 *a, const F32 *b, F32 *mresult); extern void (*m_matF_x_matF_aligned)(const F32 *a, const F32 *b, F32 *mresult); // extern void (*m_matF_x_point3F)(const F32 *m, const F32 *p, F32 *presult); @@ -149,7 +151,7 @@ inline void m_matF_x_vectorF(const F32 *m, const F32 *v, F32 *vresult) inline bool mIsEqual( F32 a, F32 b, const F32 epsilon = __EQUAL_CONST_F ) { F32 diff = a - b; - return diff > -epsilon && diff < epsilon; + return diff > -epsilon && diff < epsilon; } inline bool mIsZero(const F32 val, const F32 epsilon = __EQUAL_CONST_F ) @@ -205,16 +207,16 @@ inline F32 mFmod(const F32 val, const F32 mod) return fmod(val, mod); } -inline S32 mRound(const F32 val) -{ - return (S32)floor(val + 0.5f); -} +inline S32 mRound(const F32 val) +{ + return (S32)floor(val + 0.5f); +} -inline F32 mRound(const F32 val, const S32 n) -{ - S32 place = (S32) pow(10.0f, n); +inline F32 mRound(const F32 val, const S32 n) +{ + S32 place = (S32) pow(10.0f, n); - return mFloor((val*place)+0.5)/place; + return mFloor((val*place)+0.5)/place; } inline F32 mRoundF(const F32 val, const F32 step) @@ -259,15 +261,15 @@ inline F32 mClampF(F32 val, F32 low, F32 high) inline S32 mWrap(S32 val, S32 low, S32 high) { - int len = high - low; - return low + (val >= 0 ? val % len : -val % len ? len - (-val % len) : 0); + int len = high - low; + return low + (val >= 0 ? val % len : -val % len ? len - (-val % len) : 0); } inline F32 mWrapF(F32 val, F32 low, F32 high) { - F32 t = fmod(val - low, high - low); - return t < 0 ? t + high : t + low; + F32 t = fmod(val - low, high - low); + return t < 0 ? 
t + high : t + low; } /// Template function for doing a linear interpolation between any two @@ -495,7 +497,7 @@ inline F64 mRadToDeg(F64 r) inline bool mIsNaN_F( const F32 x ) { - // If x is a floating point variable, then (x != x) will be TRUE if x has the value NaN. + // If x is a floating point variable, then (x != x) will be TRUE if x has the value NaN. // This is only going to work if the compiler is IEEE 748 compliant. // // Tested and working on VC2k5 diff --git a/Engine/source/math/mMath_C.cpp b/Engine/source/math/mMath_C.cpp index ad5aed0db..07f63dc38 100644 --- a/Engine/source/math/mMath_C.cpp +++ b/Engine/source/math/mMath_C.cpp @@ -152,8 +152,8 @@ static void m_point3F_interpolate_C(const F32 *from, const F32 *to, F32 factor, #ifdef TORQUE_COMPILER_GCC // remove possibility of aliases const F32 inverse = 1.0f - factor; - const F32 from0 = from[0], from1 = from[1], from2 = from[2]; - const F32 to0 = to[0], to1 = to[1], to2 = to[2]; + const F32 from0 = from[0], from1 = from[1], from2 = from[2]; + const F32 to0 = to[0], to1 = to[1], to2 = to[2]; result[0] = from0 * inverse + to0 * factor; result[1] = from1 * inverse + to1 * factor; @@ -182,8 +182,8 @@ static void m_point3D_interpolate_C(const F64 *from, const F64 *to, F64 factor, #ifdef TORQUE_COMPILER_GCC // remove possibility of aliases const F64 inverse = 1.0f - factor; - const F64 from0 = from[0], from1 = from[1], from2 = from[2]; - const F64 to0 = to[0], to1 = to[1], to2 = to[2]; + const F64 from0 = from[0], from1 = from[1], from2 = from[2]; + const F64 to0 = to[0], to1 = to[1], to2 = to[2]; result[0] = from0 * inverse + to0 * factor; result[1] = from1 * inverse + to1 * factor; @@ -436,8 +436,8 @@ static void affineInvertTo(const F32 * m, F32 * out) F32 invDet = 1.0f / (m[idx(0,0)] * d1 + m[idx(0,1)] * d2 + m[idx(0,2)] * d3); - F32 m00 = m[idx(0,0)] * invDet; - F32 m01 = m[idx(0,1)] * invDet; + F32 m00 = m[idx(0,0)] * invDet; + F32 m01 = m[idx(0,1)] * invDet; F32 m02 = m[idx(0,2)] * invDet; F32 * 
result = out; @@ -464,26 +464,71 @@ static void affineInvertTo(const F32 * m, F32 * out) } #endif -static void m_matF_invert_to_C(const F32* m, F32* d) +//-------------------------------------- +static void m_matF_inverse_C(F32 *m) { // using Cramers Rule find the Inverse // Minv = (1/det(M)) * adjoint(M) - F32 det = m_matF_determinant(m); - AssertFatal(det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); + F32 det = m_matF_determinant( m ); + AssertFatal( det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); - F32 invDet = 1.0f / det; + F32 invDet = 1.0f/det; + F32 temp[16]; - d[0] = (m[5] * m[10] - m[6] * m[9]) * invDet; - d[1] = (m[9] * m[2] - m[10] * m[1]) * invDet; + temp[0] = (m[5] * m[10]- m[6] * m[9]) * invDet; + temp[1] = (m[9] * m[2] - m[10]* m[1]) * invDet; + temp[2] = (m[1] * m[6] - m[2] * m[5]) * invDet; + + temp[4] = (m[6] * m[8] - m[4] * m[10])* invDet; + temp[5] = (m[10]* m[0] - m[8] * m[2]) * invDet; + temp[6] = (m[2] * m[4] - m[0] * m[6]) * invDet; + + temp[8] = (m[4] * m[9] - m[5] * m[8]) * invDet; + temp[9] = (m[8] * m[1] - m[9] * m[0]) * invDet; + temp[10]= (m[0] * m[5] - m[1] * m[4]) * invDet; + + m[0] = temp[0]; + m[1] = temp[1]; + m[2] = temp[2]; + + m[4] = temp[4]; + m[5] = temp[5]; + m[6] = temp[6]; + + m[8] = temp[8]; + m[9] = temp[9]; + m[10] = temp[10]; + + // invert the translation + temp[0] = -m[3]; + temp[1] = -m[7]; + temp[2] = -m[11]; + m_matF_x_vectorF(m, temp, &temp[4]); + m[3] = temp[4]; + m[7] = temp[5]; + m[11]= temp[6]; +} + +static void m_matF_invert_to_C(const F32 *m, F32 *d) +{ + // using Cramers Rule find the Inverse + // Minv = (1/det(M)) * adjoint(M) + F32 det = m_matF_determinant( m ); + AssertFatal( det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); + + F32 invDet = 1.0f/det; + + d[0] = (m[5] * m[10]- m[6] * m[9]) * invDet; + d[1] = (m[9] * m[2] - m[10]* m[1]) * invDet; d[2] = (m[1] * m[6] - m[2] * m[5]) * invDet; - d[4] = (m[6] * m[8] - m[4] * m[10]) * invDet; - d[5] = 
(m[10] * m[0] - m[8] * m[2]) * invDet; + d[4] = (m[6] * m[8] - m[4] * m[10])* invDet; + d[5] = (m[10]* m[0] - m[8] * m[2]) * invDet; d[6] = (m[2] * m[4] - m[0] * m[6]) * invDet; d[8] = (m[4] * m[9] - m[5] * m[8]) * invDet; d[9] = (m[8] * m[1] - m[9] * m[0]) * invDet; - d[10] = (m[0] * m[5] - m[1] * m[4]) * invDet; + d[10]= (m[0] * m[5] - m[1] * m[4]) * invDet; // invert the translation F32 temp[6]; @@ -493,13 +538,33 @@ static void m_matF_invert_to_C(const F32* m, F32* d) m_matF_x_vectorF(d, temp, &temp[3]); d[3] = temp[3]; d[7] = temp[4]; - d[11] = temp[5]; - d[12] = m[12]; - d[13] = m[13]; - d[14] = m[14]; - d[15] = m[15]; + d[11]= temp[5]; + d[ 12 ] = m[ 12 ]; + d[ 13 ] = m[ 13 ]; + d[ 14 ] = m[ 14 ]; + d[ 15 ] = m[ 15 ]; } +//-------------------------------------- +static void m_matF_affineInverse_C(F32 *m) +{ + // Matrix class checks to make sure this is an affine transform before calling + // this function, so we can proceed assuming it is... + F32 temp[16]; + dMemcpy(temp, m, 16 * sizeof(F32)); + + // Transpose rotation + m[1] = temp[4]; + m[4] = temp[1]; + m[2] = temp[8]; + m[8] = temp[2]; + m[6] = temp[9]; + m[9] = temp[6]; + + m[3] = -(temp[0]*temp[3] + temp[4]*temp[7] + temp[8]*temp[11]); + m[7] = -(temp[1]*temp[3] + temp[5]*temp[7] + temp[9]*temp[11]); + m[11] = -(temp[2]*temp[3] + temp[6]*temp[7] + temp[10]*temp[11]); +} inline void swap(F32 &a, F32 &b) { @@ -568,11 +633,11 @@ static void m_matF_normalize_C(F32 *m) //-------------------------------------- -static F32 m_matF_determinant_C(const F32* m) +static F32 m_matF_determinant_C(const F32 *m) { - return m[0] * (m[5] * m[10] - m[6] * m[9]) + - m[4] * (m[2] * m[9] - m[1] * m[10]) + - m[8] * (m[1] * m[6] - m[2] * m[5]); + return m[0] * (m[5] * m[10] - m[6] * m[9]) + + m[4] * (m[2] * m[9] - m[1] * m[10]) + + m[8] * (m[1] * m[6] - m[2] * m[5]) ; } @@ -825,11 +890,13 @@ void (*m_quatF_set_matF)( F32 x, F32 y, F32 z, F32 w, F32* m ) = m_quatF_set_mat void (*m_matF_set_euler)(const F32 *e, F32 *result) = 
m_matF_set_euler_C; void (*m_matF_set_euler_point)(const F32 *e, const F32 *p, F32 *result) = m_matF_set_euler_point_C; void (*m_matF_identity)(F32 *m) = m_matF_identity_C; -void (*m_matF_invert_to)(const F32* m, F32* d) = m_matF_invert_to_C; +void (*m_matF_inverse)(F32 *m) = m_matF_inverse_C; +void (*m_matF_affineInverse)(F32 *m) = m_matF_affineInverse_C; +void (*m_matF_invert_to)(const F32 *m, F32 *d) = m_matF_invert_to_C; void (*m_matF_transpose)(F32 *m) = m_matF_transpose_C; void (*m_matF_scale)(F32 *m,const F32* p) = m_matF_scale_C; void (*m_matF_normalize)(F32 *m) = m_matF_normalize_C; -F32(*m_matF_determinant)(const F32* m) = m_matF_determinant_C; +F32 (*m_matF_determinant)(const F32 *m) = m_matF_determinant_C; void (*m_matF_x_matF)(const F32 *a, const F32 *b, F32 *mresult) = default_matF_x_matF_C; void (*m_matF_x_matF_aligned)(const F32 *a, const F32 *b, F32 *mresult) = default_matF_x_matF_C; // void (*m_matF_x_point3F)(const F32 *m, const F32 *p, F32 *presult) = m_matF_x_point3F_C; @@ -868,11 +935,13 @@ void mInstallLibrary_C() m_matF_set_euler = m_matF_set_euler_C; m_matF_set_euler_point = m_matF_set_euler_point_C; m_matF_identity = m_matF_identity_C; - m_matF_invert_to = m_matF_invert_to_C; + m_matF_inverse = m_matF_inverse_C; + m_matF_affineInverse = m_matF_affineInverse_C; + m_matF_invert_to = m_matF_invert_to_C; m_matF_transpose = m_matF_transpose_C; m_matF_scale = m_matF_scale_C; m_matF_normalize = m_matF_normalize_C; - m_matF_determinant = m_matF_determinant_C; + m_matF_determinant = m_matF_determinant_C; m_matF_x_matF = default_matF_x_matF_C; m_matF_x_matF_aligned = default_matF_x_matF_C; // m_matF_x_point3F = m_matF_x_point3F_C; @@ -881,4 +950,3 @@ void mInstallLibrary_C() m_matF_x_scale_x_planeF = m_matF_x_scale_x_planeF_C; m_matF_x_box3F = m_matF_x_box3F_C; } - diff --git a/Engine/source/math/mMatrix.h b/Engine/source/math/mMatrix.h index f42aaa5e4..a670ab3ba 100644 --- a/Engine/source/math/mMatrix.h +++ b/Engine/source/math/mMatrix.h @@ -59,7 
+59,7 @@ /// 4x4 Matrix Class /// /// This runs at F32 precision. -using math_backend::mat44::dispatch::gMat44; +using namespace math_backend::mat44::dispatch; class MatrixF { friend class MatrixFEngineExport; @@ -131,7 +131,7 @@ public: EulerF toEuler() const; F32 determinant() const { - return gMat44.determinant(*this); + return GetMat44().determinant(*this); } /// Compute the inverse of the matrix. @@ -375,52 +375,51 @@ inline MatrixF& MatrixF::identity() inline MatrixF& MatrixF::inverse() { - gMat44.inverse(m); + GetMat44().inverse(m); return (*this); } inline void MatrixF::invertTo( MatrixF *out ) { out = this; - gMat44.inverse(*out); + GetMat44().inverse(*out); } inline MatrixF& MatrixF::affineInverse() { - gMat44.affine_inverse(m); + GetMat44().affine_inverse(m); return (*this); } inline MatrixF& MatrixF::transpose() { - gMat44.transpose(m); + GetMat44().transpose(m); return (*this); } inline MatrixF& MatrixF::scale(const Point3F& p) { - gMat44.scale(m, p); + GetMat44().scale(m, p); return *this; } inline Point3F MatrixF::getScale() const { Point3F scale; - gMat44.get_scale(m, scale); + GetMat44().get_scale(m, scale); return scale; } inline void MatrixF::normalize() { - gMat44.normalize(m); + GetMat44().normalize(m); } inline MatrixF& MatrixF::mul( const MatrixF &a ) { // M * a -> M AssertFatal(&a != this, "MatrixF::mul - a.mul(a) is invalid!"); MatrixF tempThis(*this); - gMat44.mul_mat44(tempThis, a, tempThis); - *this = tempThis; + GetMat44().mul_mat44(tempThis, a, *this); return (*this); } @@ -429,14 +428,14 @@ inline MatrixF& MatrixF::mulL( const MatrixF &a ) { // a * M -> M AssertFatal(&a != this, "MatrixF::mulL - a.mul(a) is invalid!"); MatrixF tempThis(*this); - gMat44.mul_mat44(a, tempThis, *this); + GetMat44().mul_mat44(a, tempThis, *this); return (*this); } inline MatrixF& MatrixF::mul( const MatrixF &a, const MatrixF &b ) { // a * b -> M AssertFatal((&a != this) && (&b != this), "MatrixF::mul - a.mul(a, b) a.mul(b, a) a.mul(a, a) is invalid!"); - 
gMat44.mul_mat44(a, b, *this); + GetMat44().mul_mat44(a, b, *this); return (*this); } @@ -461,7 +460,7 @@ inline MatrixF& MatrixF::mul(const MatrixF &a, const F32 b) inline void MatrixF::mul( Point4F& p ) const { Point4F temp; - gMat44.mul_float4(*this, p, temp); + GetMat44().mul_float4(*this, p, temp); p = temp; } @@ -469,28 +468,28 @@ inline void MatrixF::mulP( Point3F& p) const { // M * p -> d Point3F d; - gMat44.mul_pos3(*this, p, d); + GetMat44().mul_pos3(*this, p, d); p = d; } inline void MatrixF::mulP( const Point3F &p, Point3F *d) const { // M * p -> d - gMat44.mul_pos3(*this, p, *d); + GetMat44().mul_pos3(*this, p, *d); } inline void MatrixF::mulV( VectorF& v) const { // M * v -> v VectorF temp; - gMat44.mul_vec3(*this, v, temp); + GetMat44().mul_vec3(*this, v, temp); v = temp; } inline void MatrixF::mulV( const VectorF &v, Point3F *d) const { // M * v -> d - gMat44.mul_vec3(*this, v, *d); + GetMat44().mul_vec3(*this, v, *d); } inline void MatrixF::mul(Box3F& b) const @@ -634,14 +633,14 @@ inline MatrixF operator * ( const MatrixF &m1, const MatrixF &m2 ) { // temp = m1 * m2 MatrixF temp; - gMat44.mul_mat44(m1, m2, temp); + GetMat44().mul_mat44(m1, m2, temp); return temp; } inline MatrixF& MatrixF::operator *= ( const MatrixF &m1 ) { MatrixF tempThis(*this); - gMat44.mul_mat44(tempThis, m1, *this); + GetMat44().mul_mat44(tempThis, m1, *this); return (*this); } diff --git a/Engine/source/math/mPoint3.h b/Engine/source/math/mPoint3.h index 8e329f3a1..7e7567f92 100644 --- a/Engine/source/math/mPoint3.h +++ b/Engine/source/math/mPoint3.h @@ -102,7 +102,7 @@ public: class Point3D; //------------------------------------------------------------------------------ -using math_backend::float3::dispatch::gFloat3; +using namespace math_backend::float3::dispatch; class Point3F { //-------------------------------------- Public data @@ -504,7 +504,7 @@ inline void Point3F::interpolate(const Point3F& _from, const Point3F& _to, F32 _ { AssertFatal(_factor >= 0.0f && 
_factor <= 1.0f, "Out of bound interpolation factor"); - gFloat3.lerp(_from, _to, _factor, *this); + GetFloat3().lerp(_from, _to, _factor, *this); } inline void Point3F::zero() @@ -606,17 +606,17 @@ inline void Point3F::convolveInverse(const Point3F& c) inline F32 Point3F::lenSquared() const { - return gFloat3.lengthSquared(*this); + return GetFloat3().lengthSquared(*this); } inline F32 Point3F::len() const { - return gFloat3.length(*this); + return GetFloat3().length(*this); } inline void Point3F::normalize() { - gFloat3.normalize(*this); + GetFloat3().normalize(*this); } inline F32 Point3F::magnitudeSafe() const @@ -633,13 +633,13 @@ inline F32 Point3F::magnitudeSafe() const inline void Point3F::normalizeSafe() { - gFloat3.normalize(*this); + GetFloat3().normalize(*this); } inline void Point3F::normalize(F32 val) { - gFloat3.normalize_mag(*this, val); + GetFloat3().normalize_mag(*this, val); } inline bool Point3F::operator==(const Point3F& _test) const @@ -655,33 +655,33 @@ inline bool Point3F::operator!=(const Point3F& _test) const inline Point3F Point3F::operator+(const Point3F& _add) const { Point3F temp; - gFloat3.add(*this, _add, temp); + GetFloat3().add(*this, _add, temp); return temp; } inline Point3F Point3F::operator-(const Point3F& _rSub) const { Point3F temp; - gFloat3.sub(*this, _rSub, temp); + GetFloat3().sub(*this, _rSub, temp); return temp; } inline Point3F& Point3F::operator+=(const Point3F& _add) { - gFloat3.add(*this, _add, *this); + GetFloat3().add(*this, _add, *this); return *this; } inline Point3F& Point3F::operator-=(const Point3F& _rSub) { - gFloat3.sub(*this, _rSub, *this); + GetFloat3().sub(*this, _rSub, *this); return *this; } inline Point3F Point3F::operator*(F32 _mul) const { Point3F temp; - gFloat3.mul_scalar(*this, _mul, temp); + GetFloat3().mul_scalar(*this, _mul, temp); return temp; } @@ -690,13 +690,13 @@ inline Point3F Point3F::operator/(F32 _div) const AssertFatal(_div != 0.0f, "Error, div by zero attempted"); Point3F temp; - 
gFloat3.div_scalar(*this, _div, temp); + GetFloat3().div_scalar(*this, _div, temp); return temp; } inline Point3F& Point3F::operator*=(F32 _mul) { - gFloat3.mul_scalar(*this, _mul, *this); + GetFloat3().mul_scalar(*this, _mul, *this); return *this; } @@ -704,20 +704,20 @@ inline Point3F& Point3F::operator/=(F32 _div) { AssertFatal(_div != 0.0f, "Error, div by zero attempted"); - gFloat3.div_scalar(*this, _div, *this); + GetFloat3().div_scalar(*this, _div, *this); return *this; } inline Point3F Point3F::operator*(const Point3F &_vec) const { Point3F temp; - gFloat3.mul(*this, _vec, temp); + GetFloat3().mul(*this, _vec, temp); return temp; } inline Point3F& Point3F::operator*=(const Point3F &_vec) { - gFloat3.mul(*this, _vec, *this); + GetFloat3().mul(*this, _vec, *this); return *this; } @@ -725,14 +725,14 @@ inline Point3F Point3F::operator/(const Point3F &_vec) const { AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted"); Point3F temp; - gFloat3.div(*this, _vec, temp); + GetFloat3().div(*this, _vec, temp); return temp; } inline Point3F& Point3F::operator/=(const Point3F &_vec) { AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted"); - gFloat3.div(*this, _vec, *this); + GetFloat3().div(*this, _vec, *this); return *this; } @@ -985,7 +985,7 @@ inline Point3I operator*(S32 mul, const Point3I& multiplicand) inline Point3F operator*(F32 mul, const Point3F& multiplicand) { Point3F temp; - gFloat3.mul_scalar(multiplicand, mul, temp); + GetFloat3().mul_scalar(multiplicand, mul, temp); return temp; } @@ -996,7 +996,7 @@ inline Point3D operator*(F64 mul, const Point3D& multiplicand) inline F32 mDot(const Point3F &p1, const Point3F &p2) { - return gFloat3.dot(p1, p2); + return GetFloat3().dot(p1, p2); } inline F64 mDot(const Point3D &p1, const Point3D &p2) @@ -1006,7 +1006,7 @@ inline F64 mDot(const Point3D &p1, const Point3D &p2) inline void mCross(const Point3F &a, const Point3F &b, Point3F 
*res) { - gFloat3.cross(a, b, *res); + GetFloat3().cross(a, b, *res); } inline void mCross(const Point3D &a, const Point3D &b, Point3D *res) @@ -1019,7 +1019,7 @@ inline void mCross(const Point3D &a, const Point3D &b, Point3D *res) inline Point3F mCross(const Point3F &a, const Point3F &b) { Point3F r; - gFloat3.cross(a, b, r); + GetFloat3().cross(a, b, r); return r; } diff --git a/Engine/source/math/mPoint4.h b/Engine/source/math/mPoint4.h index 0f715f983..4befd326b 100644 --- a/Engine/source/math/mPoint4.h +++ b/Engine/source/math/mPoint4.h @@ -63,7 +63,7 @@ class Point4I /// Uses F32 internally. /// /// Useful for representing quaternions and other 4d beasties. -using math_backend::float4::dispatch::gFloat4; +using namespace math_backend::float4::dispatch; class Point4F { @@ -156,12 +156,12 @@ inline void Point4F::set(F32 _x, F32 _y, F32 _z, F32 _w) inline F32 Point4F::len() const { - return gFloat4.length(*this); + return GetFloat4().length(*this); } inline void Point4F::interpolate(const Point4F& _from, const Point4F& _to, F32 _factor) { - gFloat4.lerp(_from, _to, _factor, *this); + GetFloat4().lerp(_from, _to, _factor, *this); } inline void Point4F::zero() @@ -194,7 +194,7 @@ inline Point4F& Point4F::operator/=(F32 scalar) if (mIsZero(scalar)) return *this; - gFloat4.div_scalar(*this, scalar, *this); + GetFloat4().div_scalar(*this, scalar, *this); return *this; } @@ -202,47 +202,47 @@ inline Point4F& Point4F::operator/=(F32 scalar) inline Point4F Point4F::operator+(const Point4F& _add) const { Point4F res; - gFloat4.add(*this, _add, res); + GetFloat4().add(*this, _add, res); return res; } inline Point4F& Point4F::operator+=(const Point4F& _add) { - gFloat4.add(*this, _add, *this); + GetFloat4().add(*this, _add, *this); return *this; } inline Point4F Point4F::operator-(const Point4F& _rSub) const { Point4F res; - gFloat4.sub(*this, _rSub, res); + GetFloat4().sub(*this, _rSub, res); return res; } inline Point4F Point4F::operator*(const Point4F &_vec) const { 
Point4F res; - gFloat4.mul(*this, _vec, res); + GetFloat4().mul(*this, _vec, res); return res; } inline Point4F Point4F::operator*(F32 _mul) const { Point4F res; - gFloat4.mul_scalar(*this, _mul, res); + GetFloat4().mul_scalar(*this, _mul, res); return res; } inline Point4F Point4F::operator /(F32 t) const { Point4F res; - gFloat4.div_scalar(*this, t, res); + GetFloat4().div_scalar(*this, t, res); return res; } inline F32 mDot(const Point4F &p1, const Point4F &p2) { - return gFloat4.dot(p1, p2); + return GetFloat4().dot(p1, p2); } //------------------------------------------------------------------------------ diff --git a/Engine/source/math/public/math_backend.cpp b/Engine/source/math/public/math_backend.cpp index f8a16c3ed..d110650ac 100644 --- a/Engine/source/math/public/math_backend.cpp +++ b/Engine/source/math/public/math_backend.cpp @@ -1,4 +1,3 @@ -#pragma once #include "math/public/math_backend.h" namespace math_backend::float4::dispatch diff --git a/Engine/source/math/public/math_backend.h b/Engine/source/math/public/math_backend.h index 0a3127f81..f704f0db5 100644 --- a/Engine/source/math/public/math_backend.h +++ b/Engine/source/math/public/math_backend.h @@ -35,4 +35,61 @@ namespace math_backend void install_from_cpu_flags(uint32_t cpu_flags); } +#include + +namespace math_backend::float4::dispatch +{ + //-------------------------------------------------- + // Thread-safe getter, ensures scalar installed if empty + //-------------------------------------------------- + inline Float4Funcs& GetFloat4() + { + if (__builtin_expect(gFloat4.mul == nullptr, 0)) + { + static std::once_flag once; + std::call_once(once, []{ + install_scalar(); + }); + } + return gFloat4; + } +} + +namespace math_backend::float3::dispatch +{ + //-------------------------------------------------- + // Thread-safe getter, ensures scalar installed if empty + //-------------------------------------------------- + inline Float3Funcs& GetFloat3() + { + if (__builtin_expect(gFloat3.mul 
== nullptr, 0)) + { + static std::once_flag once; + std::call_once(once, []{ + install_scalar(); + }); + } + return gFloat3; + } +} + + +namespace math_backend::mat44::dispatch +{ + //-------------------------------------------------- + // Thread-safe getter, ensures scalar installed if empty + //-------------------------------------------------- + inline Mat44Funcs& GetMat44() + { + if (__builtin_expect(gMat44.mul_mat44 == nullptr, 0)) + { + static std::once_flag once; + std::call_once(once, []{ + install_scalar(); + }); + } + return gMat44; + } +} + #endif // !_MATH_BACKEND_H_ From 228b474f2e3e8be809d8e5cf795972c238711613 Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Wed, 4 Mar 2026 08:54:23 +0000 Subject: [PATCH 10/23] Update math_backend.h MSVC doesnt use __builtin_expect it is a GCC only flag --- Engine/source/math/public/math_backend.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Engine/source/math/public/math_backend.h b/Engine/source/math/public/math_backend.h index f704f0db5..9c5575823 100644 --- a/Engine/source/math/public/math_backend.h +++ b/Engine/source/math/public/math_backend.h @@ -44,7 +44,7 @@ namespace math_backend::float4::dispatch //-------------------------------------------------- inline Float4Funcs& GetFloat4() { - if (__builtin_expect(gFloat4.mul == nullptr, 0)) + if (gFloat4.mul == nullptr) { static std::once_flag once; std::call_once(once, []{ @@ -62,7 +62,7 @@ namespace math_backend::float3::dispatch //-------------------------------------------------- inline Float3Funcs& GetFloat3() { - if (__builtin_expect(gFloat3.mul == nullptr, 0)) + if (gFloat3.mul == nullptr) { static std::once_flag once; std::call_once(once, []{ @@ -81,7 +81,7 @@ namespace math_backend::mat44::dispatch //-------------------------------------------------- inline Mat44Funcs& GetMat44() { - if (__builtin_expect(gMat44.mul_mat44 == nullptr, 0)) + if (gMat44.mul_mat44 == nullptr) { static std::once_flag once; std::call_once(once, []{ From 
caeaedde5204b07be1b598beafb14ea0c2afd385 Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Wed, 4 Mar 2026 09:56:29 +0000 Subject: [PATCH 11/23] add inverse to fix tests by adding an explicit inverse to function copy 3rd row into dst it seems original was not touching this at all. --- Engine/source/math/impl/mat44_impl.inl | 46 +++++++++++++++++++++- Engine/source/math/impl/math_c.cpp | 35 ++++++++++++++++ Engine/source/math/isa/avx/mat44.cpp | 1 + Engine/source/math/isa/avx2/mat44.cpp | 1 + Engine/source/math/isa/neon/mat44.cpp | 1 + Engine/source/math/isa/sse2/mat44.cpp | 1 + Engine/source/math/isa/sse41/mat44.cpp | 1 + Engine/source/math/mMatrix.h | 3 +- Engine/source/math/public/mat44_dispatch.h | 1 + 9 files changed, 87 insertions(+), 3 deletions(-) diff --git a/Engine/source/math/impl/mat44_impl.inl b/Engine/source/math/impl/mat44_impl.inl index c27daeb45..41bb4bd0d 100644 --- a/Engine/source/math/impl/mat44_impl.inl +++ b/Engine/source/math/impl/mat44_impl.inl @@ -85,7 +85,7 @@ namespace math_backend::mat44 // Store inverse 3x3 (transpose of cofactor matrix) mTemp = m_transpose(mTemp); - mTemp.r3 = v_set(0, 0, 0, 1.0f); + mTemp.r3 = ma.r3; // ---- Translation ---- @@ -106,6 +106,50 @@ namespace math_backend::mat44 m[11] = v_extract0(v_swizzle_singular_mask(result, 2)); } + // Matrix Inverse + inline void mat44_inverse_to_impl(const float* m, float* dst) + { + f32x4x4 ma = m_load(m); + + // Compute cofactors using cross products + f32x4x4 mTemp; + mTemp.r0 = v_cross(ma.r1, ma.r2); + mTemp.r1 = v_cross(ma.r2, ma.r0); + mTemp.r2 = v_cross(ma.r0, ma.r1); + + // Determinant = dot(ma.r0, c0) + f32x4 det = v_dot3(ma.r0, mTemp.r0); + f32x4 invDet = v_rcp_nr(det); + + // Scale cofactors + mTemp.r0 = v_mul(mTemp.r0, invDet); + mTemp.r1 = v_mul(mTemp.r1, invDet); + mTemp.r2 = v_mul(mTemp.r2, invDet); + + // Store inverse 3x3 (transpose of cofactor matrix) + + mTemp = m_transpose(mTemp); + mTemp.r3 = ma.r3; + + // ---- Translation ---- + + // Load original translation + 
f32x4 T = v_set(dst[3], dst[7], dst[11], 0.0f); + + // Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2) + f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0)); + result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1))); + result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2))); + result = v_mul(result, v_set1(-1.0f)); + + m_store(dst, mTemp); + + // Store translation + dst[3] = v_extract0(result); + dst[7] = v_extract0(v_swizzle_singular_mask(result, 1)); + dst[11] = v_extract0(v_swizzle_singular_mask(result, 2)); + } + // Matrix Affine Inverse inline void mat44_affine_inverse_impl(float* m) { diff --git a/Engine/source/math/impl/math_c.cpp b/Engine/source/math/impl/math_c.cpp index 01640ca1f..ac6358b60 100644 --- a/Engine/source/math/impl/math_c.cpp +++ b/Engine/source/math/impl/math_c.cpp @@ -263,6 +263,41 @@ namespace math_backend::mat44::dispatch }; + gMat44.inverse_to = [](const float* m, float* d) { + // using Cramers Rule find the Inverse + // Minv = (1/det(M)) * adjoint(M) + F32 det = gMat44.determinant(m); + AssertFatal(det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); + + F32 invDet = 1.0f / det; + + d[0] = (m[5] * m[10] - m[6] * m[9]) * invDet; + d[1] = (m[9] * m[2] - m[10] * m[1]) * invDet; + d[2] = (m[1] * m[6] - m[2] * m[5]) * invDet; + + d[4] = (m[6] * m[8] - m[4] * m[10]) * invDet; + d[5] = (m[10] * m[0] - m[8] * m[2]) * invDet; + d[6] = (m[2] * m[4] - m[0] * m[6]) * invDet; + + d[8] = (m[4] * m[9] - m[5] * m[8]) * invDet; + d[9] = (m[8] * m[1] - m[9] * m[0]) * invDet; + d[10] = (m[0] * m[5] - m[1] * m[4]) * invDet; + + // invert the translation + F32 temp[6]; + temp[0] = -m[3]; + temp[1] = -m[7]; + temp[2] = -m[11]; + gMat44.mul_vec3(d, temp, &temp[3]); + d[3] = temp[3]; + d[7] = temp[4]; + d[11] = temp[5]; + d[12] = m[12]; + d[13] = m[13]; + d[14] = m[14]; + d[15] = m[15]; + }; + gMat44.affine_inverse = [](float* a) { F32 temp[16]; dMemcpy(temp, a, 16 * sizeof(F32)); diff --git 
a/Engine/source/math/isa/avx/mat44.cpp b/Engine/source/math/isa/avx/mat44.cpp index 6634b652c..50d64b6a2 100644 --- a/Engine/source/math/isa/avx/mat44.cpp +++ b/Engine/source/math/isa/avx/mat44.cpp @@ -9,6 +9,7 @@ namespace math_backend::mat44::dispatch { gMat44.transpose = mat44_transpose_impl; gMat44.inverse = mat44_inverse_impl; + gMat44.inverse_to = mat44_inverse_to_impl; gMat44.affine_inverse = mat44_affine_inverse_impl; gMat44.mul_mat44 = mat44_mul_mat44_impl; gMat44.mul_pos3 = mat44_mul_pos3_impl; diff --git a/Engine/source/math/isa/avx2/mat44.cpp b/Engine/source/math/isa/avx2/mat44.cpp index 11e842174..e6b891e9e 100644 --- a/Engine/source/math/isa/avx2/mat44.cpp +++ b/Engine/source/math/isa/avx2/mat44.cpp @@ -9,6 +9,7 @@ namespace math_backend::mat44::dispatch { gMat44.transpose = mat44_transpose_impl; gMat44.inverse = mat44_inverse_impl; + gMat44.inverse_to = mat44_inverse_to_impl; gMat44.affine_inverse = mat44_affine_inverse_impl; gMat44.mul_mat44 = mat44_mul_mat44_impl; gMat44.mul_pos3 = mat44_mul_pos3_impl; diff --git a/Engine/source/math/isa/neon/mat44.cpp b/Engine/source/math/isa/neon/mat44.cpp index 579cfe097..cfeebb3bf 100644 --- a/Engine/source/math/isa/neon/mat44.cpp +++ b/Engine/source/math/isa/neon/mat44.cpp @@ -9,6 +9,7 @@ namespace math_backend::mat44::dispatch { gMat44.transpose = mat44_transpose_impl; gMat44.inverse = mat44_inverse_impl; + gMat44.inverse_to = mat44_inverse_to_impl; gMat44.affine_inverse = mat44_affine_inverse_impl; gMat44.mul_mat44 = mat44_mul_mat44_impl; gMat44.mul_pos3 = mat44_mul_pos3_impl; diff --git a/Engine/source/math/isa/sse2/mat44.cpp b/Engine/source/math/isa/sse2/mat44.cpp index 6738ee391..2a84c029f 100644 --- a/Engine/source/math/isa/sse2/mat44.cpp +++ b/Engine/source/math/isa/sse2/mat44.cpp @@ -9,6 +9,7 @@ namespace math_backend::mat44::dispatch { gMat44.transpose = mat44_transpose_impl; gMat44.inverse = mat44_inverse_impl; + gMat44.inverse_to = mat44_inverse_to_impl; gMat44.affine_inverse = 
mat44_affine_inverse_impl; gMat44.mul_mat44 = mat44_mul_mat44_impl; gMat44.mul_pos3 = mat44_mul_pos3_impl; diff --git a/Engine/source/math/isa/sse41/mat44.cpp b/Engine/source/math/isa/sse41/mat44.cpp index 94bc5e4b2..489d495b8 100644 --- a/Engine/source/math/isa/sse41/mat44.cpp +++ b/Engine/source/math/isa/sse41/mat44.cpp @@ -9,6 +9,7 @@ namespace math_backend::mat44::dispatch { gMat44.transpose = mat44_transpose_impl; gMat44.inverse = mat44_inverse_impl; + gMat44.inverse_to = mat44_inverse_to_impl; gMat44.affine_inverse = mat44_affine_inverse_impl; gMat44.mul_mat44 = mat44_mul_mat44_impl; gMat44.mul_pos3 = mat44_mul_pos3_impl; diff --git a/Engine/source/math/mMatrix.h b/Engine/source/math/mMatrix.h index a670ab3ba..92bea5fad 100644 --- a/Engine/source/math/mMatrix.h +++ b/Engine/source/math/mMatrix.h @@ -381,8 +381,7 @@ inline MatrixF& MatrixF::inverse() inline void MatrixF::invertTo( MatrixF *out ) { - out = this; - GetMat44().inverse(*out); + GetMat44().inverse_to(*this, *out); } inline MatrixF& MatrixF::affineInverse() diff --git a/Engine/source/math/public/mat44_dispatch.h b/Engine/source/math/public/mat44_dispatch.h index b1b78ef13..2c4342aee 100644 --- a/Engine/source/math/public/mat44_dispatch.h +++ b/Engine/source/math/public/mat44_dispatch.h @@ -9,6 +9,7 @@ namespace math_backend::mat44::dispatch { void (*transpose)(float*) = nullptr; void (*inverse)(float*) = nullptr; + void (*inverse_to)(const float*, float*) = nullptr; void (*affine_inverse)(float*) = nullptr; void (*normalize)(float*) = nullptr; void (*mul_mat44)(const float* a, const float* b, float* r) = nullptr; From 585279ae82334d481d08e7705ee55782c27191a2 Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Wed, 4 Mar 2026 10:25:47 +0000 Subject: [PATCH 12/23] Update mat44_impl.inl should use original matrix translation not the dst matrix --- Engine/source/math/impl/mat44_impl.inl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Engine/source/math/impl/mat44_impl.inl 
b/Engine/source/math/impl/mat44_impl.inl index 41bb4bd0d..cc31d7b71 100644 --- a/Engine/source/math/impl/mat44_impl.inl +++ b/Engine/source/math/impl/mat44_impl.inl @@ -134,7 +134,7 @@ namespace math_backend::mat44 // ---- Translation ---- // Load original translation - f32x4 T = v_set(dst[3], dst[7], dst[11], 0.0f); + f32x4 T = v_set(m[3], m[7], m[11], 0.0f); // Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2) f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0)); From 65850df9bdf19662afdad1eef837e98cf15100e4 Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Wed, 4 Mar 2026 12:01:17 +0000 Subject: [PATCH 13/23] Update gfxDrawUtil.cpp safety no longer needed due to safe initializers --- Engine/source/gfx/gfxDrawUtil.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/Engine/source/gfx/gfxDrawUtil.cpp b/Engine/source/gfx/gfxDrawUtil.cpp index 3dce4b3af..16b886a55 100644 --- a/Engine/source/gfx/gfxDrawUtil.cpp +++ b/Engine/source/gfx/gfxDrawUtil.cpp @@ -853,11 +853,7 @@ void GFXDrawUtil::drawThickLine(F32 x1, F32 y1, F32 z1, F32 x2, F32 y2, F32 z2, // 3D World Draw Misc //----------------------------------------------------------------------------- -SphereMesh& getSphere() -{ - static SphereMesh instance; - return instance; -} +static SphereMesh gSphere; void GFXDrawUtil::drawSphere( const GFXStateBlockDesc &desc, F32 radius, const Point3F &pos, const ColorI &color, bool drawTop, bool drawBottom, const MatrixF *xfm ) { @@ -872,7 +868,7 @@ void GFXDrawUtil::drawSphere( const GFXStateBlockDesc &desc, F32 radius, const P GFX->pushWorldMatrix(); GFX->multWorld(mat); - const SphereMesh::TriangleMesh * sphereMesh = getSphere().getMesh(2); + const SphereMesh::TriangleMesh * sphereMesh = gSphere.getMesh(2); S32 numPoly = sphereMesh->numPoly; S32 totalPoly = 0; GFXVertexBufferHandle verts(mDevice, numPoly*3, GFXBufferTypeVolatile); From 014e319aa8b9643055854076bf1a98496ece92a5 Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Wed, 4 Mar 2026 12:08:43 
+0000 Subject: [PATCH 14/23] cleanup console spam --- Engine/source/platformMac/macMath.mm | 10 ++-------- Engine/source/platformPOSIX/POSIXMath.cpp | 19 +------------------ Engine/source/platformWin32/winMath.cpp | 16 +--------------- 3 files changed, 4 insertions(+), 41 deletions(-) diff --git a/Engine/source/platformMac/macMath.mm b/Engine/source/platformMac/macMath.mm index c275bdd17..992e6ed53 100644 --- a/Engine/source/platformMac/macMath.mm +++ b/Engine/source/platformMac/macMath.mm @@ -109,15 +109,9 @@ void Math::init(U32 properties) Con::printf("Math Init:"); Con::printf(" Installing Standard C extensions"); mInstallLibrary_C(); - - math_backend::install_from_cpu_flags(properties); - #ifdef TORQUE_CPU_X86 - if( properties & CPU_PROP_SSE ) - { - Con::printf( " Installing SSE extensions" ); - } - #endif + Con::printf( " Installing ISA extensions" ); + math_backend::install_from_cpu_flags(properties); Con::printf(" "); } diff --git a/Engine/source/platformPOSIX/POSIXMath.cpp b/Engine/source/platformPOSIX/POSIXMath.cpp index 751df294e..95e05a1b0 100644 --- a/Engine/source/platformPOSIX/POSIXMath.cpp +++ b/Engine/source/platformPOSIX/POSIXMath.cpp @@ -91,6 +91,7 @@ void Math::init(U32 properties) Con::printf(" Installing Standard C extensions"); mInstallLibrary_C(); + Con::printf(" Installing ISA extensions"); math_backend::install_from_cpu_flags(properties); #if defined(TORQUE_CPU_X32) || defined(TORQUE_CPU_X64) @@ -98,24 +99,6 @@ void Math::init(U32 properties) mInstallLibrary_ASM(); #endif - if (properties & CPU_PROP_FPU) - { - Con::printf(" Installing FPU extensions"); - } - - if (properties & CPU_PROP_MMX) - { - Con::printf(" Installing MMX extensions"); - } - -#if !defined(__MWERKS__) || (__MWERKS__ >= 0x2400) - if (properties & CPU_PROP_SSE) - { - Con::printf(" Installing SSE extensions"); - } -#endif //mwerks>2.4 - - Con::printf(" "); } diff --git a/Engine/source/platformWin32/winMath.cpp b/Engine/source/platformWin32/winMath.cpp index 
36a9d8c58..bccbd0de6 100644 --- a/Engine/source/platformWin32/winMath.cpp +++ b/Engine/source/platformWin32/winMath.cpp @@ -98,26 +98,12 @@ void Math::init(U32 properties) Con::printf(" Installing Standard C extensions"); mInstallLibrary_C(); + Con::printf(" Installing ISA extensions"); math_backend::install_from_cpu_flags(properties); Con::printf(" Installing Assembly extensions"); mInstallLibrary_ASM(); - if (properties & CPU_PROP_FPU) - { - Con::printf(" Installing FPU extensions"); - } - - if (properties & CPU_PROP_MMX) - { - Con::printf(" Installing MMX extensions"); - } - - if (properties & CPU_PROP_SSE) - { - Con::printf(" Installing SSE extensions"); - } - Con::printf(" "); } From 5c3c0546f66cfd9b8efb48d28c88cf0c5633225c Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Wed, 4 Mar 2026 17:48:58 +0000 Subject: [PATCH 15/23] Update mPoint3.h missed convolve and convolveInverse --- Engine/source/math/mPoint3.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/Engine/source/math/mPoint3.h b/Engine/source/math/mPoint3.h index 7e7567f92..895454c05 100644 --- a/Engine/source/math/mPoint3.h +++ b/Engine/source/math/mPoint3.h @@ -592,16 +592,12 @@ inline void Point3F::neg() inline void Point3F::convolve(const Point3F& c) { - x *= c.x; - y *= c.y; - z *= c.z; + GetFloat3().mul(*this, c, *this); } inline void Point3F::convolveInverse(const Point3F& c) { - x /= c.x; - y /= c.y; - z /= c.z; + GetFloat3().div(*this, c, *this); } inline F32 Point3F::lenSquared() const From 5a6467d54a6de766c1aebbac9ac03bce4bebe430 Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Wed, 4 Mar 2026 21:22:22 +0000 Subject: [PATCH 16/23] Update mat44_impl.inl fix matrix normalize --- Engine/source/math/impl/mat44_impl.inl | 44 +++++++++++++++----------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/Engine/source/math/impl/mat44_impl.inl b/Engine/source/math/impl/mat44_impl.inl index cc31d7b71..ab0799b08 100644 --- a/Engine/source/math/impl/mat44_impl.inl 
+++ b/Engine/source/math/impl/mat44_impl.inl @@ -176,31 +176,37 @@ namespace math_backend::mat44 m[11] = v_extract0(v_swizzle_singular_mask(result, 2)); } - inline void mat44_normalize_impl(float* a) + inline void mat44_normalize_impl(float* m) { - // Load the matrix - f32x4x4 m = m_load(a); + // Load the matrix into SIMD registers + f32x4x4 mat = m_load(m); - // Extract axes (rows 0-2), zero out w using v_mask_xyz - f32x4 xaxis = v_mul(m.r0, v_mask_xyz()); - f32x4 yaxis = v_mul(m.r1, v_mask_xyz()); - f32x4 zaxis = v_mul(m.r2, v_mask_xyz()); + // Transpose: now rows are columns + mat = m_transpose(mat); - xaxis = v_normalize3(xaxis); + // Extract columns (which are now rows) + f32x4 col0 = mat.r0; + f32x4 col1 = mat.r1; - float dotXY = v_extract0(v_hadd4(v_mul(xaxis, yaxis))); - f32x4 projYonX = v_mul(v_set1(dotXY), xaxis); - yaxis = v_normalize3(v_sub(yaxis, projYonX)); + // Rebuild orthonormal basis + f32x4 col2 = v_cross(col0, col1); + col1 = v_cross(col2, col0); - zaxis = v_cross(xaxis, yaxis); + // Normalize columns + col0 = v_normalize3(col0); + col1 = v_normalize3(col1); + col2 = v_normalize3(col2); - // Store normalized axes back (preserve translation w) - m.r0 = v_preserve_w(xaxis, m.r0); - m.r1 = v_preserve_w(yaxis, m.r1); - m.r2 = v_preserve_w(zaxis, m.r2); + // Write back directly into transposed matrix + mat.r0 = col0; + mat.r1 = col1; + mat.r2 = col2; - // Store back to memory - m_store(a, m); + // Transpose back to row-major + mat = m_transpose(mat); + + // Store back + m_store(m, mat); } // Matrix Multiply: a * b @@ -227,7 +233,7 @@ namespace math_backend::mat44 { f32x4x4 ma = m_load(m); f32x4 va = v_load3_vec(v); - f32x4 vr = m_mul_vec4(ma, va); + f32x4 vr = m_mul_vec3(ma, va); v_store3(r, vr); } From 8c1acbd1da4b86ebca534b28bac4252bb13f7969 Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Wed, 4 Mar 2026 23:49:08 +0000 Subject: [PATCH 17/23] add normal safety wrap safety around normal checks, this was done on the scalar math may as well do it here 
just in case. Ad the impl.inl files to libraries so they can actually be found. --- Engine/source/math/impl/float3_impl.inl | 5 ++--- Engine/source/math/isa/avx/avx_intrinsics.h | 11 +++++++++-- Engine/source/math/isa/avx2/avx2_intrinsics.h | 11 +++++++++-- Engine/source/math/isa/neon/neon_intrinsics.h | 15 +++++++++++++-- Engine/source/math/isa/sse2/sse2_intrinsics.h | 18 ++++++++++++++++-- .../source/math/isa/sse41/sse41_intrinsics.h | 11 +++++++++-- Tools/CMake/torque_macros.cmake | 4 +++- 7 files changed, 61 insertions(+), 14 deletions(-) diff --git a/Engine/source/math/impl/float3_impl.inl b/Engine/source/math/impl/float3_impl.inl index 216fc99ba..ec67fff8d 100644 --- a/Engine/source/math/impl/float3_impl.inl +++ b/Engine/source/math/impl/float3_impl.inl @@ -85,9 +85,8 @@ namespace math_backend::float3 inline void float3_normalize_impl(float* a) { f32x4 va = v_load3_vec(a); - f32x4 invLen = v_rsqrt_nr(v_dot3(va, va)); // fully abstracted - f32x4 vnorm = v_mul(va, invLen); - v_store3(a, vnorm); + f32x4 vr = v_normalize3(va); + v_store3(a, vr); } // Normalize with magnitude: r = normalize(a) * r diff --git a/Engine/source/math/isa/avx/avx_intrinsics.h b/Engine/source/math/isa/avx/avx_intrinsics.h index a67f900b4..2dcdf513b 100644 --- a/Engine/source/math/isa/avx/avx_intrinsics.h +++ b/Engine/source/math/isa/avx/avx_intrinsics.h @@ -183,8 +183,15 @@ namespace inline f32x4 v_normalize3(f32x4 v) { - f32x4 inv = v_rsqrt_nr(v_dot3(v, v)); - return _mm_mul_ps(v, inv); + const f32x4 zero = _mm_setzero_ps(); + const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0} + f32x4 dot = v_dot3(v, v); + + f32x4 inv = v_rsqrt_nr(dot); + f32x4 isZero = _mm_cmpeq_ps(dot, zero); + f32x4 norm = _mm_mul_ps(v, inv); + + return _mm_blendv_ps(norm, fallback, isZero); } // adds all 4 lanes together. 
diff --git a/Engine/source/math/isa/avx2/avx2_intrinsics.h b/Engine/source/math/isa/avx2/avx2_intrinsics.h index 67f0df04a..e276e4c6e 100644 --- a/Engine/source/math/isa/avx2/avx2_intrinsics.h +++ b/Engine/source/math/isa/avx2/avx2_intrinsics.h @@ -183,8 +183,15 @@ namespace inline f32x4 v_normalize3(f32x4 v) { - f32x4 inv = v_rsqrt_nr(v_dot3(v, v)); - return _mm_mul_ps(v, inv); + const f32x4 zero = _mm_setzero_ps(); + const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0} + f32x4 dot = v_dot3(v, v); + + f32x4 inv = v_rsqrt_nr(dot); + f32x4 isZero = _mm_cmpeq_ps(dot, zero); + f32x4 norm = _mm_mul_ps(v, inv); + + return _mm_blendv_ps(norm, fallback, isZero); } // adds all 4 lanes together. diff --git a/Engine/source/math/isa/neon/neon_intrinsics.h b/Engine/source/math/isa/neon/neon_intrinsics.h index e6e8ef123..888adab45 100644 --- a/Engine/source/math/isa/neon/neon_intrinsics.h +++ b/Engine/source/math/isa/neon/neon_intrinsics.h @@ -205,8 +205,19 @@ namespace inline f32x4 v_normalize3(f32x4 v) { - f32x4 inv = v_rsqrt_nr(v_dot3(v,v)); - return vmulq_f32(v, inv); + const float32x4_t zero = vdupq_n_f32(0.0f); + const float32x4_t fallback = {0.0f, 0.0f, 1.0f, 0.0f}; + + f32x4 dot = v_dot3(v, v); + + // dot == 0? 
+ uint32x4_t isZero = vceqq_f32(dot, zero); + + f32x4 inv = v_rsqrt_nr(dot); + f32x4 norm = vmulq_f32(v, inv); + + // Select fallback when zero + return vbslq_f32(isZero, fallback, norm); } inline f32x4 v_hadd4(f32x4 a) diff --git a/Engine/source/math/isa/sse2/sse2_intrinsics.h b/Engine/source/math/isa/sse2/sse2_intrinsics.h index 71a95c1b7..63243d2d3 100644 --- a/Engine/source/math/isa/sse2/sse2_intrinsics.h +++ b/Engine/source/math/isa/sse2/sse2_intrinsics.h @@ -216,8 +216,22 @@ namespace inline f32x4 v_normalize3(f32x4 v) { - f32x4 inv = v_rsqrt_nr(v_dot3(v, v)); - return _mm_mul_ps(v, inv); + const f32x4 zero = _mm_setzero_ps(); + const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0} + f32x4 dot = v_dot3(v, v); + + f32x4 inv = v_rsqrt_nr(dot); + f32x4 isZero = _mm_cmpeq_ps(dot, zero); + + f32x4 norm = _mm_mul_ps(v, inv); + + // vbsl equivalent + f32x4 result = _mm_or_ps( + _mm_and_ps(isZero, fallback), + _mm_andnot_ps(isZero, norm) + ); + + return result; } // adds all 4 lanes together. diff --git a/Engine/source/math/isa/sse41/sse41_intrinsics.h b/Engine/source/math/isa/sse41/sse41_intrinsics.h index 2ea63e6b5..58d66ca6c 100644 --- a/Engine/source/math/isa/sse41/sse41_intrinsics.h +++ b/Engine/source/math/isa/sse41/sse41_intrinsics.h @@ -195,8 +195,15 @@ namespace inline f32x4 v_normalize3(f32x4 v) { - f32x4 inv = v_rsqrt_nr(v_dot3(v, v)); - return _mm_mul_ps(v, inv); + const f32x4 zero = _mm_setzero_ps(); + const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0} + f32x4 dot = v_dot3(v, v); + + f32x4 inv = v_rsqrt_nr(dot); + f32x4 isZero = _mm_cmpeq_ps(dot, zero); + f32x4 norm = _mm_mul_ps(v, inv); + + return _mm_blendv_ps(norm, fallback, isZero); } // adds all 4 lanes together. 
diff --git a/Tools/CMake/torque_macros.cmake b/Tools/CMake/torque_macros.cmake index f799e8c39..9b98a5b99 100644 --- a/Tools/CMake/torque_macros.cmake +++ b/Tools/CMake/torque_macros.cmake @@ -142,7 +142,9 @@ function(add_math_backend name compile_defs) return() endif() - add_library(math_${name} OBJECT ${SRC}) + file(GLOB_RECURSE INL CONFIGURE_DEPENDS "math/impl/*.inl") + + add_library(math_${name} OBJECT ${SRC} ${INL}) message(STATUS "adding math library for isa ${name}") target_include_directories(math_${name} PUBLIC From ac6ec05690e52c85c9858a134d02050b8054559b Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Thu, 5 Mar 2026 14:16:11 +0000 Subject: [PATCH 18/23] fix drift turns out we require the dot product to be in all rows --- Engine/source/math/isa/avx/avx_intrinsics.h | 157 +++++++++++++++++- Engine/source/math/isa/avx2/avx2_intrinsics.h | 157 +++++++++++++++++- Engine/source/math/isa/sse2/sse2_intrinsics.h | 96 +++++++++-- .../source/math/isa/sse41/sse41_intrinsics.h | 81 ++++++++- 4 files changed, 456 insertions(+), 35 deletions(-) diff --git a/Engine/source/math/isa/avx/avx_intrinsics.h b/Engine/source/math/isa/avx/avx_intrinsics.h index 2dcdf513b..28a5fa4fa 100644 --- a/Engine/source/math/isa/avx/avx_intrinsics.h +++ b/Engine/source/math/isa/avx/avx_intrinsics.h @@ -136,7 +136,8 @@ namespace { f32x4 r = _mm_rcp_ps(b); f32x4 two = _mm_set1_ps(2.0f); - return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); } // Divide fast ( b = recip eg 1/b) @@ -144,14 +145,12 @@ namespace inline f32x4 v_rsqrt_nr(f32x4 x) { - f32x4 r = _mm_rsqrt_ps(x); - f32x4 half = _mm_set1_ps(0.5f); f32x4 three = _mm_set1_ps(3.0f); - r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); - - return _mm_mul_ps(r, half); + f32x4 r = _mm_rsqrt_ps(x); + f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r); + return 
_mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr)); } //------------------------------------------------------ @@ -161,13 +160,13 @@ namespace // full dot4 inline f32x4 v_dot4(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1 + return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into lane 1 } // dot3 (ignores w) inline f32x4 v_dot3(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1 + return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into lane 1 } // cross product xyz only. @@ -392,4 +391,146 @@ namespace f32x4 c0 = v_cross(r1, r2); return v_dot3(r0, c0); // splatted determinant } + + //--------------------------------------------------------- + // BATCH INTRINSICS + //--------------------------------------------------------- + typedef __m256 f32x8; + + struct vec4_batch8 + { + f32x8 x; + f32x8 y; + f32x8 z; + f32x8 w; + }; + + struct vec3_batch8 + { + f32x8 x; + f32x8 y; + f32x8 z; + }; + + struct vec4_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + f32x4 w; + }; + + struct vec3_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + }; + + inline vec3_batch8 load_vec3_batch8(const float* ptr) + { + // ptr points to x0 + // layout: x y z x y z ... 
+ + vec3_batch8 r; + + r.x = _mm256_set_ps( + ptr[21], ptr[18], ptr[15], ptr[12], + ptr[9], ptr[6], ptr[3], ptr[0]); + + r.y = _mm256_set_ps( + ptr[22], ptr[19], ptr[16], ptr[13], + ptr[10], ptr[7], ptr[4], ptr[1]); + + r.z = _mm256_set_ps( + ptr[23], ptr[20], ptr[17], ptr[14], + ptr[11], ptr[8], ptr[5], ptr[2]); + + return r; + } + + inline vec3_batch4 load_vec3_batch4(const float* ptr) + { + vec3_batch4 r; + + r.x = _mm_set_ps( + ptr[9], ptr[6], ptr[3], ptr[0]); + + r.y = _mm_set_ps( + ptr[10], ptr[7], ptr[4], ptr[1]); + + r.z = _mm_set_ps( + ptr[11], ptr[8], ptr[5], ptr[2]); + + return r; + } + + // Store the result back out to a float array 8 at a time + inline void store_f32x8(float* out, f32x8 v) + { + _mm256_storeu_ps(out, v); + } + + // Store the result back out to a float array 4 at a time + inline void store_f32x4(float* out, f32x4 v) + { + _mm_storeu_ps(out, v); + } + + // Store the result back to a float3 array with size of 8 + inline void store_vec3_batch8(float* out, const vec3_batch8& v) + { + alignas(32) float xs[8]; + alignas(32) float ys[8]; + alignas(32) float zs[8]; + + _mm256_store_ps(xs, v.x); + _mm256_store_ps(ys, v.y); + _mm256_store_ps(zs, v.z); + + for (int i = 0; i < 8; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + // Store the result back to a float3 array with size of 4 + inline void store_vec3_batch4(float* out, const vec3_batch4& v) + { + alignas(16) float xs[8]; + alignas(16) float ys[8]; + alignas(16) float zs[8]; + + _mm_store_ps(xs, v.x); + _mm_store_ps(ys, v.y); + _mm_store_ps(zs, v.z); + + for (int i = 0; i < 4; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b) + { + f32x4 mulx = _mm_mul_ps(a.x, b.x); + f32x4 muly = _mm_mul_ps(a.y, b.y); + f32x4 mulz = _mm_mul_ps(a.z, b.z); + + return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); + } + + inline f32x8 
vec3_batch8_dot(const vec3_batch8& a, const vec3_batch8& b) + { + f32x8 mulx = _mm256_mul_ps(a.x, b.x); + f32x8 muly = _mm256_mul_ps(a.y, b.y); + f32x8 mulz = _mm256_mul_ps(a.z, b.z); + + return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz); + } + } diff --git a/Engine/source/math/isa/avx2/avx2_intrinsics.h b/Engine/source/math/isa/avx2/avx2_intrinsics.h index e276e4c6e..bb367f198 100644 --- a/Engine/source/math/isa/avx2/avx2_intrinsics.h +++ b/Engine/source/math/isa/avx2/avx2_intrinsics.h @@ -136,7 +136,8 @@ namespace { f32x4 r = _mm_rcp_ps(b); f32x4 two = _mm_set1_ps(2.0f); - return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); } // Divide fast ( b = recip eg 1/b) @@ -144,14 +145,12 @@ namespace inline f32x4 v_rsqrt_nr(f32x4 x) { - f32x4 r = _mm_rsqrt_ps(x); - f32x4 half = _mm_set1_ps(0.5f); f32x4 three = _mm_set1_ps(3.0f); - r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); - - return _mm_mul_ps(r, half); + f32x4 r = _mm_rsqrt_ps(x); + f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r); + return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr)); } //------------------------------------------------------ @@ -161,13 +160,13 @@ namespace // full dot4 inline f32x4 v_dot4(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1 + return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into all lanes } // dot3 (ignores w) inline f32x4 v_dot3(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1 + return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into all lanes } // cross product xyz only. 
@@ -392,4 +391,146 @@ namespace f32x4 c0 = v_cross(r1, r2); return v_dot3(r0, c0); // splatted determinant } + + //--------------------------------------------------------- + // BATCH INTRINSICS + //--------------------------------------------------------- + typedef __m256 f32x8; + + struct vec4_batch8 + { + f32x8 x; + f32x8 y; + f32x8 z; + f32x8 w; + }; + + struct vec3_batch8 + { + f32x8 x; + f32x8 y; + f32x8 z; + }; + + struct vec4_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + f32x4 w; + }; + + struct vec3_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + }; + + inline vec3_batch8 load_vec3_batch8(const float* ptr) + { + // ptr points to x0 + // layout: x y z x y z ... + + vec3_batch8 r; + + r.x = _mm256_set_ps( + ptr[21], ptr[18], ptr[15], ptr[12], + ptr[9], ptr[6], ptr[3], ptr[0]); + + r.y = _mm256_set_ps( + ptr[22], ptr[19], ptr[16], ptr[13], + ptr[10], ptr[7], ptr[4], ptr[1]); + + r.z = _mm256_set_ps( + ptr[23], ptr[20], ptr[17], ptr[14], + ptr[11], ptr[8], ptr[5], ptr[2]); + + return r; + } + + inline vec3_batch4 load_vec3_batch4(const float* ptr) + { + vec3_batch4 r; + + r.x = _mm_set_ps( + ptr[9], ptr[6], ptr[3], ptr[0]); + + r.y = _mm_set_ps( + ptr[10], ptr[7], ptr[4], ptr[1]); + + r.z = _mm_set_ps( + ptr[11], ptr[8], ptr[5], ptr[2]); + + return r; + } + + // Store the result back out to a float array 8 at a time + inline void store_f32x8(float* out, f32x8 v) + { + _mm256_storeu_ps(out, v); + } + + // Store the result back out to a float array 4 at a time + inline void store_f32x4(float* out, f32x4 v) + { + _mm_storeu_ps(out, v); + } + + // Store the result back to a float3 array with size of 8 + inline void store_vec3_batch8(float* out, const vec3_batch8& v) + { + alignas(32) float xs[8]; + alignas(32) float ys[8]; + alignas(32) float zs[8]; + + _mm256_store_ps(xs, v.x); + _mm256_store_ps(ys, v.y); + _mm256_store_ps(zs, v.z); + + for (int i = 0; i < 8; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + // Store 
the result back to a float3 array with size of 4 + inline void store_vec3_batch4(float* out, const vec3_batch4& v) + { + alignas(16) float xs[8]; + alignas(16) float ys[8]; + alignas(16) float zs[8]; + + _mm_store_ps(xs, v.x); + _mm_store_ps(ys, v.y); + _mm_store_ps(zs, v.z); + + for (int i = 0; i < 4; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b) + { + f32x4 mulx = _mm_mul_ps(a.x, b.x); + f32x4 muly = _mm_mul_ps(a.y, b.y); + f32x4 mulz = _mm_mul_ps(a.z, b.z); + + return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); + } + + inline f32x8 vec3_batch8_dot(const vec3_batch8& a, const vec3_batch8& b) + { + f32x8 mulx = _mm256_mul_ps(a.x, b.x); + f32x8 muly = _mm256_mul_ps(a.y, b.y); + f32x8 mulz = _mm256_mul_ps(a.z, b.z); + + return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz); + } + } diff --git a/Engine/source/math/isa/sse2/sse2_intrinsics.h b/Engine/source/math/isa/sse2/sse2_intrinsics.h index 63243d2d3..0150a78a5 100644 --- a/Engine/source/math/isa/sse2/sse2_intrinsics.h +++ b/Engine/source/math/isa/sse2/sse2_intrinsics.h @@ -149,7 +149,7 @@ namespace return _mm_and_ps(v, mask); } - //------------------------------------------------------ + //------------------------------------------------------ // Fast recip //------------------------------------------------------ @@ -158,7 +158,8 @@ namespace { f32x4 r = _mm_rcp_ps(b); f32x4 two = _mm_set1_ps(2.0f); - return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); } // Divide fast ( b = recip eg 1/b) @@ -166,14 +167,12 @@ namespace inline f32x4 v_rsqrt_nr(f32x4 x) { - f32x4 r = _mm_rsqrt_ps(x); - f32x4 half = _mm_set1_ps(0.5f); f32x4 three = _mm_set1_ps(3.0f); - r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); - - return _mm_mul_ps(r, 
half); + f32x4 r = _mm_rsqrt_ps(x); + f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r); + return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr)); } //------------------------------------------------------ @@ -184,9 +183,9 @@ namespace inline f32x4 v_dot4(f32x4 a, f32x4 b) { f32x4 prod = _mm_mul_ps(a, b); // multiply element-wise - f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1)); + f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(0, 0, 3, 2)); prod = _mm_add_ps(prod, shuf); - shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2)); + shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(0, 0, 0, 1)); prod = _mm_add_ps(prod, shuf); return prod; // f32x4, all lanes = dot(a,b) } @@ -305,10 +304,11 @@ namespace f32x4 z = v_dot4(m.r2, v); f32x4 w = v_dot4(m.r3, v); - // Pack lowest lane of each into a vector - f32x4 xy = _mm_unpacklo_ps(x, y); - f32x4 zw = _mm_unpacklo_ps(z, w); - return _mm_movelh_ps(xy, zw); + // combine to a vector + return _mm_set_ps(_mm_cvtss_f32(w), + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); } inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) @@ -531,4 +531,74 @@ namespace f32x4 c0 = v_cross(r1, r2); return v_dot3(r0, c0); // splatted determinant } + + //--------------------------------------------------------- + // BATCH INTRINSICS + //--------------------------------------------------------- + + struct vec4_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + f32x4 w; + }; + + struct vec3_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + }; + + + inline vec3_batch4 load_vec3_batch4(const float* ptr) + { + vec3_batch4 r; + + r.x = _mm_set_ps( + ptr[9], ptr[6], ptr[3], ptr[0]); + + r.y = _mm_set_ps( + ptr[10], ptr[7], ptr[4], ptr[1]); + + r.z = _mm_set_ps( + ptr[11], ptr[8], ptr[5], ptr[2]); + + return r; + } + + // Store the result back out to a float array 4 at a time + inline void store_f32x4(float* out, f32x4 v) + { + _mm_storeu_ps(out, v); + } + + // Store the result back to a float3 array with size of 4 + inline void 
store_vec3_batch4(float* out, const vec3_batch4& v) + { + alignas(16) float xs[8]; + alignas(16) float ys[8]; + alignas(16) float zs[8]; + + _mm_store_ps(xs, v.x); + _mm_store_ps(ys, v.y); + _mm_store_ps(zs, v.z); + + for (int i = 0; i < 4; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b) + { + f32x4 mulx = _mm_mul_ps(a.x, b.x); + f32x4 muly = _mm_mul_ps(a.y, b.y); + f32x4 mulz = _mm_mul_ps(a.z, b.z); + + return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); + } } diff --git a/Engine/source/math/isa/sse41/sse41_intrinsics.h b/Engine/source/math/isa/sse41/sse41_intrinsics.h index 58d66ca6c..4134480aa 100644 --- a/Engine/source/math/isa/sse41/sse41_intrinsics.h +++ b/Engine/source/math/isa/sse41/sse41_intrinsics.h @@ -148,7 +148,8 @@ namespace { f32x4 r = _mm_rcp_ps(b); f32x4 two = _mm_set1_ps(2.0f); - return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); } // Divide fast ( b = recip eg 1/b) @@ -156,14 +157,12 @@ namespace inline f32x4 v_rsqrt_nr(f32x4 x) { - f32x4 r = _mm_rsqrt_ps(x); - f32x4 half = _mm_set1_ps(0.5f); f32x4 three = _mm_set1_ps(3.0f); - r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); - - return _mm_mul_ps(r, half); + f32x4 r = _mm_rsqrt_ps(x); + f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r); + return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr)); } //------------------------------------------------------ @@ -502,4 +501,74 @@ namespace f32x4 c0 = v_cross(r1, r2); return v_dot3(r0, c0); // splatted determinant } + + //--------------------------------------------------------- + // BATCH INTRINSICS + //--------------------------------------------------------- + + struct vec4_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + f32x4 w; + }; + + struct vec3_batch4 + { + f32x4 
x; + f32x4 y; + f32x4 z; + }; + + + inline vec3_batch4 load_vec3_batch4(const float* ptr) + { + vec3_batch4 r; + + r.x = _mm_set_ps( + ptr[9], ptr[6], ptr[3], ptr[0]); + + r.y = _mm_set_ps( + ptr[10], ptr[7], ptr[4], ptr[1]); + + r.z = _mm_set_ps( + ptr[11], ptr[8], ptr[5], ptr[2]); + + return r; + } + + // Store the result back out to a float array 4 at a time + inline void store_f32x4(float* out, f32x4 v) + { + _mm_storeu_ps(out, v); + } + + // Store the result back to a float3 array with size of 4 + inline void store_vec3_batch4(float* out, const vec3_batch4& v) + { + alignas(16) float xs[8]; + alignas(16) float ys[8]; + alignas(16) float zs[8]; + + _mm_store_ps(xs, v.x); + _mm_store_ps(ys, v.y); + _mm_store_ps(zs, v.z); + + for (int i = 0; i < 4; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b) + { + f32x4 mulx = _mm_mul_ps(a.x, b.x); + f32x4 muly = _mm_mul_ps(a.y, b.y); + f32x4 mulz = _mm_mul_ps(a.z, b.z); + + return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); + } } From add7f2a5d7710cd10273a62b636f53bce848042c Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Thu, 5 Mar 2026 18:54:28 +0000 Subject: [PATCH 19/23] more intrinsics add transform plane added first batch function for mulp to intrinsics --- Engine/source/math/impl/mat44_impl.inl | 82 ++++++++ Engine/source/math/impl/math_c.cpp | 92 +++++++++ Engine/source/math/isa/avx/avx_intrinsics.h | 181 +++++++++++++----- Engine/source/math/isa/avx/mat44.cpp | 3 + Engine/source/math/isa/avx2/avx2_intrinsics.h | 173 +++++++++++++---- Engine/source/math/isa/avx2/mat44.cpp | 2 + Engine/source/math/isa/neon/neon_intrinsics.h | 91 +++++++++ Engine/source/math/isa/sse2/mat44.cpp | 2 + Engine/source/math/isa/sse2/sse2_intrinsics.h | 73 +++++-- Engine/source/math/isa/sse41/mat44.cpp | 2 + .../source/math/isa/sse41/sse41_intrinsics.h | 73 +++++-- Engine/source/math/mMatrix.h | 40 +++- 
Engine/source/math/public/mat44_dispatch.h | 3 + Engine/source/math/util/frustum.cpp | 6 +- 14 files changed, 710 insertions(+), 113 deletions(-) diff --git a/Engine/source/math/impl/mat44_impl.inl b/Engine/source/math/impl/mat44_impl.inl index ab0799b08..c203ecf5e 100644 --- a/Engine/source/math/impl/mat44_impl.inl +++ b/Engine/source/math/impl/mat44_impl.inl @@ -32,6 +32,53 @@ namespace math_backend::mat44 m_store(m, ma); } + inline void mat44_transform_plane_impl(const float* m, const float* scale, const float* plane, float* plane_result) + { + f32x4x4 M = m_load(m); + + f32x4 plane_v = v_load(plane); + f32x4 scale_v = v_load3_vec(scale); + f32x4 invScale = v_rcp_nr(scale_v); + + // normal = plane.xyz + f32x4 normal = plane_v; + + // apply Inv(s) + normal = v_mul(normal, invScale); + + // multiply by Inv(Tr(m)) (only the rotation part matters) + f32x4 nx = v_mul(v_swizzle_singular_mask(normal, 0), M.r0); + f32x4 ny = v_mul(v_swizzle_singular_mask(normal, 1), M.r1); + f32x4 nz = v_mul(v_swizzle_singular_mask(normal, 2), M.r2); + + normal = v_add(v_add(nx, ny), nz); + + normal = v_normalize3(normal); + + // compute point on plane + float d = v_extract0(v_swizzle_singular_mask(plane_v, 3)); + + f32x4 point = v_mul(plane_v, v_set1(-d)); + point = v_preserve_w(point, v_set1(1.0f)); + + // apply scale + point = v_mul(point, scale_v); + + // transform point by matrix + point = m_mul_vec4(M, point); + + // compute new plane distance + float newD = -v_extract0(v_dot3(point, normal)); + + alignas(16) float n[4]; + v_store(n, normal); + + plane_result[0] = n[0]; + plane_result[1] = n[1]; + plane_result[2] = n[2]; + plane_result[3] = newD; + } + inline void mat44_get_scale_impl(const float* m, float* s) { f32x4x4 ma = m_load(m); @@ -341,4 +388,39 @@ namespace math_backend::mat44 m_store(m, mo); } + //-------------------------------------------------- + // MATRIX BATCH FUNCTIONS + //-------------------------------------------------- + + inline void mat44_batch_mul_pos3(const 
float* m, const float* points, size_t count, float* result) + { + size_t i = 0; + f32x4x4 ma = m_load(m); + + // AVX has 8 lanes to play with +#if defined(MATH_SIMD_AVX2) || defined(MATH_SIMD_AVX) + // 8-wide AVX only + for (; i + 8 <= count; i += 8) + { + vec4_batch8 va = load_vec3_batch8(&points[i*3], 1.0f, false); + vec4_batch8 vr = m_mul_pos3_batch8(ma, va); + store_vec3_batch8(&result[i*3], vr); + } +#endif // MATH_SIMD_AVX2 || MATH_SIMD_AVX + + // 4-wide + for (; i + 4 <= count; i += 4) + { + vec4_batch4 va = load_vec3_batch4(&points[i * 3], 1.0f, false); + vec4_batch4 vr = m_mul_pos3_batch4(ma, va); + store_vec3_batch4(&result[i * 3], vr); + } + + for (; i < count; ++i) + { + size_t idx = i * 3; + mat44_mul_pos3_impl(m, &points[idx], &result[idx]); + } + } + } // namespace math_backend::mat44 diff --git a/Engine/source/math/impl/math_c.cpp b/Engine/source/math/impl/math_c.cpp index ac6358b60..6d45350fa 100644 --- a/Engine/source/math/impl/math_c.cpp +++ b/Engine/source/math/impl/math_c.cpp @@ -3,6 +3,7 @@ #include "math/public/float3_dispatch.h" #include "math/public/mat44_dispatch.h" #include "math/mConstants.h" +#include "math/mMatrix.h" #include // for sqrtf, etc. namespace math_backend::float4::dispatch @@ -375,6 +376,88 @@ namespace math_backend::mat44::dispatch mresult[15]= a[12]*b[3]+ a[13]*b[7]+ a[14]*b[11]+ a[15]*b[15]; }; + gMat44.transform_plane = [](const F32* m, const F32* s, const F32* p, F32* presult) { + // We take in a matrix, a scale factor, and a plane equation. 
We want to output + // the resultant normal + // We have T = m*s + // To multiply the normal, we want Inv(Tr(m*s)) + // Inv(Tr(ms)) = Inv(Tr(s) * Tr(m)) + // = Inv(Tr(m)) * Inv(Tr(s)) + // + // Inv(Tr(s)) = Inv(s) = [ 1/x 0 0 0] + // [ 0 1/y 0 0] + // [ 0 0 1/z 0] + // [ 0 0 0 1] + // + // Since m is an affine matrix, + // Tr(m) = [ [ ] 0 ] + // [ [ R ] 0 ] + // [ [ ] 0 ] + // [ [ x y z ] 1 ] + // + // Inv(Tr(m)) = [ [ -1 ] 0 ] + // [ [ R ] 0 ] + // [ [ ] 0 ] + // [ [ A B C ] 1 ] + // Where: + // + // P = (x, y, z) + // A = -(Row(0, r) * P); + // B = -(Row(1, r) * P); + // C = -(Row(2, r) * P); + + MatrixF invScale(true); + F32* pScaleElems = invScale; + pScaleElems[MatrixF::idx(0, 0)] = 1.0f / s[0]; + pScaleElems[MatrixF::idx(1, 1)] = 1.0f / s[1]; + pScaleElems[MatrixF::idx(2, 2)] = 1.0f / s[2]; + + const Point3F shear(m[MatrixF::idx(3, 0)], m[MatrixF::idx(3, 1)], m[MatrixF::idx(3, 2)]); + + const Point3F row0(m[MatrixF::idx(0, 0)], m[MatrixF::idx(0, 1)], m[MatrixF::idx(0, 2)]); + const Point3F row1(m[MatrixF::idx(1, 0)], m[MatrixF::idx(1, 1)], m[MatrixF::idx(1, 2)]); + const Point3F row2(m[MatrixF::idx(2, 0)], m[MatrixF::idx(2, 1)], m[MatrixF::idx(2, 2)]); + + const F32 A = -mDot(row0, shear); + const F32 B = -mDot(row1, shear); + const F32 C = -mDot(row2, shear); + + MatrixF invTrMatrix(true); + F32* destMat = invTrMatrix; + destMat[MatrixF::idx(0, 0)] = m[MatrixF::idx(0, 0)]; + destMat[MatrixF::idx(1, 0)] = m[MatrixF::idx(1, 0)]; + destMat[MatrixF::idx(2, 0)] = m[MatrixF::idx(2, 0)]; + destMat[MatrixF::idx(0, 1)] = m[MatrixF::idx(0, 1)]; + destMat[MatrixF::idx(1, 1)] = m[MatrixF::idx(1, 1)]; + destMat[MatrixF::idx(2, 1)] = m[MatrixF::idx(2, 1)]; + destMat[MatrixF::idx(0, 2)] = m[MatrixF::idx(0, 2)]; + destMat[MatrixF::idx(1, 2)] = m[MatrixF::idx(1, 2)]; + destMat[MatrixF::idx(2, 2)] = m[MatrixF::idx(2, 2)]; + destMat[MatrixF::idx(0, 3)] = A; + destMat[MatrixF::idx(1, 3)] = B; + destMat[MatrixF::idx(2, 3)] = C; + invTrMatrix.mul(invScale); + + Point3F norm(p[0], 
p[1], p[2]); + Point3F point = norm * -p[3]; + invTrMatrix.mulP(norm); + norm.normalize(); + + MatrixF temp; + dMemcpy(temp, m, sizeof(F32) * 16); + point.x *= s[0]; + point.y *= s[1]; + point.z *= s[2]; + temp.mulP(point); + + PlaneF resultPlane(point, norm); + presult[0] = resultPlane.x; + presult[1] = resultPlane.y; + presult[2] = resultPlane.z; + presult[3] = resultPlane.d; + + }; + gMat44.normalize = [](float* a) { F32 col0[3], col1[3], col2[3]; // extract columns 0 and 1 @@ -404,5 +487,14 @@ namespace math_backend::mat44::dispatch a[10] = col2[2]; }; + + gMat44.batch_mul_pos3 = [](const float* m, const float* pts, size_t count, float* result_ptrs) { + size_t i = 0; + for (; i < count; i++) + { + size_t idx = i * 3; + gMat44.mul_pos3(m, &pts[idx], &result_ptrs[idx]); + } + }; } } diff --git a/Engine/source/math/isa/avx/avx_intrinsics.h b/Engine/source/math/isa/avx/avx_intrinsics.h index 28a5fa4fa..069b15850 100644 --- a/Engine/source/math/isa/avx/avx_intrinsics.h +++ b/Engine/source/math/isa/avx/avx_intrinsics.h @@ -57,7 +57,7 @@ namespace // Shuffle helpers //------------------------------------------------------ - inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, const int x, const int y, const int z, const int w) + inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w) { f32x4 lo = v1; f32x4 hi = v2; @@ -160,13 +160,13 @@ namespace // full dot4 inline f32x4 v_dot4(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into lane 1 + return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into all lanes } // dot3 (ignores w) inline f32x4 v_dot3(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into lane 1 + return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into all lanes } // cross product xyz only. 
@@ -182,7 +182,7 @@ namespace inline f32x4 v_normalize3(f32x4 v) { - const f32x4 zero = _mm_setzero_ps(); + const f32x4 zero = _mm_setzero_ps(); const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0} f32x4 dot = v_dot3(v, v); @@ -405,13 +405,6 @@ namespace f32x8 w; }; - struct vec3_batch8 - { - f32x8 x; - f32x8 y; - f32x8 z; - }; - struct vec4_batch4 { f32x4 x; @@ -420,47 +413,37 @@ namespace f32x4 w; }; - struct vec3_batch4 + inline vec4_batch8 load_vec3_batch8(const float* ptr, float w, bool fillW) { - f32x4 x; - f32x4 y; - f32x4 z; - }; + vec4_batch8 r; - inline vec3_batch8 load_vec3_batch8(const float* ptr) - { - // ptr points to x0 - // layout: x y z x y z ... + r.x = _mm256_set_ps(ptr[0], ptr[3], ptr[6], ptr[9], + ptr[12], ptr[15], ptr[18], ptr[21]); + r.y = _mm256_set_ps(ptr[1], ptr[4], ptr[7], ptr[10], + ptr[13], ptr[16], ptr[19], ptr[22]); + r.z = _mm256_set_ps(ptr[2], ptr[5], ptr[8], ptr[11], + ptr[14], ptr[17], ptr[20], ptr[23]); - vec3_batch8 r; - - r.x = _mm256_set_ps( - ptr[21], ptr[18], ptr[15], ptr[12], - ptr[9], ptr[6], ptr[3], ptr[0]); - - r.y = _mm256_set_ps( - ptr[22], ptr[19], ptr[16], ptr[13], - ptr[10], ptr[7], ptr[4], ptr[1]); - - r.z = _mm256_set_ps( - ptr[23], ptr[20], ptr[17], ptr[14], - ptr[11], ptr[8], ptr[5], ptr[2]); + if (fillW) + { + r.w = _mm256_set1_ps(w); + } return r; } - inline vec3_batch4 load_vec3_batch4(const float* ptr) + inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW) { - vec3_batch4 r; + vec4_batch4 r; - r.x = _mm_set_ps( - ptr[9], ptr[6], ptr[3], ptr[0]); + r.x = _mm_set_ps(ptr[0], ptr[3], ptr[6], ptr[9]); + r.y = _mm_set_ps(ptr[1], ptr[4], ptr[7], ptr[10]); + r.z = _mm_set_ps(ptr[2], ptr[5], ptr[8], ptr[11]); - r.y = _mm_set_ps( - ptr[10], ptr[7], ptr[4], ptr[1]); - - r.z = _mm_set_ps( - ptr[11], ptr[8], ptr[5], ptr[2]); + if (fillW) + { + r.w = _mm_set1_ps(w); + } return r; } @@ -478,7 +461,7 @@ namespace } // Store the result back to a float3 array with size of 8 - inline 
void store_vec3_batch8(float* out, const vec3_batch8& v) + inline void store_vec3_batch8(float* out, const vec4_batch8& v) { alignas(32) float xs[8]; alignas(32) float ys[8]; @@ -497,7 +480,7 @@ namespace } // Store the result back to a float3 array with size of 4 - inline void store_vec3_batch4(float* out, const vec3_batch4& v) + inline void store_vec3_batch4(float* out, const vec4_batch4& v) { alignas(16) float xs[8]; alignas(16) float ys[8]; @@ -515,7 +498,7 @@ namespace } } - inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b) + inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b) { f32x4 mulx = _mm_mul_ps(a.x, b.x); f32x4 muly = _mm_mul_ps(a.y, b.y); @@ -524,7 +507,7 @@ namespace return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); } - inline f32x8 vec3_batch8_dot(const vec3_batch8& a, const vec3_batch8& b) + inline f32x8 vec3_batch8_dot(const vec4_batch8& a, const vec4_batch8& b) { f32x8 mulx = _mm256_mul_ps(a.x, b.x); f32x8 muly = _mm256_mul_ps(a.y, b.y); @@ -533,4 +516,110 @@ namespace return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz); } + // Batch 8 mul_Vec4. 
+ inline vec4_batch8 m_mul_pos3_batch8(const f32x4x4& m, const vec4_batch8& v) + { + vec4_batch8 r; + + __m256 m00 = _mm256_set1_ps(m.r0.m128_f32[0]); + __m256 m01 = _mm256_set1_ps(m.r0.m128_f32[1]); + __m256 m02 = _mm256_set1_ps(m.r0.m128_f32[2]); + __m256 m03 = _mm256_set1_ps(m.r0.m128_f32[3]); + + __m256 m10 = _mm256_set1_ps(m.r1.m128_f32[0]); + __m256 m11 = _mm256_set1_ps(m.r1.m128_f32[1]); + __m256 m12 = _mm256_set1_ps(m.r1.m128_f32[2]); + __m256 m13 = _mm256_set1_ps(m.r1.m128_f32[3]); + + __m256 m20 = _mm256_set1_ps(m.r2.m128_f32[0]); + __m256 m21 = _mm256_set1_ps(m.r2.m128_f32[1]); + __m256 m22 = _mm256_set1_ps(m.r2.m128_f32[2]); + __m256 m23 = _mm256_set1_ps(m.r2.m128_f32[3]); + + // + // row0 dot + // + r.x = _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps(v.x, m00), + _mm256_mul_ps(v.y, m01)), + _mm256_add_ps( + _mm256_mul_ps(v.z, m02), m03)); + + // + // row1 dot + // + r.y = _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps(v.x, m10), + _mm256_mul_ps(v.y, m11)), + _mm256_add_ps( + _mm256_mul_ps(v.z, m12), m13)); + + // + // row2 dot + // + r.z = _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps(v.x, m20), + _mm256_mul_ps(v.y, m21)), + _mm256_add_ps( + _mm256_mul_ps(v.z, m22), m23)); + + return r; + } + + // Batch 4 mul_Vec4. 
+ inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v) + { + vec4_batch4 r; + + f32x4 m00 = _mm_set1_ps(m.r0.m128_f32[0]); + f32x4 m01 = _mm_set1_ps(m.r0.m128_f32[1]); + f32x4 m02 = _mm_set1_ps(m.r0.m128_f32[2]); + f32x4 m03 = _mm_set1_ps(m.r0.m128_f32[3]); + + f32x4 m10 = _mm_set1_ps(m.r1.m128_f32[0]); + f32x4 m11 = _mm_set1_ps(m.r1.m128_f32[1]); + f32x4 m12 = _mm_set1_ps(m.r1.m128_f32[2]); + f32x4 m13 = _mm_set1_ps(m.r1.m128_f32[3]); + + f32x4 m20 = _mm_set1_ps(m.r2.m128_f32[0]); + f32x4 m21 = _mm_set1_ps(m.r2.m128_f32[1]); + f32x4 m22 = _mm_set1_ps(m.r2.m128_f32[2]); + f32x4 m23 = _mm_set1_ps(m.r2.m128_f32[3]); + + // + // row0 dot + // + r.x = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m00), + _mm_mul_ps(v.y, m01)), + _mm_add_ps( + _mm_mul_ps(v.z, m02), m03)); + + // + // row1 dot + // + r.y = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m10), + _mm_mul_ps(v.y, m11)), + _mm_add_ps( + _mm_mul_ps(v.z, m12), m13)); + + // + // row2 dot + // + r.z = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m20), + _mm_mul_ps(v.y, m21)), + _mm_add_ps( + _mm_mul_ps(v.z, m22), m23)); + + return r; + } + } diff --git a/Engine/source/math/isa/avx/mat44.cpp b/Engine/source/math/isa/avx/mat44.cpp index 50d64b6a2..b0fddcd1c 100644 --- a/Engine/source/math/isa/avx/mat44.cpp +++ b/Engine/source/math/isa/avx/mat44.cpp @@ -15,9 +15,12 @@ namespace math_backend::mat44::dispatch gMat44.mul_pos3 = mat44_mul_pos3_impl; gMat44.mul_vec3 = mat44_mul_vec3_impl; gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.transform_plane = mat44_transform_plane_impl; gMat44.scale = mat44_scale_impl; gMat44.get_scale = mat44_get_scale_impl; gMat44.normalize = mat44_normalize_impl; gMat44.determinant = mat44_get_determinant; + + gMat44.batch_mul_pos3 = mat44_batch_mul_pos3; } } diff --git a/Engine/source/math/isa/avx2/avx2_intrinsics.h b/Engine/source/math/isa/avx2/avx2_intrinsics.h index bb367f198..a10ba77d8 100644 --- a/Engine/source/math/isa/avx2/avx2_intrinsics.h +++ 
b/Engine/source/math/isa/avx2/avx2_intrinsics.h @@ -405,13 +405,6 @@ namespace f32x8 w; }; - struct vec3_batch8 - { - f32x8 x; - f32x8 y; - f32x8 z; - }; - struct vec4_batch4 { f32x4 x; @@ -420,47 +413,37 @@ namespace f32x4 w; }; - struct vec3_batch4 + inline vec4_batch8 load_vec3_batch8(const float* ptr, float w, bool fillW) { - f32x4 x; - f32x4 y; - f32x4 z; - }; + vec4_batch8 r; - inline vec3_batch8 load_vec3_batch8(const float* ptr) - { - // ptr points to x0 - // layout: x y z x y z ... + r.x = _mm256_set_ps(ptr[0], ptr[3], ptr[6], ptr[9], + ptr[12], ptr[15], ptr[18], ptr[21]); + r.y = _mm256_set_ps(ptr[1], ptr[4], ptr[7], ptr[10], + ptr[13], ptr[16], ptr[19], ptr[22]); + r.z = _mm256_set_ps(ptr[2], ptr[5], ptr[8], ptr[11], + ptr[14], ptr[17], ptr[20], ptr[23]); - vec3_batch8 r; - - r.x = _mm256_set_ps( - ptr[21], ptr[18], ptr[15], ptr[12], - ptr[9], ptr[6], ptr[3], ptr[0]); - - r.y = _mm256_set_ps( - ptr[22], ptr[19], ptr[16], ptr[13], - ptr[10], ptr[7], ptr[4], ptr[1]); - - r.z = _mm256_set_ps( - ptr[23], ptr[20], ptr[17], ptr[14], - ptr[11], ptr[8], ptr[5], ptr[2]); + if (fillW) + { + r.w = _mm256_set1_ps(w); + } return r; } - inline vec3_batch4 load_vec3_batch4(const float* ptr) + inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW) { - vec3_batch4 r; + vec4_batch4 r; - r.x = _mm_set_ps( - ptr[9], ptr[6], ptr[3], ptr[0]); + r.x = _mm_set_ps(ptr[0], ptr[3], ptr[6], ptr[9]); + r.y = _mm_set_ps(ptr[1], ptr[4], ptr[7], ptr[10]); + r.z = _mm_set_ps(ptr[2], ptr[5], ptr[8], ptr[11]); - r.y = _mm_set_ps( - ptr[10], ptr[7], ptr[4], ptr[1]); - - r.z = _mm_set_ps( - ptr[11], ptr[8], ptr[5], ptr[2]); + if (fillW) + { + r.w = _mm_set1_ps(w); + } return r; } @@ -478,7 +461,7 @@ namespace } // Store the result back to a float3 array with size of 8 - inline void store_vec3_batch8(float* out, const vec3_batch8& v) + inline void store_vec3_batch8(float* out, const vec4_batch8& v) { alignas(32) float xs[8]; alignas(32) float ys[8]; @@ -497,7 +480,7 @@ 
namespace } // Store the result back to a float3 array with size of 4 - inline void store_vec3_batch4(float* out, const vec3_batch4& v) + inline void store_vec3_batch4(float* out, const vec4_batch4& v) { alignas(16) float xs[8]; alignas(16) float ys[8]; @@ -515,7 +498,7 @@ namespace } } - inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b) + inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b) { f32x4 mulx = _mm_mul_ps(a.x, b.x); f32x4 muly = _mm_mul_ps(a.y, b.y); @@ -524,7 +507,7 @@ namespace return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); } - inline f32x8 vec3_batch8_dot(const vec3_batch8& a, const vec3_batch8& b) + inline f32x8 vec3_batch8_dot(const vec4_batch8& a, const vec4_batch8& b) { f32x8 mulx = _mm256_mul_ps(a.x, b.x); f32x8 muly = _mm256_mul_ps(a.y, b.y); @@ -533,4 +516,110 @@ namespace return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz); } + // Batch 8 mul_Vec4. + inline vec4_batch8 m_mul_pos3_batch8(const f32x4x4& m, const vec4_batch8& v) + { + vec4_batch8 r; + + __m256 m00 = _mm256_set1_ps(m.r0.m128_f32[0]); + __m256 m01 = _mm256_set1_ps(m.r0.m128_f32[1]); + __m256 m02 = _mm256_set1_ps(m.r0.m128_f32[2]); + __m256 m03 = _mm256_set1_ps(m.r0.m128_f32[3]); + + __m256 m10 = _mm256_set1_ps(m.r1.m128_f32[0]); + __m256 m11 = _mm256_set1_ps(m.r1.m128_f32[1]); + __m256 m12 = _mm256_set1_ps(m.r1.m128_f32[2]); + __m256 m13 = _mm256_set1_ps(m.r1.m128_f32[3]); + + __m256 m20 = _mm256_set1_ps(m.r2.m128_f32[0]); + __m256 m21 = _mm256_set1_ps(m.r2.m128_f32[1]); + __m256 m22 = _mm256_set1_ps(m.r2.m128_f32[2]); + __m256 m23 = _mm256_set1_ps(m.r2.m128_f32[3]); + + // + // row0 dot + // + r.x = _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps(v.x, m00), + _mm256_mul_ps(v.y, m01)), + _mm256_add_ps( + _mm256_mul_ps(v.z, m02), m03)); + + // + // row1 dot + // + r.y = _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps(v.x, m10), + _mm256_mul_ps(v.y, m11)), + _mm256_add_ps( + _mm256_mul_ps(v.z, m12), m13)); + + // + // row2 dot + // + 
r.z = _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps(v.x, m20), + _mm256_mul_ps(v.y, m21)), + _mm256_add_ps( + _mm256_mul_ps(v.z, m22), m23)); + + return r; + } + + // Batch 4 mul_Vec4. + inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v) + { + vec4_batch4 r; + + f32x4 m00 = _mm_set1_ps(m.r0.m128_f32[0]); + f32x4 m01 = _mm_set1_ps(m.r0.m128_f32[1]); + f32x4 m02 = _mm_set1_ps(m.r0.m128_f32[2]); + f32x4 m03 = _mm_set1_ps(m.r0.m128_f32[3]); + + f32x4 m10 = _mm_set1_ps(m.r1.m128_f32[0]); + f32x4 m11 = _mm_set1_ps(m.r1.m128_f32[1]); + f32x4 m12 = _mm_set1_ps(m.r1.m128_f32[2]); + f32x4 m13 = _mm_set1_ps(m.r1.m128_f32[3]); + + f32x4 m20 = _mm_set1_ps(m.r2.m128_f32[0]); + f32x4 m21 = _mm_set1_ps(m.r2.m128_f32[1]); + f32x4 m22 = _mm_set1_ps(m.r2.m128_f32[2]); + f32x4 m23 = _mm_set1_ps(m.r2.m128_f32[3]); + + // + // row0 dot + // + r.x = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m00), + _mm_mul_ps(v.y, m01)), + _mm_add_ps( + _mm_mul_ps(v.z, m02), m03)); + + // + // row1 dot + // + r.y = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m10), + _mm_mul_ps(v.y, m11)), + _mm_add_ps( + _mm_mul_ps(v.z, m12), m13)); + + // + // row2 dot + // + r.z = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m20), + _mm_mul_ps(v.y, m21)), + _mm_add_ps( + _mm_mul_ps(v.z, m22), m23)); + + return r; + } + } diff --git a/Engine/source/math/isa/avx2/mat44.cpp b/Engine/source/math/isa/avx2/mat44.cpp index e6b891e9e..83c90a647 100644 --- a/Engine/source/math/isa/avx2/mat44.cpp +++ b/Engine/source/math/isa/avx2/mat44.cpp @@ -15,9 +15,11 @@ namespace math_backend::mat44::dispatch gMat44.mul_pos3 = mat44_mul_pos3_impl; gMat44.mul_vec3 = mat44_mul_vec3_impl; gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.transform_plane = mat44_transform_plane_impl; gMat44.scale = mat44_scale_impl; gMat44.get_scale = mat44_get_scale_impl; gMat44.normalize = mat44_normalize_impl; gMat44.determinant = mat44_get_determinant; + gMat44.batch_mul_pos3 = mat44_batch_mul_pos3; } } diff --git 
a/Engine/source/math/isa/neon/neon_intrinsics.h b/Engine/source/math/isa/neon/neon_intrinsics.h index 888adab45..a02da6391 100644 --- a/Engine/source/math/isa/neon/neon_intrinsics.h +++ b/Engine/source/math/isa/neon/neon_intrinsics.h @@ -410,4 +410,95 @@ namespace return m_transpose(adj); } + struct vec4_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + f32x4 w; + }; + + inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW) + { + vec4_batch4 r; + + r.x = (f32x4){ ptr[9], ptr[6], ptr[3], ptr[0] }; + r.y = (f32x4){ ptr[10], ptr[7], ptr[4], ptr[1] }; + r.z = (f32x4){ ptr[11], ptr[8], ptr[5], ptr[2] }; + + if (fillW) + { + r.w = vdupq_n_f32(w); + } + + return r; + } + + inline void store_vec3_batch4(float* out, const vec4_batch4& v) + { + alignas(16) float xs[4]; + alignas(16) float ys[4]; + alignas(16) float zs[4]; + + vst1q_f32(xs, v.x); + vst1q_f32(ys, v.y); + vst1q_f32(zs, v.z); + + for (int i = 0; i < 4; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v) + { + vec4_batch4 r; + + float32x4_t m00 = vdupq_n_f32(m.r0.m128_f32[0]); + float32x4_t m01 = vdupq_n_f32(m.r0.m128_f32[1]); + float32x4_t m02 = vdupq_n_f32(m.r0.m128_f32[2]); + float32x4_t m03 = vdupq_n_f32(m.r0.m128_f32[3]); + + float32x4_t m10 = vdupq_n_f32(m.r1.m128_f32[0]); + float32x4_t m11 = vdupq_n_f32(m.r1.m128_f32[1]); + float32x4_t m12 = vdupq_n_f32(m.r1.m128_f32[2]); + float32x4_t m13 = vdupq_n_f32(m.r1.m128_f32[3]); + + float32x4_t m20 = vdupq_n_f32(m.r2.m128_f32[0]); + float32x4_t m21 = vdupq_n_f32(m.r2.m128_f32[1]); + float32x4_t m22 = vdupq_n_f32(m.r2.m128_f32[2]); + float32x4_t m23 = vdupq_n_f32(m.r2.m128_f32[3]); + + // row0 dot + r.x = vaddq_f32( + vaddq_f32( + vmulq_f32(v.x, m00), + vmulq_f32(v.y, m01)), + vaddq_f32( + vmulq_f32(v.z, m02), + m03)); + + // row1 dot + r.y = vaddq_f32( + vaddq_f32( + vmulq_f32(v.x, m10), + vmulq_f32(v.y, m11)), + vaddq_f32( 
+ vmulq_f32(v.z, m12), + m13)); + + // row2 dot + r.z = vaddq_f32( + vaddq_f32( + vmulq_f32(v.x, m20), + vmulq_f32(v.y, m21)), + vaddq_f32( + vmulq_f32(v.z, m22), + m23)); + + return r; + } + } diff --git a/Engine/source/math/isa/sse2/mat44.cpp b/Engine/source/math/isa/sse2/mat44.cpp index 2a84c029f..d8d06ea58 100644 --- a/Engine/source/math/isa/sse2/mat44.cpp +++ b/Engine/source/math/isa/sse2/mat44.cpp @@ -15,9 +15,11 @@ namespace math_backend::mat44::dispatch gMat44.mul_pos3 = mat44_mul_pos3_impl; gMat44.mul_vec3 = mat44_mul_vec3_impl; gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.transform_plane = mat44_transform_plane_impl; gMat44.scale = mat44_scale_impl; gMat44.get_scale = mat44_get_scale_impl; gMat44.normalize = mat44_normalize_impl; gMat44.determinant = mat44_get_determinant; + gMat44.batch_mul_pos3 = mat44_batch_mul_pos3; } } diff --git a/Engine/source/math/isa/sse2/sse2_intrinsics.h b/Engine/source/math/isa/sse2/sse2_intrinsics.h index 0150a78a5..df3852340 100644 --- a/Engine/source/math/isa/sse2/sse2_intrinsics.h +++ b/Engine/source/math/isa/sse2/sse2_intrinsics.h @@ -544,17 +544,10 @@ namespace f32x4 w; }; - struct vec3_batch4 - { - f32x4 x; - f32x4 y; - f32x4 z; - }; - - inline vec3_batch4 load_vec3_batch4(const float* ptr) + inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW) { - vec3_batch4 r; + vec4_batch4 r; r.x = _mm_set_ps( ptr[9], ptr[6], ptr[3], ptr[0]); @@ -565,6 +558,11 @@ namespace r.z = _mm_set_ps( ptr[11], ptr[8], ptr[5], ptr[2]); + if (fillW) + { + r.w = _mm_set1_ps(w); + } + return r; } @@ -575,7 +573,7 @@ namespace } // Store the result back to a float3 array with size of 4 - inline void store_vec3_batch4(float* out, const vec3_batch4& v) + inline void store_vec3_batch4(float* out, const vec4_batch4& v) { alignas(16) float xs[8]; alignas(16) float ys[8]; @@ -593,7 +591,7 @@ namespace } } - inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b) + inline f32x4 vec3_batch4_dot(const 
vec4_batch4& a, const vec4_batch4& b) { f32x4 mulx = _mm_mul_ps(a.x, b.x); f32x4 muly = _mm_mul_ps(a.y, b.y); @@ -601,4 +599,57 @@ namespace return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); } + + // Batch 4 mul_Vec4. + inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v) + { + vec4_batch4 r; + + f32x4 m00 = _mm_set1_ps(m.r0.m128_f32[0]); + f32x4 m01 = _mm_set1_ps(m.r0.m128_f32[1]); + f32x4 m02 = _mm_set1_ps(m.r0.m128_f32[2]); + f32x4 m03 = _mm_set1_ps(m.r0.m128_f32[3]); + + f32x4 m10 = _mm_set1_ps(m.r1.m128_f32[0]); + f32x4 m11 = _mm_set1_ps(m.r1.m128_f32[1]); + f32x4 m12 = _mm_set1_ps(m.r1.m128_f32[2]); + f32x4 m13 = _mm_set1_ps(m.r1.m128_f32[3]); + + f32x4 m20 = _mm_set1_ps(m.r2.m128_f32[0]); + f32x4 m21 = _mm_set1_ps(m.r2.m128_f32[1]); + f32x4 m22 = _mm_set1_ps(m.r2.m128_f32[2]); + f32x4 m23 = _mm_set1_ps(m.r2.m128_f32[3]); + + // + // row0 dot + // + r.x = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m00), + _mm_mul_ps(v.y, m01)), + _mm_add_ps( + _mm_mul_ps(v.z, m02), m03)); + + // + // row1 dot + // + r.y = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m10), + _mm_mul_ps(v.y, m11)), + _mm_add_ps( + _mm_mul_ps(v.z, m12), m13)); + + // + // row2 dot + // + r.z = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m20), + _mm_mul_ps(v.y, m21)), + _mm_add_ps( + _mm_mul_ps(v.z, m22), m23)); + + return r; + } } diff --git a/Engine/source/math/isa/sse41/mat44.cpp b/Engine/source/math/isa/sse41/mat44.cpp index 489d495b8..f9e33138b 100644 --- a/Engine/source/math/isa/sse41/mat44.cpp +++ b/Engine/source/math/isa/sse41/mat44.cpp @@ -15,9 +15,11 @@ namespace math_backend::mat44::dispatch gMat44.mul_pos3 = mat44_mul_pos3_impl; gMat44.mul_vec3 = mat44_mul_vec3_impl; gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.transform_plane = mat44_transform_plane_impl; gMat44.scale = mat44_scale_impl; gMat44.get_scale = mat44_get_scale_impl; gMat44.normalize = mat44_normalize_impl; gMat44.determinant = mat44_get_determinant; + gMat44.batch_mul_pos3 = 
mat44_batch_mul_pos3; } } diff --git a/Engine/source/math/isa/sse41/sse41_intrinsics.h b/Engine/source/math/isa/sse41/sse41_intrinsics.h index 4134480aa..45c056dcf 100644 --- a/Engine/source/math/isa/sse41/sse41_intrinsics.h +++ b/Engine/source/math/isa/sse41/sse41_intrinsics.h @@ -514,17 +514,10 @@ namespace f32x4 w; }; - struct vec3_batch4 - { - f32x4 x; - f32x4 y; - f32x4 z; - }; - - inline vec3_batch4 load_vec3_batch4(const float* ptr) + inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW) { - vec3_batch4 r; + vec4_batch4 r; r.x = _mm_set_ps( ptr[9], ptr[6], ptr[3], ptr[0]); @@ -535,6 +528,11 @@ namespace r.z = _mm_set_ps( ptr[11], ptr[8], ptr[5], ptr[2]); + if (fillW) + { + r.w = _mm_set1_ps(w); + } + return r; } @@ -545,7 +543,7 @@ namespace } // Store the result back to a float3 array with size of 4 - inline void store_vec3_batch4(float* out, const vec3_batch4& v) + inline void store_vec3_batch4(float* out, const vec4_batch4& v) { alignas(16) float xs[8]; alignas(16) float ys[8]; @@ -563,7 +561,7 @@ namespace } } - inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b) + inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b) { f32x4 mulx = _mm_mul_ps(a.x, b.x); f32x4 muly = _mm_mul_ps(a.y, b.y); @@ -571,4 +569,57 @@ namespace return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); } + + // Batch 4 mul_Vec4. 
+ inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v) + { + vec4_batch4 r; + + f32x4 m00 = _mm_set1_ps(m.r0.m128_f32[0]); + f32x4 m01 = _mm_set1_ps(m.r0.m128_f32[1]); + f32x4 m02 = _mm_set1_ps(m.r0.m128_f32[2]); + f32x4 m03 = _mm_set1_ps(m.r0.m128_f32[3]); + + f32x4 m10 = _mm_set1_ps(m.r1.m128_f32[0]); + f32x4 m11 = _mm_set1_ps(m.r1.m128_f32[1]); + f32x4 m12 = _mm_set1_ps(m.r1.m128_f32[2]); + f32x4 m13 = _mm_set1_ps(m.r1.m128_f32[3]); + + f32x4 m20 = _mm_set1_ps(m.r2.m128_f32[0]); + f32x4 m21 = _mm_set1_ps(m.r2.m128_f32[1]); + f32x4 m22 = _mm_set1_ps(m.r2.m128_f32[2]); + f32x4 m23 = _mm_set1_ps(m.r2.m128_f32[3]); + + // + // row0 dot + // + r.x = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m00), + _mm_mul_ps(v.y, m01)), + _mm_add_ps( + _mm_mul_ps(v.z, m02), m03)); + + // + // row1 dot + // + r.y = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m10), + _mm_mul_ps(v.y, m11)), + _mm_add_ps( + _mm_mul_ps(v.z, m12), m13)); + + // + // row2 dot + // + r.z = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m20), + _mm_mul_ps(v.y, m21)), + _mm_add_ps( + _mm_mul_ps(v.z, m22), m23)); + + return r; + } } diff --git a/Engine/source/math/mMatrix.h b/Engine/source/math/mMatrix.h index 92bea5fad..873ff967a 100644 --- a/Engine/source/math/mMatrix.h +++ b/Engine/source/math/mMatrix.h @@ -231,6 +231,8 @@ public: void mul( Point4F& p ) const; ///< M * p -> p (full [4x4] * [1x4]) void mulP( Point3F& p ) const; ///< M * p -> p (assume w = 1.0f) void mulP( const Point3F &p, Point3F *d) const; ///< M * p -> d (assume w = 1.0f) + void batch_mulP(Point3F* points, size_t count) const; + void batch_mulP(const Point3F* points, size_t count, Point3F* out) const; void mulV( VectorF& p ) const; ///< M * v -> v (assume w = 0.0f) void mulV( const VectorF &p, Point3F *d) const; ///< M * v -> d (assume w = 0.0f) @@ -477,6 +479,40 @@ inline void MatrixF::mulP( const Point3F &p, Point3F *d) const GetMat44().mul_pos3(*this, p, *d); } +inline void MatrixF::batch_mulP(Point3F* points, 
size_t count) const +{ + // Allocate temporary buffer + Point3F* temp = new Point3F[count]; + + GetMat44().batch_mul_pos3(m, reinterpret_cast<const float*>(points), count, reinterpret_cast<float*>(temp)); + + // Copy the results back to out + for (size_t i = 0; i < count; ++i) + { + points[i] = temp[i]; + } + + // Free temporary buffer + delete[] temp; +} + +inline void MatrixF::batch_mulP(const Point3F* points, size_t count, Point3F* out) const +{ + // Allocate temporary buffer + Point3F* temp = new Point3F[count]; + + GetMat44().batch_mul_pos3(m, reinterpret_cast<const float*>(points), count, reinterpret_cast<float*>(temp)); + + // Copy the results back to out + for (size_t i = 0; i < count; ++i) + { + out[i] = temp[i]; + } + + // Free temporary buffer + delete[] temp; +} + inline void MatrixF::mulV( VectorF& v) const { // M * v -> v @@ -663,7 +699,9 @@ inline bool MatrixF::isNaN() inline void mTransformPlane(const MatrixF& mat, const Point3F& scale, const PlaneF& plane, PlaneF * result) { - m_matF_x_scale_x_planeF(mat, &scale.x, &plane.x, &result->x); + //m_matF_x_scale_x_planeF(mat, &scale.x, &plane.x, &result->x); + + GetMat44().transform_plane(mat, scale, plane, *result); } #else // !USE_TEMPLATE_MATRIX diff --git a/Engine/source/math/public/mat44_dispatch.h b/Engine/source/math/public/mat44_dispatch.h index 2c4342aee..6f6ade3ab 100644 --- a/Engine/source/math/public/mat44_dispatch.h +++ b/Engine/source/math/public/mat44_dispatch.h @@ -16,9 +16,12 @@ namespace math_backend::mat44::dispatch void (*mul_pos3)(const float* a, const float* b, float* r) = nullptr; void (*mul_vec3)(const float* a, const float* b, float* r) = nullptr; void (*mul_float4)(const float* a, const float* b, float* r) = nullptr; + void (*transform_plane)(const float* m, const float* s, const float* p, float* r) = nullptr; float (*determinant)(const float*) = nullptr; void (*scale)(float*, const float*) = nullptr; void (*get_scale)(const float*, float*) = nullptr; + + void (*batch_mul_pos3)(const float* m, const float* pts, size_t
count, float* result_ptrs) = nullptr; }; // Global dispatch table diff --git a/Engine/source/math/util/frustum.cpp b/Engine/source/math/util/frustum.cpp index 632d7cded..eff17a0c1 100644 --- a/Engine/source/math/util/frustum.cpp +++ b/Engine/source/math/util/frustum.cpp @@ -381,8 +381,10 @@ void FrustumData::_update() const // Transform the points into the desired culling space. - for( U32 i = 0; i < mPoints.size(); ++ i ) - mTransform.mulP( mPoints[ i ] ); + /*for( U32 i = 0; i < mPoints.size(); ++ i ) + mTransform.mulP( mPoints[ i ] );*/ + + mTransform.batch_mulP(mPoints.address(), mPoints.size()); // Update the axis aligned bounding box from // the newly transformed points. From c09d5a457902fe5fb6f678ee247b60ad67f6c96f Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Thu, 5 Mar 2026 20:04:33 +0000 Subject: [PATCH 20/23] fix batch on neon --- Engine/source/math/impl/mat44_impl.inl | 4 +-- Engine/source/math/impl/math_c.cpp | 6 ++-- Engine/source/math/isa/neon/neon_intrinsics.h | 30 +++++++++---------- Engine/source/math/public/mat44_dispatch.h | 2 +- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/Engine/source/math/impl/mat44_impl.inl b/Engine/source/math/impl/mat44_impl.inl index c203ecf5e..1bf009a50 100644 --- a/Engine/source/math/impl/mat44_impl.inl +++ b/Engine/source/math/impl/mat44_impl.inl @@ -392,9 +392,9 @@ namespace math_backend::mat44 // MATRIX BATCH FUNCTIONS //-------------------------------------------------- - inline void mat44_batch_mul_pos3(const float* m, const float* points, size_t count, float* result) + inline void mat44_batch_mul_pos3(const float* m, const float* points, int count, float* result) { - size_t i = 0; + int i = 0; f32x4x4 ma = m_load(m); // AVX has 8 lanes to play with diff --git a/Engine/source/math/impl/math_c.cpp b/Engine/source/math/impl/math_c.cpp index 6d45350fa..285f43abc 100644 --- a/Engine/source/math/impl/math_c.cpp +++ b/Engine/source/math/impl/math_c.cpp @@ -488,11 +488,11 @@ namespace 
math_backend::mat44::dispatch }; - gMat44.batch_mul_pos3 = [](const float* m, const float* pts, size_t count, float* result_ptrs) { - size_t i = 0; + gMat44.batch_mul_pos3 = [](const float* m, const float* pts, int count, float* result_ptrs) { + int i = 0; for (; i < count; i++) { - size_t idx = i * 3; + int idx = i * 3; gMat44.mul_pos3(m, &pts[idx], &result_ptrs[idx]); } }; diff --git a/Engine/source/math/isa/neon/neon_intrinsics.h b/Engine/source/math/isa/neon/neon_intrinsics.h index a02da6391..fc93c9c97 100644 --- a/Engine/source/math/isa/neon/neon_intrinsics.h +++ b/Engine/source/math/isa/neon/neon_intrinsics.h @@ -422,9 +422,9 @@ namespace { vec4_batch4 r; - r.x = (f32x4){ ptr[9], ptr[6], ptr[3], ptr[0] }; - r.y = (f32x4){ ptr[10], ptr[7], ptr[4], ptr[1] }; - r.z = (f32x4){ ptr[11], ptr[8], ptr[5], ptr[2] }; + r.x = (f32x4){ ptr[0], ptr[3], ptr[6], ptr[9] }; + r.y = (f32x4){ ptr[1], ptr[4], ptr[7], ptr[10] }; + r.z = (f32x4){ ptr[2], ptr[5], ptr[8], ptr[11] }; if (fillW) { @@ -456,20 +456,20 @@ namespace { vec4_batch4 r; - float32x4_t m00 = vdupq_n_f32(m.r0.m128_f32[0]); - float32x4_t m01 = vdupq_n_f32(m.r0.m128_f32[1]); - float32x4_t m02 = vdupq_n_f32(m.r0.m128_f32[2]); - float32x4_t m03 = vdupq_n_f32(m.r0.m128_f32[3]); + float32x4_t m00 = vdupq_n_f32(m.r0[0]); + float32x4_t m01 = vdupq_n_f32(m.r0[1]); + float32x4_t m02 = vdupq_n_f32(m.r0[2]); + float32x4_t m03 = vdupq_n_f32(m.r0[3]); - float32x4_t m10 = vdupq_n_f32(m.r1.m128_f32[0]); - float32x4_t m11 = vdupq_n_f32(m.r1.m128_f32[1]); - float32x4_t m12 = vdupq_n_f32(m.r1.m128_f32[2]); - float32x4_t m13 = vdupq_n_f32(m.r1.m128_f32[3]); + float32x4_t m10 = vdupq_n_f32(m.r1[0]); + float32x4_t m11 = vdupq_n_f32(m.r1[1]); + float32x4_t m12 = vdupq_n_f32(m.r1[2]); + float32x4_t m13 = vdupq_n_f32(m.r1[3]); - float32x4_t m20 = vdupq_n_f32(m.r2.m128_f32[0]); - float32x4_t m21 = vdupq_n_f32(m.r2.m128_f32[1]); - float32x4_t m22 = vdupq_n_f32(m.r2.m128_f32[2]); - float32x4_t m23 = vdupq_n_f32(m.r2.m128_f32[3]); + 
float32x4_t m20 = vdupq_n_f32(m.r2[0]); + float32x4_t m21 = vdupq_n_f32(m.r2[1]); + float32x4_t m22 = vdupq_n_f32(m.r2[2]); + float32x4_t m23 = vdupq_n_f32(m.r2[3]); // row0 dot r.x = vaddq_f32( diff --git a/Engine/source/math/public/mat44_dispatch.h b/Engine/source/math/public/mat44_dispatch.h index 6f6ade3ab..b49ef9b84 100644 --- a/Engine/source/math/public/mat44_dispatch.h +++ b/Engine/source/math/public/mat44_dispatch.h @@ -21,7 +21,7 @@ namespace math_backend::mat44::dispatch void (*scale)(float*, const float*) = nullptr; void (*get_scale)(const float*, float*) = nullptr; - void (*batch_mul_pos3)(const float* m, const float* pts, size_t count, float* result_ptrs) = nullptr; + void (*batch_mul_pos3)(const float* m, const float* pts, int count, float* result_ptrs) = nullptr; }; // Global dispatch table From 88ffdd01cd9aac5cc684d3c89cf4ee54f6c368af Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Fri, 6 Mar 2026 09:41:35 +0000 Subject: [PATCH 21/23] fix linux build --- Engine/source/math/impl/mat44_impl.inl | 6 +-- Engine/source/math/isa/avx/avx_intrinsics.h | 52 +++++++++---------- Engine/source/math/isa/avx2/avx2_intrinsics.h | 52 +++++++++---------- Engine/source/math/isa/neon/neon_intrinsics.h | 26 +++++----- Engine/source/math/isa/sse2/sse2_intrinsics.h | 26 +++++----- .../source/math/isa/sse41/sse41_intrinsics.h | 26 +++++----- Engine/source/math/mMatrix.h | 12 ++--- 7 files changed, 99 insertions(+), 101 deletions(-) diff --git a/Engine/source/math/impl/mat44_impl.inl b/Engine/source/math/impl/mat44_impl.inl index 1bf009a50..da6971d0d 100644 --- a/Engine/source/math/impl/mat44_impl.inl +++ b/Engine/source/math/impl/mat44_impl.inl @@ -395,15 +395,13 @@ namespace math_backend::mat44 inline void mat44_batch_mul_pos3(const float* m, const float* points, int count, float* result) { int i = 0; - f32x4x4 ma = m_load(m); - // AVX has 8 lanes to play with #if defined(MATH_SIMD_AVX2) || defined(MATH_SIMD_AVX) // 8-wide AVX only for (; i + 8 <= count; i += 8) { 
vec4_batch8 va = load_vec3_batch8(&points[i*3], 1.0f, false); - vec4_batch8 vr = m_mul_pos3_batch8(ma, va); + vec4_batch8 vr = m_mul_pos3_batch8(m, va); store_vec3_batch8(&result[i*3], vr); } #endif // MATH_SIMD_AVX2 || MATH_SIMD_AVX @@ -412,7 +410,7 @@ namespace math_backend::mat44 for (; i + 4 <= count; i += 4) { vec4_batch4 va = load_vec3_batch4(&points[i * 3], 1.0f, false); - vec4_batch4 vr = m_mul_pos3_batch4(ma, va); + vec4_batch4 vr = m_mul_pos3_batch4(m, va); store_vec3_batch4(&result[i * 3], vr); } diff --git a/Engine/source/math/isa/avx/avx_intrinsics.h b/Engine/source/math/isa/avx/avx_intrinsics.h index 069b15850..40337e8c5 100644 --- a/Engine/source/math/isa/avx/avx_intrinsics.h +++ b/Engine/source/math/isa/avx/avx_intrinsics.h @@ -517,24 +517,24 @@ namespace } // Batch 8 mul_Vec4. - inline vec4_batch8 m_mul_pos3_batch8(const f32x4x4& m, const vec4_batch8& v) + inline vec4_batch8 m_mul_pos3_batch8(const float* m, const vec4_batch8& v) { vec4_batch8 r; - __m256 m00 = _mm256_set1_ps(m.r0.m128_f32[0]); - __m256 m01 = _mm256_set1_ps(m.r0.m128_f32[1]); - __m256 m02 = _mm256_set1_ps(m.r0.m128_f32[2]); - __m256 m03 = _mm256_set1_ps(m.r0.m128_f32[3]); + __m256 m00 = _mm256_set1_ps(m[0]); + __m256 m01 = _mm256_set1_ps(m[1]); + __m256 m02 = _mm256_set1_ps(m[2]); + __m256 m03 = _mm256_set1_ps(m[3]); - __m256 m10 = _mm256_set1_ps(m.r1.m128_f32[0]); - __m256 m11 = _mm256_set1_ps(m.r1.m128_f32[1]); - __m256 m12 = _mm256_set1_ps(m.r1.m128_f32[2]); - __m256 m13 = _mm256_set1_ps(m.r1.m128_f32[3]); + __m256 m10 = _mm256_set1_ps(m[4]); + __m256 m11 = _mm256_set1_ps(m[5]); + __m256 m12 = _mm256_set1_ps(m[6]); + __m256 m13 = _mm256_set1_ps(m[7]); - __m256 m20 = _mm256_set1_ps(m.r2.m128_f32[0]); - __m256 m21 = _mm256_set1_ps(m.r2.m128_f32[1]); - __m256 m22 = _mm256_set1_ps(m.r2.m128_f32[2]); - __m256 m23 = _mm256_set1_ps(m.r2.m128_f32[3]); + __m256 m20 = _mm256_set1_ps(m[8]); + __m256 m21 = _mm256_set1_ps(m[9]); + __m256 m22 = _mm256_set1_ps(m[10]); + __m256 m23 = 
_mm256_set1_ps(m[11]); // // row0 dot @@ -570,24 +570,24 @@ namespace } // Batch 4 mul_Vec4. - inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v) + inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v) { vec4_batch4 r; - f32x4 m00 = _mm_set1_ps(m.r0.m128_f32[0]); - f32x4 m01 = _mm_set1_ps(m.r0.m128_f32[1]); - f32x4 m02 = _mm_set1_ps(m.r0.m128_f32[2]); - f32x4 m03 = _mm_set1_ps(m.r0.m128_f32[3]); + f32x4 m00 = _mm_set1_ps(m[0]); + f32x4 m01 = _mm_set1_ps(m[1]); + f32x4 m02 = _mm_set1_ps(m[2]); + f32x4 m03 = _mm_set1_ps(m[3]); - f32x4 m10 = _mm_set1_ps(m.r1.m128_f32[0]); - f32x4 m11 = _mm_set1_ps(m.r1.m128_f32[1]); - f32x4 m12 = _mm_set1_ps(m.r1.m128_f32[2]); - f32x4 m13 = _mm_set1_ps(m.r1.m128_f32[3]); + f32x4 m10 = _mm_set1_ps(m[4]); + f32x4 m11 = _mm_set1_ps(m[5]); + f32x4 m12 = _mm_set1_ps(m[6]); + f32x4 m13 = _mm_set1_ps(m[7]); - f32x4 m20 = _mm_set1_ps(m.r2.m128_f32[0]); - f32x4 m21 = _mm_set1_ps(m.r2.m128_f32[1]); - f32x4 m22 = _mm_set1_ps(m.r2.m128_f32[2]); - f32x4 m23 = _mm_set1_ps(m.r2.m128_f32[3]); + f32x4 m20 = _mm_set1_ps(m[8]); + f32x4 m21 = _mm_set1_ps(m[9]); + f32x4 m22 = _mm_set1_ps(m[10]); + f32x4 m23 = _mm_set1_ps(m[11]); // // row0 dot diff --git a/Engine/source/math/isa/avx2/avx2_intrinsics.h b/Engine/source/math/isa/avx2/avx2_intrinsics.h index a10ba77d8..3291c3521 100644 --- a/Engine/source/math/isa/avx2/avx2_intrinsics.h +++ b/Engine/source/math/isa/avx2/avx2_intrinsics.h @@ -517,24 +517,24 @@ namespace } // Batch 8 mul_Vec4. 
- inline vec4_batch8 m_mul_pos3_batch8(const f32x4x4& m, const vec4_batch8& v) + inline vec4_batch8 m_mul_pos3_batch8(const float* m, const vec4_batch8& v) { vec4_batch8 r; - __m256 m00 = _mm256_set1_ps(m.r0.m128_f32[0]); - __m256 m01 = _mm256_set1_ps(m.r0.m128_f32[1]); - __m256 m02 = _mm256_set1_ps(m.r0.m128_f32[2]); - __m256 m03 = _mm256_set1_ps(m.r0.m128_f32[3]); + __m256 m00 = _mm256_set1_ps(m[0]); + __m256 m01 = _mm256_set1_ps(m[1]); + __m256 m02 = _mm256_set1_ps(m[2]); + __m256 m03 = _mm256_set1_ps(m[3]); - __m256 m10 = _mm256_set1_ps(m.r1.m128_f32[0]); - __m256 m11 = _mm256_set1_ps(m.r1.m128_f32[1]); - __m256 m12 = _mm256_set1_ps(m.r1.m128_f32[2]); - __m256 m13 = _mm256_set1_ps(m.r1.m128_f32[3]); + __m256 m10 = _mm256_set1_ps(m[4]); + __m256 m11 = _mm256_set1_ps(m[5]); + __m256 m12 = _mm256_set1_ps(m[6]); + __m256 m13 = _mm256_set1_ps(m[7]); - __m256 m20 = _mm256_set1_ps(m.r2.m128_f32[0]); - __m256 m21 = _mm256_set1_ps(m.r2.m128_f32[1]); - __m256 m22 = _mm256_set1_ps(m.r2.m128_f32[2]); - __m256 m23 = _mm256_set1_ps(m.r2.m128_f32[3]); + __m256 m20 = _mm256_set1_ps(m[8]); + __m256 m21 = _mm256_set1_ps(m[9]); + __m256 m22 = _mm256_set1_ps(m[10]); + __m256 m23 = _mm256_set1_ps(m[11]); // // row0 dot @@ -570,24 +570,24 @@ namespace } // Batch 4 mul_Vec4. 
- inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v) + inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v) { vec4_batch4 r; - f32x4 m00 = _mm_set1_ps(m.r0.m128_f32[0]); - f32x4 m01 = _mm_set1_ps(m.r0.m128_f32[1]); - f32x4 m02 = _mm_set1_ps(m.r0.m128_f32[2]); - f32x4 m03 = _mm_set1_ps(m.r0.m128_f32[3]); + f32x4 m00 = _mm_set1_ps(m[0]); + f32x4 m01 = _mm_set1_ps(m[1]); + f32x4 m02 = _mm_set1_ps(m[2]); + f32x4 m03 = _mm_set1_ps(m[3]); - f32x4 m10 = _mm_set1_ps(m.r1.m128_f32[0]); - f32x4 m11 = _mm_set1_ps(m.r1.m128_f32[1]); - f32x4 m12 = _mm_set1_ps(m.r1.m128_f32[2]); - f32x4 m13 = _mm_set1_ps(m.r1.m128_f32[3]); + f32x4 m10 = _mm_set1_ps(m[4]); + f32x4 m11 = _mm_set1_ps(m[5]); + f32x4 m12 = _mm_set1_ps(m[6]); + f32x4 m13 = _mm_set1_ps(m[7]); - f32x4 m20 = _mm_set1_ps(m.r2.m128_f32[0]); - f32x4 m21 = _mm_set1_ps(m.r2.m128_f32[1]); - f32x4 m22 = _mm_set1_ps(m.r2.m128_f32[2]); - f32x4 m23 = _mm_set1_ps(m.r2.m128_f32[3]); + f32x4 m20 = _mm_set1_ps(m[8]); + f32x4 m21 = _mm_set1_ps(m[9]); + f32x4 m22 = _mm_set1_ps(m[10]); + f32x4 m23 = _mm_set1_ps(m[11]); // // row0 dot diff --git a/Engine/source/math/isa/neon/neon_intrinsics.h b/Engine/source/math/isa/neon/neon_intrinsics.h index fc93c9c97..e118b1600 100644 --- a/Engine/source/math/isa/neon/neon_intrinsics.h +++ b/Engine/source/math/isa/neon/neon_intrinsics.h @@ -452,24 +452,24 @@ namespace } } - inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v) + inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v) { vec4_batch4 r; - float32x4_t m00 = vdupq_n_f32(m.r0[0]); - float32x4_t m01 = vdupq_n_f32(m.r0[1]); - float32x4_t m02 = vdupq_n_f32(m.r0[2]); - float32x4_t m03 = vdupq_n_f32(m.r0[3]); + float32x4_t m00 = vdupq_n_f32(m[0]); + float32x4_t m01 = vdupq_n_f32(m[1]); + float32x4_t m02 = vdupq_n_f32(m[2]); + float32x4_t m03 = vdupq_n_f32(m[3]); - float32x4_t m10 = vdupq_n_f32(m.r1[0]); - float32x4_t m11 = vdupq_n_f32(m.r1[1]); - 
float32x4_t m12 = vdupq_n_f32(m.r1[2]); - float32x4_t m13 = vdupq_n_f32(m.r1[3]); + float32x4_t m10 = vdupq_n_f32(m[4]); + float32x4_t m11 = vdupq_n_f32(m[5]); + float32x4_t m12 = vdupq_n_f32(m[6]); + float32x4_t m13 = vdupq_n_f32(m[7]); - float32x4_t m20 = vdupq_n_f32(m.r2[0]); - float32x4_t m21 = vdupq_n_f32(m.r2[1]); - float32x4_t m22 = vdupq_n_f32(m.r2[2]); - float32x4_t m23 = vdupq_n_f32(m.r2[3]); + float32x4_t m20 = vdupq_n_f32(m[8]); + float32x4_t m21 = vdupq_n_f32(m[9]); + float32x4_t m22 = vdupq_n_f32(m[10]); + float32x4_t m23 = vdupq_n_f32(m[11]); // row0 dot r.x = vaddq_f32( diff --git a/Engine/source/math/isa/sse2/sse2_intrinsics.h b/Engine/source/math/isa/sse2/sse2_intrinsics.h index df3852340..a582d3793 100644 --- a/Engine/source/math/isa/sse2/sse2_intrinsics.h +++ b/Engine/source/math/isa/sse2/sse2_intrinsics.h @@ -601,24 +601,24 @@ namespace } // Batch 4 mul_Vec4. - inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v) + inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v) { vec4_batch4 r; - f32x4 m00 = _mm_set1_ps(m.r0.m128_f32[0]); - f32x4 m01 = _mm_set1_ps(m.r0.m128_f32[1]); - f32x4 m02 = _mm_set1_ps(m.r0.m128_f32[2]); - f32x4 m03 = _mm_set1_ps(m.r0.m128_f32[3]); + f32x4 m00 = _mm_set1_ps(m[0]); + f32x4 m01 = _mm_set1_ps(m[1]); + f32x4 m02 = _mm_set1_ps(m[2]); + f32x4 m03 = _mm_set1_ps(m[3]); - f32x4 m10 = _mm_set1_ps(m.r1.m128_f32[0]); - f32x4 m11 = _mm_set1_ps(m.r1.m128_f32[1]); - f32x4 m12 = _mm_set1_ps(m.r1.m128_f32[2]); - f32x4 m13 = _mm_set1_ps(m.r1.m128_f32[3]); + f32x4 m10 = _mm_set1_ps(m[4]); + f32x4 m11 = _mm_set1_ps(m[5]); + f32x4 m12 = _mm_set1_ps(m[6]); + f32x4 m13 = _mm_set1_ps(m[7]); - f32x4 m20 = _mm_set1_ps(m.r2.m128_f32[0]); - f32x4 m21 = _mm_set1_ps(m.r2.m128_f32[1]); - f32x4 m22 = _mm_set1_ps(m.r2.m128_f32[2]); - f32x4 m23 = _mm_set1_ps(m.r2.m128_f32[3]); + f32x4 m20 = _mm_set1_ps(m[8]); + f32x4 m21 = _mm_set1_ps(m[9]); + f32x4 m22 = _mm_set1_ps(m[10]); + f32x4 m23 = 
_mm_set1_ps(m[11]); // // row0 dot diff --git a/Engine/source/math/isa/sse41/sse41_intrinsics.h b/Engine/source/math/isa/sse41/sse41_intrinsics.h index 45c056dcf..80dfdbc71 100644 --- a/Engine/source/math/isa/sse41/sse41_intrinsics.h +++ b/Engine/source/math/isa/sse41/sse41_intrinsics.h @@ -571,24 +571,24 @@ namespace } // Batch 4 mul_Vec4. - inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v) + inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v) { vec4_batch4 r; - f32x4 m00 = _mm_set1_ps(m.r0.m128_f32[0]); - f32x4 m01 = _mm_set1_ps(m.r0.m128_f32[1]); - f32x4 m02 = _mm_set1_ps(m.r0.m128_f32[2]); - f32x4 m03 = _mm_set1_ps(m.r0.m128_f32[3]); + f32x4 m00 = _mm_set1_ps(m[0]); + f32x4 m01 = _mm_set1_ps(m[1]); + f32x4 m02 = _mm_set1_ps(m[2]); + f32x4 m03 = _mm_set1_ps(m[3]); - f32x4 m10 = _mm_set1_ps(m.r1.m128_f32[0]); - f32x4 m11 = _mm_set1_ps(m.r1.m128_f32[1]); - f32x4 m12 = _mm_set1_ps(m.r1.m128_f32[2]); - f32x4 m13 = _mm_set1_ps(m.r1.m128_f32[3]); + f32x4 m10 = _mm_set1_ps(m[4]); + f32x4 m11 = _mm_set1_ps(m[5]); + f32x4 m12 = _mm_set1_ps(m[6]); + f32x4 m13 = _mm_set1_ps(m[7]); - f32x4 m20 = _mm_set1_ps(m.r2.m128_f32[0]); - f32x4 m21 = _mm_set1_ps(m.r2.m128_f32[1]); - f32x4 m22 = _mm_set1_ps(m.r2.m128_f32[2]); - f32x4 m23 = _mm_set1_ps(m.r2.m128_f32[3]); + f32x4 m20 = _mm_set1_ps(m[8]); + f32x4 m21 = _mm_set1_ps(m[9]); + f32x4 m22 = _mm_set1_ps(m[10]); + f32x4 m23 = _mm_set1_ps(m[11]); // // row0 dot diff --git a/Engine/source/math/mMatrix.h b/Engine/source/math/mMatrix.h index 873ff967a..3b160c94c 100644 --- a/Engine/source/math/mMatrix.h +++ b/Engine/source/math/mMatrix.h @@ -231,8 +231,8 @@ public: void mul( Point4F& p ) const; ///< M * p -> p (full [4x4] * [1x4]) void mulP( Point3F& p ) const; ///< M * p -> p (assume w = 1.0f) void mulP( const Point3F &p, Point3F *d) const; ///< M * p -> d (assume w = 1.0f) - void batch_mulP(Point3F* points, size_t count) const; - void batch_mulP(const Point3F* points, size_t count, 
Point3F* out) const; + void batch_mulP(Point3F* points, U32 count) const; + void batch_mulP(const Point3F* points, U32 count, Point3F* out) const; void mulV( VectorF& p ) const; ///< M * v -> v (assume w = 0.0f) void mulV( const VectorF &p, Point3F *d) const; ///< M * v -> d (assume w = 0.0f) @@ -479,7 +479,7 @@ inline void MatrixF::mulP( const Point3F &p, Point3F *d) const GetMat44().mul_pos3(*this, p, *d); } -inline void MatrixF::batch_mulP(Point3F* points, size_t count) const +inline void MatrixF::batch_mulP(Point3F* points, U32 count) const { // Allocate temporary buffer Point3F* temp = new Point3F[count]; @@ -487,7 +487,7 @@ inline void MatrixF::batch_mulP(Point3F* points, size_t count) const GetMat44().batch_mul_pos3(m, reinterpret_cast<const F32*>(points), count, reinterpret_cast<F32*>(temp)); // Copy the results back to out - for (size_t i = 0; i < count; ++i) + for (U32 i = 0; i < count; ++i) { points[i] = temp[i]; } @@ -496,7 +496,7 @@ inline void MatrixF::batch_mulP(Point3F* points, size_t count) const delete[] temp; } -inline void MatrixF::batch_mulP(const Point3F* points, size_t count, Point3F* out) const +inline void MatrixF::batch_mulP(const Point3F* points, U32 count, Point3F* out) const { // Allocate temporary buffer Point3F* temp = new Point3F[count]; @@ -504,7 +504,7 @@ inline void MatrixF::batch_mulP(const Point3F* points, size_t count, Point3F* ou GetMat44().batch_mul_pos3(m, reinterpret_cast<const F32*>(points), count, reinterpret_cast<F32*>(temp)); // Copy the results back to out - for (size_t i = 0; i < count; ++i) + for (U32 i = 0; i < count; ++i) { out[i] = temp[i]; } From 86b0baf432ee079af7c0f390be631d76c2789dec Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Sat, 7 Mar 2026 16:14:31 +0000 Subject: [PATCH 22/23] Update torque_macros.cmake --- Tools/CMake/torque_macros.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tools/CMake/torque_macros.cmake b/Tools/CMake/torque_macros.cmake index 9b98a5b99..fe0cec4c6 100644 --- 
a/Tools/CMake/torque_macros.cmake +++ b/Tools/CMake/torque_macros.cmake @@ -176,7 +176,7 @@ function(add_math_backend name compile_defs) elseif(name STREQUAL "avx2") target_compile_options(math_${name} PRIVATE -mavx2 -mfma) elseif(name STREQUAL "neon") - target_compile_options(math_${name} PRIVATE -march=armv8-a+simd) + target_compile_options(math_${name} PRIVATE -march=armv8-a) endif() endif() From 090dd4993a86a39074f18d304554834459701b84 Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Sat, 7 Mar 2026 16:41:00 +0000 Subject: [PATCH 23/23] Update CMakeLists.txt --- Engine/source/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Engine/source/CMakeLists.txt b/Engine/source/CMakeLists.txt index d5f7aeaad..d439a06e2 100644 --- a/Engine/source/CMakeLists.txt +++ b/Engine/source/CMakeLists.txt @@ -505,7 +505,7 @@ if(ARCH MATCHES "x86_64|amd64|i[3-6]86") set(IS_X86 TRUE) endif() -if(ARCH MATCHES "arm64|aarch64|arm") +if(ARCH MATCHES "arm64|aarch64") set(IS_ARM TRUE) endif()