further sse simd additions

avx2 float3 added added normalize_magnitude added divide fast to float3 may copy to float4 move static spheremesh to drawSphere (initialize on first use) so platform has a chance to load the math backend
2026-03-20 04:40:54 +00:00 · 2026-02-26 21:11:31 +00:00 · 2026-02-26 21:11:31 +00:00 · 18d0aa0418
commit 18d0aa0418
parent 2559145c06
16 changed files with 337 additions and 77 deletions
--- a/Engine/source/gfx/gfxDrawUtil.cpp
+++ b/Engine/source/gfx/gfxDrawUtil.cpp
@ -853,7 +853,11 @@ void GFXDrawUtil::drawThickLine(F32 x1, F32 y1, F32 z1, F32 x2, F32 y2, F32 z2,
 // 3D World Draw Misc
 //-----------------------------------------------------------------------------

-static SphereMesh gSphere;
+SphereMesh& getSphere()
+{
+   static SphereMesh instance;
+   return instance;
+}

 void GFXDrawUtil::drawSphere( const GFXStateBlockDesc &desc, F32 radius, const Point3F &pos, const ColorI &color, bool drawTop, bool drawBottom, const MatrixF *xfm )
 {
@ -868,7 +872,7 @@ void GFXDrawUtil::drawSphere( const GFXStateBlockDesc &desc, F32 radius, const P
   GFX->pushWorldMatrix();
   GFX->multWorld(mat);

-   const SphereMesh::TriangleMesh * sphereMesh = gSphere.getMesh(2);
+   const SphereMesh::TriangleMesh * sphereMesh = getSphere().getMesh(2);
   S32 numPoly = sphereMesh->numPoly;
   S32 totalPoly = 0;
   GFXVertexBufferHandle<GFXVertexPCT> verts(mDevice, numPoly*3, GFXBufferTypeVolatile);
--- a/Engine/source/math/impl/float3_impl.inl
+++ b/Engine/source/math/impl/float3_impl.inl
@ -0,0 +1,115 @@
+#pragma once
+#include <cmath>   // for sqrtf, etc.
+#include "../mConstants.h"
+
+// Safely loads a float3 -> simd 4 lane backend
+namespace math_backend::float3
+{
+   //----------------------------------------------------------
+   // Add two float4 vectors: r = a + b
+   inline void float3_add_impl(const float* a, const float* b, float* r)
+   {
+      f32x4 va = v_load3(a);
+      f32x4 vb = v_load3(b);
+      f32x4 vr = v_add(va, vb);
+      v_store3(r, vr);
+   }
+
+   // Subtract: r = a - b
+   inline void float3_sub_impl(const float* a, const float* b, float* r)
+   {
+      f32x4 va = v_load3(a);
+      f32x4 vb = v_load3(b);
+      f32x4 vr = v_sub(va, vb);
+      v_store3(r, vr);
+   }
+
+   // Multiply element-wise: r = a * b
+   inline void float3_mul_impl(const float* a, const float* b, float* r)
+   {
+      f32x4 va = v_load3(a);
+      f32x4 vb = v_load3(b);
+      f32x4 vr = v_mul(va, vb);
+      v_store3(r, vr);
+   }
+
+   // Multiply by scalar: r = a * s
+   inline void float3_mul_scalar_impl(const float* a, float s, float* r)
+   {
+      f32x4 va = v_load3(a);
+      f32x4 vs = v_set1(s);
+      f32x4 vr = v_mul(va, vs);
+      v_store3(r, vr);
+   }
+
+   // Divide element-wise: r = a / b
+   inline void float3_div_impl(const float* a, const float* b, float* r)
+   {
+      f32x4 va = v_load3(a);
+      f32x4 vb = v_load3(b);
+      f32x4 vr = v_div_fast(va, vb);
+      v_store3(r, vr);
+   }
+
+   // Divide by scalar: r = a / s
+   inline void float3_div_scalar_impl(const float* a, float s, float* r)
+   {
+      f32x4 va = v_load3(a);
+      f32x4 vs = v_set1(s);
+      f32x4 vr = v_div_fast(va, vs);
+      v_store3(r, vr);
+   }
+
+   // Dot product: returns scalar
+   inline float float3_dot_impl(const float* a, const float* b)
+   {
+      f32x4 va = v_load3(a);
+      f32x4 vb = v_load3(b);
+      f32x4 vmul = v_mul(va, vb);
+      f32x4 vsum = v_hadd3(vmul);
+      return v_extract0(vsum); // first lane is the sum of 3 elements
+   }
+
+   // Length squared
+   inline float float3_length_squared_impl(const float* a)
+   {
+      return float3_dot_impl(a, a);
+   }
+
+   // Length
+   inline float float3_length_impl(const float* a)
+   {
+      return std::sqrt(float3_length_squared_impl(a));
+   }
+
+   // Normalize in-place
+   inline void float3_normalize_impl(float* a)
+   {
+      float len = float3_length_impl(a);
+      if (len > POINT_EPSILON) // safe threshold
+      {
+         float3_mul_scalar_impl(a, 1.0f / len, a);
+      }
+   }
+
+   // Normalize with magnitude: r = normalize(a) * r
+   inline void float3_normalize_mag_impl(float* a, float r)
+   {
+      float len = float3_length_impl(a);
+      if (len > POINT_EPSILON)
+      {
+         float3_mul_scalar_impl(a, r / len, a);
+      }
+   }
+
+   // Linear interpolation: r = from + (to - from) * f
+   inline void float3_lerp_impl(const float* from, const float* to, float f, float* r)
+   {
+      f32x4 vfrom = v_load3(from);
+      f32x4 vto = v_load3(to);
+      f32x4 vf = v_set1(f);
+      f32x4 vr = v_add(vfrom, v_mul(vf, v_sub(vto, vfrom)));
+      v_store3(r, vr);
+   }
+
+}
--- a/Engine/source/math/impl/float4_c.cpp
+++ b/Engine/source/math/impl/float4_c.cpp
@ -1,6 +1,6 @@
 #include "math/public/float4_dispatch.h"
 #include "math/mConstants.h"
-#include <math.h>
+#include <cmath>   // for sqrtf, etc.

 namespace math_backend::float4::dispatch
 {
@ -27,7 +27,8 @@ namespace math_backend::float4::dispatch
      };

      gFloat4.div_scalar = [](const float* a, float s, float* r) {
-         for (int i = 0; i < 4; i++) r[i] = a[i] / s;
+         float denom = 1.0f / s;
+         for (int i = 0; i < 4; i++) r[i] = a[i] * denom;
      };

      gFloat4.dot = [](const float* a, const float* b) {
@ -39,7 +40,7 @@ namespace math_backend::float4::dispatch
      gFloat4.length = [](const float* a) {
         float sum = 0.f;
         for (int i = 0; i < 4; i++) sum += a[i] * a[i];
-         return sqrtf(sum);
+         return std::sqrt(sum);
      };

      gFloat4.lengthSquared = [](const float* a) {
--- a/Engine/source/math/isa/avx/float4.cpp
+++ b/Engine/source/math/isa/avx/float4.cpp
@ -62,6 +62,7 @@ namespace math_backend::float4::dispatch
      gFloat4.length = float4_length_impl;
      gFloat4.lengthSquared = float4_length_squared_impl;
      gFloat4.normalize = float4_normalize_impl;
+      gFloat4.normalize_mag = float4_normalize_mag_impl;
      gFloat4.lerp = float4_lerp_impl;
   }
 }
--- a/Engine/source/math/isa/avx2/float3.cpp
+++ b/Engine/source/math/isa/avx2/float3.cpp
@ -0,0 +1,84 @@
+#include "float3_dispatch.h"
+#include <immintrin.h> // AVX/AVX2 intrinsics
+
+namespace
+{
+   typedef __m128 f32x4;
+
+   // Load 3 floats into 4-wide SIMD, zero the 4th lane
+   inline f32x4 v_load3(const float* p) { return _mm_set_ps(0.0f, p[2], p[1], p[0]); }
+
+   // Store 3 floats from SIMD register back to memory
+   inline void v_store3(float* dst, f32x4 v)
+   {
+      alignas(16) float tmp[4];   // temp storage
+      _mm_store_ps(tmp, v);        // store all 4 lanes
+      dst[0] = tmp[0];
+      dst[1] = tmp[1];
+      dst[2] = tmp[2];
+   }
+
+   // extract just the first lane.
+   inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }
+
+   // Broadcast a single float across all 4 lanes
+   inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
+
+   // Element-wise multiply
+   inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
+
+   // Element-wise divide precise
+   inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
+
+   // Element-wise divide fast (1/b)
+   inline f32x4 v_div_fast(f32x4 a, f32x4 b)
+   {
+      f32x4 rcp = _mm_rcp_ps(b);
+      // Optional refinement here
+      return _mm_mul_ps(a, rcp);
+   }
+
+   // Element-wise add
+   inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
+
+   // Element-wise subtract
+   inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
+
+   // Horizontal sum of all elements (for dot product, length, etc.)
+   inline f32x4 v_hadd3(f32x4 a)
+   {
+      __m128 t1 = _mm_hadd_ps(a, a);  // sums pairs: [a0+a1, a2+a3, ...]
+      __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3
+      return t2;
+   }
+
+   float float3_dot_avx(const float* a, const float* b)
+   {
+      f32x4 va = _mm_loadu_ps(a);
+      f32x4 vb = _mm_loadu_ps(b);
+      __m128 dp = _mm_dp_ps(va, vb, 0x71); // multiply 3 (0x71), sum all 4, lowest lane
+      return _mm_cvtss_f32(dp);
+   }
+}
+
+#include "float3_impl.inl"
+
+namespace math_backend::float3::dispatch
+{
+   // Install AVX2 backend
+   void install_avx2()
+   {
+      gFloat3.add             = float3_add_impl;
+      gFloat3.sub             = float3_sub_impl;
+      gFloat3.mul             = float3_mul_impl;
+      gFloat3.mul_scalar      = float3_mul_scalar_impl;
+      gFloat3.div             = float3_div_impl;
+      gFloat3.div_scalar      = float3_div_scalar_impl;
+      gFloat3.dot             = float3_dot_avx;
+      gFloat3.length          = float3_length_impl;
+      gFloat3.lengthSquared   = float3_length_squared_impl;
+      gFloat3.normalize       = float3_normalize_impl;
+      gFloat3.normalize_mag   = float3_normalize_mag_impl;
+      gFloat3.lerp            = float3_lerp_impl;
+   }
+}
--- a/Engine/source/math/isa/avx2/float4.cpp
+++ b/Engine/source/math/isa/avx2/float4.cpp
@ -62,6 +62,7 @@ namespace math_backend::float4::dispatch
      gFloat4.length       = float4_length_impl;
      gFloat4.lengthSquared = float4_length_squared_impl;
      gFloat4.normalize    = float4_normalize_impl;
+      gFloat4.normalize_mag = float4_normalize_mag_impl;
      gFloat4.lerp         = float4_lerp_impl;
   }
 }
--- a/Engine/source/math/isa/sse2/float4.cpp
+++ b/Engine/source/math/isa/sse2/float4.cpp
@ -53,6 +53,7 @@ namespace math_backend::float4::dispatch
      gFloat4.length       = float4_length_impl;
      gFloat4.lengthSquared = float4_length_squared_impl;
      gFloat4.normalize    = float4_normalize_impl;
+      gFloat4.normalize_mag = float4_normalize_mag_impl;
      gFloat4.lerp         = float4_lerp_impl;
   }
 }
--- a/Engine/source/math/isa/sse41/float4.cpp
+++ b/Engine/source/math/isa/sse41/float4.cpp
@ -62,6 +62,7 @@ namespace math_backend::float4::dispatch
      gFloat4.length       = float4_length_impl;
      gFloat4.lengthSquared = float4_length_squared_impl;
      gFloat4.normalize    = float4_normalize_impl;
+      gFloat4.normalize_mag = float4_normalize_mag_impl;
      gFloat4.lerp         = float4_lerp_impl;
   }
 }
--- a/Engine/source/math/mPoint3.h
+++ b/Engine/source/math/mPoint3.h
@ -29,6 +29,11 @@
 #ifndef _MPOINT2_H_
 #include "math/mPoint2.h"
 #endif
+#ifndef _MATH_BACKEND_H_
+#include "math/public/math_backend.h"
+#endif
+
+#include <array>

 //------------------------------------------------------------------------------
 /// 3D integer point
@ -97,6 +102,7 @@ public:
 class Point3D;

 //------------------------------------------------------------------------------
+using math_backend::float3::dispatch::gFloat3;
 class Point3F
 {
   //-------------------------------------- Public data
@ -497,7 +503,8 @@ inline void Point3F::setMax(const Point3F& _test)
 inline void Point3F::interpolate(const Point3F& _from, const Point3F& _to, F32 _factor)
 {
   AssertFatal(_factor >= 0.0f && _factor <= 1.0f, "Out of bound interpolation factor");
-   m_point3F_interpolate( _from, _to, _factor, *this);
+
+   gFloat3.lerp(_from, _to, _factor, *this);
 }

 inline void Point3F::zero()
@ -599,17 +606,17 @@ inline void Point3F::convolveInverse(const Point3F& c)

 inline F32 Point3F::lenSquared() const
 {
-   return (x * x) + (y * y) + (z * z);
+   return gFloat3.lengthSquared(*this);
 }

 inline F32 Point3F::len() const
 {
-   return mSqrt(x*x + y*y + z*z);
+   return gFloat3.length(*this);
 }

 inline void Point3F::normalize()
 {
-   m_point3F_normalize(*this);
+   gFloat3.normalize(*this);
 }

 inline F32 Point3F::magnitudeSafe() const
@ -626,18 +633,13 @@ inline F32 Point3F::magnitudeSafe() const

 inline void Point3F::normalizeSafe()
 {
-   F32 vmag = magnitudeSafe();
-
-   if( vmag > POINT_EPSILON )
-   {
-      *this *= F32(1.0 / vmag);
-   }
+   gFloat3.normalize(*this);
 }


 inline void Point3F::normalize(F32 val)
 {
-   m_point3F_normalize_f(*this, val);
+   gFloat3.normalize_mag(*this, val);
 }

 inline bool Point3F::operator==(const Point3F& _test) const
@ -652,52 +654,49 @@ inline bool Point3F::operator!=(const Point3F& _test) const

 inline Point3F Point3F::operator+(const Point3F& _add) const
 {
-   return Point3F(x + _add.x, y + _add.y,  z + _add.z);
+   Point3F temp;
+   gFloat3.add(*this, _add, temp);
+   return temp;
 }

 inline Point3F Point3F::operator-(const Point3F& _rSub) const
 {
-   return Point3F(x - _rSub.x, y - _rSub.y, z - _rSub.z);
+   Point3F temp;
+   gFloat3.sub(*this, _rSub, temp);
+   return temp;
 }

 inline Point3F& Point3F::operator+=(const Point3F& _add)
 {
-   x += _add.x;
-   y += _add.y;
-   z += _add.z;
-
+   gFloat3.add(*this, _add, *this);
   return *this;
 }

 inline Point3F& Point3F::operator-=(const Point3F& _rSub)
 {
-   x -= _rSub.x;
-   y -= _rSub.y;
-   z -= _rSub.z;
-
+   gFloat3.sub(*this, _rSub, *this);
   return *this;
 }

 inline Point3F Point3F::operator*(F32 _mul) const
 {
-   return Point3F(x * _mul, y * _mul, z * _mul);
+   Point3F temp;
+   gFloat3.mul_scalar(*this, _mul, temp);
+   return temp;
 }

 inline Point3F Point3F::operator/(F32 _div) const
 {
   AssertFatal(_div != 0.0f, "Error, div by zero attempted");

-   F32 inv = 1.0f / _div;
-
-   return Point3F(x * inv, y * inv, z * inv);
+   Point3F temp;
+   gFloat3.div_scalar(*this, _div, temp);
+   return temp;
 }

 inline Point3F& Point3F::operator*=(F32 _mul)
 {
-   x *= _mul;
-   y *= _mul;
-   z *= _mul;
-
+   gFloat3.mul_scalar(*this, _mul, *this);
   return *this;
 }

@ -705,39 +704,35 @@ inline Point3F& Point3F::operator/=(F32 _div)
 {
   AssertFatal(_div != 0.0f, "Error, div by zero attempted");

-   F32 inv = 1.0f / _div;
-   x *= inv;
-   y *= inv;
-   z *= inv;
-
+   gFloat3.div_scalar(*this, _div, *this);
   return *this;
 }

 inline Point3F Point3F::operator*(const Point3F &_vec) const
 {
-   return Point3F(x * _vec.x, y * _vec.y, z * _vec.z);
+   Point3F temp;
+   gFloat3.mul(*this, _vec, temp);
+   return temp;
 }

 inline Point3F& Point3F::operator*=(const Point3F &_vec)
 {
-   x *= _vec.x;
-   y *= _vec.y;
-   z *= _vec.z;
+   gFloat3.mul(*this, _vec, *this);
   return *this;
 }

 inline Point3F Point3F::operator/(const Point3F &_vec) const
 {
   AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted");
-   return Point3F(x / _vec.x, y / _vec.y, z / _vec.z);
+   Point3F temp;
+   gFloat3.div(*this, _vec, temp);
+   return temp;
 }

 inline Point3F& Point3F::operator/=(const Point3F &_vec)
 {
   AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted");
-   x /= _vec.x;
-   y /= _vec.y;
-   z /= _vec.z;
+   gFloat3.div(*this, _vec, *this);
   return *this;
 }

@ -989,7 +984,9 @@ inline Point3I operator*(S32 mul, const Point3I& multiplicand)

 inline Point3F operator*(F32 mul, const Point3F& multiplicand)
 {
-   return multiplicand * mul;
+   Point3F temp;
+   gFloat3.mul_scalar(multiplicand, mul, temp);
+   return temp;
 }

 inline Point3D operator*(F64 mul, const Point3D& multiplicand)
@ -999,7 +996,7 @@ inline Point3D operator*(F64 mul, const Point3D& multiplicand)

 inline F32 mDot(const Point3F &p1, const Point3F &p2)
 {
-   return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z);
+   return gFloat3.dot(p1, p2);
 }

 inline F64 mDot(const Point3D &p1, const Point3D &p2)
--- a/Engine/source/math/mPoint4.h
+++ b/Engine/source/math/mPoint4.h
@ -26,10 +26,12 @@
 #ifndef _MMATHFN_H_
 #include "math/mMathFn.h"
 #endif
-
 #ifndef _MPOINT3_H_
 #include "math/mPoint3.h"
 #endif
+#ifndef _MATH_BACKEND_H_
+#include "math/public/math_backend.h"
+#endif


 //------------------------------------------------------------------------------
@ -61,6 +63,8 @@ class Point4I
 /// Uses F32 internally.
 ///
 /// Useful for representing quaternions and other 4d beasties.
+using math_backend::float4::dispatch::gFloat4;
+
 class Point4F
 {
   //-------------------------------------- Public data
@ -152,15 +156,12 @@ inline void Point4F::set(F32 _x, F32 _y, F32 _z, F32 _w)

 inline F32 Point4F::len() const
 {
-   return mSqrt(x*x + y*y + z*z + w*w);
+   return gFloat4.length(*this);
 }

 inline void Point4F::interpolate(const Point4F& _from, const Point4F& _to, F32 _factor)
 {
-   x = (_from.x * (1.0f - _factor)) + (_to.x * _factor);
-   y = (_from.y * (1.0f - _factor)) + (_to.y * _factor);
-   z = (_from.z * (1.0f - _factor)) + (_to.z * _factor);
-   w = (_from.w * (1.0f - _factor)) + (_to.w * _factor);
+   gFloat4.lerp(_from, _to, _factor, *this);
 }

 inline void Point4F::zero()
@ -193,55 +194,55 @@ inline Point4F& Point4F::operator/=(F32 scalar)
   if (mIsZero(scalar))
      return *this;

-   F32 denom = 1 / scalar;
-
-   x *= denom;
-   y *= denom;
-   z *= denom;
-   w *= denom;
+   gFloat4.div_scalar(*this, scalar, *this);

   return *this;
 }

 inline Point4F Point4F::operator+(const Point4F& _add) const
 {
-   return Point4F( x + _add.x, y + _add.y, z + _add.z, w + _add.w );
+   Point4F res;
+   gFloat4.add(*this, _add, res);
+   return res;
 }

 inline Point4F& Point4F::operator+=(const Point4F& _add)
 {
-   x += _add.x;
-   y += _add.y;
-   z += _add.z;
-   w += _add.w;
-
+   gFloat4.add(*this, _add, *this);
   return *this;
 }

 inline Point4F Point4F::operator-(const Point4F& _rSub) const
 {
-   return Point4F( x - _rSub.x, y - _rSub.y, z - _rSub.z, w - _rSub.w );
+   Point4F res;
+   gFloat4.sub(*this, _rSub, res);
+   return res;
 }

 inline Point4F Point4F::operator*(const Point4F &_vec) const
 {
-   return Point4F(x * _vec.x, y * _vec.y, z * _vec.z, w * _vec.w);
+   Point4F res;
+   gFloat4.mul(*this, _vec, res);
+   return res;
 }

 inline Point4F Point4F::operator*(F32 _mul) const
 {
-   return Point4F(x * _mul, y * _mul, z * _mul, w * _mul);
+   Point4F res;
+   gFloat4.mul_scalar(*this, _mul, res);
+   return res;
 }

 inline Point4F Point4F::operator /(F32 t) const
 {
-   F32 f = 1.0f / t;
-   return Point4F( x * f, y * f, z * f, w * f );
+   Point4F res;
+   gFloat4.div_scalar(*this, t, res);
+   return res;
 }

 inline F32 mDot(const Point4F &p1, const Point4F &p2)
 {
-   return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z + p1.w*p2.w);
+   return  gFloat4.dot(p1, p2);
 }

 //------------------------------------------------------------------------------
--- a/Engine/source/math/public/float3_dispatch.h
+++ b/Engine/source/math/public/float3_dispatch.h
@ -0,0 +1,38 @@
+#pragma once
+#ifndef _FLOAT3_DISPATCH_H_
+#define _FLOAT3_DISPATCH_H_
+
+
+#include <cstdint>
+
+namespace math_backend::float3::dispatch
+{
+   struct Float3Funcs
+   {
+      void (*add)(const float*, const float*, float*) = nullptr;
+      void (*sub)(const float*, const float*, float*) = nullptr;
+      void (*mul)(const float*, const float*, float*) = nullptr;
+      void (*mul_scalar)(const float*, float, float*) = nullptr;
+      void (*div)(const float*, const float*, float*) = nullptr;
+      void (*div_scalar)(const float*, float, float*) = nullptr;
+      float (*dot)(const float*, const float*) = nullptr;
+      float (*length)(const float*) = nullptr;
+      float (*lengthSquared)(const float*) = nullptr;
+      void (*normalize)(float*) = nullptr;
+      void (*normalize_mag)(float*, float) = nullptr;
+      void (*lerp)(const float*, const float*, float, float*) = nullptr;
+   };
+
+   // Global dispatch table
+   extern Float3Funcs gFloat3;
+
+   // Backend installers (defined in ISA libraries)
+   void install_scalar();
+   void install_sse2();
+   void install_sse41();
+   void install_avx();
+   void install_avx2();
+   void install_neon();
+}
+
+#endif // !_FLOAT4_DISPATCH_H_
--- a/Engine/source/math/public/float4_dispatch.cpp
+++ b/Engine/source/math/public/float4_dispatch.cpp
@ -1,7 +1,14 @@
 #include "math/public/float4_dispatch.h"
+#include "math/public/float3_dispatch.h"

 namespace math_backend::float4::dispatch
 {
   // Single definition of the global dispatch table
   Float4Funcs gFloat4{};
 }
+
+namespace math_backend::float3::dispatch
+{
+   // Single definition of the global dispatch table
+   Float3Funcs gFloat3{};
+}
--- a/Engine/source/math/public/float4_dispatch.h
+++ b/Engine/source/math/public/float4_dispatch.h
@ -19,6 +19,7 @@ namespace math_backend::float4::dispatch
      float (*length)(const float*) = nullptr;
      float (*lengthSquared)(const float*) = nullptr;
      void (*normalize)(float*) = nullptr;
+      void (*normalize_mag)(float*, float) = nullptr;
      void (*lerp)(const float*, const float*, float, float*) = nullptr;
   };

@ -32,9 +33,6 @@ namespace math_backend::float4::dispatch
   void install_avx();
   void install_avx2();
   void install_neon();
-
-   // Centralized installer (engine calls this once)
-   void install_preferred();
 }

 #endif // !_FLOAT4_DISPATCH_H_
--- a/Engine/source/math/public/math_backend.cpp
+++ b/Engine/source/math/public/math_backend.cpp
@ -28,6 +28,7 @@ void math_backend::install_from_cpu_flags(uint32_t cpu_flags)
 #if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86)
      case backend::avx2:
         float4::dispatch::install_avx2();
+         float3::dispatch::install_avx2();
         break;

      case backend::avx:
--- a/Engine/source/math/public/math_backend.h
+++ b/Engine/source/math/public/math_backend.h
@ -1,4 +1,7 @@
 #pragma once
+#ifndef _MATH_BACKEND_H_
+#define _MATH_BACKEND_H_
+
 #ifndef _MCONSTANTS_H_
 #include "math/mConstants.h"
 #endif
@ -8,6 +11,9 @@
 #ifndef _FLOAT4_DISPATCH_H_
 #include "math/public/float4_dispatch.h"
 #endif
+#ifndef _FLOAT3_DISPATCH_H_
+#include "math/public/float3_dispatch.h"
+#endif

 namespace math_backend
 {
@ -25,3 +31,5 @@ namespace math_backend
   backend choose_backend(U32 cpu_flags);
   void install_from_cpu_flags(uint32_t cpu_flags);
 }
+
+#endif // !_MATH_BACKEND_H_
--- a/Engine/source/platformWin32/winMath.cpp
+++ b/Engine/source/platformWin32/winMath.cpp
@ -25,7 +25,7 @@
 #include "console/engineAPI.h"

 #include "math/mMath.h"
-
+#include "math/public/math_backend.h"

 extern void mInstallLibrary_C();
 extern void mInstallLibrary_ASM();
@ -98,6 +98,8 @@ void Math::init(U32 properties)
   Con::printf("   Installing Standard C extensions");
   mInstallLibrary_C();

+   math_backend::install_from_cpu_flags(properties);
+
   Con::printf("   Installing Assembly extensions");
   mInstallLibrary_ASM();