From d8b511bbf91eaca71e5a92a3e17903b4f38bcaf2 Mon Sep 17 00:00:00 2001
From: marauder2k7
Date: Fri, 27 Feb 2026 14:07:26 +0000
Subject: [PATCH] mac isa neon update

added float3
restructured the classes to look more like the final version of the x86
classes
---
 Engine/source/math/isa/neon/float3.cpp     | 98 ++++++++++++++++++++++
 Engine/source/math/isa/neon/float4.cpp     | 98 +++++++++++++++-------
 Engine/source/math/public/math_backend.cpp |  1 +
 Engine/source/platformMac/macMath.mm       |  5 +-
 4 files changed, 169 insertions(+), 33 deletions(-)
 create mode 100644 Engine/source/math/isa/neon/float3.cpp

diff --git a/Engine/source/math/isa/neon/float3.cpp b/Engine/source/math/isa/neon/float3.cpp
new file mode 100644
index 000000000..f0cbdb113
--- /dev/null
+++ b/Engine/source/math/isa/neon/float3.cpp
@@ -0,0 +1,98 @@
+#include "float3_dispatch.h"
+#include <arm_neon.h> // NEON intrinsics
+
+namespace
+{
+   typedef float32x4_t f32x4;
+
+   // Load 3 floats into 4-wide SIMD, zero the 4th lane
+   inline f32x4 v_load3(const float* p)
+   {
+      // Load first 3 floats
+      float32x2_t low = vld1_f32(p); // load p[0], p[1]
+      float32x2_t high = vld1_dup_f32(p + 2); // load p[2], duplicate to second lane
+      return vcombine_f32(low, high); // combine into 128-bit vector
+   }
+
+   // Store 3 floats from SIMD register back to memory
+   inline void v_store3(float* dst, f32x4 v)
+   {
+      vst1_f32(dst, vget_low_f32(v)); // store first 2 floats
+      dst[2] = vgetq_lane_f32(v, 2); // store 3rd element
+   }
+
+   // extract just the first lane.
+   inline float v_extract0(f32x4 v) { return vgetq_lane_f32(v, 0); }
+
+   // Broadcast a single float across all 4 lanes
+   inline f32x4 v_set1(float s) { return vdupq_n_f32(s); }
+
+   // Element-wise multiply
+   inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); }
+
+   // Element-wise divide (fast approximate)
+   inline f32x4 v_div_fast(f32x4 a, f32x4 b)
+   {
+      float32x4_t rcp = vrecpeq_f32(b);
+      // Optional refinement for better precision
+      rcp = vmulq_f32(vrecpsq_f32(b, rcp), rcp);
+      return vmulq_f32(a, rcp);
+   }
+
+   inline f32x4 v_div(f32x4 a, f32x4 b) { return v_div_fast(a, b); }
+
+   // Element-wise add
+   inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); }
+
+   // Element-wise subtract
+   inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); }
+
+   // Horizontal sum of all elements (for dot product, length, etc.)
+   inline f32x4 v_hadd3(f32x4 a)
+   {
+      float32x2_t sum_pair = vadd_f32(vget_low_f32(a), vget_high_f32(a)); // sum pairs
+      float32x2_t sum = vpadd_f32(sum_pair, sum_pair); // horizontal add
+      return vsetq_lane_f32(vget_lane_f32(sum, 0), a, 0); // total sum in lane 0
+   }
+
+   // Cross product
+   inline f32x4 v_cross(f32x4 a, f32x4 b)
+   {
+      // Extract xyz as separate registers
+      float32x4_t a_yzx = vextq_f32(a, a, 1); // rotate left: y,z,x,w
+      float32x4_t b_yzx = vextq_f32(b, b, 1);
+
+      float32x4_t mul1 = vmulq_f32(a, b_yzx);
+      float32x4_t mul2 = vmulq_f32(a_yzx, b);
+
+      float32x4_t c = vsubq_f32(mul1, mul2);
+
+      // Rotate back to x,y,z and keep w from original 'a'
+      float32x4_t xyz = vextq_f32(c, c, 3); // x,y,z in lanes 0..2
+      float32x4_t result = vsetq_lane_f32(vgetq_lane_f32(a, 3), xyz, 3); // preserve w
+      return result;
+   }
+}
+
+#include "float3_impl.inl"
+
+namespace math_backend::float3::dispatch
+{
+   // Install NEON backend
+   void install_neon()
+   {
+      gFloat3.add           = float3_add_impl;
+      gFloat3.sub           = float3_sub_impl;
+      gFloat3.mul           = float3_mul_impl;
+      gFloat3.mul_scalar    = float3_mul_scalar_impl;
+      gFloat3.div           = float3_div_impl;
+      gFloat3.div_scalar    = float3_div_scalar_impl;
+      gFloat3.dot           = float3_dot_impl;
+      gFloat3.length        = float3_length_impl;
+      gFloat3.lengthSquared = float3_length_squared_impl;
+      gFloat3.normalize     = float3_normalize_impl;
+      gFloat3.normalize_mag = float3_normalize_mag_impl;
+      gFloat3.lerp          = float3_lerp_impl;
+      gFloat3.cross         = float3_cross_impl;
+   }
+}
diff --git a/Engine/source/math/isa/neon/float4.cpp b/Engine/source/math/isa/neon/float4.cpp
index 6258db743..59092ae28 100644
--- a/Engine/source/math/isa/neon/float4.cpp
+++ b/Engine/source/math/isa/neon/float4.cpp
@@ -1,50 +1,84 @@
 #include "float4_dispatch.h"
-#include <arm_neon.h>
+#include <arm_neon.h> // NEON intrinsics
 
 namespace
 {
-   typedef float32x4_t f32x4;
+   typedef float32x4_t f32x4;
 
-   inline f32x4 v_load(const float* p) { return vld1q_f32(p); }
-   inline void v_store(float* dst, f32x4 v) { vst1q_f32(dst, v); }
-   inline f32x4 v_set1(float s) { return vdupq_n_f32(s); }
+   // Load 4 floats from memory into a SIMD register
+   inline f32x4 v_load(const float* p) { return vld1q_f32(p); }
 
-   inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); }
-   inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); }
-   inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); }
+   // Store 4 floats from SIMD register back to memory
+   inline void v_store(float* dst, f32x4 v) { vst1q_f32(dst, v); }
 
-   // AArch64 native divide
-   inline f32x4 v_div(f32x4 a, f32x4 b)
-   {
-      return vdivq_f32(a, b);
-   }
+   // Broadcast a single float across all 4 lanes
+   inline f32x4 v_set1(float s) { return vdupq_n_f32(s); }
+
+   // Element-wise multiply
+   inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); }
+
+   // Element-wise divide (approximate fast reciprocal)
+   inline f32x4 v_div(f32x4 a, f32x4 b)
+   {
+      float32x4_t rcp = vrecpeq_f32(b);
+      // Refine reciprocal for better precision
+      rcp = vmulq_f32(vrecpsq_f32(b, rcp), rcp);
+      return vmulq_f32(a, rcp);
+   }
+
+   // Element-wise add
+   inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); }
+
+   // Element-wise subtract
+   inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); }
+
+   // Horizontal sum of all 4 elements (for dot product, length, etc.)
+   inline float v_hadd4(f32x4 a)
+   {
+      float32x2_t sum_pair = vadd_f32(vget_low_f32(a), vget_high_f32(a)); // add pairs [a0+a2, a1+a3]
+      float32x2_t sum = vpadd_f32(sum_pair, sum_pair); // horizontal add: total sum
+      return vget_lane_f32(sum, 0);
+   }
+
+   // Optimized cross product for float4 (w component preserved)
+   inline f32x4 v_cross(f32x4 a, f32x4 b)
+   {
+      // Extract xyz as separate registers
+      float32x4_t a_yzx = vextq_f32(a, a, 1); // rotate left: y,z,x,w
+      float32x4_t b_yzx = vextq_f32(b, b, 1);
+
+      float32x4_t mul1 = vmulq_f32(a, b_yzx);
+      float32x4_t mul2 = vmulq_f32(a_yzx, b);
+
+      float32x4_t c = vsubq_f32(mul1, mul2);
+
+      // Rotate back to x,y,z and keep w from original 'a'
+      float32x4_t xyz = vextq_f32(c, c, 3); // x,y,z in lanes 0..2
+      float32x4_t result = vsetq_lane_f32(vgetq_lane_f32(a, 3), xyz, 3); // preserve w
+      return result;
+   }
 
-   inline float v_hadd4(f32x4 a)
-   {
-      float32x2_t low = vget_low_f32(a);
-      float32x2_t high = vget_high_f32(a);
-      float32x2_t sum = vadd_f32(low, high);
-      sum = vpadd_f32(sum, sum);
-      return vget_lane_f32(sum, 0);
-   }
 }
 
-#include "../../impl/float4_impl.inl"
+#include "float4_impl.inl"
 
 namespace math_backend::float4::dispatch
 {
+   // Install NEON64 backend
    void install_neon()
    {
-      gFloat4.add = float4_add_impl;
-      gFloat4.sub = float4_sub_impl;
-      gFloat4.mul = float4_mul_impl;
-      gFloat4.mul_scalar = float4_mul_scalar_impl;
-      gFloat4.div = float4_div_impl;
-      gFloat4.div_scalar = float4_div_scalar_impl;
-      gFloat4.dot = float4_dot_impl;
-      gFloat4.length = float4_length_impl;
+      gFloat4.add           = float4_add_impl;
+      gFloat4.sub           = float4_sub_impl;
+      gFloat4.mul           = float4_mul_impl;
+      gFloat4.mul_scalar    = float4_mul_scalar_impl;
+      gFloat4.div           = float4_div_impl;
+      gFloat4.div_scalar    = float4_div_scalar_impl;
+      gFloat4.dot           = float4_dot_impl;
+      gFloat4.length        = float4_length_impl;
       gFloat4.lengthSquared = float4_length_squared_impl;
-      gFloat4.normalize = float4_normalize_impl;
-      gFloat4.lerp = float4_lerp_impl;
+      gFloat4.normalize     = float4_normalize_impl;
+      gFloat4.normalize_mag = float4_normalize_mag_impl;
+      gFloat4.lerp          = float4_lerp_impl;
+      gFloat4.cross         = float4_cross_impl;
    }
 }
diff --git a/Engine/source/math/public/math_backend.cpp b/Engine/source/math/public/math_backend.cpp
index 7683758e2..0e3d6f592 100644
--- a/Engine/source/math/public/math_backend.cpp
+++ b/Engine/source/math/public/math_backend.cpp
@@ -48,6 +48,7 @@ void math_backend::install_from_cpu_flags(uint32_t cpu_flags)
 #elif defined(__aarch64__) || defined(__ARM_NEON)
       case backend::neon:
          float4::dispatch::install_neon();
+         float3::dispatch::install_neon();
          break;
 #endif
       default:
diff --git a/Engine/source/platformMac/macMath.mm b/Engine/source/platformMac/macMath.mm
index 4feefb277..c275bdd17 100644
--- a/Engine/source/platformMac/macMath.mm
+++ b/Engine/source/platformMac/macMath.mm
@@ -25,6 +25,7 @@
 #import "math/mMath.h"
 #import "core/strings/stringFunctions.h"
 #include "console/engineAPI.h"
+#include "math/public/math_backend.h"
 
 extern void mInstallLibrary_C();
 
@@ -107,7 +108,9 @@ void Math::init(U32 properties)
 
    Con::printf("Math Init:");
    Con::printf("   Installing Standard C extensions");
-   mInstallLibrary_C();
+   mInstallLibrary_C();
+
+   math_backend::install_from_cpu_flags(properties);
 
 #ifdef TORQUE_CPU_X86
    if( properties & CPU_PROP_SSE )