From d8b511bbf91eaca71e5a92a3e17903b4f38bcaf2 Mon Sep 17 00:00:00 2001
From: marauder2k7
Date: Fri, 27 Feb 2026 14:07:26 +0000
Subject: [PATCH] mac isa neon update

added float3
restructured the classes to look more like the final version of the x86
classes
---
 Engine/source/math/isa/neon/float3.cpp     | 98 ++++++++++++++++++++++
 Engine/source/math/isa/neon/float4.cpp     | 98 +++++++++++++++-------
 Engine/source/math/public/math_backend.cpp |  1 +
 Engine/source/platformMac/macMath.mm       |  5 +-
 4 files changed, 169 insertions(+), 33 deletions(-)
 create mode 100644 Engine/source/math/isa/neon/float3.cpp

diff --git a/Engine/source/math/isa/neon/float3.cpp b/Engine/source/math/isa/neon/float3.cpp
new file mode 100644
index 000000000..f0cbdb113
--- /dev/null
+++ b/Engine/source/math/isa/neon/float3.cpp
@@ -0,0 +1,98 @@
+#include "float3_dispatch.h"
+#include <arm_neon.h> // NEON intrinsics
+
+namespace
+{
+   typedef float32x4_t f32x4;
+
+   // Load 3 floats into 4-wide SIMD, zero the 4th lane
+   inline f32x4 v_load3(const float* p)
+   {
+      // Load first 3 floats
+      float32x2_t low = vld1_f32(p); // load p[0], p[1]
+      float32x2_t high = vld1_dup_f32(p + 2); // load p[2], duplicate to second lane
+      return vcombine_f32(low, high); // combine into 128-bit vector
+   }
+
+   // Store 3 floats from SIMD register back to memory
+   inline void v_store3(float* dst, f32x4 v)
+   {
+      vst1_f32(dst, vget_low_f32(v)); // store first 2 floats
+      dst[2] = vgetq_lane_f32(v, 2); // store 3rd element
+   }
+
+   // extract just the first lane.
+   inline float v_extract0(f32x4 v) { return vgetq_lane_f32(v, 0); }
+
+   // Broadcast a single float across all 4 lanes
+   inline f32x4 v_set1(float s) { return vdupq_n_f32(s); }
+
+   // Element-wise multiply
+   inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); }
+
+   // Element-wise divide (fast approximate)
+   inline f32x4 v_div_fast(f32x4 a, f32x4 b)
+   {
+      float32x4_t rcp = vrecpeq_f32(b);
+      // Optional refinement for better precision
+      rcp = vmulq_f32(vrecpsq_f32(b, rcp), rcp);
+      return vmulq_f32(a, rcp);
+   }
+
+   inline f32x4 v_div(f32x4 a, f32x4 b) { return v_div_fast(a, b); }
+
+   // Element-wise add
+   inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); }
+
+   // Element-wise subtract
+   inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); }
+
+   // Horizontal sum of all elements (for dot product, length, etc.)
+   inline f32x4 v_hadd3(f32x4 a)
+   {
+      float32x2_t sum_pair = vadd_f32(vget_low_f32(a), vget_high_f32(a)); // sum pairs
+      float32x2_t sum = vpadd_f32(sum_pair, sum_pair); // horizontal add
+      return vsetq_lane_f32(vget_lane_f32(sum, 0), a, 0); // total sum in lane 0
+   }
+
+   // Cross product
+   inline f32x4 v_cross(f32x4 a, f32x4 b)
+   {
+      // Extract xyz as separate registers
+      float32x4_t a_yzx = vextq_f32(a, a, 1); // rotate left: y,z,x,w
+      float32x4_t b_yzx = vextq_f32(b, b, 1);
+
+      float32x4_t mul1 = vmulq_f32(a, b_yzx);
+      float32x4_t mul2 = vmulq_f32(a_yzx, b);
+
+      float32x4_t c = vsubq_f32(mul1, mul2);
+
+      // Rotate back to x,y,z and keep w from original 'a'
+      float32x4_t xyz = vextq_f32(c, c, 3); // x,y,z in lanes 0..2
+      float32x4_t result = vsetq_lane_f32(vgetq_lane_f32(a, 3), xyz, 3); // preserve w
+      return result;
+   }
+}
+
+#include "float3_impl.inl"
+
+namespace math_backend::float3::dispatch
+{
+   // Install NEON backend
+   void install_neon()
+   {
+      gFloat3.add           = float3_add_impl;
+      gFloat3.sub           = float3_sub_impl;
+      gFloat3.mul           = float3_mul_impl;
+      gFloat3.mul_scalar    = float3_mul_scalar_impl;
+      gFloat3.div           = float3_div_impl;
+      gFloat3.div_scalar    = float3_div_scalar_impl;
+      gFloat3.dot           = float3_dot_impl;
+      gFloat3.length        = float3_length_impl;
+      gFloat3.lengthSquared = float3_length_squared_impl;
+      gFloat3.normalize     = float3_normalize_impl;
+      gFloat3.normalize_mag = float3_normalize_mag_impl;
+      gFloat3.lerp          = float3_lerp_impl;
+      gFloat3.cross         = float3_cross_impl;
+   }
+}
diff --git a/Engine/source/math/isa/neon/float4.cpp b/Engine/source/math/isa/neon/float4.cpp
index 6258db743..59092ae28 100644
--- a/Engine/source/math/isa/neon/float4.cpp
+++ b/Engine/source/math/isa/neon/float4.cpp
@@ -1,50 +1,84 @@
 #include "float4_dispatch.h"
-#include <arm_neon.h>
+#include <arm_neon.h> // NEON intrinsics
 
 namespace
 {
-   typedef float32x4_t f32x4;
+   typedef float32x4_t f32x4;
 
-   inline f32x4 v_load(const float* p) { return vld1q_f32(p); }
-   inline void v_store(float* dst, f32x4 v) { vst1q_f32(dst, v); }
-   inline f32x4 v_set1(float s) { return vdupq_n_f32(s); }
+   // Load 4 floats from memory into a SIMD register
+   inline f32x4 v_load(const float* p) { return vld1q_f32(p); }
 
-   inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); }
-   inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); }
-   inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); }
+   // Store 4 floats from SIMD register back to memory
+   inline void v_store(float* dst, f32x4 v) { vst1q_f32(dst, v); }
 
-   // AArch64 native divide
-   inline f32x4 v_div(f32x4 a, f32x4 b)
-   {
-      return vdivq_f32(a, b);
-   }
+   // Broadcast a single float across all 4 lanes
+   inline f32x4 v_set1(float s) { return vdupq_n_f32(s); }
+
+   // Element-wise multiply
+   inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); }
+
+   // Element-wise divide (approximate fast reciprocal)
+   inline f32x4 v_div(f32x4 a, f32x4 b)
+   {
+      float32x4_t rcp = vrecpeq_f32(b);
+      // Refine reciprocal for better precision
+      rcp = vmulq_f32(vrecpsq_f32(b, rcp), rcp);
+      return vmulq_f32(a, rcp);
+   }
+
+   // Element-wise add
+   inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); }
+
+   // Element-wise subtract
+   inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); }
+
+   // Horizontal sum of all 4 elements (for dot product, length, etc.)
+   inline float v_hadd4(f32x4 a)
+   {
+      float32x2_t sum_pair = vadd_f32(vget_low_f32(a), vget_high_f32(a)); // add pairs [a0+a2, a1+a3]
+      float32x2_t sum = vpadd_f32(sum_pair, sum_pair); // horizontal add: total sum
+      return vget_lane_f32(sum, 0);
+   }
+
+   // Optimized cross product for float4 (w component preserved)
+   inline f32x4 v_cross(f32x4 a, f32x4 b)
+   {
+      // Extract xyz as separate registers
+      float32x4_t a_yzx = vextq_f32(a, a, 1); // rotate left: y,z,x,w
+      float32x4_t b_yzx = vextq_f32(b, b, 1);
+
+      float32x4_t mul1 = vmulq_f32(a, b_yzx);
+      float32x4_t mul2 = vmulq_f32(a_yzx, b);
+
+      float32x4_t c = vsubq_f32(mul1, mul2);
+
+      // Rotate back to x,y,z and keep w from original 'a'
+      float32x4_t xyz = vextq_f32(c, c, 3); // x,y,z in lanes 0..2
+      float32x4_t result = vsetq_lane_f32(vgetq_lane_f32(a, 3), xyz, 3); // preserve w
+      return result;
+   }
 
-   inline float v_hadd4(f32x4 a)
-   {
-      float32x2_t low = vget_low_f32(a);
-      float32x2_t high = vget_high_f32(a);
-      float32x2_t sum = vadd_f32(low, high);
-      sum = vpadd_f32(sum, sum);
-      return vget_lane_f32(sum, 0);
-   }
 }
 
-#include "../../impl/float4_impl.inl"
+#include "float4_impl.inl"
 
 namespace math_backend::float4::dispatch
 {
+   // Install NEON64 backend
    void install_neon()
    {
-      gFloat4.add = float4_add_impl;
-      gFloat4.sub = float4_sub_impl;
-      gFloat4.mul = float4_mul_impl;
-      gFloat4.mul_scalar = float4_mul_scalar_impl;
-      gFloat4.div = float4_div_impl;
-      gFloat4.div_scalar = float4_div_scalar_impl;
-      gFloat4.dot = float4_dot_impl;
-      gFloat4.length = float4_length_impl;
+      gFloat4.add           = float4_add_impl;
+      gFloat4.sub           = float4_sub_impl;
+      gFloat4.mul           = float4_mul_impl;
+      gFloat4.mul_scalar    = float4_mul_scalar_impl;
+      gFloat4.div           = float4_div_impl;
+      gFloat4.div_scalar    = float4_div_scalar_impl;
+      gFloat4.dot           = float4_dot_impl;
+      gFloat4.length        = float4_length_impl;
       gFloat4.lengthSquared = float4_length_squared_impl;
-      gFloat4.normalize = float4_normalize_impl;
-      gFloat4.lerp = float4_lerp_impl;
+      gFloat4.normalize     = float4_normalize_impl;
+      gFloat4.normalize_mag = float4_normalize_mag_impl;
+      gFloat4.lerp          = float4_lerp_impl;
+      gFloat4.cross         = float4_cross_impl;
    }
 }
diff --git a/Engine/source/math/public/math_backend.cpp b/Engine/source/math/public/math_backend.cpp
index 7683758e2..0e3d6f592 100644
--- a/Engine/source/math/public/math_backend.cpp
+++ b/Engine/source/math/public/math_backend.cpp
@@ -48,6 +48,7 @@ void math_backend::install_from_cpu_flags(uint32_t cpu_flags)
 #elif defined(__aarch64__) || defined(__ARM_NEON)
       case backend::neon:
          float4::dispatch::install_neon();
+         float3::dispatch::install_neon();
          break;
 #endif
       default:
diff --git a/Engine/source/platformMac/macMath.mm b/Engine/source/platformMac/macMath.mm
index 4feefb277..c275bdd17 100644
--- a/Engine/source/platformMac/macMath.mm
+++ b/Engine/source/platformMac/macMath.mm
@@ -25,6 +25,7 @@
 #import "math/mMath.h"
 #import "core/strings/stringFunctions.h"
 #include "console/engineAPI.h"
+#include "math/public/math_backend.h"
 
 extern void mInstallLibrary_C();
 
@@ -107,7 +108,9 @@ void Math::init(U32 properties)
 
    Con::printf("Math Init:");
    Con::printf("   Installing Standard C extensions");
-   mInstallLibrary_C();
+   mInstallLibrary_C();
+
+   math_backend::install_from_cpu_flags(properties);
 
 #ifdef TORQUE_CPU_X86
    if( properties & CPU_PROP_SSE )