neon implementation

removed some x86 intrinsic functions that were in the mat44_impl file
Reinstated some mMath_C functions and mMathFn pointers while trying to diagnose an issue.
Had to come up with a different way to initialize the scalar table if the isa tables are not initialized yet. Mac did not like the static initialization.
Had to change NEON over to using explicit masks for shifting; the cross product was failing during bakes and matrix calculations.
This commit is contained in:
marauder2k7 2026-03-04 08:41:57 +00:00
parent bb1478a8c3
commit 0ba8d948fb
10 changed files with 521 additions and 142 deletions

View file

@ -14,14 +14,82 @@ namespace
// Vector with all four lanes set to 0.0f.
inline f32x4 v_zero()
{
   return vdupq_n_f32(0.0f);
}
// Return lane 0 of v as a scalar float.
inline float v_extract0(f32x4 v)
{
   return vgetq_lane_f32(v, 0);
}
// Build a vector with explicit lane order: lane0=x, lane1=y, lane2=z, lane3=w.
inline f32x4 v_set(float x, float y, float z, float w)
{
   const f32x4 result = { x, y, z, w };
   return result;
}
// Replace lane 3 (w) of v with lane 0 of w; lanes x/y/z are untouched.
// A single lane insert does the job of the previous broadcast + vbslq
// bit-select sequence (same result, fewer instructions).
inline f32x4 v_insert_w(f32x4 v, f32x4 w)
{
   return vsetq_lane_f32(vgetq_lane_f32(w, 0), v, 3);
}
//------------------------------------------------------
// Mask helpers
//------------------------------------------------------
// All-ones bits in lanes x/y/z, all-zero bits in lane w — for bitwise
// masking via v_and (see m_determinant_affine).
// The previous early return of a float {1,1,1,0} mask made the bit mask
// below unreachable; it was also wrong for bitwise use, because the bit
// pattern of 1.0f (0x3F800000) is not all-ones and would corrupt any
// value it was ANDed with.
inline f32x4 v_mask_xyz()
{
   const uint32x4_t mask = {
      0xFFFFFFFFu,
      0xFFFFFFFFu,
      0xFFFFFFFFu,
      0x00000000u
   };
   return vreinterpretq_f32_u32(mask);
}
// Bitwise AND of two float vectors. NEON has no float AND, so the lanes
// round-trip through u32 reinterpretation (bit-exact, no conversion).
inline f32x4 v_and(f32x4 a, f32x4 b)
{
   const uint32x4_t bitsA = vreinterpretq_u32_f32(a);
   const uint32x4_t bitsB = vreinterpretq_u32_f32(b);
   return vreinterpretq_f32_u32(vandq_u32(bitsA, bitsB));
}
// Broadcast float lane `x` of v into all four lanes using a byte-wise
// table lookup. Works for a runtime lane index, which the immediate-only
// vdupq_laneq_f32 cannot do.
inline f32x4 v_swizzle_singular_mask(f32x4 v, int x)
{
   // Byte offsets within one float, repeated for each destination lane.
   const uint8x16_t byteOffsets = {
      0, 1, 2, 3,  0, 1, 2, 3,
      0, 1, 2, 3,  0, 1, 2, 3
   };
   // Starting byte of the requested float lane.
   const uint8x16_t laneBase = vdupq_n_u8((uint8_t)(x * 4));
   const uint8x16_t tblIndex = vaddq_u8(laneBase, byteOffsets);
   return vreinterpretq_f32_u8(vqtbl1q_u8(vreinterpretq_u8_f32(v), tblIndex));
}
// Duplicate the even lanes: [x, y, z, w] -> [x, x, z, z].
inline f32x4 v_swizzle_lo(f32x4 v)
{
   const float32x4x2_t zipped = vzipq_f32(v, v);
   return zipped.val[0];
}
// Duplicate the odd lanes: [x, y, z, w] -> [y, y, w, w].
inline f32x4 v_swizzle_hi(f32x4 v)
{
   const float32x4x2_t zipped = vzipq_f32(v, v);
   return zipped.val[1];
}
inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
@ -109,10 +177,30 @@ namespace
// 3D cross product: c = a x b = (a.yzx * b.zxy) - (a.zxy * b.yzx).
// The w lane of the result is 0 because every shuffled operand carries
// 0.0f in lane 3.
// Removed the leftover pre-rewrite implementation that redeclared a_yzx
// and b_yzx and left a second return — the block would not compile with
// both versions present.
inline f32x4 v_cross(f32x4 a, f32x4 b)
{
   f32x4 a_yzx = { vgetq_lane_f32(a, 1),
                   vgetq_lane_f32(a, 2),
                   vgetq_lane_f32(a, 0),
                   0.0f };
   f32x4 b_zxy = { vgetq_lane_f32(b, 2),
                   vgetq_lane_f32(b, 0),
                   vgetq_lane_f32(b, 1),
                   0.0f };
   f32x4 a_zxy = { vgetq_lane_f32(a, 2),
                   vgetq_lane_f32(a, 0),
                   vgetq_lane_f32(a, 1),
                   0.0f };
   f32x4 b_yzx = { vgetq_lane_f32(b, 1),
                   vgetq_lane_f32(b, 2),
                   vgetq_lane_f32(b, 0),
                   0.0f };
   return vsubq_f32(
      vmulq_f32(a_yzx, b_zxy),
      vmulq_f32(a_zxy, b_yzx)
   );
}
inline f32x4 v_normalize3(f32x4 v)
@ -127,4 +215,188 @@ namespace
float sum = vget_lane_f32(sum2,0) + vget_lane_f32(sum2,1);
return vdupq_n_f32(sum);
}
//------------------------------------------------------
// Matrix type (row-major 4x4)
//------------------------------------------------------
// Row-major 4x4 matrix stored as four 128-bit float vectors, one per row.
struct f32x4x4
{
f32x4 r0; // row 0
f32x4 r1; // row 1
f32x4 r2; // row 2
f32x4 r3; // row 3
};
// Load a row-major 4x4 matrix from 16 contiguous floats, one 4-float
// load per row.
inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major)
{
   f32x4x4 result;
   result.r0 = v_load(m);
   result.r1 = v_load(m + 4);
   result.r2 = v_load(m + 8);
   result.r3 = v_load(m + 12);
   return result;
}
// Write the four rows back out as 16 contiguous floats (row-major).
inline void m_store(float* dst, const f32x4x4& m)
{
   v_store(dst,      m.r0);
   v_store(dst + 4,  m.r1);
   v_store(dst + 8,  m.r2);
   v_store(dst + 12, m.r3);
}
// The 4x4 identity matrix.
inline f32x4x4 m_identity()
{
   f32x4x4 id;
   id.r0 = f32x4{ 1.0f, 0.0f, 0.0f, 0.0f };
   id.r1 = f32x4{ 0.0f, 1.0f, 0.0f, 0.0f };
   id.r2 = f32x4{ 0.0f, 0.0f, 1.0f, 0.0f };
   id.r3 = f32x4{ 0.0f, 0.0f, 0.0f, 1.0f };
   return id;
}
// A matrix with all sixteen elements zero.
inline f32x4x4 m_zero()
{
   const f32x4 zero = v_zero();
   f32x4x4 out;
   out.r0 = zero;
   out.r1 = zero;
   out.r2 = zero;
   out.r3 = zero;
   return out;
}
// Transform a 4-component vector: result[i] = row_i(m) . v.
inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v)
{
   return {
      v_extract0(v_dot4(m.r0, v)),
      v_extract0(v_dot4(m.r1, v)),
      v_extract0(v_dot4(m.r2, v)),
      v_extract0(v_dot4(m.r3, v))
   };
}
// Transform the xyz components only (3-component dot per row); the w lane
// of the result is forced to 0.
inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v)
{
   const f32x4 out = {
      v_extract0(v_dot3(m.r0, v)),
      v_extract0(v_dot3(m.r1, v)),
      v_extract0(v_dot3(m.r2, v)),
      0.0f
   };
   return out;
}
// Transpose via two lane-pair interleaves (vtrnq) followed by 64-bit
// half recombination — the classic NEON 4x4 transpose.
inline f32x4x4 m_transpose(const f32x4x4& m)
{
   const float32x4x2_t pair01 = vtrnq_f32(m.r0, m.r1); // interleave rows 0/1
   const float32x4x2_t pair23 = vtrnq_f32(m.r2, m.r3); // interleave rows 2/3
   f32x4x4 out;
   out.r0 = vcombine_f32(vget_low_f32(pair01.val[0]),  vget_low_f32(pair23.val[0]));
   out.r1 = vcombine_f32(vget_low_f32(pair01.val[1]),  vget_low_f32(pair23.val[1]));
   out.r2 = vcombine_f32(vget_high_f32(pair01.val[0]), vget_high_f32(pair23.val[0]));
   out.r3 = vcombine_f32(vget_high_f32(pair01.val[1]), vget_high_f32(pair23.val[1]));
   return out;
}
// Row-major matrix product C = A * B. B is transposed once so each output
// element becomes a row(A) . row(Bt) dot product.
inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b)
{
   const f32x4x4 bCols = m_transpose(b);
   auto dotRow = [&bCols](f32x4 row) -> f32x4
   {
      return f32x4{
         v_extract0(v_dot4(row, bCols.r0)),
         v_extract0(v_dot4(row, bCols.r1)),
         v_extract0(v_dot4(row, bCols.r2)),
         v_extract0(v_dot4(row, bCols.r3))
      };
   };
   f32x4x4 result;
   result.r0 = dotRow(a.r0);
   result.r1 = dotRow(a.r1);
   result.r2 = dotRow(a.r2);
   result.r3 = dotRow(a.r3);
   return result;
}
// Determinant, computed as a sum of scalar triple products of the rows,
// broadcast to all four lanes of the return value.
inline f32x4 m_determinant(const f32x4x4& m)
{
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;
// v_cross zeroes its w lane, so each product below contributes xyz only.
f32x4 c0 = v_cross(c, d);
f32x4 c1 = v_cross(d, b);
f32x4 c2 = v_cross(b, c);
f32x4 term0 = vmulq_f32(a, c0);
f32x4 term1 = vmulq_f32(a, c1);
f32x4 term2 = vmulq_f32(a, c2);
// NOTE(review): because every cross product has w == 0, a.w never
// contributes here, so the a.w * cofactor terms of a general 4x4
// cofactor expansion are missing. This can only equal the true 4x4
// determinant for matrices with special structure — confirm the intended
// input class against callers and the SSE/scalar reference paths.
f32x4 det = vaddq_f32(term0, vaddq_f32(term1, term2));
// Horizontal sum of all four lanes, broadcast.
return v_hadd4(det);
}
// Determinant of an affine matrix: scalar triple product of the upper-left
// 3x3 block, row0 . (row1 x row2), with w lanes masked to zero first.
inline f32x4 m_determinant_affine(const f32x4x4& m)
{
   const f32x4 xyzMask = v_mask_xyz();
   const f32x4 row0 = v_and(m.r0, xyzMask);
   const f32x4 row1 = v_and(m.r1, xyzMask);
   const f32x4 row2 = v_and(m.r2, xyzMask);
   return v_dot3(row0, v_cross(row1, row2));
}
// Inverse built from cross products of the rows, scaled by a reciprocal
// determinant estimate, then transposed.
inline f32x4x4 m_inverse(const f32x4x4& m)
{
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;
f32x4 c0 = v_cross(b, c);
f32x4 c1 = v_cross(c, d);
f32x4 c2 = v_cross(d, a);
f32x4 c3 = v_cross(a, b);
// NOTE(review): the cross-product adjugate identity is a 3x3 trick; for a
// general 4x4 the adjugate rows are 3x3 cofactor determinants that these
// four cross products do not produce. Likewise det = a . (c x d) is not
// the full 4x4 determinant (a.w is multiplied by the zeroed w lane of the
// cross product). Verify this path against the scalar/SSE reference
// inverse, or restrict its callers to the matrix class it is valid for.
f32x4 det = v_dot4(a, c1);
// v_rcp_nr: reciprocal with Newton-Raphson refinement (approximate).
f32x4 invDet = v_rcp_nr(det);
f32x4x4 adj;
adj.r0 = vmulq_f32(c1, invDet);
adj.r1 = vmulq_f32(c2, invDet);
adj.r2 = vmulq_f32(c3, invDet);
adj.r3 = vmulq_f32(c0, invDet);
return m_transpose(adj);
}
}