neon implementation

removed some x86 intrinsic functions that were in the mat44_impl file
reinstated some mMath_C functions and mMathFn ptrs trying to diagnose an issue.
Had to come up with a different way to initialize the scalar table if the isa tables are not initialized yet. Mac did not like the static initialization.
Had to change neon over to using explicit masks for shifting, cross product was failing during bakes and matrix calculations
This commit is contained in:
marauder2k7 2026-03-04 08:41:57 +00:00
parent bb1478a8c3
commit 0ba8d948fb
10 changed files with 521 additions and 142 deletions

View file

@ -239,24 +239,6 @@ namespace math_backend::mat44
m_store(m, r);
}
inline void mat44_trs_impl(float* m, const float* t, const float* r_euler, const float* s)
{
f32x4x4 mr;
mat44_rotation_euler_impl((float*)&mr, r_euler[0], r_euler[1], r_euler[2]);
f32x4 vs = v_load3_vec(s); // scale xyz
mr.r0 = v_mul(mr.r0, vs);
mr.r1 = v_mul(mr.r1, vs);
mr.r2 = v_mul(mr.r2, vs);
mr.r0 = v_insert_w(mr.r0, _mm_set_ss(t[0]));
mr.r1 = v_insert_w(mr.r1, _mm_set_ss(t[1]));
mr.r2 = v_insert_w(mr.r2, _mm_set_ss(t[2]));
mr.r3 = v_set(0, 0, 0, 1.0f);
m_store(m, mr);
}
inline void mat44_lookat_impl(float* m, const float* eye, const float* target, const float* up)
{
f32x4 vEye = v_load3_pos(eye);

View file

@ -318,26 +318,26 @@ namespace math_backend::mat44::dispatch
r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2];
};
gMat44.mul_mat44 = [](const float* a, const float* b, float* r) {
r[0] = a[0] * b[0] + a[1] * b[4] + a[2] * b[8] + a[3] * b[12];
r[1] = a[0] * b[1] + a[1] * b[5] + a[2] * b[9] + a[3] * b[13];
r[2] = a[0] * b[2] + a[1] * b[6] + a[2] * b[10] + a[3] * b[14];
r[3] = a[0] * b[3] + a[1] * b[7] + a[2] * b[11] + a[3] * b[15];
gMat44.mul_mat44 = [](const float* a, const float* b, float* mresult) {
mresult[0] = a[0]*b[0] + a[1]*b[4] + a[2]*b[8] + a[3]*b[12];
mresult[1] = a[0]*b[1] + a[1]*b[5] + a[2]*b[9] + a[3]*b[13];
mresult[2] = a[0]*b[2] + a[1]*b[6] + a[2]*b[10] + a[3]*b[14];
mresult[3] = a[0]*b[3] + a[1]*b[7] + a[2]*b[11] + a[3]*b[15];
r[4] = a[4] * b[0] + a[5] * b[4] + a[6] * b[8] + a[7] * b[12];
r[5] = a[4] * b[1] + a[5] * b[5] + a[6] * b[9] + a[7] * b[13];
r[6] = a[4] * b[2] + a[5] * b[6] + a[6] * b[10] + a[7] * b[14];
r[7] = a[4] * b[3] + a[5] * b[7] + a[6] * b[11] + a[7] * b[15];
mresult[4] = a[4]*b[0] + a[5]*b[4] + a[6]*b[8] + a[7]*b[12];
mresult[5] = a[4]*b[1] + a[5]*b[5] + a[6]*b[9] + a[7]*b[13];
mresult[6] = a[4]*b[2] + a[5]*b[6] + a[6]*b[10] + a[7]*b[14];
mresult[7] = a[4]*b[3] + a[5]*b[7] + a[6]*b[11] + a[7]*b[15];
r[8] = a[8] * b[0] + a[9] * b[4] + a[10] * b[8] + a[11] * b[12];
r[9] = a[8] * b[1] + a[9] * b[5] + a[10] * b[9] + a[11] * b[13];
r[10] = a[8] * b[2] + a[9] * b[6] + a[10] * b[10] + a[11] * b[14];
r[11] = a[8] * b[3] + a[9] * b[7] + a[10] * b[11] + a[11] * b[15];
mresult[8] = a[8]*b[0] + a[9]*b[4] + a[10]*b[8] + a[11]*b[12];
mresult[9] = a[8]*b[1] + a[9]*b[5] + a[10]*b[9] + a[11]*b[13];
mresult[10]= a[8]*b[2] + a[9]*b[6] + a[10]*b[10]+ a[11]*b[14];
mresult[11]= a[8]*b[3] + a[9]*b[7] + a[10]*b[11]+ a[11]*b[15];
r[12] = a[12] * b[0] + a[13] * b[4] + a[14] * b[8] + a[15] * b[12];
r[13] = a[12] * b[1] + a[13] * b[5] + a[14] * b[9] + a[15] * b[13];
r[14] = a[12] * b[2] + a[13] * b[6] + a[14] * b[10] + a[15] * b[14];
r[15] = a[12] * b[3] + a[13] * b[7] + a[14] * b[11] + a[15] * b[15];
mresult[12]= a[12]*b[0]+ a[13]*b[4]+ a[14]*b[8] + a[15]*b[12];
mresult[13]= a[12]*b[1]+ a[13]*b[5]+ a[14]*b[9] + a[15]*b[13];
mresult[14]= a[12]*b[2]+ a[13]*b[6]+ a[14]*b[10]+ a[15]*b[14];
mresult[15]= a[12]*b[3]+ a[13]*b[7]+ a[14]*b[11]+ a[15]*b[15];
};
gMat44.normalize = [](float* a) {