diff --git a/Engine/source/math/impl/mat44_impl.inl b/Engine/source/math/impl/mat44_impl.inl index a9635cff0..c27daeb45 100644 --- a/Engine/source/math/impl/mat44_impl.inl +++ b/Engine/source/math/impl/mat44_impl.inl @@ -239,24 +239,6 @@ namespace math_backend::mat44 m_store(m, r); } - inline void mat44_trs_impl(float* m, const float* t, const float* r_euler, const float* s) - { - f32x4x4 mr; - mat44_rotation_euler_impl((float*)&mr, r_euler[0], r_euler[1], r_euler[2]); - - f32x4 vs = v_load3_vec(s); // scale xyz - mr.r0 = v_mul(mr.r0, vs); - mr.r1 = v_mul(mr.r1, vs); - mr.r2 = v_mul(mr.r2, vs); - - mr.r0 = v_insert_w(mr.r0, _mm_set_ss(t[0])); - mr.r1 = v_insert_w(mr.r1, _mm_set_ss(t[1])); - mr.r2 = v_insert_w(mr.r2, _mm_set_ss(t[2])); - mr.r3 = v_set(0, 0, 0, 1.0f); - - m_store(m, mr); - } - inline void mat44_lookat_impl(float* m, const float* eye, const float* target, const float* up) { f32x4 vEye = v_load3_pos(eye); diff --git a/Engine/source/math/impl/math_c.cpp b/Engine/source/math/impl/math_c.cpp index 3fe449a1b..01640ca1f 100644 --- a/Engine/source/math/impl/math_c.cpp +++ b/Engine/source/math/impl/math_c.cpp @@ -318,26 +318,26 @@ namespace math_backend::mat44::dispatch r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2]; }; - gMat44.mul_mat44 = [](const float* a, const float* b, float* r) { - r[0] = a[0] * b[0] + a[1] * b[4] + a[2] * b[8] + a[3] * b[12]; - r[1] = a[0] * b[1] + a[1] * b[5] + a[2] * b[9] + a[3] * b[13]; - r[2] = a[0] * b[2] + a[1] * b[6] + a[2] * b[10] + a[3] * b[14]; - r[3] = a[0] * b[3] + a[1] * b[7] + a[2] * b[11] + a[3] * b[15]; + gMat44.mul_mat44 = [](const float* a, const float* b, float* mresult) { + mresult[0] = a[0]*b[0] + a[1]*b[4] + a[2]*b[8] + a[3]*b[12]; + mresult[1] = a[0]*b[1] + a[1]*b[5] + a[2]*b[9] + a[3]*b[13]; + mresult[2] = a[0]*b[2] + a[1]*b[6] + a[2]*b[10] + a[3]*b[14]; + mresult[3] = a[0]*b[3] + a[1]*b[7] + a[2]*b[11] + a[3]*b[15]; - r[4] = a[4] * b[0] + a[5] * b[4] + a[6] * b[8] + a[7] * b[12]; - r[5] = a[4] * b[1] + a[5] * b[5] + a[6] * b[9] + a[7] * b[13]; - r[6] = a[4] * b[2] + a[5] * b[6] + a[6] * b[10] + a[7] * b[14]; - r[7] = a[4] * b[3] + a[5] * b[7] + a[6] * b[11] + a[7] * b[15]; + mresult[4] = a[4]*b[0] + a[5]*b[4] + a[6]*b[8] + a[7]*b[12]; + mresult[5] = a[4]*b[1] + a[5]*b[5] + a[6]*b[9] + a[7]*b[13]; + mresult[6] = a[4]*b[2] + a[5]*b[6] + a[6]*b[10] + a[7]*b[14]; + mresult[7] = a[4]*b[3] + a[5]*b[7] + a[6]*b[11] + a[7]*b[15]; - r[8] = a[8] * b[0] + a[9] * b[4] + a[10] * b[8] + a[11] * b[12]; - r[9] = a[8] * b[1] + a[9] * b[5] + a[10] * b[9] + a[11] * b[13]; - r[10] = a[8] * b[2] + a[9] * b[6] + a[10] * b[10] + a[11] * b[14]; - r[11] = a[8] * b[3] + a[9] * b[7] + a[10] * b[11] + a[11] * b[15]; + mresult[8] = a[8]*b[0] + a[9]*b[4] + a[10]*b[8] + a[11]*b[12]; + mresult[9] = a[8]*b[1] + a[9]*b[5] + a[10]*b[9] + a[11]*b[13]; + mresult[10]= a[8]*b[2] + a[9]*b[6] + a[10]*b[10]+ a[11]*b[14]; + mresult[11]= a[8]*b[3] + a[9]*b[7] + a[10]*b[11]+ a[11]*b[15]; - r[12] = a[12] * b[0] + a[13] * b[4] + a[14] * b[8] + a[15] * b[12]; - r[13] = a[12] * b[1] + a[13] * b[5] + a[14] * b[9] + a[15] * b[13]; - r[14] = a[12] * b[2] + a[13] * b[6] + a[14] * b[10] + a[15] * b[14]; - r[15] = a[12] * b[3] + a[13] * b[7] + a[14] * b[11] + a[15] * b[15]; + mresult[12]= a[12]*b[0]+ a[13]*b[4]+ a[14]*b[8] + a[15]*b[12]; + mresult[13]= a[12]*b[1]+ a[13]*b[5]+ a[14]*b[9] + a[15]*b[13]; + mresult[14]= a[12]*b[2]+ a[13]*b[6]+ a[14]*b[10]+ a[15]*b[14]; + mresult[15]= a[12]*b[3]+ a[13]*b[7]+ a[14]*b[11]+ a[15]*b[15]; }; gMat44.normalize = [](float* a) { diff --git a/Engine/source/math/isa/neon/neon_intrinsics.h b/Engine/source/math/isa/neon/neon_intrinsics.h index 3476fab1b..e6e8ef123 100644 --- a/Engine/source/math/isa/neon/neon_intrinsics.h +++ b/Engine/source/math/isa/neon/neon_intrinsics.h @@ -14,14 +14,82 @@ namespace inline f32x4 v_zero() { return vdupq_n_f32(0.0f); } inline float v_extract0(f32x4 v) { return vgetq_lane_f32(v, 0); } + inline f32x4 v_set(float x, float y, float z, float w) + { + return { x, y, z, w }; // lane0=x, lane1=y, lane2=z, lane3=w + } + + inline f32x4 v_insert_w(f32x4 v, f32x4 w) + { + // extract scalar value from w lane0 + float w_val = vgetq_lane_f32(w, 0); + + // broadcast w_val to all lanes (we only need lane3 later) + f32x4 w_broadcast = vdupq_n_f32(w_val); + + // mask to select only lane3 + uint32x4_t mask = {0, 0, 0, 0xFFFFFFFF}; + + // vbslq: if mask bit=1, take from first argument, else take from second + return vbslq_f32(mask, w_broadcast, v); + } + //------------------------------------------------------ // Mask helpers //------------------------------------------------------ inline f32x4 v_mask_xyz() { - // equivalent to [1,1,1,0] - float32x4_t mask = {1.0f, 1.0f, 1.0f, 0.0f}; - return mask; + uint32x4_t mask = { + 0xFFFFFFFF, + 0xFFFFFFFF, + 0xFFFFFFFF, + 0x00000000 + }; + + return vreinterpretq_f32_u32(mask); + } + + inline f32x4 v_and(f32x4 a, f32x4 b) + { + return vreinterpretq_f32_u32( + vandq_u32( + vreinterpretq_u32_f32(a), + vreinterpretq_u32_f32(b) + ) + ); + } + + inline f32x4 v_swizzle_singular_mask(f32x4 v, int x) + { + // base byte index of the float lane + uint8x16_t base = vdupq_n_u8((uint8_t)(x * 4)); + + // byte offsets inside a float (0,1,2,3 repeated 4 times) + const uint8x16_t offsets = { + 0,1,2,3, 0,1,2,3, + 0,1,2,3, 0,1,2,3 + }; + + uint8x16_t idx = vaddq_u8(base, offsets); + + return vreinterpretq_f32_u8( + vqtbl1q_u8( + vreinterpretq_u8_f32(v), + idx + ) + ); + } + + inline f32x4 v_swizzle_lo(f32x4 v) + { + float32x4x2_t t = vzipq_f32(v, v); + return t.val[0]; // [x, x, z, z] + } + + inline f32x4 v_swizzle_hi(f32x4 v) + { + float32x4x2_t t = vzipq_f32(v, v); + return t.val[1]; // [y, y, w, w] } inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) @@ -109,10 +177,30 @@ namespace inline f32x4 v_cross(f32x4 a, f32x4 b) { - float32x4_t a_yzx = { vgetq_lane_f32(a,1), vgetq_lane_f32(a,2), vgetq_lane_f32(a,0), 0 }; - float32x4_t b_yzx = { vgetq_lane_f32(b,1), vgetq_lane_f32(b,2), vgetq_lane_f32(b,0), 0 }; - float32x4_t c = vsubq_f32(vmulq_f32(a, b_yzx), vmulq_f32(a_yzx, b)); - return (float32x4_t){ vgetq_lane_f32(c,2), vgetq_lane_f32(c,0), vgetq_lane_f32(c,1), 0 }; + f32x4 a_yzx = { vgetq_lane_f32(a,1), + vgetq_lane_f32(a,2), + vgetq_lane_f32(a,0), + 0.0f }; + + f32x4 b_zxy = { vgetq_lane_f32(b,2), + vgetq_lane_f32(b,0), + vgetq_lane_f32(b,1), + 0.0f }; + + f32x4 a_zxy = { vgetq_lane_f32(a,2), + vgetq_lane_f32(a,0), + vgetq_lane_f32(a,1), + 0.0f }; + + f32x4 b_yzx = { vgetq_lane_f32(b,1), + vgetq_lane_f32(b,2), + vgetq_lane_f32(b,0), + 0.0f }; + + return vsubq_f32( + vmulq_f32(a_yzx, b_zxy), + vmulq_f32(a_zxy, b_yzx) + ); } inline f32x4 v_normalize3(f32x4 v) @@ -127,4 +215,188 @@ namespace float sum = vget_lane_f32(sum2,0) + vget_lane_f32(sum2,1); return vdupq_n_f32(sum); } + + + //------------------------------------------------------ + // Matrix type (row-major 4x4) + //------------------------------------------------------ + + struct f32x4x4 + { + f32x4 r0; + f32x4 r1; + f32x4 r2; + f32x4 r3; + }; + + inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major) + { + f32x4x4 out; + out.r0 = v_load(m + 0); + out.r1 = v_load(m + 4); + out.r2 = v_load(m + 8); + out.r3 = v_load(m + 12); + return out; + } + + inline void m_store(float* dst, const f32x4x4& m) + { + v_store(dst + 0, m.r0); + v_store(dst + 4, m.r1); + v_store(dst + 8, m.r2); + v_store(dst + 12, m.r3); + } + + inline f32x4x4 m_identity() + { + f32x4x4 m; + m.r0 = {1,0,0,0}; + m.r1 = {0,1,0,0}; + m.r2 = {0,0,1,0}; + m.r3 = {0,0,0,1}; + return m; + } + + inline f32x4x4 m_zero() + { + f32x4 z = v_zero(); + return { z, z, z, z }; + } + + inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot4(m.r0, v); + f32x4 y = v_dot4(m.r1, v); + f32x4 z = v_dot4(m.r2, v); + f32x4 w = v_dot4(m.r3, v); + + return { + v_extract0(x), + v_extract0(y), + v_extract0(z), + v_extract0(w) + }; + } + + inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot3(m.r0, v); + f32x4 y = v_dot3(m.r1, v); + f32x4 z = v_dot3(m.r2, v); + + return { + v_extract0(x), + v_extract0(y), + v_extract0(z), + 0.0f + }; + } + + inline f32x4x4 m_transpose(const f32x4x4& m) + { + float32x4x2_t t0 = vtrnq_f32(m.r0, m.r1); + float32x4x2_t t1 = vtrnq_f32(m.r2, m.r3); + + float32x2_t a0 = vget_low_f32(t0.val[0]); + float32x2_t a1 = vget_high_f32(t0.val[0]); + float32x2_t a2 = vget_low_f32(t1.val[0]); + float32x2_t a3 = vget_high_f32(t1.val[0]); + + float32x2_t b0 = vget_low_f32(t0.val[1]); + float32x2_t b1 = vget_high_f32(t0.val[1]); + float32x2_t b2 = vget_low_f32(t1.val[1]); + float32x2_t b3 = vget_high_f32(t1.val[1]); + + f32x4x4 out; + + out.r0 = vcombine_f32(a0, a2); + out.r1 = vcombine_f32(b0, b2); + out.r2 = vcombine_f32(a1, a3); + out.r3 = vcombine_f32(b1, b3); + + return out; + } + + inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b) + { + f32x4x4 bt = m_transpose(b); + + auto mul_row = [&](f32x4 row) + { + f32x4 x = v_dot4(row, bt.r0); + f32x4 y = v_dot4(row, bt.r1); + f32x4 z = v_dot4(row, bt.r2); + f32x4 w = v_dot4(row, bt.r3); + + return f32x4{ + v_extract0(x), + v_extract0(y), + v_extract0(z), + v_extract0(w) + }; + }; + + f32x4x4 C; + C.r0 = mul_row(a.r0); + C.r1 = mul_row(a.r1); + C.r2 = mul_row(a.r2); + C.r3 = mul_row(a.r3); + + return C; + } + + inline f32x4 m_determinant(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(c, d); + f32x4 c1 = v_cross(d, b); + f32x4 c2 = v_cross(b, c); + + f32x4 term0 = vmulq_f32(a, c0); + f32x4 term1 = vmulq_f32(a, c1); + f32x4 term2 = vmulq_f32(a, c2); + + f32x4 det = vaddq_f32(term0, vaddq_f32(term1, term2)); + + return v_hadd4(det); + } + + inline f32x4 m_determinant_affine(const f32x4x4& m) + { + f32x4 r0 = v_and(m.r0, v_mask_xyz()); + f32x4 r1 = v_and(m.r1, v_mask_xyz()); + f32x4 r2 = v_and(m.r2, v_mask_xyz()); + + f32x4 c0 = v_cross(r1, r2); + return v_dot3(r0, c0); + } + + inline f32x4x4 m_inverse(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(b, c); + f32x4 c1 = v_cross(c, d); + f32x4 c2 = v_cross(d, a); + f32x4 c3 = v_cross(a, b); + + f32x4 det = v_dot4(a, c1); + f32x4 invDet = v_rcp_nr(det); + + f32x4x4 adj; + adj.r0 = vmulq_f32(c1, invDet); + adj.r1 = vmulq_f32(c2, invDet); + adj.r2 = vmulq_f32(c3, invDet); + adj.r3 = vmulq_f32(c0, invDet); + + return m_transpose(adj); + } + } diff --git a/Engine/source/math/mMathFn.h b/Engine/source/math/mMathFn.h index e9e7d3840..241a9a393 100644 --- a/Engine/source/math/mMathFn.h +++ b/Engine/source/math/mMathFn.h @@ -76,11 +76,13 @@ extern void (*m_quatF_set_matF)( F32 x, F32 y, F32 z, F32 w, F32* m ); extern void (*m_matF_set_euler)(const F32 *e, F32 *result); extern void (*m_matF_set_euler_point)(const F32 *e, const F32 *p, F32 *result); extern void (*m_matF_identity)(F32 *m); -extern void (*m_matF_invert_to)(const F32* m, F32* d); +extern void (*m_matF_inverse)(F32 *m); +extern void (*m_matF_invert_to)(const F32 *m, F32 *d); +extern void (*m_matF_affineInverse)(F32 *m); extern void (*m_matF_transpose)(F32 *m); extern void (*m_matF_scale)(F32 *m,const F32* p); extern void (*m_matF_normalize)(F32 *m); -extern F32(*m_matF_determinant)(const F32* m); +extern F32 (*m_matF_determinant)(const F32 *m); extern void (*m_matF_x_matF)(const F32 *a, const F32 *b, F32 *mresult); extern void (*m_matF_x_matF_aligned)(const F32 *a, const F32 *b, F32 *mresult); // extern void (*m_matF_x_point3F)(const F32 *m, const F32 *p, F32 *presult); @@ -149,7 +151,7 @@ inline void m_matF_x_vectorF(const F32 *m, const F32 *v, F32 *vresult) inline bool mIsEqual( F32 a, F32 b, const F32 epsilon = __EQUAL_CONST_F ) { F32 diff = a - b; - return diff > -epsilon && diff < epsilon; + return diff > -epsilon && diff < epsilon; } inline bool mIsZero(const F32 val, const F32 epsilon = __EQUAL_CONST_F ) @@ -205,16 +207,16 @@ inline F32 mFmod(const F32 val, const F32 mod) return fmod(val, mod); } -inline S32 mRound(const F32 val) -{ - return (S32)floor(val + 0.5f); -} +inline S32 mRound(const F32 val) +{ + return (S32)floor(val + 0.5f); +} -inline F32 mRound(const F32 val, const S32 n) -{ - S32 place = (S32) pow(10.0f, n); +inline F32 mRound(const F32 val, const S32 n) +{ + S32 place = (S32) pow(10.0f, n); - return mFloor((val*place)+0.5)/place; + return mFloor((val*place)+0.5)/place; } inline F32 mRoundF(const F32 val, const F32 step) @@ -259,15 +261,15 @@ inline F32 mClampF(F32 val, F32 low, F32 high) inline S32 mWrap(S32 val, S32 low, S32 high) { - int len = high - low; - return low + (val >= 0 ? val % len : -val % len ? len - (-val % len) : 0); + int len = high - low; + return low + (val >= 0 ? val % len : -val % len ? len - (-val % len) : 0); } inline F32 mWrapF(F32 val, F32 low, F32 high) { - F32 t = fmod(val - low, high - low); - return t < 0 ? t + high : t + low; + F32 t = fmod(val - low, high - low); + return t < 0 ? t + high : t + low; } /// Template function for doing a linear interpolation between any two @@ -495,7 +497,7 @@ inline F64 mRadToDeg(F64 r) inline bool mIsNaN_F( const F32 x ) { - // If x is a floating point variable, then (x != x) will be TRUE if x has the value NaN. + // If x is a floating point variable, then (x != x) will be TRUE if x has the value NaN. // This is only going to work if the compiler is IEEE 748 compliant. // // Tested and working on VC2k5 diff --git a/Engine/source/math/mMath_C.cpp b/Engine/source/math/mMath_C.cpp index ad5aed0db..07f63dc38 100644 --- a/Engine/source/math/mMath_C.cpp +++ b/Engine/source/math/mMath_C.cpp @@ -152,8 +152,8 @@ static void m_point3F_interpolate_C(const F32 *from, const F32 *to, F32 factor, #ifdef TORQUE_COMPILER_GCC // remove possibility of aliases const F32 inverse = 1.0f - factor; - const F32 from0 = from[0], from1 = from[1], from2 = from[2]; - const F32 to0 = to[0], to1 = to[1], to2 = to[2]; + const F32 from0 = from[0], from1 = from[1], from2 = from[2]; + const F32 to0 = to[0], to1 = to[1], to2 = to[2]; result[0] = from0 * inverse + to0 * factor; result[1] = from1 * inverse + to1 * factor; @@ -182,8 +182,8 @@ static void m_point3D_interpolate_C(const F64 *from, const F64 *to, F64 factor, #ifdef TORQUE_COMPILER_GCC // remove possibility of aliases const F64 inverse = 1.0f - factor; - const F64 from0 = from[0], from1 = from[1], from2 = from[2]; - const F64 to0 = to[0], to1 = to[1], to2 = to[2]; + const F64 from0 = from[0], from1 = from[1], from2 = from[2]; + const F64 to0 = to[0], to1 = to[1], to2 = to[2]; result[0] = from0 * inverse + to0 * factor; result[1] = from1 * inverse + to1 * factor; @@ -436,8 +436,8 @@ static void affineInvertTo(const F32 * m, F32 * out) F32 invDet = 1.0f / (m[idx(0,0)] * d1 + m[idx(0,1)] * d2 + m[idx(0,2)] * d3); - F32 m00 = m[idx(0,0)] * invDet; - F32 m01 = m[idx(0,1)] * invDet; + F32 m00 = m[idx(0,0)] * invDet; + F32 m01 = m[idx(0,1)] * invDet; F32 m02 = m[idx(0,2)] * invDet; F32 * result = out; @@ -464,26 +464,71 @@ static void affineInvertTo(const F32 * m, F32 * out) } #endif -static void m_matF_invert_to_C(const F32* m, F32* d) +//-------------------------------------- +static void m_matF_inverse_C(F32 *m) { // using Cramers Rule find the Inverse // Minv = (1/det(M)) * adjoint(M) - F32 det = m_matF_determinant(m); - AssertFatal(det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); + F32 det = m_matF_determinant( m ); + AssertFatal( det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); - F32 invDet = 1.0f / det; + F32 invDet = 1.0f/det; + F32 temp[16]; - d[0] = (m[5] * m[10] - m[6] * m[9]) * invDet; - d[1] = (m[9] * m[2] - m[10] * m[1]) * invDet; + temp[0] = (m[5] * m[10]- m[6] * m[9]) * invDet; + temp[1] = (m[9] * m[2] - m[10]* m[1]) * invDet; + temp[2] = (m[1] * m[6] - m[2] * m[5]) * invDet; + + temp[4] = (m[6] * m[8] - m[4] * m[10])* invDet; + temp[5] = (m[10]* m[0] - m[8] * m[2]) * invDet; + temp[6] = (m[2] * m[4] - m[0] * m[6]) * invDet; + + temp[8] = (m[4] * m[9] - m[5] * m[8]) * invDet; + temp[9] = (m[8] * m[1] - m[9] * m[0]) * invDet; + temp[10]= (m[0] * m[5] - m[1] * m[4]) * invDet; + + m[0] = temp[0]; + m[1] = temp[1]; + m[2] = temp[2]; + + m[4] = temp[4]; + m[5] = temp[5]; + m[6] = temp[6]; + + m[8] = temp[8]; + m[9] = temp[9]; + m[10] = temp[10]; + + // invert the translation + temp[0] = -m[3]; + temp[1] = -m[7]; + temp[2] = -m[11]; + m_matF_x_vectorF(m, temp, &temp[4]); + m[3] = temp[4]; + m[7] = temp[5]; + m[11]= temp[6]; +} + +static void m_matF_invert_to_C(const F32 *m, F32 *d) +{ + // using Cramers Rule find the Inverse + // Minv = (1/det(M)) * adjoint(M) + F32 det = m_matF_determinant( m ); + AssertFatal( det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); + + F32 invDet = 1.0f/det; + + d[0] = (m[5] * m[10]- m[6] * m[9]) * invDet; + d[1] = (m[9] * m[2] - m[10]* m[1]) * invDet; d[2] = (m[1] * m[6] - m[2] * m[5]) * invDet; - d[4] = (m[6] * m[8] - m[4] * m[10]) * invDet; - d[5] = (m[10] * m[0] - m[8] * m[2]) * invDet; + d[4] = (m[6] * m[8] - m[4] * m[10])* invDet; + d[5] = (m[10]* m[0] - m[8] * m[2]) * invDet; d[6] = (m[2] * m[4] - m[0] * m[6]) * invDet; d[8] = (m[4] * m[9] - m[5] * m[8]) * invDet; d[9] = (m[8] * m[1] - m[9] * m[0]) * invDet; - d[10] = (m[0] * m[5] - m[1] * m[4]) * invDet; + d[10]= (m[0] * m[5] - m[1] * m[4]) * invDet; // invert the translation F32 temp[6]; @@ -493,13 +538,33 @@ static void m_matF_invert_to_C(const F32* m, F32* d) m_matF_x_vectorF(d, temp, &temp[3]); d[3] = temp[3]; d[7] = temp[4]; - d[11] = temp[5]; - d[12] = m[12]; - d[13] = m[13]; - d[14] = m[14]; - d[15] = m[15]; + d[11]= temp[5]; + d[ 12 ] = m[ 12 ]; + d[ 13 ] = m[ 13 ]; + d[ 14 ] = m[ 14 ]; + d[ 15 ] = m[ 15 ]; } +//-------------------------------------- +static void m_matF_affineInverse_C(F32 *m) +{ + // Matrix class checks to make sure this is an affine transform before calling + // this function, so we can proceed assuming it is... + F32 temp[16]; + dMemcpy(temp, m, 16 * sizeof(F32)); + + // Transpose rotation + m[1] = temp[4]; + m[4] = temp[1]; + m[2] = temp[8]; + m[8] = temp[2]; + m[6] = temp[9]; + m[9] = temp[6]; + + m[3] = -(temp[0]*temp[3] + temp[4]*temp[7] + temp[8]*temp[11]); + m[7] = -(temp[1]*temp[3] + temp[5]*temp[7] + temp[9]*temp[11]); + m[11] = -(temp[2]*temp[3] + temp[6]*temp[7] + temp[10]*temp[11]); +} inline void swap(F32 &a, F32 &b) { @@ -568,11 +633,11 @@ static void m_matF_normalize_C(F32 *m) //-------------------------------------- -static F32 m_matF_determinant_C(const F32* m) +static F32 m_matF_determinant_C(const F32 *m) { - return m[0] * (m[5] * m[10] - m[6] * m[9]) + - m[4] * (m[2] * m[9] - m[1] * m[10]) + - m[8] * (m[1] * m[6] - m[2] * m[5]); + return m[0] * (m[5] * m[10] - m[6] * m[9]) + + m[4] * (m[2] * m[9] - m[1] * m[10]) + + m[8] * (m[1] * m[6] - m[2] * m[5]) ; } @@ -825,11 +890,13 @@ void (*m_quatF_set_matF)( F32 x, F32 y, F32 z, F32 w, F32* m ) = m_quatF_set_mat void (*m_matF_set_euler)(const F32 *e, F32 *result) = m_matF_set_euler_C; void (*m_matF_set_euler_point)(const F32 *e, const F32 *p, F32 *result) = m_matF_set_euler_point_C; void (*m_matF_identity)(F32 *m) = m_matF_identity_C; -void (*m_matF_invert_to)(const F32* m, F32* d) = m_matF_invert_to_C; +void (*m_matF_inverse)(F32 *m) = m_matF_inverse_C; +void (*m_matF_affineInverse)(F32 *m) = m_matF_affineInverse_C; +void (*m_matF_invert_to)(const F32 *m, F32 *d) = m_matF_invert_to_C; void (*m_matF_transpose)(F32 *m) = m_matF_transpose_C; void (*m_matF_scale)(F32 *m,const F32* p) = m_matF_scale_C; void (*m_matF_normalize)(F32 *m) = m_matF_normalize_C; -F32(*m_matF_determinant)(const F32* m) = m_matF_determinant_C; +F32 (*m_matF_determinant)(const F32 *m) = m_matF_determinant_C; void (*m_matF_x_matF)(const F32 *a, const F32 *b, F32 *mresult) = default_matF_x_matF_C; void (*m_matF_x_matF_aligned)(const F32 *a, const F32 *b, F32 *mresult) = default_matF_x_matF_C; // void (*m_matF_x_point3F)(const F32 *m, const F32 *p, F32 *presult) = m_matF_x_point3F_C; @@ -868,11 +935,13 @@ void mInstallLibrary_C() m_matF_set_euler = m_matF_set_euler_C; m_matF_set_euler_point = m_matF_set_euler_point_C; m_matF_identity = m_matF_identity_C; - m_matF_invert_to = m_matF_invert_to_C; + m_matF_inverse = m_matF_inverse_C; + m_matF_affineInverse = m_matF_affineInverse_C; + m_matF_invert_to = m_matF_invert_to_C; m_matF_transpose = m_matF_transpose_C; m_matF_scale = m_matF_scale_C; m_matF_normalize = m_matF_normalize_C; - m_matF_determinant = m_matF_determinant_C; + m_matF_determinant = m_matF_determinant_C; m_matF_x_matF = default_matF_x_matF_C; m_matF_x_matF_aligned = default_matF_x_matF_C; // m_matF_x_point3F = m_matF_x_point3F_C; @@ -881,4 +950,3 @@ void mInstallLibrary_C() m_matF_x_scale_x_planeF = m_matF_x_scale_x_planeF_C; m_matF_x_box3F = m_matF_x_box3F_C; } - diff --git a/Engine/source/math/mMatrix.h b/Engine/source/math/mMatrix.h index f42aaa5e4..a670ab3ba 100644 --- a/Engine/source/math/mMatrix.h +++ b/Engine/source/math/mMatrix.h @@ -59,7 +59,7 @@ /// 4x4 Matrix Class /// /// This runs at F32 precision. -using math_backend::mat44::dispatch::gMat44; +using namespace math_backend::mat44::dispatch; class MatrixF { friend class MatrixFEngineExport; @@ -131,7 +131,7 @@ public: EulerF toEuler() const; F32 determinant() const { - return gMat44.determinant(*this); + return GetMat44().determinant(*this); } /// Compute the inverse of the matrix. @@ -375,52 +375,51 @@ inline MatrixF& MatrixF::identity() inline MatrixF& MatrixF::inverse() { - gMat44.inverse(m); + GetMat44().inverse(m); return (*this); } inline void MatrixF::invertTo( MatrixF *out ) { out = this; - gMat44.inverse(*out); + GetMat44().inverse(*out); } inline MatrixF& MatrixF::affineInverse() { - gMat44.affine_inverse(m); + GetMat44().affine_inverse(m); return (*this); } inline MatrixF& MatrixF::transpose() { - gMat44.transpose(m); + GetMat44().transpose(m); return (*this); } inline MatrixF& MatrixF::scale(const Point3F& p) { - gMat44.scale(m, p); + GetMat44().scale(m, p); return *this; } inline Point3F MatrixF::getScale() const { Point3F scale; - gMat44.get_scale(m, scale); + GetMat44().get_scale(m, scale); return scale; } inline void MatrixF::normalize() { - gMat44.normalize(m); + GetMat44().normalize(m); } inline MatrixF& MatrixF::mul( const MatrixF &a ) { // M * a -> M AssertFatal(&a != this, "MatrixF::mul - a.mul(a) is invalid!"); MatrixF tempThis(*this); - gMat44.mul_mat44(tempThis, a, tempThis); - *this = tempThis; + GetMat44().mul_mat44(tempThis, a, *this); return (*this); } @@ -429,14 +428,14 @@ inline MatrixF& MatrixF::mulL( const MatrixF &a ) { // a * M -> M AssertFatal(&a != this, "MatrixF::mulL - a.mul(a) is invalid!"); MatrixF tempThis(*this); - gMat44.mul_mat44(a, tempThis, *this); + GetMat44().mul_mat44(a, tempThis, *this); return (*this); } inline MatrixF& MatrixF::mul( const MatrixF &a, const MatrixF &b ) { // a * b -> M AssertFatal((&a != this) && (&b != this), "MatrixF::mul - a.mul(a, b) a.mul(b, a) a.mul(a, a) is invalid!"); - gMat44.mul_mat44(a, b, *this); + GetMat44().mul_mat44(a, b, *this); return (*this); } @@ -461,7 +460,7 @@ inline MatrixF& MatrixF::mul(const MatrixF &a, const F32 b) inline void MatrixF::mul( Point4F& p ) const { Point4F temp; - gMat44.mul_float4(*this, p, temp); + GetMat44().mul_float4(*this, p, temp); p = temp; } @@ -469,28 +468,28 @@ inline void MatrixF::mulP( Point3F& p) const { // M * p -> d Point3F d; - gMat44.mul_pos3(*this, p, d); + GetMat44().mul_pos3(*this, p, d); p = d; } inline void MatrixF::mulP( const Point3F &p, Point3F *d) const { // M * p -> d - gMat44.mul_pos3(*this, p, *d); + GetMat44().mul_pos3(*this, p, *d); } inline void MatrixF::mulV( VectorF& v) const { // M * v -> v VectorF temp; - gMat44.mul_vec3(*this, v, temp); + GetMat44().mul_vec3(*this, v, temp); v = temp; } inline void MatrixF::mulV( const VectorF &v, Point3F *d) const { // M * v -> d - gMat44.mul_vec3(*this, v, *d); + GetMat44().mul_vec3(*this, v, *d); } inline void MatrixF::mul(Box3F& b) const @@ -634,14 +633,14 @@ inline MatrixF operator * ( const MatrixF &m1, const MatrixF &m2 ) { // temp = m1 * m2 MatrixF temp; - gMat44.mul_mat44(m1, m2, temp); + GetMat44().mul_mat44(m1, m2, temp); return temp; } inline MatrixF& MatrixF::operator *= ( const MatrixF &m1 ) { MatrixF tempThis(*this); - gMat44.mul_mat44(tempThis, m1, *this); + GetMat44().mul_mat44(tempThis, m1, *this); return (*this); } diff --git a/Engine/source/math/mPoint3.h b/Engine/source/math/mPoint3.h index 8e329f3a1..7e7567f92 100644 --- a/Engine/source/math/mPoint3.h +++ b/Engine/source/math/mPoint3.h @@ -102,7 +102,7 @@ public: class Point3D; //------------------------------------------------------------------------------ -using math_backend::float3::dispatch::gFloat3; +using namespace math_backend::float3::dispatch; class Point3F { //-------------------------------------- Public data @@ -504,7 +504,7 @@ inline void Point3F::interpolate(const Point3F& _from, const Point3F& _to, F32 _ { AssertFatal(_factor >= 0.0f && _factor <= 1.0f, "Out of bound interpolation factor"); - gFloat3.lerp(_from, _to, _factor, *this); + GetFloat3().lerp(_from, _to, _factor, *this); } inline void Point3F::zero() @@ -606,17 +606,17 @@ inline void Point3F::convolveInverse(const Point3F& c) inline F32 Point3F::lenSquared() const { - return gFloat3.lengthSquared(*this); + return GetFloat3().lengthSquared(*this); } inline F32 Point3F::len() const { - return gFloat3.length(*this); + return GetFloat3().length(*this); } inline void Point3F::normalize() { - gFloat3.normalize(*this); + GetFloat3().normalize(*this); } inline F32 Point3F::magnitudeSafe() const @@ -633,13 +633,13 @@ inline F32 Point3F::magnitudeSafe() const inline void Point3F::normalizeSafe() { - gFloat3.normalize(*this); + GetFloat3().normalize(*this); } inline void Point3F::normalize(F32 val) { - gFloat3.normalize_mag(*this, val); + GetFloat3().normalize_mag(*this, val); } inline bool Point3F::operator==(const Point3F& _test) const @@ -655,33 +655,33 @@ inline bool Point3F::operator!=(const Point3F& _test) const inline Point3F Point3F::operator+(const Point3F& _add) const { Point3F temp; - gFloat3.add(*this, _add, temp); + GetFloat3().add(*this, _add, temp); return temp; } inline Point3F Point3F::operator-(const Point3F& _rSub) const { Point3F temp; - gFloat3.sub(*this, _rSub, temp); + GetFloat3().sub(*this, _rSub, temp); return temp; } inline Point3F& Point3F::operator+=(const Point3F& _add) { - gFloat3.add(*this, _add, *this); + GetFloat3().add(*this, _add, *this); return *this; } inline Point3F& Point3F::operator-=(const Point3F& _rSub) { - gFloat3.sub(*this, _rSub, *this); + GetFloat3().sub(*this, _rSub, *this); return *this; } inline Point3F Point3F::operator*(F32 _mul) const { Point3F temp; - gFloat3.mul_scalar(*this, _mul, temp); + GetFloat3().mul_scalar(*this, _mul, temp); return temp; } @@ -690,13 +690,13 @@ inline Point3F Point3F::operator/(F32 _div) const AssertFatal(_div != 0.0f, "Error, div by zero attempted"); Point3F temp; - gFloat3.div_scalar(*this, _div, temp); + GetFloat3().div_scalar(*this, _div, temp); return temp; } inline Point3F& Point3F::operator*=(F32 _mul) { - gFloat3.mul_scalar(*this, _mul, *this); + GetFloat3().mul_scalar(*this, _mul, *this); return *this; } @@ -704,20 +704,20 @@ inline Point3F& Point3F::operator/=(F32 _div) { AssertFatal(_div != 0.0f, "Error, div by zero attempted"); - gFloat3.div_scalar(*this, _div, *this); + GetFloat3().div_scalar(*this, _div, *this); return *this; } inline Point3F Point3F::operator*(const Point3F &_vec) const { Point3F temp; - gFloat3.mul(*this, _vec, temp); + GetFloat3().mul(*this, _vec, temp); return temp; } inline Point3F& Point3F::operator*=(const Point3F &_vec) { - gFloat3.mul(*this, _vec, *this); + GetFloat3().mul(*this, _vec, *this); return *this; } @@ -725,14 +725,14 @@ inline Point3F Point3F::operator/(const Point3F &_vec) const { AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted"); Point3F temp; - gFloat3.div(*this, _vec, temp); + GetFloat3().div(*this, _vec, temp); return temp; } inline Point3F& Point3F::operator/=(const Point3F &_vec) { AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted"); - gFloat3.div(*this, _vec, *this); + GetFloat3().div(*this, _vec, *this); return *this; } @@ -985,7 +985,7 @@ inline Point3I operator*(S32 mul, const Point3I& multiplicand) inline Point3F operator*(F32 mul, const Point3F& multiplicand) { Point3F temp; - gFloat3.mul_scalar(multiplicand, mul, temp); + GetFloat3().mul_scalar(multiplicand, mul, temp); return temp; } @@ -996,7 +996,7 @@ inline Point3D operator*(F64 mul, const Point3D& multiplicand) inline F32 mDot(const Point3F &p1, const Point3F &p2) { - return gFloat3.dot(p1, p2); + return GetFloat3().dot(p1, p2); } inline F64 mDot(const Point3D &p1, const Point3D &p2) @@ -1006,7 +1006,7 @@ inline F64 mDot(const Point3D &p1, const Point3D &p2) inline void mCross(const Point3F &a, const Point3F &b, Point3F *res) { - gFloat3.cross(a, b, *res); + GetFloat3().cross(a, b, *res); } inline void mCross(const Point3D &a, const Point3D &b, Point3D *res) @@ -1019,7 +1019,7 @@ inline void mCross(const Point3D &a, const Point3D &b, Point3D *res) inline Point3F mCross(const Point3F &a, const Point3F &b) { Point3F r; - gFloat3.cross(a, b, r); + GetFloat3().cross(a, b, r); return r; } diff --git a/Engine/source/math/mPoint4.h b/Engine/source/math/mPoint4.h index 0f715f983..4befd326b 100644 --- a/Engine/source/math/mPoint4.h +++ b/Engine/source/math/mPoint4.h @@ -63,7 +63,7 @@ class Point4I /// Uses F32 internally. /// /// Useful for representing quaternions and other 4d beasties. -using math_backend::float4::dispatch::gFloat4; +using namespace math_backend::float4::dispatch; class Point4F { @@ -156,12 +156,12 @@ inline void Point4F::set(F32 _x, F32 _y, F32 _z, F32 _w) inline F32 Point4F::len() const { - return gFloat4.length(*this); + return GetFloat4().length(*this); } inline void Point4F::interpolate(const Point4F& _from, const Point4F& _to, F32 _factor) { - gFloat4.lerp(_from, _to, _factor, *this); + GetFloat4().lerp(_from, _to, _factor, *this); } inline void Point4F::zero() @@ -194,7 +194,7 @@ inline Point4F& Point4F::operator/=(F32 scalar) if (mIsZero(scalar)) return *this; - gFloat4.div_scalar(*this, scalar, *this); + GetFloat4().div_scalar(*this, scalar, *this); return *this; } @@ -202,47 +202,47 @@ inline Point4F& Point4F::operator/=(F32 scalar) inline Point4F Point4F::operator+(const Point4F& _add) const { Point4F res; - gFloat4.add(*this, _add, res); + GetFloat4().add(*this, _add, res); return res; } inline Point4F& Point4F::operator+=(const Point4F& _add) { - gFloat4.add(*this, _add, *this); + GetFloat4().add(*this, _add, *this); return *this; } inline Point4F Point4F::operator-(const Point4F& _rSub) const { Point4F res; - gFloat4.sub(*this, _rSub, res); + GetFloat4().sub(*this, _rSub, res); return res; } inline Point4F Point4F::operator*(const Point4F &_vec) const { Point4F res; - gFloat4.mul(*this, _vec, res); + GetFloat4().mul(*this, _vec, res); return res; } inline Point4F Point4F::operator*(F32 _mul) const { Point4F res; - gFloat4.mul_scalar(*this, _mul, res); + GetFloat4().mul_scalar(*this, _mul, res); return res; } inline Point4F Point4F::operator /(F32 t) const { Point4F res; - gFloat4.div_scalar(*this, t, res); + GetFloat4().div_scalar(*this, t, res); return res; } inline F32 mDot(const Point4F &p1, const Point4F &p2) { - return gFloat4.dot(p1, p2); + return GetFloat4().dot(p1, p2); } //------------------------------------------------------------------------------ diff --git a/Engine/source/math/public/math_backend.cpp b/Engine/source/math/public/math_backend.cpp index f8a16c3ed..d110650ac 100644 --- a/Engine/source/math/public/math_backend.cpp +++ b/Engine/source/math/public/math_backend.cpp @@ -1,4 +1,3 @@ -#pragma once #include "math/public/math_backend.h" namespace math_backend::float4::dispatch diff --git a/Engine/source/math/public/math_backend.h b/Engine/source/math/public/math_backend.h index 0a3127f81..f704f0db5 100644 --- a/Engine/source/math/public/math_backend.h +++ b/Engine/source/math/public/math_backend.h @@ -35,4 +35,61 @@ namespace math_backend void install_from_cpu_flags(uint32_t cpu_flags); } +#include + +namespace math_backend::float4::dispatch +{ + //-------------------------------------------------- + // Thread-safe getter, ensures scalar installed if empty + //-------------------------------------------------- + inline Float4Funcs& GetFloat4() + { + if (__builtin_expect(gFloat4.mul == nullptr, 0)) + { + static std::once_flag once; + std::call_once(once, []{ + install_scalar(); + }); + } + return gFloat4; + } +} + +namespace math_backend::float3::dispatch +{ + //-------------------------------------------------- + // Thread-safe getter, ensures scalar installed if empty + //-------------------------------------------------- + inline Float3Funcs& GetFloat3() + { + if (__builtin_expect(gFloat3.mul == nullptr, 0)) + { + static std::once_flag once; + std::call_once(once, []{ + install_scalar(); + }); + } + return gFloat3; + } +} + + +namespace math_backend::mat44::dispatch +{ + //-------------------------------------------------- + // Thread-safe getter, ensures scalar installed if empty + //-------------------------------------------------- + inline Mat44Funcs& GetMat44() + { + if (__builtin_expect(gMat44.mul_mat44 == nullptr, 0)) + { + static std::once_flag once; + std::call_once(once, []{ + install_scalar(); + }); + } + return gMat44; + } +} + #endif // !_MATH_BACKEND_H_