diff --git a/Engine/source/environment/meshRoad.cpp b/Engine/source/environment/meshRoad.cpp index 0b10f4a29..ea09c4352 100644 --- a/Engine/source/environment/meshRoad.cpp +++ b/Engine/source/environment/meshRoad.cpp @@ -3406,7 +3406,7 @@ MatrixF MeshRoad::getNodeTransform( U32 idx ) mat.setColumn( 2, node.normal ); mat.setColumn( 3, node.point ); - AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!"); + AssertFatal(mat.determinant() != 0.0f, "no inverse!"); return mat; } @@ -3456,7 +3456,7 @@ void MeshRoad::calcSliceTransform( U32 idx, MatrixF &mat ) mat.setColumn( 2, slice.normal ); mat.setColumn( 3, slice.p1 ); - AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!"); + AssertFatal(mat.determinant() != 0.0f, "no inverse!"); } F32 MeshRoad::getRoadLength() const diff --git a/Engine/source/environment/river.cpp b/Engine/source/environment/river.cpp index acefb66b3..a3456cf17 100644 --- a/Engine/source/environment/river.cpp +++ b/Engine/source/environment/river.cpp @@ -2139,7 +2139,7 @@ MatrixF River::getNodeTransform( U32 idx ) const mat.setColumn( 2, node.normal ); mat.setColumn( 3, node.point ); - AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!"); + AssertFatal( mat.determinant() != 0.0f, "no inverse!"); return mat; } diff --git a/Engine/source/math/impl/float3_impl.inl b/Engine/source/math/impl/float3_impl.inl index 87b325faf..216fc99ba 100644 --- a/Engine/source/math/impl/float3_impl.inl +++ b/Engine/source/math/impl/float3_impl.inl @@ -99,7 +99,7 @@ namespace math_backend::float3 f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot3(va, va))); f32x4 vnorm = v_mul(va, invLen); - v_store(a, vnorm); + v_store3(a, vnorm); } // Linear interpolation: r = from + (to - from) * f diff --git a/Engine/source/math/impl/mat44_impl.inl b/Engine/source/math/impl/mat44_impl.inl new file mode 100644 index 000000000..a9635cff0 --- /dev/null +++ b/Engine/source/math/impl/mat44_impl.inl @@ -0,0 +1,312 @@ +#pragma once +#include // for sqrtf, etc. 
+#include "../mConstants.h" + +namespace math_backend::mat44 +{ + //------------------------------------------------------------------ + // Matrix Transpose + inline void mat44_transpose_impl(float* m) + { + f32x4x4 ma = m_load(m); + f32x4x4 mr = m_transpose(ma); + m_store(m, mr); + } + + inline float mat44_get_determinant(const float* m) + { + f32x4x4 ma = m_load(m); + return v_extract0(m_determinant_affine(ma)); + } + + // Matrix Scale: Float3 (assume w = 1.0f) + inline void mat44_scale_impl(float* m, const float* s) + { + f32x4x4 ma = m_load(m); + f32x4 va = v_load3_pos(s); + + ma.r0 = v_mul(ma.r0, va); + ma.r1 = v_mul(ma.r1, va); + ma.r2 = v_mul(ma.r2, va); + ma.r3 = v_mul(ma.r3, va); + m_store(m, ma); + } + + inline void mat44_get_scale_impl(const float* m, float* s) + { + f32x4x4 ma = m_load(m); + + // squared lengths + f32x4 len2_x = v_dot3(ma.r0, ma.r0); + f32x4 len2_y = v_dot3(ma.r1, ma.r1); + f32x4 len2_z = v_dot3(ma.r2, ma.r2); + + // extract and sqrt + s[0] = 1.0f / v_extract0(v_rsqrt_nr(len2_x)); + s[1] = 1.0f / v_extract0(v_rsqrt_nr(len2_y)); + s[2] = 1.0f / v_extract0(v_rsqrt_nr(len2_z)); + } + + // Matrix Scale Uniform: Float value (assume w = 1.0f) + inline void mat44_scale_uniform(float* m, float s) + { + f32x4x4 ma = m_load(m); + + // (s, s, s, 1) + f32x4 scale = v_set(s, s, s, 1.0f); + + // Scale only rotation rows (xyz part) + ma.r0 = v_mul(ma.r0, scale); + ma.r1 = v_mul(ma.r1, scale); + ma.r2 = v_mul(ma.r2, scale); + m_store(m, ma); + } + + // Matrix Inverse + inline void mat44_inverse_impl(float* m) + { + f32x4x4 ma = m_load(m); + + // Compute cofactors using cross products + f32x4x4 mTemp; + mTemp.r0 = v_cross(ma.r1, ma.r2); + mTemp.r1 = v_cross(ma.r2, ma.r0); + mTemp.r2 = v_cross(ma.r0, ma.r1); + + // Determinant = dot(ma.r0, c0) + f32x4 det = v_dot3(ma.r0, mTemp.r0); + f32x4 invDet = v_rcp_nr(det); + + // Scale cofactors + mTemp.r0 = v_mul(mTemp.r0, invDet); + mTemp.r1 = v_mul(mTemp.r1, invDet); + mTemp.r2 = v_mul(mTemp.r2, invDet); + + // 
Store inverse 3x3 (transpose of cofactor matrix) + + mTemp = m_transpose(mTemp); + mTemp.r3 = v_set(0, 0, 0, 1.0f); + + // ---- Translation ---- + + // Load original translation + f32x4 T = v_set(m[3], m[7], m[11], 0.0f); + + // Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2) + f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0)); + result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1))); + result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2))); + result = v_mul(result, v_set1(-1.0f)); + + m_store(m, mTemp); + + // Store translation + m[3] = v_extract0(result); + m[7] = v_extract0(v_swizzle_singular_mask(result, 1)); + m[11] = v_extract0(v_swizzle_singular_mask(result, 2)); + } + + // Matrix Affine Inverse + inline void mat44_affine_inverse_impl(float* m) + { + f32x4x4 ma = m_load(m); + + f32x4x4 mTemp = m_transpose(ma); + mTemp.r3 = v_set(0, 0, 0, 1); + + // ---- Translation ---- + // Load original translation + f32x4 T = v_set(m[3], m[7], m[11], 0.0f); + + // Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2) + f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0)); + result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1))); + result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2))); + result = v_mul(result, v_set1(-1.0f)); + + m_store(m, mTemp); + + // Store translation + m[3] = v_extract0(result); + m[7] = v_extract0(v_swizzle_singular_mask(result, 1)); + m[11] = v_extract0(v_swizzle_singular_mask(result, 2)); + } + + inline void mat44_normalize_impl(float* a) + { + // Load the matrix + f32x4x4 m = m_load(a); + + // Extract axes (rows 0-2), zero out w using v_mask_xyz + f32x4 xaxis = v_mul(m.r0, v_mask_xyz()); + f32x4 yaxis = v_mul(m.r1, v_mask_xyz()); + f32x4 zaxis = v_mul(m.r2, v_mask_xyz()); + + xaxis = v_normalize3(xaxis); + + float dotXY = v_extract0(v_hadd4(v_mul(xaxis, yaxis))); + f32x4 projYonX = v_mul(v_set1(dotXY), xaxis); + yaxis = v_normalize3(v_sub(yaxis, projYonX)); + + zaxis = v_cross(xaxis, yaxis); + + 
// Store normalized axes back (preserve translation w) + m.r0 = v_preserve_w(xaxis, m.r0); + m.r1 = v_preserve_w(yaxis, m.r1); + m.r2 = v_preserve_w(zaxis, m.r2); + + // Store back to memory + m_store(a, m); + } + + // Matrix Multiply: a * b + inline void mat44_mul_mat44_impl(const float* a, const float* b, float* r) + { + f32x4x4 ma = m_load(a); + f32x4x4 mb = m_load(b); + + f32x4x4 mr = m_mul(ma, mb); + m_store(r, mr); + } + + // Vector Multiply: m * p (assume w = 1.0f) + inline void mat44_mul_pos3_impl(const float *m, const float *p, float* r) + { + f32x4x4 ma = m_load(m); + f32x4 va = v_load3_pos(p); + f32x4 vr = m_mul_vec4(ma, va); + v_store3(r, vr); + } + + // Vector Multiply: m * v (assume w = 0.0f) + inline void mat44_mul_vec3_impl(const float* m, const float* v, float* r) + { + f32x4x4 ma = m_load(m); + f32x4 va = v_load3_vec(v); + f32x4 vr = m_mul_vec4(ma, va); + v_store3(r, vr); + } + + // Vector Multiply: m * p (full [4x4] * [1x4]) + inline void mat44_mul_float4_impl(const float* m, const float* p, float* r) + { + f32x4x4 ma = m_load(m); + f32x4 va = v_load(p); + f32x4 vr = m_mul_vec4(ma, va); + v_store(r, vr); + } + + //-------------------------------------------------- + // MATRIX ROTATION FUNCTIONS + //-------------------------------------------------- + + inline void mat44_rotation_x_impl(float* m, float angle) + { + float c = cosf(angle), s = sinf(angle); + f32x4x4 mr = m_identity(); + mr.r1 = v_set(0, c, s, 0); + mr.r2 = v_set(0, -s, c, 0); + m_store(m, mr); + } + + inline void mat44_rotation_y_impl(float* m, float angle) + { + float c = cosf(angle), s = sinf(angle); + f32x4x4 mr = m_identity(); + mr.r0 = v_set(c, 0, -s, 0); + mr.r2 = v_set(s, 0, c, 0); + m_store(m, mr); + } + + inline void mat44_rotation_z_impl(float* m, float angle) + { + float c = cosf(angle), s = sinf(angle); + f32x4x4 mr = m_identity(); + mr.r0 = v_set(c, s, 0, 0); + mr.r1 = v_set(-s, c, 0, 0); + m_store(m, mr); + } + + // Compose rotation from Euler angles (pitch=X, yaw=Y, 
roll=Z) + inline void mat44_rotation_euler_impl(float* m, float pitch, float yaw, float roll) + { + f32x4x4 rx, ry, rz; + mat44_rotation_x_impl((float*)&rx, pitch); + mat44_rotation_y_impl((float*)&ry, yaw); + mat44_rotation_z_impl((float*)&rz, roll); + + f32x4x4 r = m_mul(rz, m_mul(ry, rx)); + m_store(m, r); + } + + inline void mat44_trs_impl(float* m, const float* t, const float* r_euler, const float* s) + { + f32x4x4 mr; + mat44_rotation_euler_impl((float*)&mr, r_euler[0], r_euler[1], r_euler[2]); + + f32x4 vs = v_load3_vec(s); // scale xyz + mr.r0 = v_mul(mr.r0, vs); + mr.r1 = v_mul(mr.r1, vs); + mr.r2 = v_mul(mr.r2, vs); + + mr.r0 = v_insert_w(mr.r0, _mm_set_ss(t[0])); + mr.r1 = v_insert_w(mr.r1, _mm_set_ss(t[1])); + mr.r2 = v_insert_w(mr.r2, _mm_set_ss(t[2])); + mr.r3 = v_set(0, 0, 0, 1.0f); + + m_store(m, mr); + } + + inline void mat44_lookat_impl(float* m, const float* eye, const float* target, const float* up) + { + f32x4 vEye = v_load3_pos(eye); + f32x4 vTarget = v_load3_pos(target); + f32x4 vUp = v_load3_vec(up); + + // Forward (z+) + f32x4 zaxis = v_normalize3(v_sub(vTarget, vEye)); + + // Right (x+) + f32x4 xaxis = v_normalize3(v_cross(vUp, zaxis)); + + // Up (y+) + f32x4 yaxis = v_cross(zaxis, xaxis); + + // Compute translation components: -dot(axis, eye) + f32x4 t_x = v_mul(v_dot3(xaxis, vEye), v_set1(-1.0f)); + f32x4 t_y = v_mul(v_dot3(yaxis, vEye), v_set1(-1.0f)); + f32x4 t_z = v_mul(v_dot3(zaxis, vEye), v_set1(-1.0f)); + + f32x4x4 view; + view.r0 = v_insert_w(xaxis, t_x); + view.r1 = v_insert_w(yaxis, t_y); + view.r2 = v_insert_w(zaxis, t_z); + view.r3 = v_set(0, 0, 0, 1.0f); + + m_store(m, view); + } + + inline void mat44_perspective_impl(float* m, float fovY, float aspect, float znear, float zfar) + { + float f = 1.0f / tanf(fovY * 0.5f); + float nf = 1.0f / (znear - zfar); + + f32x4x4 mp = m_zero(); + mp.r0 = v_set(f / aspect, 0, 0, 0); + mp.r1 = v_set(0, f, 0, 0); + mp.r2 = v_set(0, 0, (zfar + znear) * nf, 2 * zfar * znear * nf); + mp.r3 = 
v_set(0, 0, -1, 0); // row-major projection + m_store(m, mp); + } + + inline void mat44_orthographic_impl(float* m, float left, float right, float bottom, float top, float znear, float zfar) + { + f32x4x4 mo = m_zero(); + mo.r0 = v_set(2.0f / (right - left), 0, 0, -(right + left) / (right - left)); + mo.r1 = v_set(0, 2.0f / (top - bottom), 0, -(top + bottom) / (top - bottom)); + mo.r2 = v_set(0, 0, -2.0f / (zfar - znear), -(zfar + znear) / (zfar - znear)); + mo.r3 = v_set(0, 0, 0, 1.0f); + m_store(m, mo); + } + +} // namespace math_backend::mat44 diff --git a/Engine/source/math/impl/math_c.cpp b/Engine/source/math/impl/math_c.cpp index 7d8a317ba..3fe449a1b 100644 --- a/Engine/source/math/impl/math_c.cpp +++ b/Engine/source/math/impl/math_c.cpp @@ -1,3 +1,4 @@ +#include "platform/platform.h" #include "math/public/float4_dispatch.h" #include "math/public/float3_dispatch.h" #include "math/public/mat44_dispatch.h" @@ -196,6 +197,89 @@ namespace math_backend::mat44::dispatch swap(a[11], a[14]); }; + gMat44.determinant = [](const float* m) { + return m[0] * (m[5] * m[10] - m[6] * m[9]) + + m[4] * (m[2] * m[9] - m[1] * m[10]) + + m[8] * (m[1] * m[6] - m[2] * m[5]); + }; + + gMat44.mul_vec3 = [](const float* a, const float* b, float* r) { +#ifdef TORQUE_COMPILER_GCC + const F32 v0 = b[0], v1 = b[1], v2 = b[2]; + const F32 m0 = a[0], m1 = a[1], m2 = a[2]; + const F32 m4 = a[4], m5 = a[5], m6 = a[6]; + const F32 m8 = a[8], m9 = a[9], m10 = a[10]; + + r[0] = m0 * v0 + m1 * v1 + m2 * v2; + r[1] = m4 * v0 + m5 * v1 + m6 * v2; + r[2] = m8 * v0 + m9 * v1 + m10 * v2; +#else + r[0] = a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; + r[1] = a[4] * b[0] + a[5] * b[1] + a[6] * b[2]; + r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2]; +#endif + + }; + + gMat44.inverse = [](float* m) { + // using Cramers Rule find the Inverse + // Minv = (1/det(M)) * adjoint(M) + float det = gMat44.determinant(m); + AssertFatal(det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); + float invDet = 
1.0f / det; + float temp[16]; + temp[0] = (m[5] * m[10] - m[6] * m[9]) * invDet; + temp[1] = (m[9] * m[2] - m[10] * m[1]) * invDet; + temp[2] = (m[1] * m[6] - m[2] * m[5]) * invDet; + + temp[4] = (m[6] * m[8] - m[4] * m[10]) * invDet; + temp[5] = (m[10] * m[0] - m[8] * m[2]) * invDet; + temp[6] = (m[2] * m[4] - m[0] * m[6]) * invDet; + + temp[8] = (m[4] * m[9] - m[5] * m[8]) * invDet; + temp[9] = (m[8] * m[1] - m[9] * m[0]) * invDet; + temp[10] = (m[0] * m[5] - m[1] * m[4]) * invDet; + + m[0] = temp[0]; + m[1] = temp[1]; + m[2] = temp[2]; + + m[4] = temp[4]; + m[5] = temp[5]; + m[6] = temp[6]; + + m[8] = temp[8]; + m[9] = temp[9]; + m[10] = temp[10]; + + // invert the translation + temp[0] = -m[3]; + temp[1] = -m[7]; + temp[2] = -m[11]; + gMat44.mul_vec3(m, temp, &temp[4]); + m[3] = temp[4]; + m[7] = temp[5]; + m[11] = temp[6]; + + }; + + gMat44.affine_inverse = [](float* a) { + F32 temp[16]; + dMemcpy(temp, a, 16 * sizeof(F32)); + + // Transpose rotation + a[1] = temp[4]; + a[4] = temp[1]; + a[2] = temp[8]; + a[8] = temp[2]; + a[6] = temp[9]; + a[9] = temp[6]; + + a[3] = -(temp[0] * temp[3] + temp[4] * temp[7] + temp[8] * temp[11]); + a[7] = -(temp[1] * temp[3] + temp[5] * temp[7] + temp[9] * temp[11]); + a[11] = -(temp[2] * temp[3] + temp[6] * temp[7] + temp[10] * temp[11]); + }; + gMat44.scale = [](float* a, const float* s) { // Note, doesn't allow scaling w... @@ -204,5 +288,86 @@ namespace math_backend::mat44::dispatch a[8] *= s[0]; a[9] *= s[1]; a[10] *= s[2]; a[12] *= s[0]; a[13] *= s[1]; a[14] *= s[2]; }; + + gMat44.get_scale = [](const float* a, float* s) { + // Note, doesn't allow scaling w... 
+ s[0] = sqrt(a[0] * a[0] + a[4] * a[4] + a[8] * a[8]); + s[1] = sqrt(a[1] * a[1] + a[5] * a[5] + a[9] * a[9]); + s[2] = sqrt(a[2] * a[2] + a[6] * a[6] + a[10] * a[10]); + }; + + gMat44.mul_float4 = [](const float* a, const float* b, float* r) { + AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!"); + r[0] = a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]; + r[1] = a[4] * b[0] + a[5] * b[1] + a[6] * b[2] + a[7] * b[3]; + r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2] + a[11] * b[3]; + r[2] = a[12] * b[0] + a[13] * b[1] + a[14] * b[2] + a[15] * b[3]; + }; + + gMat44.mul_pos3 = [](const float* a, const float* b, float* r) { + AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!"); + r[0] = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]; + r[1] = a[4]*b[0] + a[5]*b[1] + a[6]*b[2] + a[7]; + r[2] = a[8]*b[0] + a[9]*b[1] + a[10]*b[2] + a[11]; + }; + + gMat44.mul_vec3 = [](const float* a, const float* b, float* r) { + AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!"); + r[0] = a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; + r[1] = a[4] * b[0] + a[5] * b[1] + a[6] * b[2]; + r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2]; + }; + + gMat44.mul_mat44 = [](const float* a, const float* b, float* r) { + r[0] = a[0] * b[0] + a[1] * b[4] + a[2] * b[8] + a[3] * b[12]; + r[1] = a[0] * b[1] + a[1] * b[5] + a[2] * b[9] + a[3] * b[13]; + r[2] = a[0] * b[2] + a[1] * b[6] + a[2] * b[10] + a[3] * b[14]; + r[3] = a[0] * b[3] + a[1] * b[7] + a[2] * b[11] + a[3] * b[15]; + + r[4] = a[4] * b[0] + a[5] * b[4] + a[6] * b[8] + a[7] * b[12]; + r[5] = a[4] * b[1] + a[5] * b[5] + a[6] * b[9] + a[7] * b[13]; + r[6] = a[4] * b[2] + a[5] * b[6] + a[6] * b[10] + a[7] * b[14]; + r[7] = a[4] * b[3] + a[5] * b[7] + a[6] * b[11] + a[7] * b[15]; + + r[8] = a[8] * b[0] + a[9] * b[4] + a[10] * b[8] + a[11] * b[12]; + r[9] = a[8] * b[1] + a[9] * b[5] + a[10] * b[9] + a[11] * b[13]; + r[10] = a[8] * b[2] + a[9] * b[6] + a[10] * b[10] + a[11] 
* b[14]; + r[11] = a[8] * b[3] + a[9] * b[7] + a[10] * b[11] + a[11] * b[15]; + + r[12] = a[12] * b[0] + a[13] * b[4] + a[14] * b[8] + a[15] * b[12]; + r[13] = a[12] * b[1] + a[13] * b[5] + a[14] * b[9] + a[15] * b[13]; + r[14] = a[12] * b[2] + a[13] * b[6] + a[14] * b[10] + a[15] * b[14]; + r[15] = a[12] * b[3] + a[13] * b[7] + a[14] * b[11] + a[15] * b[15]; + }; + + gMat44.normalize = [](float* a) { + F32 col0[3], col1[3], col2[3]; + // extract columns 0 and 1 + col0[0] = a[0]; + col0[1] = a[4]; + col0[2] = a[8]; + + col1[0] = a[1]; + col1[1] = a[5]; + col1[2] = a[9]; + + math_backend::float3::dispatch::gFloat3.normalize(col0); + math_backend::float3::dispatch::gFloat3.normalize(col1); + math_backend::float3::dispatch::gFloat3.normalize(col2); + + // store the normalized columns + a[0] = col0[0]; + a[4] = col0[1]; + a[8] = col0[2]; + + a[1] = col1[0]; + a[5] = col1[1]; + a[9] = col1[2]; + + a[2] = col2[0]; + a[6] = col2[1]; + a[10] = col2[2]; + + }; } } diff --git a/Engine/source/math/isa/avx/avx_intrinsics.h b/Engine/source/math/isa/avx/avx_intrinsics.h index 0340d84a2..a67f900b4 100644 --- a/Engine/source/math/isa/avx/avx_intrinsics.h +++ b/Engine/source/math/isa/avx/avx_intrinsics.h @@ -1,5 +1,5 @@ #pragma once -#include // AVX/AVX2 intrinsics +#include // AVX namespace { @@ -26,11 +26,58 @@ namespace inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); } + inline f32x4 v_swizzle_mask(f32x4 v1, const int x, const int y, const int z, const int w) + { + __m128i idx = _mm_set_epi32(w, z, y, x); + return _mm_permutevar_ps(v1, idx); + } + + inline f32x4 v_swizzle_singular_mask(f32x4 v1, const int x) + { + __m128i idx = _mm_set1_epi32(x); + return _mm_permutevar_ps(v1, idx); + } + + inline f32x4 v_swizzle_lo(f32x4 v1) + { + return _mm_moveldup_ps(v1); + } + + inline f32x4 v_swizzle_hi(f32x4 v1) + { + return _mm_movehdup_ps(v1); + } + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) { return _mm_blend_ps(newv, original, 
0b1000); } + //------------------------------------------------------ + // Shuffle helpers + //------------------------------------------------------ + + inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, const int x, const int y, const int z, const int w) + { + f32x4 lo = v1; + f32x4 hi = v2; + + f32x4 combined_lo = _mm_permutevar_ps(lo, _mm_set_epi32(y, y, x, x)); + f32x4 combined_hi = _mm_permutevar_ps(hi, _mm_set_epi32(w, w, z, z)); + + return _mm_movelh_ps(combined_lo, combined_hi); + } + + inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2) + { + return _mm_movelh_ps(v1, v2); + } + + inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2) + { + return _mm_movehl_ps(v2, v1); + } + //------------------------------------------------------ // Float3 helpers (safe loading into 4 lanes) //------------------------------------------------------ @@ -54,6 +101,16 @@ namespace dst[2] = tmp[2]; } + inline f32x4 v_set(float x, float y, float z, float w) + { + return _mm_set_ps(w, z, y, x); + } + + inline f32x4 v_insert_w(f32x4 v, f32x4 w) + { + return _mm_insert_ps(v, w, 0x30); + } + //------------------------------------------------------ // Simple Arithmatic //------------------------------------------------------ @@ -70,7 +127,7 @@ namespace // Element-wise subtract inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - //------------------------------------------------------ + //------------------------------------------------------ // Fast recip //------------------------------------------------------ @@ -137,4 +194,195 @@ namespace __m128 sum = _mm_hadd_ps(a, a); return _mm_hadd_ps(sum, sum); } + + //------------------------------------------------------ + // Matrix type (row-major 4x4) + //------------------------------------------------------ + + struct f32x4x4 + { + f32x4 r0; + f32x4 r1; + f32x4 r2; + f32x4 r3; + }; + + //------------------------------------------------------ + // Matrix Load / Store + //------------------------------------------------------ + + 
inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major) + { + f32x4x4 out; + out.r0 = v_load(m + 0); + out.r1 = v_load(m + 4); + out.r2 = v_load(m + 8); + out.r3 = v_load(m + 12); + return out; + } + + inline void m_store(float* dst, const f32x4x4& m) + { + v_store(dst + 0, m.r0); + v_store(dst + 4, m.r1); + v_store(dst + 8, m.r2); + v_store(dst + 12, m.r3); + } + + inline f32x4x4 m_identity() + { + f32x4x4 m; + m.r0 = _mm_set_ps(0, 0, 0, 1); + m.r1 = _mm_set_ps(0, 0, 1, 0); + m.r2 = _mm_set_ps(0, 1, 0, 0); + m.r3 = _mm_set_ps(1, 0, 0, 0); + return m; + } + + inline f32x4x4 m_zero() + { + f32x4 z = v_zero(); + return { z, z, z, z }; + } + + //------------------------------------------------------ + // Matrix × Vector + //------------------------------------------------------ + + inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot4(m.r0, v); + f32x4 y = v_dot4(m.r1, v); + f32x4 z = v_dot4(m.r2, v); + f32x4 w = v_dot4(m.r3, v); + + // combine to a vector + return _mm_set_ps(_mm_cvtss_f32(w), + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot3(m.r0, v); + f32x4 y = v_dot3(m.r1, v); + f32x4 z = v_dot3(m.r2, v); + + // combine to a vector + return _mm_set_ps(0.0f, + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + //------------------------------------------------------ + // Matrix × Matrix + //------------------------------------------------------ + + inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b) + { + // Transpose B once for dot products + f32x4 b0 = b.r0; + f32x4 b1 = b.r1; + f32x4 b2 = b.r2; + f32x4 b3 = b.r3; + + // Transpose (SSE2) + _MM_TRANSPOSE4_PS(b0, b1, b2, b3); + + f32x4x4 C; + + auto mul_row = [&](f32x4 arow) + { + f32x4 x = v_dot4(arow, b0); + f32x4 y = v_dot4(arow, b1); + f32x4 z = v_dot4(arow, b2); + f32x4 w = v_dot4(arow, b3); + + f32x4 xy = _mm_unpacklo_ps(x, y); + f32x4 zw = _mm_unpacklo_ps(z, w); 
+ return _mm_movelh_ps(xy, zw); + }; + + C.r0 = mul_row(a.r0); + C.r1 = mul_row(a.r1); + C.r2 = mul_row(a.r2); + C.r3 = mul_row(a.r3); + + return C; + } + + //------------------------------------------------------ + // Transpose + //------------------------------------------------------ + + inline f32x4x4 m_transpose(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + f32x4 r3 = m.r3; + + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + + return { r0, r1, r2, r3 }; + } + + inline f32x4x4 m_inverse_rigid(const f32x4x4& m) + { + f32x4x4 t = m_transpose(m); + + // Extract translation + f32x4 T = v_set( + _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))), + 0.0f + ); + + f32x4 newT = m_mul_vec4(t, T); + newT = _mm_sub_ps(v_zero(), newT); + + t.r0 = v_preserve_w(t.r0, newT); + t.r1 = v_preserve_w(t.r1, newT); + t.r2 = v_preserve_w(t.r2, newT); + t.r3 = v_set(0, 0, 0, 1); + + return t; + } + + inline f32x4 m_determinant(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(c, d); + f32x4 c1 = v_cross(d, b); + f32x4 c2 = v_cross(b, c); + + f32x4 term0 = _mm_mul_ps(a, c0); + f32x4 term1 = _mm_mul_ps(a, c1); + f32x4 term2 = _mm_mul_ps(a, c2); + + f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2)); + + return v_hadd4(det); + } + + inline f32x4 m_determinant_affine(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); + + f32x4 c0 = v_cross(r1, r2); + return v_dot3(r0, c0); // splatted determinant + } } diff --git a/Engine/source/math/isa/avx/mat44.cpp b/Engine/source/math/isa/avx/mat44.cpp new file mode 100644 index 000000000..6634b652c --- /dev/null +++ b/Engine/source/math/isa/avx/mat44.cpp @@ -0,0 
+1,22 @@ +#include "avx_intrinsics.h" +#include "mat44_dispatch.h" + +#include "mat44_impl.inl" + +namespace math_backend::mat44::dispatch +{ + void install_avx() + { + gMat44.transpose = mat44_transpose_impl; + gMat44.inverse = mat44_inverse_impl; + gMat44.affine_inverse = mat44_affine_inverse_impl; + gMat44.mul_mat44 = mat44_mul_mat44_impl; + gMat44.mul_pos3 = mat44_mul_pos3_impl; + gMat44.mul_vec3 = mat44_mul_vec3_impl; + gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.scale = mat44_scale_impl; + gMat44.get_scale = mat44_get_scale_impl; + gMat44.normalize = mat44_normalize_impl; + gMat44.determinant = mat44_get_determinant; + } +} diff --git a/Engine/source/math/isa/avx2/avx2_intrinsics.h b/Engine/source/math/isa/avx2/avx2_intrinsics.h index 0340d84a2..67f0df04a 100644 --- a/Engine/source/math/isa/avx2/avx2_intrinsics.h +++ b/Engine/source/math/isa/avx2/avx2_intrinsics.h @@ -1,5 +1,5 @@ #pragma once -#include // AVX/AVX2 intrinsics +#include // AVX2 namespace { @@ -26,11 +26,58 @@ namespace inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); } + inline f32x4 v_swizzle_mask(f32x4 v1,const int x, const int y, const int z, const int w) + { + __m128i idx = _mm_set_epi32(w, z, y, x); + return _mm_permutevar_ps(v1, idx); + } + + inline f32x4 v_swizzle_singular_mask(f32x4 v1, const int x) + { + __m128i idx = _mm_set1_epi32(x); + return _mm_permutevar_ps(v1, idx); + } + + inline f32x4 v_swizzle_lo(f32x4 v1) + { + return _mm_moveldup_ps(v1); + } + + inline f32x4 v_swizzle_hi(f32x4 v1) + { + return _mm_movehdup_ps(v1); + } + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) { return _mm_blend_ps(newv, original, 0b1000); } + //------------------------------------------------------ + // Shuffle helpers + //------------------------------------------------------ + + inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w) + { + f32x4 lo = v1; + f32x4 hi = v2; + + f32x4 combined_lo = _mm_permutevar_ps(lo, 
_mm_set_epi32(y, y, x, x)); + f32x4 combined_hi = _mm_permutevar_ps(hi, _mm_set_epi32(w, w, z, z)); + + return _mm_movelh_ps(combined_lo, combined_hi); + } + + inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2) + { + return _mm_movelh_ps(v1, v2); + } + + inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2) + { + return _mm_movehl_ps(v2, v1); + } + //------------------------------------------------------ // Float3 helpers (safe loading into 4 lanes) //------------------------------------------------------ @@ -54,6 +101,16 @@ namespace dst[2] = tmp[2]; } + inline f32x4 v_set(float x, float y, float z, float w) + { + return _mm_set_ps(w, z, y, x); + } + + inline f32x4 v_insert_w(f32x4 v, f32x4 w) + { + return _mm_insert_ps(v, w, 0x30); + } + //------------------------------------------------------ // Simple Arithmatic //------------------------------------------------------ @@ -70,7 +127,7 @@ namespace // Element-wise subtract inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - //------------------------------------------------------ + //------------------------------------------------------ // Fast recip //------------------------------------------------------ @@ -137,4 +194,195 @@ namespace __m128 sum = _mm_hadd_ps(a, a); return _mm_hadd_ps(sum, sum); } + + //------------------------------------------------------ + // Matrix type (row-major 4x4) + //------------------------------------------------------ + + struct f32x4x4 + { + f32x4 r0; + f32x4 r1; + f32x4 r2; + f32x4 r3; + }; + + //------------------------------------------------------ + // Matrix Load / Store + //------------------------------------------------------ + + inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major) + { + f32x4x4 out; + out.r0 = v_load(m + 0); + out.r1 = v_load(m + 4); + out.r2 = v_load(m + 8); + out.r3 = v_load(m + 12); + return out; + } + + inline void m_store(float* dst, const f32x4x4& m) + { + v_store(dst + 0, m.r0); + v_store(dst + 4, m.r1); + v_store(dst + 
8, m.r2); + v_store(dst + 12, m.r3); + } + + inline f32x4x4 m_identity() + { + f32x4x4 m; + m.r0 = _mm_set_ps(0, 0, 0, 1); + m.r1 = _mm_set_ps(0, 0, 1, 0); + m.r2 = _mm_set_ps(0, 1, 0, 0); + m.r3 = _mm_set_ps(1, 0, 0, 0); + return m; + } + + inline f32x4x4 m_zero() + { + f32x4 z = v_zero(); + return { z, z, z, z }; + } + + //------------------------------------------------------ + // Matrix × Vector + //------------------------------------------------------ + + inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot4(m.r0, v); + f32x4 y = v_dot4(m.r1, v); + f32x4 z = v_dot4(m.r2, v); + f32x4 w = v_dot4(m.r3, v); + + // combine to a vector + return _mm_set_ps(_mm_cvtss_f32(w), + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot3(m.r0, v); + f32x4 y = v_dot3(m.r1, v); + f32x4 z = v_dot3(m.r2, v); + + // combine to a vector + return _mm_set_ps(0.0f, + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + //------------------------------------------------------ + // Matrix × Matrix + //------------------------------------------------------ + + inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b) + { + // Transpose B once for dot products + f32x4 b0 = b.r0; + f32x4 b1 = b.r1; + f32x4 b2 = b.r2; + f32x4 b3 = b.r3; + + // Transpose (SSE2) + _MM_TRANSPOSE4_PS(b0, b1, b2, b3); + + f32x4x4 C; + + auto mul_row = [&](f32x4 arow) + { + f32x4 x = v_dot4(arow, b0); + f32x4 y = v_dot4(arow, b1); + f32x4 z = v_dot4(arow, b2); + f32x4 w = v_dot4(arow, b3); + + f32x4 xy = _mm_unpacklo_ps(x, y); + f32x4 zw = _mm_unpacklo_ps(z, w); + return _mm_movelh_ps(xy, zw); + }; + + C.r0 = mul_row(a.r0); + C.r1 = mul_row(a.r1); + C.r2 = mul_row(a.r2); + C.r3 = mul_row(a.r3); + + return C; + } + + //------------------------------------------------------ + // Transpose + //------------------------------------------------------ + + inline f32x4x4 m_transpose(const f32x4x4& m) + 
{ + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + f32x4 r3 = m.r3; + + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + + return { r0, r1, r2, r3 }; + } + + inline f32x4x4 m_inverse_rigid(const f32x4x4& m) + { + f32x4x4 t = m_transpose(m); + + // Extract translation + f32x4 T = v_set( + _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))), + 0.0f + ); + + f32x4 newT = m_mul_vec4(t, T); + newT = _mm_sub_ps(v_zero(), newT); + + t.r0 = v_preserve_w(t.r0, newT); + t.r1 = v_preserve_w(t.r1, newT); + t.r2 = v_preserve_w(t.r2, newT); + t.r3 = v_set(0, 0, 0, 1); + + return t; + } + + inline f32x4 m_determinant(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(c, d); + f32x4 c1 = v_cross(d, b); + f32x4 c2 = v_cross(b, c); + + f32x4 term0 = _mm_mul_ps(a, c0); + f32x4 term1 = _mm_mul_ps(a, c1); + f32x4 term2 = _mm_mul_ps(a, c2); + + f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2)); + + return v_hadd4(det); + } + + inline f32x4 m_determinant_affine(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); + + f32x4 c0 = v_cross(r1, r2); + return v_dot3(r0, c0); // splatted determinant + } } diff --git a/Engine/source/math/isa/avx2/mat44.cpp b/Engine/source/math/isa/avx2/mat44.cpp new file mode 100644 index 000000000..11e842174 --- /dev/null +++ b/Engine/source/math/isa/avx2/mat44.cpp @@ -0,0 +1,22 @@ +#include "avx2_intrinsics.h" +#include "mat44_dispatch.h" + +#include "mat44_impl.inl" + +namespace math_backend::mat44::dispatch +{ + void install_avx2() + { + gMat44.transpose = mat44_transpose_impl; + gMat44.inverse = mat44_inverse_impl; + gMat44.affine_inverse = mat44_affine_inverse_impl; + gMat44.mul_mat44 = 
mat44_mul_mat44_impl; + gMat44.mul_pos3 = mat44_mul_pos3_impl; + gMat44.mul_vec3 = mat44_mul_vec3_impl; + gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.scale = mat44_scale_impl; + gMat44.get_scale = mat44_get_scale_impl; + gMat44.normalize = mat44_normalize_impl; + gMat44.determinant = mat44_get_determinant; + } +} diff --git a/Engine/source/math/isa/sse2/mat44.cpp b/Engine/source/math/isa/sse2/mat44.cpp new file mode 100644 index 000000000..6738ee391 --- /dev/null +++ b/Engine/source/math/isa/sse2/mat44.cpp @@ -0,0 +1,22 @@ +#include "sse2_intrinsics.h" +#include "mat44_dispatch.h" + +#include "mat44_impl.inl" + +namespace math_backend::mat44::dispatch +{ + void install_sse2() + { + gMat44.transpose = mat44_transpose_impl; + gMat44.inverse = mat44_inverse_impl; + gMat44.affine_inverse = mat44_affine_inverse_impl; + gMat44.mul_mat44 = mat44_mul_mat44_impl; + gMat44.mul_pos3 = mat44_mul_pos3_impl; + gMat44.mul_vec3 = mat44_mul_vec3_impl; + gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.scale = mat44_scale_impl; + gMat44.get_scale = mat44_get_scale_impl; + gMat44.normalize = mat44_normalize_impl; + gMat44.determinant = mat44_get_determinant; + } +} diff --git a/Engine/source/math/isa/sse2/sse2_intrinsics.h b/Engine/source/math/isa/sse2/sse2_intrinsics.h index 85b09ebb0..71a95c1b7 100644 --- a/Engine/source/math/isa/sse2/sse2_intrinsics.h +++ b/Engine/source/math/isa/sse2/sse2_intrinsics.h @@ -7,7 +7,7 @@ namespace typedef __m128 f32x4; //------------------------------------------------------ - // Load / Store + // Vector/Point Load / Store //------------------------------------------------------ // Load 4 floats from memory into a SIMD register @@ -35,6 +35,17 @@ namespace return _mm_set_ps(1.0f, p[2], p[1], p[0]); } + inline f32x4 v_set(float x, float y, float z, float w) + { + return _mm_set_ps(w, z, y, x); + } + + inline f32x4 v_insert_w(f32x4 v, f32x4 w) + { + f32x4 mask = _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)); + return _mm_or_ps( 
_mm_and_ps(mask, _mm_shuffle_ps(w, w, _MM_SHUFFLE(0, 0, 0, 0))), _mm_andnot_ps(mask, v)); // broadcast w.x into lane 3 to match the SSE4.1 _mm_insert_ps(v, w, 0x30) variant
+ inline f32x4 v_abs(f32x4 v) + { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(v, mask); + } + //------------------------------------------------------ // Fast recip //------------------------------------------------------ @@ -153,4 +229,292 @@ namespace shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)); return _mm_add_ps(sums, shuf); } + + //------------------------------------------------------ + // Matrix type (row-major 4x4) + //------------------------------------------------------ + + struct f32x4x4 + { + f32x4 r0; + f32x4 r1; + f32x4 r2; + f32x4 r3; + }; + + //------------------------------------------------------ + // Matrix Load / Store + //------------------------------------------------------ + + inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major) + { + f32x4x4 out; + out.r0 = v_load(m + 0); + out.r1 = v_load(m + 4); + out.r2 = v_load(m + 8); + out.r3 = v_load(m + 12); + return out; + } + + inline void m_store(float* dst, const f32x4x4& m) + { + v_store(dst + 0, m.r0); + v_store(dst + 4, m.r1); + v_store(dst + 8, m.r2); + v_store(dst + 12, m.r3); + } + + inline f32x4x4 m_identity() + { + f32x4x4 m; + m.r0 = _mm_set_ps(0, 0, 0, 1); + m.r1 = _mm_set_ps(0, 0, 1, 0); + m.r2 = _mm_set_ps(0, 1, 0, 0); + m.r3 = _mm_set_ps(1, 0, 0, 0); + return m; + } + + inline f32x4x4 m_zero() + { + f32x4 z = v_zero(); + return { z, z, z, z }; + } + + //------------------------------------------------------ + // Matrix × Vector + //------------------------------------------------------ + + inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot4(m.r0, v); + f32x4 y = v_dot4(m.r1, v); + f32x4 z = v_dot4(m.r2, v); + f32x4 w = v_dot4(m.r3, v); + + // Pack lowest lane of each into a vector + f32x4 xy = _mm_unpacklo_ps(x, y); + f32x4 zw = _mm_unpacklo_ps(z, w); + return _mm_movelh_ps(xy, zw); + } + + inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot3(m.r0, v); + f32x4 y = 
v_dot3(m.r1, v); + f32x4 z = v_dot3(m.r2, v); + + // combine to a vector + return _mm_set_ps(0.0f, + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + //------------------------------------------------------ + // Matrix × Matrix + //------------------------------------------------------ + + inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b) + { + // Transpose B once for dot products + f32x4 b0 = b.r0; + f32x4 b1 = b.r1; + f32x4 b2 = b.r2; + f32x4 b3 = b.r3; + + // Transpose (SSE2) + _MM_TRANSPOSE4_PS(b0, b1, b2, b3); + + f32x4x4 C; + + auto mul_row = [&](f32x4 arow) + { + f32x4 x = v_dot4(arow, b0); + f32x4 y = v_dot4(arow, b1); + f32x4 z = v_dot4(arow, b2); + f32x4 w = v_dot4(arow, b3); + + f32x4 xy = _mm_unpacklo_ps(x, y); + f32x4 zw = _mm_unpacklo_ps(z, w); + return _mm_movelh_ps(xy, zw); + }; + + C.r0 = mul_row(a.r0); + C.r1 = mul_row(a.r1); + C.r2 = mul_row(a.r2); + C.r3 = mul_row(a.r3); + + return C; + } + + //------------------------------------------------------ + // Transpose + //------------------------------------------------------ + + inline f32x4x4 m_transpose(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + f32x4 r3 = m.r3; + + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + + return { r0, r1, r2, r3 }; + } + + inline void m_set3x3_transpose(float* dst, f32x4 c0, f32x4 c1, f32x4 c2) + { + dst[0] = v_extract0(c0); // c0.x + dst[1] = v_extract0(c1); // c1.x + dst[2] = v_extract0(c2); // c2.x + + dst[4] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 1))); // c0.y + dst[5] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 1))); // c1.y + dst[6] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 1))); // c2.y + + dst[8] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 2))); // c0.z + dst[9] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 2))); // c1.z + dst[10] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 2))); // c2.z + } + + inline f32x4x4 
m_inverse(const f32x4x4& m) // NOTE(review): det = v_dot4(a, cross(c,d)) collapses to 0 when the bottom row is (0,0,0,1) — confirm the intended input layout before wiring this into dispatch
3, 3, 3))); + tmp[1] = _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[2] = _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[3] = 0.0f; + + f32x4 T = v_load(tmp); + + // New translation = -R' * T + f32x4 newT = m_mul_vec4({ t0, t1, t2, t3 }, T); + newT = _mm_sub_ps(v_zero(), newT); + + // Assemble final matrix + f32x4x4 out; + out.r0 = v_preserve_w(t0, newT); + out.r1 = v_preserve_w(t1, newT); + out.r2 = v_preserve_w(t2, newT); + out.r3 = _mm_set_ps(1, 0, 0, 0); + + return out; + } + + inline f32x4 m_determinant(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(c, d); + f32x4 c1 = v_cross(d, b); + f32x4 c2 = v_cross(b, c); + + f32x4 term0 = _mm_mul_ps(a, c0); + f32x4 term1 = _mm_mul_ps(a, c1); + f32x4 term2 = _mm_mul_ps(a, c2); + + f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2)); + + return v_hadd4(det); + } + + inline f32x4 m_determinant_affine(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); + + f32x4 c0 = v_cross(r1, r2); + return v_dot3(r0, c0); // splatted determinant + } } diff --git a/Engine/source/math/isa/sse41/mat44.cpp b/Engine/source/math/isa/sse41/mat44.cpp new file mode 100644 index 000000000..94bc5e4b2 --- /dev/null +++ b/Engine/source/math/isa/sse41/mat44.cpp @@ -0,0 +1,22 @@ +#include "sse41_intrinsics.h" +#include "mat44_dispatch.h" + +#include "mat44_impl.inl" + +namespace math_backend::mat44::dispatch +{ + void install_sse41() + { + gMat44.transpose = mat44_transpose_impl; + gMat44.inverse = mat44_inverse_impl; + gMat44.affine_inverse = mat44_affine_inverse_impl; + gMat44.mul_mat44 = mat44_mul_mat44_impl; + gMat44.mul_pos3 = mat44_mul_pos3_impl; + gMat44.mul_vec3 = mat44_mul_vec3_impl; + gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.scale = mat44_scale_impl; + 
gMat44.get_scale = mat44_get_scale_impl; + gMat44.normalize = mat44_normalize_impl; + gMat44.determinant = mat44_get_determinant; + } +} diff --git a/Engine/source/math/isa/sse41/sse41_intrinsics.h b/Engine/source/math/isa/sse41/sse41_intrinsics.h index 047cb44ee..2ea63e6b5 100644 --- a/Engine/source/math/isa/sse41/sse41_intrinsics.h +++ b/Engine/source/math/isa/sse41/sse41_intrinsics.h @@ -26,11 +26,70 @@ namespace inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); } + inline f32x4 v_swizzle_mask(f32x4 v, int x, int y, int z, int w) + { + alignas(16) float tmp[4]; + _mm_store_ps(tmp, v); + + return _mm_set_ps( + tmp[w], + tmp[z], + tmp[y], + tmp[x] + ); + } + + inline f32x4 v_swizzle_singular_mask(f32x4 v, int x) + { + alignas(16) float tmp[4]; + _mm_store_ps(tmp, v); + return _mm_set1_ps(tmp[x]); + } + + inline f32x4 v_swizzle_lo(f32x4 v1) + { + return _mm_moveldup_ps(v1); + } + + inline f32x4 v_swizzle_hi(f32x4 v1) + { + return _mm_movehdup_ps(v1); + } + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) { return _mm_blend_ps(newv, original, 0b1000); } + //------------------------------------------------------ + // Shuffle helpers + //------------------------------------------------------ + + inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w) + { + alignas(16) float a[4], b[4]; + _mm_store_ps(a, v1); + _mm_store_ps(b, v2); + + return _mm_set_ps( + b[w], + b[z], + a[y], + a[x] + ); + } + + inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2) + { + return _mm_movelh_ps(v1, v2); + } + + inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2) + { + return _mm_movehl_ps(v2, v1); + } + + //------------------------------------------------------ // Float3 helpers (safe loading into 4 lanes) //------------------------------------------------------ @@ -54,6 +113,16 @@ namespace dst[2] = tmp[2]; } + inline f32x4 v_set(float x, float y, float z, float w) + { + return _mm_set_ps(w, z, y, x); + } + + inline f32x4 
v_insert_w(f32x4 v, f32x4 w) + { + return _mm_insert_ps(v, w, 0x30); + } + //------------------------------------------------------ // Simple Arithmatic //------------------------------------------------------ @@ -70,7 +139,7 @@ namespace // Element-wise subtract inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - //------------------------------------------------------ + //------------------------------------------------------ // Fast recip //------------------------------------------------------ @@ -104,13 +173,13 @@ namespace // full dot4 inline f32x4 v_dot4(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1 + return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into lane 1 } // dot3 (ignores w) inline f32x4 v_dot3(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1 + return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into lane 1 } // cross product xyz only. @@ -137,4 +206,293 @@ namespace __m128 sum = _mm_hadd_ps(a, a); return _mm_hadd_ps(sum, sum); } + + //------------------------------------------------------ + // Matrix type (row-major 4x4) + //------------------------------------------------------ + + struct f32x4x4 + { + f32x4 r0; + f32x4 r1; + f32x4 r2; + f32x4 r3; + }; + + //------------------------------------------------------ + // Matrix Load / Store + //------------------------------------------------------ + + inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major) + { + f32x4x4 out; + out.r0 = v_load(m + 0); + out.r1 = v_load(m + 4); + out.r2 = v_load(m + 8); + out.r3 = v_load(m + 12); + return out; + } + + inline void m_store(float* dst, const f32x4x4& m) + { + _mm_storeu_ps(dst + 0, m.r0); + _mm_storeu_ps(dst + 4, m.r1); + _mm_storeu_ps(dst + 8, m.r2); + _mm_storeu_ps(dst + 12, m.r3); + } + + inline f32x4x4 m_identity() + { + f32x4x4 m; + m.r0 = _mm_set_ps(0, 0, 0, 1); + m.r1 = _mm_set_ps(0, 0, 1, 0); + m.r2 = _mm_set_ps(0, 1, 0, 0); + m.r3 = 
_mm_set_ps(1, 0, 0, 0); + return m; + } + + inline f32x4x4 m_zero() + { + f32x4 z = v_zero(); + return { z, z, z, z }; + } + + //------------------------------------------------------ + // Matrix × Vector + //------------------------------------------------------ + + inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot4(m.r0, v); + f32x4 y = v_dot4(m.r1, v); + f32x4 z = v_dot4(m.r2, v); + f32x4 w = v_dot4(m.r3, v); + + // combine to a vector + return _mm_set_ps(_mm_cvtss_f32(w), + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot3(m.r0, v); + f32x4 y = v_dot3(m.r1, v); + f32x4 z = v_dot3(m.r2, v); + + // combine to a vector + return _mm_set_ps(0.0f, + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + //------------------------------------------------------ + // Matrix × Matrix + //------------------------------------------------------ + + inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b) + { + // Transpose B once for dot products + f32x4 b0 = b.r0; + f32x4 b1 = b.r1; + f32x4 b2 = b.r2; + f32x4 b3 = b.r3; + + // Transpose (SSE2) + _MM_TRANSPOSE4_PS(b0, b1, b2, b3); + + f32x4x4 C; + + auto mul_row = [&](f32x4 arow) + { + f32x4 x = v_dot4(arow, b0); + f32x4 y = v_dot4(arow, b1); + f32x4 z = v_dot4(arow, b2); + f32x4 w = v_dot4(arow, b3); + + f32x4 xy = _mm_unpacklo_ps(x, y); + f32x4 zw = _mm_unpacklo_ps(z, w); + return _mm_movelh_ps(xy, zw); + }; + + C.r0 = mul_row(a.r0); + C.r1 = mul_row(a.r1); + C.r2 = mul_row(a.r2); + C.r3 = mul_row(a.r3); + + return C; + } + + //------------------------------------------------------ + // Transpose + //------------------------------------------------------ + + inline f32x4x4 m_transpose(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + f32x4 r3 = m.r3; + + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + + return { r0, r1, r2, r3 }; + } + + inline void m_set3x3_transpose(float* 
dst, f32x4 c0, f32x4 c1, f32x4 c2) + { + dst[0] = v_extract0(c0); // c0.x + dst[1] = v_extract0(c1); // c1.x + dst[2] = v_extract0(c2); // c2.x + + dst[4] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 1))); // c0.y + dst[5] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 1))); // c1.y + dst[6] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 1))); // c2.y + + dst[8] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 2))); // c0.z + dst[9] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 2))); // c1.z + dst[10] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 2))); // c2.z + } + + inline f32x4x4 m_inverse(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(b, c); + f32x4 c1 = v_cross(c, d); + f32x4 c2 = v_cross(d, a); + f32x4 c3 = v_cross(a, b); + + f32x4 det = v_dot4(a, c1); + f32x4 invDet = v_rcp_nr(det); + + f32x4x4 adj; + adj.r0 = _mm_mul_ps(c1, invDet); + adj.r1 = _mm_mul_ps(c2, invDet); + adj.r2 = _mm_mul_ps(c3, invDet); + adj.r3 = _mm_mul_ps(c0, invDet); + + return m_transpose(adj); + } + + inline f32x4x4 m_inverse_rigid(const f32x4x4& m) + { + f32x4x4 t = m_transpose(m); + + // Extract translation + f32x4 T = v_set( + _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))), + 0.0f + ); + + f32x4 newT = m_mul_vec4(t, T); + newT = _mm_sub_ps(v_zero(), newT); + + t.r0 = v_preserve_w(t.r0, newT); + t.r1 = v_preserve_w(t.r1, newT); + t.r2 = v_preserve_w(t.r2, newT); + t.r3 = v_set(0, 0, 0, 1); + + return t; + } + + inline f32x4x4 m_inverse_affine(const f32x4x4& m) + { + // Extract upper 3x3 rows + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + // Zero translation + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); 
+ + // Compute cofactors via cross products + f32x4 c0 = v_cross(r1, r2); + f32x4 c1 = v_cross(r2, r0); + f32x4 c2 = v_cross(r0, r1); + + // Determinant + f32x4 det = v_dot3(r0, c0); + + // Reciprocal determinant + f32x4 invDet = v_rcp_nr(det); + + // Inverse 3x3 (transpose of cofactor matrix) + f32x4 i0 = _mm_mul_ps(c0, invDet); + f32x4 i1 = _mm_mul_ps(c1, invDet); + f32x4 i2 = _mm_mul_ps(c2, invDet); + + // Transpose 3x3 + f32x4 t0 = i0; + f32x4 t1 = i1; + f32x4 t2 = i2; + f32x4 t3 = _mm_setzero_ps(); + + _MM_TRANSPOSE4_PS(t0, t1, t2, t3); + + // Extract translation + alignas(16) float tmp[4]; + tmp[0] = _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[1] = _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[2] = _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[3] = 0.0f; + + f32x4 T = v_load(tmp); + + // New translation = -R' * T + f32x4 newT = m_mul_vec4({ t0, t1, t2, t3 }, T); + newT = _mm_sub_ps(v_zero(), newT); + + // Assemble final matrix + f32x4x4 out; + out.r0 = v_preserve_w(t0, newT); + out.r1 = v_preserve_w(t1, newT); + out.r2 = v_preserve_w(t2, newT); + out.r3 = _mm_set_ps(1, 0, 0, 0); + + return out; + } + + inline f32x4 m_determinant(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(c, d); + f32x4 c1 = v_cross(d, b); + f32x4 c2 = v_cross(b, c); + + f32x4 term0 = _mm_mul_ps(a, c0); + f32x4 term1 = _mm_mul_ps(a, c1); + f32x4 term2 = _mm_mul_ps(a, c2); + + f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2)); + + return v_hadd4(det); + } + + inline f32x4 m_determinant_affine(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); + + f32x4 c0 = v_cross(r1, r2); + return v_dot3(r0, c0); // splatted determinant + } } diff --git a/Engine/source/math/mMathFn.h 
b/Engine/source/math/mMathFn.h index 9f84b37df..e9e7d3840 100644 --- a/Engine/source/math/mMathFn.h +++ b/Engine/source/math/mMathFn.h @@ -76,13 +76,11 @@ extern void (*m_quatF_set_matF)( F32 x, F32 y, F32 z, F32 w, F32* m ); extern void (*m_matF_set_euler)(const F32 *e, F32 *result); extern void (*m_matF_set_euler_point)(const F32 *e, const F32 *p, F32 *result); extern void (*m_matF_identity)(F32 *m); -extern void (*m_matF_inverse)(F32 *m); -extern void (*m_matF_invert_to)(const F32 *m, F32 *d); -extern void (*m_matF_affineInverse)(F32 *m); +extern void (*m_matF_invert_to)(const F32* m, F32* d); extern void (*m_matF_transpose)(F32 *m); extern void (*m_matF_scale)(F32 *m,const F32* p); extern void (*m_matF_normalize)(F32 *m); -extern F32 (*m_matF_determinant)(const F32 *m); +extern F32(*m_matF_determinant)(const F32* m); extern void (*m_matF_x_matF)(const F32 *a, const F32 *b, F32 *mresult); extern void (*m_matF_x_matF_aligned)(const F32 *a, const F32 *b, F32 *mresult); // extern void (*m_matF_x_point3F)(const F32 *m, const F32 *p, F32 *presult); diff --git a/Engine/source/math/mMath_C.cpp b/Engine/source/math/mMath_C.cpp index 7fac832f9..ad5aed0db 100644 --- a/Engine/source/math/mMath_C.cpp +++ b/Engine/source/math/mMath_C.cpp @@ -464,71 +464,26 @@ static void affineInvertTo(const F32 * m, F32 * out) } #endif -//-------------------------------------- -static void m_matF_inverse_C(F32 *m) +static void m_matF_invert_to_C(const F32* m, F32* d) { // using Cramers Rule find the Inverse // Minv = (1/det(M)) * adjoint(M) - F32 det = m_matF_determinant( m ); - AssertFatal( det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); + F32 det = m_matF_determinant(m); + AssertFatal(det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); - F32 invDet = 1.0f/det; - F32 temp[16]; + F32 invDet = 1.0f / det; - temp[0] = (m[5] * m[10]- m[6] * m[9]) * invDet; - temp[1] = (m[9] * m[2] - m[10]* m[1]) * invDet; - temp[2] = (m[1] * m[6] - m[2] * m[5]) * invDet; - - 
temp[4] = (m[6] * m[8] - m[4] * m[10])* invDet; - temp[5] = (m[10]* m[0] - m[8] * m[2]) * invDet; - temp[6] = (m[2] * m[4] - m[0] * m[6]) * invDet; - - temp[8] = (m[4] * m[9] - m[5] * m[8]) * invDet; - temp[9] = (m[8] * m[1] - m[9] * m[0]) * invDet; - temp[10]= (m[0] * m[5] - m[1] * m[4]) * invDet; - - m[0] = temp[0]; - m[1] = temp[1]; - m[2] = temp[2]; - - m[4] = temp[4]; - m[5] = temp[5]; - m[6] = temp[6]; - - m[8] = temp[8]; - m[9] = temp[9]; - m[10] = temp[10]; - - // invert the translation - temp[0] = -m[3]; - temp[1] = -m[7]; - temp[2] = -m[11]; - m_matF_x_vectorF(m, temp, &temp[4]); - m[3] = temp[4]; - m[7] = temp[5]; - m[11]= temp[6]; -} - -static void m_matF_invert_to_C(const F32 *m, F32 *d) -{ - // using Cramers Rule find the Inverse - // Minv = (1/det(M)) * adjoint(M) - F32 det = m_matF_determinant( m ); - AssertFatal( det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); - - F32 invDet = 1.0f/det; - - d[0] = (m[5] * m[10]- m[6] * m[9]) * invDet; - d[1] = (m[9] * m[2] - m[10]* m[1]) * invDet; + d[0] = (m[5] * m[10] - m[6] * m[9]) * invDet; + d[1] = (m[9] * m[2] - m[10] * m[1]) * invDet; d[2] = (m[1] * m[6] - m[2] * m[5]) * invDet; - d[4] = (m[6] * m[8] - m[4] * m[10])* invDet; - d[5] = (m[10]* m[0] - m[8] * m[2]) * invDet; + d[4] = (m[6] * m[8] - m[4] * m[10]) * invDet; + d[5] = (m[10] * m[0] - m[8] * m[2]) * invDet; d[6] = (m[2] * m[4] - m[0] * m[6]) * invDet; d[8] = (m[4] * m[9] - m[5] * m[8]) * invDet; d[9] = (m[8] * m[1] - m[9] * m[0]) * invDet; - d[10]= (m[0] * m[5] - m[1] * m[4]) * invDet; + d[10] = (m[0] * m[5] - m[1] * m[4]) * invDet; // invert the translation F32 temp[6]; @@ -538,33 +493,13 @@ static void m_matF_invert_to_C(const F32 *m, F32 *d) m_matF_x_vectorF(d, temp, &temp[3]); d[3] = temp[3]; d[7] = temp[4]; - d[11]= temp[5]; - d[ 12 ] = m[ 12 ]; - d[ 13 ] = m[ 13 ]; - d[ 14 ] = m[ 14 ]; - d[ 15 ] = m[ 15 ]; + d[11] = temp[5]; + d[12] = m[12]; + d[13] = m[13]; + d[14] = m[14]; + d[15] = m[15]; } 
-//-------------------------------------- -static void m_matF_affineInverse_C(F32 *m) -{ - // Matrix class checks to make sure this is an affine transform before calling - // this function, so we can proceed assuming it is... - F32 temp[16]; - dMemcpy(temp, m, 16 * sizeof(F32)); - - // Transpose rotation - m[1] = temp[4]; - m[4] = temp[1]; - m[2] = temp[8]; - m[8] = temp[2]; - m[6] = temp[9]; - m[9] = temp[6]; - - m[3] = -(temp[0]*temp[3] + temp[4]*temp[7] + temp[8]*temp[11]); - m[7] = -(temp[1]*temp[3] + temp[5]*temp[7] + temp[9]*temp[11]); - m[11] = -(temp[2]*temp[3] + temp[6]*temp[7] + temp[10]*temp[11]); -} inline void swap(F32 &a, F32 &b) { @@ -633,11 +568,11 @@ static void m_matF_normalize_C(F32 *m) //-------------------------------------- -static F32 m_matF_determinant_C(const F32 *m) +static F32 m_matF_determinant_C(const F32* m) { - return m[0] * (m[5] * m[10] - m[6] * m[9]) + - m[4] * (m[2] * m[9] - m[1] * m[10]) + - m[8] * (m[1] * m[6] - m[2] * m[5]) ; + return m[0] * (m[5] * m[10] - m[6] * m[9]) + + m[4] * (m[2] * m[9] - m[1] * m[10]) + + m[8] * (m[1] * m[6] - m[2] * m[5]); } @@ -890,13 +825,11 @@ void (*m_quatF_set_matF)( F32 x, F32 y, F32 z, F32 w, F32* m ) = m_quatF_set_mat void (*m_matF_set_euler)(const F32 *e, F32 *result) = m_matF_set_euler_C; void (*m_matF_set_euler_point)(const F32 *e, const F32 *p, F32 *result) = m_matF_set_euler_point_C; void (*m_matF_identity)(F32 *m) = m_matF_identity_C; -void (*m_matF_inverse)(F32 *m) = m_matF_inverse_C; -void (*m_matF_affineInverse)(F32 *m) = m_matF_affineInverse_C; -void (*m_matF_invert_to)(const F32 *m, F32 *d) = m_matF_invert_to_C; +void (*m_matF_invert_to)(const F32* m, F32* d) = m_matF_invert_to_C; void (*m_matF_transpose)(F32 *m) = m_matF_transpose_C; void (*m_matF_scale)(F32 *m,const F32* p) = m_matF_scale_C; void (*m_matF_normalize)(F32 *m) = m_matF_normalize_C; -F32 (*m_matF_determinant)(const F32 *m) = m_matF_determinant_C; +F32(*m_matF_determinant)(const F32* m) = m_matF_determinant_C; void 
(*m_matF_x_matF)(const F32 *a, const F32 *b, F32 *mresult) = default_matF_x_matF_C; void (*m_matF_x_matF_aligned)(const F32 *a, const F32 *b, F32 *mresult) = default_matF_x_matF_C; // void (*m_matF_x_point3F)(const F32 *m, const F32 *p, F32 *presult) = m_matF_x_point3F_C; @@ -935,13 +868,11 @@ void mInstallLibrary_C() m_matF_set_euler = m_matF_set_euler_C; m_matF_set_euler_point = m_matF_set_euler_point_C; m_matF_identity = m_matF_identity_C; - m_matF_inverse = m_matF_inverse_C; - m_matF_affineInverse = m_matF_affineInverse_C; - m_matF_invert_to = m_matF_invert_to_C; + m_matF_invert_to = m_matF_invert_to_C; m_matF_transpose = m_matF_transpose_C; m_matF_scale = m_matF_scale_C; m_matF_normalize = m_matF_normalize_C; - m_matF_determinant = m_matF_determinant_C; + m_matF_determinant = m_matF_determinant_C; m_matF_x_matF = default_matF_x_matF_C; m_matF_x_matF_aligned = default_matF_x_matF_C; // m_matF_x_point3F = m_matF_x_point3F_C; diff --git a/Engine/source/math/mMatrix.h b/Engine/source/math/mMatrix.h index 763a62b93..f42aaa5e4 100644 --- a/Engine/source/math/mMatrix.h +++ b/Engine/source/math/mMatrix.h @@ -50,13 +50,16 @@ #ifndef _CONSOLE_H_ #include "console/console.h" #endif +#ifndef _MATH_BACKEND_H_ +#include "math/public/math_backend.h" +#endif #ifndef USE_TEMPLATE_MATRIX /// 4x4 Matrix Class /// /// This runs at F32 precision. - +using math_backend::mat44::dispatch::gMat44; class MatrixF { friend class MatrixFEngineExport; @@ -128,7 +131,7 @@ public: EulerF toEuler() const; F32 determinant() const { - return m_matF_determinant(*this); + return gMat44.determinant(*this); } /// Compute the inverse of the matrix. 
@@ -372,71 +375,68 @@ inline MatrixF& MatrixF::identity() inline MatrixF& MatrixF::inverse() { - m_matF_inverse(m); + gMat44.inverse(m); return (*this); } inline void MatrixF::invertTo( MatrixF *out ) { - m_matF_invert_to(m,*out); + *out = *this; + gMat44.inverse(*out); } inline MatrixF& MatrixF::affineInverse() { -// AssertFatal(isAffine() == true, "Error, this matrix is not an affine transform"); - m_matF_affineInverse(m); + gMat44.affine_inverse(m); return (*this); } inline MatrixF& MatrixF::transpose() { - m_matF_transpose(m); + gMat44.transpose(m); return (*this); } inline MatrixF& MatrixF::scale(const Point3F& p) { - m_matF_scale(m,p); + gMat44.scale(m, p); return *this; } inline Point3F MatrixF::getScale() const { Point3F scale; - scale.x = mSqrt(m[0]*m[0] + m[4] * m[4] + m[8] * m[8]); - scale.y = mSqrt(m[1]*m[1] + m[5] * m[5] + m[9] * m[9]); - scale.z = mSqrt(m[2]*m[2] + m[6] * m[6] + m[10] * m[10]); + gMat44.get_scale(m, scale); return scale; } inline void MatrixF::normalize() { - m_matF_normalize(m); } inline MatrixF& MatrixF::mul( const MatrixF &a ) { // M * a -> M AssertFatal(&a != this, "MatrixF::mul - a.mul(a) is invalid!"); - MatrixF tempThis(*this); - m_matF_x_matF(tempThis, a, *this); + MatrixF tempThis(*this); + gMat44.mul_mat44(tempThis, a, *this); return (*this); } inline MatrixF& MatrixF::mulL( const MatrixF &a ) { // a * M -> M AssertFatal(&a != this, "MatrixF::mulL - a.mul(a) is invalid!"); - MatrixF tempThis(*this); - m_matF_x_matF(a, tempThis, *this); + MatrixF tempThis(*this); + gMat44.mul_mat44(a, tempThis, *this); return (*this); } inline MatrixF& MatrixF::mul( const MatrixF &a, const MatrixF &b ) { // a * b -> M AssertFatal((&a != this) && (&b != this), "MatrixF::mul - a.mul(a, b) a.mul(b, a) a.mul(a, a) is invalid!"); - - m_matF_x_matF(a, b, *this); + gMat44.mul_mat44(a, b, *this); return (*this); } @@ -461,7 +461,7 @@ inline MatrixF& MatrixF::mul(const MatrixF &a, const F32 b) inline void MatrixF::mul( Point4F& p ) const { Point4F temp; -
m_matF_x_point4F(*this, &p.x, &temp.x); + gMat44.mul_float4(*this, p, temp); p = temp; } @@ -469,28 +469,28 @@ inline void MatrixF::mulP( Point3F& p) const { // M * p -> d Point3F d; - m_matF_x_point3F(*this, &p.x, &d.x); + gMat44.mul_pos3(*this, p, d); p = d; } inline void MatrixF::mulP( const Point3F &p, Point3F *d) const { // M * p -> d - m_matF_x_point3F(*this, &p.x, &d->x); + gMat44.mul_pos3(*this, p, *d); } inline void MatrixF::mulV( VectorF& v) const { // M * v -> v VectorF temp; - m_matF_x_vectorF(*this, &v.x, &temp.x); + gMat44.mul_vec3(*this, v, temp); v = temp; } inline void MatrixF::mulV( const VectorF &v, Point3F *d) const { // M * v -> d - m_matF_x_vectorF(*this, &v.x, &d->x); + gMat44.mul_vec3(*this, v, *d); } inline void MatrixF::mul(Box3F& b) const @@ -634,14 +634,14 @@ inline MatrixF operator * ( const MatrixF &m1, const MatrixF &m2 ) { // temp = m1 * m2 MatrixF temp; - m_matF_x_matF(m1, m2, temp); + gMat44.mul_mat44(m1, m2, temp); return temp; } inline MatrixF& MatrixF::operator *= ( const MatrixF &m1 ) { MatrixF tempThis(*this); - m_matF_x_matF(tempThis, m1, *this); + gMat44.mul_mat44(tempThis, m1, *this); return (*this); } diff --git a/Engine/source/math/public/mat44_dispatch.h b/Engine/source/math/public/mat44_dispatch.h index 61b488861..b1b78ef13 100644 --- a/Engine/source/math/public/mat44_dispatch.h +++ b/Engine/source/math/public/mat44_dispatch.h @@ -8,7 +8,16 @@ namespace math_backend::mat44::dispatch struct Mat44Funcs { void (*transpose)(float*) = nullptr; + void (*inverse)(float*) = nullptr; + void (*affine_inverse)(float*) = nullptr; + void (*normalize)(float*) = nullptr; + void (*mul_mat44)(const float* a, const float* b, float* r) = nullptr; + void (*mul_pos3)(const float* a, const float* b, float* r) = nullptr; + void (*mul_vec3)(const float* a, const float* b, float* r) = nullptr; + void (*mul_float4)(const float* a, const float* b, float* r) = nullptr; + float (*determinant)(const float*) = nullptr; void (*scale)(float*, const 
float*) = nullptr; + void (*get_scale)(const float*, float*) = nullptr; }; // Global dispatch table diff --git a/Engine/source/math/public/math_backend.cpp b/Engine/source/math/public/math_backend.cpp index 9b5e5daed..81e00b3c7 100644 --- a/Engine/source/math/public/math_backend.cpp +++ b/Engine/source/math/public/math_backend.cpp @@ -18,6 +18,26 @@ namespace math_backend::mat44::dispatch Mat44Funcs gMat44{}; } +namespace math_backend +{ + // Use an anonymous namespace for the static initializer + namespace { + struct ScalarInitializer + { + ScalarInitializer() + { + // Install scalar defaults immediately for all types + float4::dispatch::install_scalar(); + float3::dispatch::install_scalar(); + mat44::dispatch::install_scalar(); + } + }; + + // Static instance ensures constructor runs before main() + ScalarInitializer g_scalarInitializer; + } +} + math_backend::backend math_backend::choose_backend(U32 cpu_flags) { @@ -47,21 +67,25 @@ void math_backend::install_from_cpu_flags(uint32_t cpu_flags) case backend::avx2: float4::dispatch::install_avx2(); float3::dispatch::install_avx2(); + mat44::dispatch::install_avx2(); break; case backend::avx: float4::dispatch::install_avx(); float3::dispatch::install_avx(); + mat44::dispatch::install_avx(); break; case backend::sse41: float4::dispatch::install_sse41(); float3::dispatch::install_sse41(); + mat44::dispatch::install_sse41(); break; case backend::sse2: float4::dispatch::install_sse2(); float3::dispatch::install_sse2(); + mat44::dispatch::install_sse2(); break; #elif defined(__aarch64__) || defined(__ARM_NEON) case backend::neon: diff --git a/Engine/source/ts/assimp/assimpAppNode.cpp b/Engine/source/ts/assimp/assimpAppNode.cpp index ff2e1ffd5..af448f940 100644 --- a/Engine/source/ts/assimp/assimpAppNode.cpp +++ b/Engine/source/ts/assimp/assimpAppNode.cpp @@ -250,7 +250,7 @@ MatrixF AssimpAppNode::getNodeTransform(F32 time) // Check for inverted node coordinate spaces => can happen when modelers // use the 'mirror' tool 
in their 3d app. Shows up as negative // transforms in the collada model. - if (m_matF_determinant(nodeTransform) < 0.0f) + if (nodeTransform.determinant() < 0.0f) { // Mark this node as inverted so we can mirror mesh geometry, then // de-invert the transform matrix diff --git a/Engine/source/ts/collada/colladaAppMesh.cpp b/Engine/source/ts/collada/colladaAppMesh.cpp index a944e2eef..4c4470654 100644 --- a/Engine/source/ts/collada/colladaAppMesh.cpp +++ b/Engine/source/ts/collada/colladaAppMesh.cpp @@ -1099,7 +1099,7 @@ void ColladaAppMesh::lookupSkinData() // Inverted node coordinate spaces (negative scale factor) are corrected // in ColladaAppNode::getNodeTransform, so need to apply the same operation // here to match - if (m_matF_determinant(invBind) < 0.0f) + if (invBind.determinant() < 0.0f) initialTransforms[iJoint].scale(Point3F(1, 1, -1)); initialTransforms[iJoint].mul(invBind); diff --git a/Engine/source/ts/collada/colladaAppNode.cpp b/Engine/source/ts/collada/colladaAppNode.cpp index 2ffc10fba..0cd8258c8 100644 --- a/Engine/source/ts/collada/colladaAppNode.cpp +++ b/Engine/source/ts/collada/colladaAppNode.cpp @@ -192,7 +192,7 @@ MatrixF ColladaAppNode::getNodeTransform(F32 time) // Check for inverted node coordinate spaces => can happen when modelers // use the 'mirror' tool in their 3d app. Shows up as negative // transforms in the collada model. - if (m_matF_determinant(nodeTransform) < 0.0f) + if (nodeTransform.determinant() < 0.0f) { // Mark this node as inverted so we can mirror mesh geometry, then // de-invert the transform matrix