diff --git a/Engine/source/math/impl/mat44_impl.inl b/Engine/source/math/impl/mat44_impl.inl index da6971d0d..e021b6fb1 100644 --- a/Engine/source/math/impl/mat44_impl.inl +++ b/Engine/source/math/impl/mat44_impl.inl @@ -15,8 +15,13 @@ namespace math_backend::mat44 inline float mat44_get_determinant(const float* m) { - f32x4x4 ma = m_load(m); - return v_extract0(m_determinant_affine(ma)); + f32x4 r0 = v_load3_vec(m + 0); // row0 xyz + f32x4 r1 = v_load3_vec(m + 4); // row1 xyz + f32x4 r2 = v_load3_vec(m + 8); // row2 xyz + + f32x4 c0 = v_cross(r1, r2); // cofactor for row0 + f32x4 det = v_dot3(r0, c0); // splatted determinant + return v_extract0(det); } // Matrix Scale: Float3 (assume w = 1.0f) @@ -32,51 +37,66 @@ namespace math_backend::mat44 m_store(m, ma); } - inline void mat44_transform_plane_impl(const float* m, const float* scale, const float* plane, float* plane_result) + inline void mat44_transform_plane_impl(const float* m, const float* s, const float* p, float* presult) { + f32x4 scale = v_load3_pos(s); + f32x4 invScale = v_div(v_set1(1.0f), scale); + + //-------------------------------------------------- + // Load affine 3x3 rows and translation + //-------------------------------------------------- + f32x4 row0 = v_load3_vec(m + 0); + f32x4 row1 = v_load3_vec(m + 4); + f32x4 row2 = v_load3_vec(m + 8); + f32x4 shear = v_set(m[3], m[7], m[11], 1.0); + + //-------------------------------------------------- + // Compute A, B, C = -dot(row, shear) + //-------------------------------------------------- + f32x4 A = v_mul(v_dot3(row0, shear), v_set1(-1.0f)); + f32x4 B = v_mul(v_dot3(row1, shear), v_set1(-1.0f)); + f32x4 C = v_mul(v_dot3(row2, shear), v_set1(-1.0f)); + + + f32x4x4 invTrMatrix; + invTrMatrix.r0 = v_set(m[0], m[1], m[2], v_extract0(A)); + invTrMatrix.r1 = v_set(m[4], m[5], m[6], v_extract0(B)); + invTrMatrix.r2 = v_set(m[8], m[9], m[10], v_extract0(C)); + invTrMatrix.r3 = v_set(0.0f, 0.0f, 0.0f, 1.0f); + + // Apply inverse scale to upper-left 3x3 + invTrMatrix.r0 = v_mul(invTrMatrix.r0, invScale); + invTrMatrix.r1 = v_mul(invTrMatrix.r1, invScale); + invTrMatrix.r2 = v_mul(invTrMatrix.r2, invScale); + + + f32x4 normal = v_load3_pos(p); // plane normal {x,y,z,1} + f32x4 point = v_mul(normal, v_set1(-p[3])); // point = -d * normal + + // Apply transform to normal + f32x4 normTransformed = m_mul_vec3(invTrMatrix, normal); + normTransformed = v_normalize3(normTransformed); + + // transform point with original + f32x4 scaleVec = v_load3_pos(s); // scale vector + f32x4 pointScaled = v_mul(point, scaleVec); + + pointScaled = v_insert_w(pointScaled, v_set1(1.0f)); + f32x4x4 M = m_load(m); + // Transform point + f32x4 pointTransformed = m_mul_vec4(M, pointScaled); - f32x4 plane_v = v_load(plane); - f32x4 scale_v = v_load3_vec(scale); - f32x4 invScale = v_rcp_nr(scale_v); + //-------------------------------------------------- + // Compute plane d = -dot(normal, transformedPoint) + //-------------------------------------------------- + f32x4 dp = v_dot3( pointTransformed, normTransformed); + float planeD = -v_extract0(dp); - // normal = plane.xyz - f32x4 normal = plane_v; - - // apply Inv(s) - normal = v_mul(normal, invScale); - - // multiply by Inv(Tr(m)) (only the rotation part matters) - f32x4 nx = v_mul(v_swizzle_singular_mask(normal, 0), M.r0); - f32x4 ny = v_mul(v_swizzle_singular_mask(normal, 1), M.r1); - f32x4 nz = v_mul(v_swizzle_singular_mask(normal, 2), M.r2); - - normal = v_add(v_add(nx, ny), nz); - - normal = v_normalize3(normal); - - // compute point on plane - float d = v_extract0(v_swizzle_singular_mask(plane_v, 3)); - - f32x4 point = v_mul(plane_v, v_set1(-d)); - point = v_preserve_w(point, v_set1(1.0f)); - - // apply scale - point = v_mul(point, scale_v); - - // transform point by matrix - point = m_mul_vec4(M, point); - - // compute new plane distance - float newD = -v_extract0(v_dot3(point, normal)); - - alignas(16) float n[4]; - v_store(n, normal); - - plane_result[0] = n[0]; - plane_result[1] = n[1]; - plane_result[2] = n[2]; - plane_result[3] = newD; + presult[0] = v_extract0(normTransformed); + presult[1] = v_extract0(v_swizzle_mask(normTransformed, 1, 1, 1, 1)); + presult[2] = v_extract0(v_swizzle_mask(normTransformed, 2, 2, 2, 2)); + presult[3] = planeD; } inline void mat44_get_scale_impl(const float* m, float* s) @@ -109,92 +129,138 @@ namespace math_backend::mat44 m_store(m, ma); } + // Vector Multiply: m * v (assume w = 0.0f) + inline void mat44_mul_vec3_impl(const float* m, const float* v, float* r) + { + f32x4x4 ma = m_load(m); + f32x4 va = v_load3_vec(v); + f32x4 vr = m_mul_vec3(ma, va); + v_store3(r, vr); + } + + // Matrix Inverse inline void mat44_inverse_impl(float* m) { - f32x4x4 ma = m_load(m); + //// using Cramers Rule find the Inverse + //// Minv = (1/det(M)) * adjoint(M) + f32x4 r0 = v_load3_vec(m + 0); // row 0: m00 m01 m02 + f32x4 r1 = v_load3_vec(m + 4); // row 1: m10 m11 m12 + f32x4 r2 = v_load3_vec(m + 8); // row 2: m20 m21 m22 + float det = mat44_get_determinant(m); + f32x4 invDet = v_set1(1.0f / det); - // Compute cofactors using cross products - f32x4x4 mTemp; - mTemp.r0 = v_cross(ma.r1, ma.r2); - mTemp.r1 = v_cross(ma.r2, ma.r0); - mTemp.r2 = v_cross(ma.r0, ma.r1); + f32x4x4 temp; - // Determinant = dot(ma.r0, c0) - f32x4 det = v_dot3(ma.r0, mTemp.r0); - f32x4 invDet = v_rcp_nr(det); + temp.r0 = v_set( + (m[5] * m[10] - m[6] * m[9]), + (m[9] * m[2] - m[10] * m[1]), + (m[1] * m[6] - m[2] * m[5]), + 0 + ); - // Scale cofactors - mTemp.r0 = v_mul(mTemp.r0, invDet); - mTemp.r1 = v_mul(mTemp.r1, invDet); - mTemp.r2 = v_mul(mTemp.r2, invDet); + temp.r1 = v_set( + (m[6] * m[8] - m[4] * m[10]), + (m[10] * m[0] - m[8] * m[2]), + (m[2] * m[4] - m[0] * m[6]), + 0 + ); - // Store inverse 3x3 (transpose of cofactor matrix) + temp.r2 = v_set( + (m[4] * m[9] - m[5] * m[8]), + (m[8] * m[1] - m[9] * m[0]), + (m[0] * m[5] - m[1] * m[4]), + 0 + ); - mTemp = m_transpose(mTemp); - mTemp.r3 = ma.r3; + temp.r0 = v_mul(temp.r0, invDet); + temp.r1 = v_mul(temp.r1, invDet); + temp.r2 = v_mul(temp.r2, invDet); - // ---- Translation ---- + // Compute new translation: -R^-1 * T + f32x4 t = v_set(m[3], m[7], m[11], 0.0f); // row-major: last element in row + f32x4 t_new; - // Load original translation - f32x4 T = v_set(m[3], m[7], m[11], 0.0f); - - // Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2) - f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0)); - result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1))); - result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2))); - result = v_mul(result, v_set1(-1.0f)); + t_new = v_set( + -v_extract0(v_dot3(temp.r0, t)), + -v_extract0(v_dot3(temp.r1, t)), + -v_extract0(v_dot3(temp.r2, t)), + 0.0f + ); - m_store(m, mTemp); + // Store back rotation + m[0] = v_extract0(temp.r0); m[1] = v_extract0(v_swizzle_mask(temp.r0, 1, 1, 1, 1)); m[2] = v_extract0(v_swizzle_mask(temp.r0, 2, 2, 2, 2)); + m[4] = v_extract0(temp.r1); m[5] = v_extract0(v_swizzle_mask(temp.r1, 1, 1, 1, 1)); m[6] = v_extract0(v_swizzle_mask(temp.r1, 2, 2, 2, 2)); + m[8] = v_extract0(temp.r2); m[9] = v_extract0(v_swizzle_mask(temp.r2, 1, 1, 1, 1)); m[10] = v_extract0(v_swizzle_mask(temp.r2, 2, 2, 2, 2)); - // Store translation - m[3] = v_extract0(result); - m[7] = v_extract0(v_swizzle_singular_mask(result, 1)); - m[11] = v_extract0(v_swizzle_singular_mask(result, 2)); + // Store translation + m[3] = v_extract0(t_new); + m[7] = v_extract0(v_swizzle_mask(t_new, 1, 1, 1, 1)); + m[11] = v_extract0(v_swizzle_mask(t_new, 2, 2, 2, 2)); } // Matrix Inverse - inline void mat44_inverse_to_impl(const float* m, float* dst) + inline void mat44_inverse_to_impl(const float* m, float* d) { - f32x4x4 ma = m_load(m); + //// using Cramers Rule find the Inverse + //// Minv = (1/det(M)) * adjoint(M) + f32x4 r0 = v_load3_vec(m + 0); // row 0: m00 m01 m02 + f32x4 r1 = v_load3_vec(m + 4); // row 1: m10 m11 m12 + f32x4 r2 = v_load3_vec(m + 8); // row 2: m20 m21 m22 + float det = mat44_get_determinant(m); + f32x4 invDet = v_set1(1.0f / det); - // Compute cofactors using cross products - f32x4x4 mTemp; - mTemp.r0 = v_cross(ma.r1, ma.r2); - mTemp.r1 = v_cross(ma.r2, ma.r0); - mTemp.r2 = v_cross(ma.r0, ma.r1); + f32x4x4 temp; - // Determinant = dot(ma.r0, c0) - f32x4 det = v_dot3(ma.r0, mTemp.r0); - f32x4 invDet = v_rcp_nr(det); + temp.r0 = v_set( + (m[5] * m[10] - m[6] * m[9]), + (m[9] * m[2] - m[10] * m[1]), + (m[1] * m[6] - m[2] * m[5]), + 0 + ); - // Scale cofactors - mTemp.r0 = v_mul(mTemp.r0, invDet); - mTemp.r1 = v_mul(mTemp.r1, invDet); - mTemp.r2 = v_mul(mTemp.r2, invDet); + temp.r1 = v_set( + (m[6] * m[8] - m[4] * m[10]), + (m[10] * m[0] - m[8] * m[2]), + (m[2] * m[4] - m[0] * m[6]), + 0 + ); - // Store inverse 3x3 (transpose of cofactor matrix) + temp.r2 = v_set( + (m[4] * m[9] - m[5] * m[8]), + (m[8] * m[1] - m[9] * m[0]), + (m[0] * m[5] - m[1] * m[4]), + 0 + ); - mTemp = m_transpose(mTemp); - mTemp.r3 = ma.r3; + temp.r0 = v_mul(temp.r0, invDet); + temp.r1 = v_mul(temp.r1, invDet); + temp.r2 = v_mul(temp.r2, invDet); - // ---- Translation ---- + // Compute new translation: -R^-1 * T + f32x4 t = v_set(m[3], m[7], m[11], 0.0f); // row-major: last element in row + f32x4 t_new; - // Load original translation - f32x4 T = v_set(m[3], m[7], m[11], 0.0f); + t_new = v_set( + -v_extract0(v_dot3(temp.r0, t)), + -v_extract0(v_dot3(temp.r1, t)), + -v_extract0(v_dot3(temp.r2, t)), + 0.0f + ); - // Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2) - f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0)); - result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1))); - result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2))); - result = v_mul(result, v_set1(-1.0f)); + // Store back rotation + d[0] = v_extract0(temp.r0); d[1] = v_extract0(v_swizzle_mask(temp.r0, 1, 1, 1, 1)); d[2] = v_extract0(v_swizzle_mask(temp.r0, 2, 2, 2, 2)); + d[4] = v_extract0(temp.r1); d[5] = v_extract0(v_swizzle_mask(temp.r1, 1, 1, 1, 1)); d[6] = v_extract0(v_swizzle_mask(temp.r1, 2, 2, 2, 2)); + d[8] = v_extract0(temp.r2); d[9] = v_extract0(v_swizzle_mask(temp.r2, 1, 1, 1, 1)); d[10] = v_extract0(v_swizzle_mask(temp.r2, 2, 2, 2, 2)); - m_store(dst, mTemp); - - // Store translation - dst[3] = v_extract0(result); - dst[7] = v_extract0(v_swizzle_singular_mask(result, 1)); - dst[11] = v_extract0(v_swizzle_singular_mask(result, 2)); + // Store translation + d[3] = v_extract0(t_new); + d[7] = v_extract0(v_swizzle_mask(t_new, 1, 1, 1, 1)); + d[11] = v_extract0(v_swizzle_mask(t_new, 2, 2, 2, 2)); + d[12] = m[12]; + d[13] = m[13]; + d[14] = m[14]; + d[15] = m[15]; } // Matrix Affine Inverse @@ -275,15 +341,6 @@ namespace math_backend::mat44 v_store3(r, vr); } - // Vector Multiply: m * v (assume w = 0.0f) - inline void mat44_mul_vec3_impl(const float* m, const float* v, float* r) - { - f32x4x4 ma = m_load(m); - f32x4 va = v_load3_vec(v); - f32x4 vr = m_mul_vec3(ma, va); - v_store3(r, vr); - } - // Vector Multiply: m * p (full [4x4] * [1x4]) inline void mat44_mul_float4_impl(const float* m, const float* p, float* r) {