From a64cc6a512584bc616f8100b9df2f14d04d3b151 Mon Sep 17 00:00:00 2001 From: AzaezelX Date: Tue, 17 Mar 2026 12:59:45 -0500 Subject: [PATCH 1/4] revert some of the more experimental matrix math from marauder: Fixing transform plane as some discrepancies between the simd function and the c math caused issues with precision. Use c math for inverse functions until a proper simd implementation can be found. --- Engine/source/math/impl/mat44_impl.inl | 230 ++++++++++++++----------- 1 file changed, 128 insertions(+), 102 deletions(-) diff --git a/Engine/source/math/impl/mat44_impl.inl b/Engine/source/math/impl/mat44_impl.inl index da6971d0d..d688c6184 100644 --- a/Engine/source/math/impl/mat44_impl.inl +++ b/Engine/source/math/impl/mat44_impl.inl @@ -15,8 +15,13 @@ namespace math_backend::mat44 inline float mat44_get_determinant(const float* m) { - f32x4x4 ma = m_load(m); - return v_extract0(m_determinant_affine(ma)); + f32x4 r0 = v_load3_vec(m + 0); // row0 xyz + f32x4 r1 = v_load3_vec(m + 4); // row1 xyz + f32x4 r2 = v_load3_vec(m + 8); // row2 xyz + + f32x4 c0 = v_cross(r1, r2); // cofactor for row0 + f32x4 det = v_dot3(r0, c0); // splatted determinant + return v_extract0(det); } // Matrix Scale: Float3 (assume w = 1.0f) @@ -32,51 +37,81 @@ namespace math_backend::mat44 m_store(m, ma); } - inline void mat44_transform_plane_impl(const float* m, const float* scale, const float* plane, float* plane_result) + inline void mat44_transform_plane_impl(const float* m, const float* scale, const float* plane, float* presult) { - f32x4x4 M = m_load(m); - f32x4 plane_v = v_load(plane); f32x4 scale_v = v_load3_vec(scale); f32x4 invScale = v_rcp_nr(scale_v); - // normal = plane.xyz - f32x4 normal = plane_v; + f32x4x4 M = m_load(m); - // apply Inv(s) + f32x4 normal = plane_v; normal = v_mul(normal, invScale); - // multiply by Inv(Tr(m)) (only the rotation part matters) - f32x4 nx = v_mul(v_swizzle_singular_mask(normal, 0), M.r0); - f32x4 ny = v_mul(v_swizzle_singular_mask(normal, 1), 
M.r1); - f32x4 nz = v_mul(v_swizzle_singular_mask(normal, 2), M.r2); + //--------------------------------------------------------- + // Extract translation column (tx ty tz) + //--------------------------------------------------------- + f32x4 shear = v_set( + v_extract0(v_swizzle_singular_mask(M.r0, 3)), + v_extract0(v_swizzle_singular_mask(M.r1, 3)), + v_extract0(v_swizzle_singular_mask(M.r2, 3)), + 0.0f + ); + + float A = -v_extract0(v_dot3(M.r0, shear)); + float B = -v_extract0(v_dot3(M.r1, shear)); + float C = -v_extract0(v_dot3(M.r2, shear)); + + //--------------------------------------------------------- + // Build columns of rotation + //--------------------------------------------------------- + + f32x4 col0 = v_set( + v_extract0(v_swizzle_singular_mask(M.r0, 0)), + v_extract0(v_swizzle_singular_mask(M.r1, 0)), + v_extract0(v_swizzle_singular_mask(M.r2, 0)), + 0.0f + ); + + f32x4 col1 = v_set( + v_extract0(v_swizzle_singular_mask(M.r0, 1)), + v_extract0(v_swizzle_singular_mask(M.r1, 1)), + v_extract0(v_swizzle_singular_mask(M.r2, 1)), + 0.0f + ); + + f32x4 col2 = v_set( + v_extract0(v_swizzle_singular_mask(M.r0, 2)), + v_extract0(v_swizzle_singular_mask(M.r1, 2)), + v_extract0(v_swizzle_singular_mask(M.r2, 2)), + 0.0f + ); + + f32x4 nx = v_mul(v_swizzle_singular_mask(normal, 0), col0); + f32x4 ny = v_mul(v_swizzle_singular_mask(normal, 1), col1); + f32x4 nz = v_mul(v_swizzle_singular_mask(normal, 2), col2); normal = v_add(v_add(nx, ny), nz); - + normal = v_add(normal, v_set(A, B, C, 0.0f)); normal = v_normalize3(normal); - // compute point on plane float d = v_extract0(v_swizzle_singular_mask(plane_v, 3)); - f32x4 point = v_mul(plane_v, v_set1(-d)); - point = v_preserve_w(point, v_set1(1.0f)); + f32x4 origNormal = plane_v; + f32x4 point = v_mul(origNormal, v_set1(-d)); + point = v_insert_w(point, v_set1(1.0f)); - // apply scale point = v_mul(point, scale_v); - - // transform point by matrix point = m_mul_vec4(M, point); - - // compute new plane distance 
float newD = -v_extract0(v_dot3(point, normal)); alignas(16) float n[4]; v_store(n, normal); - plane_result[0] = n[0]; - plane_result[1] = n[1]; - plane_result[2] = n[2]; - plane_result[3] = newD; + presult[0] = n[0]; + presult[1] = n[1]; + presult[2] = n[2]; + presult[3] = newD; } inline void mat44_get_scale_impl(const float* m, float* s) @@ -109,92 +144,92 @@ namespace math_backend::mat44 m_store(m, ma); } + // Vector Multiply: m * v (assume w = 0.0f) + inline void mat44_mul_vec3_impl(const float* m, const float* v, float* r) + { + f32x4x4 ma = m_load(m); + f32x4 va = v_load3_vec(v); + f32x4 vr = m_mul_vec3(ma, va); + v_store3(r, vr); + } + + // Matrix Inverse inline void mat44_inverse_impl(float* m) { - f32x4x4 ma = m_load(m); + // using Cramers Rule find the Inverse + // Minv = (1/det(M)) * adjoint(M) + float det = mat44_get_determinant(m); + float invDet = 1.0f / det; + float temp[16]; + temp[0] = (m[5] * m[10] - m[6] * m[9]) * invDet; + temp[1] = (m[9] * m[2] - m[10] * m[1]) * invDet; + temp[2] = (m[1] * m[6] - m[2] * m[5]) * invDet; - // Compute cofactors using cross products - f32x4x4 mTemp; - mTemp.r0 = v_cross(ma.r1, ma.r2); - mTemp.r1 = v_cross(ma.r2, ma.r0); - mTemp.r2 = v_cross(ma.r0, ma.r1); + temp[4] = (m[6] * m[8] - m[4] * m[10]) * invDet; + temp[5] = (m[10] * m[0] - m[8] * m[2]) * invDet; + temp[6] = (m[2] * m[4] - m[0] * m[6]) * invDet; - // Determinant = dot(ma.r0, c0) - f32x4 det = v_dot3(ma.r0, mTemp.r0); - f32x4 invDet = v_rcp_nr(det); + temp[8] = (m[4] * m[9] - m[5] * m[8]) * invDet; + temp[9] = (m[8] * m[1] - m[9] * m[0]) * invDet; + temp[10] = (m[0] * m[5] - m[1] * m[4]) * invDet; - // Scale cofactors - mTemp.r0 = v_mul(mTemp.r0, invDet); - mTemp.r1 = v_mul(mTemp.r1, invDet); - mTemp.r2 = v_mul(mTemp.r2, invDet); + m[0] = temp[0]; + m[1] = temp[1]; + m[2] = temp[2]; - // Store inverse 3x3 (transpose of cofactor matrix) + m[4] = temp[4]; + m[5] = temp[5]; + m[6] = temp[6]; - mTemp = m_transpose(mTemp); - mTemp.r3 = ma.r3; + m[8] = temp[8]; + 
m[9] = temp[9]; + m[10] = temp[10]; - // ---- Translation ---- - - // Load original translation - f32x4 T = v_set(m[3], m[7], m[11], 0.0f); - - // Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2) - f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0)); - result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1))); - result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2))); - result = v_mul(result, v_set1(-1.0f)); - - m_store(m, mTemp); - - // Store translation - m[3] = v_extract0(result); - m[7] = v_extract0(v_swizzle_singular_mask(result, 1)); - m[11] = v_extract0(v_swizzle_singular_mask(result, 2)); + // invert the translation + temp[0] = -m[3]; + temp[1] = -m[7]; + temp[2] = -m[11]; + mat44_mul_vec3_impl(m, temp, &temp[4]); + m[3] = temp[4]; + m[7] = temp[5]; + m[11] = temp[6]; } // Matrix Inverse - inline void mat44_inverse_to_impl(const float* m, float* dst) + inline void mat44_inverse_to_impl(const float* m, float* d) { - f32x4x4 ma = m_load(m); + // using Cramers Rule find the Inverse + // Minv = (1/det(M)) * adjoint(M) + float det = mat44_get_determinant(m); - // Compute cofactors using cross products - f32x4x4 mTemp; - mTemp.r0 = v_cross(ma.r1, ma.r2); - mTemp.r1 = v_cross(ma.r2, ma.r0); - mTemp.r2 = v_cross(ma.r0, ma.r1); + float invDet = 1.0f / det; - // Determinant = dot(ma.r0, c0) - f32x4 det = v_dot3(ma.r0, mTemp.r0); - f32x4 invDet = v_rcp_nr(det); + d[0] = (m[5] * m[10] - m[6] * m[9]) * invDet; + d[1] = (m[9] * m[2] - m[10] * m[1]) * invDet; + d[2] = (m[1] * m[6] - m[2] * m[5]) * invDet; - // Scale cofactors - mTemp.r0 = v_mul(mTemp.r0, invDet); - mTemp.r1 = v_mul(mTemp.r1, invDet); - mTemp.r2 = v_mul(mTemp.r2, invDet); + d[4] = (m[6] * m[8] - m[4] * m[10]) * invDet; + d[5] = (m[10] * m[0] - m[8] * m[2]) * invDet; + d[6] = (m[2] * m[4] - m[0] * m[6]) * invDet; - // Store inverse 3x3 (transpose of cofactor matrix) + d[8] = (m[4] * m[9] - m[5] * m[8]) * invDet; + d[9] = (m[8] * m[1] - m[9] * m[0]) * invDet; + d[10] = (m[0] * m[5] - 
m[1] * m[4]) * invDet; - mTemp = m_transpose(mTemp); - mTemp.r3 = ma.r3; - - // ---- Translation ---- - - // Load original translation - f32x4 T = v_set(m[3], m[7], m[11], 0.0f); - - // Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2) - f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0)); - result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1))); - result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2))); - result = v_mul(result, v_set1(-1.0f)); - - m_store(dst, mTemp); - - // Store translation - dst[3] = v_extract0(result); - dst[7] = v_extract0(v_swizzle_singular_mask(result, 1)); - dst[11] = v_extract0(v_swizzle_singular_mask(result, 2)); + // invert the translation + float temp[6]; + temp[0] = -m[3]; + temp[1] = -m[7]; + temp[2] = -m[11]; + mat44_mul_vec3_impl(d, temp, &temp[3]); + d[3] = temp[3]; + d[7] = temp[4]; + d[11] = temp[5]; + d[12] = m[12]; + d[13] = m[13]; + d[14] = m[14]; + d[15] = m[15]; } // Matrix Affine Inverse @@ -275,15 +310,6 @@ namespace math_backend::mat44 v_store3(r, vr); } - // Vector Multiply: m * v (assume w = 0.0f) - inline void mat44_mul_vec3_impl(const float* m, const float* v, float* r) - { - f32x4x4 ma = m_load(m); - f32x4 va = v_load3_vec(v); - f32x4 vr = m_mul_vec3(ma, va); - v_store3(r, vr); - } - // Vector Multiply: m * p (full [4x4] * [1x4]) inline void mat44_mul_float4_impl(const float* m, const float* p, float* r) { From ce8ee9462496df793171e0d52d497d2b3cb40320 Mon Sep 17 00:00:00 2001 From: AzaezelX Date: Tue, 17 Mar 2026 13:15:40 -0500 Subject: [PATCH 2/4] fix CI --- Tools/CMake/torque_macros.cmake | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/Tools/CMake/torque_macros.cmake b/Tools/CMake/torque_macros.cmake index fe0cec4c6..ca1a65325 100644 --- a/Tools/CMake/torque_macros.cmake +++ b/Tools/CMake/torque_macros.cmake @@ -176,7 +176,27 @@ function(add_math_backend name compile_defs) elseif(name STREQUAL "avx2") target_compile_options(math_${name} PRIVATE 
-mavx2 -mfma) elseif(name STREQUAL "neon") - target_compile_options(math_${name} PRIVATE -march=armv8-a) + if(APPLE) + set_target_properties(math_${name} PROPERTIES OSX_ARCHITECTURES "arm64") + else() + target_compile_options(math_${name} PRIVATE -march=armv8-a) + endif() + endif() + endif() + + if(APPLE) + # ARM-only backend + if(name STREQUAL "neon") + set_target_properties(math_${name} PROPERTIES + OSX_ARCHITECTURES "arm64" + ) + endif() + + # x86-only backends + if(name MATCHES "sse2|sse41|avx|avx2") + set_target_properties(math_${name} PROPERTIES + OSX_ARCHITECTURES "x86_64" + ) endif() endif() From ff1b4d47e5c2961bf430e64686f83eb9161f2300 Mon Sep 17 00:00:00 2001 From: AzaezelX Date: Tue, 17 Mar 2026 14:04:54 -0500 Subject: [PATCH 3/4] further followups to CI compilation config --- Tools/CMake/torqueMacOSconfigs.cmake | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/Tools/CMake/torqueMacOSconfigs.cmake b/Tools/CMake/torqueMacOSconfigs.cmake index 793771c13..1d3ba7fbc 100644 --- a/Tools/CMake/torqueMacOSconfigs.cmake +++ b/Tools/CMake/torqueMacOSconfigs.cmake @@ -9,22 +9,22 @@ set(CMAKE_SHARED_LIBRARY_RUNTIME_C_FLAG "-Wl,-rpath,") # minimum for multi arch build is 11. 
set(CMAKE_OSX_DEPLOYMENT_TARGET "11" CACHE STRING "" FORCE) -set(CMAKE_OSX_ARCHITECTURES "x86_64;arm64" CACHE STRING "" FORCE) +set(CMAKE_OSX_ARCHITECTURES "x86_64;arm64" CACHE STRING "Architectures to build" FORCE) set(CMAKE_XCODE_ATTRIBUTE_MACOSX_DEPLOYMENT_TARGET[arch=arm64] "11.0" CACHE STRING "arm 64 minimum deployment target" FORCE) -if(CMAKE_OSX_ARCHITECTURES MATCHES "((^|;|, )(arm64|arm64e|x86_64))+") - set(CMAKE_C_SIZEOF_DATA_PTR 8) - set(CMAKE_CXX_SIZEOF_DATA_PTR 8) - if(CMAKE_OSX_ARCHITECTURES MATCHES "((^|;|, )(arm64|arm64e))+") - set(CMAKE_SYSTEM_PROCESSOR "aarch64") - else() - set(CMAKE_SYSTEM_PROCESSOR "x86_64") - endif() -else() - set(CMAKE_C_SIZEOF_DATA_PTR 4) - set(CMAKE_CXX_SIZEOF_DATA_PTR 4) - set(CMAKE_SYSTEM_PROCESSOR "arm") -endif() +# if(CMAKE_OSX_ARCHITECTURES MATCHES "((^|;|, )(arm64|arm64e|x86_64))+") +# set(CMAKE_C_SIZEOF_DATA_PTR 8) +# set(CMAKE_CXX_SIZEOF_DATA_PTR 8) +# if(CMAKE_OSX_ARCHITECTURES MATCHES "((^|;|, )(arm64|arm64e))+") +# set(CMAKE_SYSTEM_PROCESSOR "aarch64") +# else() +# set(CMAKE_SYSTEM_PROCESSOR "x86_64") +# endif() +# else() +# set(CMAKE_C_SIZEOF_DATA_PTR 4) +# set(CMAKE_CXX_SIZEOF_DATA_PTR 4) +# set(CMAKE_SYSTEM_PROCESSOR "arm") +# endif() # Enable codesigning with secure timestamp when not in Debug configuration (required for Notarization) set(CMAKE_XCODE_ATTRIBUTE_OTHER_CODE_SIGN_FLAGS[variant=Release] "--timestamp") From 832d1afec4dfeec70b83be95669d61a242ceceac Mon Sep 17 00:00:00 2001 From: AzaezelX Date: Tue, 17 Mar 2026 14:55:23 -0500 Subject: [PATCH 4/4] further followups to CI compilation config --- Engine/source/CMakeLists.txt | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/Engine/source/CMakeLists.txt b/Engine/source/CMakeLists.txt index d439a06e2..2ca86a2f7 100644 --- a/Engine/source/CMakeLists.txt +++ b/Engine/source/CMakeLists.txt @@ -501,12 +501,23 @@ string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" ARCH) set(IS_X86 FALSE) set(IS_ARM FALSE) -if(ARCH MATCHES 
"x86_64|amd64|i[3-6]86") - set(IS_X86 TRUE) -endif() - -if(ARCH MATCHES "arm64|aarch64") - set(IS_ARM TRUE) +if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + # Use the CMAKE_OSX_ARCHITECTURES list for universal builds + foreach(arch IN LISTS CMAKE_OSX_ARCHITECTURES) + if(arch STREQUAL "x86_64") + set(IS_X86 TRUE) + elseif(arch STREQUAL "arm64") + set(IS_ARM TRUE) + endif() + endforeach() +else() + # Non-macOS detection + if(ARCH MATCHES "arm64|aarch64") + set(IS_ARM TRUE) + endif() + if(ARCH MATCHES "x86_64|amd64|i[3-6]86") + set(IS_X86 TRUE) + endif() endif() # always available