diff --git a/Engine/source/CMakeLists.txt b/Engine/source/CMakeLists.txt index 029ef89ec..d439a06e2 100644 --- a/Engine/source/CMakeLists.txt +++ b/Engine/source/CMakeLists.txt @@ -130,7 +130,7 @@ torqueAddSourceDirectories("windowManager" "windowManager/torque" "windowManager torqueAddSourceDirectories("scene" "scene/culling" "scene/zones" "scene/mixin") # Handle math -torqueAddSourceDirectories("math" "math/util") +torqueAddSourceDirectories("math" "math/util" "math/public" "math/impl") # note impl must skip the .inl files, never use them in engine code. # Handle persistence set(TORQUE_INCLUDE_DIRECTORIES ${TORQUE_INCLUDE_DIRECTORIES} "persistence/rapidjson") @@ -496,6 +496,37 @@ else() set_target_properties(${TORQUE_APP_NAME} PROPERTIES LINK_FLAGS "-Wl,-rpath,./") endif() +string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" ARCH) + +set(IS_X86 FALSE) +set(IS_ARM FALSE) + +if(ARCH MATCHES "x86_64|amd64|i[3-6]86") + set(IS_X86 TRUE) +endif() + +if(ARCH MATCHES "arm64|aarch64") + set(IS_ARM TRUE) +endif() + +# always available +add_math_backend(scalar MATH_SIMD_SCALAR) +message(STATUS "Processor: ${CMAKE_SYSTEM_PROCESSOR}") +message(STATUS "IS_X86=${IS_X86}") +message(STATUS "IS_ARM=${IS_ARM}") + +# x86 family +if(IS_X86) + add_math_backend(sse2 MATH_SIMD_SSE2) + add_math_backend(sse41 MATH_SIMD_SSE41) + add_math_backend(avx MATH_SIMD_AVX) + add_math_backend(avx2 MATH_SIMD_AVX2) +endif() + +# ARM family +if(IS_ARM) + add_math_backend(neon MATH_SIMD_NEON) +endif() if(MSVC) # Match projectGenerator naming for executables diff --git a/Engine/source/environment/meshRoad.cpp b/Engine/source/environment/meshRoad.cpp index 0b10f4a29..ea09c4352 100644 --- a/Engine/source/environment/meshRoad.cpp +++ b/Engine/source/environment/meshRoad.cpp @@ -3406,7 +3406,7 @@ MatrixF MeshRoad::getNodeTransform( U32 idx ) mat.setColumn( 2, node.normal ); mat.setColumn( 3, node.point ); - AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!"); + AssertFatal(mat.determinant() != 0.0f, "no 
inverse!"); return mat; } @@ -3456,7 +3456,7 @@ void MeshRoad::calcSliceTransform( U32 idx, MatrixF &mat ) mat.setColumn( 2, slice.normal ); mat.setColumn( 3, slice.p1 ); - AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!"); + AssertFatal(mat.determinant() != 0.0f, "no inverse!"); } F32 MeshRoad::getRoadLength() const diff --git a/Engine/source/environment/river.cpp b/Engine/source/environment/river.cpp index acefb66b3..a3456cf17 100644 --- a/Engine/source/environment/river.cpp +++ b/Engine/source/environment/river.cpp @@ -2139,7 +2139,7 @@ MatrixF River::getNodeTransform( U32 idx ) const mat.setColumn( 2, node.normal ); mat.setColumn( 3, node.point ); - AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!"); + AssertFatal( mat.determinant() != 0.0f, "no inverse!"); return mat; } diff --git a/Engine/source/math/impl/float3_impl.inl b/Engine/source/math/impl/float3_impl.inl new file mode 100644 index 000000000..ec67fff8d --- /dev/null +++ b/Engine/source/math/impl/float3_impl.inl @@ -0,0 +1,122 @@ +#pragma once +#include // for sqrtf, etc. 
+#include "../mConstants.h" + +// Safely loads a float3 -> simd 4 lane backend +namespace math_backend::float3 +{ + //---------------------------------------------------------- + // Add two float4 vectors: r = a + b + inline void float3_add_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vr = v_add(va, vb); + v_store3(r, vr); + } + + // Subtract: r = a - b + inline void float3_sub_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vr = v_sub(va, vb); + v_store3(r, vr); + } + + // Multiply element-wise: r = a * b + inline void float3_mul_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vr = v_mul(va, vb); + v_store3(r, vr); + } + + // Multiply by scalar: r = a * s + inline void float3_mul_scalar_impl(const float* a, float s, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vs = v_set1(s); + f32x4 vr = v_mul(va, vs); + v_store3(r, vr); + } + + // Divide element-wise: r = a / b + inline void float3_div_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vr = v_div(va, vb); + v_store3(r, vr); + } + + // Divide by scalar: r = a / s + inline void float3_div_scalar_impl(const float* a, float s, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vs = v_set1(s); + f32x4 vr = v_div(va, vs); + v_store3(r, vr); + } + + // Dot product: returns scalar + inline float float3_dot_impl(const float* a, const float* b) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vdot = v_dot3(va, vb); + return v_extract0(vdot); // first lane is the sum of 3 elements + } + + // Length squared + inline float float3_length_squared_impl(const float* a) + { + return float3_dot_impl(a, a); + } + + // Length + inline float float3_length_impl(const float* a) + { + return std::sqrt(float3_length_squared_impl(a)); + } + 
+ // Normalize in-place + inline void float3_normalize_impl(float* a) + { + f32x4 va = v_load3_vec(a); + f32x4 vr = v_normalize3(va); + v_store3(a, vr); + } + + // Normalize with magnitude: r = normalize(a) * r + inline void float3_normalize_mag_impl(float* a, float r) + { + f32x4 va = v_load3_vec(a); + + // invLen = r / sqrt(dot(a,a)) = r * rsqrt(dot(a,a)) + f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot3(va, va))); + + f32x4 vnorm = v_mul(va, invLen); + v_store3(a, vnorm); + } + + // Linear interpolation: r = from + (to - from) * f + inline void float3_lerp_impl(const float* from, const float* to, float f, float* r) + { + f32x4 vfrom = v_load3_vec(from); + f32x4 vto = v_load3_vec(to); + f32x4 vf = v_set1(f); + f32x4 vr = v_add(vfrom, v_mul(vf, v_sub(vto, vfrom))); + v_store3(r, vr); + } + + inline void float3_cross_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vcross = v_cross(va, vb); + v_store3(r, vcross); + } + +} diff --git a/Engine/source/math/impl/float4_impl.inl b/Engine/source/math/impl/float4_impl.inl new file mode 100644 index 000000000..9371ae7c3 --- /dev/null +++ b/Engine/source/math/impl/float4_impl.inl @@ -0,0 +1,123 @@ +#pragma once +#include // for sqrtf, etc. 
+#include "../mConstants.h" + +namespace math_backend::float4 +{ + + //---------------------------------------------------------- + // Add two float4 vectors: r = a + b + inline void float4_add_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load(a); + f32x4 vb = v_load(b); + f32x4 vr = v_add(va, vb); + v_store(r, vr); + } + + // Subtract: r = a - b + inline void float4_sub_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load(a); + f32x4 vb = v_load(b); + f32x4 vr = v_sub(va, vb); + v_store(r, vr); + } + + // Multiply element-wise: r = a * b + inline void float4_mul_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load(a); + f32x4 vb = v_load(b); + f32x4 vr = v_mul(va, vb); + v_store(r, vr); + } + + // Multiply by scalar: r = a * s + inline void float4_mul_scalar_impl(const float* a, float s, float* r) + { + f32x4 va = v_load(a); + f32x4 vs = v_set1(s); + f32x4 vr = v_mul(va, vs); + v_store(r, vr); + } + + // Divide element-wise: r = a / b + inline void float4_div_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load(a); + f32x4 vb = v_load(b); + f32x4 vr = v_div(va, vb); + v_store(r, vr); + } + + // Divide by scalar: r = a / s + inline void float4_div_scalar_impl(const float* a, float s, float* r) + { + f32x4 va = v_load(a); + f32x4 vs = v_set1(s); + f32x4 vr = v_div(va, vs); + v_store(r, vr); + } + + // Dot product: returns scalar + inline float float4_dot_impl(const float* a, const float* b) + { + f32x4 va = v_load(a); + f32x4 vb = v_load(b); + f32x4 vdot = v_dot4(va, vb); // calls ISA-specific implementation + return v_extract0(vdot); + } + + // Length squared + inline float float4_length_squared_impl(const float* a) + { + return float4_dot_impl(a, a); + } + + // Length + inline float float4_length_impl(const float* a) + { + return std::sqrt(float4_length_squared_impl(a)); + } + + // Normalize in-place + inline void float4_normalize_impl(float* a) + { + f32x4 va = v_load(a); + f32x4 invLen = 
v_rsqrt_nr(v_dot4(va, va)); // fully abstracted + f32x4 vnorm = v_mul(va, invLen); + v_store(a, vnorm); + } + + // Normalize with magnitude: r = normalize(a) * r + inline void float4_normalize_mag_impl(float* a, float r) + { + f32x4 va = v_load(a); + + // invLen = r / sqrt(dot(a,a)) = r * rsqrt(dot(a,a)) + f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot4(va, va))); + + f32x4 vnorm = v_mul(va, invLen); + v_store(a, vnorm); + } + + // Linear interpolation: r = from + (to - from) * f + inline void float4_lerp_impl(const float* from, const float* to, float f, float* r) + { + f32x4 vfrom = v_load(from); + f32x4 vto = v_load(to); + f32x4 vf = v_set1(f); + f32x4 vr = v_add(vfrom, v_mul(vf, v_sub(vto, vfrom))); + v_store(r, vr); + } + + inline void float4_cross_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load(a); + f32x4 vb = v_load(b); + f32x4 vcross = v_cross(va, vb); + v_store(r, vcross); + } + +} // namespace math_backend::float4 diff --git a/Engine/source/math/impl/mat44_impl.inl b/Engine/source/math/impl/mat44_impl.inl new file mode 100644 index 000000000..da6971d0d --- /dev/null +++ b/Engine/source/math/impl/mat44_impl.inl @@ -0,0 +1,424 @@ +#pragma once +#include // for sqrtf, etc. 
+#include "../mConstants.h" + +namespace math_backend::mat44 +{ + //------------------------------------------------------------------ + // Matrix Transpose + inline void mat44_transpose_impl(float* m) + { + f32x4x4 ma = m_load(m); + f32x4x4 mr = m_transpose(ma); + m_store(m, mr); + } + + inline float mat44_get_determinant(const float* m) + { + f32x4x4 ma = m_load(m); + return v_extract0(m_determinant_affine(ma)); + } + + // Matrix Scale: Float3 (assume w = 1.0f) + inline void mat44_scale_impl(float* m, const float* s) + { + f32x4x4 ma = m_load(m); + f32x4 va = v_load3_pos(s); + + ma.r0 = v_mul(ma.r0, va); + ma.r1 = v_mul(ma.r1, va); + ma.r2 = v_mul(ma.r2, va); + ma.r3 = v_mul(ma.r3, va); + m_store(m, ma); + } + + inline void mat44_transform_plane_impl(const float* m, const float* scale, const float* plane, float* plane_result) + { + f32x4x4 M = m_load(m); + + f32x4 plane_v = v_load(plane); + f32x4 scale_v = v_load3_vec(scale); + f32x4 invScale = v_rcp_nr(scale_v); + + // normal = plane.xyz + f32x4 normal = plane_v; + + // apply Inv(s) + normal = v_mul(normal, invScale); + + // multiply by Inv(Tr(m)) (only the rotation part matters) + f32x4 nx = v_mul(v_swizzle_singular_mask(normal, 0), M.r0); + f32x4 ny = v_mul(v_swizzle_singular_mask(normal, 1), M.r1); + f32x4 nz = v_mul(v_swizzle_singular_mask(normal, 2), M.r2); + + normal = v_add(v_add(nx, ny), nz); + + normal = v_normalize3(normal); + + // compute point on plane + float d = v_extract0(v_swizzle_singular_mask(plane_v, 3)); + + f32x4 point = v_mul(plane_v, v_set1(-d)); + point = v_preserve_w(point, v_set1(1.0f)); + + // apply scale + point = v_mul(point, scale_v); + + // transform point by matrix + point = m_mul_vec4(M, point); + + // compute new plane distance + float newD = -v_extract0(v_dot3(point, normal)); + + alignas(16) float n[4]; + v_store(n, normal); + + plane_result[0] = n[0]; + plane_result[1] = n[1]; + plane_result[2] = n[2]; + plane_result[3] = newD; + } + + inline void mat44_get_scale_impl(const 
float* m, float* s) + { + f32x4x4 ma = m_load(m); + + // squared lengths + f32x4 len2_x = v_dot3(ma.r0, ma.r0); + f32x4 len2_y = v_dot3(ma.r1, ma.r1); + f32x4 len2_z = v_dot3(ma.r2, ma.r2); + + // extract and sqrt + s[0] = 1.0f / v_extract0(v_rsqrt_nr(len2_x)); + s[1] = 1.0f / v_extract0(v_rsqrt_nr(len2_y)); + s[2] = 1.0f / v_extract0(v_rsqrt_nr(len2_z)); + } + + // Matrix Scale Uniform: Float value (assume w = 1.0f) + inline void mat44_scale_uniform(float* m, float s) + { + f32x4x4 ma = m_load(m); + + // (s, s, s, 1) + f32x4 scale = v_set(s, s, s, 1.0f); + + // Scale only rotation rows (xyz part) + ma.r0 = v_mul(ma.r0, scale); + ma.r1 = v_mul(ma.r1, scale); + ma.r2 = v_mul(ma.r2, scale); + m_store(m, ma); + } + + // Matrix Inverse + inline void mat44_inverse_impl(float* m) + { + f32x4x4 ma = m_load(m); + + // Compute cofactors using cross products + f32x4x4 mTemp; + mTemp.r0 = v_cross(ma.r1, ma.r2); + mTemp.r1 = v_cross(ma.r2, ma.r0); + mTemp.r2 = v_cross(ma.r0, ma.r1); + + // Determinant = dot(ma.r0, c0) + f32x4 det = v_dot3(ma.r0, mTemp.r0); + f32x4 invDet = v_rcp_nr(det); + + // Scale cofactors + mTemp.r0 = v_mul(mTemp.r0, invDet); + mTemp.r1 = v_mul(mTemp.r1, invDet); + mTemp.r2 = v_mul(mTemp.r2, invDet); + + // Store inverse 3x3 (transpose of cofactor matrix) + + mTemp = m_transpose(mTemp); + mTemp.r3 = ma.r3; + + // ---- Translation ---- + + // Load original translation + f32x4 T = v_set(m[3], m[7], m[11], 0.0f); + + // Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2) + f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0)); + result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1))); + result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2))); + result = v_mul(result, v_set1(-1.0f)); + + m_store(m, mTemp); + + // Store translation + m[3] = v_extract0(result); + m[7] = v_extract0(v_swizzle_singular_mask(result, 1)); + m[11] = v_extract0(v_swizzle_singular_mask(result, 2)); + } + + // Matrix Inverse + inline void mat44_inverse_to_impl(const 
float* m, float* dst) + { + f32x4x4 ma = m_load(m); + + // Compute cofactors using cross products + f32x4x4 mTemp; + mTemp.r0 = v_cross(ma.r1, ma.r2); + mTemp.r1 = v_cross(ma.r2, ma.r0); + mTemp.r2 = v_cross(ma.r0, ma.r1); + + // Determinant = dot(ma.r0, c0) + f32x4 det = v_dot3(ma.r0, mTemp.r0); + f32x4 invDet = v_rcp_nr(det); + + // Scale cofactors + mTemp.r0 = v_mul(mTemp.r0, invDet); + mTemp.r1 = v_mul(mTemp.r1, invDet); + mTemp.r2 = v_mul(mTemp.r2, invDet); + + // Store inverse 3x3 (transpose of cofactor matrix) + + mTemp = m_transpose(mTemp); + mTemp.r3 = ma.r3; + + // ---- Translation ---- + + // Load original translation + f32x4 T = v_set(m[3], m[7], m[11], 0.0f); + + // Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2) + f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0)); + result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1))); + result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2))); + result = v_mul(result, v_set1(-1.0f)); + + m_store(dst, mTemp); + + // Store translation + dst[3] = v_extract0(result); + dst[7] = v_extract0(v_swizzle_singular_mask(result, 1)); + dst[11] = v_extract0(v_swizzle_singular_mask(result, 2)); + } + + // Matrix Affine Inverse + inline void mat44_affine_inverse_impl(float* m) + { + f32x4x4 ma = m_load(m); + + f32x4x4 mTemp = m_transpose(ma); + mTemp.r3 = v_set(0, 0, 0, 1); + + // ---- Translation ---- + // Load original translation + f32x4 T = v_set(m[3], m[7], m[11], 0.0f); + + // Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2) + f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0)); + result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1))); + result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2))); + result = v_mul(result, v_set1(-1.0f)); + + m_store(m, mTemp); + + // Store translation + m[3] = v_extract0(result); + m[7] = v_extract0(v_swizzle_singular_mask(result, 1)); + m[11] = v_extract0(v_swizzle_singular_mask(result, 2)); + } + + inline void mat44_normalize_impl(float* m) + 
{ + // Load the matrix into SIMD registers + f32x4x4 mat = m_load(m); + + // Transpose: now rows are columns + mat = m_transpose(mat); + + // Extract columns (which are now rows) + f32x4 col0 = mat.r0; + f32x4 col1 = mat.r1; + + // Rebuild orthonormal basis + f32x4 col2 = v_cross(col0, col1); + col1 = v_cross(col2, col0); + + // Normalize columns + col0 = v_normalize3(col0); + col1 = v_normalize3(col1); + col2 = v_normalize3(col2); + + // Write back directly into transposed matrix + mat.r0 = col0; + mat.r1 = col1; + mat.r2 = col2; + + // Transpose back to row-major + mat = m_transpose(mat); + + // Store back + m_store(m, mat); + } + + // Matrix Multiply: a * b + inline void mat44_mul_mat44_impl(const float* a, const float* b, float* r) + { + f32x4x4 ma = m_load(a); + f32x4x4 mb = m_load(b); + + f32x4x4 mr = m_mul(ma, mb); + m_store(r, mr); + } + + // Vector Multiply: m * p (assume w = 1.0f) + inline void mat44_mul_pos3_impl(const float *m, const float *p, float* r) + { + f32x4x4 ma = m_load(m); + f32x4 va = v_load3_pos(p); + f32x4 vr = m_mul_vec4(ma, va); + v_store3(r, vr); + } + + // Vector Multiply: m * v (assume w = 0.0f) + inline void mat44_mul_vec3_impl(const float* m, const float* v, float* r) + { + f32x4x4 ma = m_load(m); + f32x4 va = v_load3_vec(v); + f32x4 vr = m_mul_vec3(ma, va); + v_store3(r, vr); + } + + // Vector Multiply: m * p (full [4x4] * [1x4]) + inline void mat44_mul_float4_impl(const float* m, const float* p, float* r) + { + f32x4x4 ma = m_load(m); + f32x4 va = v_load(p); + f32x4 vr = m_mul_vec4(ma, va); + v_store(r, vr); + } + + //-------------------------------------------------- + // MATRIX ROTATION FUNCTIONS + //-------------------------------------------------- + + inline void mat44_rotation_x_impl(float* m, float angle) + { + float c = cosf(angle), s = sinf(angle); + f32x4x4 mr = m_identity(); + mr.r1 = v_set(0, c, s, 0); + mr.r2 = v_set(0, -s, c, 0); + m_store(m, mr); + } + + inline void mat44_rotation_y_impl(float* m, float angle) + { + 
float c = cosf(angle), s = sinf(angle); + f32x4x4 mr = m_identity(); + mr.r0 = v_set(c, 0, -s, 0); + mr.r2 = v_set(s, 0, c, 0); + m_store(m, mr); + } + + inline void mat44_rotation_z_impl(float* m, float angle) + { + float c = cosf(angle), s = sinf(angle); + f32x4x4 mr = m_identity(); + mr.r0 = v_set(c, s, 0, 0); + mr.r1 = v_set(-s, c, 0, 0); + m_store(m, mr); + } + + // Compose rotation from Euler angles (pitch=X, yaw=Y, roll=Z) + inline void mat44_rotation_euler_impl(float* m, float pitch, float yaw, float roll) + { + f32x4x4 rx, ry, rz; + mat44_rotation_x_impl((float*)&rx, pitch); + mat44_rotation_y_impl((float*)&ry, yaw); + mat44_rotation_z_impl((float*)&rz, roll); + + f32x4x4 r = m_mul(rz, m_mul(ry, rx)); + m_store(m, r); + } + + inline void mat44_lookat_impl(float* m, const float* eye, const float* target, const float* up) + { + f32x4 vEye = v_load3_pos(eye); + f32x4 vTarget = v_load3_pos(target); + f32x4 vUp = v_load3_vec(up); + + // Forward (z+) + f32x4 zaxis = v_normalize3(v_sub(vTarget, vEye)); + + // Right (x+) + f32x4 xaxis = v_normalize3(v_cross(vUp, zaxis)); + + // Up (y+) + f32x4 yaxis = v_cross(zaxis, xaxis); + + // Compute translation components: -dot(axis, eye) + f32x4 t_x = v_mul(v_dot3(xaxis, vEye), v_set1(-1.0f)); + f32x4 t_y = v_mul(v_dot3(yaxis, vEye), v_set1(-1.0f)); + f32x4 t_z = v_mul(v_dot3(zaxis, vEye), v_set1(-1.0f)); + + f32x4x4 view; + view.r0 = v_insert_w(xaxis, t_x); + view.r1 = v_insert_w(yaxis, t_y); + view.r2 = v_insert_w(zaxis, t_z); + view.r3 = v_set(0, 0, 0, 1.0f); + + m_store(m, view); + } + + inline void mat44_perspective_impl(float* m, float fovY, float aspect, float znear, float zfar) + { + float f = 1.0f / tanf(fovY * 0.5f); + float nf = 1.0f / (znear - zfar); + + f32x4x4 mp = m_zero(); + mp.r0 = v_set(f / aspect, 0, 0, 0); + mp.r1 = v_set(0, f, 0, 0); + mp.r2 = v_set(0, 0, (zfar + znear) * nf, 2 * zfar * znear * nf); + mp.r3 = v_set(0, 0, -1, 0); // row-major projection + m_store(m, mp); + } + + inline void 
mat44_orthographic_impl(float* m, float left, float right, float bottom, float top, float znear, float zfar) + { + f32x4x4 mo = m_zero(); + mo.r0 = v_set(2.0f / (right - left), 0, 0, -(right + left) / (right - left)); + mo.r1 = v_set(0, 2.0f / (top - bottom), 0, -(top + bottom) / (top - bottom)); + mo.r2 = v_set(0, 0, -2.0f / (zfar - znear), -(zfar + znear) / (zfar - znear)); + mo.r3 = v_set(0, 0, 0, 1.0f); + m_store(m, mo); + } + + //-------------------------------------------------- + // MATRIX BATCH FUNCTIONS + //-------------------------------------------------- + + inline void mat44_batch_mul_pos3(const float* m, const float* points, int count, float* result) + { + int i = 0; + // AVX has 8 lanes to play with +#if defined(MATH_SIMD_AVX2) || defined(MATH_SIMD_AVX) + // 8-wide AVX only + for (; i + 8 <= count; i += 8) + { + vec4_batch8 va = load_vec3_batch8(&points[i*3], 1.0f, false); + vec4_batch8 vr = m_mul_pos3_batch8(m, va); + store_vec3_batch8(&result[i*3], vr); + } +#endif // MATH_SIMD_AVX2 || MATH_SIMD_AVX + + // 4-wide + for (; i + 4 <= count; i += 4) + { + vec4_batch4 va = load_vec3_batch4(&points[i * 3], 1.0f, false); + vec4_batch4 vr = m_mul_pos3_batch4(m, va); + store_vec3_batch4(&result[i * 3], vr); + } + + for (; i < count; ++i) + { + size_t idx = i * 3; + mat44_mul_pos3_impl(m, &points[idx], &result[idx]); + } + } + +} // namespace math_backend::mat44 diff --git a/Engine/source/math/impl/math_c.cpp b/Engine/source/math/impl/math_c.cpp new file mode 100644 index 000000000..285f43abc --- /dev/null +++ b/Engine/source/math/impl/math_c.cpp @@ -0,0 +1,500 @@ +#include "platform/platform.h" +#include "math/public/float4_dispatch.h" +#include "math/public/float3_dispatch.h" +#include "math/public/mat44_dispatch.h" +#include "math/mConstants.h" +#include "math/mMatrix.h" +#include // for sqrtf, etc. 
+ +namespace math_backend::float4::dispatch +{ + void install_scalar() + { + gFloat4.add = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] + b[i]; + }; + + gFloat4.sub = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] - b[i]; + }; + + gFloat4.mul = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] * b[i]; + }; + + gFloat4.mul_scalar = [](const float* a, float s, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] * s; + }; + + gFloat4.div = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] / b[i]; + }; + + gFloat4.div_scalar = [](const float* a, float s, float* r) { + float denom = 1.0f / s; + for (int i = 0; i < 4; i++) r[i] = a[i] * denom; + }; + + gFloat4.dot = [](const float* a, const float* b) { + float sum = 0.f; + for (int i = 0; i < 4; i++) sum += a[i] * b[i]; + return sum; + }; + + gFloat4.length = [](const float* a) { + float sum = 0.f; + for (int i = 0; i < 4; i++) sum += a[i] * a[i]; + return std::sqrt(sum); + }; + + gFloat4.lengthSquared = [](const float* a) { + float sum = 0.f; + for (int i = 0; i < 4; i++) sum += a[i] * a[i]; + return (sum); + }; + + gFloat4.normalize = [](float* a) { + float len = gFloat4.length(a); + if (len > POINT_EPSILON) + { + float denom = 1.0f / len; + for (int i = 0; i < 4; i++) + a[i] *= denom; + } + }; + + gFloat4.normalize_mag = [](float* a, float f) { + float len = gFloat4.length(a); + if (len > POINT_EPSILON) + { + float denom = f / len; + for (int i = 0; i < 4; i++) a[i] *= denom; + } + }; + + gFloat4.lerp = [](const float* from, const float* to, float f, float* r) { + for (int i = 0; i < 4; i++) r[i] = from[i] + (to[i] - from[i]) * f; + }; + + gFloat4.cross = [](const float* a, const float* b, float* r) { + const float ax = a[0]; + const float ay = a[1]; + const float az = a[2]; + + const float bx = b[0]; + const float by = b[1]; + const float bz = b[2]; + + 
r[0] = ay * bz - az * by; + r[1] = az * bx - ax * bz; + r[2] = ax * by - ay * bx; + }; + } +} + +namespace math_backend::float3::dispatch +{ + void install_scalar() + { + gFloat3.add = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 3; i++) r[i] = a[i] + b[i]; + }; + + gFloat3.sub = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 3; i++) r[i] = a[i] - b[i]; + }; + + gFloat3.mul = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 3; i++) r[i] = a[i] * b[i]; + }; + + gFloat3.mul_scalar = [](const float* a, float s, float* r) { + for (int i = 0; i < 3; i++) r[i] = a[i] * s; + }; + + gFloat3.div = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 3; i++) r[i] = a[i] / b[i]; + }; + + gFloat3.div_scalar = [](const float* a, float s, float* r) { + float denom = 1.0f / s; + for (int i = 0; i < 3; i++) r[i] = a[i] * denom; + }; + + gFloat3.dot = [](const float* a, const float* b) { + float sum = 0.f; + for (int i = 0; i < 3; i++) sum += a[i] * b[i]; + return sum; + }; + + gFloat3.length = [](const float* a) { + float sum = 0.f; + for (int i = 0; i < 3; i++) sum += a[i] * a[i]; + return std::sqrt(sum); + }; + + gFloat3.lengthSquared = [](const float* a) { + float sum = 0.f; + for (int i = 0; i < 3; i++) sum += a[i] * a[i]; + return (sum); + }; + + gFloat3.normalize = [](float* a) { + float len = gFloat3.length(a); + if (len > POINT_EPSILON) + { + float denom = 1.0 / len; + for (int i = 0; i < 3; i++) a[i] *= denom; + } + }; + + gFloat3.normalize_mag = [](float* a, float f) { + float len = gFloat3.length(a); + if (len > POINT_EPSILON) + { + float denom = f / len; + for (int i = 0; i < 3; i++) a[i] *= denom; + } + }; + + gFloat3.lerp = [](const float* from, const float* to, float f, float* r) { + for (int i = 0; i < 3; i++) r[i] = from[i] + (to[i] - from[i]) * f; + }; + + gFloat3.cross = [](const float* a, const float* b, float* r) { + const float ax = a[0]; + const float ay = a[1]; + const float 
az = a[2]; + + const float bx = b[0]; + const float by = b[1]; + const float bz = b[2]; + + r[0] = ay * bz - az * by; + r[1] = az * bx - ax * bz; + r[2] = ax * by - ay * bx; + }; + } +} + +inline void swap(float& a, float& b) +{ + float temp = a; + a = b; + b = temp; +} + + +namespace math_backend::mat44::dispatch +{ + void install_scalar() + { + gMat44.transpose = [](float* a) { + swap(a[1], a[4]); + swap(a[2], a[8]); + swap(a[3], a[12]); + swap(a[6], a[9]); + swap(a[7], a[13]); + swap(a[11], a[14]); + }; + + gMat44.determinant = [](const float* m) { + return m[0] * (m[5] * m[10] - m[6] * m[9]) + + m[4] * (m[2] * m[9] - m[1] * m[10]) + + m[8] * (m[1] * m[6] - m[2] * m[5]); + }; + + gMat44.mul_vec3 = [](const float* a, const float* b, float* r) { +#ifdef TORQUE_COMPILER_GCC + const F32 v0 = b[0], v1 = b[1], v2 = b[2]; + const F32 m0 = a[0], m1 = a[1], m2 = a[2]; + const F32 m4 = a[4], m5 = a[5], m6 = a[6]; + const F32 m8 = a[8], m9 = a[9], m10 = a[10]; + + r[0] = m0 * v0 + m1 * v1 + m2 * v2; + r[1] = m4 * v0 + m5 * v1 + m6 * v2; + r[2] = m8 * v0 + m9 * v1 + m10 * v2; +#else + r[0] = a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; + r[1] = a[4] * b[0] + a[5] * b[1] + a[6] * b[2]; + r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2]; +#endif + + }; + + gMat44.inverse = [](float* m) { + // using Cramers Rule find the Inverse + // Minv = (1/det(M)) * adjoint(M) + float det = gMat44.determinant(m); + AssertFatal(det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); + float invDet = 1.0f / det; + float temp[16]; + temp[0] = (m[5] * m[10] - m[6] * m[9]) * invDet; + temp[1] = (m[9] * m[2] - m[10] * m[1]) * invDet; + temp[2] = (m[1] * m[6] - m[2] * m[5]) * invDet; + + temp[4] = (m[6] * m[8] - m[4] * m[10]) * invDet; + temp[5] = (m[10] * m[0] - m[8] * m[2]) * invDet; + temp[6] = (m[2] * m[4] - m[0] * m[6]) * invDet; + + temp[8] = (m[4] * m[9] - m[5] * m[8]) * invDet; + temp[9] = (m[8] * m[1] - m[9] * m[0]) * invDet; + temp[10] = (m[0] * m[5] - m[1] * m[4]) * invDet; + + 
m[0] = temp[0]; + m[1] = temp[1]; + m[2] = temp[2]; + + m[4] = temp[4]; + m[5] = temp[5]; + m[6] = temp[6]; + + m[8] = temp[8]; + m[9] = temp[9]; + m[10] = temp[10]; + + // invert the translation + temp[0] = -m[3]; + temp[1] = -m[7]; + temp[2] = -m[11]; + gMat44.mul_vec3(m, temp, &temp[4]); + m[3] = temp[4]; + m[7] = temp[5]; + m[11] = temp[6]; + + }; + + gMat44.inverse_to = [](const float* m, float* d) { + // using Cramers Rule find the Inverse + // Minv = (1/det(M)) * adjoint(M) + F32 det = gMat44.determinant(m); + AssertFatal(det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse."); + + F32 invDet = 1.0f / det; + + d[0] = (m[5] * m[10] - m[6] * m[9]) * invDet; + d[1] = (m[9] * m[2] - m[10] * m[1]) * invDet; + d[2] = (m[1] * m[6] - m[2] * m[5]) * invDet; + + d[4] = (m[6] * m[8] - m[4] * m[10]) * invDet; + d[5] = (m[10] * m[0] - m[8] * m[2]) * invDet; + d[6] = (m[2] * m[4] - m[0] * m[6]) * invDet; + + d[8] = (m[4] * m[9] - m[5] * m[8]) * invDet; + d[9] = (m[8] * m[1] - m[9] * m[0]) * invDet; + d[10] = (m[0] * m[5] - m[1] * m[4]) * invDet; + + // invert the translation + F32 temp[6]; + temp[0] = -m[3]; + temp[1] = -m[7]; + temp[2] = -m[11]; + gMat44.mul_vec3(d, temp, &temp[3]); + d[3] = temp[3]; + d[7] = temp[4]; + d[11] = temp[5]; + d[12] = m[12]; + d[13] = m[13]; + d[14] = m[14]; + d[15] = m[15]; + }; + + gMat44.affine_inverse = [](float* a) { + F32 temp[16]; + dMemcpy(temp, a, 16 * sizeof(F32)); + + // Transpose rotation + a[1] = temp[4]; + a[4] = temp[1]; + a[2] = temp[8]; + a[8] = temp[2]; + a[6] = temp[9]; + a[9] = temp[6]; + + a[3] = -(temp[0] * temp[3] + temp[4] * temp[7] + temp[8] * temp[11]); + a[7] = -(temp[1] * temp[3] + temp[5] * temp[7] + temp[9] * temp[11]); + a[11] = -(temp[2] * temp[3] + temp[6] * temp[7] + temp[10] * temp[11]); + }; + + gMat44.scale = [](float* a, const float* s) { + // Note, doesn't allow scaling w... 
+ + a[0] *= s[0]; a[1] *= s[1]; a[2] *= s[2]; + a[4] *= s[0]; a[5] *= s[1]; a[6] *= s[2]; + a[8] *= s[0]; a[9] *= s[1]; a[10] *= s[2]; + a[12] *= s[0]; a[13] *= s[1]; a[14] *= s[2]; + }; + + gMat44.get_scale = [](const float* a, float* s) { + // Note, doesn't allow scaling w... + s[0] = sqrt(a[0] * a[0] + a[4] * a[4] + a[8] * a[8]); + s[1] = sqrt(a[1] * a[1] + a[5] * a[5] + a[9] * a[9]); + s[2] = sqrt(a[2] * a[2] + a[6] * a[6] + a[10] * a[10]); + }; + + gMat44.mul_float4 = [](const float* a, const float* b, float* r) { + AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!"); + r[0] = a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]; + r[1] = a[4] * b[0] + a[5] * b[1] + a[6] * b[2] + a[7] * b[3]; + r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2] + a[11] * b[3]; + r[2] = a[12] * b[0] + a[13] * b[1] + a[14] * b[2] + a[15] * b[3]; + }; + + gMat44.mul_pos3 = [](const float* a, const float* b, float* r) { + AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!"); + r[0] = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]; + r[1] = a[4]*b[0] + a[5]*b[1] + a[6]*b[2] + a[7]; + r[2] = a[8]*b[0] + a[9]*b[1] + a[10]*b[2] + a[11]; + }; + + gMat44.mul_vec3 = [](const float* a, const float* b, float* r) { + AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!"); + r[0] = a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; + r[1] = a[4] * b[0] + a[5] * b[1] + a[6] * b[2]; + r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2]; + }; + + gMat44.mul_mat44 = [](const float* a, const float* b, float* mresult) { + mresult[0] = a[0]*b[0] + a[1]*b[4] + a[2]*b[8] + a[3]*b[12]; + mresult[1] = a[0]*b[1] + a[1]*b[5] + a[2]*b[9] + a[3]*b[13]; + mresult[2] = a[0]*b[2] + a[1]*b[6] + a[2]*b[10] + a[3]*b[14]; + mresult[3] = a[0]*b[3] + a[1]*b[7] + a[2]*b[11] + a[3]*b[15]; + + mresult[4] = a[4]*b[0] + a[5]*b[4] + a[6]*b[8] + a[7]*b[12]; + mresult[5] = a[4]*b[1] + a[5]*b[5] + a[6]*b[9] + a[7]*b[13]; + mresult[6] = a[4]*b[2] + a[5]*b[6] + a[6]*b[10] + 
a[7]*b[14]; + mresult[7] = a[4]*b[3] + a[5]*b[7] + a[6]*b[11] + a[7]*b[15]; + + mresult[8] = a[8]*b[0] + a[9]*b[4] + a[10]*b[8] + a[11]*b[12]; + mresult[9] = a[8]*b[1] + a[9]*b[5] + a[10]*b[9] + a[11]*b[13]; + mresult[10]= a[8]*b[2] + a[9]*b[6] + a[10]*b[10]+ a[11]*b[14]; + mresult[11]= a[8]*b[3] + a[9]*b[7] + a[10]*b[11]+ a[11]*b[15]; + + mresult[12]= a[12]*b[0]+ a[13]*b[4]+ a[14]*b[8] + a[15]*b[12]; + mresult[13]= a[12]*b[1]+ a[13]*b[5]+ a[14]*b[9] + a[15]*b[13]; + mresult[14]= a[12]*b[2]+ a[13]*b[6]+ a[14]*b[10]+ a[15]*b[14]; + mresult[15]= a[12]*b[3]+ a[13]*b[7]+ a[14]*b[11]+ a[15]*b[15]; + }; + + gMat44.transform_plane = [](const F32* m, const F32* s, const F32* p, F32* presult) { + // We take in a matrix, a scale factor, and a plane equation. We want to output + // the resultant normal + // We have T = m*s + // To multiply the normal, we want Inv(Tr(m*s)) + // Inv(Tr(ms)) = Inv(Tr(s) * Tr(m)) + // = Inv(Tr(m)) * Inv(Tr(s)) + // + // Inv(Tr(s)) = Inv(s) = [ 1/x 0 0 0] + // [ 0 1/y 0 0] + // [ 0 0 1/z 0] + // [ 0 0 0 1] + // + // Since m is an affine matrix, + // Tr(m) = [ [ ] 0 ] + // [ [ R ] 0 ] + // [ [ ] 0 ] + // [ [ x y z ] 1 ] + // + // Inv(Tr(m)) = [ [ -1 ] 0 ] + // [ [ R ] 0 ] + // [ [ ] 0 ] + // [ [ A B C ] 1 ] + // Where: + // + // P = (x, y, z) + // A = -(Row(0, r) * P); + // B = -(Row(1, r) * P); + // C = -(Row(2, r) * P); + + MatrixF invScale(true); + F32* pScaleElems = invScale; + pScaleElems[MatrixF::idx(0, 0)] = 1.0f / s[0]; + pScaleElems[MatrixF::idx(1, 1)] = 1.0f / s[1]; + pScaleElems[MatrixF::idx(2, 2)] = 1.0f / s[2]; + + const Point3F shear(m[MatrixF::idx(3, 0)], m[MatrixF::idx(3, 1)], m[MatrixF::idx(3, 2)]); + + const Point3F row0(m[MatrixF::idx(0, 0)], m[MatrixF::idx(0, 1)], m[MatrixF::idx(0, 2)]); + const Point3F row1(m[MatrixF::idx(1, 0)], m[MatrixF::idx(1, 1)], m[MatrixF::idx(1, 2)]); + const Point3F row2(m[MatrixF::idx(2, 0)], m[MatrixF::idx(2, 1)], m[MatrixF::idx(2, 2)]); + + const F32 A = -mDot(row0, shear); + const F32 B = 
-mDot(row1, shear); + const F32 C = -mDot(row2, shear); + + MatrixF invTrMatrix(true); + F32* destMat = invTrMatrix; + destMat[MatrixF::idx(0, 0)] = m[MatrixF::idx(0, 0)]; + destMat[MatrixF::idx(1, 0)] = m[MatrixF::idx(1, 0)]; + destMat[MatrixF::idx(2, 0)] = m[MatrixF::idx(2, 0)]; + destMat[MatrixF::idx(0, 1)] = m[MatrixF::idx(0, 1)]; + destMat[MatrixF::idx(1, 1)] = m[MatrixF::idx(1, 1)]; + destMat[MatrixF::idx(2, 1)] = m[MatrixF::idx(2, 1)]; + destMat[MatrixF::idx(0, 2)] = m[MatrixF::idx(0, 2)]; + destMat[MatrixF::idx(1, 2)] = m[MatrixF::idx(1, 2)]; + destMat[MatrixF::idx(2, 2)] = m[MatrixF::idx(2, 2)]; + destMat[MatrixF::idx(0, 3)] = A; + destMat[MatrixF::idx(1, 3)] = B; + destMat[MatrixF::idx(2, 3)] = C; + invTrMatrix.mul(invScale); + + Point3F norm(p[0], p[1], p[2]); + Point3F point = norm * -p[3]; + invTrMatrix.mulP(norm); + norm.normalize(); + + MatrixF temp; + dMemcpy(temp, m, sizeof(F32) * 16); + point.x *= s[0]; + point.y *= s[1]; + point.z *= s[2]; + temp.mulP(point); + + PlaneF resultPlane(point, norm); + presult[0] = resultPlane.x; + presult[1] = resultPlane.y; + presult[2] = resultPlane.z; + presult[3] = resultPlane.d; + + }; + + gMat44.normalize = [](float* a) { + F32 col0[3], col1[3], col2[3]; + // extract columns 0 and 1 + col0[0] = a[0]; + col0[1] = a[4]; + col0[2] = a[8]; + + col1[0] = a[1]; + col1[1] = a[5]; + col1[2] = a[9]; + + math_backend::float3::dispatch::gFloat3.normalize(col0); + math_backend::float3::dispatch::gFloat3.normalize(col1); + math_backend::float3::dispatch::gFloat3.normalize(col2); + + // store the normalized columns + a[0] = col0[0]; + a[4] = col0[1]; + a[8] = col0[2]; + + a[1] = col1[0]; + a[5] = col1[1]; + a[9] = col1[2]; + + a[2] = col2[0]; + a[6] = col2[1]; + a[10] = col2[2]; + + }; + + gMat44.batch_mul_pos3 = [](const float* m, const float* pts, int count, float* result_ptrs) { + int i = 0; + for (; i < count; i++) + { + int idx = i * 3; + gMat44.mul_pos3(m, &pts[idx], &result_ptrs[idx]); + } + }; + } +} diff --git 
a/Engine/source/math/isa/avx/avx_intrinsics.h b/Engine/source/math/isa/avx/avx_intrinsics.h new file mode 100644 index 000000000..40337e8c5 --- /dev/null +++ b/Engine/source/math/isa/avx/avx_intrinsics.h @@ -0,0 +1,625 @@ +#pragma once +#include // AVX + +namespace +{ + typedef __m128 f32x4; + + //------------------------------------------------------ + // Load / Store + //------------------------------------------------------ + + // Load 4 floats from memory into a SIMD register + inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + inline f32x4 v_zero() { return _mm_setzero_ps(); } + + inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); } + + //------------------------------------------------------ + // Mask helpers + //------------------------------------------------------ + + inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); } + + inline f32x4 v_swizzle_mask(f32x4 v1, const int x, const int y, const int z, const int w) + { + __m128i idx = _mm_set_epi32(w, z, y, x); + return _mm_permutevar_ps(v1, idx); + } + + inline f32x4 v_swizzle_singular_mask(f32x4 v1, const int x) + { + __m128i idx = _mm_set1_epi32(x); + return _mm_permutevar_ps(v1, idx); + } + + inline f32x4 v_swizzle_lo(f32x4 v1) + { + return _mm_moveldup_ps(v1); + } + + inline f32x4 v_swizzle_hi(f32x4 v1) + { + return _mm_movehdup_ps(v1); + } + + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) + { + return _mm_blend_ps(newv, original, 0b1000); + } + + //------------------------------------------------------ + // Shuffle helpers + //------------------------------------------------------ + + inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w) + { + f32x4 lo = v1; + f32x4 hi = v2; + + f32x4 combined_lo = _mm_permutevar_ps(lo, _mm_set_epi32(y, y, x, x)); + f32x4 combined_hi = 
_mm_permutevar_ps(hi, _mm_set_epi32(w, w, z, z)); + + return _mm_movelh_ps(combined_lo, combined_hi); + } + + inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2) + { + return _mm_movelh_ps(v1, v2); + } + + inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2) + { + return _mm_movehl_ps(v2, v1); + } + + //------------------------------------------------------ + // Float3 helpers (safe loading into 4 lanes) + //------------------------------------------------------ + + inline f32x4 v_load3_vec(const float* p) // w = 0 + { + return _mm_set_ps(0.0f, p[2], p[1], p[0]); + } + + inline f32x4 v_load3_pos(const float* p) // w = 1 + { + return _mm_set_ps(1.0f, p[2], p[1], p[0]); + } + + inline void v_store3(float* dst, f32x4 v) + { + alignas(16) float tmp[4]; // temp storage + _mm_store_ps(tmp, v); // store all 4 lanes + dst[0] = tmp[0]; + dst[1] = tmp[1]; + dst[2] = tmp[2]; + } + + inline f32x4 v_set(float x, float y, float z, float w) + { + return _mm_set_ps(w, z, y, x); + } + + inline f32x4 v_insert_w(f32x4 v, f32x4 w) + { + return _mm_insert_ps(v, w, 0x30); + } + + //------------------------------------------------------ + // Simple Arithmatic + //------------------------------------------------------ + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + //------------------------------------------------------ + // Fast recip + //------------------------------------------------------ + + // Fast recip 1/b + inline f32x4 v_rcp_nr(f32x4 b) + { + f32x4 r = _mm_rcp_ps(b); + f32x4 two = _mm_set1_ps(2.0f); + r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); + } + + // 
Divide fast ( b = recip eg 1/b) + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); } + + inline f32x4 v_rsqrt_nr(f32x4 x) + { + f32x4 half = _mm_set1_ps(0.5f); + f32x4 three = _mm_set1_ps(3.0f); + + f32x4 r = _mm_rsqrt_ps(x); + f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r); + return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr)); + } + + //------------------------------------------------------ + // Vector intrinsic functions + //------------------------------------------------------ + + // full dot4 + inline f32x4 v_dot4(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into all lanes + } + + // dot3 (ignores w) + inline f32x4 v_dot3(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into all lanes + } + + // cross product xyz only. + inline f32x4 v_cross(f32x4 a, f32x4 b) + { + f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); + f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); + + f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b)); + + return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1)); + } + + inline f32x4 v_normalize3(f32x4 v) + { + const f32x4 zero = _mm_setzero_ps(); + const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0} + f32x4 dot = v_dot3(v, v); + + f32x4 inv = v_rsqrt_nr(dot); + f32x4 isZero = _mm_cmpeq_ps(dot, zero); + f32x4 norm = _mm_mul_ps(v, inv); + + return _mm_blendv_ps(norm, fallback, isZero); + } + + // adds all 4 lanes together. 
+ inline f32x4 v_hadd4(f32x4 a) + { + // sum all 4 lanes in SSE41 + __m128 sum = _mm_hadd_ps(a, a); + return _mm_hadd_ps(sum, sum); + } + + //------------------------------------------------------ + // Matrix type (row-major 4x4) + //------------------------------------------------------ + + struct f32x4x4 + { + f32x4 r0; + f32x4 r1; + f32x4 r2; + f32x4 r3; + }; + + //------------------------------------------------------ + // Matrix Load / Store + //------------------------------------------------------ + + inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major) + { + f32x4x4 out; + out.r0 = v_load(m + 0); + out.r1 = v_load(m + 4); + out.r2 = v_load(m + 8); + out.r3 = v_load(m + 12); + return out; + } + + inline void m_store(float* dst, const f32x4x4& m) + { + v_store(dst + 0, m.r0); + v_store(dst + 4, m.r1); + v_store(dst + 8, m.r2); + v_store(dst + 12, m.r3); + } + + inline f32x4x4 m_identity() + { + f32x4x4 m; + m.r0 = _mm_set_ps(0, 0, 0, 1); + m.r1 = _mm_set_ps(0, 0, 1, 0); + m.r2 = _mm_set_ps(0, 1, 0, 0); + m.r3 = _mm_set_ps(1, 0, 0, 0); + return m; + } + + inline f32x4x4 m_zero() + { + f32x4 z = v_zero(); + return { z, z, z, z }; + } + + //------------------------------------------------------ + // Matrix × Vector + //------------------------------------------------------ + + inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot4(m.r0, v); + f32x4 y = v_dot4(m.r1, v); + f32x4 z = v_dot4(m.r2, v); + f32x4 w = v_dot4(m.r3, v); + + // combine to a vector + return _mm_set_ps(_mm_cvtss_f32(w), + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot3(m.r0, v); + f32x4 y = v_dot3(m.r1, v); + f32x4 z = v_dot3(m.r2, v); + + // combine to a vector + return _mm_set_ps(0.0f, + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + //------------------------------------------------------ + // Matrix × Matrix + 
//------------------------------------------------------ + + inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b) + { + // Transpose B once for dot products + f32x4 b0 = b.r0; + f32x4 b1 = b.r1; + f32x4 b2 = b.r2; + f32x4 b3 = b.r3; + + // Transpose (SSE2) + _MM_TRANSPOSE4_PS(b0, b1, b2, b3); + + f32x4x4 C; + + auto mul_row = [&](f32x4 arow) + { + f32x4 x = v_dot4(arow, b0); + f32x4 y = v_dot4(arow, b1); + f32x4 z = v_dot4(arow, b2); + f32x4 w = v_dot4(arow, b3); + + f32x4 xy = _mm_unpacklo_ps(x, y); + f32x4 zw = _mm_unpacklo_ps(z, w); + return _mm_movelh_ps(xy, zw); + }; + + C.r0 = mul_row(a.r0); + C.r1 = mul_row(a.r1); + C.r2 = mul_row(a.r2); + C.r3 = mul_row(a.r3); + + return C; + } + + //------------------------------------------------------ + // Transpose + //------------------------------------------------------ + + inline f32x4x4 m_transpose(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + f32x4 r3 = m.r3; + + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + + return { r0, r1, r2, r3 }; + } + + inline f32x4x4 m_inverse_rigid(const f32x4x4& m) + { + f32x4x4 t = m_transpose(m); + + // Extract translation + f32x4 T = v_set( + _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))), + 0.0f + ); + + f32x4 newT = m_mul_vec4(t, T); + newT = _mm_sub_ps(v_zero(), newT); + + t.r0 = v_preserve_w(t.r0, newT); + t.r1 = v_preserve_w(t.r1, newT); + t.r2 = v_preserve_w(t.r2, newT); + t.r3 = v_set(0, 0, 0, 1); + + return t; + } + + inline f32x4 m_determinant(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(c, d); + f32x4 c1 = v_cross(d, b); + f32x4 c2 = v_cross(b, c); + + f32x4 term0 = _mm_mul_ps(a, c0); + f32x4 term1 = _mm_mul_ps(a, c1); + f32x4 term2 = _mm_mul_ps(a, c2); + + f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2)); + + 
+      return v_hadd4(det);
+   }
+
+   // Determinant of the upper-left 3x3 block (sufficient for affine matrices),
+   // splatted across all lanes of the result.
+   inline f32x4 m_determinant_affine(const f32x4x4& m)
+   {
+      f32x4 r0 = m.r0;
+      f32x4 r1 = m.r1;
+      f32x4 r2 = m.r2;
+
+      r0 = _mm_and_ps(r0, v_mask_xyz());
+      r1 = _mm_and_ps(r1, v_mask_xyz());
+      r2 = _mm_and_ps(r2, v_mask_xyz());
+
+      f32x4 c0 = v_cross(r1, r2);
+      return v_dot3(r0, c0); // splatted determinant
+   }
+
+   //---------------------------------------------------------
+   // BATCH INTRINSICS
+   //---------------------------------------------------------
+   typedef __m256 f32x8;
+
+   struct vec4_batch8
+   {
+      f32x8 x;
+      f32x8 y;
+      f32x8 z;
+      f32x8 w;
+   };
+
+   struct vec4_batch4
+   {
+      f32x4 x;
+      f32x4 y;
+      f32x4 z;
+      f32x4 w;
+   };
+
+   // Gather 8 packed float3s into SoA form; lane i holds point i.
+   // NOTE(review): r.w is left uninitialized when fillW is false -- callers
+   // must not read it in that case.
+   inline vec4_batch8 load_vec3_batch8(const float* ptr, float w, bool fillW)
+   {
+      vec4_batch8 r;
+
+      // FIX: _mm256_set_ps takes lanes high-to-low, so the previous argument
+      // order placed point 0 in lane 7. store_vec3_batch8 writes lane i to
+      // point i, so a load/transform/store round trip reversed the 8 points.
+      // Keep lane i == point i.
+      r.x = _mm256_set_ps(ptr[21], ptr[18], ptr[15], ptr[12],
+                          ptr[9], ptr[6], ptr[3], ptr[0]);
+      r.y = _mm256_set_ps(ptr[22], ptr[19], ptr[16], ptr[13],
+                          ptr[10], ptr[7], ptr[4], ptr[1]);
+      r.z = _mm256_set_ps(ptr[23], ptr[20], ptr[17], ptr[14],
+                          ptr[11], ptr[8], ptr[5], ptr[2]);
+
+      if (fillW)
+      {
+         r.w = _mm256_set1_ps(w);
+      }
+
+      return r;
+   }
+
+   // Gather 4 packed float3s into SoA form; lane i holds point i.
+   inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
+   {
+      vec4_batch4 r;
+
+      // FIX: same high-to-low lane-order issue as load_vec3_batch8.
+      r.x = _mm_set_ps(ptr[9], ptr[6], ptr[3], ptr[0]);
+      r.y = _mm_set_ps(ptr[10], ptr[7], ptr[4], ptr[1]);
+      r.z = _mm_set_ps(ptr[11], ptr[8], ptr[5], ptr[2]);
+
+      if (fillW)
+      {
+         r.w = _mm_set1_ps(w);
+      }
+
+      return r;
+   }
+
+   // Store the result back out to a float array 8 at a time
+   inline void store_f32x8(float* out, f32x8 v)
+   {
+      _mm256_storeu_ps(out, v);
+   }
+
+   // Store the result back out to a float array 4 at a time
+   inline void store_f32x4(float* out, f32x4 v)
+   {
+      _mm_storeu_ps(out, v);
+   }
+
+   // Store the result back to a float3 array with size of 8
+   inline void store_vec3_batch8(float* out, const vec4_batch8& v)
+   {
+      alignas(32) float xs[8];
+      alignas(32) float ys[8];
+      alignas(32) float zs[8];
+
+      _mm256_store_ps(xs, v.x);
+      _mm256_store_ps(ys, v.y);
_mm256_store_ps(zs, v.z); + + for (int i = 0; i < 8; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + // Store the result back to a float3 array with size of 4 + inline void store_vec3_batch4(float* out, const vec4_batch4& v) + { + alignas(16) float xs[8]; + alignas(16) float ys[8]; + alignas(16) float zs[8]; + + _mm_store_ps(xs, v.x); + _mm_store_ps(ys, v.y); + _mm_store_ps(zs, v.z); + + for (int i = 0; i < 4; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b) + { + f32x4 mulx = _mm_mul_ps(a.x, b.x); + f32x4 muly = _mm_mul_ps(a.y, b.y); + f32x4 mulz = _mm_mul_ps(a.z, b.z); + + return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); + } + + inline f32x8 vec3_batch8_dot(const vec4_batch8& a, const vec4_batch8& b) + { + f32x8 mulx = _mm256_mul_ps(a.x, b.x); + f32x8 muly = _mm256_mul_ps(a.y, b.y); + f32x8 mulz = _mm256_mul_ps(a.z, b.z); + + return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz); + } + + // Batch 8 mul_Vec4. 
+ inline vec4_batch8 m_mul_pos3_batch8(const float* m, const vec4_batch8& v) + { + vec4_batch8 r; + + __m256 m00 = _mm256_set1_ps(m[0]); + __m256 m01 = _mm256_set1_ps(m[1]); + __m256 m02 = _mm256_set1_ps(m[2]); + __m256 m03 = _mm256_set1_ps(m[3]); + + __m256 m10 = _mm256_set1_ps(m[4]); + __m256 m11 = _mm256_set1_ps(m[5]); + __m256 m12 = _mm256_set1_ps(m[6]); + __m256 m13 = _mm256_set1_ps(m[7]); + + __m256 m20 = _mm256_set1_ps(m[8]); + __m256 m21 = _mm256_set1_ps(m[9]); + __m256 m22 = _mm256_set1_ps(m[10]); + __m256 m23 = _mm256_set1_ps(m[11]); + + // + // row0 dot + // + r.x = _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps(v.x, m00), + _mm256_mul_ps(v.y, m01)), + _mm256_add_ps( + _mm256_mul_ps(v.z, m02), m03)); + + // + // row1 dot + // + r.y = _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps(v.x, m10), + _mm256_mul_ps(v.y, m11)), + _mm256_add_ps( + _mm256_mul_ps(v.z, m12), m13)); + + // + // row2 dot + // + r.z = _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps(v.x, m20), + _mm256_mul_ps(v.y, m21)), + _mm256_add_ps( + _mm256_mul_ps(v.z, m22), m23)); + + return r; + } + + // Batch 4 mul_Vec4. 
+ inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v) + { + vec4_batch4 r; + + f32x4 m00 = _mm_set1_ps(m[0]); + f32x4 m01 = _mm_set1_ps(m[1]); + f32x4 m02 = _mm_set1_ps(m[2]); + f32x4 m03 = _mm_set1_ps(m[3]); + + f32x4 m10 = _mm_set1_ps(m[4]); + f32x4 m11 = _mm_set1_ps(m[5]); + f32x4 m12 = _mm_set1_ps(m[6]); + f32x4 m13 = _mm_set1_ps(m[7]); + + f32x4 m20 = _mm_set1_ps(m[8]); + f32x4 m21 = _mm_set1_ps(m[9]); + f32x4 m22 = _mm_set1_ps(m[10]); + f32x4 m23 = _mm_set1_ps(m[11]); + + // + // row0 dot + // + r.x = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m00), + _mm_mul_ps(v.y, m01)), + _mm_add_ps( + _mm_mul_ps(v.z, m02), m03)); + + // + // row1 dot + // + r.y = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m10), + _mm_mul_ps(v.y, m11)), + _mm_add_ps( + _mm_mul_ps(v.z, m12), m13)); + + // + // row2 dot + // + r.z = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m20), + _mm_mul_ps(v.y, m21)), + _mm_add_ps( + _mm_mul_ps(v.z, m22), m23)); + + return r; + } + +} diff --git a/Engine/source/math/isa/avx/float3.cpp b/Engine/source/math/isa/avx/float3.cpp new file mode 100644 index 000000000..7829f292c --- /dev/null +++ b/Engine/source/math/isa/avx/float3.cpp @@ -0,0 +1,26 @@ +#include "avx_intrinsics.h" +#include "float3_dispatch.h" +#include // AVX/AVX2 intrinsics + +#include "float3_impl.inl" + +namespace math_backend::float3::dispatch +{ + // Install AVX backend + void install_avx() + { + gFloat3.add = float3_add_impl; + gFloat3.sub = float3_sub_impl; + gFloat3.mul = float3_mul_impl; + gFloat3.mul_scalar = float3_mul_scalar_impl; + gFloat3.div = float3_div_impl; + gFloat3.div_scalar = float3_div_scalar_impl; + gFloat3.dot = float3_dot_impl; + gFloat3.length = float3_length_impl; + gFloat3.lengthSquared = float3_length_squared_impl; + gFloat3.normalize = float3_normalize_impl; + gFloat3.normalize_mag = float3_normalize_mag_impl; + gFloat3.lerp = float3_lerp_impl; + gFloat3.cross = float3_cross_impl; + } +} diff --git a/Engine/source/math/isa/avx/float4.cpp 
b/Engine/source/math/isa/avx/float4.cpp new file mode 100644 index 000000000..cfdab0908 --- /dev/null +++ b/Engine/source/math/isa/avx/float4.cpp @@ -0,0 +1,25 @@ +#include "avx_intrinsics.h" +#include "float4_dispatch.h" + +#include "float4_impl.inl" + +namespace math_backend::float4::dispatch +{ + // Install AVX backend + void install_avx() + { + gFloat4.add = float4_add_impl; + gFloat4.sub = float4_sub_impl; + gFloat4.mul = float4_mul_impl; + gFloat4.mul_scalar = float4_mul_scalar_impl; + gFloat4.div = float4_div_impl; + gFloat4.div_scalar = float4_div_scalar_impl; + gFloat4.dot = float4_dot_impl; + gFloat4.length = float4_length_impl; + gFloat4.lengthSquared = float4_length_squared_impl; + gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; + gFloat4.lerp = float4_lerp_impl; + gFloat4.cross = float4_cross_impl; + } +} diff --git a/Engine/source/math/isa/avx/mat44.cpp b/Engine/source/math/isa/avx/mat44.cpp new file mode 100644 index 000000000..b0fddcd1c --- /dev/null +++ b/Engine/source/math/isa/avx/mat44.cpp @@ -0,0 +1,26 @@ +#include "avx_intrinsics.h" +#include "mat44_dispatch.h" + +#include "mat44_impl.inl" + +namespace math_backend::mat44::dispatch +{ + void install_avx() + { + gMat44.transpose = mat44_transpose_impl; + gMat44.inverse = mat44_inverse_impl; + gMat44.inverse_to = mat44_inverse_to_impl; + gMat44.affine_inverse = mat44_affine_inverse_impl; + gMat44.mul_mat44 = mat44_mul_mat44_impl; + gMat44.mul_pos3 = mat44_mul_pos3_impl; + gMat44.mul_vec3 = mat44_mul_vec3_impl; + gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.transform_plane = mat44_transform_plane_impl; + gMat44.scale = mat44_scale_impl; + gMat44.get_scale = mat44_get_scale_impl; + gMat44.normalize = mat44_normalize_impl; + gMat44.determinant = mat44_get_determinant; + + gMat44.batch_mul_pos3 = mat44_batch_mul_pos3; + } +} diff --git a/Engine/source/math/isa/avx2/avx2_intrinsics.h b/Engine/source/math/isa/avx2/avx2_intrinsics.h new file mode 
100644 index 000000000..3291c3521 --- /dev/null +++ b/Engine/source/math/isa/avx2/avx2_intrinsics.h @@ -0,0 +1,625 @@ +#pragma once +#include // AVX2 + +namespace +{ + typedef __m128 f32x4; + + //------------------------------------------------------ + // Load / Store + //------------------------------------------------------ + + // Load 4 floats from memory into a SIMD register + inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + inline f32x4 v_zero() { return _mm_setzero_ps(); } + + inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); } + + //------------------------------------------------------ + // Mask helpers + //------------------------------------------------------ + + inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); } + + inline f32x4 v_swizzle_mask(f32x4 v1,const int x, const int y, const int z, const int w) + { + __m128i idx = _mm_set_epi32(w, z, y, x); + return _mm_permutevar_ps(v1, idx); + } + + inline f32x4 v_swizzle_singular_mask(f32x4 v1, const int x) + { + __m128i idx = _mm_set1_epi32(x); + return _mm_permutevar_ps(v1, idx); + } + + inline f32x4 v_swizzle_lo(f32x4 v1) + { + return _mm_moveldup_ps(v1); + } + + inline f32x4 v_swizzle_hi(f32x4 v1) + { + return _mm_movehdup_ps(v1); + } + + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) + { + return _mm_blend_ps(newv, original, 0b1000); + } + + //------------------------------------------------------ + // Shuffle helpers + //------------------------------------------------------ + + inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w) + { + f32x4 lo = v1; + f32x4 hi = v2; + + f32x4 combined_lo = _mm_permutevar_ps(lo, _mm_set_epi32(y, y, x, x)); + f32x4 combined_hi = _mm_permutevar_ps(hi, _mm_set_epi32(w, w, z, z)); + + return _mm_movelh_ps(combined_lo, combined_hi); + } + + 
inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2) + { + return _mm_movelh_ps(v1, v2); + } + + inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2) + { + return _mm_movehl_ps(v2, v1); + } + + //------------------------------------------------------ + // Float3 helpers (safe loading into 4 lanes) + //------------------------------------------------------ + + inline f32x4 v_load3_vec(const float* p) // w = 0 + { + return _mm_set_ps(0.0f, p[2], p[1], p[0]); + } + + inline f32x4 v_load3_pos(const float* p) // w = 1 + { + return _mm_set_ps(1.0f, p[2], p[1], p[0]); + } + + inline void v_store3(float* dst, f32x4 v) + { + alignas(16) float tmp[4]; // temp storage + _mm_store_ps(tmp, v); // store all 4 lanes + dst[0] = tmp[0]; + dst[1] = tmp[1]; + dst[2] = tmp[2]; + } + + inline f32x4 v_set(float x, float y, float z, float w) + { + return _mm_set_ps(w, z, y, x); + } + + inline f32x4 v_insert_w(f32x4 v, f32x4 w) + { + return _mm_insert_ps(v, w, 0x30); + } + + //------------------------------------------------------ + // Simple Arithmatic + //------------------------------------------------------ + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + //------------------------------------------------------ + // Fast recip + //------------------------------------------------------ + + // Fast recip 1/b + inline f32x4 v_rcp_nr(f32x4 b) + { + f32x4 r = _mm_rcp_ps(b); + f32x4 two = _mm_set1_ps(2.0f); + r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); + } + + // Divide fast ( b = recip eg 1/b) + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); } 
+ + inline f32x4 v_rsqrt_nr(f32x4 x) + { + f32x4 half = _mm_set1_ps(0.5f); + f32x4 three = _mm_set1_ps(3.0f); + + f32x4 r = _mm_rsqrt_ps(x); + f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r); + return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr)); + } + + //------------------------------------------------------ + // Vector intrinsic functions + //------------------------------------------------------ + + // full dot4 + inline f32x4 v_dot4(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into all lanes + } + + // dot3 (ignores w) + inline f32x4 v_dot3(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into all lanes + } + + // cross product xyz only. + inline f32x4 v_cross(f32x4 a, f32x4 b) + { + f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); + f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); + + f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b)); + + return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1)); + } + + inline f32x4 v_normalize3(f32x4 v) + { + const f32x4 zero = _mm_setzero_ps(); + const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0} + f32x4 dot = v_dot3(v, v); + + f32x4 inv = v_rsqrt_nr(dot); + f32x4 isZero = _mm_cmpeq_ps(dot, zero); + f32x4 norm = _mm_mul_ps(v, inv); + + return _mm_blendv_ps(norm, fallback, isZero); + } + + // adds all 4 lanes together. 
+ inline f32x4 v_hadd4(f32x4 a) + { + // sum all 4 lanes in SSE41 + __m128 sum = _mm_hadd_ps(a, a); + return _mm_hadd_ps(sum, sum); + } + + //------------------------------------------------------ + // Matrix type (row-major 4x4) + //------------------------------------------------------ + + struct f32x4x4 + { + f32x4 r0; + f32x4 r1; + f32x4 r2; + f32x4 r3; + }; + + //------------------------------------------------------ + // Matrix Load / Store + //------------------------------------------------------ + + inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major) + { + f32x4x4 out; + out.r0 = v_load(m + 0); + out.r1 = v_load(m + 4); + out.r2 = v_load(m + 8); + out.r3 = v_load(m + 12); + return out; + } + + inline void m_store(float* dst, const f32x4x4& m) + { + v_store(dst + 0, m.r0); + v_store(dst + 4, m.r1); + v_store(dst + 8, m.r2); + v_store(dst + 12, m.r3); + } + + inline f32x4x4 m_identity() + { + f32x4x4 m; + m.r0 = _mm_set_ps(0, 0, 0, 1); + m.r1 = _mm_set_ps(0, 0, 1, 0); + m.r2 = _mm_set_ps(0, 1, 0, 0); + m.r3 = _mm_set_ps(1, 0, 0, 0); + return m; + } + + inline f32x4x4 m_zero() + { + f32x4 z = v_zero(); + return { z, z, z, z }; + } + + //------------------------------------------------------ + // Matrix × Vector + //------------------------------------------------------ + + inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot4(m.r0, v); + f32x4 y = v_dot4(m.r1, v); + f32x4 z = v_dot4(m.r2, v); + f32x4 w = v_dot4(m.r3, v); + + // combine to a vector + return _mm_set_ps(_mm_cvtss_f32(w), + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot3(m.r0, v); + f32x4 y = v_dot3(m.r1, v); + f32x4 z = v_dot3(m.r2, v); + + // combine to a vector + return _mm_set_ps(0.0f, + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + //------------------------------------------------------ + // Matrix × Matrix + 
//------------------------------------------------------ + + inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b) + { + // Transpose B once for dot products + f32x4 b0 = b.r0; + f32x4 b1 = b.r1; + f32x4 b2 = b.r2; + f32x4 b3 = b.r3; + + // Transpose (SSE2) + _MM_TRANSPOSE4_PS(b0, b1, b2, b3); + + f32x4x4 C; + + auto mul_row = [&](f32x4 arow) + { + f32x4 x = v_dot4(arow, b0); + f32x4 y = v_dot4(arow, b1); + f32x4 z = v_dot4(arow, b2); + f32x4 w = v_dot4(arow, b3); + + f32x4 xy = _mm_unpacklo_ps(x, y); + f32x4 zw = _mm_unpacklo_ps(z, w); + return _mm_movelh_ps(xy, zw); + }; + + C.r0 = mul_row(a.r0); + C.r1 = mul_row(a.r1); + C.r2 = mul_row(a.r2); + C.r3 = mul_row(a.r3); + + return C; + } + + //------------------------------------------------------ + // Transpose + //------------------------------------------------------ + + inline f32x4x4 m_transpose(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + f32x4 r3 = m.r3; + + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + + return { r0, r1, r2, r3 }; + } + + inline f32x4x4 m_inverse_rigid(const f32x4x4& m) + { + f32x4x4 t = m_transpose(m); + + // Extract translation + f32x4 T = v_set( + _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))), + 0.0f + ); + + f32x4 newT = m_mul_vec4(t, T); + newT = _mm_sub_ps(v_zero(), newT); + + t.r0 = v_preserve_w(t.r0, newT); + t.r1 = v_preserve_w(t.r1, newT); + t.r2 = v_preserve_w(t.r2, newT); + t.r3 = v_set(0, 0, 0, 1); + + return t; + } + + inline f32x4 m_determinant(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(c, d); + f32x4 c1 = v_cross(d, b); + f32x4 c2 = v_cross(b, c); + + f32x4 term0 = _mm_mul_ps(a, c0); + f32x4 term1 = _mm_mul_ps(a, c1); + f32x4 term2 = _mm_mul_ps(a, c2); + + f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2)); + + 
return v_hadd4(det); + } + + inline f32x4 m_determinant_affine(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); + + f32x4 c0 = v_cross(r1, r2); + return v_dot3(r0, c0); // splatted determinant + } + + //--------------------------------------------------------- + // BATCH INTRINSICS + //--------------------------------------------------------- + typedef __m256 f32x8; + + struct vec4_batch8 + { + f32x8 x; + f32x8 y; + f32x8 z; + f32x8 w; + }; + + struct vec4_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + f32x4 w; + }; + + inline vec4_batch8 load_vec3_batch8(const float* ptr, float w, bool fillW) + { + vec4_batch8 r; + + r.x = _mm256_set_ps(ptr[0], ptr[3], ptr[6], ptr[9], + ptr[12], ptr[15], ptr[18], ptr[21]); + r.y = _mm256_set_ps(ptr[1], ptr[4], ptr[7], ptr[10], + ptr[13], ptr[16], ptr[19], ptr[22]); + r.z = _mm256_set_ps(ptr[2], ptr[5], ptr[8], ptr[11], + ptr[14], ptr[17], ptr[20], ptr[23]); + + if (fillW) + { + r.w = _mm256_set1_ps(w); + } + + return r; + } + + inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW) + { + vec4_batch4 r; + + r.x = _mm_set_ps(ptr[0], ptr[3], ptr[6], ptr[9]); + r.y = _mm_set_ps(ptr[1], ptr[4], ptr[7], ptr[10]); + r.z = _mm_set_ps(ptr[2], ptr[5], ptr[8], ptr[11]); + + if (fillW) + { + r.w = _mm_set1_ps(w); + } + + return r; + } + + // Store the result back out to a float array 8 at a time + inline void store_f32x8(float* out, f32x8 v) + { + _mm256_storeu_ps(out, v); + } + + // Store the result back out to a float array 4 at a time + inline void store_f32x4(float* out, f32x4 v) + { + _mm_storeu_ps(out, v); + } + + // Store the result back to a float3 array with size of 8 + inline void store_vec3_batch8(float* out, const vec4_batch8& v) + { + alignas(32) float xs[8]; + alignas(32) float ys[8]; + alignas(32) float zs[8]; + + _mm256_store_ps(xs, v.x); + _mm256_store_ps(ys, v.y); + 
_mm256_store_ps(zs, v.z); + + for (int i = 0; i < 8; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + // Store the result back to a float3 array with size of 4 + inline void store_vec3_batch4(float* out, const vec4_batch4& v) + { + alignas(16) float xs[8]; + alignas(16) float ys[8]; + alignas(16) float zs[8]; + + _mm_store_ps(xs, v.x); + _mm_store_ps(ys, v.y); + _mm_store_ps(zs, v.z); + + for (int i = 0; i < 4; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b) + { + f32x4 mulx = _mm_mul_ps(a.x, b.x); + f32x4 muly = _mm_mul_ps(a.y, b.y); + f32x4 mulz = _mm_mul_ps(a.z, b.z); + + return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); + } + + inline f32x8 vec3_batch8_dot(const vec4_batch8& a, const vec4_batch8& b) + { + f32x8 mulx = _mm256_mul_ps(a.x, b.x); + f32x8 muly = _mm256_mul_ps(a.y, b.y); + f32x8 mulz = _mm256_mul_ps(a.z, b.z); + + return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz); + } + + // Batch 8 mul_Vec4. 
+ inline vec4_batch8 m_mul_pos3_batch8(const float* m, const vec4_batch8& v) + { + vec4_batch8 r; + + __m256 m00 = _mm256_set1_ps(m[0]); + __m256 m01 = _mm256_set1_ps(m[1]); + __m256 m02 = _mm256_set1_ps(m[2]); + __m256 m03 = _mm256_set1_ps(m[3]); + + __m256 m10 = _mm256_set1_ps(m[4]); + __m256 m11 = _mm256_set1_ps(m[5]); + __m256 m12 = _mm256_set1_ps(m[6]); + __m256 m13 = _mm256_set1_ps(m[7]); + + __m256 m20 = _mm256_set1_ps(m[8]); + __m256 m21 = _mm256_set1_ps(m[9]); + __m256 m22 = _mm256_set1_ps(m[10]); + __m256 m23 = _mm256_set1_ps(m[11]); + + // + // row0 dot + // + r.x = _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps(v.x, m00), + _mm256_mul_ps(v.y, m01)), + _mm256_add_ps( + _mm256_mul_ps(v.z, m02), m03)); + + // + // row1 dot + // + r.y = _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps(v.x, m10), + _mm256_mul_ps(v.y, m11)), + _mm256_add_ps( + _mm256_mul_ps(v.z, m12), m13)); + + // + // row2 dot + // + r.z = _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps(v.x, m20), + _mm256_mul_ps(v.y, m21)), + _mm256_add_ps( + _mm256_mul_ps(v.z, m22), m23)); + + return r; + } + + // Batch 4 mul_Vec4. 
+ inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v) + { + vec4_batch4 r; + + f32x4 m00 = _mm_set1_ps(m[0]); + f32x4 m01 = _mm_set1_ps(m[1]); + f32x4 m02 = _mm_set1_ps(m[2]); + f32x4 m03 = _mm_set1_ps(m[3]); + + f32x4 m10 = _mm_set1_ps(m[4]); + f32x4 m11 = _mm_set1_ps(m[5]); + f32x4 m12 = _mm_set1_ps(m[6]); + f32x4 m13 = _mm_set1_ps(m[7]); + + f32x4 m20 = _mm_set1_ps(m[8]); + f32x4 m21 = _mm_set1_ps(m[9]); + f32x4 m22 = _mm_set1_ps(m[10]); + f32x4 m23 = _mm_set1_ps(m[11]); + + // + // row0 dot + // + r.x = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m00), + _mm_mul_ps(v.y, m01)), + _mm_add_ps( + _mm_mul_ps(v.z, m02), m03)); + + // + // row1 dot + // + r.y = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m10), + _mm_mul_ps(v.y, m11)), + _mm_add_ps( + _mm_mul_ps(v.z, m12), m13)); + + // + // row2 dot + // + r.z = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m20), + _mm_mul_ps(v.y, m21)), + _mm_add_ps( + _mm_mul_ps(v.z, m22), m23)); + + return r; + } + +} diff --git a/Engine/source/math/isa/avx2/float3.cpp b/Engine/source/math/isa/avx2/float3.cpp new file mode 100644 index 000000000..fd0d485f6 --- /dev/null +++ b/Engine/source/math/isa/avx2/float3.cpp @@ -0,0 +1,26 @@ +#include "avx2_intrinsics.h" +#include "float3_dispatch.h" +#include // AVX/AVX2 intrinsics + +#include "float3_impl.inl" + +namespace math_backend::float3::dispatch +{ + // Install AVX2 backend + void install_avx2() + { + gFloat3.add = float3_add_impl; + gFloat3.sub = float3_sub_impl; + gFloat3.mul = float3_mul_impl; + gFloat3.mul_scalar = float3_mul_scalar_impl; + gFloat3.div = float3_div_impl; + gFloat3.div_scalar = float3_div_scalar_impl; + gFloat3.dot = float3_dot_impl; + gFloat3.length = float3_length_impl; + gFloat3.lengthSquared = float3_length_squared_impl; + gFloat3.normalize = float3_normalize_impl; + gFloat3.normalize_mag = float3_normalize_mag_impl; + gFloat3.lerp = float3_lerp_impl; + gFloat3.cross = float3_cross_impl; + } +} diff --git 
a/Engine/source/math/isa/avx2/float4.cpp b/Engine/source/math/isa/avx2/float4.cpp new file mode 100644 index 000000000..85228caaf --- /dev/null +++ b/Engine/source/math/isa/avx2/float4.cpp @@ -0,0 +1,25 @@ +#include "avx2_intrinsics.h" +#include "float4_dispatch.h" + +#include "float4_impl.inl" + +namespace math_backend::float4::dispatch +{ + // Install AVX2 backend + void install_avx2() + { + gFloat4.add = float4_add_impl; + gFloat4.sub = float4_sub_impl; + gFloat4.mul = float4_mul_impl; + gFloat4.mul_scalar = float4_mul_scalar_impl; + gFloat4.div = float4_div_impl; + gFloat4.div_scalar = float4_div_scalar_impl; + gFloat4.dot = float4_dot_impl; + gFloat4.length = float4_length_impl; + gFloat4.lengthSquared = float4_length_squared_impl; + gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; + gFloat4.lerp = float4_lerp_impl; + gFloat4.cross = float4_cross_impl; + } +} diff --git a/Engine/source/math/isa/avx2/mat44.cpp b/Engine/source/math/isa/avx2/mat44.cpp new file mode 100644 index 000000000..83c90a647 --- /dev/null +++ b/Engine/source/math/isa/avx2/mat44.cpp @@ -0,0 +1,25 @@ +#include "avx2_intrinsics.h" +#include "mat44_dispatch.h" + +#include "mat44_impl.inl" + +namespace math_backend::mat44::dispatch +{ + void install_avx2() + { + gMat44.transpose = mat44_transpose_impl; + gMat44.inverse = mat44_inverse_impl; + gMat44.inverse_to = mat44_inverse_to_impl; + gMat44.affine_inverse = mat44_affine_inverse_impl; + gMat44.mul_mat44 = mat44_mul_mat44_impl; + gMat44.mul_pos3 = mat44_mul_pos3_impl; + gMat44.mul_vec3 = mat44_mul_vec3_impl; + gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.transform_plane = mat44_transform_plane_impl; + gMat44.scale = mat44_scale_impl; + gMat44.get_scale = mat44_get_scale_impl; + gMat44.normalize = mat44_normalize_impl; + gMat44.determinant = mat44_get_determinant; + gMat44.batch_mul_pos3 = mat44_batch_mul_pos3; + } +} diff --git a/Engine/source/math/isa/neon/float3.cpp 
b/Engine/source/math/isa/neon/float3.cpp new file mode 100644 index 000000000..904787932 --- /dev/null +++ b/Engine/source/math/isa/neon/float3.cpp @@ -0,0 +1,25 @@ +#include "neon_intrinsics.h" +#include "float3_dispatch.h" + +#include "float3_impl.inl" + +namespace math_backend::float3::dispatch +{ + // Install NEON backend + void install_neon() + { + gFloat3.add = float3_add_impl; + gFloat3.sub = float3_sub_impl; + gFloat3.mul = float3_mul_impl; + gFloat3.mul_scalar = float3_mul_scalar_impl; + gFloat3.div = float3_div_impl; + gFloat3.div_scalar = float3_div_scalar_impl; + gFloat3.dot = float3_dot_impl; + gFloat3.length = float3_length_impl; + gFloat3.lengthSquared = float3_length_squared_impl; + gFloat3.normalize = float3_normalize_impl; + gFloat3.normalize_mag = float3_normalize_mag_impl; + gFloat3.lerp = float3_lerp_impl; + gFloat3.cross = float3_cross_impl; + } +} diff --git a/Engine/source/math/isa/neon/float4.cpp b/Engine/source/math/isa/neon/float4.cpp new file mode 100644 index 000000000..6996f2a76 --- /dev/null +++ b/Engine/source/math/isa/neon/float4.cpp @@ -0,0 +1,25 @@ +#include "neon_intrinsics.h" +#include "float4_dispatch.h" + +#include "float4_impl.inl" + +namespace math_backend::float4::dispatch +{ + // Install NEON64 backend + void install_neon() + { + gFloat4.add = float4_add_impl; + gFloat4.sub = float4_sub_impl; + gFloat4.mul = float4_mul_impl; + gFloat4.mul_scalar = float4_mul_scalar_impl; + gFloat4.div = float4_div_impl; + gFloat4.div_scalar = float4_div_scalar_impl; + gFloat4.dot = float4_dot_impl; + gFloat4.length = float4_length_impl; + gFloat4.lengthSquared = float4_length_squared_impl; + gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; + gFloat4.lerp = float4_lerp_impl; + gFloat4.cross = float4_cross_impl; + } +} diff --git a/Engine/source/math/isa/neon/mat44.cpp b/Engine/source/math/isa/neon/mat44.cpp new file mode 100644 index 000000000..cfeebb3bf --- /dev/null +++ 
// ---- file: Engine/source/math/isa/neon/mat44.cpp ----

#include "neon_intrinsics.h"
#include "mat44_dispatch.h"

#include "mat44_impl.inl"

namespace math_backend::mat44::dispatch
{
    // Wire the NEON mat44 implementations into the dispatch table.
    // NOTE(review): unlike the SSE2/AVX2 installers this does not set
    // transform_plane or batch_mul_pos3, so those entries keep whatever
    // backend was installed before — confirm that is intentional.
    void install_neon()
    {
        gMat44.transpose      = mat44_transpose_impl;
        gMat44.inverse        = mat44_inverse_impl;
        gMat44.inverse_to     = mat44_inverse_to_impl;
        gMat44.affine_inverse = mat44_affine_inverse_impl;
        gMat44.mul_mat44      = mat44_mul_mat44_impl;
        gMat44.mul_pos3       = mat44_mul_pos3_impl;
        gMat44.mul_vec3       = mat44_mul_vec3_impl;
        gMat44.mul_float4     = mat44_mul_float4_impl;
        gMat44.scale          = mat44_scale_impl;
        gMat44.get_scale      = mat44_get_scale_impl;
        gMat44.normalize      = mat44_normalize_impl;
        gMat44.determinant    = mat44_get_determinant;
    }
}

// ---- file: Engine/source/math/isa/neon/neon_intrinsics.h ----

#pragma once
#include <arm_neon.h>

namespace
{
    typedef float32x4_t f32x4;

    //------------------------------------------------------
    // Load / Store
    //------------------------------------------------------
    inline f32x4 v_load(const float* p)       { return vld1q_f32(p); }
    inline void  v_store(float* dst, f32x4 v) { vst1q_f32(dst, v); }
    inline f32x4 v_set1(float s)              { return vdupq_n_f32(s); }
    inline f32x4 v_zero()                     { return vdupq_n_f32(0.0f); }
    inline float v_extract0(f32x4 v)          { return vgetq_lane_f32(v, 0); }

    // lane0 = x, lane1 = y, lane2 = z, lane3 = w
    inline f32x4 v_set(float x, float y, float z, float w)
    {
        return { x, y, z, w };
    }

    // Replace the w lane of v with lane 0 of w.
    inline f32x4 v_insert_w(f32x4 v, f32x4 w)
    {
        // broadcast w's lane 0, then bit-select only lane 3 of it
        float w_val = vgetq_lane_f32(w, 0);
        f32x4 w_broadcast = vdupq_n_f32(w_val);

        // vbslq: where a mask bit is 1, take the first argument, else the second
        uint32x4_t mask = { 0, 0, 0, 0xFFFFFFFF };
        return vbslq_f32(mask, w_broadcast, v);
    }

    //------------------------------------------------------
    // Mask helpers
    //------------------------------------------------------
    // All-ones bits in x,y,z; zero in w.
    inline f32x4 v_mask_xyz()
    {
        uint32x4_t mask = {
            0xFFFFFFFF,
            0xFFFFFFFF,
            0xFFFFFFFF,
            0x00000000
        };

        return vreinterpretq_f32_u32(mask);
    }

    // Bitwise AND of two float vectors.
    inline f32x4 v_and(f32x4 a, f32x4 b)
    {
        return vreinterpretq_f32_u32(
            vandq_u32(
                vreinterpretq_u32_f32(a),
                vreinterpretq_u32_f32(b)
            )
        );
    }

    // Broadcast lane x of v into every lane (runtime lane index via table lookup).
    inline f32x4 v_swizzle_singular_mask(f32x4 v, int x)
    {
        // base byte index of the float lane
        uint8x16_t base = vdupq_n_u8((uint8_t)(x * 4));

        // byte offsets inside a float (0,1,2,3 repeated 4 times)
        const uint8x16_t offsets = {
            0,1,2,3, 0,1,2,3,
            0,1,2,3, 0,1,2,3
        };

        uint8x16_t idx = vaddq_u8(base, offsets);

        return vreinterpretq_f32_u8(
            vqtbl1q_u8(
                vreinterpretq_u8_f32(v),
                idx
            )
        );
    }

    // Duplicate even lanes: (x, x, z, z).
    // BUGFIX: vzipq_f32(v,v).val[0] produced (x,x,y,y), which disagreed with
    // the SSE backends' v_swizzle_lo (shuffle 2,2,0,0 => x,x,z,z). Backends
    // must produce identical lane patterns for the shared .inl code.
    inline f32x4 v_swizzle_lo(f32x4 v)
    {
        return f32x4{ vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 0),
                      vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 2) };
    }

    // Duplicate odd lanes: (y, y, w, w).
    // BUGFIX: vzipq_f32(v,v).val[1] produced (z,z,w,w); see v_swizzle_lo.
    inline f32x4 v_swizzle_hi(f32x4 v)
    {
        return f32x4{ vgetq_lane_f32(v, 1), vgetq_lane_f32(v, 1),
                      vgetq_lane_f32(v, 3), vgetq_lane_f32(v, 3) };
    }

    // Keep the w lane of 'original', take xyz from 'newv'.
    // BUGFIX: the old code built the select mask by bit-reinterpreting the
    // float vector {0,0,0,1.0f}, which yields 0x3F800000 in lane 3 — a
    // partial bit mask — so vbslq mixed bits of both inputs. Use a real
    // all-ones lane mask.
    inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
    {
        uint32x4_t mask = { 0, 0, 0, 0xFFFFFFFF };
        return vbslq_f32(mask, original, newv);
    }

    //------------------------------------------------------
    // Float3 helpers
    //------------------------------------------------------
    inline f32x4 v_load3_vec(const float* p) // w = 0
    {
        float tmp[4] = { p[0], p[1], p[2], 0.0f };
        return vld1q_f32(tmp);
    }

    inline f32x4 v_load3_pos(const float* p) // w = 1
    {
        float tmp[4] = { p[0], p[1], p[2], 1.0f };
        return vld1q_f32(tmp);
    }

    // Store the xyz lanes to 3 contiguous floats.
    inline void v_store3(float* dst, f32x4 v)
    {
        float tmp[4];
        vst1q_f32(tmp, v);
        dst[0] = tmp[0];
        dst[1] = tmp[1];
        dst[2] = tmp[2];
    }

    //------------------------------------------------------
    // Simple Arithmetic
//------------------------------------------------------ + inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); } + inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return vdivq_f32(a, b); } // only NEON64 + inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); } + inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); } + + //------------------------------------------------------ + // Fast recip + //------------------------------------------------------ + inline f32x4 v_rcp_nr(f32x4 b) + { + f32x4 r = vrecpeq_f32(b); + r = vmulq_f32(r, vrecpsq_f32(b, r)); // Newton-Raphson + r = vmulq_f32(r, vrecpsq_f32(b, r)); + return r; + } + + inline f32x4 v_div(f32x4 a, f32x4 b) + { + return vmulq_f32(a, v_rcp_nr(b)); + } + + inline f32x4 v_rsqrt_nr(f32x4 x) + { + f32x4 r = vrsqrteq_f32(x); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r,r), x)); // refine + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r,r), x)); + return r; + } + + //------------------------------------------------------ + // Vector intrinsic functions + //------------------------------------------------------ + inline f32x4 v_dot4(f32x4 a, f32x4 b) + { + f32x4 mul = vmulq_f32(a, b); + float32x2_t sum2 = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul)); + float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1); + return vdupq_n_f32(sum); + } + + inline f32x4 v_dot3(f32x4 a, f32x4 b) + { + float32x4_t mask = {1.0f, 1.0f, 1.0f, 0.0f}; + f32x4 mul = vmulq_f32(a, b); + mul = vmulq_f32(mul, mask); + float32x2_t sum2 = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul)); + float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1); + return vdupq_n_f32(sum); + } + + inline f32x4 v_cross(f32x4 a, f32x4 b) + { + f32x4 a_yzx = { vgetq_lane_f32(a,1), + vgetq_lane_f32(a,2), + vgetq_lane_f32(a,0), + 0.0f }; + + f32x4 b_zxy = { vgetq_lane_f32(b,2), + vgetq_lane_f32(b,0), + vgetq_lane_f32(b,1), + 0.0f }; + + f32x4 a_zxy = { vgetq_lane_f32(a,2), + vgetq_lane_f32(a,0), + vgetq_lane_f32(a,1), + 0.0f }; + + 
f32x4 b_yzx = { vgetq_lane_f32(b,1), + vgetq_lane_f32(b,2), + vgetq_lane_f32(b,0), + 0.0f }; + + return vsubq_f32( + vmulq_f32(a_yzx, b_zxy), + vmulq_f32(a_zxy, b_yzx) + ); + } + + inline f32x4 v_normalize3(f32x4 v) + { + const float32x4_t zero = vdupq_n_f32(0.0f); + const float32x4_t fallback = {0.0f, 0.0f, 1.0f, 0.0f}; + + f32x4 dot = v_dot3(v, v); + + // dot == 0? + uint32x4_t isZero = vceqq_f32(dot, zero); + + f32x4 inv = v_rsqrt_nr(dot); + f32x4 norm = vmulq_f32(v, inv); + + // Select fallback when zero + return vbslq_f32(isZero, fallback, norm); + } + + inline f32x4 v_hadd4(f32x4 a) + { + float32x2_t sum2 = vpadd_f32(vget_low_f32(a), vget_high_f32(a)); + float sum = vget_lane_f32(sum2,0) + vget_lane_f32(sum2,1); + return vdupq_n_f32(sum); + } + + + //------------------------------------------------------ + // Matrix type (row-major 4x4) + //------------------------------------------------------ + + struct f32x4x4 + { + f32x4 r0; + f32x4 r1; + f32x4 r2; + f32x4 r3; + }; + + inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major) + { + f32x4x4 out; + out.r0 = v_load(m + 0); + out.r1 = v_load(m + 4); + out.r2 = v_load(m + 8); + out.r3 = v_load(m + 12); + return out; + } + + inline void m_store(float* dst, const f32x4x4& m) + { + v_store(dst + 0, m.r0); + v_store(dst + 4, m.r1); + v_store(dst + 8, m.r2); + v_store(dst + 12, m.r3); + } + + inline f32x4x4 m_identity() + { + f32x4x4 m; + m.r0 = {1,0,0,0}; + m.r1 = {0,1,0,0}; + m.r2 = {0,0,1,0}; + m.r3 = {0,0,0,1}; + return m; + } + + inline f32x4x4 m_zero() + { + f32x4 z = v_zero(); + return { z, z, z, z }; + } + + inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot4(m.r0, v); + f32x4 y = v_dot4(m.r1, v); + f32x4 z = v_dot4(m.r2, v); + f32x4 w = v_dot4(m.r3, v); + + return { + v_extract0(x), + v_extract0(y), + v_extract0(z), + v_extract0(w) + }; + } + + inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot3(m.r0, v); + f32x4 y = v_dot3(m.r1, v); + f32x4 z = 
v_dot3(m.r2, v); + + return { + v_extract0(x), + v_extract0(y), + v_extract0(z), + 0.0f + }; + } + + inline f32x4x4 m_transpose(const f32x4x4& m) + { + float32x4x2_t t0 = vtrnq_f32(m.r0, m.r1); + float32x4x2_t t1 = vtrnq_f32(m.r2, m.r3); + + float32x2_t a0 = vget_low_f32(t0.val[0]); + float32x2_t a1 = vget_high_f32(t0.val[0]); + float32x2_t a2 = vget_low_f32(t1.val[0]); + float32x2_t a3 = vget_high_f32(t1.val[0]); + + float32x2_t b0 = vget_low_f32(t0.val[1]); + float32x2_t b1 = vget_high_f32(t0.val[1]); + float32x2_t b2 = vget_low_f32(t1.val[1]); + float32x2_t b3 = vget_high_f32(t1.val[1]); + + f32x4x4 out; + + out.r0 = vcombine_f32(a0, a2); + out.r1 = vcombine_f32(b0, b2); + out.r2 = vcombine_f32(a1, a3); + out.r3 = vcombine_f32(b1, b3); + + return out; + } + + inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b) + { + f32x4x4 bt = m_transpose(b); + + auto mul_row = [&](f32x4 row) + { + f32x4 x = v_dot4(row, bt.r0); + f32x4 y = v_dot4(row, bt.r1); + f32x4 z = v_dot4(row, bt.r2); + f32x4 w = v_dot4(row, bt.r3); + + return f32x4{ + v_extract0(x), + v_extract0(y), + v_extract0(z), + v_extract0(w) + }; + }; + + f32x4x4 C; + C.r0 = mul_row(a.r0); + C.r1 = mul_row(a.r1); + C.r2 = mul_row(a.r2); + C.r3 = mul_row(a.r3); + + return C; + } + + inline f32x4 m_determinant(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(c, d); + f32x4 c1 = v_cross(d, b); + f32x4 c2 = v_cross(b, c); + + f32x4 term0 = vmulq_f32(a, c0); + f32x4 term1 = vmulq_f32(a, c1); + f32x4 term2 = vmulq_f32(a, c2); + + f32x4 det = vaddq_f32(term0, vaddq_f32(term1, term2)); + + return v_hadd4(det); + } + + inline f32x4 m_determinant_affine(const f32x4x4& m) + { + f32x4 r0 = v_and(m.r0, v_mask_xyz()); + f32x4 r1 = v_and(m.r1, v_mask_xyz()); + f32x4 r2 = v_and(m.r2, v_mask_xyz()); + + f32x4 c0 = v_cross(r1, r2); + return v_dot3(r0, c0); + } + + inline f32x4x4 m_inverse(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = 
m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(b, c); + f32x4 c1 = v_cross(c, d); + f32x4 c2 = v_cross(d, a); + f32x4 c3 = v_cross(a, b); + + f32x4 det = v_dot4(a, c1); + f32x4 invDet = v_rcp_nr(det); + + f32x4x4 adj; + adj.r0 = vmulq_f32(c1, invDet); + adj.r1 = vmulq_f32(c2, invDet); + adj.r2 = vmulq_f32(c3, invDet); + adj.r3 = vmulq_f32(c0, invDet); + + return m_transpose(adj); + } + + struct vec4_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + f32x4 w; + }; + + inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW) + { + vec4_batch4 r; + + r.x = (f32x4){ ptr[0], ptr[3], ptr[6], ptr[9] }; + r.y = (f32x4){ ptr[1], ptr[4], ptr[7], ptr[10] }; + r.z = (f32x4){ ptr[2], ptr[5], ptr[8], ptr[11] }; + + if (fillW) + { + r.w = vdupq_n_f32(w); + } + + return r; + } + + inline void store_vec3_batch4(float* out, const vec4_batch4& v) + { + alignas(16) float xs[4]; + alignas(16) float ys[4]; + alignas(16) float zs[4]; + + vst1q_f32(xs, v.x); + vst1q_f32(ys, v.y); + vst1q_f32(zs, v.z); + + for (int i = 0; i < 4; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v) + { + vec4_batch4 r; + + float32x4_t m00 = vdupq_n_f32(m[0]); + float32x4_t m01 = vdupq_n_f32(m[1]); + float32x4_t m02 = vdupq_n_f32(m[2]); + float32x4_t m03 = vdupq_n_f32(m[3]); + + float32x4_t m10 = vdupq_n_f32(m[4]); + float32x4_t m11 = vdupq_n_f32(m[5]); + float32x4_t m12 = vdupq_n_f32(m[6]); + float32x4_t m13 = vdupq_n_f32(m[7]); + + float32x4_t m20 = vdupq_n_f32(m[8]); + float32x4_t m21 = vdupq_n_f32(m[9]); + float32x4_t m22 = vdupq_n_f32(m[10]); + float32x4_t m23 = vdupq_n_f32(m[11]); + + // row0 dot + r.x = vaddq_f32( + vaddq_f32( + vmulq_f32(v.x, m00), + vmulq_f32(v.y, m01)), + vaddq_f32( + vmulq_f32(v.z, m02), + m03)); + + // row1 dot + r.y = vaddq_f32( + vaddq_f32( + vmulq_f32(v.x, m10), + vmulq_f32(v.y, m11)), + vaddq_f32( + vmulq_f32(v.z, m12), + m13)); + + // row2 dot 
+ r.z = vaddq_f32( + vaddq_f32( + vmulq_f32(v.x, m20), + vmulq_f32(v.y, m21)), + vaddq_f32( + vmulq_f32(v.z, m22), + m23)); + + return r; + } + +} diff --git a/Engine/source/math/isa/sse2/float3.cpp b/Engine/source/math/isa/sse2/float3.cpp new file mode 100644 index 000000000..bc822e8fc --- /dev/null +++ b/Engine/source/math/isa/sse2/float3.cpp @@ -0,0 +1,26 @@ +#include "sse2_intrinsics.h" +#include "float3_dispatch.h" +#include // SSE2 intrinsics + +#include "float3_impl.inl" + +namespace math_backend::float3::dispatch +{ + // Install SSE2 backend + void install_sse2() + { + gFloat3.add = float3_add_impl; + gFloat3.sub = float3_sub_impl; + gFloat3.mul = float3_mul_impl; + gFloat3.mul_scalar = float3_mul_scalar_impl; + gFloat3.div = float3_div_impl; + gFloat3.div_scalar = float3_div_scalar_impl; + gFloat3.dot = float3_dot_impl; + gFloat3.length = float3_length_impl; + gFloat3.lengthSquared = float3_length_squared_impl; + gFloat3.normalize = float3_normalize_impl; + gFloat3.normalize_mag = float3_normalize_mag_impl; + gFloat3.lerp = float3_lerp_impl; + gFloat3.cross = float3_cross_impl; + } +} diff --git a/Engine/source/math/isa/sse2/float4.cpp b/Engine/source/math/isa/sse2/float4.cpp new file mode 100644 index 000000000..aa986474b --- /dev/null +++ b/Engine/source/math/isa/sse2/float4.cpp @@ -0,0 +1,26 @@ +#include "sse2_intrinsics.h" +#include "float4_dispatch.h" + +#include "float4_impl.inl" + + +namespace math_backend::float4::dispatch +{ + // Install SSE2 backend + void install_sse2() + { + gFloat4.add = float4_add_impl; + gFloat4.sub = float4_sub_impl; + gFloat4.mul = float4_mul_impl; + gFloat4.mul_scalar = float4_mul_scalar_impl; + gFloat4.div = float4_div_impl; + gFloat4.div_scalar = float4_div_scalar_impl; + gFloat4.dot = float4_dot_impl; + gFloat4.length = float4_length_impl; + gFloat4.lengthSquared = float4_length_squared_impl; + gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; + gFloat4.lerp = 
float4_lerp_impl; + gFloat4.cross = float4_cross_impl; + } +} diff --git a/Engine/source/math/isa/sse2/mat44.cpp b/Engine/source/math/isa/sse2/mat44.cpp new file mode 100644 index 000000000..d8d06ea58 --- /dev/null +++ b/Engine/source/math/isa/sse2/mat44.cpp @@ -0,0 +1,25 @@ +#include "sse2_intrinsics.h" +#include "mat44_dispatch.h" + +#include "mat44_impl.inl" + +namespace math_backend::mat44::dispatch +{ + void install_sse2() + { + gMat44.transpose = mat44_transpose_impl; + gMat44.inverse = mat44_inverse_impl; + gMat44.inverse_to = mat44_inverse_to_impl; + gMat44.affine_inverse = mat44_affine_inverse_impl; + gMat44.mul_mat44 = mat44_mul_mat44_impl; + gMat44.mul_pos3 = mat44_mul_pos3_impl; + gMat44.mul_vec3 = mat44_mul_vec3_impl; + gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.transform_plane = mat44_transform_plane_impl; + gMat44.scale = mat44_scale_impl; + gMat44.get_scale = mat44_get_scale_impl; + gMat44.normalize = mat44_normalize_impl; + gMat44.determinant = mat44_get_determinant; + gMat44.batch_mul_pos3 = mat44_batch_mul_pos3; + } +} diff --git a/Engine/source/math/isa/sse2/sse2_intrinsics.h b/Engine/source/math/isa/sse2/sse2_intrinsics.h new file mode 100644 index 000000000..a582d3793 --- /dev/null +++ b/Engine/source/math/isa/sse2/sse2_intrinsics.h @@ -0,0 +1,655 @@ +#pragma once +#include // SSE2 +#include // SSE + +namespace +{ + typedef __m128 f32x4; + + //------------------------------------------------------ + // Vector/Point Load / Store + //------------------------------------------------------ + + // Load 4 floats from memory into a SIMD register + inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + inline f32x4 v_zero() { return _mm_setzero_ps(); } + + inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); } + + //------------------------------------------------------ + // Float3 helpers 
(safe loading into 4 lanes) + //------------------------------------------------------ + + inline f32x4 v_load3_vec(const float* p) // w = 0 + { + return _mm_set_ps(0.0f, p[2], p[1], p[0]); + } + + inline f32x4 v_load3_pos(const float* p) // w = 1 + { + return _mm_set_ps(1.0f, p[2], p[1], p[0]); + } + + inline f32x4 v_set(float x, float y, float z, float w) + { + return _mm_set_ps(w, z, y, x); + } + + inline f32x4 v_insert_w(f32x4 v, f32x4 w) + { + f32x4 mask = _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)); + return _mm_or_ps( _mm_and_ps(mask, w), _mm_andnot_ps(mask, v)); + } + + inline void v_store3(float* dst, f32x4 v) + { + alignas(16) float tmp[4]; // temp storage + _mm_store_ps(tmp, v); // store all 4 lanes + dst[0] = tmp[0]; + dst[1] = tmp[1]; + dst[2] = tmp[2]; + } + + //------------------------------------------------------ + // Mask helpers + //------------------------------------------------------ + + inline f32x4 v_mask_xyz() { return _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)); } + + inline f32x4 v_swizzle_mask(f32x4 v, int x, int y, int z, int w) + { + alignas(16) float tmp[4]; + _mm_store_ps(tmp, v); + + return _mm_set_ps( + tmp[w], + tmp[z], + tmp[y], + tmp[x] + ); + } + + inline f32x4 v_swizzle_singular_mask(f32x4 v, int x) + { + alignas(16) float tmp[4]; + _mm_store_ps(tmp, v); + return _mm_set1_ps(tmp[x]); + } + + inline f32x4 v_swizzle_lo(f32x4 v1) + { + return _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2,2,0,0)); + } + + inline f32x4 v_swizzle_hi(f32x4 v1) + { + return _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 3, 1, 1)); + } + + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) + { + f32x4 mask = _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)); // lane3 = 1 + return _mm_or_ps(_mm_and_ps(mask, original), _mm_andnot_ps(mask, newv)); + } + + //------------------------------------------------------ + // Shuffle helpers + //------------------------------------------------------ + + inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w) + 
{ + alignas(16) float a[4], b[4]; + _mm_store_ps(a, v1); + _mm_store_ps(b, v2); + + return _mm_set_ps( + b[w], + b[z], + a[y], + a[x] + ); + } + + inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2) + { + return _mm_movelh_ps(v1, v2); + } + + inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2) + { + return _mm_movehl_ps(v2, v1); + } + + + //------------------------------------------------------ + // Simple Arithmatic + //------------------------------------------------------ + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + // Absolute value. + inline f32x4 v_abs(f32x4 v) + { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(v, mask); + } + + //------------------------------------------------------ + // Fast recip + //------------------------------------------------------ + + // Fast recip 1/b + inline f32x4 v_rcp_nr(f32x4 b) + { + f32x4 r = _mm_rcp_ps(b); + f32x4 two = _mm_set1_ps(2.0f); + r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); + } + + // Divide fast ( b = recip eg 1/b) + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); } + + inline f32x4 v_rsqrt_nr(f32x4 x) + { + f32x4 half = _mm_set1_ps(0.5f); + f32x4 three = _mm_set1_ps(3.0f); + + f32x4 r = _mm_rsqrt_ps(x); + f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r); + return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr)); + } + + //------------------------------------------------------ + // Vector intrinsic functions + //------------------------------------------------------ + + // full dot4 + inline f32x4 v_dot4(f32x4 a, f32x4 b) 
+ { + f32x4 prod = _mm_mul_ps(a, b); // multiply element-wise + f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(0, 0, 3, 2)); + prod = _mm_add_ps(prod, shuf); + shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(0, 0, 0, 1)); + prod = _mm_add_ps(prod, shuf); + return prod; // f32x4, all lanes = dot(a,b) + } + + // dot3 (ignores w) + inline f32x4 v_dot3(f32x4 a, f32x4 b) + { + f32x4 prod = _mm_mul_ps(a, b); + prod = _mm_and_ps(prod, v_mask_xyz()); // zero w + f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1)); + prod = _mm_add_ps(prod, shuf); + shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2)); + prod = _mm_add_ps(prod, shuf); + return prod; // f32x4, all lanes = dot(a,b) + } + + // cross product xyz only. + inline f32x4 v_cross(f32x4 a, f32x4 b) + { + f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); + f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); + + f32x4 c = _mm_sub_ps( _mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b)); + + return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1)); + } + + inline f32x4 v_normalize3(f32x4 v) + { + const f32x4 zero = _mm_setzero_ps(); + const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0} + f32x4 dot = v_dot3(v, v); + + f32x4 inv = v_rsqrt_nr(dot); + f32x4 isZero = _mm_cmpeq_ps(dot, zero); + + f32x4 norm = _mm_mul_ps(v, inv); + + // vbsl equivalent + f32x4 result = _mm_or_ps( + _mm_and_ps(isZero, fallback), + _mm_andnot_ps(isZero, norm) + ); + + return result; + } + + // adds all 4 lanes together. 
+ inline f32x4 v_hadd4(f32x4 a) + { + // sum all 4 lanes in SSE2 + __m128 shuf = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); // swap pairs + __m128 sums = _mm_add_ps(a, shuf); + shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)); + return _mm_add_ps(sums, shuf); + } + + //------------------------------------------------------ + // Matrix type (row-major 4x4) + //------------------------------------------------------ + + struct f32x4x4 + { + f32x4 r0; + f32x4 r1; + f32x4 r2; + f32x4 r3; + }; + + //------------------------------------------------------ + // Matrix Load / Store + //------------------------------------------------------ + + inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major) + { + f32x4x4 out; + out.r0 = v_load(m + 0); + out.r1 = v_load(m + 4); + out.r2 = v_load(m + 8); + out.r3 = v_load(m + 12); + return out; + } + + inline void m_store(float* dst, const f32x4x4& m) + { + v_store(dst + 0, m.r0); + v_store(dst + 4, m.r1); + v_store(dst + 8, m.r2); + v_store(dst + 12, m.r3); + } + + inline f32x4x4 m_identity() + { + f32x4x4 m; + m.r0 = _mm_set_ps(0, 0, 0, 1); + m.r1 = _mm_set_ps(0, 0, 1, 0); + m.r2 = _mm_set_ps(0, 1, 0, 0); + m.r3 = _mm_set_ps(1, 0, 0, 0); + return m; + } + + inline f32x4x4 m_zero() + { + f32x4 z = v_zero(); + return { z, z, z, z }; + } + + //------------------------------------------------------ + // Matrix × Vector + //------------------------------------------------------ + + inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot4(m.r0, v); + f32x4 y = v_dot4(m.r1, v); + f32x4 z = v_dot4(m.r2, v); + f32x4 w = v_dot4(m.r3, v); + + // combine to a vector + return _mm_set_ps(_mm_cvtss_f32(w), + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot3(m.r0, v); + f32x4 y = v_dot3(m.r1, v); + f32x4 z = v_dot3(m.r2, v); + + // combine to a vector + return _mm_set_ps(0.0f, + _mm_cvtss_f32(z), + 
_mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + //------------------------------------------------------ + // Matrix × Matrix + //------------------------------------------------------ + + inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b) + { + // Transpose B once for dot products + f32x4 b0 = b.r0; + f32x4 b1 = b.r1; + f32x4 b2 = b.r2; + f32x4 b3 = b.r3; + + // Transpose (SSE2) + _MM_TRANSPOSE4_PS(b0, b1, b2, b3); + + f32x4x4 C; + + auto mul_row = [&](f32x4 arow) + { + f32x4 x = v_dot4(arow, b0); + f32x4 y = v_dot4(arow, b1); + f32x4 z = v_dot4(arow, b2); + f32x4 w = v_dot4(arow, b3); + + f32x4 xy = _mm_unpacklo_ps(x, y); + f32x4 zw = _mm_unpacklo_ps(z, w); + return _mm_movelh_ps(xy, zw); + }; + + C.r0 = mul_row(a.r0); + C.r1 = mul_row(a.r1); + C.r2 = mul_row(a.r2); + C.r3 = mul_row(a.r3); + + return C; + } + + //------------------------------------------------------ + // Transpose + //------------------------------------------------------ + + inline f32x4x4 m_transpose(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + f32x4 r3 = m.r3; + + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + + return { r0, r1, r2, r3 }; + } + + inline void m_set3x3_transpose(float* dst, f32x4 c0, f32x4 c1, f32x4 c2) + { + dst[0] = v_extract0(c0); // c0.x + dst[1] = v_extract0(c1); // c1.x + dst[2] = v_extract0(c2); // c2.x + + dst[4] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 1))); // c0.y + dst[5] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 1))); // c1.y + dst[6] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 1))); // c2.y + + dst[8] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 2))); // c0.z + dst[9] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 2))); // c1.z + dst[10] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 2))); // c2.z + } + + inline f32x4x4 m_inverse(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = 
v_cross(b, c); + f32x4 c1 = v_cross(c, d); + f32x4 c2 = v_cross(d, a); + f32x4 c3 = v_cross(a, b); + + f32x4 det = v_dot4(a, c1); + f32x4 invDet = v_rcp_nr(det); + + f32x4x4 adj; + adj.r0 = _mm_mul_ps(c1, invDet); + adj.r1 = _mm_mul_ps(c2, invDet); + adj.r2 = _mm_mul_ps(c3, invDet); + adj.r3 = _mm_mul_ps(c0, invDet); + + return m_transpose(adj); + } + + inline f32x4x4 m_inverse_rigid(const f32x4x4& m) + { + f32x4x4 t = m_transpose(m); + + // Extract translation + f32x4 T = v_set( + _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))), + 0.0f + ); + + f32x4 newT = m_mul_vec4(t, T); + newT = _mm_sub_ps(v_zero(), newT); + + t.r0 = v_preserve_w(t.r0, newT); + t.r1 = v_preserve_w(t.r1, newT); + t.r2 = v_preserve_w(t.r2, newT); + t.r3 = v_set(0, 0, 0, 1); + + return t; + } + + inline f32x4x4 m_inverse_affine(const f32x4x4& m) + { + // Extract upper 3x3 rows + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + // Zero translation + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); + + // Compute cofactors via cross products + f32x4 c0 = v_cross(r1, r2); + f32x4 c1 = v_cross(r2, r0); + f32x4 c2 = v_cross(r0, r1); + + // Determinant + f32x4 det = v_dot3(r0, c0); + + // Reciprocal determinant + f32x4 invDet = v_rcp_nr(det); + + // Inverse 3x3 (transpose of cofactor matrix) + f32x4 i0 = _mm_mul_ps(c0, invDet); + f32x4 i1 = _mm_mul_ps(c1, invDet); + f32x4 i2 = _mm_mul_ps(c2, invDet); + + // Transpose 3x3 + f32x4 t0 = i0; + f32x4 t1 = i1; + f32x4 t2 = i2; + f32x4 t3 = _mm_setzero_ps(); + + _MM_TRANSPOSE4_PS(t0, t1, t2, t3); + + // Extract translation + alignas(16) float tmp[4]; + tmp[0] = _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[1] = _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[2] = 
_mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[3] = 0.0f; + + f32x4 T = v_load(tmp); + + // New translation = -R' * T + f32x4 newT = m_mul_vec4({ t0, t1, t2, t3 }, T); + newT = _mm_sub_ps(v_zero(), newT); + + // Assemble final matrix + f32x4x4 out; + out.r0 = v_preserve_w(t0, newT); + out.r1 = v_preserve_w(t1, newT); + out.r2 = v_preserve_w(t2, newT); + out.r3 = _mm_set_ps(1, 0, 0, 0); + + return out; + } + + inline f32x4 m_determinant(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(c, d); + f32x4 c1 = v_cross(d, b); + f32x4 c2 = v_cross(b, c); + + f32x4 term0 = _mm_mul_ps(a, c0); + f32x4 term1 = _mm_mul_ps(a, c1); + f32x4 term2 = _mm_mul_ps(a, c2); + + f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2)); + + return v_hadd4(det); + } + + inline f32x4 m_determinant_affine(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); + + f32x4 c0 = v_cross(r1, r2); + return v_dot3(r0, c0); // splatted determinant + } + + //--------------------------------------------------------- + // BATCH INTRINSICS + //--------------------------------------------------------- + + struct vec4_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + f32x4 w; + }; + + + inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW) + { + vec4_batch4 r; + + r.x = _mm_set_ps( + ptr[9], ptr[6], ptr[3], ptr[0]); + + r.y = _mm_set_ps( + ptr[10], ptr[7], ptr[4], ptr[1]); + + r.z = _mm_set_ps( + ptr[11], ptr[8], ptr[5], ptr[2]); + + if (fillW) + { + r.w = _mm_set1_ps(w); + } + + return r; + } + + // Store the result back out to a float array 4 at a time + inline void store_f32x4(float* out, f32x4 v) + { + _mm_storeu_ps(out, v); + } + + // Store the result back to a float3 array with size of 4 + inline void store_vec3_batch4(float* out, const vec4_batch4& v) + { + 
alignas(16) float xs[8]; + alignas(16) float ys[8]; + alignas(16) float zs[8]; + + _mm_store_ps(xs, v.x); + _mm_store_ps(ys, v.y); + _mm_store_ps(zs, v.z); + + for (int i = 0; i < 4; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b) + { + f32x4 mulx = _mm_mul_ps(a.x, b.x); + f32x4 muly = _mm_mul_ps(a.y, b.y); + f32x4 mulz = _mm_mul_ps(a.z, b.z); + + return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); + } + + // Batch 4 mul_Vec4. + inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v) + { + vec4_batch4 r; + + f32x4 m00 = _mm_set1_ps(m[0]); + f32x4 m01 = _mm_set1_ps(m[1]); + f32x4 m02 = _mm_set1_ps(m[2]); + f32x4 m03 = _mm_set1_ps(m[3]); + + f32x4 m10 = _mm_set1_ps(m[4]); + f32x4 m11 = _mm_set1_ps(m[5]); + f32x4 m12 = _mm_set1_ps(m[6]); + f32x4 m13 = _mm_set1_ps(m[7]); + + f32x4 m20 = _mm_set1_ps(m[8]); + f32x4 m21 = _mm_set1_ps(m[9]); + f32x4 m22 = _mm_set1_ps(m[10]); + f32x4 m23 = _mm_set1_ps(m[11]); + + // + // row0 dot + // + r.x = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m00), + _mm_mul_ps(v.y, m01)), + _mm_add_ps( + _mm_mul_ps(v.z, m02), m03)); + + // + // row1 dot + // + r.y = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m10), + _mm_mul_ps(v.y, m11)), + _mm_add_ps( + _mm_mul_ps(v.z, m12), m13)); + + // + // row2 dot + // + r.z = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m20), + _mm_mul_ps(v.y, m21)), + _mm_add_ps( + _mm_mul_ps(v.z, m22), m23)); + + return r; + } +} diff --git a/Engine/source/math/isa/sse41/float3.cpp b/Engine/source/math/isa/sse41/float3.cpp new file mode 100644 index 000000000..adf1a3a3d --- /dev/null +++ b/Engine/source/math/isa/sse41/float3.cpp @@ -0,0 +1,26 @@ +#include "sse41_intrinsics.h" +#include "float3_dispatch.h" +#include // SSE41 intrinsics + +#include "float3_impl.inl" + +namespace math_backend::float3::dispatch +{ + // Install SSE41 backend + void install_sse41() + { + gFloat3.add = 
float3_add_impl; + gFloat3.sub = float3_sub_impl; + gFloat3.mul = float3_mul_impl; + gFloat3.mul_scalar = float3_mul_scalar_impl; + gFloat3.div = float3_div_impl; + gFloat3.div_scalar = float3_div_scalar_impl; + gFloat3.dot = float3_dot_impl; + gFloat3.length = float3_length_impl; + gFloat3.lengthSquared = float3_length_squared_impl; + gFloat3.normalize = float3_normalize_impl; + gFloat3.normalize_mag = float3_normalize_mag_impl; + gFloat3.lerp = float3_lerp_impl; + gFloat3.cross = float3_cross_impl; + } +} diff --git a/Engine/source/math/isa/sse41/float4.cpp b/Engine/source/math/isa/sse41/float4.cpp new file mode 100644 index 000000000..e9ca8aae2 --- /dev/null +++ b/Engine/source/math/isa/sse41/float4.cpp @@ -0,0 +1,25 @@ +#include "sse41_intrinsics.h" +#include "float4_dispatch.h" + +#include "float4_impl.inl" + +namespace math_backend::float4::dispatch +{ + // Install SSE41 backend + void install_sse41() + { + gFloat4.add = float4_add_impl; + gFloat4.sub = float4_sub_impl; + gFloat4.mul = float4_mul_impl; + gFloat4.mul_scalar = float4_mul_scalar_impl; + gFloat4.div = float4_div_impl; + gFloat4.div_scalar = float4_div_scalar_impl; + gFloat4.dot = float4_dot_impl; + gFloat4.length = float4_length_impl; + gFloat4.lengthSquared = float4_length_squared_impl; + gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; + gFloat4.lerp = float4_lerp_impl; + gFloat4.cross = float4_cross_impl; + } +} diff --git a/Engine/source/math/isa/sse41/mat44.cpp b/Engine/source/math/isa/sse41/mat44.cpp new file mode 100644 index 000000000..f9e33138b --- /dev/null +++ b/Engine/source/math/isa/sse41/mat44.cpp @@ -0,0 +1,25 @@ +#include "sse41_intrinsics.h" +#include "mat44_dispatch.h" + +#include "mat44_impl.inl" + +namespace math_backend::mat44::dispatch +{ + void install_sse41() + { + gMat44.transpose = mat44_transpose_impl; + gMat44.inverse = mat44_inverse_impl; + gMat44.inverse_to = mat44_inverse_to_impl; + gMat44.affine_inverse = 
mat44_affine_inverse_impl; + gMat44.mul_mat44 = mat44_mul_mat44_impl; + gMat44.mul_pos3 = mat44_mul_pos3_impl; + gMat44.mul_vec3 = mat44_mul_vec3_impl; + gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.transform_plane = mat44_transform_plane_impl; + gMat44.scale = mat44_scale_impl; + gMat44.get_scale = mat44_get_scale_impl; + gMat44.normalize = mat44_normalize_impl; + gMat44.determinant = mat44_get_determinant; + gMat44.batch_mul_pos3 = mat44_batch_mul_pos3; + } +} diff --git a/Engine/source/math/isa/sse41/sse41_intrinsics.h b/Engine/source/math/isa/sse41/sse41_intrinsics.h new file mode 100644 index 000000000..80dfdbc71 --- /dev/null +++ b/Engine/source/math/isa/sse41/sse41_intrinsics.h @@ -0,0 +1,625 @@ +#pragma once +#include // SSE4.1 + +namespace +{ + typedef __m128 f32x4; + + //------------------------------------------------------ + // Load / Store + //------------------------------------------------------ + + // Load 4 floats from memory into a SIMD register + inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + inline f32x4 v_zero() { return _mm_setzero_ps(); } + + inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); } + + //------------------------------------------------------ + // Mask helpers + //------------------------------------------------------ + + inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); } + + inline f32x4 v_swizzle_mask(f32x4 v, int x, int y, int z, int w) + { + alignas(16) float tmp[4]; + _mm_store_ps(tmp, v); + + return _mm_set_ps( + tmp[w], + tmp[z], + tmp[y], + tmp[x] + ); + } + + inline f32x4 v_swizzle_singular_mask(f32x4 v, int x) + { + alignas(16) float tmp[4]; + _mm_store_ps(tmp, v); + return _mm_set1_ps(tmp[x]); + } + + inline f32x4 v_swizzle_lo(f32x4 v1) + { + return _mm_moveldup_ps(v1); + } + + inline f32x4 v_swizzle_hi(f32x4 
v1) + { + return _mm_movehdup_ps(v1); + } + + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) + { + return _mm_blend_ps(newv, original, 0b1000); + } + + //------------------------------------------------------ + // Shuffle helpers + //------------------------------------------------------ + + inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w) + { + alignas(16) float a[4], b[4]; + _mm_store_ps(a, v1); + _mm_store_ps(b, v2); + + return _mm_set_ps( + b[w], + b[z], + a[y], + a[x] + ); + } + + inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2) + { + return _mm_movelh_ps(v1, v2); + } + + inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2) + { + return _mm_movehl_ps(v2, v1); + } + + + //------------------------------------------------------ + // Float3 helpers (safe loading into 4 lanes) + //------------------------------------------------------ + + inline f32x4 v_load3_vec(const float* p) // w = 0 + { + return _mm_set_ps(0.0f, p[2], p[1], p[0]); + } + + inline f32x4 v_load3_pos(const float* p) // w = 1 + { + return _mm_set_ps(1.0f, p[2], p[1], p[0]); + } + + inline void v_store3(float* dst, f32x4 v) + { + alignas(16) float tmp[4]; // temp storage + _mm_store_ps(tmp, v); // store all 4 lanes + dst[0] = tmp[0]; + dst[1] = tmp[1]; + dst[2] = tmp[2]; + } + + inline f32x4 v_set(float x, float y, float z, float w) + { + return _mm_set_ps(w, z, y, x); + } + + inline f32x4 v_insert_w(f32x4 v, f32x4 w) + { + return _mm_insert_ps(v, w, 0x30); + } + + //------------------------------------------------------ + // Simple Arithmatic + //------------------------------------------------------ + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return 
_mm_sub_ps(a, b); } + + //------------------------------------------------------ + // Fast recip + //------------------------------------------------------ + + // Fast recip 1/b + inline f32x4 v_rcp_nr(f32x4 b) + { + f32x4 r = _mm_rcp_ps(b); + f32x4 two = _mm_set1_ps(2.0f); + r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); + } + + // Divide fast ( b = recip eg 1/b) + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); } + + inline f32x4 v_rsqrt_nr(f32x4 x) + { + f32x4 half = _mm_set1_ps(0.5f); + f32x4 three = _mm_set1_ps(3.0f); + + f32x4 r = _mm_rsqrt_ps(x); + f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r); + return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr)); + } + + //------------------------------------------------------ + // Vector intrinsic functions + //------------------------------------------------------ + + // full dot4 + inline f32x4 v_dot4(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into lane 1 + } + + // dot3 (ignores w) + inline f32x4 v_dot3(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into lane 1 + } + + // cross product xyz only. + inline f32x4 v_cross(f32x4 a, f32x4 b) + { + f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); + f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); + + f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b)); + + return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1)); + } + + inline f32x4 v_normalize3(f32x4 v) + { + const f32x4 zero = _mm_setzero_ps(); + const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0} + f32x4 dot = v_dot3(v, v); + + f32x4 inv = v_rsqrt_nr(dot); + f32x4 isZero = _mm_cmpeq_ps(dot, zero); + f32x4 norm = _mm_mul_ps(v, inv); + + return _mm_blendv_ps(norm, fallback, isZero); + } + + // adds all 4 lanes together. 
+ inline f32x4 v_hadd4(f32x4 a) + { + // sum all 4 lanes in SSE41 + __m128 sum = _mm_hadd_ps(a, a); + return _mm_hadd_ps(sum, sum); + } + + //------------------------------------------------------ + // Matrix type (row-major 4x4) + //------------------------------------------------------ + + struct f32x4x4 + { + f32x4 r0; + f32x4 r1; + f32x4 r2; + f32x4 r3; + }; + + //------------------------------------------------------ + // Matrix Load / Store + //------------------------------------------------------ + + inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major) + { + f32x4x4 out; + out.r0 = v_load(m + 0); + out.r1 = v_load(m + 4); + out.r2 = v_load(m + 8); + out.r3 = v_load(m + 12); + return out; + } + + inline void m_store(float* dst, const f32x4x4& m) + { + _mm_storeu_ps(dst + 0, m.r0); + _mm_storeu_ps(dst + 4, m.r1); + _mm_storeu_ps(dst + 8, m.r2); + _mm_storeu_ps(dst + 12, m.r3); + } + + inline f32x4x4 m_identity() + { + f32x4x4 m; + m.r0 = _mm_set_ps(0, 0, 0, 1); + m.r1 = _mm_set_ps(0, 0, 1, 0); + m.r2 = _mm_set_ps(0, 1, 0, 0); + m.r3 = _mm_set_ps(1, 0, 0, 0); + return m; + } + + inline f32x4x4 m_zero() + { + f32x4 z = v_zero(); + return { z, z, z, z }; + } + + //------------------------------------------------------ + // Matrix × Vector + //------------------------------------------------------ + + inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot4(m.r0, v); + f32x4 y = v_dot4(m.r1, v); + f32x4 z = v_dot4(m.r2, v); + f32x4 w = v_dot4(m.r3, v); + + // combine to a vector + return _mm_set_ps(_mm_cvtss_f32(w), + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) + { + f32x4 x = v_dot3(m.r0, v); + f32x4 y = v_dot3(m.r1, v); + f32x4 z = v_dot3(m.r2, v); + + // combine to a vector + return _mm_set_ps(0.0f, + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); + } + + //------------------------------------------------------ + // Matrix × Matrix + 
//------------------------------------------------------ + + inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b) + { + // Transpose B once for dot products + f32x4 b0 = b.r0; + f32x4 b1 = b.r1; + f32x4 b2 = b.r2; + f32x4 b3 = b.r3; + + // Transpose (SSE2) + _MM_TRANSPOSE4_PS(b0, b1, b2, b3); + + f32x4x4 C; + + auto mul_row = [&](f32x4 arow) + { + f32x4 x = v_dot4(arow, b0); + f32x4 y = v_dot4(arow, b1); + f32x4 z = v_dot4(arow, b2); + f32x4 w = v_dot4(arow, b3); + + f32x4 xy = _mm_unpacklo_ps(x, y); + f32x4 zw = _mm_unpacklo_ps(z, w); + return _mm_movelh_ps(xy, zw); + }; + + C.r0 = mul_row(a.r0); + C.r1 = mul_row(a.r1); + C.r2 = mul_row(a.r2); + C.r3 = mul_row(a.r3); + + return C; + } + + //------------------------------------------------------ + // Transpose + //------------------------------------------------------ + + inline f32x4x4 m_transpose(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + f32x4 r3 = m.r3; + + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + + return { r0, r1, r2, r3 }; + } + + inline void m_set3x3_transpose(float* dst, f32x4 c0, f32x4 c1, f32x4 c2) + { + dst[0] = v_extract0(c0); // c0.x + dst[1] = v_extract0(c1); // c1.x + dst[2] = v_extract0(c2); // c2.x + + dst[4] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 1))); // c0.y + dst[5] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 1))); // c1.y + dst[6] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 1))); // c2.y + + dst[8] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 2))); // c0.z + dst[9] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 2))); // c1.z + dst[10] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 2))); // c2.z + } + + inline f32x4x4 m_inverse(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(b, c); + f32x4 c1 = v_cross(c, d); + f32x4 c2 = v_cross(d, a); + f32x4 c3 = v_cross(a, b); + + f32x4 det = v_dot4(a, c1); + 
f32x4 invDet = v_rcp_nr(det); + + f32x4x4 adj; + adj.r0 = _mm_mul_ps(c1, invDet); + adj.r1 = _mm_mul_ps(c2, invDet); + adj.r2 = _mm_mul_ps(c3, invDet); + adj.r3 = _mm_mul_ps(c0, invDet); + + return m_transpose(adj); + } + + inline f32x4x4 m_inverse_rigid(const f32x4x4& m) + { + f32x4x4 t = m_transpose(m); + + // Extract translation + f32x4 T = v_set( + _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))), + _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))), + 0.0f + ); + + f32x4 newT = m_mul_vec4(t, T); + newT = _mm_sub_ps(v_zero(), newT); + + t.r0 = v_preserve_w(t.r0, newT); + t.r1 = v_preserve_w(t.r1, newT); + t.r2 = v_preserve_w(t.r2, newT); + t.r3 = v_set(0, 0, 0, 1); + + return t; + } + + inline f32x4x4 m_inverse_affine(const f32x4x4& m) + { + // Extract upper 3x3 rows + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + // Zero translation + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); + + // Compute cofactors via cross products + f32x4 c0 = v_cross(r1, r2); + f32x4 c1 = v_cross(r2, r0); + f32x4 c2 = v_cross(r0, r1); + + // Determinant + f32x4 det = v_dot3(r0, c0); + + // Reciprocal determinant + f32x4 invDet = v_rcp_nr(det); + + // Inverse 3x3 (transpose of cofactor matrix) + f32x4 i0 = _mm_mul_ps(c0, invDet); + f32x4 i1 = _mm_mul_ps(c1, invDet); + f32x4 i2 = _mm_mul_ps(c2, invDet); + + // Transpose 3x3 + f32x4 t0 = i0; + f32x4 t1 = i1; + f32x4 t2 = i2; + f32x4 t3 = _mm_setzero_ps(); + + _MM_TRANSPOSE4_PS(t0, t1, t2, t3); + + // Extract translation + alignas(16) float tmp[4]; + tmp[0] = _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[1] = _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[2] = _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))); + tmp[3] = 0.0f; + + f32x4 T = v_load(tmp); + + // New translation = -R' * T + 
f32x4 newT = m_mul_vec4({ t0, t1, t2, t3 }, T); + newT = _mm_sub_ps(v_zero(), newT); + + // Assemble final matrix + f32x4x4 out; + out.r0 = v_preserve_w(t0, newT); + out.r1 = v_preserve_w(t1, newT); + out.r2 = v_preserve_w(t2, newT); + out.r3 = _mm_set_ps(1, 0, 0, 0); + + return out; + } + + inline f32x4 m_determinant(const f32x4x4& m) + { + f32x4 a = m.r0; + f32x4 b = m.r1; + f32x4 c = m.r2; + f32x4 d = m.r3; + + f32x4 c0 = v_cross(c, d); + f32x4 c1 = v_cross(d, b); + f32x4 c2 = v_cross(b, c); + + f32x4 term0 = _mm_mul_ps(a, c0); + f32x4 term1 = _mm_mul_ps(a, c1); + f32x4 term2 = _mm_mul_ps(a, c2); + + f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2)); + + return v_hadd4(det); + } + + inline f32x4 m_determinant_affine(const f32x4x4& m) + { + f32x4 r0 = m.r0; + f32x4 r1 = m.r1; + f32x4 r2 = m.r2; + + r0 = _mm_and_ps(r0, v_mask_xyz()); + r1 = _mm_and_ps(r1, v_mask_xyz()); + r2 = _mm_and_ps(r2, v_mask_xyz()); + + f32x4 c0 = v_cross(r1, r2); + return v_dot3(r0, c0); // splatted determinant + } + + //--------------------------------------------------------- + // BATCH INTRINSICS + //--------------------------------------------------------- + + struct vec4_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + f32x4 w; + }; + + + inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW) + { + vec4_batch4 r; + + r.x = _mm_set_ps( + ptr[9], ptr[6], ptr[3], ptr[0]); + + r.y = _mm_set_ps( + ptr[10], ptr[7], ptr[4], ptr[1]); + + r.z = _mm_set_ps( + ptr[11], ptr[8], ptr[5], ptr[2]); + + if (fillW) + { + r.w = _mm_set1_ps(w); + } + + return r; + } + + // Store the result back out to a float array 4 at a time + inline void store_f32x4(float* out, f32x4 v) + { + _mm_storeu_ps(out, v); + } + + // Store the result back to a float3 array with size of 4 + inline void store_vec3_batch4(float* out, const vec4_batch4& v) + { + alignas(16) float xs[8]; + alignas(16) float ys[8]; + alignas(16) float zs[8]; + + _mm_store_ps(xs, v.x); + _mm_store_ps(ys, v.y); + 
_mm_store_ps(zs, v.z); + + for (int i = 0; i < 4; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b) + { + f32x4 mulx = _mm_mul_ps(a.x, b.x); + f32x4 muly = _mm_mul_ps(a.y, b.y); + f32x4 mulz = _mm_mul_ps(a.z, b.z); + + return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); + } + + // Batch 4 mul_Vec4. + inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v) + { + vec4_batch4 r; + + f32x4 m00 = _mm_set1_ps(m[0]); + f32x4 m01 = _mm_set1_ps(m[1]); + f32x4 m02 = _mm_set1_ps(m[2]); + f32x4 m03 = _mm_set1_ps(m[3]); + + f32x4 m10 = _mm_set1_ps(m[4]); + f32x4 m11 = _mm_set1_ps(m[5]); + f32x4 m12 = _mm_set1_ps(m[6]); + f32x4 m13 = _mm_set1_ps(m[7]); + + f32x4 m20 = _mm_set1_ps(m[8]); + f32x4 m21 = _mm_set1_ps(m[9]); + f32x4 m22 = _mm_set1_ps(m[10]); + f32x4 m23 = _mm_set1_ps(m[11]); + + // + // row0 dot + // + r.x = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m00), + _mm_mul_ps(v.y, m01)), + _mm_add_ps( + _mm_mul_ps(v.z, m02), m03)); + + // + // row1 dot + // + r.y = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m10), + _mm_mul_ps(v.y, m11)), + _mm_add_ps( + _mm_mul_ps(v.z, m12), m13)); + + // + // row2 dot + // + r.z = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m20), + _mm_mul_ps(v.y, m21)), + _mm_add_ps( + _mm_mul_ps(v.z, m22), m23)); + + return r; + } +} diff --git a/Engine/source/math/mMathFn.h b/Engine/source/math/mMathFn.h index 9f84b37df..241a9a393 100644 --- a/Engine/source/math/mMathFn.h +++ b/Engine/source/math/mMathFn.h @@ -151,7 +151,7 @@ inline void m_matF_x_vectorF(const F32 *m, const F32 *v, F32 *vresult) inline bool mIsEqual( F32 a, F32 b, const F32 epsilon = __EQUAL_CONST_F ) { F32 diff = a - b; - return diff > -epsilon && diff < epsilon; + return diff > -epsilon && diff < epsilon; } inline bool mIsZero(const F32 val, const F32 epsilon = __EQUAL_CONST_F ) @@ -207,16 +207,16 @@ inline F32 mFmod(const F32 val, const F32 mod) 
return fmod(val, mod); } -inline S32 mRound(const F32 val) -{ - return (S32)floor(val + 0.5f); -} +inline S32 mRound(const F32 val) +{ + return (S32)floor(val + 0.5f); +} -inline F32 mRound(const F32 val, const S32 n) -{ - S32 place = (S32) pow(10.0f, n); +inline F32 mRound(const F32 val, const S32 n) +{ + S32 place = (S32) pow(10.0f, n); - return mFloor((val*place)+0.5)/place; + return mFloor((val*place)+0.5)/place; } inline F32 mRoundF(const F32 val, const F32 step) @@ -261,15 +261,15 @@ inline F32 mClampF(F32 val, F32 low, F32 high) inline S32 mWrap(S32 val, S32 low, S32 high) { - int len = high - low; - return low + (val >= 0 ? val % len : -val % len ? len - (-val % len) : 0); + int len = high - low; + return low + (val >= 0 ? val % len : -val % len ? len - (-val % len) : 0); } inline F32 mWrapF(F32 val, F32 low, F32 high) { - F32 t = fmod(val - low, high - low); - return t < 0 ? t + high : t + low; + F32 t = fmod(val - low, high - low); + return t < 0 ? t + high : t + low; } /// Template function for doing a linear interpolation between any two @@ -497,7 +497,7 @@ inline F64 mRadToDeg(F64 r) inline bool mIsNaN_F( const F32 x ) { - // If x is a floating point variable, then (x != x) will be TRUE if x has the value NaN. + // If x is a floating point variable, then (x != x) will be TRUE if x has the value NaN. // This is only going to work if the compiler is IEEE 748 compliant. 
// // Tested and working on VC2k5 diff --git a/Engine/source/math/mMath_C.cpp b/Engine/source/math/mMath_C.cpp index 7fac832f9..07f63dc38 100644 --- a/Engine/source/math/mMath_C.cpp +++ b/Engine/source/math/mMath_C.cpp @@ -152,8 +152,8 @@ static void m_point3F_interpolate_C(const F32 *from, const F32 *to, F32 factor, #ifdef TORQUE_COMPILER_GCC // remove possibility of aliases const F32 inverse = 1.0f - factor; - const F32 from0 = from[0], from1 = from[1], from2 = from[2]; - const F32 to0 = to[0], to1 = to[1], to2 = to[2]; + const F32 from0 = from[0], from1 = from[1], from2 = from[2]; + const F32 to0 = to[0], to1 = to[1], to2 = to[2]; result[0] = from0 * inverse + to0 * factor; result[1] = from1 * inverse + to1 * factor; @@ -182,8 +182,8 @@ static void m_point3D_interpolate_C(const F64 *from, const F64 *to, F64 factor, #ifdef TORQUE_COMPILER_GCC // remove possibility of aliases const F64 inverse = 1.0f - factor; - const F64 from0 = from[0], from1 = from[1], from2 = from[2]; - const F64 to0 = to[0], to1 = to[1], to2 = to[2]; + const F64 from0 = from[0], from1 = from[1], from2 = from[2]; + const F64 to0 = to[0], to1 = to[1], to2 = to[2]; result[0] = from0 * inverse + to0 * factor; result[1] = from1 * inverse + to1 * factor; @@ -436,8 +436,8 @@ static void affineInvertTo(const F32 * m, F32 * out) F32 invDet = 1.0f / (m[idx(0,0)] * d1 + m[idx(0,1)] * d2 + m[idx(0,2)] * d3); - F32 m00 = m[idx(0,0)] * invDet; - F32 m01 = m[idx(0,1)] * invDet; + F32 m00 = m[idx(0,0)] * invDet; + F32 m01 = m[idx(0,1)] * invDet; F32 m02 = m[idx(0,2)] * invDet; F32 * result = out; @@ -950,4 +950,3 @@ void mInstallLibrary_C() m_matF_x_scale_x_planeF = m_matF_x_scale_x_planeF_C; m_matF_x_box3F = m_matF_x_box3F_C; } - diff --git a/Engine/source/math/mMatrix.h b/Engine/source/math/mMatrix.h index 763a62b93..3b160c94c 100644 --- a/Engine/source/math/mMatrix.h +++ b/Engine/source/math/mMatrix.h @@ -50,13 +50,16 @@ #ifndef _CONSOLE_H_ #include "console/console.h" #endif +#ifndef _MATH_BACKEND_H_ 
+#include "math/public/math_backend.h" +#endif #ifndef USE_TEMPLATE_MATRIX /// 4x4 Matrix Class /// /// This runs at F32 precision. - +using namespace math_backend::mat44::dispatch; class MatrixF { friend class MatrixFEngineExport; @@ -128,7 +131,7 @@ public: EulerF toEuler() const; F32 determinant() const { - return m_matF_determinant(*this); + return GetMat44().determinant(*this); } /// Compute the inverse of the matrix. @@ -228,6 +231,8 @@ public: void mul( Point4F& p ) const; ///< M * p -> p (full [4x4] * [1x4]) void mulP( Point3F& p ) const; ///< M * p -> p (assume w = 1.0f) void mulP( const Point3F &p, Point3F *d) const; ///< M * p -> d (assume w = 1.0f) + void batch_mulP(Point3F* points, U32 count) const; + void batch_mulP(const Point3F* points, U32 count, Point3F* out) const; void mulV( VectorF& p ) const; ///< M * v -> v (assume w = 0.0f) void mulV( const VectorF &p, Point3F *d) const; ///< M * v -> d (assume w = 0.0f) @@ -372,71 +377,66 @@ inline MatrixF& MatrixF::identity() inline MatrixF& MatrixF::inverse() { - m_matF_inverse(m); + GetMat44().inverse(m); return (*this); } inline void MatrixF::invertTo( MatrixF *out ) { - m_matF_invert_to(m,*out); + GetMat44().inverse_to(*this, *out); } inline MatrixF& MatrixF::affineInverse() { -// AssertFatal(isAffine() == true, "Error, this matrix is not an affine transform"); - m_matF_affineInverse(m); + GetMat44().affine_inverse(m); return (*this); } inline MatrixF& MatrixF::transpose() { - m_matF_transpose(m); + GetMat44().transpose(m); return (*this); } inline MatrixF& MatrixF::scale(const Point3F& p) { - m_matF_scale(m,p); + GetMat44().scale(m, p); return *this; } inline Point3F MatrixF::getScale() const { Point3F scale; - scale.x = mSqrt(m[0]*m[0] + m[4] * m[4] + m[8] * m[8]); - scale.y = mSqrt(m[1]*m[1] + m[5] * m[5] + m[9] * m[9]); - scale.z = mSqrt(m[2]*m[2] + m[6] * m[6] + m[10] * m[10]); + GetMat44().get_scale(m, scale); return scale; } inline void MatrixF::normalize() { - m_matF_normalize(m); + 
GetMat44().normalize(m); } inline MatrixF& MatrixF::mul( const MatrixF &a ) { // M * a -> M AssertFatal(&a != this, "MatrixF::mul - a.mul(a) is invalid!"); - MatrixF tempThis(*this); - m_matF_x_matF(tempThis, a, *this); + GetMat44().mul_mat44(tempThis, a, *this); + return (*this); } inline MatrixF& MatrixF::mulL( const MatrixF &a ) { // a * M -> M AssertFatal(&a != this, "MatrixF::mulL - a.mul(a) is invalid!"); - MatrixF tempThis(*this); - m_matF_x_matF(a, tempThis, *this); + GetMat44().mul_mat44(a, tempThis, *this); return (*this); } inline MatrixF& MatrixF::mul( const MatrixF &a, const MatrixF &b ) { // a * b -> M AssertFatal((&a != this) && (&b != this), "MatrixF::mul - a.mul(a, b) a.mul(b, a) a.mul(a, a) is invalid!"); - - m_matF_x_matF(a, b, *this); + GetMat44().mul_mat44(a, b, *this); return (*this); } @@ -461,7 +461,7 @@ inline MatrixF& MatrixF::mul(const MatrixF &a, const F32 b) inline void MatrixF::mul( Point4F& p ) const { Point4F temp; - m_matF_x_point4F(*this, &p.x, &temp.x); + GetMat44().mul_float4(*this, p, temp); p = temp; } @@ -469,28 +469,62 @@ inline void MatrixF::mulP( Point3F& p) const { // M * p -> d Point3F d; - m_matF_x_point3F(*this, &p.x, &d.x); + GetMat44().mul_pos3(*this, p, d); p = d; } inline void MatrixF::mulP( const Point3F &p, Point3F *d) const { // M * p -> d - m_matF_x_point3F(*this, &p.x, &d->x); + GetMat44().mul_pos3(*this, p, *d); +} + +inline void MatrixF::batch_mulP(Point3F* points, U32 count) const +{ + // Allocate temporary buffer + Point3F* temp = new Point3F[count]; + + GetMat44().batch_mul_pos3(m, reinterpret_cast(points), count, reinterpret_cast(temp)); + + // Copy the results back to out + for (U32 i = 0; i < count; ++i) + { + points[i] = temp[i]; + } + + // Free temporary buffer + delete[] temp; +} + +inline void MatrixF::batch_mulP(const Point3F* points, U32 count, Point3F* out) const +{ + // Allocate temporary buffer + Point3F* temp = new Point3F[count]; + + GetMat44().batch_mul_pos3(m, reinterpret_cast(points), 
count, reinterpret_cast(temp)); + + // Copy the results back to out + for (U32 i = 0; i < count; ++i) + { + out[i] = temp[i]; + } + + // Free temporary buffer + delete[] temp; } inline void MatrixF::mulV( VectorF& v) const { // M * v -> v VectorF temp; - m_matF_x_vectorF(*this, &v.x, &temp.x); + GetMat44().mul_vec3(*this, v, temp); v = temp; } inline void MatrixF::mulV( const VectorF &v, Point3F *d) const { // M * v -> d - m_matF_x_vectorF(*this, &v.x, &d->x); + GetMat44().mul_vec3(*this, v, *d); } inline void MatrixF::mul(Box3F& b) const @@ -634,14 +668,14 @@ inline MatrixF operator * ( const MatrixF &m1, const MatrixF &m2 ) { // temp = m1 * m2 MatrixF temp; - m_matF_x_matF(m1, m2, temp); + GetMat44().mul_mat44(m1, m2, temp); return temp; } inline MatrixF& MatrixF::operator *= ( const MatrixF &m1 ) { MatrixF tempThis(*this); - m_matF_x_matF(tempThis, m1, *this); + GetMat44().mul_mat44(tempThis, m1, *this); return (*this); } @@ -665,7 +699,9 @@ inline bool MatrixF::isNaN() inline void mTransformPlane(const MatrixF& mat, const Point3F& scale, const PlaneF& plane, PlaneF * result) { - m_matF_x_scale_x_planeF(mat, &scale.x, &plane.x, &result->x); + //m_matF_x_scale_x_planeF(mat, &scale.x, &plane.x, &result->x); + + GetMat44().transform_plane(mat, scale, plane, *result); } #else // !USE_TEMPLATE_MATRIX diff --git a/Engine/source/math/mPoint3.h b/Engine/source/math/mPoint3.h index c1bb72923..895454c05 100644 --- a/Engine/source/math/mPoint3.h +++ b/Engine/source/math/mPoint3.h @@ -29,6 +29,11 @@ #ifndef _MPOINT2_H_ #include "math/mPoint2.h" #endif +#ifndef _MATH_BACKEND_H_ +#include "math/public/math_backend.h" +#endif + +#include //------------------------------------------------------------------------------ /// 3D integer point @@ -97,6 +102,7 @@ public: class Point3D; //------------------------------------------------------------------------------ +using namespace math_backend::float3::dispatch; class Point3F { //-------------------------------------- Public data @@ 
-497,7 +503,8 @@ inline void Point3F::setMax(const Point3F& _test) inline void Point3F::interpolate(const Point3F& _from, const Point3F& _to, F32 _factor) { AssertFatal(_factor >= 0.0f && _factor <= 1.0f, "Out of bound interpolation factor"); - m_point3F_interpolate( _from, _to, _factor, *this); + + GetFloat3().lerp(_from, _to, _factor, *this); } inline void Point3F::zero() @@ -585,31 +592,27 @@ inline void Point3F::neg() inline void Point3F::convolve(const Point3F& c) { - x *= c.x; - y *= c.y; - z *= c.z; + GetFloat3().mul(*this, c, *this); } inline void Point3F::convolveInverse(const Point3F& c) { - x /= c.x; - y /= c.y; - z /= c.z; + GetFloat3().div(*this, c, *this); } inline F32 Point3F::lenSquared() const { - return (x * x) + (y * y) + (z * z); + return GetFloat3().lengthSquared(*this); } inline F32 Point3F::len() const { - return mSqrt(x*x + y*y + z*z); + return GetFloat3().length(*this); } inline void Point3F::normalize() { - m_point3F_normalize(*this); + GetFloat3().normalize(*this); } inline F32 Point3F::magnitudeSafe() const @@ -626,18 +629,13 @@ inline F32 Point3F::magnitudeSafe() const inline void Point3F::normalizeSafe() { - F32 vmag = magnitudeSafe(); - - if( vmag > POINT_EPSILON ) - { - *this *= F32(1.0 / vmag); - } + GetFloat3().normalize(*this); } inline void Point3F::normalize(F32 val) { - m_point3F_normalize_f(*this, val); + GetFloat3().normalize_mag(*this, val); } inline bool Point3F::operator==(const Point3F& _test) const @@ -652,52 +650,49 @@ inline bool Point3F::operator!=(const Point3F& _test) const inline Point3F Point3F::operator+(const Point3F& _add) const { - return Point3F(x + _add.x, y + _add.y, z + _add.z); + Point3F temp; + GetFloat3().add(*this, _add, temp); + return temp; } inline Point3F Point3F::operator-(const Point3F& _rSub) const { - return Point3F(x - _rSub.x, y - _rSub.y, z - _rSub.z); + Point3F temp; + GetFloat3().sub(*this, _rSub, temp); + return temp; } inline Point3F& Point3F::operator+=(const Point3F& _add) { - x += 
_add.x; - y += _add.y; - z += _add.z; - + GetFloat3().add(*this, _add, *this); return *this; } inline Point3F& Point3F::operator-=(const Point3F& _rSub) { - x -= _rSub.x; - y -= _rSub.y; - z -= _rSub.z; - + GetFloat3().sub(*this, _rSub, *this); return *this; } inline Point3F Point3F::operator*(F32 _mul) const { - return Point3F(x * _mul, y * _mul, z * _mul); + Point3F temp; + GetFloat3().mul_scalar(*this, _mul, temp); + return temp; } inline Point3F Point3F::operator/(F32 _div) const { AssertFatal(_div != 0.0f, "Error, div by zero attempted"); - F32 inv = 1.0f / _div; - - return Point3F(x * inv, y * inv, z * inv); + Point3F temp; + GetFloat3().div_scalar(*this, _div, temp); + return temp; } inline Point3F& Point3F::operator*=(F32 _mul) { - x *= _mul; - y *= _mul; - z *= _mul; - + GetFloat3().mul_scalar(*this, _mul, *this); return *this; } @@ -705,39 +700,35 @@ inline Point3F& Point3F::operator/=(F32 _div) { AssertFatal(_div != 0.0f, "Error, div by zero attempted"); - F32 inv = 1.0f / _div; - x *= inv; - y *= inv; - z *= inv; - + GetFloat3().div_scalar(*this, _div, *this); return *this; } inline Point3F Point3F::operator*(const Point3F &_vec) const { - return Point3F(x * _vec.x, y * _vec.y, z * _vec.z); + Point3F temp; + GetFloat3().mul(*this, _vec, temp); + return temp; } inline Point3F& Point3F::operator*=(const Point3F &_vec) { - x *= _vec.x; - y *= _vec.y; - z *= _vec.z; + GetFloat3().mul(*this, _vec, *this); return *this; } inline Point3F Point3F::operator/(const Point3F &_vec) const { AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted"); - return Point3F(x / _vec.x, y / _vec.y, z / _vec.z); + Point3F temp; + GetFloat3().div(*this, _vec, temp); + return temp; } inline Point3F& Point3F::operator/=(const Point3F &_vec) { AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted"); - x /= _vec.x; - y /= _vec.y; - z /= _vec.z; + GetFloat3().div(*this, _vec, *this); return *this; } @@ 
-989,7 +980,9 @@ inline Point3I operator*(S32 mul, const Point3I& multiplicand) inline Point3F operator*(F32 mul, const Point3F& multiplicand) { - return multiplicand * mul; + Point3F temp; + GetFloat3().mul_scalar(multiplicand, mul, temp); + return temp; } inline Point3D operator*(F64 mul, const Point3D& multiplicand) @@ -999,7 +992,7 @@ inline Point3D operator*(F64 mul, const Point3D& multiplicand) inline F32 mDot(const Point3F &p1, const Point3F &p2) { - return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z); + return GetFloat3().dot(p1, p2); } inline F64 mDot(const Point3D &p1, const Point3D &p2) @@ -1009,9 +1002,7 @@ inline F64 mDot(const Point3D &p1, const Point3D &p2) inline void mCross(const Point3F &a, const Point3F &b, Point3F *res) { - res->x = (a.y * b.z) - (a.z * b.y); - res->y = (a.z * b.x) - (a.x * b.z); - res->z = (a.x * b.y) - (a.y * b.x); + GetFloat3().cross(a, b, *res); } inline void mCross(const Point3D &a, const Point3D &b, Point3D *res) @@ -1024,7 +1015,7 @@ inline void mCross(const Point3D &a, const Point3D &b, Point3D *res) inline Point3F mCross(const Point3F &a, const Point3F &b) { Point3F r; - mCross( a, b, &r ); + GetFloat3().cross(a, b, r); return r; } diff --git a/Engine/source/math/mPoint4.h b/Engine/source/math/mPoint4.h index 8ae173009..4befd326b 100644 --- a/Engine/source/math/mPoint4.h +++ b/Engine/source/math/mPoint4.h @@ -26,10 +26,12 @@ #ifndef _MMATHFN_H_ #include "math/mMathFn.h" #endif - #ifndef _MPOINT3_H_ #include "math/mPoint3.h" #endif +#ifndef _MATH_BACKEND_H_ +#include "math/public/math_backend.h" +#endif //------------------------------------------------------------------------------ @@ -61,6 +63,8 @@ class Point4I /// Uses F32 internally. /// /// Useful for representing quaternions and other 4d beasties. 
+using namespace math_backend::float4::dispatch; + class Point4F { //-------------------------------------- Public data @@ -152,15 +156,12 @@ inline void Point4F::set(F32 _x, F32 _y, F32 _z, F32 _w) inline F32 Point4F::len() const { - return mSqrt(x*x + y*y + z*z + w*w); + return GetFloat4().length(*this); } inline void Point4F::interpolate(const Point4F& _from, const Point4F& _to, F32 _factor) { - x = (_from.x * (1.0f - _factor)) + (_to.x * _factor); - y = (_from.y * (1.0f - _factor)) + (_to.y * _factor); - z = (_from.z * (1.0f - _factor)) + (_to.z * _factor); - w = (_from.w * (1.0f - _factor)) + (_to.w * _factor); + GetFloat4().lerp(_from, _to, _factor, *this); } inline void Point4F::zero() @@ -193,55 +194,55 @@ inline Point4F& Point4F::operator/=(F32 scalar) if (mIsZero(scalar)) return *this; - F32 denom = 1 / scalar; - - x *= denom; - y *= denom; - z *= denom; - w *= denom; + GetFloat4().div_scalar(*this, scalar, *this); return *this; } inline Point4F Point4F::operator+(const Point4F& _add) const { - return Point4F( x + _add.x, y + _add.y, z + _add.z, w + _add.w ); + Point4F res; + GetFloat4().add(*this, _add, res); + return res; } inline Point4F& Point4F::operator+=(const Point4F& _add) { - x += _add.x; - y += _add.y; - z += _add.z; - w += _add.w; - + GetFloat4().add(*this, _add, *this); return *this; } inline Point4F Point4F::operator-(const Point4F& _rSub) const { - return Point4F( x - _rSub.x, y - _rSub.y, z - _rSub.z, w - _rSub.w ); + Point4F res; + GetFloat4().sub(*this, _rSub, res); + return res; } inline Point4F Point4F::operator*(const Point4F &_vec) const { - return Point4F(x * _vec.x, y * _vec.y, z * _vec.z, w * _vec.w); + Point4F res; + GetFloat4().mul(*this, _vec, res); + return res; } inline Point4F Point4F::operator*(F32 _mul) const { - return Point4F(x * _mul, y * _mul, z * _mul, w * _mul); + Point4F res; + GetFloat4().mul_scalar(*this, _mul, res); + return res; } inline Point4F Point4F::operator /(F32 t) const { - F32 f = 1.0f / t; - return 
Point4F( x * f, y * f, z * f, w * f ); + Point4F res; + GetFloat4().div_scalar(*this, t, res); + return res; } inline F32 mDot(const Point4F &p1, const Point4F &p2) { - return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z + p1.w*p2.w); + return GetFloat4().dot(p1, p2); } //------------------------------------------------------------------------------ diff --git a/Engine/source/math/public/float3_dispatch.h b/Engine/source/math/public/float3_dispatch.h new file mode 100644 index 000000000..e4279cb84 --- /dev/null +++ b/Engine/source/math/public/float3_dispatch.h @@ -0,0 +1,39 @@ +#pragma once +#ifndef _FLOAT3_DISPATCH_H_ +#define _FLOAT3_DISPATCH_H_ + + +#include + +namespace math_backend::float3::dispatch +{ + struct Float3Funcs + { + void (*add)(const float*, const float*, float*) = nullptr; + void (*sub)(const float*, const float*, float*) = nullptr; + void (*mul)(const float*, const float*, float*) = nullptr; + void (*mul_scalar)(const float*, float, float*) = nullptr; + void (*div)(const float*, const float*, float*) = nullptr; + void (*div_scalar)(const float*, float, float*) = nullptr; + float (*dot)(const float*, const float*) = nullptr; + float (*length)(const float*) = nullptr; + float (*lengthSquared)(const float*) = nullptr; + void (*normalize)(float*) = nullptr; + void (*normalize_mag)(float*, float) = nullptr; + void (*lerp)(const float*, const float*, float, float*) = nullptr; + void (*cross)(const float*, const float*, float*) = nullptr; + }; + + // Global dispatch table + extern Float3Funcs gFloat3; + + // Backend installers (defined in ISA libraries) + void install_scalar(); + void install_sse2(); + void install_sse41(); + void install_avx(); + void install_avx2(); + void install_neon(); +} + +#endif // !_FLOAT4_DISPATCH_H_ diff --git a/Engine/source/math/public/float4_dispatch.h b/Engine/source/math/public/float4_dispatch.h new file mode 100644 index 000000000..6f26114ce --- /dev/null +++ b/Engine/source/math/public/float4_dispatch.h @@ -0,0 +1,39 @@ 
+#pragma once +#ifndef _FLOAT4_DISPATCH_H_ +#define _FLOAT4_DISPATCH_H_ + + +#include + +namespace math_backend::float4::dispatch +{ + struct Float4Funcs + { + void (*add)(const float*, const float*, float*) = nullptr; + void (*sub)(const float*, const float*, float*) = nullptr; + void (*mul)(const float*, const float*, float*) = nullptr; + void (*mul_scalar)(const float*, float, float*) = nullptr; + void (*div)(const float*, const float*, float*) = nullptr; + void (*div_scalar)(const float*, float, float*) = nullptr; + float (*dot)(const float*, const float*) = nullptr; + float (*length)(const float*) = nullptr; + float (*lengthSquared)(const float*) = nullptr; + void (*normalize)(float*) = nullptr; + void (*normalize_mag)(float*, float) = nullptr; + void (*lerp)(const float*, const float*, float, float*) = nullptr; + void (*cross)(const float*, const float*, float*) = nullptr; + }; + + // Global dispatch table + extern Float4Funcs gFloat4; + + // Backend installers (defined in ISA libraries) + void install_scalar(); + void install_sse2(); + void install_sse41(); + void install_avx(); + void install_avx2(); + void install_neon(); +} + +#endif // !_FLOAT4_DISPATCH_H_ diff --git a/Engine/source/math/public/mat44_dispatch.h b/Engine/source/math/public/mat44_dispatch.h new file mode 100644 index 000000000..b49ef9b84 --- /dev/null +++ b/Engine/source/math/public/mat44_dispatch.h @@ -0,0 +1,39 @@ +#pragma once +#ifndef _MAT44_DISPATCH_H_ +#define _MAT44_DISPATCH_H_ + + +namespace math_backend::mat44::dispatch +{ + struct Mat44Funcs + { + void (*transpose)(float*) = nullptr; + void (*inverse)(float*) = nullptr; + void (*inverse_to)(const float*, float*) = nullptr; + void (*affine_inverse)(float*) = nullptr; + void (*normalize)(float*) = nullptr; + void (*mul_mat44)(const float* a, const float* b, float* r) = nullptr; + void (*mul_pos3)(const float* a, const float* b, float* r) = nullptr; + void (*mul_vec3)(const float* a, const float* b, float* r) = nullptr; + void 
(*mul_float4)(const float* a, const float* b, float* r) = nullptr; + void (*transform_plane)(const float* m, const float* s, const float* p, float* r) = nullptr; + float (*determinant)(const float*) = nullptr; + void (*scale)(float*, const float*) = nullptr; + void (*get_scale)(const float*, float*) = nullptr; + + void (*batch_mul_pos3)(const float* m, const float* pts, int count, float* result_ptrs) = nullptr; + }; + + // Global dispatch table + extern Mat44Funcs gMat44; + + // Backend installers (defined in ISA libraries) + void install_scalar(); + void install_sse2(); + void install_sse41(); + void install_avx(); + void install_avx2(); + void install_neon(); +} + +#endif // !_MAT44_DISPATCH_H_ diff --git a/Engine/source/math/public/math_backend.cpp b/Engine/source/math/public/math_backend.cpp new file mode 100644 index 000000000..d110650ac --- /dev/null +++ b/Engine/source/math/public/math_backend.cpp @@ -0,0 +1,103 @@ +#include "math/public/math_backend.h" + +namespace math_backend::float4::dispatch +{ + // Single definition of the global dispatch table + Float4Funcs gFloat4{}; +} + +namespace math_backend::float3::dispatch +{ + // Single definition of the global dispatch table + Float3Funcs gFloat3{}; +} + +namespace math_backend::mat44::dispatch +{ + Mat44Funcs gMat44{}; +} + +namespace math_backend +{ + // Use an anonymous namespace for the static initializer + namespace { + struct ScalarInitializer + { + ScalarInitializer() + { + // Install scalar defaults immediately for all types + float4::dispatch::install_scalar(); + float3::dispatch::install_scalar(); + mat44::dispatch::install_scalar(); + } + }; + + // Static instance ensures constructor runs before main() + ScalarInitializer g_scalarInitializer; + } +} + + +math_backend::backend math_backend::choose_backend(U32 cpu_flags) +{ +#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) + + if (cpu_flags & CPU_PROP_AVX2) return backend::avx2; + if (cpu_flags & CPU_PROP_AVX) return backend::avx; + if 
(cpu_flags & CPU_PROP_SSE4_1) return backend::sse41; + if (cpu_flags & CPU_PROP_SSE2) return backend::sse2; + +#elif defined(__aarch64__) || defined(__ARM_NEON) + + if (cpu_flags & CPU_PROP_NEON) return backend::neon; + +#endif + return backend::scalar; +} + +void math_backend::install_from_cpu_flags(uint32_t cpu_flags) +{ + { + g_backend = choose_backend(cpu_flags); + + switch (g_backend) + { +#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) + case backend::avx2: + float4::dispatch::install_avx2(); + float3::dispatch::install_avx2(); + mat44::dispatch::install_avx2(); + break; + + case backend::avx: + float4::dispatch::install_avx(); + float3::dispatch::install_avx(); + mat44::dispatch::install_avx(); + break; + + case backend::sse41: + float4::dispatch::install_sse41(); + float3::dispatch::install_sse41(); + mat44::dispatch::install_sse41(); + break; + + case backend::sse2: + float4::dispatch::install_sse2(); + float3::dispatch::install_sse2(); + mat44::dispatch::install_sse2(); + break; +#elif defined(__aarch64__) || defined(__ARM_NEON) + case backend::neon: + float4::dispatch::install_neon(); + float3::dispatch::install_neon(); + mat44::dispatch::install_neon(); + break; +#endif + default: + float4::dispatch::install_scalar(); + float3::dispatch::install_scalar(); + mat44::dispatch::install_scalar(); + break; + } + } +} diff --git a/Engine/source/math/public/math_backend.h b/Engine/source/math/public/math_backend.h new file mode 100644 index 000000000..9c5575823 --- /dev/null +++ b/Engine/source/math/public/math_backend.h @@ -0,0 +1,95 @@ +#pragma once +#ifndef _MATH_BACKEND_H_ +#define _MATH_BACKEND_H_ + +#ifndef _MCONSTANTS_H_ +#include "math/mConstants.h" +#endif +#ifndef _PLATFORMASSERT_H_ +#include "platform/platformAssert.h" +#endif +#ifndef _FLOAT4_DISPATCH_H_ +#include "math/public/float4_dispatch.h" +#endif +#ifndef _FLOAT3_DISPATCH_H_ +#include "math/public/float3_dispatch.h" +#endif +#ifndef _MAT44_DISPATCH_H_ +#include 
"math/public/mat44_dispatch.h" +#endif + +namespace math_backend +{ + enum class backend + { + scalar, + sse2, + sse41, + avx, + avx2, + neon + }; + + static backend g_backend = backend::scalar; + backend choose_backend(U32 cpu_flags); + void install_from_cpu_flags(uint32_t cpu_flags); +} + +#include + +namespace math_backend::float4::dispatch +{ + //-------------------------------------------------- + // Thread-safe getter, ensures scalar installed if empty + //-------------------------------------------------- + inline Float4Funcs& GetFloat4() + { + if (gFloat4.mul == nullptr) + { + static std::once_flag once; + std::call_once(once, []{ + install_scalar(); + }); + } + return gFloat4; + } +} + +namespace math_backend::float3::dispatch +{ + //-------------------------------------------------- + // Thread-safe getter, ensures scalar installed if empty + //-------------------------------------------------- + inline Float3Funcs& GetFloat3() + { + if (gFloat3.mul == nullptr) + { + static std::once_flag once; + std::call_once(once, []{ + install_scalar(); + }); + } + return gFloat3; + } +} + + +namespace math_backend::mat44::dispatch +{ + //-------------------------------------------------- + // Thread-safe getter, ensures scalar installed if empty + //-------------------------------------------------- + inline Mat44Funcs& GetMat44() + { + if (gMat44.mul_mat44 == nullptr) + { + static std::once_flag once; + std::call_once(once, []{ + install_scalar(); + }); + } + return gMat44; + } +} + +#endif // !_MATH_BACKEND_H_ diff --git a/Engine/source/math/util/frustum.cpp b/Engine/source/math/util/frustum.cpp index 632d7cded..eff17a0c1 100644 --- a/Engine/source/math/util/frustum.cpp +++ b/Engine/source/math/util/frustum.cpp @@ -381,8 +381,10 @@ void FrustumData::_update() const // Transform the points into the desired culling space. 
- for( U32 i = 0; i < mPoints.size(); ++ i ) - mTransform.mulP( mPoints[ i ] ); + /*for( U32 i = 0; i < mPoints.size(); ++ i ) + mTransform.mulP( mPoints[ i ] );*/ + + mTransform.batch_mulP(mPoints.address(), mPoints.size()); // Update the axis aligned bounding box from // the newly transformed points. diff --git a/Engine/source/platform/platform.h b/Engine/source/platform/platform.h index acb929a9f..0d377f23f 100644 --- a/Engine/source/platform/platform.h +++ b/Engine/source/platform/platform.h @@ -75,10 +75,12 @@ enum ProcessorProperties CPU_PROP_SSE4_1 = (1<<7), ///< Supports SSE4_1 instruction set extension. CPU_PROP_SSE4_2 = (1<<8), ///< Supports SSE4_2 instruction set extension. CPU_PROP_AVX = (1<<9), ///< Supports AVX256 instruction set extension. - CPU_PROP_MP = (1<<10), ///< This is a multi-processor system. - CPU_PROP_LE = (1<<11), ///< This processor is LITTLE ENDIAN. - CPU_PROP_64bit = (1<<12), ///< This processor is 64-bit capable - CPU_PROP_NEON = (1<<13), ///< Supports the Arm Neon instruction set extension. + CPU_PROP_AVX2 = (1<<10), ///< Supports AVX256 instruction set extension. + CPU_PROP_AVX512 = (1<<11), ///< Supports AVX512 instruction set extension. + CPU_PROP_MP = (1<<12), ///< This is a multi-processor system. + CPU_PROP_LE = (1<<13), ///< This processor is LITTLE ENDIAN. + CPU_PROP_64bit = (1<<14), ///< This processor is 64-bit capable + CPU_PROP_NEON = (1<<15), ///< Supports the Arm Neon instruction set extension. }; /// Processor info manager. 
diff --git a/Engine/source/platformMac/macMath.mm b/Engine/source/platformMac/macMath.mm index 4feefb277..992e6ed53 100644 --- a/Engine/source/platformMac/macMath.mm +++ b/Engine/source/platformMac/macMath.mm @@ -25,6 +25,7 @@ #import "math/mMath.h" #import "core/strings/stringFunctions.h" #include "console/engineAPI.h" +#include "math/public/math_backend.h" extern void mInstallLibrary_C(); @@ -107,14 +108,10 @@ void Math::init(U32 properties) Con::printf("Math Init:"); Con::printf(" Installing Standard C extensions"); - mInstallLibrary_C(); + mInstallLibrary_C(); - #ifdef TORQUE_CPU_X86 - if( properties & CPU_PROP_SSE ) - { - Con::printf( " Installing SSE extensions" ); - } - #endif + Con::printf( " Installing ISA extensions" ); + math_backend::install_from_cpu_flags(properties); Con::printf(" "); } diff --git a/Engine/source/platformPOSIX/POSIXMath.cpp b/Engine/source/platformPOSIX/POSIXMath.cpp index 8f21329a3..95e05a1b0 100644 --- a/Engine/source/platformPOSIX/POSIXMath.cpp +++ b/Engine/source/platformPOSIX/POSIXMath.cpp @@ -27,6 +27,7 @@ #include "math/mMath.h" #include "core/strings/stringFunctions.h" #include "console/engineAPI.h" +#include "math/public/math_backend.h" extern void mInstallLibrary_C(); extern void mInstallLibrary_ASM(); @@ -90,29 +91,14 @@ void Math::init(U32 properties) Con::printf(" Installing Standard C extensions"); mInstallLibrary_C(); + Con::printf(" Installing ISA extensions"); + math_backend::install_from_cpu_flags(properties); + #if defined(TORQUE_CPU_X32) || defined(TORQUE_CPU_X64) Con::printf(" Installing Assembly extensions"); mInstallLibrary_ASM(); #endif - if (properties & CPU_PROP_FPU) - { - Con::printf(" Installing FPU extensions"); - } - - if (properties & CPU_PROP_MMX) - { - Con::printf(" Installing MMX extensions"); - } - -#if !defined(__MWERKS__) || (__MWERKS__ >= 0x2400) - if (properties & CPU_PROP_SSE) - { - Con::printf(" Installing SSE extensions"); - } -#endif //mwerks>2.4 - - Con::printf(" "); } diff --git 
a/Engine/source/platformWin32/winCPUInfo.cpp b/Engine/source/platformWin32/winCPUInfo.cpp index c8e05f44f..3bd183f44 100644 --- a/Engine/source/platformWin32/winCPUInfo.cpp +++ b/Engine/source/platformWin32/winCPUInfo.cpp @@ -72,13 +72,15 @@ enum CpuFlags BIT_XSAVE_RESTORE = BIT(27), BIT_AVX = BIT(28), + BIT_AVX2 = BIT(5), + BIT_AVX512F = BIT(16) }; static void detectCpuFeatures(Platform::SystemInfo_struct::Processor &processor) { S32 cpuInfo[4]; __cpuid(cpuInfo, 1); - //U32 eax = cpuInfo[0]; // eax + int nIds = cpuInfo[0]; U32 edx = cpuInfo[3]; // edx U32 ecx = cpuInfo[2]; // ecx @@ -90,6 +92,18 @@ static void detectCpuFeatures(Platform::SystemInfo_struct::Processor &processor) processor.properties |= (ecx & BIT_SSE4_1) ? CPU_PROP_SSE4_1 : 0; processor.properties |= (ecx & BIT_SSE4_2) ? CPU_PROP_SSE4_2 : 0; + if (nIds >= 7) + { + int ext[4]; + __cpuidex(ext, 7, 0); + + if (ext[1] & (BIT_AVX2)) // EBX bit 5 + processor.properties |= CPU_PROP_AVX2; + + if (ext[1] & (BIT_AVX512F)) // AVX-512 Foundation + processor.properties |= CPU_PROP_AVX512; + } + // AVX detection requires that xsaverestore is supported if (ecx & BIT_XSAVE_RESTORE && ecx & BIT_AVX) { @@ -176,6 +190,10 @@ void Processor::init() Con::printf(" SSE4.2 detected" ); if (Platform::SystemInfo.processor.properties & CPU_PROP_AVX) Con::printf(" AVX detected"); + if (Platform::SystemInfo.processor.properties & CPU_PROP_AVX2) + Con::printf(" AVX2 detected"); + if (Platform::SystemInfo.processor.properties & CPU_PROP_AVX512) + Con::printf(" AVX512 detected"); if (Platform::SystemInfo.processor.properties & CPU_PROP_MP) Con::printf(" MultiCore CPU detected [%i cores, %i logical]", Platform::SystemInfo.processor.numPhysicalProcessors, Platform::SystemInfo.processor.numLogicalProcessors); diff --git a/Engine/source/platformWin32/winMath.cpp b/Engine/source/platformWin32/winMath.cpp index 44b215301..bccbd0de6 100644 --- a/Engine/source/platformWin32/winMath.cpp +++ b/Engine/source/platformWin32/winMath.cpp @@ 
-25,7 +25,7 @@ #include "console/engineAPI.h" #include "math/mMath.h" - +#include "math/public/math_backend.h" extern void mInstallLibrary_C(); extern void mInstallLibrary_ASM(); @@ -98,24 +98,12 @@ void Math::init(U32 properties) Con::printf(" Installing Standard C extensions"); mInstallLibrary_C(); + Con::printf(" Installing ISA extensions"); + math_backend::install_from_cpu_flags(properties); + Con::printf(" Installing Assembly extensions"); mInstallLibrary_ASM(); - if (properties & CPU_PROP_FPU) - { - Con::printf(" Installing FPU extensions"); - } - - if (properties & CPU_PROP_MMX) - { - Con::printf(" Installing MMX extensions"); - } - - if (properties & CPU_PROP_SSE) - { - Con::printf(" Installing SSE extensions"); - } - Con::printf(" "); } diff --git a/Engine/source/ts/assimp/assimpAppNode.cpp b/Engine/source/ts/assimp/assimpAppNode.cpp index ff2e1ffd5..af448f940 100644 --- a/Engine/source/ts/assimp/assimpAppNode.cpp +++ b/Engine/source/ts/assimp/assimpAppNode.cpp @@ -250,7 +250,7 @@ MatrixF AssimpAppNode::getNodeTransform(F32 time) // Check for inverted node coordinate spaces => can happen when modelers // use the 'mirror' tool in their 3d app. Shows up as negative // transforms in the collada model. 
- if (m_matF_determinant(nodeTransform) < 0.0f) + if (nodeTransform.determinant() < 0.0f) { // Mark this node as inverted so we can mirror mesh geometry, then // de-invert the transform matrix diff --git a/Engine/source/ts/collada/colladaAppMesh.cpp b/Engine/source/ts/collada/colladaAppMesh.cpp index a944e2eef..4c4470654 100644 --- a/Engine/source/ts/collada/colladaAppMesh.cpp +++ b/Engine/source/ts/collada/colladaAppMesh.cpp @@ -1099,7 +1099,7 @@ void ColladaAppMesh::lookupSkinData() // Inverted node coordinate spaces (negative scale factor) are corrected // in ColladaAppNode::getNodeTransform, so need to apply the same operation // here to match - if (m_matF_determinant(invBind) < 0.0f) + if (invBind.determinant() < 0.0f) initialTransforms[iJoint].scale(Point3F(1, 1, -1)); initialTransforms[iJoint].mul(invBind); diff --git a/Engine/source/ts/collada/colladaAppNode.cpp b/Engine/source/ts/collada/colladaAppNode.cpp index 2ffc10fba..0cd8258c8 100644 --- a/Engine/source/ts/collada/colladaAppNode.cpp +++ b/Engine/source/ts/collada/colladaAppNode.cpp @@ -192,7 +192,7 @@ MatrixF ColladaAppNode::getNodeTransform(F32 time) // Check for inverted node coordinate spaces => can happen when modelers // use the 'mirror' tool in their 3d app. Shows up as negative // transforms in the collada model. - if (m_matF_determinant(nodeTransform) < 0.0f) + if (nodeTransform.determinant() < 0.0f) { // Mark this node as inverted so we can mirror mesh geometry, then // de-invert the transform matrix diff --git a/Engine/source/util/fpsTracker.cpp b/Engine/source/util/fpsTracker.cpp index 021d4a4f5..830000bc8 100644 --- a/Engine/source/util/fpsTracker.cpp +++ b/Engine/source/util/fpsTracker.cpp @@ -36,6 +36,8 @@ void FPSTracker::reset() { fpsNext = (F32)Platform::getRealMilliseconds()/1000.0f + mUpdateInterval; + fpsAccumTime = 0.0f; + fpsAccumVirtualTime = 0.0f; fpsRealLast = 0.0f; fpsReal = 0.0f; fpsRealMin = 0.000001f; // Avoid division by zero. 
@@ -51,42 +53,60 @@ void FPSTracker::update() F32 realSeconds = (F32)Platform::getRealMilliseconds()/1000.0f; F32 virtualSeconds = (F32)Platform::getVirtualMilliseconds()/1000.0f; - fpsFrames++; - if (fpsFrames > 1) + if (fpsRealLast == 0.0f) { - fpsReal = fpsReal*(1.0-alpha) + (realSeconds-fpsRealLast)*alpha; - fpsVirtual = fpsVirtual*(1.0-alpha) + (virtualSeconds-fpsVirtualLast)*alpha; - - if( fpsFrames > 10 ) // Wait a few frames before updating these. - { - // Update min/max. This is a bit counter-intuitive, as the comparisons are - // inversed because these are all one-over-x values. - - if( fpsReal > fpsRealMin ) - fpsRealMin = fpsReal; - if( fpsReal < fpsRealMax ) - fpsRealMax = fpsReal; - } + fpsRealLast = realSeconds; + fpsVirtualLast = virtualSeconds; + return; } - fpsRealLast = realSeconds; + F32 frameTimeReal = realSeconds - fpsRealLast; + F32 frameTimeVirtual = virtualSeconds - fpsVirtualLast; + + fpsRealLast = realSeconds; fpsVirtualLast = virtualSeconds; - // update variables every few frames - F32 update = fpsRealLast - fpsNext; - if (update > 0.5f) - { - F32 delta = realSeconds - fpsNext; - Con::setVariable( "fps::frameDelta",avar("%g", delta)); - Con::setVariable( "fps::real", avar( "%4.1f", 1.0f / fpsReal ) ); - Con::setVariable( "fps::realMin", avar( "%4.1f", 1.0f / fpsRealMin ) ); - Con::setVariable( "fps::realMax", avar( "%4.1f", 1.0f / fpsRealMax ) ); - Con::setVariable( "fps::virtual", avar( "%4.1f", 1.0f / fpsVirtual ) ); + // Accumulate for windowed FPS calculation + fpsFrames++; + fpsAccumTime += frameTimeReal; + fpsAccumVirtualTime += frameTimeVirtual; - if (update > mUpdateInterval) - fpsNext = fpsRealLast + mUpdateInterval; - else - fpsNext += mUpdateInterval; + // Only update console values at interval + if (realSeconds >= fpsNext) + { + fpsReal = 0.0f; + fpsVirtual = 0.0f; + + if (fpsAccumTime > 0.0f) + fpsReal = fpsFrames / fpsAccumTime; + + if (fpsAccumVirtualTime > 0.0f) + fpsVirtual = fpsFrames / fpsAccumVirtualTime; + + // 
Update min/max correctly (these are FPS now) + if (fpsReal > 0.0f) + { + if (fpsReal < fpsRealMin) + fpsRealMin = fpsReal; + + if (fpsReal > fpsRealMax) + fpsRealMax = fpsReal; + } + + // frameDelta = actual accumulated real time over window + Con::setVariable("fps::frameTimeMs", avar("%4.3f", (fpsAccumTime / fpsFrames) * 1000.0f)); + Con::setVariable("fps::frameDelta", avar("%g", fpsAccumTime)); + Con::setVariable("fps::real", avar("%4.1f", fpsReal)); + Con::setVariable("fps::realMin", avar("%4.1f", fpsRealMin)); + Con::setVariable("fps::realMax", avar("%4.1f", fpsRealMax)); + Con::setVariable("fps::virtual", avar("%4.1f", fpsVirtual)); + + // Reset window + fpsFrames = 0; + fpsAccumTime = 0.0f; + fpsAccumVirtualTime = 0.0f; + + fpsNext = realSeconds + mUpdateInterval; } } diff --git a/Engine/source/util/fpsTracker.h b/Engine/source/util/fpsTracker.h index e5973ae68..a29fbbecf 100644 --- a/Engine/source/util/fpsTracker.h +++ b/Engine/source/util/fpsTracker.h @@ -27,6 +27,8 @@ struct FPSTracker { + F32 fpsAccumTime; + F32 fpsAccumVirtualTime; F32 fpsRealLast; F32 fpsReal; F32 fpsRealMin; @@ -48,4 +50,4 @@ struct FPSTracker extern FPSTracker gFPS; -#endif \ No newline at end of file +#endif diff --git a/Tools/CMake/torque_macros.cmake b/Tools/CMake/torque_macros.cmake index 48bb6896e..fe0cec4c6 100644 --- a/Tools/CMake/torque_macros.cmake +++ b/Tools/CMake/torque_macros.cmake @@ -133,4 +133,54 @@ macro(addFramework framework) find_library(_${framework}_FRAMEWORK_PATH ${framework} PATHS /System/Library/Frameworks /Library/Frameworks) set(TORQUE_LINK_FRAMEWORKS ${TORQUE_LINK_FRAMEWORKS} "${_${framework}_FRAMEWORK_PATH}") endif() -endmacro() \ No newline at end of file +endmacro() + +function(add_math_backend name compile_defs) + file(GLOB_RECURSE SRC CONFIGURE_DEPENDS "math/isa/${name}/*.cpp" "math/isa/${name}/*.h") + + if(NOT SRC) + return() + endif() + + file(GLOB_RECURSE INL CONFIGURE_DEPENDS "math/impl/*.inl") + + add_library(math_${name} OBJECT ${SRC} ${INL}) + + 
   message(STATUS "adding math library for isa ${name}")
   target_include_directories(math_${name} PUBLIC
      "math/public"
      "math/impl"
      "math/isa/${name}"
   )

   target_compile_definitions(math_${name} PRIVATE ${compile_defs})

   # ISA flags
   if(MSVC)
      if(name STREQUAL "sse2")
         target_compile_options(math_${name} PRIVATE /arch:SSE2)
      elseif(name STREQUAL "sse41")
         target_compile_options(math_${name} PRIVATE /arch:SSE2)
      elseif(name STREQUAL "avx")
         target_compile_options(math_${name} PRIVATE /arch:AVX)
      elseif(name STREQUAL "avx2")
         target_compile_options(math_${name} PRIVATE /arch:AVX2)
      endif()
   else()
      if(name STREQUAL "sse2")
         target_compile_options(math_${name} PRIVATE -msse2)
      elseif(name STREQUAL "sse41")
         target_compile_options(math_${name} PRIVATE -msse4.1)
      elseif(name STREQUAL "avx")
         target_compile_options(math_${name} PRIVATE -mavx)
      elseif(name STREQUAL "avx2")
         target_compile_options(math_${name} PRIVATE -mavx2 -mfma)
      elseif(name STREQUAL "neon")
         target_compile_options(math_${name} PRIVATE -march=armv8-a)
      endif()
   endif()

   # Inject objects into engine
   target_sources(${TORQUE_APP_NAME} PRIVATE $<TARGET_OBJECTS:math_${name}>)
   set_target_properties(math_${name} PROPERTIES FOLDER "Libraries/Math")
endfunction()
\ No newline at end of file