diff --git a/Engine/source/CMakeLists.txt b/Engine/source/CMakeLists.txt index 746bae1e7..d5f7aeaad 100644 --- a/Engine/source/CMakeLists.txt +++ b/Engine/source/CMakeLists.txt @@ -503,12 +503,17 @@ set(IS_ARM FALSE) if(ARCH MATCHES "x86_64|amd64|i[3-6]86") set(IS_X86 TRUE) -elseif(ARCH MATCHES "arm64|aarch64") +endif() + +if(ARCH MATCHES "arm64|aarch64|arm") set(IS_ARM TRUE) endif() # always available add_math_backend(scalar MATH_SIMD_SCALAR) +message(STATUS "Processor: ${CMAKE_SYSTEM_PROCESSOR}") +message(STATUS "IS_X86=${IS_X86}") +message(STATUS "IS_ARM=${IS_ARM}") # x86 family if(IS_X86) diff --git a/Engine/source/gfx/gfxDrawUtil.cpp b/Engine/source/gfx/gfxDrawUtil.cpp index 16b886a55..3dce4b3af 100644 --- a/Engine/source/gfx/gfxDrawUtil.cpp +++ b/Engine/source/gfx/gfxDrawUtil.cpp @@ -853,7 +853,11 @@ void GFXDrawUtil::drawThickLine(F32 x1, F32 y1, F32 z1, F32 x2, F32 y2, F32 z2, // 3D World Draw Misc //----------------------------------------------------------------------------- -static SphereMesh gSphere; +SphereMesh& getSphere() +{ + static SphereMesh instance; + return instance; +} void GFXDrawUtil::drawSphere( const GFXStateBlockDesc &desc, F32 radius, const Point3F &pos, const ColorI &color, bool drawTop, bool drawBottom, const MatrixF *xfm ) { @@ -868,7 +872,7 @@ void GFXDrawUtil::drawSphere( const GFXStateBlockDesc &desc, F32 radius, const P GFX->pushWorldMatrix(); GFX->multWorld(mat); - const SphereMesh::TriangleMesh * sphereMesh = gSphere.getMesh(2); + const SphereMesh::TriangleMesh * sphereMesh = getSphere().getMesh(2); S32 numPoly = sphereMesh->numPoly; S32 totalPoly = 0; GFXVertexBufferHandle verts(mDevice, numPoly*3, GFXBufferTypeVolatile); diff --git a/Engine/source/math/impl/float3_impl.inl b/Engine/source/math/impl/float3_impl.inl new file mode 100644 index 000000000..87b325faf --- /dev/null +++ b/Engine/source/math/impl/float3_impl.inl @@ -0,0 +1,123 @@ +#pragma once +#include // for sqrtf, etc. +#include "../mConstants.h" + +// Safely loads a float3 -> simd 4 lane backend +namespace math_backend::float3 +{ + //---------------------------------------------------------- + // Add two float4 vectors: r = a + b + inline void float3_add_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vr = v_add(va, vb); + v_store3(r, vr); + } + + // Subtract: r = a - b + inline void float3_sub_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vr = v_sub(va, vb); + v_store3(r, vr); + } + + // Multiply element-wise: r = a * b + inline void float3_mul_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vr = v_mul(va, vb); + v_store3(r, vr); + } + + // Multiply by scalar: r = a * s + inline void float3_mul_scalar_impl(const float* a, float s, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vs = v_set1(s); + f32x4 vr = v_mul(va, vs); + v_store3(r, vr); + } + + // Divide element-wise: r = a / b + inline void float3_div_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vr = v_div(va, vb); + v_store3(r, vr); + } + + // Divide by scalar: r = a / s + inline void float3_div_scalar_impl(const float* a, float s, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vs = v_set1(s); + f32x4 vr = v_div(va, vs); + v_store3(r, vr); + } + + // Dot product: returns scalar + inline float float3_dot_impl(const float* a, const float* b) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vdot = v_dot3(va, vb); + return v_extract0(vdot); // first lane is the sum of 3 elements + } + + // Length squared + inline float float3_length_squared_impl(const float* a) + { + return float3_dot_impl(a, a); + } + + // Length + inline float float3_length_impl(const float* a) + { + return std::sqrt(float3_length_squared_impl(a)); + } + + // Normalize in-place + inline void float3_normalize_impl(float* a) + { + f32x4 va = v_load3_vec(a); + f32x4 invLen = v_rsqrt_nr(v_dot3(va, va)); // fully abstracted + f32x4 vnorm = v_mul(va, invLen); + v_store3(a, vnorm); + } + + // Normalize with magnitude: r = normalize(a) * r + inline void float3_normalize_mag_impl(float* a, float r) + { + f32x4 va = v_load3_vec(a); + + // invLen = r / sqrt(dot(a,a)) = r * rsqrt(dot(a,a)) + f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot3(va, va))); + + f32x4 vnorm = v_mul(va, invLen); + v_store(a, vnorm); + } + + // Linear interpolation: r = from + (to - from) * f + inline void float3_lerp_impl(const float* from, const float* to, float f, float* r) + { + f32x4 vfrom = v_load3_vec(from); + f32x4 vto = v_load3_vec(to); + f32x4 vf = v_set1(f); + f32x4 vr = v_add(vfrom, v_mul(vf, v_sub(vto, vfrom))); + v_store3(r, vr); + } + + inline void float3_cross_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3_vec(a); + f32x4 vb = v_load3_vec(b); + f32x4 vcross = v_cross(va, vb); + v_store3(r, vcross); + } + +} diff --git a/Engine/source/math/impl/float4_c.cpp b/Engine/source/math/impl/float4_c.cpp deleted file mode 100644 index b3d6559f9..000000000 --- a/Engine/source/math/impl/float4_c.cpp +++ /dev/null @@ -1,60 +0,0 @@ -#include "math/public/float4_dispatch.h" -#include "math/mConstants.h" -#include - -namespace math_backend::float4::dispatch -{ - void install_scalar() - { - gFloat4.add = [](const float* a, const float* b, float* r) { - for (int i = 0; i < 4; i++) r[i] = a[i] + b[i]; - }; - - gFloat4.sub = [](const float* a, const float* b, float* r) { - for (int i = 0; i < 4; i++) r[i] = a[i] - b[i]; - }; - - gFloat4.mul = [](const float* a, const float* b, float* r) { - for (int i = 0; i < 4; i++) r[i] = a[i] * b[i]; - }; - - gFloat4.mul_scalar = [](const float* a, float s, float* r) { - for (int i = 0; i < 4; i++) r[i] = a[i] * s; - }; - - gFloat4.div = [](const float* a, const float* b, float* r) { - for (int i = 0; i < 4; i++) r[i] = a[i] / b[i]; - }; - - gFloat4.div_scalar = [](const float* a, float s, float* r) { - for (int i = 0; i < 4; i++) r[i] = a[i] / s; - }; - - gFloat4.dot = [](const float* a, const float* b) { - float sum = 0.f; - for (int i = 0; i < 4; i++) sum += a[i] * b[i]; - return sum; - }; - - gFloat4.length = [](const float* a) { - float sum = 0.f; - for (int i = 0; i < 4; i++) sum += a[i] * a[i]; - return sqrtf(sum); - }; - - gFloat4.lengthSquared = [](const float* a) { - float sum = 0.f; - for (int i = 0; i < 4; i++) sum += a[i] * a[i]; - return (sum); - }; - - gFloat4.normalize = [](float* a) { - float len = gFloat4.length(a); - if (len > POINT_EPSILON) for (int i = 0; i < 4; i++) a[i] /= len; - }; - - gFloat4.lerp = [](const float* from, const float* to, float f, float* r) { - for (int i = 0; i < 4; i++) r[i] = from[i] + (to[i] - from[i]) * f; - }; - } -} diff --git a/Engine/source/math/impl/float4_impl.inl b/Engine/source/math/impl/float4_impl.inl index cb61ed4fc..9371ae7c3 100644 --- a/Engine/source/math/impl/float4_impl.inl +++ b/Engine/source/math/impl/float4_impl.inl @@ -65,8 +65,8 @@ namespace math_backend::float4 { f32x4 va = v_load(a); f32x4 vb = v_load(b); - f32x4 vmul = v_mul(va, vb); - return v_hadd4(vmul); + f32x4 vdot = v_dot4(va, vb); // calls ISA-specific implementation + return v_extract0(vdot); } // Length squared @@ -84,21 +84,22 @@ namespace math_backend::float4 // Normalize in-place inline void float4_normalize_impl(float* a) { - float len = float4_length_impl(a); - if (len > POINT_EPSILON) // safe threshold - { - float4_mul_scalar_impl(a, 1.0f / len, a); - } + f32x4 va = v_load(a); + f32x4 invLen = v_rsqrt_nr(v_dot4(va, va)); // fully abstracted + f32x4 vnorm = v_mul(va, invLen); + v_store(a, vnorm); } // Normalize with magnitude: r = normalize(a) * r inline void float4_normalize_mag_impl(float* a, float r) { - float len = float4_length_impl(a); - if (len > POINT_EPSILON) - { - float4_mul_scalar_impl(a, r / len, a); - } + f32x4 va = v_load(a); + + // invLen = r / sqrt(dot(a,a)) = r * rsqrt(dot(a,a)) + f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot4(va, va))); + + f32x4 vnorm = v_mul(va, invLen); + v_store(a, vnorm); } // Linear interpolation: r = from + (to - from) * f @@ -111,4 +112,12 @@ namespace math_backend::float4 v_store(r, vr); } + inline void float4_cross_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load(a); + f32x4 vb = v_load(b); + f32x4 vcross = v_cross(va, vb); + v_store(r, vcross); + } + } // namespace math_backend::float4 diff --git a/Engine/source/math/impl/math_c.cpp b/Engine/source/math/impl/math_c.cpp new file mode 100644 index 000000000..7d8a317ba --- /dev/null +++ b/Engine/source/math/impl/math_c.cpp @@ -0,0 +1,208 @@ +#include "math/public/float4_dispatch.h" +#include "math/public/float3_dispatch.h" +#include "math/public/mat44_dispatch.h" +#include "math/mConstants.h" +#include // for sqrtf, etc. + +namespace math_backend::float4::dispatch +{ + void install_scalar() + { + gFloat4.add = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] + b[i]; + }; + + gFloat4.sub = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] - b[i]; + }; + + gFloat4.mul = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] * b[i]; + }; + + gFloat4.mul_scalar = [](const float* a, float s, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] * s; + }; + + gFloat4.div = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 4; i++) r[i] = a[i] / b[i]; + }; + + gFloat4.div_scalar = [](const float* a, float s, float* r) { + float denom = 1.0f / s; + for (int i = 0; i < 4; i++) r[i] = a[i] * denom; + }; + + gFloat4.dot = [](const float* a, const float* b) { + float sum = 0.f; + for (int i = 0; i < 4; i++) sum += a[i] * b[i]; + return sum; + }; + + gFloat4.length = [](const float* a) { + float sum = 0.f; + for (int i = 0; i < 4; i++) sum += a[i] * a[i]; + return std::sqrt(sum); + }; + + gFloat4.lengthSquared = [](const float* a) { + float sum = 0.f; + for (int i = 0; i < 4; i++) sum += a[i] * a[i]; + return (sum); + }; + + gFloat4.normalize = [](float* a) { + float len = gFloat4.length(a); + if (len > POINT_EPSILON) + { + float denom = 1.0f / len; + for (int i = 0; i < 4; i++) + a[i] *= denom; + } + }; + + gFloat4.normalize_mag = [](float* a, float f) { + float len = gFloat4.length(a); + if (len > POINT_EPSILON) + { + float denom = f / len; + for (int i = 0; i < 4; i++) a[i] *= denom; + } + }; + + gFloat4.lerp = [](const float* from, const float* to, float f, float* r) { + for (int i = 0; i < 4; i++) r[i] = from[i] + (to[i] - from[i]) * f; + }; + + gFloat4.cross = [](const float* a, const float* b, float* r) { + const float ax = a[0]; + const float ay = a[1]; + const float az = a[2]; + + const float bx = b[0]; + const float by = b[1]; + const float bz = b[2]; + + r[0] = ay * bz - az * by; + r[1] = az * bx - ax * bz; + r[2] = ax * by - ay * bx; + }; + } +} + +namespace math_backend::float3::dispatch +{ + void install_scalar() + { + gFloat3.add = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 3; i++) r[i] = a[i] + b[i]; + }; + + gFloat3.sub = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 3; i++) r[i] = a[i] - b[i]; + }; + + gFloat3.mul = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 3; i++) r[i] = a[i] * b[i]; + }; + + gFloat3.mul_scalar = [](const float* a, float s, float* r) { + for (int i = 0; i < 3; i++) r[i] = a[i] * s; + }; + + gFloat3.div = [](const float* a, const float* b, float* r) { + for (int i = 0; i < 3; i++) r[i] = a[i] / b[i]; + }; + + gFloat3.div_scalar = [](const float* a, float s, float* r) { + float denom = 1.0f / s; + for (int i = 0; i < 3; i++) r[i] = a[i] * denom; + }; + + gFloat3.dot = [](const float* a, const float* b) { + float sum = 0.f; + for (int i = 0; i < 3; i++) sum += a[i] * b[i]; + return sum; + }; + + gFloat3.length = [](const float* a) { + float sum = 0.f; + for (int i = 0; i < 3; i++) sum += a[i] * a[i]; + return std::sqrt(sum); + }; + + gFloat3.lengthSquared = [](const float* a) { + float sum = 0.f; + for (int i = 0; i < 3; i++) sum += a[i] * a[i]; + return (sum); + }; + + gFloat3.normalize = [](float* a) { + float len = gFloat3.length(a); + if (len > POINT_EPSILON) + { + float denom = 1.0 / len; + for (int i = 0; i < 3; i++) a[i] *= denom; + } + }; + + gFloat3.normalize_mag = [](float* a, float f) { + float len = gFloat3.length(a); + if (len > POINT_EPSILON) + { + float denom = f / len; + for (int i = 0; i < 3; i++) a[i] *= denom; + } + }; + + gFloat3.lerp = [](const float* from, const float* to, float f, float* r) { + for (int i = 0; i < 3; i++) r[i] = from[i] + (to[i] - from[i]) * f; + }; + + gFloat3.cross = [](const float* a, const float* b, float* r) { + const float ax = a[0]; + const float ay = a[1]; + const float az = a[2]; + + const float bx = b[0]; + const float by = b[1]; + const float bz = b[2]; + + r[0] = ay * bz - az * by; + r[1] = az * bx - ax * bz; + r[2] = ax * by - ay * bx; + }; + } +} + +inline void swap(float& a, float& b) +{ + float temp = a; + a = b; + b = temp; +} + + +namespace math_backend::mat44::dispatch +{ + void install_scalar() + { + gMat44.transpose = [](float* a) { + swap(a[1], a[4]); + swap(a[2], a[8]); + swap(a[3], a[12]); + swap(a[6], a[9]); + swap(a[7], a[13]); + swap(a[11], a[14]); + }; + + gMat44.scale = [](float* a, const float* s) { + // Note, doesn't allow scaling w... + + a[0] *= s[0]; a[1] *= s[1]; a[2] *= s[2]; + a[4] *= s[0]; a[5] *= s[1]; a[6] *= s[2]; + a[8] *= s[0]; a[9] *= s[1]; a[10] *= s[2]; + a[12] *= s[0]; a[13] *= s[1]; a[14] *= s[2]; + }; + } +} diff --git a/Engine/source/math/isa/avx/avx_intrinsics.h b/Engine/source/math/isa/avx/avx_intrinsics.h new file mode 100644 index 000000000..0340d84a2 --- /dev/null +++ b/Engine/source/math/isa/avx/avx_intrinsics.h @@ -0,0 +1,140 @@ +#pragma once +#include // AVX/AVX2 intrinsics + +namespace +{ + typedef __m128 f32x4; + + //------------------------------------------------------ + // Load / Store + //------------------------------------------------------ + + // Load 4 floats from memory into a SIMD register + inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + inline f32x4 v_zero() { return _mm_setzero_ps(); } + + inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); } + + //------------------------------------------------------ + // Mask helpers + //------------------------------------------------------ + + inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); } + + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) + { + return _mm_blend_ps(newv, original, 0b1000); + } + + //------------------------------------------------------ + // Float3 helpers (safe loading into 4 lanes) + //------------------------------------------------------ + + inline f32x4 v_load3_vec(const float* p) // w = 0 + { + return _mm_set_ps(0.0f, p[2], p[1], p[0]); + } + + inline f32x4 v_load3_pos(const float* p) // w = 1 + { + return _mm_set_ps(1.0f, p[2], p[1], p[0]); + } + + inline void v_store3(float* dst, f32x4 v) + { + alignas(16) float tmp[4]; // temp storage + _mm_store_ps(tmp, v); // store all 4 lanes + dst[0] = tmp[0]; + dst[1] = tmp[1]; + dst[2] = tmp[2]; + } + + //------------------------------------------------------ + // Simple Arithmatic + //------------------------------------------------------ + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + //------------------------------------------------------ + // Fast recip + //------------------------------------------------------ + + // Fast recip 1/b + inline f32x4 v_rcp_nr(f32x4 b) + { + f32x4 r = _mm_rcp_ps(b); + f32x4 two = _mm_set1_ps(2.0f); + return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + } + + // Divide fast ( b = recip eg 1/b) + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); } + + inline f32x4 v_rsqrt_nr(f32x4 x) + { + f32x4 r = _mm_rsqrt_ps(x); + + f32x4 half = _mm_set1_ps(0.5f); + f32x4 three = _mm_set1_ps(3.0f); + + r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); + + return _mm_mul_ps(r, half); + } + + //------------------------------------------------------ + // Vector intrinsic functions + //------------------------------------------------------ + + // full dot4 + inline f32x4 v_dot4(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1 + } + + // dot3 (ignores w) + inline f32x4 v_dot3(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1 + } + + // cross product xyz only. + inline f32x4 v_cross(f32x4 a, f32x4 b) + { + f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); + f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); + + f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b)); + + return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1)); + } + + inline f32x4 v_normalize3(f32x4 v) + { + f32x4 inv = v_rsqrt_nr(v_dot3(v, v)); + return _mm_mul_ps(v, inv); + } + + // adds all 4 lanes together. + inline f32x4 v_hadd4(f32x4 a) + { + // sum all 4 lanes in SSE41 + __m128 sum = _mm_hadd_ps(a, a); + return _mm_hadd_ps(sum, sum); + } +} diff --git a/Engine/source/math/isa/avx/float3.cpp b/Engine/source/math/isa/avx/float3.cpp new file mode 100644 index 000000000..7829f292c --- /dev/null +++ b/Engine/source/math/isa/avx/float3.cpp @@ -0,0 +1,26 @@ +#include "avx_intrinsics.h" +#include "float3_dispatch.h" +#include // AVX/AVX2 intrinsics + +#include "float3_impl.inl" + +namespace math_backend::float3::dispatch +{ + // Install AVX backend + void install_avx() + { + gFloat3.add = float3_add_impl; + gFloat3.sub = float3_sub_impl; + gFloat3.mul = float3_mul_impl; + gFloat3.mul_scalar = float3_mul_scalar_impl; + gFloat3.div = float3_div_impl; + gFloat3.div_scalar = float3_div_scalar_impl; + gFloat3.dot = float3_dot_impl; + gFloat3.length = float3_length_impl; + gFloat3.lengthSquared = float3_length_squared_impl; + gFloat3.normalize = float3_normalize_impl; + gFloat3.normalize_mag = float3_normalize_mag_impl; + gFloat3.lerp = float3_lerp_impl; + gFloat3.cross = float3_cross_impl; + } +} diff --git a/Engine/source/math/isa/avx/float4.cpp b/Engine/source/math/isa/avx/float4.cpp index 1e23fb8b1..cfdab0908 100644 --- a/Engine/source/math/isa/avx/float4.cpp +++ b/Engine/source/math/isa/avx/float4.cpp @@ -1,49 +1,5 @@ - +#include "avx_intrinsics.h" #include "float4_dispatch.h" -#include // AVX/AVX2 intrinsics - -namespace -{ - typedef __m128 f32x4; - - // Load 4 floats from memory into a SIMD register - inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } - - // Store 4 floats from SIMD register back to memory - inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } - - // Broadcast a single float across all 4 lanes - inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } - - // Element-wise multiply - inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } - - // Element-wise divide - inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } - - // Element-wise add - inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } - - // Element-wise subtract - inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - - // Horizontal sum of all 4 elements (for dot product, length, etc.) - inline float v_hadd4(f32x4 a) - { - __m128 t1 = _mm_hadd_ps(a, a); // sums pairs: [a0+a1, a2+a3, ...] - __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3 - return _mm_cvtss_f32(t2); // extract first element - } - - // specialized dot product for AVX - float float4_dot_avx(const float* a, const float* b) - { - f32x4 va = _mm_loadu_ps(a); - f32x4 vb = _mm_loadu_ps(b); - __m128 dp = _mm_dp_ps(va, vb, 0xF1); // multiply all 4, sum all 4, lowest lane - return _mm_cvtss_f32(dp); - } -} #include "float4_impl.inl" @@ -52,16 +8,18 @@ namespace math_backend::float4::dispatch // Install AVX backend void install_avx() { - gFloat4.add = float4_add_impl; - gFloat4.sub = float4_sub_impl; - gFloat4.mul = float4_mul_impl; - gFloat4.mul_scalar = float4_mul_scalar_impl; - gFloat4.div = float4_div_impl; - gFloat4.div_scalar = float4_div_scalar_impl; - gFloat4.dot = float4_dot_avx; - gFloat4.length = float4_length_impl; + gFloat4.add = float4_add_impl; + gFloat4.sub = float4_sub_impl; + gFloat4.mul = float4_mul_impl; + gFloat4.mul_scalar = float4_mul_scalar_impl; + gFloat4.div = float4_div_impl; + gFloat4.div_scalar = float4_div_scalar_impl; + gFloat4.dot = float4_dot_impl; + gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; - gFloat4.normalize = float4_normalize_impl; - gFloat4.lerp = float4_lerp_impl; + gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; + gFloat4.lerp = float4_lerp_impl; + gFloat4.cross = float4_cross_impl; } } diff --git a/Engine/source/math/isa/avx2/avx2_intrinsics.h b/Engine/source/math/isa/avx2/avx2_intrinsics.h new file mode 100644 index 000000000..0340d84a2 --- /dev/null +++ b/Engine/source/math/isa/avx2/avx2_intrinsics.h @@ -0,0 +1,140 @@ +#pragma once +#include // AVX/AVX2 intrinsics + +namespace +{ + typedef __m128 f32x4; + + //------------------------------------------------------ + // Load / Store + //------------------------------------------------------ + + // Load 4 floats from memory into a SIMD register + inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + inline f32x4 v_zero() { return _mm_setzero_ps(); } + + inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); } + + //------------------------------------------------------ + // Mask helpers + //------------------------------------------------------ + + inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); } + + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) + { + return _mm_blend_ps(newv, original, 0b1000); + } + + //------------------------------------------------------ + // Float3 helpers (safe loading into 4 lanes) + //------------------------------------------------------ + + inline f32x4 v_load3_vec(const float* p) // w = 0 + { + return _mm_set_ps(0.0f, p[2], p[1], p[0]); + } + + inline f32x4 v_load3_pos(const float* p) // w = 1 + { + return _mm_set_ps(1.0f, p[2], p[1], p[0]); + } + + inline void v_store3(float* dst, f32x4 v) + { + alignas(16) float tmp[4]; // temp storage + _mm_store_ps(tmp, v); // store all 4 lanes + dst[0] = tmp[0]; + dst[1] = tmp[1]; + dst[2] = tmp[2]; + } + + //------------------------------------------------------ + // Simple Arithmatic + //------------------------------------------------------ + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + //------------------------------------------------------ + // Fast recip + //------------------------------------------------------ + + // Fast recip 1/b + inline f32x4 v_rcp_nr(f32x4 b) + { + f32x4 r = _mm_rcp_ps(b); + f32x4 two = _mm_set1_ps(2.0f); + return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + } + + // Divide fast ( b = recip eg 1/b) + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); } + + inline f32x4 v_rsqrt_nr(f32x4 x) + { + f32x4 r = _mm_rsqrt_ps(x); + + f32x4 half = _mm_set1_ps(0.5f); + f32x4 three = _mm_set1_ps(3.0f); + + r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); + + return _mm_mul_ps(r, half); + } + + //------------------------------------------------------ + // Vector intrinsic functions + //------------------------------------------------------ + + // full dot4 + inline f32x4 v_dot4(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1 + } + + // dot3 (ignores w) + inline f32x4 v_dot3(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1 + } + + // cross product xyz only. + inline f32x4 v_cross(f32x4 a, f32x4 b) + { + f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); + f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); + + f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b)); + + return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1)); + } + + inline f32x4 v_normalize3(f32x4 v) + { + f32x4 inv = v_rsqrt_nr(v_dot3(v, v)); + return _mm_mul_ps(v, inv); + } + + // adds all 4 lanes together. + inline f32x4 v_hadd4(f32x4 a) + { + // sum all 4 lanes in SSE41 + __m128 sum = _mm_hadd_ps(a, a); + return _mm_hadd_ps(sum, sum); + } +} diff --git a/Engine/source/math/isa/avx2/float3.cpp b/Engine/source/math/isa/avx2/float3.cpp new file mode 100644 index 000000000..fd0d485f6 --- /dev/null +++ b/Engine/source/math/isa/avx2/float3.cpp @@ -0,0 +1,26 @@ +#include "avx2_intrinsics.h" +#include "float3_dispatch.h" +#include // AVX/AVX2 intrinsics + +#include "float3_impl.inl" + +namespace math_backend::float3::dispatch +{ + // Install AVX2 backend + void install_avx2() + { + gFloat3.add = float3_add_impl; + gFloat3.sub = float3_sub_impl; + gFloat3.mul = float3_mul_impl; + gFloat3.mul_scalar = float3_mul_scalar_impl; + gFloat3.div = float3_div_impl; + gFloat3.div_scalar = float3_div_scalar_impl; + gFloat3.dot = float3_dot_impl; + gFloat3.length = float3_length_impl; + gFloat3.lengthSquared = float3_length_squared_impl; + gFloat3.normalize = float3_normalize_impl; + gFloat3.normalize_mag = float3_normalize_mag_impl; + gFloat3.lerp = float3_lerp_impl; + gFloat3.cross = float3_cross_impl; + } +} diff --git a/Engine/source/math/isa/avx2/float4.cpp b/Engine/source/math/isa/avx2/float4.cpp index 439d0e2d0..85228caaf 100644 --- a/Engine/source/math/isa/avx2/float4.cpp +++ b/Engine/source/math/isa/avx2/float4.cpp @@ -1,49 +1,5 @@ - +#include "avx2_intrinsics.h" #include "float4_dispatch.h" -#include // AVX/AVX2 intrinsics - -namespace -{ - typedef __m128 f32x4; - - // Load 4 floats from memory into a SIMD register - inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } - - // Store 4 floats from SIMD register back to memory - inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } - - // Broadcast a single float across all 4 lanes - inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } - - // Element-wise multiply - inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } - - // Element-wise divide - inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } - - // Element-wise add - inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } - - // Element-wise subtract - inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - - // Horizontal sum of all 4 elements (for dot product, length, etc.) - inline float v_hadd4(f32x4 a) - { - __m128 t1 = _mm_hadd_ps(a, a); // sums pairs: [a0+a1, a2+a3, ...] - __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3 - return _mm_cvtss_f32(t2); // extract first element - } - - // specialized dot product for AVX - float float4_dot_avx(const float* a, const float* b) - { - f32x4 va = _mm_loadu_ps(a); - f32x4 vb = _mm_loadu_ps(b); - __m128 dp = _mm_dp_ps(va, vb, 0xF1); // multiply all 4, sum all 4, lowest lane - return _mm_cvtss_f32(dp); - } -} #include "float4_impl.inl" @@ -58,10 +14,12 @@ namespace math_backend::float4::dispatch gFloat4.mul_scalar = float4_mul_scalar_impl; gFloat4.div = float4_div_impl; gFloat4.div_scalar = float4_div_scalar_impl; - gFloat4.dot = float4_dot_avx; + gFloat4.dot = float4_dot_impl; gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; gFloat4.lerp = float4_lerp_impl; + gFloat4.cross = float4_cross_impl; } } diff --git a/Engine/source/math/isa/neon/float3.cpp b/Engine/source/math/isa/neon/float3.cpp new file mode 100644 index 000000000..904787932 --- /dev/null +++ b/Engine/source/math/isa/neon/float3.cpp @@ -0,0 +1,25 @@ +#include "neon_intrinsics.h" +#include "float3_dispatch.h" + +#include "float3_impl.inl" + +namespace math_backend::float3::dispatch +{ + // Install NEON backend + void install_neon() + { + gFloat3.add = float3_add_impl; + gFloat3.sub = float3_sub_impl; + gFloat3.mul = float3_mul_impl; + gFloat3.mul_scalar = float3_mul_scalar_impl; + gFloat3.div = float3_div_impl; + gFloat3.div_scalar = float3_div_scalar_impl; + gFloat3.dot = float3_dot_impl; + gFloat3.length = float3_length_impl; + gFloat3.lengthSquared = float3_length_squared_impl; + gFloat3.normalize = float3_normalize_impl; + gFloat3.normalize_mag = float3_normalize_mag_impl; + gFloat3.lerp = float3_lerp_impl; + gFloat3.cross = float3_cross_impl; + } +} diff --git a/Engine/source/math/isa/neon/float4.cpp b/Engine/source/math/isa/neon/float4.cpp index 6258db743..6996f2a76 100644 --- a/Engine/source/math/isa/neon/float4.cpp +++ b/Engine/source/math/isa/neon/float4.cpp @@ -1,50 +1,25 @@ +#include "neon_intrinsics.h" #include "float4_dispatch.h" -#include -namespace -{ - typedef float32x4_t f32x4; - - inline f32x4 v_load(const float* p) { return vld1q_f32(p); } - inline void v_store(float* dst, f32x4 v) { vst1q_f32(dst, v); } - inline f32x4 v_set1(float s) { return vdupq_n_f32(s); } - - inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); } - inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); } - inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); } - - // AArch64 native divide - inline f32x4 v_div(f32x4 a, f32x4 b) - { - return vdivq_f32(a, b); - } - - inline float v_hadd4(f32x4 a) - { - float32x2_t low = vget_low_f32(a); - float32x2_t high = vget_high_f32(a); - float32x2_t sum = vadd_f32(low, high); - sum = vpadd_f32(sum, sum); - return vget_lane_f32(sum, 0); - } -} - -#include "../../impl/float4_impl.inl" +#include "float4_impl.inl" namespace math_backend::float4::dispatch { + // Install NEON64 backend void install_neon() { - gFloat4.add = float4_add_impl; - gFloat4.sub = float4_sub_impl; - gFloat4.mul = float4_mul_impl; - gFloat4.mul_scalar = float4_mul_scalar_impl; - gFloat4.div = float4_div_impl; - gFloat4.div_scalar = float4_div_scalar_impl; - gFloat4.dot = float4_dot_impl; - gFloat4.length = float4_length_impl; + gFloat4.add = float4_add_impl; + gFloat4.sub = float4_sub_impl; + gFloat4.mul = float4_mul_impl; + gFloat4.mul_scalar = float4_mul_scalar_impl; + gFloat4.div = float4_div_impl; + gFloat4.div_scalar = float4_div_scalar_impl; + gFloat4.dot = float4_dot_impl; + gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; - gFloat4.normalize = float4_normalize_impl; - gFloat4.lerp = float4_lerp_impl; + gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; + gFloat4.lerp = float4_lerp_impl; + gFloat4.cross = float4_cross_impl; } } diff --git a/Engine/source/math/isa/neon/neon_intrinsics.h b/Engine/source/math/isa/neon/neon_intrinsics.h new file mode 100644 index 000000000..3476fab1b --- /dev/null +++ b/Engine/source/math/isa/neon/neon_intrinsics.h @@ -0,0 +1,130 @@ +#pragma once +#include + +namespace +{ + typedef float32x4_t f32x4; + + //------------------------------------------------------ + // Load / Store + //------------------------------------------------------ + inline f32x4 v_load(const float* p) { return vld1q_f32(p); } + inline void v_store(float* dst, f32x4 v) { vst1q_f32(dst, v); } + inline f32x4 v_set1(float s) { return vdupq_n_f32(s); } + inline f32x4 v_zero() { return vdupq_n_f32(0.0f); } + inline float v_extract0(f32x4 v) { return vgetq_lane_f32(v, 0); } + + //------------------------------------------------------ + // Mask helpers + //------------------------------------------------------ + inline f32x4 v_mask_xyz() + { + // equivalent to [1,1,1,0] + float32x4_t mask = {1.0f, 1.0f, 1.0f, 0.0f}; + return mask; + } + + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) + { + float32x4_t mask = {0.0f, 0.0f, 0.0f, 1.0f}; + return vbslq_f32(vreinterpretq_u32_f32(mask), original, newv); + } + + //------------------------------------------------------ + // Float3 helpers + //------------------------------------------------------ + inline f32x4 v_load3_vec(const float* p) // w = 0 + { + float tmp[4] = { p[0], p[1], p[2], 0.0f }; + return vld1q_f32(tmp); + } + + inline f32x4 v_load3_pos(const float* p) // w = 1 + { + float tmp[4] = { p[0], p[1], p[2], 1.0f }; + return vld1q_f32(tmp); + } + + inline void v_store3(float* dst, f32x4 v) + { + float tmp[4]; + vst1q_f32(tmp, v); + dst[0] = tmp[0]; + dst[1] = tmp[1]; + dst[2] = tmp[2]; + } + + //------------------------------------------------------ + // Simple Arithmetic + //------------------------------------------------------ + inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); } + inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return vdivq_f32(a, b); } // only NEON64 + inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); } + inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); } + + //------------------------------------------------------ + // Fast recip + //------------------------------------------------------ + inline f32x4 v_rcp_nr(f32x4 b) + { + f32x4 r = vrecpeq_f32(b); + r = vmulq_f32(r, vrecpsq_f32(b, r)); // Newton-Raphson + r = vmulq_f32(r, vrecpsq_f32(b, r)); + return r; + } + + inline f32x4 v_div(f32x4 a, f32x4 b) + { + return vmulq_f32(a, v_rcp_nr(b)); + } + + inline f32x4 v_rsqrt_nr(f32x4 x) + { + f32x4 r = vrsqrteq_f32(x); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r,r), x)); // refine + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r,r), x)); + return r; + } + + //------------------------------------------------------ + // Vector intrinsic functions + //------------------------------------------------------ + inline f32x4 v_dot4(f32x4 a, f32x4 b) + { + f32x4 mul = vmulq_f32(a, b); + float32x2_t sum2 = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul)); + float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1); + return vdupq_n_f32(sum); + } + + inline f32x4 v_dot3(f32x4 a, f32x4 b) + { + float32x4_t mask = {1.0f, 1.0f, 1.0f, 0.0f}; + f32x4 mul = vmulq_f32(a, b); + mul = vmulq_f32(mul, mask); + float32x2_t sum2 = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul)); + float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1); + return vdupq_n_f32(sum); + } + + inline f32x4 v_cross(f32x4 a, f32x4 b) + { + float32x4_t a_yzx = { vgetq_lane_f32(a,1), vgetq_lane_f32(a,2), vgetq_lane_f32(a,0), 0 }; + float32x4_t b_yzx = { vgetq_lane_f32(b,1), vgetq_lane_f32(b,2), vgetq_lane_f32(b,0), 0 }; + float32x4_t c = vsubq_f32(vmulq_f32(a, b_yzx), vmulq_f32(a_yzx, b)); + return (float32x4_t){ vgetq_lane_f32(c,2), vgetq_lane_f32(c,0), vgetq_lane_f32(c,1), 0 }; + } + + inline f32x4 v_normalize3(f32x4 v) + { + f32x4 inv = v_rsqrt_nr(v_dot3(v,v)); + return vmulq_f32(v, inv); + } + + inline f32x4 v_hadd4(f32x4 a) + { + float32x2_t sum2 = vpadd_f32(vget_low_f32(a), vget_high_f32(a)); + float sum = vget_lane_f32(sum2,0) + vget_lane_f32(sum2,1); + return vdupq_n_f32(sum); + } +} diff --git a/Engine/source/math/isa/sse2/float3.cpp b/Engine/source/math/isa/sse2/float3.cpp new file mode 100644 index 000000000..bc822e8fc --- /dev/null +++ b/Engine/source/math/isa/sse2/float3.cpp @@ -0,0 +1,26 @@ +#include "sse2_intrinsics.h" +#include "float3_dispatch.h" +#include // SSE2 intrinsics + +#include "float3_impl.inl" + +namespace math_backend::float3::dispatch +{ + // Install SSE2 backend + void install_sse2() + { + gFloat3.add = float3_add_impl; + gFloat3.sub = float3_sub_impl; + gFloat3.mul = float3_mul_impl; + gFloat3.mul_scalar = float3_mul_scalar_impl; + gFloat3.div = float3_div_impl; + gFloat3.div_scalar = float3_div_scalar_impl; + gFloat3.dot = float3_dot_impl; + gFloat3.length = float3_length_impl; + gFloat3.lengthSquared = float3_length_squared_impl; + gFloat3.normalize = float3_normalize_impl; + gFloat3.normalize_mag = float3_normalize_mag_impl; + gFloat3.lerp = float3_lerp_impl; + gFloat3.cross = float3_cross_impl; + } +} diff --git a/Engine/source/math/isa/sse2/float4.cpp b/Engine/source/math/isa/sse2/float4.cpp index 00850560a..aa986474b 100644 --- a/Engine/source/math/isa/sse2/float4.cpp +++ b/Engine/source/math/isa/sse2/float4.cpp @@ -1,42 +1,8 @@ +#include "sse2_intrinsics.h" #include "float4_dispatch.h" -#include // SSE2 intrinsics -namespace -{ - typedef __m128 f32x4; - // Load 4 floats from memory into a SIMD register - inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } +#include "float4_impl.inl" - // Store 4 floats from SIMD register back to memory - inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } - - // Broadcast a single float across all 4 lanes - inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } - - // Element-wise multiply - inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } - - // Element-wise divide - inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } - - // Element-wise add - inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } - - // Element-wise subtract - inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - - // Horizontal sum of all 4 elements (for dot product, length, etc.) - inline float v_hadd4(f32x4 a) - { - __m128 shuf = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); // swap pairs - __m128 sums = _mm_add_ps(a, shuf); // sums: [a0+a1 a1+a0 a2+a3 a3+a2] - shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)); // move high pair to low - sums = _mm_add_ps(sums, shuf); // total sum in lower float - return _mm_cvtss_f32(sums); - } -} - -#include "../../impl/float4_impl.inl" namespace math_backend::float4::dispatch { @@ -53,6 +19,8 @@ namespace math_backend::float4::dispatch gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; gFloat4.lerp = float4_lerp_impl; + gFloat4.cross = float4_cross_impl; } } diff --git a/Engine/source/math/isa/sse2/sse2_intrinsics.h b/Engine/source/math/isa/sse2/sse2_intrinsics.h new file mode 100644 index 000000000..85b09ebb0 --- /dev/null +++ b/Engine/source/math/isa/sse2/sse2_intrinsics.h @@ -0,0 +1,156 @@ +#pragma once +#include // SSE2 +#include // SSE + +namespace +{ + typedef __m128 f32x4; + + //------------------------------------------------------ + // Load / Store + //------------------------------------------------------ + + // Load 4 floats from memory into a SIMD register + inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + inline f32x4 v_zero() { return _mm_setzero_ps(); } + + inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); } + + //------------------------------------------------------ + // Float3 helpers (safe loading into 4 lanes) + //------------------------------------------------------ + + inline f32x4 v_load3_vec(const float* p) // w = 0 + { + return _mm_set_ps(0.0f, p[2], p[1], p[0]); + } + + inline f32x4 v_load3_pos(const float* p) // w = 1 + { + return _mm_set_ps(1.0f, p[2], p[1], p[0]); + } + + inline void v_store3(float* dst, f32x4 v) + { + alignas(16) float tmp[4]; // temp storage + _mm_store_ps(tmp, v); // store all 4 lanes + dst[0] = tmp[0]; + dst[1] = tmp[1]; + dst[2] = tmp[2]; + } + + //------------------------------------------------------ + // Mask helpers + //------------------------------------------------------ + + inline f32x4 v_mask_xyz() { return _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)); } + + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) + { + f32x4 mask = _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)); + return _mm_or_ps(_mm_and_ps(mask, original), _mm_andnot_ps(mask, newv)); + } + + + //------------------------------------------------------ + // Simple Arithmatic + //------------------------------------------------------ + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + //------------------------------------------------------ + // Fast recip + //------------------------------------------------------ + + // Fast recip 1/b + inline f32x4 v_rcp_nr(f32x4 b) + { + f32x4 r = _mm_rcp_ps(b); + f32x4 two = _mm_set1_ps(2.0f); + return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + } + + // Divide fast ( b = recip eg 1/b) + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); } + + inline f32x4 v_rsqrt_nr(f32x4 x) + { + f32x4 r = _mm_rsqrt_ps(x); + + f32x4 half = _mm_set1_ps(0.5f); + f32x4 three = _mm_set1_ps(3.0f); + + r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); + + return _mm_mul_ps(r, half); + } + + //------------------------------------------------------ + // Vector intrinsic functions + //------------------------------------------------------ + + // full dot4 + inline f32x4 v_dot4(f32x4 a, f32x4 b) + { + f32x4 prod = _mm_mul_ps(a, b); // multiply element-wise + f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1)); + prod = _mm_add_ps(prod, shuf); + shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2)); + prod = _mm_add_ps(prod, shuf); + return prod; // f32x4, all lanes = dot(a,b) + } + + // dot3 (ignores w) + inline f32x4 v_dot3(f32x4 a, f32x4 b) + { + f32x4 prod = _mm_mul_ps(a, b); + prod = _mm_and_ps(prod, v_mask_xyz()); // zero w + f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1)); + prod = _mm_add_ps(prod, shuf); + shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2)); + prod = _mm_add_ps(prod, shuf); + return prod; // f32x4, all lanes = dot(a,b) + } + + // cross product xyz only. + inline f32x4 v_cross(f32x4 a, f32x4 b) + { + f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); + f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); + + f32x4 c = _mm_sub_ps( _mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b)); + + return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1)); + } + + inline f32x4 v_normalize3(f32x4 v) + { + f32x4 inv = v_rsqrt_nr(v_dot3(v, v)); + return _mm_mul_ps(v, inv); + } + + // adds all 4 lanes together. + inline f32x4 v_hadd4(f32x4 a) + { + // sum all 4 lanes in SSE2 + __m128 shuf = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); // swap pairs + __m128 sums = _mm_add_ps(a, shuf); + shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)); + return _mm_add_ps(sums, shuf); + } +} diff --git a/Engine/source/math/isa/sse41/float3.cpp b/Engine/source/math/isa/sse41/float3.cpp new file mode 100644 index 000000000..adf1a3a3d --- /dev/null +++ b/Engine/source/math/isa/sse41/float3.cpp @@ -0,0 +1,26 @@ +#include "sse41_intrinsics.h" +#include "float3_dispatch.h" +#include // SSE41 intrinsics + +#include "float3_impl.inl" + +namespace math_backend::float3::dispatch +{ + // Install SSE41 backend + void install_sse41() + { + gFloat3.add = float3_add_impl; + gFloat3.sub = float3_sub_impl; + gFloat3.mul = float3_mul_impl; + gFloat3.mul_scalar = float3_mul_scalar_impl; + gFloat3.div = float3_div_impl; + gFloat3.div_scalar = float3_div_scalar_impl; + gFloat3.dot = float3_dot_impl; + gFloat3.length = float3_length_impl; + gFloat3.lengthSquared = float3_length_squared_impl; + gFloat3.normalize = float3_normalize_impl; + gFloat3.normalize_mag = float3_normalize_mag_impl; + gFloat3.lerp = float3_lerp_impl; + gFloat3.cross = float3_cross_impl; + } +} diff --git a/Engine/source/math/isa/sse41/float4.cpp b/Engine/source/math/isa/sse41/float4.cpp index 80127acb9..e9ca8aae2 100644 --- a/Engine/source/math/isa/sse41/float4.cpp +++ b/Engine/source/math/isa/sse41/float4.cpp @@ -1,49 +1,5 @@ - +#include "sse41_intrinsics.h" #include "float4_dispatch.h" -#include // SSE41 intrinsics - -namespace -{ - typedef __m128 f32x4; - - // Load 4 floats from memory into a SIMD register - inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } - - // Store 4 floats from SIMD register back to memory - inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } - - // Broadcast a single float across all 4 lanes - inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } - - // Element-wise multiply - inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } - - // Element-wise divide - inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } - - // Element-wise add - inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } - - // Element-wise subtract - inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } - - // Horizontal sum of all 4 elements (for dot product, length, etc.) - inline float v_hadd4(f32x4 a) - { - __m128 t1 = _mm_hadd_ps(a, a); // sums pairs: [a0+a1, a2+a3, ...] - __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3 - return _mm_cvtss_f32(t2); // extract first element - } - - // specialized dot product for SSE4.1 - float float4_dot_sse41(const float* a, const float* b) - { - f32x4 va = _mm_loadu_ps(a); - f32x4 vb = _mm_loadu_ps(b); - __m128 dp = _mm_dp_ps(va, vb, 0xF1); // multiply all 4, sum all 4, lowest lane - return _mm_cvtss_f32(dp); - } -} #include "float4_impl.inl" @@ -58,10 +14,12 @@ namespace math_backend::float4::dispatch gFloat4.mul_scalar = float4_mul_scalar_impl; gFloat4.div = float4_div_impl; gFloat4.div_scalar = float4_div_scalar_impl; - gFloat4.dot = float4_dot_sse41; + gFloat4.dot = float4_dot_impl; gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; gFloat4.lerp = float4_lerp_impl; + gFloat4.cross = float4_cross_impl; } } diff --git a/Engine/source/math/isa/sse41/sse41_intrinsics.h b/Engine/source/math/isa/sse41/sse41_intrinsics.h new file mode 100644 index 000000000..047cb44ee --- /dev/null +++ b/Engine/source/math/isa/sse41/sse41_intrinsics.h @@ -0,0 +1,140 @@ +#pragma once +#include // SSE4.1 + +namespace +{ + typedef __m128 f32x4; + + //------------------------------------------------------ + // Load / Store + //------------------------------------------------------ + + // Load 4 floats from memory into a SIMD register + inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); } + + inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); } + + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + inline f32x4 v_zero() { return _mm_setzero_ps(); } + + inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); } + + //------------------------------------------------------ + // Mask helpers + //------------------------------------------------------ + + inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); } + + inline f32x4 v_preserve_w(f32x4 newv, f32x4 original) + { + return _mm_blend_ps(newv, original, 0b1000); + } + + //------------------------------------------------------ + // Float3 helpers (safe loading into 4 lanes) + //------------------------------------------------------ + + inline f32x4 v_load3_vec(const float* p) // w = 0 + { + return _mm_set_ps(0.0f, p[2], p[1], p[0]); + } + + inline f32x4 v_load3_pos(const float* p) // w = 1 + { + return _mm_set_ps(1.0f, p[2], p[1], p[0]); + } + + inline void v_store3(float* dst, f32x4 v) + { + alignas(16) float tmp[4]; // temp storage + _mm_store_ps(tmp, v); // store all 4 lanes + dst[0] = tmp[0]; + dst[1] = tmp[1]; + dst[2] = tmp[2]; + } + + //------------------------------------------------------ + // Simple Arithmatic + //------------------------------------------------------ + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide + inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + //------------------------------------------------------ + // Fast recip + //------------------------------------------------------ + + // Fast recip 1/b + inline f32x4 v_rcp_nr(f32x4 b) + { + f32x4 r = _mm_rcp_ps(b); + f32x4 two = _mm_set1_ps(2.0f); + return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + } + + // Divide fast ( b = recip eg 1/b) + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); } + + inline f32x4 v_rsqrt_nr(f32x4 x) + { + f32x4 r = _mm_rsqrt_ps(x); + + f32x4 half = _mm_set1_ps(0.5f); + f32x4 three = _mm_set1_ps(3.0f); + + r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); + + return _mm_mul_ps(r, half); + } + + //------------------------------------------------------ + // Vector intrinsic functions + //------------------------------------------------------ + + // full dot4 + inline f32x4 v_dot4(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1 + } + + // dot3 (ignores w) + inline f32x4 v_dot3(f32x4 a, f32x4 b) + { + return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1 + } + + // cross product xyz only. + inline f32x4 v_cross(f32x4 a, f32x4 b) + { + f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); + f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); + + f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b)); + + return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1)); + } + + inline f32x4 v_normalize3(f32x4 v) + { + f32x4 inv = v_rsqrt_nr(v_dot3(v, v)); + return _mm_mul_ps(v, inv); + } + + // adds all 4 lanes together. + inline f32x4 v_hadd4(f32x4 a) + { + // sum all 4 lanes in SSE41 + __m128 sum = _mm_hadd_ps(a, a); + return _mm_hadd_ps(sum, sum); + } +} diff --git a/Engine/source/math/mPoint3.h b/Engine/source/math/mPoint3.h index c1bb72923..8e329f3a1 100644 --- a/Engine/source/math/mPoint3.h +++ b/Engine/source/math/mPoint3.h @@ -29,6 +29,11 @@ #ifndef _MPOINT2_H_ #include "math/mPoint2.h" #endif +#ifndef _MATH_BACKEND_H_ +#include "math/public/math_backend.h" +#endif + +#include //------------------------------------------------------------------------------ /// 3D integer point @@ -97,6 +102,7 @@ public: class Point3D; //------------------------------------------------------------------------------ +using math_backend::float3::dispatch::gFloat3; class Point3F { //-------------------------------------- Public data @@ -497,7 +503,8 @@ inline void Point3F::setMax(const Point3F& _test) inline void Point3F::interpolate(const Point3F& _from, const Point3F& _to, F32 _factor) { AssertFatal(_factor >= 0.0f && _factor <= 1.0f, "Out of bound interpolation factor"); - m_point3F_interpolate( _from, _to, _factor, *this); + + gFloat3.lerp(_from, _to, _factor, *this); } inline void Point3F::zero() @@ -599,17 +606,17 @@ inline void Point3F::convolveInverse(const Point3F& c) inline F32 Point3F::lenSquared() const { - return (x * x) + (y * y) + (z * z); + return gFloat3.lengthSquared(*this); } inline F32 Point3F::len() const { - return mSqrt(x*x + y*y + z*z); + return gFloat3.length(*this); } inline void Point3F::normalize() { - m_point3F_normalize(*this); + gFloat3.normalize(*this); } inline F32 Point3F::magnitudeSafe() const @@ -626,18 +633,13 @@ inline F32 Point3F::magnitudeSafe() const inline void Point3F::normalizeSafe() { - F32 vmag = magnitudeSafe(); - - if( vmag > POINT_EPSILON ) - { - *this *= F32(1.0 / vmag); - } + gFloat3.normalize(*this); } inline void Point3F::normalize(F32 val) { - m_point3F_normalize_f(*this, val); + gFloat3.normalize_mag(*this, val); } inline bool Point3F::operator==(const Point3F& _test) const @@ -652,52 +654,49 @@ inline bool Point3F::operator!=(const Point3F& _test) const inline Point3F Point3F::operator+(const Point3F& _add) const { - return Point3F(x + _add.x, y + _add.y, z + _add.z); + Point3F temp; + gFloat3.add(*this, _add, temp); + return temp; } inline Point3F Point3F::operator-(const Point3F& _rSub) const { - return Point3F(x - _rSub.x, y - _rSub.y, z - _rSub.z); + Point3F temp; + gFloat3.sub(*this, _rSub, temp); + return temp; } inline Point3F& Point3F::operator+=(const Point3F& _add) { - x += _add.x; - y += _add.y; - z += _add.z; - + gFloat3.add(*this, _add, *this); return *this; } inline Point3F& Point3F::operator-=(const Point3F& _rSub) { - x -= _rSub.x; - y -= _rSub.y; - z -= _rSub.z; - + gFloat3.sub(*this, _rSub, *this); return *this; } inline Point3F Point3F::operator*(F32 _mul) const { - return Point3F(x * _mul, y * _mul, z * _mul); + Point3F temp; + gFloat3.mul_scalar(*this, _mul, temp); + return temp; } inline Point3F Point3F::operator/(F32 _div) const { AssertFatal(_div != 0.0f, "Error, div by zero attempted"); - F32 inv = 1.0f / _div; - - return Point3F(x * inv, y * inv, z * inv); + Point3F temp; + gFloat3.div_scalar(*this, _div, temp); + return temp; } inline Point3F& Point3F::operator*=(F32 _mul) { - x *= _mul; - y *= _mul; - z *= _mul; - + gFloat3.mul_scalar(*this, _mul, *this); return *this; } @@ -705,39 +704,35 @@ inline Point3F& Point3F::operator/=(F32 _div) { AssertFatal(_div != 0.0f, "Error, div by zero attempted"); - F32 inv = 1.0f / _div; - x *= inv; - y *= inv; - z *= inv; - + gFloat3.div_scalar(*this, _div, *this); return *this; } inline Point3F Point3F::operator*(const Point3F &_vec) const { - return Point3F(x * _vec.x, y * _vec.y, z * _vec.z); + Point3F temp; + gFloat3.mul(*this, _vec, temp); + return temp; } inline Point3F& Point3F::operator*=(const Point3F &_vec) { - x *= _vec.x; - y *= _vec.y; - z *= _vec.z; + gFloat3.mul(*this, _vec, *this); return *this; } inline Point3F Point3F::operator/(const Point3F &_vec) const { AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted"); - return Point3F(x / _vec.x, y / _vec.y, z / _vec.z); + Point3F temp; + gFloat3.div(*this, _vec, temp); + return temp; } inline Point3F& Point3F::operator/=(const Point3F &_vec) { AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted"); - x /= _vec.x; - y /= _vec.y; - z /= _vec.z; + gFloat3.div(*this, _vec, *this); return *this; } @@ -989,7 +984,9 @@ inline Point3I operator*(S32 mul, const Point3I& multiplicand) inline Point3F operator*(F32 mul, const Point3F& multiplicand) { - return multiplicand * mul; + Point3F temp; + gFloat3.mul_scalar(multiplicand, mul, temp); + return temp; } inline Point3D operator*(F64 mul, const Point3D& multiplicand) @@ -999,7 +996,7 @@ inline Point3D operator*(F64 mul, const Point3D& multiplicand) inline F32 mDot(const Point3F &p1, const Point3F &p2) { - return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z); + return gFloat3.dot(p1, p2); } inline F64 mDot(const Point3D &p1, const Point3D &p2) @@ -1009,9 +1006,7 @@ inline F64 mDot(const Point3D &p1, const Point3D &p2) inline void mCross(const Point3F &a, const Point3F &b, Point3F *res) { - res->x = (a.y * b.z) - (a.z * b.y); - res->y = (a.z * b.x) - (a.x * b.z); - res->z = (a.x * b.y) - (a.y * b.x); + gFloat3.cross(a, b, *res); } inline void mCross(const Point3D &a, const Point3D &b, Point3D *res) @@ -1024,7 +1019,7 @@ inline void mCross(const Point3D &a, const Point3D &b, Point3D *res) inline Point3F mCross(const Point3F &a, const Point3F &b) { Point3F r; - mCross( a, b, &r ); + gFloat3.cross(a, b, r); return r; } diff --git a/Engine/source/math/mPoint4.h b/Engine/source/math/mPoint4.h index 8ae173009..0f715f983 100644 --- a/Engine/source/math/mPoint4.h +++ b/Engine/source/math/mPoint4.h @@ -26,10 +26,12 @@ #ifndef _MMATHFN_H_ #include "math/mMathFn.h" #endif - #ifndef _MPOINT3_H_ #include "math/mPoint3.h" #endif +#ifndef _MATH_BACKEND_H_ +#include "math/public/math_backend.h" +#endif //------------------------------------------------------------------------------ @@ -61,6 +63,8 @@ class Point4I /// Uses F32 internally. /// /// Useful for representing quaternions and other 4d beasties. +using math_backend::float4::dispatch::gFloat4; + class Point4F { //-------------------------------------- Public data @@ -152,15 +156,12 @@ inline void Point4F::set(F32 _x, F32 _y, F32 _z, F32 _w) inline F32 Point4F::len() const { - return mSqrt(x*x + y*y + z*z + w*w); + return gFloat4.length(*this); } inline void Point4F::interpolate(const Point4F& _from, const Point4F& _to, F32 _factor) { - x = (_from.x * (1.0f - _factor)) + (_to.x * _factor); - y = (_from.y * (1.0f - _factor)) + (_to.y * _factor); - z = (_from.z * (1.0f - _factor)) + (_to.z * _factor); - w = (_from.w * (1.0f - _factor)) + (_to.w * _factor); + gFloat4.lerp(_from, _to, _factor, *this); } inline void Point4F::zero() @@ -193,55 +194,55 @@ inline Point4F& Point4F::operator/=(F32 scalar) if (mIsZero(scalar)) return *this; - F32 denom = 1 / scalar; - - x *= denom; - y *= denom; - z *= denom; - w *= denom; + gFloat4.div_scalar(*this, scalar, *this); return *this; } inline Point4F Point4F::operator+(const Point4F& _add) const { - return Point4F( x + _add.x, y + _add.y, z + _add.z, w + _add.w ); + Point4F res; + gFloat4.add(*this, _add, res); + return res; } inline Point4F& Point4F::operator+=(const Point4F& _add) { - x += _add.x; - y += _add.y; - z += _add.z; - w += _add.w; - + gFloat4.add(*this, _add, *this); return *this; } inline Point4F Point4F::operator-(const Point4F& _rSub) const { - return Point4F( x - _rSub.x, y - _rSub.y, z - _rSub.z, w - _rSub.w ); + Point4F res; + gFloat4.sub(*this, _rSub, res); + return res; } inline Point4F Point4F::operator*(const Point4F &_vec) const { - return Point4F(x * _vec.x, y * _vec.y, z * _vec.z, w * _vec.w); + Point4F res; + gFloat4.mul(*this, _vec, res); + return res; } inline Point4F Point4F::operator*(F32 _mul) const { - return Point4F(x * _mul, y * _mul, z * _mul, w * _mul); + Point4F res; + gFloat4.mul_scalar(*this, _mul, res); + return res; } inline Point4F Point4F::operator /(F32 t) const { - F32 f = 1.0f / t; - return Point4F( x * f, y * f, z * f, w * f ); + Point4F res; + gFloat4.div_scalar(*this, t, res); + return res; } inline F32 mDot(const Point4F &p1, const Point4F &p2) { - return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z + p1.w*p2.w); + return gFloat4.dot(p1, p2); } //------------------------------------------------------------------------------ diff --git a/Engine/source/math/public/float3_dispatch.h b/Engine/source/math/public/float3_dispatch.h new file mode 100644 index 000000000..e4279cb84 --- /dev/null +++ b/Engine/source/math/public/float3_dispatch.h @@ -0,0 +1,39 @@ +#pragma once +#ifndef _FLOAT3_DISPATCH_H_ +#define _FLOAT3_DISPATCH_H_ + + +#include + +namespace math_backend::float3::dispatch +{ + struct Float3Funcs + { + void (*add)(const float*, const float*, float*) = nullptr; + void (*sub)(const float*, const float*, float*) = nullptr; + void (*mul)(const float*, const float*, float*) = nullptr; + void (*mul_scalar)(const float*, float, float*) = nullptr; + void (*div)(const float*, const float*, float*) = nullptr; + void (*div_scalar)(const float*, float, float*) = nullptr; + float (*dot)(const float*, const float*) = nullptr; + float (*length)(const float*) = nullptr; + float (*lengthSquared)(const float*) = nullptr; + void (*normalize)(float*) = nullptr; + void (*normalize_mag)(float*, float) = nullptr; + void (*lerp)(const float*, const float*, float, float*) = nullptr; + void (*cross)(const float*, const float*, float*) = nullptr; + }; + + // Global dispatch table + extern Float3Funcs gFloat3; + + // Backend installers (defined in ISA libraries) + void install_scalar(); + void install_sse2(); + void install_sse41(); + void install_avx(); + void install_avx2(); + void install_neon(); +} + +#endif // !_FLOAT4_DISPATCH_H_ diff --git a/Engine/source/math/public/float4_dispatch.cpp b/Engine/source/math/public/float4_dispatch.cpp deleted file mode 100644 index 810eb0e46..000000000 --- a/Engine/source/math/public/float4_dispatch.cpp +++ /dev/null @@ -1,7 +0,0 @@ -#include "math/public/float4_dispatch.h" - -namespace math_backend::float4::dispatch -{ - // Single definition of the global dispatch table - Float4Funcs gFloat4{}; -} diff --git a/Engine/source/math/public/float4_dispatch.h b/Engine/source/math/public/float4_dispatch.h index 319b1893f..6f26114ce 100644 --- a/Engine/source/math/public/float4_dispatch.h +++ b/Engine/source/math/public/float4_dispatch.h @@ -19,7 +19,9 @@ namespace math_backend::float4::dispatch float (*length)(const float*) = nullptr; float (*lengthSquared)(const float*) = nullptr; void (*normalize)(float*) = nullptr; + void (*normalize_mag)(float*, float) = nullptr; void (*lerp)(const float*, const float*, float, float*) = nullptr; + void (*cross)(const float*, const float*, float*) = nullptr; }; // Global dispatch table @@ -32,9 +34,6 @@ namespace math_backend::float4::dispatch void install_avx(); void install_avx2(); void install_neon(); - - // Centralized installer (engine calls this once) - void install_preferred(); } #endif // !_FLOAT4_DISPATCH_H_ diff --git a/Engine/source/math/public/mat44_dispatch.h b/Engine/source/math/public/mat44_dispatch.h new file mode 100644 index 000000000..61b488861 --- /dev/null +++ b/Engine/source/math/public/mat44_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +#ifndef _MAT44_DISPATCH_H_ +#define _MAT44_DISPATCH_H_ + + +namespace math_backend::mat44::dispatch +{ + struct Mat44Funcs + { + void (*transpose)(float*) = nullptr; + void (*scale)(float*, const float*) = nullptr; + }; + + // Global dispatch table + extern Mat44Funcs gMat44; + + // Backend installers (defined in ISA libraries) + void install_scalar(); + void install_sse2(); + void install_sse41(); + void install_avx(); + void install_avx2(); + void install_neon(); +} + +#endif // !_MAT44_DISPATCH_H_ diff --git a/Engine/source/math/public/math_backend.cpp b/Engine/source/math/public/math_backend.cpp index 7998924ee..9b5e5daed 100644 --- a/Engine/source/math/public/math_backend.cpp +++ b/Engine/source/math/public/math_backend.cpp @@ -1,6 +1,24 @@ #pragma once #include "math/public/math_backend.h" +namespace math_backend::float4::dispatch +{ + // Single definition of the global dispatch table + Float4Funcs gFloat4{}; +} + +namespace math_backend::float3::dispatch +{ + // Single definition of the global dispatch table + Float3Funcs gFloat3{}; +} + +namespace math_backend::mat44::dispatch +{ + Mat44Funcs gMat44{}; +} + + math_backend::backend math_backend::choose_backend(U32 cpu_flags) { #if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) @@ -12,7 +30,7 @@ math_backend::backend math_backend::choose_backend(U32 cpu_flags) #elif defined(__aarch64__) || defined(__ARM_NEON) - if (cpu_flags & CPU_NEON) return backend::neon; + if (cpu_flags & CPU_PROP_NEON) return backend::neon; #endif return backend::scalar; @@ -25,28 +43,36 @@ void math_backend::install_from_cpu_flags(uint32_t cpu_flags) switch (g_backend) { +#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) case backend::avx2: float4::dispatch::install_avx2(); + float3::dispatch::install_avx2(); break; case backend::avx: - //float4::dispatch::install_avx(); + float4::dispatch::install_avx(); + float3::dispatch::install_avx(); break; case backend::sse41: float4::dispatch::install_sse41(); + float3::dispatch::install_sse41(); break; case backend::sse2: float4::dispatch::install_sse2(); + float3::dispatch::install_sse2(); break; - +#elif defined(__aarch64__) || defined(__ARM_NEON) case backend::neon: float4::dispatch::install_neon(); + float3::dispatch::install_neon(); break; - +#endif default: float4::dispatch::install_scalar(); + float3::dispatch::install_scalar(); + mat44::dispatch::install_scalar(); break; } } diff --git a/Engine/source/math/public/math_backend.h b/Engine/source/math/public/math_backend.h index 40476e7f0..0a3127f81 100644 --- a/Engine/source/math/public/math_backend.h +++ b/Engine/source/math/public/math_backend.h @@ -1,4 +1,7 @@ #pragma once +#ifndef _MATH_BACKEND_H_ +#define _MATH_BACKEND_H_ + #ifndef _MCONSTANTS_H_ #include "math/mConstants.h" #endif @@ -8,6 +11,12 @@ #ifndef _FLOAT4_DISPATCH_H_ #include "math/public/float4_dispatch.h" #endif +#ifndef _FLOAT3_DISPATCH_H_ +#include "math/public/float3_dispatch.h" +#endif +#ifndef _MAT44_DISPATCH_H_ +#include "math/public/mat44_dispatch.h" +#endif namespace math_backend { @@ -25,3 +34,5 @@ namespace math_backend backend choose_backend(U32 cpu_flags); void install_from_cpu_flags(uint32_t cpu_flags); } + +#endif // !_MATH_BACKEND_H_ diff --git a/Engine/source/platform/platform.h b/Engine/source/platform/platform.h index 1326c4692..0d377f23f 100644 --- a/Engine/source/platform/platform.h +++ b/Engine/source/platform/platform.h @@ -76,7 +76,7 @@ enum ProcessorProperties CPU_PROP_SSE4_2 = (1<<8), ///< Supports SSE4_2 instruction set extension. CPU_PROP_AVX = (1<<9), ///< Supports AVX256 instruction set extension. CPU_PROP_AVX2 = (1<<10), ///< Supports AVX256 instruction set extension. - CPU_PROP_AVX512 = (1<<11), ///< Supports AVX256 instruction set extension. + CPU_PROP_AVX512 = (1<<11), ///< Supports AVX512 instruction set extension. CPU_PROP_MP = (1<<12), ///< This is a multi-processor system. CPU_PROP_LE = (1<<13), ///< This processor is LITTLE ENDIAN. CPU_PROP_64bit = (1<<14), ///< This processor is 64-bit capable diff --git a/Engine/source/platformMac/macMath.mm b/Engine/source/platformMac/macMath.mm index 4feefb277..c275bdd17 100644 --- a/Engine/source/platformMac/macMath.mm +++ b/Engine/source/platformMac/macMath.mm @@ -25,6 +25,7 @@ #import "math/mMath.h" #import "core/strings/stringFunctions.h" #include "console/engineAPI.h" +#include "math/public/math_backend.h" extern void mInstallLibrary_C(); @@ -107,7 +108,9 @@ void Math::init(U32 properties) Con::printf("Math Init:"); Con::printf(" Installing Standard C extensions"); - mInstallLibrary_C(); + mInstallLibrary_C(); + + math_backend::install_from_cpu_flags(properties); #ifdef TORQUE_CPU_X86 if( properties & CPU_PROP_SSE ) diff --git a/Engine/source/platformPOSIX/POSIXMath.cpp b/Engine/source/platformPOSIX/POSIXMath.cpp index 8f21329a3..751df294e 100644 --- a/Engine/source/platformPOSIX/POSIXMath.cpp +++ b/Engine/source/platformPOSIX/POSIXMath.cpp @@ -27,6 +27,7 @@ #include "math/mMath.h" #include "core/strings/stringFunctions.h" #include "console/engineAPI.h" +#include "math/public/math_backend.h" extern void mInstallLibrary_C(); extern void mInstallLibrary_ASM(); @@ -90,6 +91,8 @@ void Math::init(U32 properties) Con::printf(" Installing Standard C extensions"); mInstallLibrary_C(); + math_backend::install_from_cpu_flags(properties); + #if defined(TORQUE_CPU_X32) || defined(TORQUE_CPU_X64) Con::printf(" Installing Assembly extensions"); mInstallLibrary_ASM(); diff --git a/Engine/source/platformWin32/winMath.cpp b/Engine/source/platformWin32/winMath.cpp index 44b215301..36a9d8c58 100644 --- a/Engine/source/platformWin32/winMath.cpp +++ b/Engine/source/platformWin32/winMath.cpp @@ -25,7 +25,7 @@ #include "console/engineAPI.h" #include "math/mMath.h" - +#include "math/public/math_backend.h" extern void mInstallLibrary_C(); extern void mInstallLibrary_ASM(); @@ -98,6 +98,8 @@ void Math::init(U32 properties) Con::printf(" Installing Standard C extensions"); mInstallLibrary_C(); + math_backend::install_from_cpu_flags(properties); + Con::printf(" Installing Assembly extensions"); mInstallLibrary_ASM(); diff --git a/Engine/source/util/fpsTracker.cpp b/Engine/source/util/fpsTracker.cpp index 021d4a4f5..830000bc8 100644 --- a/Engine/source/util/fpsTracker.cpp +++ b/Engine/source/util/fpsTracker.cpp @@ -36,6 +36,8 @@ void FPSTracker::reset() { fpsNext = (F32)Platform::getRealMilliseconds()/1000.0f + mUpdateInterval; + fpsAccumTime = 0.0f; + fpsAccumVirtualTime = 0.0f; fpsRealLast = 0.0f; fpsReal = 0.0f; fpsRealMin = 0.000001f; // Avoid division by zero. @@ -51,42 +53,60 @@ void FPSTracker::update() F32 realSeconds = (F32)Platform::getRealMilliseconds()/1000.0f; F32 virtualSeconds = (F32)Platform::getVirtualMilliseconds()/1000.0f; - fpsFrames++; - if (fpsFrames > 1) + if (fpsRealLast == 0.0f) { - fpsReal = fpsReal*(1.0-alpha) + (realSeconds-fpsRealLast)*alpha; - fpsVirtual = fpsVirtual*(1.0-alpha) + (virtualSeconds-fpsVirtualLast)*alpha; - - if( fpsFrames > 10 ) // Wait a few frames before updating these. - { - // Update min/max. This is a bit counter-intuitive, as the comparisons are - // inversed because these are all one-over-x values. - - if( fpsReal > fpsRealMin ) - fpsRealMin = fpsReal; - if( fpsReal < fpsRealMax ) - fpsRealMax = fpsReal; - } + fpsRealLast = realSeconds; + fpsVirtualLast = virtualSeconds; + return; } - fpsRealLast = realSeconds; + F32 frameTimeReal = realSeconds - fpsRealLast; + F32 frameTimeVirtual = virtualSeconds - fpsVirtualLast; + + fpsRealLast = realSeconds; fpsVirtualLast = virtualSeconds; - // update variables every few frames - F32 update = fpsRealLast - fpsNext; - if (update > 0.5f) - { - F32 delta = realSeconds - fpsNext; - Con::setVariable( "fps::frameDelta",avar("%g", delta)); - Con::setVariable( "fps::real", avar( "%4.1f", 1.0f / fpsReal ) ); - Con::setVariable( "fps::realMin", avar( "%4.1f", 1.0f / fpsRealMin ) ); - Con::setVariable( "fps::realMax", avar( "%4.1f", 1.0f / fpsRealMax ) ); - Con::setVariable( "fps::virtual", avar( "%4.1f", 1.0f / fpsVirtual ) ); + // Accumulate for windowed FPS calculation + fpsFrames++; + fpsAccumTime += frameTimeReal; + fpsAccumVirtualTime += frameTimeVirtual; - if (update > mUpdateInterval) - fpsNext = fpsRealLast + mUpdateInterval; - else - fpsNext += mUpdateInterval; + // Only update console values at interval + if (realSeconds >= fpsNext) + { + fpsReal = 0.0f; + fpsVirtual = 0.0f; + + if (fpsAccumTime > 0.0f) + fpsReal = fpsFrames / fpsAccumTime; + + if (fpsAccumVirtualTime > 0.0f) + fpsVirtual = fpsFrames / fpsAccumVirtualTime; + + // Update min/max correctly (these are FPS now) + if (fpsReal > 0.0f) + { + if (fpsReal < fpsRealMin) + fpsRealMin = fpsReal; + + if (fpsReal > fpsRealMax) + fpsRealMax = fpsReal; + } + + // frameDelta = actual accumulated real time over window + Con::setVariable("fps::frameTimeMs", avar("%4.3f", (fpsAccumTime / fpsFrames) * 1000.0f)); + Con::setVariable("fps::frameDelta", avar("%g", fpsAccumTime)); + Con::setVariable("fps::real", avar("%4.1f", fpsReal)); + Con::setVariable("fps::realMin", avar("%4.1f", fpsRealMin)); + Con::setVariable("fps::realMax", avar("%4.1f", fpsRealMax)); + Con::setVariable("fps::virtual", avar("%4.1f", fpsVirtual)); + + // Reset window + fpsFrames = 0; + fpsAccumTime = 0.0f; + fpsAccumVirtualTime = 0.0f; + + fpsNext = realSeconds + mUpdateInterval; } } diff --git a/Engine/source/util/fpsTracker.h b/Engine/source/util/fpsTracker.h index e5973ae68..a29fbbecf 100644 --- a/Engine/source/util/fpsTracker.h +++ b/Engine/source/util/fpsTracker.h @@ -27,6 +27,8 @@ struct FPSTracker { + F32 fpsAccumTime; + F32 fpsAccumVirtualTime; F32 fpsRealLast; F32 fpsReal; F32 fpsRealMin; @@ -48,4 +50,4 @@ struct FPSTracker extern FPSTracker gFPS; -#endif \ No newline at end of file +#endif diff --git a/Tools/CMake/torque_macros.cmake b/Tools/CMake/torque_macros.cmake index 5a7928b38..f799e8c39 100644 --- a/Tools/CMake/torque_macros.cmake +++ b/Tools/CMake/torque_macros.cmake @@ -136,7 +136,7 @@ macro(addFramework framework) endmacro() function(add_math_backend name compile_defs) - file(GLOB_RECURSE SRC CONFIGURE_DEPENDS "math/isa/${name}/*.cpp") + file(GLOB_RECURSE SRC CONFIGURE_DEPENDS "math/isa/${name}/*.cpp" "math/isa/${name}/*.h") if(NOT SRC) return() @@ -144,6 +144,7 @@ function(add_math_backend name compile_defs) add_library(math_${name} OBJECT ${SRC}) + message(STATUS "adding math library for isa ${name}") target_include_directories(math_${name} PUBLIC "math/public" "math/impl"