From 6406ca1832a4a93095b52913c59bd7f80ec43322 Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Wed, 25 Feb 2026 15:06:04 +0000 Subject: [PATCH] Test simd math functions for float4 values beginning implementation of float4 simd functions for x64 and neon --- Engine/source/CMakeLists.txt | 2 +- Engine/source/math/mMathFn.h | 33 ++++ Engine/source/math/mMath_CPU.cpp | 237 ++++++++++++++++++++++++ Engine/source/math/mPoint4.h | 42 +++-- Engine/source/math/simd/mMath_AVX.cpp | 112 ----------- Engine/source/math/simd/mMath_SSE2.cpp | 142 -------------- Engine/source/math/simd/mMath_SSE41.cpp | 107 ----------- Engine/source/platformWin32/winMath.cpp | 19 +- 8 files changed, 305 insertions(+), 389 deletions(-) create mode 100644 Engine/source/math/mMath_CPU.cpp delete mode 100644 Engine/source/math/simd/mMath_AVX.cpp delete mode 100644 Engine/source/math/simd/mMath_SSE2.cpp delete mode 100644 Engine/source/math/simd/mMath_SSE41.cpp diff --git a/Engine/source/CMakeLists.txt b/Engine/source/CMakeLists.txt index a999252b6..029ef89ec 100644 --- a/Engine/source/CMakeLists.txt +++ b/Engine/source/CMakeLists.txt @@ -130,7 +130,7 @@ torqueAddSourceDirectories("windowManager" "windowManager/torque" "windowManager torqueAddSourceDirectories("scene" "scene/culling" "scene/zones" "scene/mixin") # Handle math -torqueAddSourceDirectories("math" "math/util" "math/simd") +torqueAddSourceDirectories("math" "math/util") # Handle persistence set(TORQUE_INCLUDE_DIRECTORIES ${TORQUE_INCLUDE_DIRECTORIES} "persistence/rapidjson") diff --git a/Engine/source/math/mMathFn.h b/Engine/source/math/mMathFn.h index 9f84b37df..946ba0e67 100644 --- a/Engine/source/math/mMathFn.h +++ b/Engine/source/math/mMathFn.h @@ -37,6 +37,39 @@ extern void MathConsoleInit(); +//------------------------------------------------------------------------------ +// Runtime ISA detection enum for x86 +enum class ISA { + NONE = 0, + SSE2, + SSE41, + AVX, + AVX2 +}; + +// Global variable storing detected ISA +extern ISA gISA; +//------------------------------------------------------------------------------ +// Forward declare math_backend +namespace math_backend { + namespace float4 { + void add(const float* a, const float* b, float* r); + void sub(const float* a, const float* b, float* r); + void mul(const float* a, const float* b, float* r); + void mul_scalar(const float* a, float s, float* r); + void div(const float* a, const float* b, float* r); + void div_scalar(const float* a, float s, float* r); + float dot(const float* a, const float* b); + void lerp(const float* from, const float* to, float f, float* r); + + void addArray(const class Point4F* a, const class Point4F* b, class Point4F* r, size_t n); + void subArray(const class Point4F* a, const class Point4F* b, class Point4F* r, size_t n); + void mulArray(const class Point4F* a, const class Point4F* b, class Point4F* r, size_t n); + void mulArrayScalar(const class Point4F* a, float s, class Point4F* r, size_t n); + void lerpArray(const class Point4F* from, const class Point4F* to, float f, class Point4F* r, size_t n); + } +} // namespace math_backend + //-------------------------------------- // Installable Library Prototypes extern S32 (*m_mulDivS32)(S32 a, S32 b, S32 c); diff --git a/Engine/source/math/mMath_CPU.cpp b/Engine/source/math/mMath_CPU.cpp new file mode 100644 index 000000000..c59ab30ce --- /dev/null +++ b/Engine/source/math/mMath_CPU.cpp @@ -0,0 +1,237 @@ +#include "platform/platform.h" +#include "math/mMath.h" +#include "math/util/frustum.h" +#include // Caution!!! Possible platform specific include +#include "math/mMathFn.h" + + +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) +#include +#define TORQUE_MATH_x64 1 +#elif defined(__aarch64__) || defined(_M_ARM64) +#include +#define TORQUE_MATH_ARM 1 +#else +#define TORQUE_MATH_C 1 +#endif + +#include // for std::min + +//------------------------------------------------------------------------------ +// Global ISA variable (x86) +ISA gISA = ISA::NONE; + +#ifdef TORQUE_MATH_x64 +void mInstallLibrary_CPU() +{ + gISA = ISA::SSE2; // Bare minimum supported. + U32 properties = Platform::SystemInfo.processor.properties; + if (properties & CPU_PROP_AVX2) gISA = ISA::AVX2; return; + if (properties & CPU_PROP_AVX) gISA = ISA::AVX; return; + if (properties & CPU_PROP_SSE4_1) gISA = ISA::SSE41; return; + if (properties & CPU_PROP_SSE2) gISA = ISA::SSE2; return; +} +#else +void mInstallLibrary_CPU() { gISA = ISA::NONE; } +#endif + +//------------------------------------------------------------------------------ +// SINGLE POINT OPERATIONS +//------------------------------------------------------------------------------ +void math_backend::float4::add(const float* a, const float* b, float* r) +{ +#if TORQUE_MATH_x64 + switch (gISA) { + case ISA::AVX2: + case ISA::AVX: + case ISA::SSE41: + case ISA::SSE2: { + __m128 va = _mm_loadu_ps(a); + __m128 vb = _mm_loadu_ps(b); + _mm_storeu_ps(r, _mm_add_ps(va, vb)); + return; + } + default: break; + } +#elif TORQUE_MATH_ARM + vst1q_f32(r, vaddq_f32(vld1q_f32(a), vld1q_f32(b))); +#endif + + r[0] = a[0] + b[0]; + r[1] = a[1] + b[1]; + r[2] = a[2] + b[2]; + r[3] = a[3] + b[3]; + +} + +void math_backend::float4::sub(const float* a, const float* b, float* r) +{ +#if TORQUE_MATH_x64 + switch (gISA) { + case ISA::AVX2: + case ISA::AVX: + case ISA::SSE41: + case ISA::SSE2: { + __m128 va = _mm_loadu_ps(a); + __m128 vb = _mm_loadu_ps(b); + _mm_storeu_ps(r, _mm_sub_ps(va, vb)); + return; + } + default: break; + } +#elif TORQUE_MATH_ARM + vst1q_f32(r, vsubq_f32(vld1q_f32(a), vld1q_f32(b))); +#endif + + r[0] = a[0] - b[0]; + r[1] = a[1] - b[1]; + r[2] = a[2] - b[2]; + r[3] = a[3] - b[3]; +} + +void math_backend::float4::mul(const float* a, const float* b, float* r) +{ +#if TORQUE_MATH_x64 + switch (gISA) { + case ISA::AVX2: + case ISA::AVX: + case ISA::SSE41: + case ISA::SSE2: { + __m128 va = _mm_loadu_ps(a); + __m128 vb = _mm_loadu_ps(b); + _mm_storeu_ps(r, _mm_mul_ps(va, vb)); + return; + } + default: break; + } +#elif TORQUE_MATH_ARM + vst1q_f32(r, vmulq_f32(vld1q_f32(a), vld1q_f32(b))); +#endif + + r[0] = a[0] * b[0]; + r[1] = a[1] * b[1]; + r[2] = a[2] * b[2]; + r[3] = a[3] * b[3]; +} + +void math_backend::float4::mul_scalar(const float* a, float s, float* r) +{ +#if TORQUE_MATH_x64 + switch (gISA) { + case ISA::AVX2: + case ISA::AVX: + case ISA::SSE41: + case ISA::SSE2: { + __m128 va = _mm_loadu_ps(a); + __m128 vs = _mm_set1_ps(s); // broadcast scalar -> [s s s s] + _mm_storeu_ps(r, _mm_mul_ps(va, vs)); + return; + } + default: break; + } + +#elif TORQUE_MATH_ARM + float32x4_t va = vld1q_f32(a); + float32x4_t vs = vdupq_n_f32(s); + vst1q_f32(r, vmulq_f32(va, vs)); + return; +#endif + + // torque_c fallback + r[0] = a[0] * s; + r[1] = a[1] * s; + r[2] = a[2] * s; + r[3] = a[3] * s; +} + +void math_backend::float4::div(const float* a, const float* b, float* r) +{ +} + +void math_backend::float4::div_scalar(const float* a, float s, float* r) +{ +#if TORQUE_MATH_x64 + switch (gISA) { + case ISA::AVX2: + case ISA::AVX: + case ISA::SSE41: + case ISA::SSE2: { + __m128 va = _mm_loadu_ps(a); + + // compute reciprocal once + __m128 vs = _mm_set1_ps(s); + __m128 recip = _mm_rcp_ps(vs); + + // Newton–Raphson refinement (important for accuracy) + recip = _mm_mul_ps(recip, _mm_sub_ps(_mm_set1_ps(2.0f),_mm_mul_ps(vs, recip))); + _mm_storeu_ps(r, _mm_mul_ps(va, recip)); + return; + } + default: break; + } +#elif TORQUE_MATH_ARM + float32x4_t va = vld1q_f32(a); + float32x4_t vs = vdupq_n_f32(s); + + float32x4_t recip = vrecpeq_f32(vs); + recip = vmulq_f32(vrecpsq_f32(vs, recip), recip); + + vst1q_f32(r, vmulq_f32(va, recip)); + return; +#endif + + // scalar fallback + float inv = 1.0f / s; + r[0] = a[0] * inv; + r[1] = a[1] * inv; + r[2] = a[2] * inv; + r[3] = a[3] * inv; +} + +float math_backend::float4::dot(const float* a, const float* b) +{ +#if TORQUE_MATH_x64 + switch (gISA) { + + // ---- SSE4.1 : dedicated dot instruction ---- + case ISA::AVX2: + case ISA::AVX: + case ISA::SSE41: { + __m128 va = _mm_loadu_ps(a); + __m128 vb = _mm_loadu_ps(b); + return _mm_cvtss_f32(_mm_dp_ps(va, vb, 0xFF)); + } + + // ---- SSE2 fallback (no horizontal ops available) ---- + case ISA::SSE2: { + __m128 va = _mm_loadu_ps(a); + __m128 vb = _mm_loadu_ps(b); + + __m128 mul = _mm_mul_ps(va, vb); + + __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1)); + __m128 sums = _mm_add_ps(mul, shuf); + + shuf = _mm_movehl_ps(shuf, sums); + sums = _mm_add_ss(sums, shuf); + + return _mm_cvtss_f32(sums); + } + + default: break; + } + +#elif TORQUE_MATH_ARM + // ---- NEON ---- + float32x4_t va = vld1q_f32(a); + float32x4_t vb = vld1q_f32(b); + + float32x4_t vmul = vmulq_f32(va, vb); + float32x2_t sum2 = vpadd_f32(vget_low_f32(vmul), vget_high_f32(vmul)); + float32x2_t total = vpadd_f32(sum2, sum2); + + return vget_lane_f32(total, 0); +#endif + + return a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]; +} diff --git a/Engine/source/math/mPoint4.h b/Engine/source/math/mPoint4.h index 8ae173009..70fd2466b 100644 --- a/Engine/source/math/mPoint4.h +++ b/Engine/source/math/mPoint4.h @@ -99,7 +99,8 @@ class Point4F Point4F operator*(F32) const; Point4F operator+(const Point4F&) const; Point4F& operator+=(const Point4F&); - Point4F operator-(const Point4F&) const; + Point4F operator-(const Point4F&) const; + Point4F operator-=(const Point4F&); Point4F operator*(const Point4F&) const; Point4F& operator*=(const Point4F&); Point4F& operator=(const Point3F&); @@ -205,32 +206,50 @@ inline Point4F& Point4F::operator/=(F32 scalar) inline Point4F Point4F::operator+(const Point4F& _add) const { - return Point4F( x + _add.x, y + _add.y, z + _add.z, w + _add.w ); + Point4F r; + math_backend::float4::add(*this, _add, r); + return r; + + //return Point4F( x + _add.x, y + _add.y, z + _add.z, w + _add.w ); } inline Point4F& Point4F::operator+=(const Point4F& _add) { - x += _add.x; - y += _add.y; - z += _add.z; - w += _add.w; - + math_backend::float4::add(*this, _add, *this); return *this; } inline Point4F Point4F::operator-(const Point4F& _rSub) const { - return Point4F( x - _rSub.x, y - _rSub.y, z - _rSub.z, w - _rSub.w ); + Point4F r; + math_backend::float4::sub(*this, _rSub, r); + return r; +} + +inline Point4F Point4F::operator-=(const Point4F& _rSub) +{ + math_backend::float4::sub(*this, _rSub, *this); + return *this; } inline Point4F Point4F::operator*(const Point4F &_vec) const { - return Point4F(x * _vec.x, y * _vec.y, z * _vec.z, w * _vec.w); + Point4F r; + math_backend::float4::mul(*this, _vec, r); + return r; +} + +inline Point4F& Point4F::operator*=(const Point4F& _vec) +{ + math_backend::float4::mul(*this, _vec, *this); + return *this; } inline Point4F Point4F::operator*(F32 _mul) const { - return Point4F(x * _mul, y * _mul, z * _mul, w * _mul); + Point4F r; + math_backend::float4::mul_scalar(*this, _mul, r); + return r; } inline Point4F Point4F::operator /(F32 t) const @@ -241,7 +260,8 @@ inline Point4F Point4F::operator /(F32 t) const inline F32 mDot(const Point4F &p1, const Point4F &p2) { - return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z + p1.w*p2.w); + return math_backend::float4::dot(p1, p2); + //return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z + p1.w*p2.w); } //------------------------------------------------------------------------------ diff --git a/Engine/source/math/simd/mMath_AVX.cpp b/Engine/source/math/simd/mMath_AVX.cpp deleted file mode 100644 index c619c6951..000000000 --- a/Engine/source/math/simd/mMath_AVX.cpp +++ /dev/null @@ -1,112 +0,0 @@ -#include "platform/platform.h" -#include "math/mMath.h" -#include "math/util/frustum.h" -#include // Caution!!! Possible platform specific include -#include "math/mMathFn.h" - -//################################################################ -// AVX 2 Functions -//################################################################ -#include - -void m_point3F_bulk_dot_avx2( const F32* refVector, - const F32* dotPoints, - const U32 numPoints, - const U32 pointStride, - F32* output) -{ - __m256 refX = _mm256_set1_ps(refVector[0]); - __m256 refY = _mm256_set1_ps(refVector[1]); - __m256 refZ = _mm256_set1_ps(refVector[2]); - - U32 i = 0; - - // Process 8 points at a time (AVX2 = 8 floats per 256-bit register) - for (; i + 7 < numPoints; i += 8) - { - // Load x, y, z components with stride - __m256 x = _mm256_set_ps( - dotPoints[(i + 7) * pointStride + 0], - dotPoints[(i + 6) * pointStride + 0], - dotPoints[(i + 5) * pointStride + 0], - dotPoints[(i + 4) * pointStride + 0], - dotPoints[(i + 3) * pointStride + 0], - dotPoints[(i + 2) * pointStride + 0], - dotPoints[(i + 1) * pointStride + 0], - dotPoints[(i + 0) * pointStride + 0] - ); - - __m256 y = _mm256_set_ps( - dotPoints[(i + 7) * pointStride + 1], - dotPoints[(i + 6) * pointStride + 1], - dotPoints[(i + 5) * pointStride + 1], - dotPoints[(i + 4) * pointStride + 1], - dotPoints[(i + 3) * pointStride + 1], - dotPoints[(i + 2) * pointStride + 1], - dotPoints[(i + 1) * pointStride + 1], - dotPoints[(i + 0) * pointStride + 1] - ); - - __m256 z = _mm256_set_ps( - dotPoints[(i + 7) * pointStride + 2], - dotPoints[(i + 6) * pointStride + 2], - dotPoints[(i + 5) * pointStride + 2], - dotPoints[(i + 4) * pointStride + 2], - dotPoints[(i + 3) * pointStride + 2], - dotPoints[(i + 2) * pointStride + 2], - dotPoints[(i + 1) * pointStride + 2], - dotPoints[(i + 0) * pointStride + 2] - ); - - // Multiply and accumulate: x*rx + y*ry + z*rz - __m256 dot = _mm256_mul_ps(x, refX); - dot = _mm256_fmadd_ps(y, refY, dot); // dot += y*refY - dot = _mm256_fmadd_ps(z, refZ, dot); // dot += z*refZ - - // Store the results - _mm256_storeu_ps(&output[i], dot); - } - - // Handle remaining points - for (; i < numPoints; i++) - { - const F32* pPoint = &dotPoints[i * pointStride]; - output[i] = refVector[0] * pPoint[0] + - refVector[1] * pPoint[1] + - refVector[2] * pPoint[2]; - } -} - -void default_matF_x_matF_AVX2(const F32* A, const F32* B, F32* C) -{ - for (int i = 0; i < 4; i++) - { - // Broadcast elements of A row - __m128 a0 = _mm_set1_ps(A[i * 4 + 0]); - __m128 a1 = _mm_set1_ps(A[i * 4 + 1]); - __m128 a2 = _mm_set1_ps(A[i * 4 + 2]); - __m128 a3 = _mm_set1_ps(A[i * 4 + 3]); - - // Load columns of B (rows in memory since row-major) - __m128 b0 = _mm_loadu_ps(&B[0 * 4]); // B row 0 - __m128 b1 = _mm_loadu_ps(&B[1 * 4]); // B row 1 - __m128 b2 = _mm_loadu_ps(&B[2 * 4]); // B row 2 - __m128 b3 = _mm_loadu_ps(&B[3 * 4]); // B row 3 - - // Multiply and sum - __m128 res = _mm_mul_ps(a0, b0); - res = _mm_add_ps(res, _mm_mul_ps(a1, b1)); - res = _mm_add_ps(res, _mm_mul_ps(a2, b2)); - res = _mm_add_ps(res, _mm_mul_ps(a3, b3)); - - // Store result row - _mm_storeu_ps(&C[i * 4], res); - } -} - -void mInstallLibrary_AVX2() -{ - m_point3F_bulk_dot = m_point3F_bulk_dot_avx2; - m_matF_x_matF = default_matF_x_matF_AVX2; - m_matF_x_matF_aligned = default_matF_x_matF_AVX2; -} diff --git a/Engine/source/math/simd/mMath_SSE2.cpp b/Engine/source/math/simd/mMath_SSE2.cpp deleted file mode 100644 index d0a9190c9..000000000 --- a/Engine/source/math/simd/mMath_SSE2.cpp +++ /dev/null @@ -1,142 +0,0 @@ -#include "platform/platform.h" -#include "math/mMath.h" -#include "math/util/frustum.h" -#include // Caution!!! Possible platform specific include -#include "math/mMathFn.h" - -//################################################################ -// SSE2 Functions - minimum baseline -//################################################################ -#include - -static void m_point3F_normalize_sse2(float* p) -{ - const float val = 1.0f; - - // Load vector x, y, z into SSE register (w lane unused) - __m128 vec = _mm_set_ps(0.0f, p[2], p[1], p[0]); - - // Compute sum of squares: x*x + y*y + z*z - __m128 sq = _mm_mul_ps(vec, vec); - __m128 sum = _mm_add_ps(sq, _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(2, 1, 0, 3))); - sum = _mm_add_ss(sum, _mm_movehl_ps(sum, sum)); - - // Extract scalar squared length - float squared; - _mm_store_ss(&squared, sum); - - if (squared != 0.0f) - { - // Exact normalization: 1/sqrt(squared) - float factor = 1.0f / std::sqrt(squared); - __m128 factorVec = _mm_set1_ps(factor); - - vec = _mm_mul_ps(vec, factorVec); - } - else - { - // Zero-length fallback - vec = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); - } - - // Store result back - p[0] = _mm_cvtss_f32(vec); - p[1] = _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1, 1, 1, 1))); - p[2] = _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(2, 2, 2, 2))); -} - -static void m_point3F_normalize_f_sse2(float* p, float val) -{ - __m128 vec = _mm_set_ps(0.0f, p[2], p[1], p[0]); - - __m128 sq = _mm_mul_ps(vec, vec); - __m128 sum = _mm_add_ps(sq, _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(2, 1, 0, 3))); - sum = _mm_add_ss(sum, _mm_movehl_ps(sum, sum)); - - float squared; - _mm_store_ss(&squared, sum); - - if (squared != 0.0f) - { - float factor = val / std::sqrt(squared); // exact - __m128 factorVec = _mm_set1_ps(factor); - vec = _mm_mul_ps(vec, factorVec); - } - else - { - // Zero-length fallback: use unit vector along z - vec = _mm_set_ps(0.0f, val, 0.0f, 0.0f); - } - - p[0] = _mm_cvtss_f32(vec); - p[1] = _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1, 1, 1, 1))); - p[2] = _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(2, 2, 2, 2))); -} - -static void matF_x_point4F_sse2(const float* m, const float* p, float* out) -{ - __m128 point = _mm_loadu_ps(p); - - __m128 r0 = _mm_loadu_ps(m + 0); - __m128 r1 = _mm_loadu_ps(m + 4); - __m128 r2 = _mm_loadu_ps(m + 8); - __m128 r3 = _mm_loadu_ps(m + 12); - - // Multiply rows by vector - __m128 m0 = _mm_mul_ps(r0, point); - __m128 m1 = _mm_mul_ps(r1, point); - __m128 m2 = _mm_mul_ps(r2, point); - __m128 m3 = _mm_mul_ps(r3, point); - - // Horizontal add - auto dot4 = [](__m128 v) -> float - { - __m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1)); - __m128 sums = _mm_add_ps(v, shuf); - shuf = _mm_movehl_ps(shuf, sums); - sums = _mm_add_ss(sums, shuf); - return _mm_cvtss_f32(sums); - }; - - out[0] = dot4(m0); - out[1] = dot4(m1); - out[2] = dot4(m2); - out[3] = dot4(m3); -} - -static void m_matF_x_matF_sse2(const float* A, const float* B, float* R) -{ - __m128 b0 = _mm_loadu_ps(B + 0); - __m128 b1 = _mm_loadu_ps(B + 4); - __m128 b2 = _mm_loadu_ps(B + 8); - __m128 b3 = _mm_loadu_ps(B + 12); - - for (int i = 0; i < 4; i++) - { - __m128 a = _mm_loadu_ps(A + i * 4); - - __m128 xxxx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); - __m128 yyyy = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); - __m128 zzzz = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); - __m128 wwww = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)); - - __m128 row = - _mm_add_ps( - _mm_add_ps(_mm_mul_ps(xxxx, b0), - _mm_mul_ps(yyyy, b1)), - _mm_add_ps(_mm_mul_ps(zzzz, b2), - _mm_mul_ps(wwww, b3)) - ); - - _mm_storeu_ps(R + i * 4, row); - } -} - -void mInstallLibrary_SSE2() -{ - m_point3F_normalize = m_point3F_normalize_sse2; - m_point3F_normalize_f = m_point3F_normalize_f_sse2; - m_matF_x_point4F = matF_x_point4F_sse2; - m_matF_x_matF = m_matF_x_matF_sse2; - m_matF_x_matF_aligned = m_matF_x_matF_sse2; -} diff --git a/Engine/source/math/simd/mMath_SSE41.cpp b/Engine/source/math/simd/mMath_SSE41.cpp deleted file mode 100644 index 8aafe8da8..000000000 --- a/Engine/source/math/simd/mMath_SSE41.cpp +++ /dev/null @@ -1,107 +0,0 @@ -#include "platform/platform.h" -#include "math/mMath.h" -#include "math/util/frustum.h" -#include // Caution!!! Possible platform specific include -#include "math/mMathFn.h" - - -//################################################################ -// SSE4.1 Functions -//################################################################ -#include // SSE4.1 - -static void m_point3F_normalize_sse41(float* p) -{ - // [x y z 0] - __m128 v = _mm_set_ps(0.0f, p[2], p[1], p[0]); - - // dot = x*x + y*y + z*z - __m128 dot = _mm_dp_ps(v, v, 0x71); // xyz, result in x - - float lenSq = _mm_cvtss_f32(dot); - - if (lenSq != 0.0f) - { - float invLen = 1.0f / sqrtf(lenSq); - __m128 scale = _mm_set1_ps(invLen); - v = _mm_mul_ps(v, scale); - } - else - { - // fallback [0,0,1] - v = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); - } - - p[0] = _mm_cvtss_f32(v); - p[1] = _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1))); - p[2] = _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2))); -} - -static void m_point3F_normalize_f_sse41(float* p, float val) -{ - // [x y z 0] - __m128 v = _mm_set_ps(0.0f, p[2], p[1], p[0]); - - // dot = x*x + y*y + z*z - __m128 dot = _mm_dp_ps(v, v, 0x71); // xyz, result in x - - float lenSq = _mm_cvtss_f32(dot); - - if (lenSq != 0.0f) - { - float invLen = val / sqrtf(lenSq); - __m128 scale = _mm_set1_ps(invLen); - v = _mm_mul_ps(v, scale); - } - else - { - // fallback [0,0,1] - v = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); - } - - p[0] = _mm_cvtss_f32(v); - p[1] = _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1))); - p[2] = _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2))); -} - -void matF_x_point4F_sse41(const float* m, const float* p, float* out) -{ - __m128 point = _mm_loadu_ps(p); - - __m128 r0 = _mm_loadu_ps(m + 0); - __m128 r1 = _mm_loadu_ps(m + 4); - __m128 r2 = _mm_loadu_ps(m + 8); - __m128 r3 = _mm_loadu_ps(m + 12); - - out[0] = _mm_cvtss_f32(_mm_dp_ps(r0, point, 0xF1)); - out[1] = _mm_cvtss_f32(_mm_dp_ps(r1, point, 0xF2)); - out[2] = _mm_cvtss_f32(_mm_dp_ps(r2, point, 0xF4)); - out[3] = _mm_cvtss_f32(_mm_dp_ps(r3, point, 0xF8)); -} - -static void m_matF_x_matF_sse41(const float* A, const float* B, float* R) -{ - __m128 col0 = _mm_set_ps(B[12], B[8], B[4], B[0]); - __m128 col1 = _mm_set_ps(B[13], B[9], B[5], B[1]); - __m128 col2 = _mm_set_ps(B[14], B[10], B[6], B[2]); - __m128 col3 = _mm_set_ps(B[15], B[11], B[7], B[3]); - - for (int i = 0; i < 4; i++) - { - __m128 row = _mm_loadu_ps(A + i * 4); - - R[i * 4 + 0] = _mm_cvtss_f32(_mm_dp_ps(row, col0, 0xF1)); - R[i * 4 + 1] = _mm_cvtss_f32(_mm_dp_ps(row, col1, 0xF1)); - R[i * 4 + 2] = _mm_cvtss_f32(_mm_dp_ps(row, col2, 0xF1)); - R[i * 4 + 3] = _mm_cvtss_f32(_mm_dp_ps(row, col3, 0xF1)); - } -} - -void mInstallLibrary_SSE41() -{ - m_point3F_normalize = m_point3F_normalize_sse41; - m_point3F_normalize_f = m_point3F_normalize_f_sse41; - m_matF_x_point4F = matF_x_point4F_sse41; - m_matF_x_matF = m_matF_x_matF_sse41; - m_matF_x_matF_aligned = m_matF_x_matF_sse41; -} diff --git a/Engine/source/platformWin32/winMath.cpp b/Engine/source/platformWin32/winMath.cpp index 1bf148f35..8e5e9cf7c 100644 --- a/Engine/source/platformWin32/winMath.cpp +++ b/Engine/source/platformWin32/winMath.cpp @@ -26,12 +26,8 @@ #include "math/mMath.h" - -extern void mInstallLibrary_SSE2(); -extern void mInstallLibrary_SSE41(); -extern void mInstallLibrary_AVX2(); extern void mInstallLibrary_C(); -extern void mInstallLibrary_ASM(); +extern void mInstallLibrary_CPU(); //-------------------------------------- DefineEngineStringlyVariadicFunction( mathInit, void, 1, 10, "( ... )" @@ -103,17 +99,8 @@ void Math::init(U32 properties) if (properties & CPU_PROP_SSE) { - Con::printf(" Installing SSE extensions"); - if (properties & CPU_PROP_SSE2) - mInstallLibrary_SSE2(); - if(properties & CPU_PROP_SSE4_1) - mInstallLibrary_SSE41(); - } - - if (properties & CPU_PROP_AVX2) - { - Con::printf(" Installing AVX2 extensions"); - mInstallLibrary_AVX2(); + Con::printf(" Installing extensions"); + mInstallLibrary_CPU(); } Con::printf(" ");