From fb986e375d743bc083a0c52d20d4385bfc2dab27 Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Wed, 25 Feb 2026 20:36:53 +0000 Subject: [PATCH] backup alot more changes here added some matrix functions most of the point3 and point4 is now converted over to simd functions matrix now has a lot done by simd but transpose/normal/affine inverse/ inverse still to do --- Engine/source/math/mMathFn.cpp | 0 Engine/source/math/mMathFn.h | 116 ++++++++++++++++++- Engine/source/math/mMath_CPU.cpp | 190 +++++++++++++++++++++++++++++++ Engine/source/math/mMatrix.h | 22 ++-- Engine/source/math/mPoint3.h | 52 ++++----- 5 files changed, 336 insertions(+), 44 deletions(-) create mode 100644 Engine/source/math/mMathFn.cpp diff --git a/Engine/source/math/mMathFn.cpp b/Engine/source/math/mMathFn.cpp new file mode 100644 index 000000000..e69de29bb diff --git a/Engine/source/math/mMathFn.h b/Engine/source/math/mMathFn.h index 946ba0e67..75ee9a870 100644 --- a/Engine/source/math/mMathFn.h +++ b/Engine/source/math/mMathFn.h @@ -52,22 +52,132 @@ extern ISA gISA; //------------------------------------------------------------------------------ // Forward declare math_backend namespace math_backend { - namespace float4 { + namespace float3 { + // Operators void add(const float* a, const float* b, float* r); void sub(const float* a, const float* b, float* r); void mul(const float* a, const float* b, float* r); void mul_scalar(const float* a, float s, float* r); void div(const float* a, const float* b, float* r); void div_scalar(const float* a, float s, float* r); - float dot(const float* a, const float* b); - void lerp(const float* from, const float* to, float f, float* r); + // Vector math + float dot(const float* a, const float* b); + + inline float lengthSquared(const float* a) + { + return dot(a, a); + } + + inline float length(const float* a) + { + return sqrt(lengthSquared(a)); + } + + inline void normalize(float* a) + { + float len = length(a); + mul_scalar(a, 1.0f / len, a); + } + + inline void normalizeSafe(float* a) + { + float len = length(a); + if (len >= POINT_EPSILON) + mul_scalar(a, 1.0f / len, a); + } + + inline void normalizeMag(float* a, float r) + { + float len = length(a); + if (len >= POINT_EPSILON) + mul_scalar(a, r / len, a); + } + + void lerp(const float* from, const float* to, float f, float* r); void addArray(const class Point4F* a, const class Point4F* b, class Point4F* r, size_t n); void subArray(const class Point4F* a, const class Point4F* b, class Point4F* r, size_t n); void mulArray(const class Point4F* a, const class Point4F* b, class Point4F* r, size_t n); void mulArrayScalar(const class Point4F* a, float s, class Point4F* r, size_t n); void lerpArray(const class Point4F* from, const class Point4F* to, float f, class Point4F* r, size_t n); } + + namespace float4 { + // Operators + void add(const float* a, const float* b, float* r); + void sub(const float* a, const float* b, float* r); + void mul(const float* a, const float* b, float* r); + void mul_scalar(const float* a, float s, float* r); + void div(const float* a, const float* b, float* r); + void div_scalar(const float* a, float s, float* r); + + // Vector math + float dot(const float* a, const float* b); + + inline float lengthSquared(const float* a) + { + return dot(a, a); + } + + inline float length(const float* a) + { + return sqrt(lengthSquared(a)); + } + + inline void normalize(float* a) + { + float len = length(a); + mul_scalar(a, 1.0f / len, a); + } + + inline void normalizeSafe(float* a) + { + float len = length(a); + if(len >= POINT_EPSILON) + mul_scalar(a, 1.0f / len, a); + } + + inline void normalizeMag(float* a, float r) + { + float len = length(a); + if (len >= POINT_EPSILON) + mul_scalar(a, r / len, a); + } + + void lerp(const float* from, const float* to, float f, float* r); + void addArray(const class Point4F* a, const class Point4F* b, class Point4F* r, size_t n); + void subArray(const class Point4F* a, const class Point4F* b, class Point4F* r, size_t n); + void mulArray(const class Point4F* a, const class Point4F* b, class Point4F* r, size_t n); + void mulArrayScalar(const class Point4F* a, float s, class Point4F* r, size_t n); + void lerpArray(const class Point4F* from, const class Point4F* to, float f, class Point4F* r, size_t n); + } + + namespace float_mat4x4 { + void multiply_mat4(const float* A, const float* B, float* R); + void multiply_float4(const float* m, const float* p, float* R); + inline void multiply_float3(const float* m, const float* p, float* R) + { + float va[4] = { p[0], p[1], p[2], 1.0f }; + float vr[4]; + multiply_float4(m, va, vr); + R[0] = vr[0]; + R[1] = vr[1]; + R[2] = vr[2]; + } + + inline void multiply_vector(const float* m, const float* v, float* R) + { + // vector, w = 0 + float va[4] = { v[0], v[1], v[2], 0.0f }; + float vr[4]; + multiply_float4(m, va, vr); + R[0] = vr[0]; + R[1] = vr[1]; + R[2] = vr[2]; + } + + void scale(float* M, const float* S); + } } // namespace math_backend //-------------------------------------- diff --git a/Engine/source/math/mMath_CPU.cpp b/Engine/source/math/mMath_CPU.cpp index a3391c911..6e7c6708c 100644 --- a/Engine/source/math/mMath_CPU.cpp +++ b/Engine/source/math/mMath_CPU.cpp @@ -237,3 +237,193 @@ float math_backend::float4::dot(const float* a, const float* b) return a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]; } + +void math_backend::float3::add(const float* a, const float* b, float* r) +{ + // Pad 3 floats into 4 for SIMD + float va[4] = { a[0], a[1], a[2], 0.0f }; + float vb[4] = { b[0], b[1], b[2], 0.0f }; + float vr[4]; + float4::add(va, vb, vr); + r[0] = vr[0]; + r[1] = vr[1]; + r[2] = vr[2]; +} + +void math_backend::float3::sub(const float* a, const float* b, float* r) +{ + // Pad 3 floats into 4 for SIMD + float va[4] = { a[0], a[1], a[2], 0.0f }; + float vb[4] = { b[0], b[1], b[2], 0.0f }; + float vr[4]; + float4::sub(va, vb, vr); + r[0] = vr[0]; + r[1] = vr[1]; + r[2] = vr[2]; +} + +void math_backend::float3::mul(const float* a, const float* b, float* r) +{ + // Pad 3 floats into 4 for SIMD + float va[4] = { a[0], a[1], a[2], 0.0f }; + float vb[4] = { b[0], b[1], b[2], 0.0f }; + float vr[4]; + float4::mul(va, vb, vr); + r[0] = vr[0]; + r[1] = vr[1]; + r[2] = vr[2]; +} + +void math_backend::float3::mul_scalar(const float* a, float s, float* r) +{ + float va[4] = { a[0], a[1], a[2], 0.0f }; + float vr[4]; + float4::mul_scalar(va, s, vr); + r[0] = vr[0]; + r[1] = vr[1]; + r[2] = vr[2]; +} + +float math_backend::float3::dot(const float* a, const float* b) +{ + float va[4] = { a[0], a[1], a[2], 0.0f }; + float vb[4] = { b[0], b[1], b[2], 0.0f }; + return float4::dot(va, vb); +} + + +void math_backend::float_mat4x4::multiply_mat4(const float* A, const float* B, float* R) +{ + __m128 b0 = _mm_loadu_ps(B + 0); + __m128 b1 = _mm_loadu_ps(B + 4); + __m128 b2 = _mm_loadu_ps(B + 8); + __m128 b3 = _mm_loadu_ps(B + 12); + + for (int i = 0; i < 4; i++) + { + __m128 a = _mm_loadu_ps(A + i * 4); + + __m128 r = + _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), b0); + + r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)), b1)); + r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)), b2)); + r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)), b3)); + + _mm_storeu_ps(R + i * 4, r); + } +} + +void math_backend::float_mat4x4::multiply_float4(const float* M, const float* P, float* R) +{ +#if TORQUE_MATH_x64 + switch (gISA) + { + case ISA::AVX2: + case ISA::AVX: + case ISA::SSE41: +#ifdef TORQUE_HAVE_SSE4_1 + { + __m128 p = _mm_loadu_ps(P); + + __m128 r0 = _mm_dp_ps(_mm_loadu_ps(M + 0), p, 0xF1); + __m128 r1 = _mm_dp_ps(_mm_loadu_ps(M + 4), p, 0xF2); + __m128 r2 = _mm_dp_ps(_mm_loadu_ps(M + 8), p, 0xF4); + __m128 r3 = _mm_dp_ps(_mm_loadu_ps(M + 12), p, 0xF8); + + __m128 r = _mm_or_ps(_mm_or_ps(r0, r1), _mm_or_ps(r2, r3)); + _mm_storeu_ps(R, r); + return; + } +#endif + case ISA::SSE2: + { + __m128 p = _mm_loadu_ps(P); + + __m128 xxxx = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 yyyy = _mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)); + __m128 zzzz = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 wwww = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); + + __m128 r = + _mm_mul_ps(_mm_loadu_ps(M + 0), xxxx); + + r = _mm_add_ps(r, _mm_mul_ps(_mm_loadu_ps(M + 4), yyyy)); + r = _mm_add_ps(r, _mm_mul_ps(_mm_loadu_ps(M + 8), zzzz)); + r = _mm_add_ps(r, _mm_mul_ps(_mm_loadu_ps(M + 12), wwww)); + + // transpose from column accum to row result + __m128 t0 = _mm_unpacklo_ps(r, r); + __m128 t1 = _mm_unpackhi_ps(r, r); + __m128 t2 = _mm_movelh_ps(t0, t1); + __m128 t3 = _mm_movehl_ps(t1, t0); + + _mm_storeu_ps(R, _mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2, 0, 2, 0))); + return; + } + + default: break; + } + +#elif TORQUE_MATH_ARM + + float32x4_t p = vld1q_f32(P); + + float32x4_t r = + vmulq_n_f32(vld1q_f32(M + 0), vgetq_lane_f32(p, 0)); + + r = vmlaq_n_f32(r, vld1q_f32(M + 4), vgetq_lane_f32(p, 1)); + r = vmlaq_n_f32(r, vld1q_f32(M + 8), vgetq_lane_f32(p, 2)); + r = vmlaq_n_f32(r, vld1q_f32(M + 12), vgetq_lane_f32(p, 3)); + + // horizontal add rows + float32x2_t s0 = vadd_f32(vget_low_f32(r), vget_high_f32(r)); + float32x2_t s1 = vpadd_f32(s0, s0); + vst1q_lane_f32(R, s1, 0); + return; + +#endif + + R[0] = M[0] * P[0] + M[1] * P[1] + M[2] * P[2] + M[3] * P[3]; + R[1] = M[4] * P[0] + M[5] * P[1] + M[6] * P[2] + M[7] * P[3]; + R[2] = M[8] * P[0] + M[9] * P[1] + M[10] * P[2] + M[11] * P[3]; + R[3] = M[12] * P[0] + M[13] * P[1] + M[14] * P[2] + M[15] * P[3]; +} + +void math_backend::float_mat4x4::scale(float* M, const float* S) +{ + float scalef[4] = { S[0], S[1], S[2], 1.0f }; +#if TORQUE_MATH_x64 + __m128 scale = _mm_loadu_ps(scalef); // S = {x, y, z, 1} + + __m128 row0 = _mm_loadu_ps(M + 0); + __m128 row1 = _mm_loadu_ps(M + 4); + __m128 row2 = _mm_loadu_ps(M + 8); + __m128 row3 = _mm_loadu_ps(M + 12); + + row0 = _mm_mul_ps(row0, scale); + row1 = _mm_mul_ps(row1, scale); + row2 = _mm_mul_ps(row2, scale); + row3 = _mm_mul_ps(row3, scale); + + _mm_storeu_ps(M + 0, row0); + _mm_storeu_ps(M + 4, row1); + _mm_storeu_ps(M + 8, row2); + _mm_storeu_ps(M + 12, row3); + return; +#elif TORQUE_MATH_ARM + float32x4_t scale = vld1q_f32(scalef); + + vst1q_f32(M + 0, vmulq_f32(vld1q_f32(M + 0), scale)); + vst1q_f32(M + 4, vmulq_f32(vld1q_f32(M + 4), scale)); + vst1q_f32(M + 8, vmulq_f32(vld1q_f32(M + 8), scale)); + vst1q_f32(M + 12, vmulq_f32(vld1q_f32(M + 12), scale)); + return; +#endif + + M[0] *= S[0]; M[1] *= S[1]; M[2] *= S[2]; + M[4] *= S[0]; M[5] *= S[1]; M[6] *= S[2]; + M[8] *= S[0]; M[9] *= S[1]; M[10] *= S[2]; + M[12] *= S[0]; M[13] *= S[1]; M[14] *= S[2]; + +} diff --git a/Engine/source/math/mMatrix.h b/Engine/source/math/mMatrix.h index 763a62b93..6da6374c6 100644 --- a/Engine/source/math/mMatrix.h +++ b/Engine/source/math/mMatrix.h @@ -396,7 +396,7 @@ inline MatrixF& MatrixF::transpose() inline MatrixF& MatrixF::scale(const Point3F& p) { - m_matF_scale(m,p); + math_backend::float_mat4x4::scale(m,p); return *this; } @@ -419,7 +419,7 @@ inline MatrixF& MatrixF::mul( const MatrixF &a ) AssertFatal(&a != this, "MatrixF::mul - a.mul(a) is invalid!"); MatrixF tempThis(*this); - m_matF_x_matF(tempThis, a, *this); + math_backend::float_mat4x4::multiply_mat4(tempThis, a, *this); return (*this); } @@ -428,7 +428,7 @@ inline MatrixF& MatrixF::mulL( const MatrixF &a ) AssertFatal(&a != this, "MatrixF::mulL - a.mul(a) is invalid!"); MatrixF tempThis(*this); - m_matF_x_matF(a, tempThis, *this); + math_backend::float_mat4x4::multiply_mat4(a, tempThis, *this); return (*this); } @@ -436,7 +436,7 @@ inline MatrixF& MatrixF::mul( const MatrixF &a, const MatrixF &b ) { // a * b -> M AssertFatal((&a != this) && (&b != this), "MatrixF::mul - a.mul(a, b) a.mul(b, a) a.mul(a, a) is invalid!"); - m_matF_x_matF(a, b, *this); + math_backend::float_mat4x4::multiply_mat4(a, b, *this); return (*this); } @@ -461,36 +461,36 @@ inline MatrixF& MatrixF::mul(const MatrixF &a, const F32 b) inline void MatrixF::mul( Point4F& p ) const { Point4F temp; - m_matF_x_point4F(*this, &p.x, &temp.x); + math_backend::float_mat4x4::multiply_float4(*this, &p.x, &temp.x); p = temp; } inline void MatrixF::mulP( Point3F& p) const { // M * p -> d - Point3F d; - m_matF_x_point3F(*this, &p.x, &d.x); - p = d; + Point3F temp; + math_backend::float_mat4x4::multiply_float3(*this, &p.x, &temp.x); + p = temp; } inline void MatrixF::mulP( const Point3F &p, Point3F *d) const { // M * p -> d - m_matF_x_point3F(*this, &p.x, &d->x); + math_backend::float_mat4x4::multiply_float3(*this, &p.x, &d->x); } inline void MatrixF::mulV( VectorF& v) const { // M * v -> v VectorF temp; - m_matF_x_vectorF(*this, &v.x, &temp.x); + math_backend::float_mat4x4::multiply_vector(*this, &v.x, &temp.x); v = temp; } inline void MatrixF::mulV( const VectorF &v, Point3F *d) const { // M * v -> d - m_matF_x_vectorF(*this, &v.x, &d->x); + math_backend::float_mat4x4::multiply_vector(*this, &v.x, &d->x); } inline void MatrixF::mul(Box3F& b) const diff --git a/Engine/source/math/mPoint3.h b/Engine/source/math/mPoint3.h index c1bb72923..7d12608ab 100644 --- a/Engine/source/math/mPoint3.h +++ b/Engine/source/math/mPoint3.h @@ -599,17 +599,17 @@ inline void Point3F::convolveInverse(const Point3F& c) inline F32 Point3F::lenSquared() const { - return (x * x) + (y * y) + (z * z); + return math_backend::float3::lengthSquared(*this); } inline F32 Point3F::len() const { - return mSqrt(x*x + y*y + z*z); + return math_backend::float3::length(*this); } inline void Point3F::normalize() { - m_point3F_normalize(*this); + math_backend::float3::normalize(*this); } inline F32 Point3F::magnitudeSafe() const @@ -626,18 +626,13 @@ inline F32 Point3F::magnitudeSafe() const inline void Point3F::normalizeSafe() { - F32 vmag = magnitudeSafe(); - - if( vmag > POINT_EPSILON ) - { - *this *= F32(1.0 / vmag); - } + math_backend::float3::normalizeSafe(*this); } inline void Point3F::normalize(F32 val) { - m_point3F_normalize_f(*this, val); + math_backend::float3::normalizeMag(*this, val); } inline bool Point3F::operator==(const Point3F& _test) const @@ -652,35 +647,35 @@ inline bool Point3F::operator!=(const Point3F& _test) const inline Point3F Point3F::operator+(const Point3F& _add) const { - return Point3F(x + _add.x, y + _add.y, z + _add.z); + Point3F r; + math_backend::float3::add(*this, _add, r); + return r; } inline Point3F Point3F::operator-(const Point3F& _rSub) const { - return Point3F(x - _rSub.x, y - _rSub.y, z - _rSub.z); + Point3F r; + math_backend::float3::sub(*this, _rSub, r); + return r; } inline Point3F& Point3F::operator+=(const Point3F& _add) { - x += _add.x; - y += _add.y; - z += _add.z; - + math_backend::float3::add(*this, _add, *this); return *this; } inline Point3F& Point3F::operator-=(const Point3F& _rSub) { - x -= _rSub.x; - y -= _rSub.y; - z -= _rSub.z; - + math_backend::float3::sub(*this, _rSub, *this); return *this; } inline Point3F Point3F::operator*(F32 _mul) const { - return Point3F(x * _mul, y * _mul, z * _mul); + Point3F r; + math_backend::float3::mul_scalar(*this, _mul, r); + return r; } inline Point3F Point3F::operator/(F32 _div) const @@ -694,10 +689,7 @@ inline Point3F Point3F::operator/(F32 _div) const inline Point3F& Point3F::operator*=(F32 _mul) { - x *= _mul; - y *= _mul; - z *= _mul; - + math_backend::float3::mul_scalar(*this, _mul, *this); return *this; } @@ -715,14 +707,14 @@ inline Point3F& Point3F::operator/=(F32 _div) inline Point3F Point3F::operator*(const Point3F &_vec) const { - return Point3F(x * _vec.x, y * _vec.y, z * _vec.z); + Point3F r; + math_backend::float3::mul(*this, _vec, r); + return r; } inline Point3F& Point3F::operator*=(const Point3F &_vec) { - x *= _vec.x; - y *= _vec.y; - z *= _vec.z; + math_backend::float3::mul(*this, _vec, *this); return *this; } @@ -999,7 +991,7 @@ inline Point3D operator*(F64 mul, const Point3D& multiplicand) inline F32 mDot(const Point3F &p1, const Point3F &p2) { - return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z); + return math_backend::float3::dot(p1, p2); } inline F64 mDot(const Point3D &p1, const Point3D &p2)