alot more changes here added some matrix functions
most of the point3 and point4 is now converted over to simd functions
matrix now has a lot done by simd but transpose/normal/affine inverse/ inverse still to do
This commit is contained in:
marauder2k7 2026-02-25 20:36:53 +00:00
parent 8908bbe462
commit fb986e375d
5 changed files with 336 additions and 44 deletions

View file

View file

@ -52,22 +52,132 @@ extern ISA gISA;
//------------------------------------------------------------------------------
// Forward declare math_backend
namespace math_backend {
namespace float4 {
namespace float3 {
// Operators
void add(const float* a, const float* b, float* r);
void sub(const float* a, const float* b, float* r);
void mul(const float* a, const float* b, float* r);
void mul_scalar(const float* a, float s, float* r);
void div(const float* a, const float* b, float* r);
void div_scalar(const float* a, float s, float* r);
float dot(const float* a, const float* b);
void lerp(const float* from, const float* to, float f, float* r);
// Vector math
float dot(const float* a, const float* b);
inline float lengthSquared(const float* a)
{
return dot(a, a);
}
inline float length(const float* a)
{
return sqrt(lengthSquared(a));
}
inline void normalize(float* a)
{
float len = length(a);
mul_scalar(a, 1.0f / len, a);
}
inline void normalizeSafe(float* a)
{
float len = length(a);
if (len >= POINT_EPSILON)
mul_scalar(a, 1.0f / len, a);
}
inline void normalizeMag(float* a, float r)
{
float len = length(a);
if (len >= POINT_EPSILON)
mul_scalar(a, r / len, a);
}
void lerp(const float* from, const float* to, float f, float* r);
void addArray(const class Point4F* a, const class Point4F* b, class Point4F* r, size_t n);
void subArray(const class Point4F* a, const class Point4F* b, class Point4F* r, size_t n);
void mulArray(const class Point4F* a, const class Point4F* b, class Point4F* r, size_t n);
void mulArrayScalar(const class Point4F* a, float s, class Point4F* r, size_t n);
void lerpArray(const class Point4F* from, const class Point4F* to, float f, class Point4F* r, size_t n);
}
namespace float4 {
// Operators
void add(const float* a, const float* b, float* r);
void sub(const float* a, const float* b, float* r);
void mul(const float* a, const float* b, float* r);
void mul_scalar(const float* a, float s, float* r);
void div(const float* a, const float* b, float* r);
void div_scalar(const float* a, float s, float* r);
// Vector math
float dot(const float* a, const float* b);
inline float lengthSquared(const float* a)
{
return dot(a, a);
}
inline float length(const float* a)
{
return sqrt(lengthSquared(a));
}
inline void normalize(float* a)
{
float len = length(a);
mul_scalar(a, 1.0f / len, a);
}
inline void normalizeSafe(float* a)
{
float len = length(a);
if(len >= POINT_EPSILON)
mul_scalar(a, 1.0f / len, a);
}
inline void normalizeMag(float* a, float r)
{
float len = length(a);
if (len >= POINT_EPSILON)
mul_scalar(a, r / len, a);
}
void lerp(const float* from, const float* to, float f, float* r);
void addArray(const class Point4F* a, const class Point4F* b, class Point4F* r, size_t n);
void subArray(const class Point4F* a, const class Point4F* b, class Point4F* r, size_t n);
void mulArray(const class Point4F* a, const class Point4F* b, class Point4F* r, size_t n);
void mulArrayScalar(const class Point4F* a, float s, class Point4F* r, size_t n);
void lerpArray(const class Point4F* from, const class Point4F* to, float f, class Point4F* r, size_t n);
}
namespace float_mat4x4 {
void multiply_mat4(const float* A, const float* B, float* R);
void multiply_float4(const float* m, const float* p, float* R);
inline void multiply_float3(const float* m, const float* p, float* R)
{
float va[4] = { p[0], p[1], p[2], 1.0f };
float vr[4];
multiply_float4(m, va, vr);
R[0] = vr[0];
R[1] = vr[1];
R[2] = vr[2];
}
inline void multiply_vector(const float* m, const float* v, float* R)
{
// vector, w = 0
float va[4] = { v[0], v[1], v[2], 0.0f };
float vr[4];
multiply_float4(m, va, vr);
R[0] = vr[0];
R[1] = vr[1];
R[2] = vr[2];
}
void scale(float* M, const float* S);
}
} // namespace math_backend
//--------------------------------------

View file

@ -237,3 +237,193 @@ float math_backend::float4::dot(const float* a, const float* b)
return a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
}
void math_backend::float3::add(const float* a, const float* b, float* r)
{
// Pad 3 floats into 4 for SIMD
float va[4] = { a[0], a[1], a[2], 0.0f };
float vb[4] = { b[0], b[1], b[2], 0.0f };
float vr[4];
float4::add(va, vb, vr);
r[0] = vr[0];
r[1] = vr[1];
r[2] = vr[2];
}
void math_backend::float3::sub(const float* a, const float* b, float* r)
{
// Pad 3 floats into 4 for SIMD
float va[4] = { a[0], a[1], a[2], 0.0f };
float vb[4] = { b[0], b[1], b[2], 0.0f };
float vr[4];
float4::sub(va, vb, vr);
r[0] = vr[0];
r[1] = vr[1];
r[2] = vr[2];
}
void math_backend::float3::mul(const float* a, const float* b, float* r)
{
// Pad 3 floats into 4 for SIMD
float va[4] = { a[0], a[1], a[2], 0.0f };
float vb[4] = { b[0], b[1], b[2], 0.0f };
float vr[4];
float4::mul(va, vb, vr);
r[0] = vr[0];
r[1] = vr[1];
r[2] = vr[2];
}
void math_backend::float3::mul_scalar(const float* a, float s, float* r)
{
float va[4] = { a[0], a[1], a[2], 0.0f };
float vr[4];
float4::mul_scalar(va, s, vr);
r[0] = vr[0];
r[1] = vr[1];
r[2] = vr[2];
}
float math_backend::float3::dot(const float* a, const float* b)
{
float va[4] = { a[0], a[1], a[2], 0.0f };
float vb[4] = { b[0], b[1], b[2], 0.0f };
return float4::dot(va, vb);
}
void math_backend::float_mat4x4::multiply_mat4(const float* A, const float* B, float* R)
{
__m128 b0 = _mm_loadu_ps(B + 0);
__m128 b1 = _mm_loadu_ps(B + 4);
__m128 b2 = _mm_loadu_ps(B + 8);
__m128 b3 = _mm_loadu_ps(B + 12);
for (int i = 0; i < 4; i++)
{
__m128 a = _mm_loadu_ps(A + i * 4);
__m128 r =
_mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), b0);
r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)), b1));
r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)), b2));
r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)), b3));
_mm_storeu_ps(R + i * 4, r);
}
}
void math_backend::float_mat4x4::multiply_float4(const float* M, const float* P, float* R)
{
#if TORQUE_MATH_x64
switch (gISA)
{
case ISA::AVX2:
case ISA::AVX:
case ISA::SSE41:
#ifdef TORQUE_HAVE_SSE4_1
{
__m128 p = _mm_loadu_ps(P);
__m128 r0 = _mm_dp_ps(_mm_loadu_ps(M + 0), p, 0xF1);
__m128 r1 = _mm_dp_ps(_mm_loadu_ps(M + 4), p, 0xF2);
__m128 r2 = _mm_dp_ps(_mm_loadu_ps(M + 8), p, 0xF4);
__m128 r3 = _mm_dp_ps(_mm_loadu_ps(M + 12), p, 0xF8);
__m128 r = _mm_or_ps(_mm_or_ps(r0, r1), _mm_or_ps(r2, r3));
_mm_storeu_ps(R, r);
return;
}
#endif
case ISA::SSE2:
{
__m128 p = _mm_loadu_ps(P);
__m128 xxxx = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0));
__m128 yyyy = _mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1));
__m128 zzzz = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2));
__m128 wwww = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3));
__m128 r =
_mm_mul_ps(_mm_loadu_ps(M + 0), xxxx);
r = _mm_add_ps(r, _mm_mul_ps(_mm_loadu_ps(M + 4), yyyy));
r = _mm_add_ps(r, _mm_mul_ps(_mm_loadu_ps(M + 8), zzzz));
r = _mm_add_ps(r, _mm_mul_ps(_mm_loadu_ps(M + 12), wwww));
// transpose from column accum to row result
__m128 t0 = _mm_unpacklo_ps(r, r);
__m128 t1 = _mm_unpackhi_ps(r, r);
__m128 t2 = _mm_movelh_ps(t0, t1);
__m128 t3 = _mm_movehl_ps(t1, t0);
_mm_storeu_ps(R, _mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2, 0, 2, 0)));
return;
}
default: break;
}
#elif TORQUE_MATH_ARM
float32x4_t p = vld1q_f32(P);
float32x4_t r =
vmulq_n_f32(vld1q_f32(M + 0), vgetq_lane_f32(p, 0));
r = vmlaq_n_f32(r, vld1q_f32(M + 4), vgetq_lane_f32(p, 1));
r = vmlaq_n_f32(r, vld1q_f32(M + 8), vgetq_lane_f32(p, 2));
r = vmlaq_n_f32(r, vld1q_f32(M + 12), vgetq_lane_f32(p, 3));
// horizontal add rows
float32x2_t s0 = vadd_f32(vget_low_f32(r), vget_high_f32(r));
float32x2_t s1 = vpadd_f32(s0, s0);
vst1q_lane_f32(R, s1, 0);
return;
#endif
R[0] = M[0] * P[0] + M[1] * P[1] + M[2] * P[2] + M[3] * P[3];
R[1] = M[4] * P[0] + M[5] * P[1] + M[6] * P[2] + M[7] * P[3];
R[2] = M[8] * P[0] + M[9] * P[1] + M[10] * P[2] + M[11] * P[3];
R[3] = M[12] * P[0] + M[13] * P[1] + M[14] * P[2] + M[15] * P[3];
}
void math_backend::float_mat4x4::scale(float* M, const float* S)
{
float scalef[4] = { S[0], S[1], S[2], 1.0f };
#if TORQUE_MATH_x64
__m128 scale = _mm_loadu_ps(scalef); // S = {x, y, z, 1}
__m128 row0 = _mm_loadu_ps(M + 0);
__m128 row1 = _mm_loadu_ps(M + 4);
__m128 row2 = _mm_loadu_ps(M + 8);
__m128 row3 = _mm_loadu_ps(M + 12);
row0 = _mm_mul_ps(row0, scale);
row1 = _mm_mul_ps(row1, scale);
row2 = _mm_mul_ps(row2, scale);
row3 = _mm_mul_ps(row3, scale);
_mm_storeu_ps(M + 0, row0);
_mm_storeu_ps(M + 4, row1);
_mm_storeu_ps(M + 8, row2);
_mm_storeu_ps(M + 12, row3);
return;
#elif TORQUE_MATH_ARM
float32x4_t scale = vld1q_f32(scalef);
vst1q_f32(M + 0, vmulq_f32(vld1q_f32(M + 0), scale));
vst1q_f32(M + 4, vmulq_f32(vld1q_f32(M + 4), scale));
vst1q_f32(M + 8, vmulq_f32(vld1q_f32(M + 8), scale));
vst1q_f32(M + 12, vmulq_f32(vld1q_f32(M + 12), scale));
return;
#endif
M[0] *= S[0]; M[1] *= S[1]; M[2] *= S[2];
M[4] *= S[0]; M[5] *= S[1]; M[6] *= S[2];
M[8] *= S[0]; M[9] *= S[1]; M[10] *= S[2];
M[12] *= S[0]; M[13] *= S[1]; M[14] *= S[2];
}

View file

@ -396,7 +396,7 @@ inline MatrixF& MatrixF::transpose()
inline MatrixF& MatrixF::scale(const Point3F& p)
{
m_matF_scale(m,p);
math_backend::float_mat4x4::scale(m,p);
return *this;
}
@ -419,7 +419,7 @@ inline MatrixF& MatrixF::mul( const MatrixF &a )
AssertFatal(&a != this, "MatrixF::mul - a.mul(a) is invalid!");
MatrixF tempThis(*this);
m_matF_x_matF(tempThis, a, *this);
math_backend::float_mat4x4::multiply_mat4(tempThis, a, *this);
return (*this);
}
@ -428,7 +428,7 @@ inline MatrixF& MatrixF::mulL( const MatrixF &a )
AssertFatal(&a != this, "MatrixF::mulL - a.mul(a) is invalid!");
MatrixF tempThis(*this);
m_matF_x_matF(a, tempThis, *this);
math_backend::float_mat4x4::multiply_mat4(a, tempThis, *this);
return (*this);
}
@ -436,7 +436,7 @@ inline MatrixF& MatrixF::mul( const MatrixF &a, const MatrixF &b )
{ // a * b -> M
AssertFatal((&a != this) && (&b != this), "MatrixF::mul - a.mul(a, b) a.mul(b, a) a.mul(a, a) is invalid!");
m_matF_x_matF(a, b, *this);
math_backend::float_mat4x4::multiply_mat4(a, b, *this);
return (*this);
}
@ -461,36 +461,36 @@ inline MatrixF& MatrixF::mul(const MatrixF &a, const F32 b)
inline void MatrixF::mul( Point4F& p ) const
{
Point4F temp;
m_matF_x_point4F(*this, &p.x, &temp.x);
math_backend::float_mat4x4::multiply_float4(*this, &p.x, &temp.x);
p = temp;
}
inline void MatrixF::mulP( Point3F& p) const
{
// M * p -> d
Point3F d;
m_matF_x_point3F(*this, &p.x, &d.x);
p = d;
Point3F temp;
math_backend::float_mat4x4::multiply_float3(*this, &p.x, &temp.x);
p = temp;
}
inline void MatrixF::mulP( const Point3F &p, Point3F *d) const
{
// M * p -> d
m_matF_x_point3F(*this, &p.x, &d->x);
math_backend::float_mat4x4::multiply_float3(*this, &p.x, &d->x);
}
inline void MatrixF::mulV( VectorF& v) const
{
// M * v -> v
VectorF temp;
m_matF_x_vectorF(*this, &v.x, &temp.x);
math_backend::float_mat4x4::multiply_vector(*this, &v.x, &temp.x);
v = temp;
}
inline void MatrixF::mulV( const VectorF &v, Point3F *d) const
{
// M * v -> d
m_matF_x_vectorF(*this, &v.x, &d->x);
math_backend::float_mat4x4::multiply_vector(*this, &v.x, &d->x);
}
inline void MatrixF::mul(Box3F& b) const

View file

@ -599,17 +599,17 @@ inline void Point3F::convolveInverse(const Point3F& c)
inline F32 Point3F::lenSquared() const
{
return (x * x) + (y * y) + (z * z);
return math_backend::float3::lengthSquared(*this);
}
inline F32 Point3F::len() const
{
return mSqrt(x*x + y*y + z*z);
return math_backend::float3::length(*this);
}
inline void Point3F::normalize()
{
m_point3F_normalize(*this);
math_backend::float3::normalize(*this);
}
inline F32 Point3F::magnitudeSafe() const
@ -626,18 +626,13 @@ inline F32 Point3F::magnitudeSafe() const
inline void Point3F::normalizeSafe()
{
F32 vmag = magnitudeSafe();
if( vmag > POINT_EPSILON )
{
*this *= F32(1.0 / vmag);
}
math_backend::float3::normalizeSafe(*this);
}
inline void Point3F::normalize(F32 val)
{
m_point3F_normalize_f(*this, val);
math_backend::float3::normalizeMag(*this, val);
}
inline bool Point3F::operator==(const Point3F& _test) const
@ -652,35 +647,35 @@ inline bool Point3F::operator!=(const Point3F& _test) const
inline Point3F Point3F::operator+(const Point3F& _add) const
{
return Point3F(x + _add.x, y + _add.y, z + _add.z);
Point3F r;
math_backend::float3::add(*this, _add, r);
return r;
}
inline Point3F Point3F::operator-(const Point3F& _rSub) const
{
return Point3F(x - _rSub.x, y - _rSub.y, z - _rSub.z);
Point3F r;
math_backend::float3::sub(*this, _rSub, r);
return r;
}
inline Point3F& Point3F::operator+=(const Point3F& _add)
{
x += _add.x;
y += _add.y;
z += _add.z;
math_backend::float3::add(*this, _add, *this);
return *this;
}
inline Point3F& Point3F::operator-=(const Point3F& _rSub)
{
x -= _rSub.x;
y -= _rSub.y;
z -= _rSub.z;
math_backend::float3::sub(*this, _rSub, *this);
return *this;
}
inline Point3F Point3F::operator*(F32 _mul) const
{
return Point3F(x * _mul, y * _mul, z * _mul);
Point3F r;
math_backend::float3::mul_scalar(*this, _mul, r);
return r;
}
inline Point3F Point3F::operator/(F32 _div) const
@ -694,10 +689,7 @@ inline Point3F Point3F::operator/(F32 _div) const
inline Point3F& Point3F::operator*=(F32 _mul)
{
x *= _mul;
y *= _mul;
z *= _mul;
math_backend::float3::mul_scalar(*this, _mul, *this);
return *this;
}
@ -715,14 +707,14 @@ inline Point3F& Point3F::operator/=(F32 _div)
inline Point3F Point3F::operator*(const Point3F &_vec) const
{
return Point3F(x * _vec.x, y * _vec.y, z * _vec.z);
Point3F r;
math_backend::float3::mul(*this, _vec, r);
return r;
}
inline Point3F& Point3F::operator*=(const Point3F &_vec)
{
x *= _vec.x;
y *= _vec.y;
z *= _vec.z;
math_backend::float3::mul(*this, _vec, *this);
return *this;
}
@ -999,7 +991,7 @@ inline Point3D operator*(F64 mul, const Point3D& multiplicand)
inline F32 mDot(const Point3F &p1, const Point3F &p2)
{
return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z);
return math_backend::float3::dot(p1, p2);
}
inline F64 mDot(const Point3D &p1, const Point3D &p2)