further sse simd additions

avx2 float3 added
added normalize_magnitude
added divide fast to float3 may copy to float4

move static spheremesh to drawSphere (initialize on first use) so platform has a chance to load the math backend
This commit is contained in:
marauder2k7 2026-02-26 21:11:31 +00:00
parent 2559145c06
commit 18d0aa0418
16 changed files with 337 additions and 77 deletions

View file

@ -853,7 +853,11 @@ void GFXDrawUtil::drawThickLine(F32 x1, F32 y1, F32 z1, F32 x2, F32 y2, F32 z2,
// 3D World Draw Misc
//-----------------------------------------------------------------------------
static SphereMesh gSphere;
SphereMesh& getSphere()
{
static SphereMesh instance;
return instance;
}
void GFXDrawUtil::drawSphere( const GFXStateBlockDesc &desc, F32 radius, const Point3F &pos, const ColorI &color, bool drawTop, bool drawBottom, const MatrixF *xfm )
{
@ -868,7 +872,7 @@ void GFXDrawUtil::drawSphere( const GFXStateBlockDesc &desc, F32 radius, const P
GFX->pushWorldMatrix();
GFX->multWorld(mat);
const SphereMesh::TriangleMesh * sphereMesh = gSphere.getMesh(2);
const SphereMesh::TriangleMesh * sphereMesh = getSphere().getMesh(2);
S32 numPoly = sphereMesh->numPoly;
S32 totalPoly = 0;
GFXVertexBufferHandle<GFXVertexPCT> verts(mDevice, numPoly*3, GFXBufferTypeVolatile);

View file

@ -0,0 +1,115 @@
#pragma once
#include <cmath> // for sqrtf, etc.
#include "../mConstants.h"
// Safely loads a float3 -> simd 4 lane backend
namespace math_backend::float3
{
//----------------------------------------------------------
// Add two float4 vectors: r = a + b
inline void float3_add_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load3(a);
f32x4 vb = v_load3(b);
f32x4 vr = v_add(va, vb);
v_store3(r, vr);
}
// Subtract: r = a - b
inline void float3_sub_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load3(a);
f32x4 vb = v_load3(b);
f32x4 vr = v_sub(va, vb);
v_store3(r, vr);
}
// Multiply element-wise: r = a * b
inline void float3_mul_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load3(a);
f32x4 vb = v_load3(b);
f32x4 vr = v_mul(va, vb);
v_store3(r, vr);
}
// Multiply by scalar: r = a * s
inline void float3_mul_scalar_impl(const float* a, float s, float* r)
{
f32x4 va = v_load3(a);
f32x4 vs = v_set1(s);
f32x4 vr = v_mul(va, vs);
v_store3(r, vr);
}
// Divide element-wise: r = a / b
inline void float3_div_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load3(a);
f32x4 vb = v_load3(b);
f32x4 vr = v_div_fast(va, vb);
v_store3(r, vr);
}
// Divide by scalar: r = a / s
inline void float3_div_scalar_impl(const float* a, float s, float* r)
{
f32x4 va = v_load3(a);
f32x4 vs = v_set1(s);
f32x4 vr = v_div_fast(va, vs);
v_store3(r, vr);
}
// Dot product: returns scalar
inline float float3_dot_impl(const float* a, const float* b)
{
f32x4 va = v_load3(a);
f32x4 vb = v_load3(b);
f32x4 vmul = v_mul(va, vb);
f32x4 vsum = v_hadd3(vmul);
return v_extract0(vsum); // first lane is the sum of 3 elements
}
// Length squared
inline float float3_length_squared_impl(const float* a)
{
return float3_dot_impl(a, a);
}
// Length
inline float float3_length_impl(const float* a)
{
return std::sqrt(float3_length_squared_impl(a));
}
// Normalize in-place
inline void float3_normalize_impl(float* a)
{
float len = float3_length_impl(a);
if (len > POINT_EPSILON) // safe threshold
{
float3_mul_scalar_impl(a, 1.0f / len, a);
}
}
// Normalize with magnitude: r = normalize(a) * r
inline void float3_normalize_mag_impl(float* a, float r)
{
float len = float3_length_impl(a);
if (len > POINT_EPSILON)
{
float3_mul_scalar_impl(a, r / len, a);
}
}
// Linear interpolation: r = from + (to - from) * f
inline void float3_lerp_impl(const float* from, const float* to, float f, float* r)
{
f32x4 vfrom = v_load3(from);
f32x4 vto = v_load3(to);
f32x4 vf = v_set1(f);
f32x4 vr = v_add(vfrom, v_mul(vf, v_sub(vto, vfrom)));
v_store3(r, vr);
}
}

View file

@ -1,6 +1,6 @@
#include "math/public/float4_dispatch.h"
#include "math/mConstants.h"
#include <math.h>
#include <cmath> // for sqrtf, etc.
namespace math_backend::float4::dispatch
{
@ -27,7 +27,8 @@ namespace math_backend::float4::dispatch
};
gFloat4.div_scalar = [](const float* a, float s, float* r) {
for (int i = 0; i < 4; i++) r[i] = a[i] / s;
float denom = 1.0f / s;
for (int i = 0; i < 4; i++) r[i] = a[i] * denom;
};
gFloat4.dot = [](const float* a, const float* b) {
@ -39,7 +40,7 @@ namespace math_backend::float4::dispatch
gFloat4.length = [](const float* a) {
float sum = 0.f;
for (int i = 0; i < 4; i++) sum += a[i] * a[i];
return sqrtf(sum);
return std::sqrt(sum);
};
gFloat4.lengthSquared = [](const float* a) {

View file

@ -62,6 +62,7 @@ namespace math_backend::float4::dispatch
gFloat4.length = float4_length_impl;
gFloat4.lengthSquared = float4_length_squared_impl;
gFloat4.normalize = float4_normalize_impl;
gFloat4.normalize_mag = float4_normalize_mag_impl;
gFloat4.lerp = float4_lerp_impl;
}
}

View file

@ -0,0 +1,84 @@
#include "float3_dispatch.h"
#include <immintrin.h> // AVX/AVX2 intrinsics
namespace
{
typedef __m128 f32x4;
// Load 3 floats into 4-wide SIMD, zero the 4th lane
inline f32x4 v_load3(const float* p) { return _mm_set_ps(0.0f, p[2], p[1], p[0]); }
// Store 3 floats from SIMD register back to memory
inline void v_store3(float* dst, f32x4 v)
{
alignas(16) float tmp[4]; // temp storage
_mm_store_ps(tmp, v); // store all 4 lanes
dst[0] = tmp[0];
dst[1] = tmp[1];
dst[2] = tmp[2];
}
// extract just the first lane.
inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }
// Broadcast a single float across all 4 lanes
inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
// Element-wise multiply
inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
// Element-wise divide precise
inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
// Element-wise divide fast (1/b)
inline f32x4 v_div_fast(f32x4 a, f32x4 b)
{
f32x4 rcp = _mm_rcp_ps(b);
// Optional refinement here
return _mm_mul_ps(a, rcp);
}
// Element-wise add
inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
// Element-wise subtract
inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
// Horizontal sum of all elements (for dot product, length, etc.)
inline f32x4 v_hadd3(f32x4 a)
{
__m128 t1 = _mm_hadd_ps(a, a); // sums pairs: [a0+a1, a2+a3, ...]
__m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3
return t2;
}
float float3_dot_avx(const float* a, const float* b)
{
f32x4 va = _mm_loadu_ps(a);
f32x4 vb = _mm_loadu_ps(b);
__m128 dp = _mm_dp_ps(va, vb, 0x71); // multiply 3 (0x71), sum all 4, lowest lane
return _mm_cvtss_f32(dp);
}
}
#include "float3_impl.inl"
namespace math_backend::float3::dispatch
{
// Install AVX2 backend
void install_avx2()
{
gFloat3.add = float3_add_impl;
gFloat3.sub = float3_sub_impl;
gFloat3.mul = float3_mul_impl;
gFloat3.mul_scalar = float3_mul_scalar_impl;
gFloat3.div = float3_div_impl;
gFloat3.div_scalar = float3_div_scalar_impl;
gFloat3.dot = float3_dot_avx;
gFloat3.length = float3_length_impl;
gFloat3.lengthSquared = float3_length_squared_impl;
gFloat3.normalize = float3_normalize_impl;
gFloat3.normalize_mag = float3_normalize_mag_impl;
gFloat3.lerp = float3_lerp_impl;
}
}

View file

@ -62,6 +62,7 @@ namespace math_backend::float4::dispatch
gFloat4.length = float4_length_impl;
gFloat4.lengthSquared = float4_length_squared_impl;
gFloat4.normalize = float4_normalize_impl;
gFloat4.normalize_mag = float4_normalize_mag_impl;
gFloat4.lerp = float4_lerp_impl;
}
}

View file

@ -53,6 +53,7 @@ namespace math_backend::float4::dispatch
gFloat4.length = float4_length_impl;
gFloat4.lengthSquared = float4_length_squared_impl;
gFloat4.normalize = float4_normalize_impl;
gFloat4.normalize_mag = float4_normalize_mag_impl;
gFloat4.lerp = float4_lerp_impl;
}
}

View file

@ -62,6 +62,7 @@ namespace math_backend::float4::dispatch
gFloat4.length = float4_length_impl;
gFloat4.lengthSquared = float4_length_squared_impl;
gFloat4.normalize = float4_normalize_impl;
gFloat4.normalize_mag = float4_normalize_mag_impl;
gFloat4.lerp = float4_lerp_impl;
}
}

View file

@ -29,6 +29,11 @@
#ifndef _MPOINT2_H_
#include "math/mPoint2.h"
#endif
#ifndef _MATH_BACKEND_H_
#include "math/public/math_backend.h"
#endif
#include <array>
//------------------------------------------------------------------------------
/// 3D integer point
@ -97,6 +102,7 @@ public:
class Point3D;
//------------------------------------------------------------------------------
using math_backend::float3::dispatch::gFloat3;
class Point3F
{
//-------------------------------------- Public data
@ -497,7 +503,8 @@ inline void Point3F::setMax(const Point3F& _test)
inline void Point3F::interpolate(const Point3F& _from, const Point3F& _to, F32 _factor)
{
AssertFatal(_factor >= 0.0f && _factor <= 1.0f, "Out of bound interpolation factor");
m_point3F_interpolate( _from, _to, _factor, *this);
gFloat3.lerp(_from, _to, _factor, *this);
}
inline void Point3F::zero()
@ -599,17 +606,17 @@ inline void Point3F::convolveInverse(const Point3F& c)
inline F32 Point3F::lenSquared() const
{
return (x * x) + (y * y) + (z * z);
return gFloat3.lengthSquared(*this);
}
inline F32 Point3F::len() const
{
return mSqrt(x*x + y*y + z*z);
return gFloat3.length(*this);
}
inline void Point3F::normalize()
{
m_point3F_normalize(*this);
gFloat3.normalize(*this);
}
inline F32 Point3F::magnitudeSafe() const
@ -626,18 +633,13 @@ inline F32 Point3F::magnitudeSafe() const
inline void Point3F::normalizeSafe()
{
F32 vmag = magnitudeSafe();
if( vmag > POINT_EPSILON )
{
*this *= F32(1.0 / vmag);
}
gFloat3.normalize(*this);
}
inline void Point3F::normalize(F32 val)
{
m_point3F_normalize_f(*this, val);
gFloat3.normalize_mag(*this, val);
}
inline bool Point3F::operator==(const Point3F& _test) const
@ -652,52 +654,49 @@ inline bool Point3F::operator!=(const Point3F& _test) const
inline Point3F Point3F::operator+(const Point3F& _add) const
{
return Point3F(x + _add.x, y + _add.y, z + _add.z);
Point3F temp;
gFloat3.add(*this, _add, temp);
return temp;
}
inline Point3F Point3F::operator-(const Point3F& _rSub) const
{
return Point3F(x - _rSub.x, y - _rSub.y, z - _rSub.z);
Point3F temp;
gFloat3.sub(*this, _rSub, temp);
return temp;
}
inline Point3F& Point3F::operator+=(const Point3F& _add)
{
x += _add.x;
y += _add.y;
z += _add.z;
gFloat3.add(*this, _add, *this);
return *this;
}
inline Point3F& Point3F::operator-=(const Point3F& _rSub)
{
x -= _rSub.x;
y -= _rSub.y;
z -= _rSub.z;
gFloat3.sub(*this, _rSub, *this);
return *this;
}
inline Point3F Point3F::operator*(F32 _mul) const
{
return Point3F(x * _mul, y * _mul, z * _mul);
Point3F temp;
gFloat3.mul_scalar(*this, _mul, temp);
return temp;
}
inline Point3F Point3F::operator/(F32 _div) const
{
AssertFatal(_div != 0.0f, "Error, div by zero attempted");
F32 inv = 1.0f / _div;
return Point3F(x * inv, y * inv, z * inv);
Point3F temp;
gFloat3.div_scalar(*this, _div, temp);
return temp;
}
inline Point3F& Point3F::operator*=(F32 _mul)
{
x *= _mul;
y *= _mul;
z *= _mul;
gFloat3.mul_scalar(*this, _mul, *this);
return *this;
}
@ -705,39 +704,35 @@ inline Point3F& Point3F::operator/=(F32 _div)
{
AssertFatal(_div != 0.0f, "Error, div by zero attempted");
F32 inv = 1.0f / _div;
x *= inv;
y *= inv;
z *= inv;
gFloat3.div_scalar(*this, _div, *this);
return *this;
}
inline Point3F Point3F::operator*(const Point3F &_vec) const
{
return Point3F(x * _vec.x, y * _vec.y, z * _vec.z);
Point3F temp;
gFloat3.mul(*this, _vec, temp);
return temp;
}
inline Point3F& Point3F::operator*=(const Point3F &_vec)
{
x *= _vec.x;
y *= _vec.y;
z *= _vec.z;
gFloat3.mul(*this, _vec, *this);
return *this;
}
inline Point3F Point3F::operator/(const Point3F &_vec) const
{
AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted");
return Point3F(x / _vec.x, y / _vec.y, z / _vec.z);
Point3F temp;
gFloat3.div(*this, _vec, temp);
return temp;
}
inline Point3F& Point3F::operator/=(const Point3F &_vec)
{
AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted");
x /= _vec.x;
y /= _vec.y;
z /= _vec.z;
gFloat3.div(*this, _vec, *this);
return *this;
}
@ -989,7 +984,9 @@ inline Point3I operator*(S32 mul, const Point3I& multiplicand)
inline Point3F operator*(F32 mul, const Point3F& multiplicand)
{
return multiplicand * mul;
Point3F temp;
gFloat3.mul_scalar(multiplicand, mul, temp);
return temp;
}
inline Point3D operator*(F64 mul, const Point3D& multiplicand)
@ -999,7 +996,7 @@ inline Point3D operator*(F64 mul, const Point3D& multiplicand)
inline F32 mDot(const Point3F &p1, const Point3F &p2)
{
return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z);
return gFloat3.dot(p1, p2);
}
inline F64 mDot(const Point3D &p1, const Point3D &p2)

View file

@ -26,10 +26,12 @@
#ifndef _MMATHFN_H_
#include "math/mMathFn.h"
#endif
#ifndef _MPOINT3_H_
#include "math/mPoint3.h"
#endif
#ifndef _MATH_BACKEND_H_
#include "math/public/math_backend.h"
#endif
//------------------------------------------------------------------------------
@ -61,6 +63,8 @@ class Point4I
/// Uses F32 internally.
///
/// Useful for representing quaternions and other 4d beasties.
using math_backend::float4::dispatch::gFloat4;
class Point4F
{
//-------------------------------------- Public data
@ -152,15 +156,12 @@ inline void Point4F::set(F32 _x, F32 _y, F32 _z, F32 _w)
inline F32 Point4F::len() const
{
return mSqrt(x*x + y*y + z*z + w*w);
return gFloat4.length(*this);
}
inline void Point4F::interpolate(const Point4F& _from, const Point4F& _to, F32 _factor)
{
x = (_from.x * (1.0f - _factor)) + (_to.x * _factor);
y = (_from.y * (1.0f - _factor)) + (_to.y * _factor);
z = (_from.z * (1.0f - _factor)) + (_to.z * _factor);
w = (_from.w * (1.0f - _factor)) + (_to.w * _factor);
gFloat4.lerp(_from, _to, _factor, *this);
}
inline void Point4F::zero()
@ -193,55 +194,55 @@ inline Point4F& Point4F::operator/=(F32 scalar)
if (mIsZero(scalar))
return *this;
F32 denom = 1 / scalar;
x *= denom;
y *= denom;
z *= denom;
w *= denom;
gFloat4.div_scalar(*this, scalar, *this);
return *this;
}
inline Point4F Point4F::operator+(const Point4F& _add) const
{
return Point4F( x + _add.x, y + _add.y, z + _add.z, w + _add.w );
Point4F res;
gFloat4.add(*this, _add, res);
return res;
}
inline Point4F& Point4F::operator+=(const Point4F& _add)
{
x += _add.x;
y += _add.y;
z += _add.z;
w += _add.w;
gFloat4.add(*this, _add, *this);
return *this;
}
inline Point4F Point4F::operator-(const Point4F& _rSub) const
{
return Point4F( x - _rSub.x, y - _rSub.y, z - _rSub.z, w - _rSub.w );
Point4F res;
gFloat4.sub(*this, _rSub, res);
return res;
}
inline Point4F Point4F::operator*(const Point4F &_vec) const
{
return Point4F(x * _vec.x, y * _vec.y, z * _vec.z, w * _vec.w);
Point4F res;
gFloat4.mul(*this, _vec, res);
return res;
}
inline Point4F Point4F::operator*(F32 _mul) const
{
return Point4F(x * _mul, y * _mul, z * _mul, w * _mul);
Point4F res;
gFloat4.mul_scalar(*this, _mul, res);
return res;
}
inline Point4F Point4F::operator /(F32 t) const
{
F32 f = 1.0f / t;
return Point4F( x * f, y * f, z * f, w * f );
Point4F res;
gFloat4.div_scalar(*this, t, res);
return res;
}
inline F32 mDot(const Point4F &p1, const Point4F &p2)
{
return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z + p1.w*p2.w);
return gFloat4.dot(p1, p2);
}
//------------------------------------------------------------------------------

View file

@ -0,0 +1,38 @@
#pragma once
#ifndef _FLOAT3_DISPATCH_H_
#define _FLOAT3_DISPATCH_H_
#include <cstdint>
namespace math_backend::float3::dispatch
{
struct Float3Funcs
{
void (*add)(const float*, const float*, float*) = nullptr;
void (*sub)(const float*, const float*, float*) = nullptr;
void (*mul)(const float*, const float*, float*) = nullptr;
void (*mul_scalar)(const float*, float, float*) = nullptr;
void (*div)(const float*, const float*, float*) = nullptr;
void (*div_scalar)(const float*, float, float*) = nullptr;
float (*dot)(const float*, const float*) = nullptr;
float (*length)(const float*) = nullptr;
float (*lengthSquared)(const float*) = nullptr;
void (*normalize)(float*) = nullptr;
void (*normalize_mag)(float*, float) = nullptr;
void (*lerp)(const float*, const float*, float, float*) = nullptr;
};
// Global dispatch table
extern Float3Funcs gFloat3;
// Backend installers (defined in ISA libraries)
void install_scalar();
void install_sse2();
void install_sse41();
void install_avx();
void install_avx2();
void install_neon();
}
#endif // !_FLOAT4_DISPATCH_H_

View file

@ -1,7 +1,14 @@
#include "math/public/float4_dispatch.h"
#include "math/public/float3_dispatch.h"
namespace math_backend::float4::dispatch
{
// Single definition of the global dispatch table
Float4Funcs gFloat4{};
}
namespace math_backend::float3::dispatch
{
// Single definition of the global dispatch table
Float3Funcs gFloat3{};
}

View file

@ -19,6 +19,7 @@ namespace math_backend::float4::dispatch
float (*length)(const float*) = nullptr;
float (*lengthSquared)(const float*) = nullptr;
void (*normalize)(float*) = nullptr;
void (*normalize_mag)(float*, float) = nullptr;
void (*lerp)(const float*, const float*, float, float*) = nullptr;
};
@ -32,9 +33,6 @@ namespace math_backend::float4::dispatch
void install_avx();
void install_avx2();
void install_neon();
// Centralized installer (engine calls this once)
void install_preferred();
}
#endif // !_FLOAT4_DISPATCH_H_

View file

@ -28,6 +28,7 @@ void math_backend::install_from_cpu_flags(uint32_t cpu_flags)
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86)
case backend::avx2:
float4::dispatch::install_avx2();
float3::dispatch::install_avx2();
break;
case backend::avx:

View file

@ -1,4 +1,7 @@
#pragma once
#ifndef _MATH_BACKEND_H_
#define _MATH_BACKEND_H_
#ifndef _MCONSTANTS_H_
#include "math/mConstants.h"
#endif
@ -8,6 +11,9 @@
#ifndef _FLOAT4_DISPATCH_H_
#include "math/public/float4_dispatch.h"
#endif
#ifndef _FLOAT3_DISPATCH_H_
#include "math/public/float3_dispatch.h"
#endif
namespace math_backend
{
@ -25,3 +31,5 @@ namespace math_backend
backend choose_backend(U32 cpu_flags);
void install_from_cpu_flags(uint32_t cpu_flags);
}
#endif // !_MATH_BACKEND_H_

View file

@ -25,7 +25,7 @@
#include "console/engineAPI.h"
#include "math/mMath.h"
#include "math/public/math_backend.h"
extern void mInstallLibrary_C();
extern void mInstallLibrary_ASM();
@ -98,6 +98,8 @@ void Math::init(U32 properties)
Con::printf(" Installing Standard C extensions");
mInstallLibrary_C();
math_backend::install_from_cpu_flags(properties);
Con::printf(" Installing Assembly extensions");
mInstallLibrary_ASM();