From 18d0aa0418e0ce228fa1beb8b906ed8ae584031b Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Thu, 26 Feb 2026 21:11:31 +0000 Subject: [PATCH] further sse simd additions avx2 float3 added added normalize_magnitude added divide fast to float3 may copy to float4 move static spheremesh to drawSphere (initialize on first use) so platform has a chance to load the math backend --- Engine/source/gfx/gfxDrawUtil.cpp | 8 +- Engine/source/math/impl/float3_impl.inl | 115 ++++++++++++++++++ Engine/source/math/impl/float4_c.cpp | 7 +- Engine/source/math/isa/avx/float4.cpp | 1 + Engine/source/math/isa/avx2/float3.cpp | 84 +++++++++++++ Engine/source/math/isa/avx2/float4.cpp | 1 + Engine/source/math/isa/sse2/float4.cpp | 1 + Engine/source/math/isa/sse41/float4.cpp | 1 + Engine/source/math/mPoint3.h | 85 +++++++------ Engine/source/math/mPoint4.h | 49 ++++---- Engine/source/math/public/float3_dispatch.h | 38 ++++++ Engine/source/math/public/float4_dispatch.cpp | 7 ++ Engine/source/math/public/float4_dispatch.h | 4 +- Engine/source/math/public/math_backend.cpp | 1 + Engine/source/math/public/math_backend.h | 8 ++ Engine/source/platformWin32/winMath.cpp | 4 +- 16 files changed, 337 insertions(+), 77 deletions(-) create mode 100644 Engine/source/math/impl/float3_impl.inl create mode 100644 Engine/source/math/isa/avx2/float3.cpp create mode 100644 Engine/source/math/public/float3_dispatch.h diff --git a/Engine/source/gfx/gfxDrawUtil.cpp b/Engine/source/gfx/gfxDrawUtil.cpp index 16b886a55..3dce4b3af 100644 --- a/Engine/source/gfx/gfxDrawUtil.cpp +++ b/Engine/source/gfx/gfxDrawUtil.cpp @@ -853,7 +853,11 @@ void GFXDrawUtil::drawThickLine(F32 x1, F32 y1, F32 z1, F32 x2, F32 y2, F32 z2, // 3D World Draw Misc //----------------------------------------------------------------------------- -static SphereMesh gSphere; +SphereMesh& getSphere() +{ + static SphereMesh instance; + return instance; +} void GFXDrawUtil::drawSphere( const GFXStateBlockDesc &desc, F32 radius, const Point3F &pos, const ColorI &color, bool drawTop, bool drawBottom, const MatrixF *xfm ) { @@ -868,7 +872,7 @@ void GFXDrawUtil::drawSphere( const GFXStateBlockDesc &desc, F32 radius, const P GFX->pushWorldMatrix(); GFX->multWorld(mat); - const SphereMesh::TriangleMesh * sphereMesh = gSphere.getMesh(2); + const SphereMesh::TriangleMesh * sphereMesh = getSphere().getMesh(2); S32 numPoly = sphereMesh->numPoly; S32 totalPoly = 0; GFXVertexBufferHandle verts(mDevice, numPoly*3, GFXBufferTypeVolatile); diff --git a/Engine/source/math/impl/float3_impl.inl b/Engine/source/math/impl/float3_impl.inl new file mode 100644 index 000000000..14fce3f3f --- /dev/null +++ b/Engine/source/math/impl/float3_impl.inl @@ -0,0 +1,115 @@ +#pragma once +#include // for sqrtf, etc. +#include "../mConstants.h" + +// Safely loads a float3 -> simd 4 lane backend +namespace math_backend::float3 +{ + //---------------------------------------------------------- + // Add two float4 vectors: r = a + b + inline void float3_add_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3(a); + f32x4 vb = v_load3(b); + f32x4 vr = v_add(va, vb); + v_store3(r, vr); + } + + // Subtract: r = a - b + inline void float3_sub_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3(a); + f32x4 vb = v_load3(b); + f32x4 vr = v_sub(va, vb); + v_store3(r, vr); + } + + // Multiply element-wise: r = a * b + inline void float3_mul_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3(a); + f32x4 vb = v_load3(b); + f32x4 vr = v_mul(va, vb); + v_store3(r, vr); + } + + // Multiply by scalar: r = a * s + inline void float3_mul_scalar_impl(const float* a, float s, float* r) + { + f32x4 va = v_load3(a); + f32x4 vs = v_set1(s); + f32x4 vr = v_mul(va, vs); + v_store3(r, vr); + } + + // Divide element-wise: r = a / b + inline void float3_div_impl(const float* a, const float* b, float* r) + { + f32x4 va = v_load3(a); + f32x4 vb = v_load3(b); + f32x4 vr = v_div_fast(va, vb); + v_store3(r, vr); + } + + // Divide by scalar: r = a / s + inline void float3_div_scalar_impl(const float* a, float s, float* r) + { + f32x4 va = v_load3(a); + f32x4 vs = v_set1(s); + f32x4 vr = v_div_fast(va, vs); + v_store3(r, vr); + } + + // Dot product: returns scalar + inline float float3_dot_impl(const float* a, const float* b) + { + f32x4 va = v_load3(a); + f32x4 vb = v_load3(b); + f32x4 vmul = v_mul(va, vb); + f32x4 vsum = v_hadd3(vmul); + return v_extract0(vsum); // first lane is the sum of 3 elements + } + + // Length squared + inline float float3_length_squared_impl(const float* a) + { + return float3_dot_impl(a, a); + } + + // Length + inline float float3_length_impl(const float* a) + { + return std::sqrt(float3_length_squared_impl(a)); + } + + // Normalize in-place + inline void float3_normalize_impl(float* a) + { + float len = float3_length_impl(a); + if (len > POINT_EPSILON) // safe threshold + { + float3_mul_scalar_impl(a, 1.0f / len, a); + } + } + + // Normalize with magnitude: r = normalize(a) * r + inline void float3_normalize_mag_impl(float* a, float r) + { + float len = float3_length_impl(a); + if (len > POINT_EPSILON) + { + float3_mul_scalar_impl(a, r / len, a); + } + } + + // Linear interpolation: r = from + (to - from) * f + inline void float3_lerp_impl(const float* from, const float* to, float f, float* r) + { + f32x4 vfrom = v_load3(from); + f32x4 vto = v_load3(to); + f32x4 vf = v_set1(f); + f32x4 vr = v_add(vfrom, v_mul(vf, v_sub(vto, vfrom))); + v_store3(r, vr); + } + +} diff --git a/Engine/source/math/impl/float4_c.cpp b/Engine/source/math/impl/float4_c.cpp index b3d6559f9..9063d20fa 100644 --- a/Engine/source/math/impl/float4_c.cpp +++ b/Engine/source/math/impl/float4_c.cpp @@ -1,6 +1,6 @@ #include "math/public/float4_dispatch.h" #include "math/mConstants.h" -#include +#include // for sqrtf, etc. namespace math_backend::float4::dispatch { @@ -27,7 +27,8 @@ namespace math_backend::float4::dispatch }; gFloat4.div_scalar = [](const float* a, float s, float* r) { - for (int i = 0; i < 4; i++) r[i] = a[i] / s; + float denom = 1.0f / s; + for (int i = 0; i < 4; i++) r[i] = a[i] * denom; }; gFloat4.dot = [](const float* a, const float* b) { @@ -39,7 +40,7 @@ namespace math_backend::float4::dispatch gFloat4.length = [](const float* a) { float sum = 0.f; for (int i = 0; i < 4; i++) sum += a[i] * a[i]; - return sqrtf(sum); + return std::sqrt(sum); }; gFloat4.lengthSquared = [](const float* a) { diff --git a/Engine/source/math/isa/avx/float4.cpp b/Engine/source/math/isa/avx/float4.cpp index 1e23fb8b1..08f534718 100644 --- a/Engine/source/math/isa/avx/float4.cpp +++ b/Engine/source/math/isa/avx/float4.cpp @@ -62,6 +62,7 @@ namespace math_backend::float4::dispatch gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; gFloat4.lerp = float4_lerp_impl; } } diff --git a/Engine/source/math/isa/avx2/float3.cpp b/Engine/source/math/isa/avx2/float3.cpp new file mode 100644 index 000000000..46b64a7dd --- /dev/null +++ b/Engine/source/math/isa/avx2/float3.cpp @@ -0,0 +1,84 @@ +#include "float3_dispatch.h" +#include // AVX/AVX2 intrinsics + +namespace +{ + typedef __m128 f32x4; + + // Load 3 floats into 4-wide SIMD, zero the 4th lane + inline f32x4 v_load3(const float* p) { return _mm_set_ps(0.0f, p[2], p[1], p[0]); } + + // Store 3 floats from SIMD register back to memory + inline void v_store3(float* dst, f32x4 v) + { + alignas(16) float tmp[4]; // temp storage + _mm_store_ps(tmp, v); // store all 4 lanes + dst[0] = tmp[0]; + dst[1] = tmp[1]; + dst[2] = tmp[2]; + } + + // extract just the first lane. + inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); } + + // Broadcast a single float across all 4 lanes + inline f32x4 v_set1(float s) { return _mm_set1_ps(s); } + + // Element-wise multiply + inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); } + + // Element-wise divide precise + inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); } + + // Element-wise divide fast (1/b) + inline f32x4 v_div_fast(f32x4 a, f32x4 b) + { + f32x4 rcp = _mm_rcp_ps(b); + // Optional refinement here + return _mm_mul_ps(a, rcp); + } + + // Element-wise add + inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); } + + // Element-wise subtract + inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); } + + // Horizontal sum of all elements (for dot product, length, etc.) + inline f32x4 v_hadd3(f32x4 a) + { + __m128 t1 = _mm_hadd_ps(a, a); // sums pairs: [a0+a1, a2+a3, ...] + __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3 + return t2; + } + + float float3_dot_avx(const float* a, const float* b) + { + f32x4 va = _mm_loadu_ps(a); + f32x4 vb = _mm_loadu_ps(b); + __m128 dp = _mm_dp_ps(va, vb, 0x71); // multiply 3 (0x71), sum all 4, lowest lane + return _mm_cvtss_f32(dp); + } +} + +#include "float3_impl.inl" + +namespace math_backend::float3::dispatch +{ + // Install AVX2 backend + void install_avx2() + { + gFloat3.add = float3_add_impl; + gFloat3.sub = float3_sub_impl; + gFloat3.mul = float3_mul_impl; + gFloat3.mul_scalar = float3_mul_scalar_impl; + gFloat3.div = float3_div_impl; + gFloat3.div_scalar = float3_div_scalar_impl; + gFloat3.dot = float3_dot_avx; + gFloat3.length = float3_length_impl; + gFloat3.lengthSquared = float3_length_squared_impl; + gFloat3.normalize = float3_normalize_impl; + gFloat3.normalize_mag = float3_normalize_mag_impl; + gFloat3.lerp = float3_lerp_impl; + } +} diff --git a/Engine/source/math/isa/avx2/float4.cpp b/Engine/source/math/isa/avx2/float4.cpp index 439d0e2d0..a1e1836b1 100644 --- a/Engine/source/math/isa/avx2/float4.cpp +++ b/Engine/source/math/isa/avx2/float4.cpp @@ -62,6 +62,7 @@ namespace math_backend::float4::dispatch gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; gFloat4.lerp = float4_lerp_impl; } } diff --git a/Engine/source/math/isa/sse2/float4.cpp b/Engine/source/math/isa/sse2/float4.cpp index 00850560a..22a4f76bd 100644 --- a/Engine/source/math/isa/sse2/float4.cpp +++ b/Engine/source/math/isa/sse2/float4.cpp @@ -53,6 +53,7 @@ namespace math_backend::float4::dispatch gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; gFloat4.lerp = float4_lerp_impl; } } diff --git a/Engine/source/math/isa/sse41/float4.cpp b/Engine/source/math/isa/sse41/float4.cpp index 80127acb9..bf4c648ad 100644 --- a/Engine/source/math/isa/sse41/float4.cpp +++ b/Engine/source/math/isa/sse41/float4.cpp @@ -62,6 +62,7 @@ namespace math_backend::float4::dispatch gFloat4.length = float4_length_impl; gFloat4.lengthSquared = float4_length_squared_impl; gFloat4.normalize = float4_normalize_impl; + gFloat4.normalize_mag = float4_normalize_mag_impl; gFloat4.lerp = float4_lerp_impl; } } diff --git a/Engine/source/math/mPoint3.h b/Engine/source/math/mPoint3.h index c1bb72923..6c3b19899 100644 --- a/Engine/source/math/mPoint3.h +++ b/Engine/source/math/mPoint3.h @@ -29,6 +29,11 @@ #ifndef _MPOINT2_H_ #include "math/mPoint2.h" #endif +#ifndef _MATH_BACKEND_H_ +#include "math/public/math_backend.h" +#endif + +#include //------------------------------------------------------------------------------ /// 3D integer point @@ -97,6 +102,7 @@ public: class Point3D; //------------------------------------------------------------------------------ +using math_backend::float3::dispatch::gFloat3; class Point3F { //-------------------------------------- Public data @@ -497,7 +503,8 @@ inline void Point3F::setMax(const Point3F& _test) inline void Point3F::interpolate(const Point3F& _from, const Point3F& _to, F32 _factor) { AssertFatal(_factor >= 0.0f && _factor <= 1.0f, "Out of bound interpolation factor"); - m_point3F_interpolate( _from, _to, _factor, *this); + + gFloat3.lerp(_from, _to, _factor, *this); } inline void Point3F::zero() @@ -599,17 +606,17 @@ inline void Point3F::convolveInverse(const Point3F& c) inline F32 Point3F::lenSquared() const { - return (x * x) + (y * y) + (z * z); + return gFloat3.lengthSquared(*this); } inline F32 Point3F::len() const { - return mSqrt(x*x + y*y + z*z); + return gFloat3.length(*this); } inline void Point3F::normalize() { - m_point3F_normalize(*this); + gFloat3.normalize(*this); } inline F32 Point3F::magnitudeSafe() const @@ -626,18 +633,13 @@ inline F32 Point3F::magnitudeSafe() const inline void Point3F::normalizeSafe() { - F32 vmag = magnitudeSafe(); - - if( vmag > POINT_EPSILON ) - { - *this *= F32(1.0 / vmag); - } + gFloat3.normalize(*this); } inline void Point3F::normalize(F32 val) { - m_point3F_normalize_f(*this, val); + gFloat3.normalize_mag(*this, val); } inline bool Point3F::operator==(const Point3F& _test) const @@ -652,52 +654,49 @@ inline bool Point3F::operator!=(const Point3F& _test) const inline Point3F Point3F::operator+(const Point3F& _add) const { - return Point3F(x + _add.x, y + _add.y, z + _add.z); + Point3F temp; + gFloat3.add(*this, _add, temp); + return temp; } inline Point3F Point3F::operator-(const Point3F& _rSub) const { - return Point3F(x - _rSub.x, y - _rSub.y, z - _rSub.z); + Point3F temp; + gFloat3.sub(*this, _rSub, temp); + return temp; } inline Point3F& Point3F::operator+=(const Point3F& _add) { - x += _add.x; - y += _add.y; - z += _add.z; - + gFloat3.add(*this, _add, *this); return *this; } inline Point3F& Point3F::operator-=(const Point3F& _rSub) { - x -= _rSub.x; - y -= _rSub.y; - z -= _rSub.z; - + gFloat3.sub(*this, _rSub, *this); return *this; } inline Point3F Point3F::operator*(F32 _mul) const { - return Point3F(x * _mul, y * _mul, z * _mul); + Point3F temp; + gFloat3.mul_scalar(*this, _mul, temp); + return temp; } inline Point3F Point3F::operator/(F32 _div) const { AssertFatal(_div != 0.0f, "Error, div by zero attempted"); - F32 inv = 1.0f / _div; - - return Point3F(x * inv, y * inv, z * inv); + Point3F temp; + gFloat3.div_scalar(*this, _div, temp); + return temp; } inline Point3F& Point3F::operator*=(F32 _mul) { - x *= _mul; - y *= _mul; - z *= _mul; - + gFloat3.mul_scalar(*this, _mul, *this); return *this; } @@ -705,39 +704,35 @@ inline Point3F& Point3F::operator/=(F32 _div) { AssertFatal(_div != 0.0f, "Error, div by zero attempted"); - F32 inv = 1.0f / _div; - x *= inv; - y *= inv; - z *= inv; - + gFloat3.div_scalar(*this, _div, *this); return *this; } inline Point3F Point3F::operator*(const Point3F &_vec) const { - return Point3F(x * _vec.x, y * _vec.y, z * _vec.z); + Point3F temp; + gFloat3.mul(*this, _vec, temp); + return temp; } inline Point3F& Point3F::operator*=(const Point3F &_vec) { - x *= _vec.x; - y *= _vec.y; - z *= _vec.z; + gFloat3.mul(*this, _vec, *this); return *this; } inline Point3F Point3F::operator/(const Point3F &_vec) const { AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted"); - return Point3F(x / _vec.x, y / _vec.y, z / _vec.z); + Point3F temp; + gFloat3.div(*this, _vec, temp); + return temp; } inline Point3F& Point3F::operator/=(const Point3F &_vec) { AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted"); - x /= _vec.x; - y /= _vec.y; - z /= _vec.z; + gFloat3.div(*this, _vec, *this); return *this; } @@ -989,7 +984,9 @@ inline Point3I operator*(S32 mul, const Point3I& multiplicand) inline Point3F operator*(F32 mul, const Point3F& multiplicand) { - return multiplicand * mul; + Point3F temp; + gFloat3.mul_scalar(multiplicand, mul, temp); + return temp; } inline Point3D operator*(F64 mul, const Point3D& multiplicand) @@ -999,7 +996,7 @@ inline Point3D operator*(F64 mul, const Point3D& multiplicand) inline F32 mDot(const Point3F &p1, const Point3F &p2) { - return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z); + return gFloat3.dot(p1, p2); } inline F64 mDot(const Point3D &p1, const Point3D &p2) diff --git a/Engine/source/math/mPoint4.h b/Engine/source/math/mPoint4.h index 8ae173009..0f715f983 100644 --- a/Engine/source/math/mPoint4.h +++ b/Engine/source/math/mPoint4.h @@ -26,10 +26,12 @@ #ifndef _MMATHFN_H_ #include "math/mMathFn.h" #endif - #ifndef _MPOINT3_H_ #include "math/mPoint3.h" #endif +#ifndef _MATH_BACKEND_H_ +#include "math/public/math_backend.h" +#endif //------------------------------------------------------------------------------ @@ -61,6 +63,8 @@ class Point4I /// Uses F32 internally. /// /// Useful for representing quaternions and other 4d beasties. +using math_backend::float4::dispatch::gFloat4; + class Point4F { //-------------------------------------- Public data @@ -152,15 +156,12 @@ inline void Point4F::set(F32 _x, F32 _y, F32 _z, F32 _w) inline F32 Point4F::len() const { - return mSqrt(x*x + y*y + z*z + w*w); + return gFloat4.length(*this); } inline void Point4F::interpolate(const Point4F& _from, const Point4F& _to, F32 _factor) { - x = (_from.x * (1.0f - _factor)) + (_to.x * _factor); - y = (_from.y * (1.0f - _factor)) + (_to.y * _factor); - z = (_from.z * (1.0f - _factor)) + (_to.z * _factor); - w = (_from.w * (1.0f - _factor)) + (_to.w * _factor); + gFloat4.lerp(_from, _to, _factor, *this); } inline void Point4F::zero() @@ -193,55 +194,55 @@ inline Point4F& Point4F::operator/=(F32 scalar) if (mIsZero(scalar)) return *this; - F32 denom = 1 / scalar; - - x *= denom; - y *= denom; - z *= denom; - w *= denom; + gFloat4.div_scalar(*this, scalar, *this); return *this; } inline Point4F Point4F::operator+(const Point4F& _add) const { - return Point4F( x + _add.x, y + _add.y, z + _add.z, w + _add.w ); + Point4F res; + gFloat4.add(*this, _add, res); + return res; } inline Point4F& Point4F::operator+=(const Point4F& _add) { - x += _add.x; - y += _add.y; - z += _add.z; - w += _add.w; - + gFloat4.add(*this, _add, *this); return *this; } inline Point4F Point4F::operator-(const Point4F& _rSub) const { - return Point4F( x - _rSub.x, y - _rSub.y, z - _rSub.z, w - _rSub.w ); + Point4F res; + gFloat4.sub(*this, _rSub, res); + return res; } inline Point4F Point4F::operator*(const Point4F &_vec) const { - return Point4F(x * _vec.x, y * _vec.y, z * _vec.z, w * _vec.w); + Point4F res; + gFloat4.mul(*this, _vec, res); + return res; } inline Point4F Point4F::operator*(F32 _mul) const { - return Point4F(x * _mul, y * _mul, z * _mul, w * _mul); + Point4F res; + gFloat4.mul_scalar(*this, _mul, res); + return res; } inline Point4F Point4F::operator /(F32 t) const { - F32 f = 1.0f / t; - return Point4F( x * f, y * f, z * f, w * f ); + Point4F res; + gFloat4.div_scalar(*this, t, res); + return res; } inline F32 mDot(const Point4F &p1, const Point4F &p2) { - return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z + p1.w*p2.w); + return gFloat4.dot(p1, p2); } //------------------------------------------------------------------------------ diff --git a/Engine/source/math/public/float3_dispatch.h b/Engine/source/math/public/float3_dispatch.h new file mode 100644 index 000000000..9735c47d6 --- /dev/null +++ b/Engine/source/math/public/float3_dispatch.h @@ -0,0 +1,38 @@ +#pragma once +#ifndef _FLOAT3_DISPATCH_H_ +#define _FLOAT3_DISPATCH_H_ + + +#include + +namespace math_backend::float3::dispatch +{ + struct Float3Funcs + { + void (*add)(const float*, const float*, float*) = nullptr; + void (*sub)(const float*, const float*, float*) = nullptr; + void (*mul)(const float*, const float*, float*) = nullptr; + void (*mul_scalar)(const float*, float, float*) = nullptr; + void (*div)(const float*, const float*, float*) = nullptr; + void (*div_scalar)(const float*, float, float*) = nullptr; + float (*dot)(const float*, const float*) = nullptr; + float (*length)(const float*) = nullptr; + float (*lengthSquared)(const float*) = nullptr; + void (*normalize)(float*) = nullptr; + void (*normalize_mag)(float*, float) = nullptr; + void (*lerp)(const float*, const float*, float, float*) = nullptr; + }; + + // Global dispatch table + extern Float3Funcs gFloat3; + + // Backend installers (defined in ISA libraries) + void install_scalar(); + void install_sse2(); + void install_sse41(); + void install_avx(); + void install_avx2(); + void install_neon(); +} + +#endif // !_FLOAT4_DISPATCH_H_ diff --git a/Engine/source/math/public/float4_dispatch.cpp b/Engine/source/math/public/float4_dispatch.cpp index 810eb0e46..283ff22c2 100644 --- a/Engine/source/math/public/float4_dispatch.cpp +++ b/Engine/source/math/public/float4_dispatch.cpp @@ -1,7 +1,14 @@ #include "math/public/float4_dispatch.h" +#include "math/public/float3_dispatch.h" namespace math_backend::float4::dispatch { // Single definition of the global dispatch table Float4Funcs gFloat4{}; } + +namespace math_backend::float3::dispatch +{ + // Single definition of the global dispatch table + Float3Funcs gFloat3{}; +} diff --git a/Engine/source/math/public/float4_dispatch.h b/Engine/source/math/public/float4_dispatch.h index 319b1893f..f5214447f 100644 --- a/Engine/source/math/public/float4_dispatch.h +++ b/Engine/source/math/public/float4_dispatch.h @@ -19,6 +19,7 @@ namespace math_backend::float4::dispatch float (*length)(const float*) = nullptr; float (*lengthSquared)(const float*) = nullptr; void (*normalize)(float*) = nullptr; + void (*normalize_mag)(float*, float) = nullptr; void (*lerp)(const float*, const float*, float, float*) = nullptr; }; @@ -32,9 +33,6 @@ namespace math_backend::float4::dispatch void install_avx(); void install_avx2(); void install_neon(); - - // Centralized installer (engine calls this once) - void install_preferred(); } #endif // !_FLOAT4_DISPATCH_H_ diff --git a/Engine/source/math/public/math_backend.cpp b/Engine/source/math/public/math_backend.cpp index e7c6845c7..365bdb870 100644 --- a/Engine/source/math/public/math_backend.cpp +++ b/Engine/source/math/public/math_backend.cpp @@ -28,6 +28,7 @@ void math_backend::install_from_cpu_flags(uint32_t cpu_flags) #if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) case backend::avx2: float4::dispatch::install_avx2(); + float3::dispatch::install_avx2(); break; case backend::avx: diff --git a/Engine/source/math/public/math_backend.h b/Engine/source/math/public/math_backend.h index 40476e7f0..65244e151 100644 --- a/Engine/source/math/public/math_backend.h +++ b/Engine/source/math/public/math_backend.h @@ -1,4 +1,7 @@ #pragma once +#ifndef _MATH_BACKEND_H_ +#define _MATH_BACKEND_H_ + #ifndef _MCONSTANTS_H_ #include "math/mConstants.h" #endif @@ -8,6 +11,9 @@ #ifndef _FLOAT4_DISPATCH_H_ #include "math/public/float4_dispatch.h" #endif +#ifndef _FLOAT3_DISPATCH_H_ +#include "math/public/float3_dispatch.h" +#endif namespace math_backend { @@ -25,3 +31,5 @@ namespace math_backend backend choose_backend(U32 cpu_flags); void install_from_cpu_flags(uint32_t cpu_flags); } + +#endif // !_MATH_BACKEND_H_ diff --git a/Engine/source/platformWin32/winMath.cpp b/Engine/source/platformWin32/winMath.cpp index 44b215301..36a9d8c58 100644 --- a/Engine/source/platformWin32/winMath.cpp +++ b/Engine/source/platformWin32/winMath.cpp @@ -25,7 +25,7 @@ #include "console/engineAPI.h" #include "math/mMath.h" - +#include "math/public/math_backend.h" extern void mInstallLibrary_C(); extern void mInstallLibrary_ASM(); @@ -98,6 +98,8 @@ void Math::init(U32 properties) Con::printf(" Installing Standard C extensions"); mInstallLibrary_C(); + math_backend::install_from_cpu_flags(properties); + Con::printf(" Installing Assembly extensions"); mInstallLibrary_ASM();