mirror of
https://github.com/TorqueGameEngines/Torque3D.git
synced 2026-03-20 04:40:54 +00:00
further sse simd additions
avx2 float3 added added normalize_magnitude added divide fast to float3 may copy to float4 move static spheremesh to drawSphere (initialize on first use) so platform has a chance to load the math backend
This commit is contained in:
parent
2559145c06
commit
18d0aa0418
16 changed files with 337 additions and 77 deletions
115
Engine/source/math/impl/float3_impl.inl
Normal file
115
Engine/source/math/impl/float3_impl.inl
Normal file
|
|
@ -0,0 +1,115 @@
|
|||
#pragma once
|
||||
#include <cmath> // for sqrtf, etc.
|
||||
#include "../mConstants.h"
|
||||
|
||||
// Safely loads a float3 -> simd 4 lane backend
|
||||
namespace math_backend::float3
|
||||
{
|
||||
//----------------------------------------------------------
|
||||
// Add two float4 vectors: r = a + b
|
||||
inline void float3_add_impl(const float* a, const float* b, float* r)
|
||||
{
|
||||
f32x4 va = v_load3(a);
|
||||
f32x4 vb = v_load3(b);
|
||||
f32x4 vr = v_add(va, vb);
|
||||
v_store3(r, vr);
|
||||
}
|
||||
|
||||
// Subtract: r = a - b
|
||||
inline void float3_sub_impl(const float* a, const float* b, float* r)
|
||||
{
|
||||
f32x4 va = v_load3(a);
|
||||
f32x4 vb = v_load3(b);
|
||||
f32x4 vr = v_sub(va, vb);
|
||||
v_store3(r, vr);
|
||||
}
|
||||
|
||||
// Multiply element-wise: r = a * b
|
||||
inline void float3_mul_impl(const float* a, const float* b, float* r)
|
||||
{
|
||||
f32x4 va = v_load3(a);
|
||||
f32x4 vb = v_load3(b);
|
||||
f32x4 vr = v_mul(va, vb);
|
||||
v_store3(r, vr);
|
||||
}
|
||||
|
||||
// Multiply by scalar: r = a * s
|
||||
inline void float3_mul_scalar_impl(const float* a, float s, float* r)
|
||||
{
|
||||
f32x4 va = v_load3(a);
|
||||
f32x4 vs = v_set1(s);
|
||||
f32x4 vr = v_mul(va, vs);
|
||||
v_store3(r, vr);
|
||||
}
|
||||
|
||||
// Divide element-wise: r = a / b
|
||||
inline void float3_div_impl(const float* a, const float* b, float* r)
|
||||
{
|
||||
f32x4 va = v_load3(a);
|
||||
f32x4 vb = v_load3(b);
|
||||
f32x4 vr = v_div_fast(va, vb);
|
||||
v_store3(r, vr);
|
||||
}
|
||||
|
||||
// Divide by scalar: r = a / s
|
||||
inline void float3_div_scalar_impl(const float* a, float s, float* r)
|
||||
{
|
||||
f32x4 va = v_load3(a);
|
||||
f32x4 vs = v_set1(s);
|
||||
f32x4 vr = v_div_fast(va, vs);
|
||||
v_store3(r, vr);
|
||||
}
|
||||
|
||||
// Dot product: returns scalar
|
||||
inline float float3_dot_impl(const float* a, const float* b)
|
||||
{
|
||||
f32x4 va = v_load3(a);
|
||||
f32x4 vb = v_load3(b);
|
||||
f32x4 vmul = v_mul(va, vb);
|
||||
f32x4 vsum = v_hadd3(vmul);
|
||||
return v_extract0(vsum); // first lane is the sum of 3 elements
|
||||
}
|
||||
|
||||
// Length squared
|
||||
inline float float3_length_squared_impl(const float* a)
|
||||
{
|
||||
return float3_dot_impl(a, a);
|
||||
}
|
||||
|
||||
// Length
|
||||
inline float float3_length_impl(const float* a)
|
||||
{
|
||||
return std::sqrt(float3_length_squared_impl(a));
|
||||
}
|
||||
|
||||
// Normalize in-place
|
||||
inline void float3_normalize_impl(float* a)
|
||||
{
|
||||
float len = float3_length_impl(a);
|
||||
if (len > POINT_EPSILON) // safe threshold
|
||||
{
|
||||
float3_mul_scalar_impl(a, 1.0f / len, a);
|
||||
}
|
||||
}
|
||||
|
||||
// Normalize with magnitude: r = normalize(a) * r
|
||||
inline void float3_normalize_mag_impl(float* a, float r)
|
||||
{
|
||||
float len = float3_length_impl(a);
|
||||
if (len > POINT_EPSILON)
|
||||
{
|
||||
float3_mul_scalar_impl(a, r / len, a);
|
||||
}
|
||||
}
|
||||
|
||||
// Linear interpolation: r = from + (to - from) * f
|
||||
inline void float3_lerp_impl(const float* from, const float* to, float f, float* r)
|
||||
{
|
||||
f32x4 vfrom = v_load3(from);
|
||||
f32x4 vto = v_load3(to);
|
||||
f32x4 vf = v_set1(f);
|
||||
f32x4 vr = v_add(vfrom, v_mul(vf, v_sub(vto, vfrom)));
|
||||
v_store3(r, vr);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
#include "math/public/float4_dispatch.h"
|
||||
#include "math/mConstants.h"
|
||||
#include <math.h>
|
||||
#include <cmath> // for sqrtf, etc.
|
||||
|
||||
namespace math_backend::float4::dispatch
|
||||
{
|
||||
|
|
@ -27,7 +27,8 @@ namespace math_backend::float4::dispatch
|
|||
};
|
||||
|
||||
gFloat4.div_scalar = [](const float* a, float s, float* r) {
|
||||
for (int i = 0; i < 4; i++) r[i] = a[i] / s;
|
||||
float denom = 1.0f / s;
|
||||
for (int i = 0; i < 4; i++) r[i] = a[i] * denom;
|
||||
};
|
||||
|
||||
gFloat4.dot = [](const float* a, const float* b) {
|
||||
|
|
@ -39,7 +40,7 @@ namespace math_backend::float4::dispatch
|
|||
gFloat4.length = [](const float* a) {
|
||||
float sum = 0.f;
|
||||
for (int i = 0; i < 4; i++) sum += a[i] * a[i];
|
||||
return sqrtf(sum);
|
||||
return std::sqrt(sum);
|
||||
};
|
||||
|
||||
gFloat4.lengthSquared = [](const float* a) {
|
||||
|
|
|
|||
|
|
@ -62,6 +62,7 @@ namespace math_backend::float4::dispatch
|
|||
gFloat4.length = float4_length_impl;
|
||||
gFloat4.lengthSquared = float4_length_squared_impl;
|
||||
gFloat4.normalize = float4_normalize_impl;
|
||||
gFloat4.normalize_mag = float4_normalize_mag_impl;
|
||||
gFloat4.lerp = float4_lerp_impl;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
84
Engine/source/math/isa/avx2/float3.cpp
Normal file
84
Engine/source/math/isa/avx2/float3.cpp
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
#include "float3_dispatch.h"
|
||||
#include <immintrin.h> // AVX/AVX2 intrinsics
|
||||
|
||||
namespace
|
||||
{
|
||||
typedef __m128 f32x4;
|
||||
|
||||
// Load 3 floats into 4-wide SIMD, zero the 4th lane
|
||||
inline f32x4 v_load3(const float* p) { return _mm_set_ps(0.0f, p[2], p[1], p[0]); }
|
||||
|
||||
// Store 3 floats from SIMD register back to memory
|
||||
inline void v_store3(float* dst, f32x4 v)
|
||||
{
|
||||
alignas(16) float tmp[4]; // temp storage
|
||||
_mm_store_ps(tmp, v); // store all 4 lanes
|
||||
dst[0] = tmp[0];
|
||||
dst[1] = tmp[1];
|
||||
dst[2] = tmp[2];
|
||||
}
|
||||
|
||||
// extract just the first lane.
|
||||
inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }
|
||||
|
||||
// Broadcast a single float across all 4 lanes
|
||||
inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
|
||||
|
||||
// Element-wise multiply
|
||||
inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
|
||||
|
||||
// Element-wise divide precise
|
||||
inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
|
||||
|
||||
// Element-wise divide fast (1/b)
|
||||
inline f32x4 v_div_fast(f32x4 a, f32x4 b)
|
||||
{
|
||||
f32x4 rcp = _mm_rcp_ps(b);
|
||||
// Optional refinement here
|
||||
return _mm_mul_ps(a, rcp);
|
||||
}
|
||||
|
||||
// Element-wise add
|
||||
inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
|
||||
|
||||
// Element-wise subtract
|
||||
inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
|
||||
|
||||
// Horizontal sum of all elements (for dot product, length, etc.)
|
||||
inline f32x4 v_hadd3(f32x4 a)
|
||||
{
|
||||
__m128 t1 = _mm_hadd_ps(a, a); // sums pairs: [a0+a1, a2+a3, ...]
|
||||
__m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3
|
||||
return t2;
|
||||
}
|
||||
|
||||
float float3_dot_avx(const float* a, const float* b)
|
||||
{
|
||||
f32x4 va = _mm_loadu_ps(a);
|
||||
f32x4 vb = _mm_loadu_ps(b);
|
||||
__m128 dp = _mm_dp_ps(va, vb, 0x71); // multiply 3 (0x71), sum all 4, lowest lane
|
||||
return _mm_cvtss_f32(dp);
|
||||
}
|
||||
}
|
||||
|
||||
#include "float3_impl.inl"
|
||||
|
||||
namespace math_backend::float3::dispatch
|
||||
{
|
||||
// Install AVX2 backend
|
||||
void install_avx2()
|
||||
{
|
||||
gFloat3.add = float3_add_impl;
|
||||
gFloat3.sub = float3_sub_impl;
|
||||
gFloat3.mul = float3_mul_impl;
|
||||
gFloat3.mul_scalar = float3_mul_scalar_impl;
|
||||
gFloat3.div = float3_div_impl;
|
||||
gFloat3.div_scalar = float3_div_scalar_impl;
|
||||
gFloat3.dot = float3_dot_avx;
|
||||
gFloat3.length = float3_length_impl;
|
||||
gFloat3.lengthSquared = float3_length_squared_impl;
|
||||
gFloat3.normalize = float3_normalize_impl;
|
||||
gFloat3.normalize_mag = float3_normalize_mag_impl;
|
||||
gFloat3.lerp = float3_lerp_impl;
|
||||
}
|
||||
}
|
||||
|
|
@ -62,6 +62,7 @@ namespace math_backend::float4::dispatch
|
|||
gFloat4.length = float4_length_impl;
|
||||
gFloat4.lengthSquared = float4_length_squared_impl;
|
||||
gFloat4.normalize = float4_normalize_impl;
|
||||
gFloat4.normalize_mag = float4_normalize_mag_impl;
|
||||
gFloat4.lerp = float4_lerp_impl;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -53,6 +53,7 @@ namespace math_backend::float4::dispatch
|
|||
gFloat4.length = float4_length_impl;
|
||||
gFloat4.lengthSquared = float4_length_squared_impl;
|
||||
gFloat4.normalize = float4_normalize_impl;
|
||||
gFloat4.normalize_mag = float4_normalize_mag_impl;
|
||||
gFloat4.lerp = float4_lerp_impl;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -62,6 +62,7 @@ namespace math_backend::float4::dispatch
|
|||
gFloat4.length = float4_length_impl;
|
||||
gFloat4.lengthSquared = float4_length_squared_impl;
|
||||
gFloat4.normalize = float4_normalize_impl;
|
||||
gFloat4.normalize_mag = float4_normalize_mag_impl;
|
||||
gFloat4.lerp = float4_lerp_impl;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -29,6 +29,11 @@
|
|||
#ifndef _MPOINT2_H_
|
||||
#include "math/mPoint2.h"
|
||||
#endif
|
||||
#ifndef _MATH_BACKEND_H_
|
||||
#include "math/public/math_backend.h"
|
||||
#endif
|
||||
|
||||
#include <array>
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
/// 3D integer point
|
||||
|
|
@ -97,6 +102,7 @@ public:
|
|||
class Point3D;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
using math_backend::float3::dispatch::gFloat3;
|
||||
class Point3F
|
||||
{
|
||||
//-------------------------------------- Public data
|
||||
|
|
@ -497,7 +503,8 @@ inline void Point3F::setMax(const Point3F& _test)
|
|||
inline void Point3F::interpolate(const Point3F& _from, const Point3F& _to, F32 _factor)
|
||||
{
|
||||
AssertFatal(_factor >= 0.0f && _factor <= 1.0f, "Out of bound interpolation factor");
|
||||
m_point3F_interpolate( _from, _to, _factor, *this);
|
||||
|
||||
gFloat3.lerp(_from, _to, _factor, *this);
|
||||
}
|
||||
|
||||
inline void Point3F::zero()
|
||||
|
|
@ -599,17 +606,17 @@ inline void Point3F::convolveInverse(const Point3F& c)
|
|||
|
||||
inline F32 Point3F::lenSquared() const
|
||||
{
|
||||
return (x * x) + (y * y) + (z * z);
|
||||
return gFloat3.lengthSquared(*this);
|
||||
}
|
||||
|
||||
inline F32 Point3F::len() const
|
||||
{
|
||||
return mSqrt(x*x + y*y + z*z);
|
||||
return gFloat3.length(*this);
|
||||
}
|
||||
|
||||
inline void Point3F::normalize()
|
||||
{
|
||||
m_point3F_normalize(*this);
|
||||
gFloat3.normalize(*this);
|
||||
}
|
||||
|
||||
inline F32 Point3F::magnitudeSafe() const
|
||||
|
|
@ -626,18 +633,13 @@ inline F32 Point3F::magnitudeSafe() const
|
|||
|
||||
inline void Point3F::normalizeSafe()
|
||||
{
|
||||
F32 vmag = magnitudeSafe();
|
||||
|
||||
if( vmag > POINT_EPSILON )
|
||||
{
|
||||
*this *= F32(1.0 / vmag);
|
||||
}
|
||||
gFloat3.normalize(*this);
|
||||
}
|
||||
|
||||
|
||||
inline void Point3F::normalize(F32 val)
|
||||
{
|
||||
m_point3F_normalize_f(*this, val);
|
||||
gFloat3.normalize_mag(*this, val);
|
||||
}
|
||||
|
||||
inline bool Point3F::operator==(const Point3F& _test) const
|
||||
|
|
@ -652,52 +654,49 @@ inline bool Point3F::operator!=(const Point3F& _test) const
|
|||
|
||||
inline Point3F Point3F::operator+(const Point3F& _add) const
|
||||
{
|
||||
return Point3F(x + _add.x, y + _add.y, z + _add.z);
|
||||
Point3F temp;
|
||||
gFloat3.add(*this, _add, temp);
|
||||
return temp;
|
||||
}
|
||||
|
||||
inline Point3F Point3F::operator-(const Point3F& _rSub) const
|
||||
{
|
||||
return Point3F(x - _rSub.x, y - _rSub.y, z - _rSub.z);
|
||||
Point3F temp;
|
||||
gFloat3.sub(*this, _rSub, temp);
|
||||
return temp;
|
||||
}
|
||||
|
||||
inline Point3F& Point3F::operator+=(const Point3F& _add)
|
||||
{
|
||||
x += _add.x;
|
||||
y += _add.y;
|
||||
z += _add.z;
|
||||
|
||||
gFloat3.add(*this, _add, *this);
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline Point3F& Point3F::operator-=(const Point3F& _rSub)
|
||||
{
|
||||
x -= _rSub.x;
|
||||
y -= _rSub.y;
|
||||
z -= _rSub.z;
|
||||
|
||||
gFloat3.sub(*this, _rSub, *this);
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline Point3F Point3F::operator*(F32 _mul) const
|
||||
{
|
||||
return Point3F(x * _mul, y * _mul, z * _mul);
|
||||
Point3F temp;
|
||||
gFloat3.mul_scalar(*this, _mul, temp);
|
||||
return temp;
|
||||
}
|
||||
|
||||
inline Point3F Point3F::operator/(F32 _div) const
|
||||
{
|
||||
AssertFatal(_div != 0.0f, "Error, div by zero attempted");
|
||||
|
||||
F32 inv = 1.0f / _div;
|
||||
|
||||
return Point3F(x * inv, y * inv, z * inv);
|
||||
Point3F temp;
|
||||
gFloat3.div_scalar(*this, _div, temp);
|
||||
return temp;
|
||||
}
|
||||
|
||||
inline Point3F& Point3F::operator*=(F32 _mul)
|
||||
{
|
||||
x *= _mul;
|
||||
y *= _mul;
|
||||
z *= _mul;
|
||||
|
||||
gFloat3.mul_scalar(*this, _mul, *this);
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
|
@ -705,39 +704,35 @@ inline Point3F& Point3F::operator/=(F32 _div)
|
|||
{
|
||||
AssertFatal(_div != 0.0f, "Error, div by zero attempted");
|
||||
|
||||
F32 inv = 1.0f / _div;
|
||||
x *= inv;
|
||||
y *= inv;
|
||||
z *= inv;
|
||||
|
||||
gFloat3.div_scalar(*this, _div, *this);
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline Point3F Point3F::operator*(const Point3F &_vec) const
|
||||
{
|
||||
return Point3F(x * _vec.x, y * _vec.y, z * _vec.z);
|
||||
Point3F temp;
|
||||
gFloat3.mul(*this, _vec, temp);
|
||||
return temp;
|
||||
}
|
||||
|
||||
inline Point3F& Point3F::operator*=(const Point3F &_vec)
|
||||
{
|
||||
x *= _vec.x;
|
||||
y *= _vec.y;
|
||||
z *= _vec.z;
|
||||
gFloat3.mul(*this, _vec, *this);
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline Point3F Point3F::operator/(const Point3F &_vec) const
|
||||
{
|
||||
AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted");
|
||||
return Point3F(x / _vec.x, y / _vec.y, z / _vec.z);
|
||||
Point3F temp;
|
||||
gFloat3.div(*this, _vec, temp);
|
||||
return temp;
|
||||
}
|
||||
|
||||
inline Point3F& Point3F::operator/=(const Point3F &_vec)
|
||||
{
|
||||
AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted");
|
||||
x /= _vec.x;
|
||||
y /= _vec.y;
|
||||
z /= _vec.z;
|
||||
gFloat3.div(*this, _vec, *this);
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
|
@ -989,7 +984,9 @@ inline Point3I operator*(S32 mul, const Point3I& multiplicand)
|
|||
|
||||
inline Point3F operator*(F32 mul, const Point3F& multiplicand)
|
||||
{
|
||||
return multiplicand * mul;
|
||||
Point3F temp;
|
||||
gFloat3.mul_scalar(multiplicand, mul, temp);
|
||||
return temp;
|
||||
}
|
||||
|
||||
inline Point3D operator*(F64 mul, const Point3D& multiplicand)
|
||||
|
|
@ -999,7 +996,7 @@ inline Point3D operator*(F64 mul, const Point3D& multiplicand)
|
|||
|
||||
inline F32 mDot(const Point3F &p1, const Point3F &p2)
|
||||
{
|
||||
return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z);
|
||||
return gFloat3.dot(p1, p2);
|
||||
}
|
||||
|
||||
inline F64 mDot(const Point3D &p1, const Point3D &p2)
|
||||
|
|
|
|||
|
|
@ -26,10 +26,12 @@
|
|||
#ifndef _MMATHFN_H_
|
||||
#include "math/mMathFn.h"
|
||||
#endif
|
||||
|
||||
#ifndef _MPOINT3_H_
|
||||
#include "math/mPoint3.h"
|
||||
#endif
|
||||
#ifndef _MATH_BACKEND_H_
|
||||
#include "math/public/math_backend.h"
|
||||
#endif
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
|
@ -61,6 +63,8 @@ class Point4I
|
|||
/// Uses F32 internally.
|
||||
///
|
||||
/// Useful for representing quaternions and other 4d beasties.
|
||||
using math_backend::float4::dispatch::gFloat4;
|
||||
|
||||
class Point4F
|
||||
{
|
||||
//-------------------------------------- Public data
|
||||
|
|
@ -152,15 +156,12 @@ inline void Point4F::set(F32 _x, F32 _y, F32 _z, F32 _w)
|
|||
|
||||
inline F32 Point4F::len() const
|
||||
{
|
||||
return mSqrt(x*x + y*y + z*z + w*w);
|
||||
return gFloat4.length(*this);
|
||||
}
|
||||
|
||||
inline void Point4F::interpolate(const Point4F& _from, const Point4F& _to, F32 _factor)
|
||||
{
|
||||
x = (_from.x * (1.0f - _factor)) + (_to.x * _factor);
|
||||
y = (_from.y * (1.0f - _factor)) + (_to.y * _factor);
|
||||
z = (_from.z * (1.0f - _factor)) + (_to.z * _factor);
|
||||
w = (_from.w * (1.0f - _factor)) + (_to.w * _factor);
|
||||
gFloat4.lerp(_from, _to, _factor, *this);
|
||||
}
|
||||
|
||||
inline void Point4F::zero()
|
||||
|
|
@ -193,55 +194,55 @@ inline Point4F& Point4F::operator/=(F32 scalar)
|
|||
if (mIsZero(scalar))
|
||||
return *this;
|
||||
|
||||
F32 denom = 1 / scalar;
|
||||
|
||||
x *= denom;
|
||||
y *= denom;
|
||||
z *= denom;
|
||||
w *= denom;
|
||||
gFloat4.div_scalar(*this, scalar, *this);
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline Point4F Point4F::operator+(const Point4F& _add) const
|
||||
{
|
||||
return Point4F( x + _add.x, y + _add.y, z + _add.z, w + _add.w );
|
||||
Point4F res;
|
||||
gFloat4.add(*this, _add, res);
|
||||
return res;
|
||||
}
|
||||
|
||||
inline Point4F& Point4F::operator+=(const Point4F& _add)
|
||||
{
|
||||
x += _add.x;
|
||||
y += _add.y;
|
||||
z += _add.z;
|
||||
w += _add.w;
|
||||
|
||||
gFloat4.add(*this, _add, *this);
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline Point4F Point4F::operator-(const Point4F& _rSub) const
|
||||
{
|
||||
return Point4F( x - _rSub.x, y - _rSub.y, z - _rSub.z, w - _rSub.w );
|
||||
Point4F res;
|
||||
gFloat4.sub(*this, _rSub, res);
|
||||
return res;
|
||||
}
|
||||
|
||||
inline Point4F Point4F::operator*(const Point4F &_vec) const
|
||||
{
|
||||
return Point4F(x * _vec.x, y * _vec.y, z * _vec.z, w * _vec.w);
|
||||
Point4F res;
|
||||
gFloat4.mul(*this, _vec, res);
|
||||
return res;
|
||||
}
|
||||
|
||||
inline Point4F Point4F::operator*(F32 _mul) const
|
||||
{
|
||||
return Point4F(x * _mul, y * _mul, z * _mul, w * _mul);
|
||||
Point4F res;
|
||||
gFloat4.mul_scalar(*this, _mul, res);
|
||||
return res;
|
||||
}
|
||||
|
||||
inline Point4F Point4F::operator /(F32 t) const
|
||||
{
|
||||
F32 f = 1.0f / t;
|
||||
return Point4F( x * f, y * f, z * f, w * f );
|
||||
Point4F res;
|
||||
gFloat4.div_scalar(*this, t, res);
|
||||
return res;
|
||||
}
|
||||
|
||||
inline F32 mDot(const Point4F &p1, const Point4F &p2)
|
||||
{
|
||||
return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z + p1.w*p2.w);
|
||||
return gFloat4.dot(p1, p2);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
|
|
|||
38
Engine/source/math/public/float3_dispatch.h
Normal file
38
Engine/source/math/public/float3_dispatch.h
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
#pragma once
|
||||
#ifndef _FLOAT3_DISPATCH_H_
|
||||
#define _FLOAT3_DISPATCH_H_
|
||||
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
namespace math_backend::float3::dispatch
|
||||
{
|
||||
struct Float3Funcs
|
||||
{
|
||||
void (*add)(const float*, const float*, float*) = nullptr;
|
||||
void (*sub)(const float*, const float*, float*) = nullptr;
|
||||
void (*mul)(const float*, const float*, float*) = nullptr;
|
||||
void (*mul_scalar)(const float*, float, float*) = nullptr;
|
||||
void (*div)(const float*, const float*, float*) = nullptr;
|
||||
void (*div_scalar)(const float*, float, float*) = nullptr;
|
||||
float (*dot)(const float*, const float*) = nullptr;
|
||||
float (*length)(const float*) = nullptr;
|
||||
float (*lengthSquared)(const float*) = nullptr;
|
||||
void (*normalize)(float*) = nullptr;
|
||||
void (*normalize_mag)(float*, float) = nullptr;
|
||||
void (*lerp)(const float*, const float*, float, float*) = nullptr;
|
||||
};
|
||||
|
||||
// Global dispatch table
|
||||
extern Float3Funcs gFloat3;
|
||||
|
||||
// Backend installers (defined in ISA libraries)
|
||||
void install_scalar();
|
||||
void install_sse2();
|
||||
void install_sse41();
|
||||
void install_avx();
|
||||
void install_avx2();
|
||||
void install_neon();
|
||||
}
|
||||
|
||||
#endif // !_FLOAT4_DISPATCH_H_
|
||||
|
|
@ -1,7 +1,14 @@
|
|||
#include "math/public/float4_dispatch.h"
|
||||
#include "math/public/float3_dispatch.h"
|
||||
|
||||
namespace math_backend::float4::dispatch
|
||||
{
|
||||
// Single definition of the global dispatch table
|
||||
Float4Funcs gFloat4{};
|
||||
}
|
||||
|
||||
namespace math_backend::float3::dispatch
|
||||
{
|
||||
// Single definition of the global dispatch table
|
||||
Float3Funcs gFloat3{};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ namespace math_backend::float4::dispatch
|
|||
float (*length)(const float*) = nullptr;
|
||||
float (*lengthSquared)(const float*) = nullptr;
|
||||
void (*normalize)(float*) = nullptr;
|
||||
void (*normalize_mag)(float*, float) = nullptr;
|
||||
void (*lerp)(const float*, const float*, float, float*) = nullptr;
|
||||
};
|
||||
|
||||
|
|
@ -32,9 +33,6 @@ namespace math_backend::float4::dispatch
|
|||
void install_avx();
|
||||
void install_avx2();
|
||||
void install_neon();
|
||||
|
||||
// Centralized installer (engine calls this once)
|
||||
void install_preferred();
|
||||
}
|
||||
|
||||
#endif // !_FLOAT4_DISPATCH_H_
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@ void math_backend::install_from_cpu_flags(uint32_t cpu_flags)
|
|||
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86)
|
||||
case backend::avx2:
|
||||
float4::dispatch::install_avx2();
|
||||
float3::dispatch::install_avx2();
|
||||
break;
|
||||
|
||||
case backend::avx:
|
||||
|
|
|
|||
|
|
@ -1,4 +1,7 @@
|
|||
#pragma once
|
||||
#ifndef _MATH_BACKEND_H_
|
||||
#define _MATH_BACKEND_H_
|
||||
|
||||
#ifndef _MCONSTANTS_H_
|
||||
#include "math/mConstants.h"
|
||||
#endif
|
||||
|
|
@ -8,6 +11,9 @@
|
|||
#ifndef _FLOAT4_DISPATCH_H_
|
||||
#include "math/public/float4_dispatch.h"
|
||||
#endif
|
||||
#ifndef _FLOAT3_DISPATCH_H_
|
||||
#include "math/public/float3_dispatch.h"
|
||||
#endif
|
||||
|
||||
namespace math_backend
|
||||
{
|
||||
|
|
@ -25,3 +31,5 @@ namespace math_backend
|
|||
backend choose_backend(U32 cpu_flags);
|
||||
void install_from_cpu_flags(uint32_t cpu_flags);
|
||||
}
|
||||
|
||||
#endif // !_MATH_BACKEND_H_
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue