// Torque3D/Engine/source/math/mMath_CPU.cpp
// Commit: marauder2k7 8908bbe462 — "fix linux compile"
//   added flags for different simd availability
//   linux complained (as usual)
//   2026-02-25 18:05:39 +00:00
#include "platform/platform.h"
#include "math/mMath.h"
#include "math/util/frustum.h"
#include <math.h> // Caution!!! Possible platform specific include
#include "math/mMathFn.h"
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
#define TORQUE_MATH_x64 1
#include <immintrin.h>
#elif defined(__aarch64__) || defined(_M_ARM64)
#define TORQUE_MATH_ARM 1
#include <arm_neon.h>
#else
#define TORQUE_MATH_C 1
#endif
#include <algorithm> // for std::min
//------------------------------------------------------------------------------
// Global ISA variable (x86)
ISA gISA = ISA::NONE;
#ifdef TORQUE_MATH_x64
/// Probes the CPU feature flags once at startup and records the best SIMD
/// instruction set in gISA. Checks newest-to-oldest; the first match wins.
///
/// Bug fix: the original `if (cond) gISA = X; return;` statements returned
/// UNCONDITIONALLY after the first check (the `return` was not inside the
/// `if`), making the AVX/SSE4.1/SSE2 checks dead code — an AVX-only or
/// SSE4.1-only CPU was never detected. Rewritten as an else-if chain.
void mInstallLibrary_CPU()
{
   gISA = ISA::SSE2; // Bare minimum supported.
   const U32 properties = Platform::SystemInfo.processor.properties;
   if (properties & CPU_PROP_AVX2)
      gISA = ISA::AVX2;
   else if (properties & CPU_PROP_AVX)
      gISA = ISA::AVX;
   else if (properties & CPU_PROP_SSE4_1)
      gISA = ISA::SSE41;
   else if (properties & CPU_PROP_SSE2)
      gISA = ISA::SSE2;
}
#else
// Non-x86 builds dispatch at compile time (NEON or scalar C), so there is no
// runtime ISA to probe.
void mInstallLibrary_CPU() { gISA = ISA::NONE; }
#endif
//------------------------------------------------------------------------------
// SINGLE POINT OPERATIONS
//------------------------------------------------------------------------------
/// Componentwise 4-float add: r[i] = a[i] + b[i]. Pointers may be unaligned.
void math_backend::float4::add(const float* a, const float* b, float* r)
{
#if TORQUE_MATH_x64
   switch (gISA) {
   case ISA::AVX2:
   case ISA::AVX:
   case ISA::SSE41:
   case ISA::SSE2: {
      __m128 va = _mm_loadu_ps(a);
      __m128 vb = _mm_loadu_ps(b);
      _mm_storeu_ps(r, _mm_add_ps(va, vb));
      return;
   }
   default: break;
   }
#elif TORQUE_MATH_ARM
   vst1q_f32(r, vaddq_f32(vld1q_f32(a), vld1q_f32(b)));
   // Bug fix: was missing `return;` — fell through and redid the four adds
   // in the scalar fallback below (harmless result, wasted work).
   return;
#endif
   // torque_c fallback (also reached on x86 when gISA is ISA::NONE)
   r[0] = a[0] + b[0];
   r[1] = a[1] + b[1];
   r[2] = a[2] + b[2];
   r[3] = a[3] + b[3];
}
/// Componentwise 4-float subtract: r[i] = a[i] - b[i]. Pointers may be unaligned.
void math_backend::float4::sub(const float* a, const float* b, float* r)
{
#if TORQUE_MATH_x64
   switch (gISA) {
   case ISA::AVX2:
   case ISA::AVX:
   case ISA::SSE41:
   case ISA::SSE2: {
      __m128 va = _mm_loadu_ps(a);
      __m128 vb = _mm_loadu_ps(b);
      _mm_storeu_ps(r, _mm_sub_ps(va, vb));
      return;
   }
   default: break;
   }
#elif TORQUE_MATH_ARM
   vst1q_f32(r, vsubq_f32(vld1q_f32(a), vld1q_f32(b)));
   // Bug fix: was missing `return;` — fell through and redid the four
   // subtractions in the scalar fallback below.
   return;
#endif
   // torque_c fallback (also reached on x86 when gISA is ISA::NONE)
   r[0] = a[0] - b[0];
   r[1] = a[1] - b[1];
   r[2] = a[2] - b[2];
   r[3] = a[3] - b[3];
}
/// Componentwise 4-float multiply: r[i] = a[i] * b[i]. Pointers may be unaligned.
void math_backend::float4::mul(const float* a, const float* b, float* r)
{
#if TORQUE_MATH_x64
   switch (gISA) {
   case ISA::AVX2:
   case ISA::AVX:
   case ISA::SSE41:
   case ISA::SSE2: {
      __m128 va = _mm_loadu_ps(a);
      __m128 vb = _mm_loadu_ps(b);
      _mm_storeu_ps(r, _mm_mul_ps(va, vb));
      return;
   }
   default: break;
   }
#elif TORQUE_MATH_ARM
   vst1q_f32(r, vmulq_f32(vld1q_f32(a), vld1q_f32(b)));
   // Bug fix: was missing `return;` — fell through and redid the four
   // multiplies in the scalar fallback below.
   return;
#endif
   // torque_c fallback (also reached on x86 when gISA is ISA::NONE)
   r[0] = a[0] * b[0];
   r[1] = a[1] * b[1];
   r[2] = a[2] * b[2];
   r[3] = a[3] * b[3];
}
/// Scales four floats by a single scalar: r[i] = a[i] * s.
void math_backend::float4::mul_scalar(const float* a, float s, float* r)
{
#if TORQUE_MATH_x64
   switch (gISA) {
   case ISA::AVX2:
   case ISA::AVX:
   case ISA::SSE41:
   case ISA::SSE2: {
      const __m128 scale = _mm_set1_ps(s); // broadcast s into all four lanes
      _mm_storeu_ps(r, _mm_mul_ps(_mm_loadu_ps(a), scale));
      return;
   }
   default: break;
   }
#elif TORQUE_MATH_ARM
   vst1q_f32(r, vmulq_f32(vld1q_f32(a), vdupq_n_f32(s)));
   return;
#endif
   // torque_c fallback
   for (int lane = 0; lane < 4; ++lane)
      r[lane] = a[lane] * s;
}
/// Componentwise 4-float divide: r[i] = a[i] / b[i]. Pointers may be unaligned.
/// Bug fix: the body was completely empty — callers received whatever garbage
/// was already in r. Implemented in the same dispatch style as the siblings.
/// NOTE(review): no zero-divisor guard, matching the siblings' IEEE behavior
/// (division by 0.0f yields +/-inf or NaN).
void math_backend::float4::div(const float* a, const float* b, float* r)
{
#if TORQUE_MATH_x64
   switch (gISA) {
   case ISA::AVX2:
   case ISA::AVX:
   case ISA::SSE41:
   case ISA::SSE2: {
      __m128 va = _mm_loadu_ps(a);
      __m128 vb = _mm_loadu_ps(b);
      _mm_storeu_ps(r, _mm_div_ps(va, vb));
      return;
   }
   default: break;
   }
#elif TORQUE_MATH_ARM
   // TORQUE_MATH_ARM is only defined for aarch64, where vdivq_f32 exists.
   vst1q_f32(r, vdivq_f32(vld1q_f32(a), vld1q_f32(b)));
   return;
#endif
   // torque_c fallback (also reached on x86 when gISA is ISA::NONE)
   r[0] = a[0] / b[0];
   r[1] = a[1] / b[1];
   r[2] = a[2] / b[2];
   r[3] = a[3] / b[3];
}
/// Divides four floats by a single scalar: r[i] = a[i] / s, implemented as a
/// multiply by the reciprocal so the division happens once, not four times.
///
/// Bug fix / consistency: the SSE path used _mm_rcp_ps (~12-bit estimate) plus
/// one Newton-Raphson step, and the NEON path used vrecpeq_f32 similarly, so
/// SIMD results differed from the scalar fallback's exact 1.0f / s. All paths
/// now broadcast the same full-precision reciprocal, giving bit-identical
/// results across backends (and skipping the estimate/refine work entirely).
/// NOTE(review): s == 0.0f yields inf/NaN lanes, same as the old fallback.
void math_backend::float4::div_scalar(const float* a, float s, float* r)
{
   const float inv = 1.0f / s; // one full-precision divide, shared by all paths
#if TORQUE_MATH_x64
   switch (gISA) {
   case ISA::AVX2:
   case ISA::AVX:
   case ISA::SSE41:
   case ISA::SSE2: {
      _mm_storeu_ps(r, _mm_mul_ps(_mm_loadu_ps(a), _mm_set1_ps(inv)));
      return;
   }
   default: break;
   }
#elif TORQUE_MATH_ARM
   vst1q_f32(r, vmulq_f32(vld1q_f32(a), vdupq_n_f32(inv)));
   return;
#endif
   // torque_c fallback (also reached on x86 when gISA is ISA::NONE)
   r[0] = a[0] * inv;
   r[1] = a[1] * inv;
   r[2] = a[2] * inv;
   r[3] = a[3] * inv;
}
/// 4-wide dot product: returns a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3].
/// Dispatch is two-level: gISA selects the path at runtime, while
/// TORQUE_HAVE_SSE4_1 gates _mm_dp_ps at compile time (when it is not
/// defined, the AVX/SSE4.1 cases fall through to the SSE2 reduction).
float math_backend::float4::dot(const float* a, const float* b)
{
#if TORQUE_MATH_x64
switch (gISA) {
// ---- SSE4.1 : dedicated dot instruction ----
case ISA::AVX2:
case ISA::AVX:
case ISA::SSE41:
#ifdef TORQUE_HAVE_SSE4_1 // Linux macro required in case sse4 does not exist.
{
__m128 va = _mm_loadu_ps(a);
__m128 vb = _mm_loadu_ps(b);
// Mask 0xFF: multiply all four lanes, broadcast the sum to every lane;
// cvtss extracts lane 0.
return _mm_cvtss_f32(_mm_dp_ps(va, vb, 0xFF));
}
#endif
// Intentional fallthrough to SSE2 when TORQUE_HAVE_SSE4_1 is undefined.
// ---- SSE2 fallback (no horizontal ops available) ----
case ISA::SSE2: {
__m128 va = _mm_loadu_ps(a);
__m128 vb = _mm_loadu_ps(b);
__m128 mul = _mm_mul_ps(va, vb);
// Horizontal reduction: swap adjacent pairs and add, then fold in the
// upper half, leaving the total in lane 0.
__m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
__m128 sums = _mm_add_ps(mul, shuf);
shuf = _mm_movehl_ps(shuf, sums);
sums = _mm_add_ss(sums, shuf);
return _mm_cvtss_f32(sums);
}
default: break;
}
#elif TORQUE_MATH_ARM
// ---- NEON ----
float32x4_t va = vld1q_f32(a);
float32x4_t vb = vld1q_f32(b);
float32x4_t vmul = vmulq_f32(va, vb);
// Pairwise adds reduce 4 lanes -> 2 -> 1.
float32x2_t sum2 = vpadd_f32(vget_low_f32(vmul), vget_high_f32(vmul));
float32x2_t total = vpadd_f32(sum2, sum2);
return vget_lane_f32(total, 0);
#endif
// Scalar fallback (also reached on x86 when gISA is ISA::NONE).
return a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
}