Torque3D/Engine/source/math/mMath_CPU.cpp

#include "platform/platform.h"
#include "math/mMath.h"
#include "math/util/frustum.h"
#include <math.h> // Caution!!! Possible platform specific include
#include "math/mMathFn.h"
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
#define TORQUE_MATH_x64 1
#include <immintrin.h>
#elif defined(__aarch64__) || defined(_M_ARM64)
#define TORQUE_MATH_ARM 1
#include <arm_neon.h>
#else
#define TORQUE_MATH_C 1
#endif
#include <algorithm> // for std::min
//------------------------------------------------------------------------------
// Global ISA selection, set once by mInstallLibrary_CPU(); only the x86 paths dispatch on it.
ISA gISA = ISA::NONE;
#ifdef TORQUE_MATH_x64
void mInstallLibrary_CPU()
{
gISA = ISA::SSE2; // Bare minimum supported.
U32 properties = Platform::SystemInfo.processor.properties;
// Check from newest to oldest; the first hit wins.
if (properties & CPU_PROP_AVX2) { gISA = ISA::AVX2; return; }
if (properties & CPU_PROP_AVX) { gISA = ISA::AVX; return; }
if (properties & CPU_PROP_SSE4_1) { gISA = ISA::SSE41; return; }
if (properties & CPU_PROP_SSE2) { gISA = ISA::SSE2; return; }
}
#else
void mInstallLibrary_CPU() { gISA = ISA::NONE; }
#endif
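// Every float4/mat4 entry point below re-checks gISA, so dispatch costs one
// predictable branch per call. A hypothetical call site (the init order and
// names outside this file are illustrative only):
//
//   mInstallLibrary_CPU();                    // select ISA once at startup
//   float a[4] = {1,2,3,4}, b[4] = {5,6,7,8}, r[4];
//   math_backend::float4::add(a, b, r);       // dispatches on gISA internally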
//------------------------------------------------------------------------------
// SINGLE POINT OPERATIONS
//------------------------------------------------------------------------------
void math_backend::float4::add(const float* a, const float* b, float* r)
{
#if TORQUE_MATH_x64
switch (gISA) {
case ISA::AVX2:
case ISA::AVX:
case ISA::SSE41:
case ISA::SSE2: {
__m128 va = _mm_loadu_ps(a);
__m128 vb = _mm_loadu_ps(b);
_mm_storeu_ps(r, _mm_add_ps(va, vb));
return;
}
default: break;
}
#elif TORQUE_MATH_ARM
vst1q_f32(r, vaddq_f32(vld1q_f32(a), vld1q_f32(b)));
return;
#endif
r[0] = a[0] + b[0];
r[1] = a[1] + b[1];
r[2] = a[2] + b[2];
r[3] = a[3] + b[3];
}
void math_backend::float4::sub(const float* a, const float* b, float* r)
{
#if TORQUE_MATH_x64
switch (gISA) {
case ISA::AVX2:
case ISA::AVX:
case ISA::SSE41:
case ISA::SSE2: {
__m128 va = _mm_loadu_ps(a);
__m128 vb = _mm_loadu_ps(b);
_mm_storeu_ps(r, _mm_sub_ps(va, vb));
return;
}
default: break;
}
#elif TORQUE_MATH_ARM
vst1q_f32(r, vsubq_f32(vld1q_f32(a), vld1q_f32(b)));
return;
#endif
r[0] = a[0] - b[0];
r[1] = a[1] - b[1];
r[2] = a[2] - b[2];
r[3] = a[3] - b[3];
}
void math_backend::float4::mul(const float* a, const float* b, float* r)
{
#if TORQUE_MATH_x64
switch (gISA) {
case ISA::AVX2:
case ISA::AVX:
case ISA::SSE41:
case ISA::SSE2: {
__m128 va = _mm_loadu_ps(a);
__m128 vb = _mm_loadu_ps(b);
_mm_storeu_ps(r, _mm_mul_ps(va, vb));
return;
}
default: break;
}
#elif TORQUE_MATH_ARM
vst1q_f32(r, vmulq_f32(vld1q_f32(a), vld1q_f32(b)));
return;
#endif
r[0] = a[0] * b[0];
r[1] = a[1] * b[1];
r[2] = a[2] * b[2];
r[3] = a[3] * b[3];
}
void math_backend::float4::mul_scalar(const float* a, float s, float* r)
{
#if TORQUE_MATH_x64
switch (gISA) {
case ISA::AVX2:
case ISA::AVX:
case ISA::SSE41:
case ISA::SSE2: {
__m128 va = _mm_loadu_ps(a);
__m128 vs = _mm_set1_ps(s); // broadcast scalar -> [s s s s]
_mm_storeu_ps(r, _mm_mul_ps(va, vs));
return;
}
default: break;
}
#elif TORQUE_MATH_ARM
float32x4_t va = vld1q_f32(a);
float32x4_t vs = vdupq_n_f32(s);
vst1q_f32(r, vmulq_f32(va, vs));
return;
#endif
// torque_c fallback
r[0] = a[0] * s;
r[1] = a[1] * s;
r[2] = a[2] * s;
r[3] = a[3] * s;
}
void math_backend::float4::div(const float* a, const float* b, float* r)
{
#if TORQUE_MATH_x64
switch (gISA) {
case ISA::AVX2:
case ISA::AVX:
case ISA::SSE41:
case ISA::SSE2: {
_mm_storeu_ps(r, _mm_div_ps(_mm_loadu_ps(a), _mm_loadu_ps(b)));
return;
}
default: break;
}
#elif TORQUE_MATH_ARM
// vdivq_f32 is AArch64-only, which is all TORQUE_MATH_ARM covers here.
vst1q_f32(r, vdivq_f32(vld1q_f32(a), vld1q_f32(b)));
return;
#endif
// torque_c fallback
r[0] = a[0] / b[0];
r[1] = a[1] / b[1];
r[2] = a[2] / b[2];
r[3] = a[3] / b[3];
}
void math_backend::float4::div_scalar(const float* a, float s, float* r)
{
#if TORQUE_MATH_x64
switch (gISA) {
case ISA::AVX2:
case ISA::AVX:
case ISA::SSE41:
case ISA::SSE2: {
__m128 va = _mm_loadu_ps(a);
// compute reciprocal once
__m128 vs = _mm_set1_ps(s);
__m128 recip = _mm_rcp_ps(vs);
// Newton-Raphson refinement (important for accuracy)
recip = _mm_mul_ps(recip, _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(vs, recip)));
_mm_storeu_ps(r, _mm_mul_ps(va, recip));
return;
}
default: break;
}
#elif TORQUE_MATH_ARM
float32x4_t va = vld1q_f32(a);
float32x4_t vs = vdupq_n_f32(s);
float32x4_t recip = vrecpeq_f32(vs);
recip = vmulq_f32(vrecpsq_f32(vs, recip), recip);
vst1q_f32(r, vmulq_f32(va, recip));
return;
#endif
// torque_c fallback
float inv = 1.0f / s;
r[0] = a[0] * inv;
r[1] = a[1] * inv;
r[2] = a[2] * inv;
r[3] = a[3] * inv;
}
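// Why the reciprocal paths above refine their estimate: _mm_rcp_ps (and NEON's
// vrecpeq_f32) return only a low-precision approximation x0 of 1/s (roughly
// 12 bits on x86). One Newton-Raphson step,
//
//   x1 = x0 * (2 - s * x0),
//
// roughly doubles the number of correct bits, trading a couple of multiplies
// for the much higher latency of a true divide.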
float math_backend::float4::dot(const float* a, const float* b)
{
#if TORQUE_MATH_x64
switch (gISA) {
// ---- SSE4.1 : dedicated dot instruction ----
case ISA::AVX2:
case ISA::AVX:
case ISA::SSE41:
#ifdef TORQUE_HAVE_SSE4_1 // Only compiled when the toolchain provides SSE4.1; otherwise these cases fall through to SSE2.
{
__m128 va = _mm_loadu_ps(a);
__m128 vb = _mm_loadu_ps(b);
return _mm_cvtss_f32(_mm_dp_ps(va, vb, 0xFF));
}
#endif
// ---- SSE2 fallback (no horizontal ops available) ----
case ISA::SSE2: {
__m128 va = _mm_loadu_ps(a);
__m128 vb = _mm_loadu_ps(b);
__m128 mul = _mm_mul_ps(va, vb);
__m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
__m128 sums = _mm_add_ps(mul, shuf);
shuf = _mm_movehl_ps(shuf, sums);
sums = _mm_add_ss(sums, shuf);
return _mm_cvtss_f32(sums);
}
default: break;
}
#elif TORQUE_MATH_ARM
// ---- NEON ----
float32x4_t va = vld1q_f32(a);
float32x4_t vb = vld1q_f32(b);
float32x4_t vmul = vmulq_f32(va, vb);
float32x2_t sum2 = vpadd_f32(vget_low_f32(vmul), vget_high_f32(vmul));
float32x2_t total = vpadd_f32(sum2, sum2);
return vget_lane_f32(total, 0);
#endif
return a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
}
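// Lane trace for the SSE2 reduction above, with mul = [m0 m1 m2 m3]:
//   shuf = [m1 m0 m3 m2]                     // swap within each pair
//   sums = [m0+m1  m0+m1  m2+m3  m2+m3]
//   movehl pulls (m2+m3) into the low lane, and add_ss leaves
//   m0+m1+m2+m3 in lane 0 -- the classic pre-SSE3 horizontal add.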
void math_backend::float3::add(const float* a, const float* b, float* r)
{
// Pad 3 floats into 4 for SIMD
float va[4] = { a[0], a[1], a[2], 0.0f };
float vb[4] = { b[0], b[1], b[2], 0.0f };
float vr[4];
float4::add(va, vb, vr);
r[0] = vr[0];
r[1] = vr[1];
r[2] = vr[2];
}
void math_backend::float3::sub(const float* a, const float* b, float* r)
{
// Pad 3 floats into 4 for SIMD
float va[4] = { a[0], a[1], a[2], 0.0f };
float vb[4] = { b[0], b[1], b[2], 0.0f };
float vr[4];
float4::sub(va, vb, vr);
r[0] = vr[0];
r[1] = vr[1];
r[2] = vr[2];
}
void math_backend::float3::mul(const float* a, const float* b, float* r)
{
// Pad 3 floats into 4 for SIMD
float va[4] = { a[0], a[1], a[2], 0.0f };
float vb[4] = { b[0], b[1], b[2], 0.0f };
float vr[4];
float4::mul(va, vb, vr);
r[0] = vr[0];
r[1] = vr[1];
r[2] = vr[2];
}
void math_backend::float3::mul_scalar(const float* a, float s, float* r)
{
float va[4] = { a[0], a[1], a[2], 0.0f };
float vr[4];
float4::mul_scalar(va, s, vr);
r[0] = vr[0];
r[1] = vr[1];
r[2] = vr[2];
}
float math_backend::float3::dot(const float* a, const float* b)
{
float va[4] = { a[0], a[1], a[2], 0.0f };
float vb[4] = { b[0], b[1], b[2], 0.0f };
return float4::dot(va, vb);
}
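// The pad-to-4 pattern above exists because a 3-float source has no readable
// fourth element; a full 4-wide load could read past the allocation. Copying
// into a w=0 stack temp keeps the 4-wide kernels reusable, and the zero lane
// is harmless: it adds 0 in sums and contributes 0*0 to the dot product.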
void math_backend::float_mat4x4::multiply_mat4(const float* A, const float* B, float* R)
{
#if TORQUE_MATH_x64
// Row-major product R = A * B; assumes the SSE2 baseline set at install time.
__m128 b0 = _mm_loadu_ps(B + 0);
__m128 b1 = _mm_loadu_ps(B + 4);
__m128 b2 = _mm_loadu_ps(B + 8);
__m128 b3 = _mm_loadu_ps(B + 12);
for (int i = 0; i < 4; i++)
{
__m128 a = _mm_loadu_ps(A + i * 4);
__m128 r =
_mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), b0);
r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)), b1));
r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)), b2));
r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)), b3));
_mm_storeu_ps(R + i * 4, r);
}
#else
// Scalar fallback keeps ARM and plain-C builds compiling.
for (int i = 0; i < 4; i++)
for (int j = 0; j < 4; j++)
R[i * 4 + j] = A[i * 4] * B[j] + A[i * 4 + 1] * B[4 + j]
+ A[i * 4 + 2] * B[8 + j] + A[i * 4 + 3] * B[12 + j];
#endif
}
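// multiply_mat4 relies on the row-major identity
//   row_i(R) = A[i][0]*row_0(B) + A[i][1]*row_1(B) + A[i][2]*row_2(B) + A[i][3]*row_3(B)
// so broadcasting each element of A's row against whole rows of B needs no
// transpose and no horizontal adds.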
void math_backend::float_mat4x4::multiply_float4(const float* M, const float* P, float* R)
{
#if TORQUE_MATH_x64
switch (gISA)
{
case ISA::AVX2:
case ISA::AVX:
case ISA::SSE41:
#ifdef TORQUE_HAVE_SSE4_1
{
__m128 p = _mm_loadu_ps(P);
__m128 r0 = _mm_dp_ps(_mm_loadu_ps(M + 0), p, 0xF1);
__m128 r1 = _mm_dp_ps(_mm_loadu_ps(M + 4), p, 0xF2);
__m128 r2 = _mm_dp_ps(_mm_loadu_ps(M + 8), p, 0xF4);
__m128 r3 = _mm_dp_ps(_mm_loadu_ps(M + 12), p, 0xF8);
__m128 r = _mm_or_ps(_mm_or_ps(r0, r1), _mm_or_ps(r2, r3));
_mm_storeu_ps(R, r);
return;
}
#endif
case ISA::SSE2:
{
// Multiply each row of M by P, then transpose the four product vectors so
// one vertical add yields all four row dot products at once (matching the
// scalar contract at the bottom of this function).
__m128 p = _mm_loadu_ps(P);
__m128 d0 = _mm_mul_ps(_mm_loadu_ps(M + 0), p);
__m128 d1 = _mm_mul_ps(_mm_loadu_ps(M + 4), p);
__m128 d2 = _mm_mul_ps(_mm_loadu_ps(M + 8), p);
__m128 d3 = _mm_mul_ps(_mm_loadu_ps(M + 12), p);
_MM_TRANSPOSE4_PS(d0, d1, d2, d3);
_mm_storeu_ps(R, _mm_add_ps(_mm_add_ps(d0, d1), _mm_add_ps(d2, d3)));
return;
}
default: break;
}
#elif TORQUE_MATH_ARM
// Per-row products, then AArch64 pairwise adds collapse each row's dot
// product into its own lane of the result.
float32x4_t p = vld1q_f32(P);
float32x4_t d0 = vmulq_f32(vld1q_f32(M + 0), p);
float32x4_t d1 = vmulq_f32(vld1q_f32(M + 4), p);
float32x4_t d2 = vmulq_f32(vld1q_f32(M + 8), p);
float32x4_t d3 = vmulq_f32(vld1q_f32(M + 12), p);
float32x4_t s01 = vpaddq_f32(d0, d1); // [sum(d0 lo), sum(d0 hi), sum(d1 lo), sum(d1 hi)]
vst1q_f32(R, vpaddq_f32(s01, vpaddq_f32(d2, d3))); // [R0 R1 R2 R3]
return;
#endif
R[0] = M[0] * P[0] + M[1] * P[1] + M[2] * P[2] + M[3] * P[3];
R[1] = M[4] * P[0] + M[5] * P[1] + M[6] * P[2] + M[7] * P[3];
R[2] = M[8] * P[0] + M[9] * P[1] + M[10] * P[2] + M[11] * P[3];
R[3] = M[12] * P[0] + M[13] * P[1] + M[14] * P[2] + M[15] * P[3];
}
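// _mm_dp_ps mask note for the SSE4.1 path above: the high nibble of the
// immediate selects which input lanes participate in the multiply (0xF = all
// four) and the low nibble selects which output lanes receive the dot product
// (the rest are zeroed). Using 0xF1/0xF2/0xF4/0xF8 places each row's result
// in a distinct lane, so the three ORs merge four zero-padded vectors into
// [R0 R1 R2 R3].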
void math_backend::float_mat4x4::scale(float* M, const float* S)
{
float scalef[4] = { S[0], S[1], S[2], 1.0f };
#if TORQUE_MATH_x64
__m128 scale = _mm_loadu_ps(scalef); // S = {x, y, z, 1}
__m128 row0 = _mm_loadu_ps(M + 0);
__m128 row1 = _mm_loadu_ps(M + 4);
__m128 row2 = _mm_loadu_ps(M + 8);
__m128 row3 = _mm_loadu_ps(M + 12);
row0 = _mm_mul_ps(row0, scale);
row1 = _mm_mul_ps(row1, scale);
row2 = _mm_mul_ps(row2, scale);
row3 = _mm_mul_ps(row3, scale);
_mm_storeu_ps(M + 0, row0);
_mm_storeu_ps(M + 4, row1);
_mm_storeu_ps(M + 8, row2);
_mm_storeu_ps(M + 12, row3);
return;
#elif TORQUE_MATH_ARM
float32x4_t scale = vld1q_f32(scalef);
vst1q_f32(M + 0, vmulq_f32(vld1q_f32(M + 0), scale));
vst1q_f32(M + 4, vmulq_f32(vld1q_f32(M + 4), scale));
vst1q_f32(M + 8, vmulq_f32(vld1q_f32(M + 8), scale));
vst1q_f32(M + 12, vmulq_f32(vld1q_f32(M + 12), scale));
return;
#endif
M[0] *= S[0]; M[1] *= S[1]; M[2] *= S[2];
M[4] *= S[0]; M[5] *= S[1]; M[6] *= S[2];
M[8] *= S[0]; M[9] *= S[1]; M[10] *= S[2];
M[12] *= S[0]; M[13] *= S[1]; M[14] *= S[2];
}