#include "platform/platform.h"
#include "math/mMath.h"
#include "math/util/frustum.h"
// NOTE(review): the original line here was a bare "#include" whose header name
// was lost in formatting (its comment read: "Caution!!! Possible platform
// specific include"). Recover the exact header from version control.
#include "math/mMathFn.h"

#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
#define TORQUE_MATH_x64 1
#include <immintrin.h> // this header will sort out which intrinsics are supported.
#elif defined(__aarch64__) || defined(_M_ARM64)
#define TORQUE_MATH_ARM 1
#include <arm_neon.h>
#else
#define TORQUE_MATH_C 1
#endif

#include <algorithm> // for std::min

//------------------------------------------------------------------------------

// Global ISA selection. Written once at startup by mInstallLibrary_CPU() and
// read by every dispatching routine below; ISA::NONE means "scalar fallback".
ISA gISA = ISA::NONE;

#ifdef TORQUE_MATH_x64
// Picks the best x86 instruction set reported by the platform layer.
// FIX: the original executed an unconditional `return;` right after the AVX2
// test, which made the AVX / SSE4.1 / SSE2 checks unreachable. They are now
// early-out guards, tested newest-first so the best ISA wins.
void mInstallLibrary_CPU()
{
   gISA = ISA::SSE2; // Bare minimum supported.

   const U32 properties = Platform::SystemInfo.processor.properties;

   if (properties & CPU_PROP_AVX2)   { gISA = ISA::AVX2;  return; }
   if (properties & CPU_PROP_AVX)    { gISA = ISA::AVX;   return; }
   if (properties & CPU_PROP_SSE4_1) { gISA = ISA::SSE41; return; }
   if (properties & CPU_PROP_SSE2)   { gISA = ISA::SSE2;  return; }
}
#else
// Non-x86 builds (NEON or plain C) need no runtime ISA dispatch.
void mInstallLibrary_CPU()
{
   gISA = ISA::NONE;
}
#endif

//------------------------------------------------------------------------------
// SINGLE POINT OPERATIONS
//------------------------------------------------------------------------------

// r = a + b, component-wise over 4 floats. Pointers may be unaligned.
void math_backend::float4::add(const float* a, const float* b, float* r)
{
#if TORQUE_MATH_x64
   switch (gISA)
   {
      case ISA::AVX2:
      case ISA::AVX:
      case ISA::SSE41:
      case ISA::SSE2:
      {
         __m128 va = _mm_loadu_ps(a);
         __m128 vb = _mm_loadu_ps(b);
         _mm_storeu_ps(r, _mm_add_ps(va, vb));
         return;
      }
      default:
         break;
   }
#elif TORQUE_MATH_ARM
   vst1q_f32(r, vaddq_f32(vld1q_f32(a), vld1q_f32(b)));
   return; // FIX: was missing, so the scalar code below ran redundantly.
#endif
   // Scalar fallback.
   r[0] = a[0] + b[0];
   r[1] = a[1] + b[1];
   r[2] = a[2] + b[2];
   r[3] = a[3] + b[3];
}

// r = a - b, component-wise over 4 floats. Pointers may be unaligned.
void math_backend::float4::sub(const float* a, const float* b, float* r)
{
#if TORQUE_MATH_x64
   switch (gISA)
   {
      case ISA::AVX2:
      case ISA::AVX:
      case ISA::SSE41:
      case ISA::SSE2:
      {
         __m128 va = _mm_loadu_ps(a);
         __m128 vb = _mm_loadu_ps(b);
         _mm_storeu_ps(r, _mm_sub_ps(va, vb));
         return;
      }
      default:
         break;
   }
#elif TORQUE_MATH_ARM
   vst1q_f32(r, vsubq_f32(vld1q_f32(a), vld1q_f32(b)));
   return; // FIX: was missing, so the scalar code below ran redundantly.
#endif
   // Scalar fallback.
   r[0] = a[0] - b[0];
   r[1] = a[1] - b[1];
   r[2] = a[2] - b[2];
   r[3] = a[3] - b[3];
}

// r = a * b, component-wise over 4 floats. Pointers may be unaligned.
void math_backend::float4::mul(const float* a, const float* b, float* r)
{
#if TORQUE_MATH_x64
   switch (gISA)
   {
      case ISA::AVX2:
      case ISA::AVX:
      case ISA::SSE41:
      case ISA::SSE2:
      {
         __m128 va = _mm_loadu_ps(a);
         __m128 vb = _mm_loadu_ps(b);
         _mm_storeu_ps(r, _mm_mul_ps(va, vb));
         return;
      }
      default:
         break;
   }
#elif TORQUE_MATH_ARM
   vst1q_f32(r, vmulq_f32(vld1q_f32(a), vld1q_f32(b)));
   return; // FIX: was missing, so the scalar code below ran redundantly.
#endif
   // Scalar fallback.
   r[0] = a[0] * b[0];
   r[1] = a[1] * b[1];
   r[2] = a[2] * b[2];
   r[3] = a[3] * b[3];
}

// r = a * s, each of the 4 components scaled by the scalar s.
void math_backend::float4::mul_scalar(const float* a, float s, float* r)
{
#if TORQUE_MATH_x64
   switch (gISA)
   {
      case ISA::AVX2:
      case ISA::AVX:
      case ISA::SSE41:
      case ISA::SSE2:
      {
         __m128 va = _mm_loadu_ps(a);
         __m128 vs = _mm_set1_ps(s); // broadcast scalar -> [s s s s]
         _mm_storeu_ps(r, _mm_mul_ps(va, vs));
         return;
      }
      default:
         break;
   }
#elif TORQUE_MATH_ARM
   float32x4_t va = vld1q_f32(a);
   float32x4_t vs = vdupq_n_f32(s);
   vst1q_f32(r, vmulq_f32(va, vs));
   return;
#endif
   // Scalar fallback.
   r[0] = a[0] * s;
   r[1] = a[1] * s;
   r[2] = a[2] * s;
   r[3] = a[3] * s;
}

// r = a / b, component-wise over 4 floats.
// FIX: the original body was empty, leaving r untouched. Implemented with
// exact IEEE division on every backend. No divide-by-zero guard, matching the
// scalar semantics (a[i]/0 -> inf/NaN).
void math_backend::float4::div(const float* a, const float* b, float* r)
{
#if TORQUE_MATH_x64
   switch (gISA)
   {
      case ISA::AVX2:
      case ISA::AVX:
      case ISA::SSE41:
      case ISA::SSE2:
      {
         __m128 va = _mm_loadu_ps(a);
         __m128 vb = _mm_loadu_ps(b);
         _mm_storeu_ps(r, _mm_div_ps(va, vb));
         return;
      }
      default:
         break;
   }
#elif TORQUE_MATH_ARM
   // AArch64 NEON has a true (correctly rounded) vector divide.
   vst1q_f32(r, vdivq_f32(vld1q_f32(a), vld1q_f32(b)));
   return;
#endif
   // Scalar fallback.
   r[0] = a[0] / b[0];
   r[1] = a[1] / b[1];
   r[2] = a[2] / b[2];
   r[3] = a[3] / b[3];
}

// r = a / s, each of the 4 components divided by the scalar s.
// FIX: the original used the approximate reciprocal (_mm_rcp_ps / vrecpeq_f32
// plus one Newton-Raphson step, ~22 bits), so the SIMD paths disagreed with
// each other and with the C fallback. All backends now perform exact IEEE
// division and produce identical results.
void math_backend::float4::div_scalar(const float* a, float s, float* r)
{
#if TORQUE_MATH_x64
   switch (gISA)
   {
      case ISA::AVX2:
      case ISA::AVX:
      case ISA::SSE41:
      case ISA::SSE2:
      {
         __m128 va = _mm_loadu_ps(a);
         __m128 vs = _mm_set1_ps(s); // broadcast scalar -> [s s s s]
         _mm_storeu_ps(r, _mm_div_ps(va, vs));
         return;
      }
      default:
         break;
   }
#elif TORQUE_MATH_ARM
   float32x4_t va = vld1q_f32(a);
   float32x4_t vs = vdupq_n_f32(s);
   vst1q_f32(r, vdivq_f32(va, vs)); // AArch64 exact vector divide
   return;
#endif
   // Scalar fallback: divide directly (not multiply by 1/s) so results match
   // the SIMD paths bit-for-bit.
   r[0] = a[0] / s;
   r[1] = a[1] / s;
   r[2] = a[2] / s;
   r[3] = a[3] / s;
}

// 4-component dot product: a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3].
float math_backend::float4::dot(const float* a, const float* b)
{
#if TORQUE_MATH_x64
   switch (gISA)
   {
      // ---- SSE4.1+ : dedicated dot instruction ----
      // mask 0xFF = multiply all four lanes, broadcast the sum to all lanes.
      case ISA::AVX2:
      case ISA::AVX:
      case ISA::SSE41:
      {
         __m128 va = _mm_loadu_ps(a);
         __m128 vb = _mm_loadu_ps(b);
         return _mm_cvtss_f32(_mm_dp_ps(va, vb, 0xFF));
      }
      // ---- SSE2 fallback (no horizontal ops available) ----
      case ISA::SSE2:
      {
         __m128 va   = _mm_loadu_ps(a);
         __m128 vb   = _mm_loadu_ps(b);
         __m128 mul  = _mm_mul_ps(va, vb);
         // Horizontal sum: swap pairs, add, then fold the high half down.
         __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
         __m128 sums = _mm_add_ps(mul, shuf);
         shuf = _mm_movehl_ps(shuf, sums);
         sums = _mm_add_ss(sums, shuf);
         return _mm_cvtss_f32(sums);
      }
      default:
         break;
   }
#elif TORQUE_MATH_ARM
   // ---- NEON ---- pairwise adds reduce the 4 products to one lane.
   float32x4_t va    = vld1q_f32(a);
   float32x4_t vb    = vld1q_f32(b);
   float32x4_t vmul  = vmulq_f32(va, vb);
   float32x2_t sum2  = vpadd_f32(vget_low_f32(vmul), vget_high_f32(vmul));
   float32x2_t total = vpadd_f32(sum2, sum2);
   return vget_lane_f32(total, 0);
#endif
   // Scalar fallback.
   return a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
}