#include "platform/platform.h" #include "math/mMath.h" #include "math/util/frustum.h" #include // Caution!!! Possible platform specific include #include "math/mMathFn.h" #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) #define TORQUE_MATH_x64 1 #include #elif defined(__aarch64__) || defined(_M_ARM64) #define TORQUE_MATH_ARM 1 #include #else #define TORQUE_MATH_C 1 #endif #include // for std::min //------------------------------------------------------------------------------ // Global ISA variable (x86) ISA gISA = ISA::NONE; #ifdef TORQUE_MATH_x64 void mInstallLibrary_CPU() { gISA = ISA::SSE2; // Bare minimum supported. U32 properties = Platform::SystemInfo.processor.properties; if (properties & CPU_PROP_AVX2) gISA = ISA::AVX2; return; if (properties & CPU_PROP_AVX) gISA = ISA::AVX; return; if (properties & CPU_PROP_SSE4_1) gISA = ISA::SSE41; return; if (properties & CPU_PROP_SSE2) gISA = ISA::SSE2; return; } #else void mInstallLibrary_CPU() { gISA = ISA::NONE; } #endif //------------------------------------------------------------------------------ // SINGLE POINT OPERATIONS //------------------------------------------------------------------------------ void math_backend::float4::add(const float* a, const float* b, float* r) { #if TORQUE_MATH_x64 switch (gISA) { case ISA::AVX2: case ISA::AVX: case ISA::SSE41: case ISA::SSE2: { __m128 va = _mm_loadu_ps(a); __m128 vb = _mm_loadu_ps(b); _mm_storeu_ps(r, _mm_add_ps(va, vb)); return; } default: break; } #elif TORQUE_MATH_ARM vst1q_f32(r, vaddq_f32(vld1q_f32(a), vld1q_f32(b))); #endif r[0] = a[0] + b[0]; r[1] = a[1] + b[1]; r[2] = a[2] + b[2]; r[3] = a[3] + b[3]; } void math_backend::float4::sub(const float* a, const float* b, float* r) { #if TORQUE_MATH_x64 switch (gISA) { case ISA::AVX2: case ISA::AVX: case ISA::SSE41: case ISA::SSE2: { __m128 va = _mm_loadu_ps(a); __m128 vb = _mm_loadu_ps(b); _mm_storeu_ps(r, _mm_sub_ps(va, vb)); return; } default: break; } #elif TORQUE_MATH_ARM vst1q_f32(r, vsubq_f32(vld1q_f32(a), vld1q_f32(b))); #endif r[0] = a[0] - b[0]; r[1] = a[1] - b[1]; r[2] = a[2] - b[2]; r[3] = a[3] - b[3]; } void math_backend::float4::mul(const float* a, const float* b, float* r) { #if TORQUE_MATH_x64 switch (gISA) { case ISA::AVX2: case ISA::AVX: case ISA::SSE41: case ISA::SSE2: { __m128 va = _mm_loadu_ps(a); __m128 vb = _mm_loadu_ps(b); _mm_storeu_ps(r, _mm_mul_ps(va, vb)); return; } default: break; } #elif TORQUE_MATH_ARM vst1q_f32(r, vmulq_f32(vld1q_f32(a), vld1q_f32(b))); #endif r[0] = a[0] * b[0]; r[1] = a[1] * b[1]; r[2] = a[2] * b[2]; r[3] = a[3] * b[3]; } void math_backend::float4::mul_scalar(const float* a, float s, float* r) { #if TORQUE_MATH_x64 switch (gISA) { case ISA::AVX2: case ISA::AVX: case ISA::SSE41: case ISA::SSE2: { __m128 va = _mm_loadu_ps(a); __m128 vs = _mm_set1_ps(s); // broadcast scalar -> [s s s s] _mm_storeu_ps(r, _mm_mul_ps(va, vs)); return; } default: break; } #elif TORQUE_MATH_ARM float32x4_t va = vld1q_f32(a); float32x4_t vs = vdupq_n_f32(s); vst1q_f32(r, vmulq_f32(va, vs)); return; #endif // torque_c fallback r[0] = a[0] * s; r[1] = a[1] * s; r[2] = a[2] * s; r[3] = a[3] * s; } void math_backend::float4::div(const float* a, const float* b, float* r) { } void math_backend::float4::div_scalar(const float* a, float s, float* r) { #if TORQUE_MATH_x64 switch (gISA) { case ISA::AVX2: case ISA::AVX: case ISA::SSE41: case ISA::SSE2: { __m128 va = _mm_loadu_ps(a); // compute reciprocal once __m128 vs = _mm_set1_ps(s); __m128 recip = _mm_rcp_ps(vs); // Newton–Raphson 

//------------------------------------------------------------------------------
// SINGLE POINT OPERATIONS
//------------------------------------------------------------------------------

void math_backend::float4::add(const float* a, const float* b, float* r)
{
#if TORQUE_MATH_x64
   switch (gISA)
   {
      case ISA::AVX2:
      case ISA::AVX:
      case ISA::SSE41:
      case ISA::SSE2:
      {
         __m128 va = _mm_loadu_ps(a);
         __m128 vb = _mm_loadu_ps(b);
         _mm_storeu_ps(r, _mm_add_ps(va, vb));
         return;
      }
      default:
         break;
   }
#elif TORQUE_MATH_ARM
   vst1q_f32(r, vaddq_f32(vld1q_f32(a), vld1q_f32(b)));
   return;
#endif

   // torque_c fallback
   r[0] = a[0] + b[0];
   r[1] = a[1] + b[1];
   r[2] = a[2] + b[2];
   r[3] = a[3] + b[3];
}

void math_backend::float4::sub(const float* a, const float* b, float* r)
{
#if TORQUE_MATH_x64
   switch (gISA)
   {
      case ISA::AVX2:
      case ISA::AVX:
      case ISA::SSE41:
      case ISA::SSE2:
      {
         __m128 va = _mm_loadu_ps(a);
         __m128 vb = _mm_loadu_ps(b);
         _mm_storeu_ps(r, _mm_sub_ps(va, vb));
         return;
      }
      default:
         break;
   }
#elif TORQUE_MATH_ARM
   vst1q_f32(r, vsubq_f32(vld1q_f32(a), vld1q_f32(b)));
   return;
#endif

   // torque_c fallback
   r[0] = a[0] - b[0];
   r[1] = a[1] - b[1];
   r[2] = a[2] - b[2];
   r[3] = a[3] - b[3];
}

void math_backend::float4::mul(const float* a, const float* b, float* r)
{
#if TORQUE_MATH_x64
   switch (gISA)
   {
      case ISA::AVX2:
      case ISA::AVX:
      case ISA::SSE41:
      case ISA::SSE2:
      {
         __m128 va = _mm_loadu_ps(a);
         __m128 vb = _mm_loadu_ps(b);
         _mm_storeu_ps(r, _mm_mul_ps(va, vb));
         return;
      }
      default:
         break;
   }
#elif TORQUE_MATH_ARM
   vst1q_f32(r, vmulq_f32(vld1q_f32(a), vld1q_f32(b)));
   return;
#endif

   // torque_c fallback
   r[0] = a[0] * b[0];
   r[1] = a[1] * b[1];
   r[2] = a[2] * b[2];
   r[3] = a[3] * b[3];
}

void math_backend::float4::mul_scalar(const float* a, float s, float* r)
{
#if TORQUE_MATH_x64
   switch (gISA)
   {
      case ISA::AVX2:
      case ISA::AVX:
      case ISA::SSE41:
      case ISA::SSE2:
      {
         __m128 va = _mm_loadu_ps(a);
         __m128 vs = _mm_set1_ps(s); // broadcast scalar -> [s s s s]
         _mm_storeu_ps(r, _mm_mul_ps(va, vs));
         return;
      }
      default:
         break;
   }
#elif TORQUE_MATH_ARM
   float32x4_t va = vld1q_f32(a);
   float32x4_t vs = vdupq_n_f32(s);
   vst1q_f32(r, vmulq_f32(va, vs));
   return;
#endif

   // torque_c fallback
   r[0] = a[0] * s;
   r[1] = a[1] * s;
   r[2] = a[2] * s;
   r[3] = a[3] * s;
}

void math_backend::float4::div(const float* a, const float* b, float* r)
{
   // Componentwise division (body follows the dispatch pattern of the ops above).
#if TORQUE_MATH_x64
   switch (gISA)
   {
      case ISA::AVX2:
      case ISA::AVX:
      case ISA::SSE41:
      case ISA::SSE2:
      {
         __m128 va = _mm_loadu_ps(a);
         __m128 vb = _mm_loadu_ps(b);
         _mm_storeu_ps(r, _mm_div_ps(va, vb));
         return;
      }
      default:
         break;
   }
#elif TORQUE_MATH_ARM
   // vdivq_f32 is AArch64-only, which is all TORQUE_MATH_ARM covers here.
   vst1q_f32(r, vdivq_f32(vld1q_f32(a), vld1q_f32(b)));
   return;
#endif

   // torque_c fallback
   r[0] = a[0] / b[0];
   r[1] = a[1] / b[1];
   r[2] = a[2] / b[2];
   r[3] = a[3] / b[3];
}

void math_backend::float4::div_scalar(const float* a, float s, float* r)
{
#if TORQUE_MATH_x64
   switch (gISA)
   {
      case ISA::AVX2:
      case ISA::AVX:
      case ISA::SSE41:
      case ISA::SSE2:
      {
         __m128 va = _mm_loadu_ps(a);

         // Compute the reciprocal once, then refine it with one Newton-Raphson
         // step, recip' = recip * (2 - s * recip); _mm_rcp_ps alone is only
         // accurate to about 12 bits, so the refinement matters for accuracy.
         __m128 vs    = _mm_set1_ps(s);
         __m128 recip = _mm_rcp_ps(vs);
         recip = _mm_mul_ps(recip, _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(vs, recip)));

         _mm_storeu_ps(r, _mm_mul_ps(va, recip));
         return;
      }
      default:
         break;
   }
#elif TORQUE_MATH_ARM
   float32x4_t va    = vld1q_f32(a);
   float32x4_t vs    = vdupq_n_f32(s);
   float32x4_t recip = vrecpeq_f32(vs);
   // One Newton-Raphson step; vrecpsq_f32 computes (2 - s * recip).
   recip = vmulq_f32(vrecpsq_f32(vs, recip), recip);
   vst1q_f32(r, vmulq_f32(va, recip));
   return;
#endif

   // torque_c fallback
   float inv = 1.0f / s;
   r[0] = a[0] * inv;
   r[1] = a[1] * inv;
   r[2] = a[2] * inv;
   r[3] = a[3] * inv;
}

float math_backend::float4::dot(const float* a, const float* b)
{
#if TORQUE_MATH_x64
   switch (gISA)
   {
      // ---- SSE4.1+ : dedicated dot-product instruction ----
      case ISA::AVX2:
      case ISA::AVX:
      case ISA::SSE41:
#ifdef TORQUE_HAVE_SSE4_1 // Compile-time guard: _mm_dp_ps only exists when built with SSE4.1; otherwise fall through to SSE2.
      {
         __m128 va = _mm_loadu_ps(a);
         __m128 vb = _mm_loadu_ps(b);
         // Mask 0xFF: multiply all four lanes, broadcast the sum to every lane.
         return _mm_cvtss_f32(_mm_dp_ps(va, vb, 0xFF));
      }
#endif
      // ---- SSE2 fallback (no horizontal ops available) ----
      case ISA::SSE2:
      {
         __m128 va   = _mm_loadu_ps(a);
         __m128 vb   = _mm_loadu_ps(b);
         __m128 mul  = _mm_mul_ps(va, vb);                                // {m0, m1, m2, m3}
         __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1)); // {m1, m0, m3, m2}
         __m128 sums = _mm_add_ps(mul, shuf);                             // {m0+m1, ., m2+m3, .}
         shuf = _mm_movehl_ps(shuf, sums);                                // bring m2+m3 down to lane 0
         sums = _mm_add_ss(sums, shuf);                                   // lane 0 = m0+m1+m2+m3
         return _mm_cvtss_f32(sums);
      }
      default:
         break;
   }
#elif TORQUE_MATH_ARM
   // ---- NEON: pairwise adds reduce the four products to one sum ----
   float32x4_t va    = vld1q_f32(a);
   float32x4_t vb    = vld1q_f32(b);
   float32x4_t vmul  = vmulq_f32(va, vb);
   float32x2_t sum2  = vpadd_f32(vget_low_f32(vmul), vget_high_f32(vmul));
   float32x2_t total = vpadd_f32(sum2, sum2);
   return vget_lane_f32(total, 0);
#endif

   return a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
}

void math_backend::float3::add(const float* a, const float* b, float* r)
{
   // Pad 3 floats into 4 for SIMD
   float va[4] = { a[0], a[1], a[2], 0.0f };
   float vb[4] = { b[0], b[1], b[2], 0.0f };
   float vr[4];
   float4::add(va, vb, vr);
   r[0] = vr[0];
   r[1] = vr[1];
   r[2] = vr[2];
}

void math_backend::float3::sub(const float* a, const float* b, float* r)
{
   // Pad 3 floats into 4 for SIMD
   float va[4] = { a[0], a[1], a[2], 0.0f };
   float vb[4] = { b[0], b[1], b[2], 0.0f };
   float vr[4];
   float4::sub(va, vb, vr);
   r[0] = vr[0];
   r[1] = vr[1];
   r[2] = vr[2];
}

void math_backend::float3::mul(const float* a, const float* b, float* r)
{
   // Pad 3 floats into 4 for SIMD
   float va[4] = { a[0], a[1], a[2], 0.0f };
   float vb[4] = { b[0], b[1], b[2], 0.0f };
   float vr[4];
   float4::mul(va, vb, vr);
   r[0] = vr[0];
   r[1] = vr[1];
   r[2] = vr[2];
}

void math_backend::float3::mul_scalar(const float* a, float s, float* r)
{
   float va[4] = { a[0], a[1], a[2], 0.0f };
   float vr[4];
   float4::mul_scalar(va, s, vr);
   r[0] = vr[0];
   r[1] = vr[1];
   r[2] = vr[2];
}

float math_backend::float3::dot(const float* a, const float* b)
{
   float va[4] = { a[0], a[1], a[2], 0.0f };
   float vb[4] = { b[0], b[1], b[2], 0.0f };
   return float4::dot(va, vb);
}
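
// Note on the float3 wrappers above: padding the w lane with 0.0f is safe for
// add/sub/mul/dot because lane 3 then contributes exactly zero, e.g.
//    dot4({ax,ay,az,0}, {bx,by,bz,0}) = ax*bx + ay*by + az*bz + 0*0.
// A float3 division built the same way would be the one exception, since its
// pad lane would compute 0/0 = NaN; a sketch of the workaround (hypothetical,
// no such function exists in this file) pads the divisor with 1.0f instead:
//    float vb[4] = { b[0], b[1], b[2], 1.0f }; // avoids 0/0 in the dead lane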

void math_backend::float_mat4x4::multiply_mat4(const float* A, const float* B, float* R)
{
#if TORQUE_MATH_x64
   // Row i of R is a linear combination of B's rows, weighted by row i of A.
   __m128 b0 = _mm_loadu_ps(B + 0);
   __m128 b1 = _mm_loadu_ps(B + 4);
   __m128 b2 = _mm_loadu_ps(B + 8);
   __m128 b3 = _mm_loadu_ps(B + 12);
   for (int i = 0; i < 4; i++)
   {
      __m128 a = _mm_loadu_ps(A + i * 4);
      __m128 r = _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), b0);
      r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)), b1));
      r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)), b2));
      r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)), b3));
      _mm_storeu_ps(R + i * 4, r);
   }
   return;
#elif TORQUE_MATH_ARM
   // NEON sketch following the same pattern (vmlaq_n_f32 = acc + vec * scalar).
   float32x4_t b0 = vld1q_f32(B + 0);
   float32x4_t b1 = vld1q_f32(B + 4);
   float32x4_t b2 = vld1q_f32(B + 8);
   float32x4_t b3 = vld1q_f32(B + 12);
   for (int i = 0; i < 4; i++)
   {
      float32x4_t r = vmulq_n_f32(b0, A[i * 4 + 0]);
      r = vmlaq_n_f32(r, b1, A[i * 4 + 1]);
      r = vmlaq_n_f32(r, b2, A[i * 4 + 2]);
      r = vmlaq_n_f32(r, b3, A[i * 4 + 3]);
      vst1q_f32(R + i * 4, r);
   }
   return;
#endif

   // torque_c fallback
   for (int i = 0; i < 4; i++)
      for (int j = 0; j < 4; j++)
         R[i * 4 + j] = A[i * 4 + 0] * B[0  + j]
                      + A[i * 4 + 1] * B[4  + j]
                      + A[i * 4 + 2] * B[8  + j]
                      + A[i * 4 + 3] * B[12 + j];
}

void math_backend::float_mat4x4::multiply_float4(const float* M, const float* P, float* R)
{
#if TORQUE_MATH_x64
   switch (gISA)
   {
      case ISA::AVX2:
      case ISA::AVX:
      case ISA::SSE41:
#ifdef TORQUE_HAVE_SSE4_1 // Compile-time guard: _mm_dp_ps needs SSE4.1 codegen; otherwise fall through to SSE2.
      {
         __m128 p  = _mm_loadu_ps(P);
         // Each dp computes one row dot product and places it in a distinct lane.
         __m128 r0 = _mm_dp_ps(_mm_loadu_ps(M + 0),  p, 0xF1);
         __m128 r1 = _mm_dp_ps(_mm_loadu_ps(M + 4),  p, 0xF2);
         __m128 r2 = _mm_dp_ps(_mm_loadu_ps(M + 8),  p, 0xF4);
         __m128 r3 = _mm_dp_ps(_mm_loadu_ps(M + 12), p, 0xF8);
         _mm_storeu_ps(R, _mm_or_ps(_mm_or_ps(r0, r1), _mm_or_ps(r2, r3)));
         return;
      }
#endif
      case ISA::SSE2:
      {
         // SSE2 has no dot-product instruction: transpose M so its columns sit
         // in registers, then R = col0*P.x + col1*P.y + col2*P.z + col3*P.w,
         // which yields R[i] = row_i . P, matching the scalar fallback below.
         __m128 m0 = _mm_loadu_ps(M + 0);
         __m128 m1 = _mm_loadu_ps(M + 4);
         __m128 m2 = _mm_loadu_ps(M + 8);
         __m128 m3 = _mm_loadu_ps(M + 12);
         _MM_TRANSPOSE4_PS(m0, m1, m2, m3); // m0..m3 now hold M's columns

         __m128 p = _mm_loadu_ps(P);
         __m128 r = _mm_mul_ps(m0, _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)));
         r = _mm_add_ps(r, _mm_mul_ps(m1, _mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1))));
         r = _mm_add_ps(r, _mm_mul_ps(m2, _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2))));
         r = _mm_add_ps(r, _mm_mul_ps(m3, _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3))));
         _mm_storeu_ps(R, r);
         return;
      }
      default:
         break;
   }
#elif TORQUE_MATH_ARM
   // One horizontal add per row; vaddvq_f32 is available on all AArch64 targets.
   float32x4_t p = vld1q_f32(P);
   R[0] = vaddvq_f32(vmulq_f32(vld1q_f32(M + 0),  p));
   R[1] = vaddvq_f32(vmulq_f32(vld1q_f32(M + 4),  p));
   R[2] = vaddvq_f32(vmulq_f32(vld1q_f32(M + 8),  p));
   R[3] = vaddvq_f32(vmulq_f32(vld1q_f32(M + 12), p));
   return;
#endif

   // torque_c fallback
   R[0] = M[0]  * P[0] + M[1]  * P[1] + M[2]  * P[2] + M[3]  * P[3];
   R[1] = M[4]  * P[0] + M[5]  * P[1] + M[6]  * P[2] + M[7]  * P[3];
   R[2] = M[8]  * P[0] + M[9]  * P[1] + M[10] * P[2] + M[11] * P[3];
   R[3] = M[12] * P[0] + M[13] * P[1] + M[14] * P[2] + M[15] * P[3];
}

void math_backend::float_mat4x4::scale(float* M, const float* S)
{
#if TORQUE_MATH_x64 || TORQUE_MATH_ARM
   float scalef[4] = { S[0], S[1], S[2], 1.0f }; // w column scales by 1 and is preserved
#endif

#if TORQUE_MATH_x64
   __m128 scale = _mm_loadu_ps(scalef); // S = {x, y, z, 1}
   __m128 row0 = _mm_loadu_ps(M + 0);
   __m128 row1 = _mm_loadu_ps(M + 4);
   __m128 row2 = _mm_loadu_ps(M + 8);
   __m128 row3 = _mm_loadu_ps(M + 12);
   row0 = _mm_mul_ps(row0, scale);
   row1 = _mm_mul_ps(row1, scale);
   row2 = _mm_mul_ps(row2, scale);
   row3 = _mm_mul_ps(row3, scale);
   _mm_storeu_ps(M + 0,  row0);
   _mm_storeu_ps(M + 4,  row1);
   _mm_storeu_ps(M + 8,  row2);
   _mm_storeu_ps(M + 12, row3);
   return;
#elif TORQUE_MATH_ARM
   float32x4_t scale = vld1q_f32(scalef);
   vst1q_f32(M + 0,  vmulq_f32(vld1q_f32(M + 0),  scale));
   vst1q_f32(M + 4,  vmulq_f32(vld1q_f32(M + 4),  scale));
   vst1q_f32(M + 8,  vmulq_f32(vld1q_f32(M + 8),  scale));
   vst1q_f32(M + 12, vmulq_f32(vld1q_f32(M + 12), scale));
   return;
#endif

   // torque_c fallback
   M[0]  *= S[0]; M[1]  *= S[1]; M[2]  *= S[2];
   M[4]  *= S[0]; M[5]  *= S[1]; M[6]  *= S[2];
   M[8]  *= S[0]; M[9]  *= S[1]; M[10] *= S[2];
   M[12] *= S[0]; M[13] *= S[1]; M[14] *= S[2];
}
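
// Layout convention made explicit (an assumption inferred from the scalar
// fallbacks above): matrices are 16 contiguous row-major floats, row i at
// M[i*4 + 0..3], and multiply_float4 computes R[i] = row_i . P. A minimal
// sanity check against that convention (hypothetical test code, not part of
// this file):
//
//    float I[16] = { 1,0,0,0,  0,1,0,0,  0,0,1,0,  0,0,0,1 };
//    float P[4]  = { 2.0f, 3.0f, 4.0f, 1.0f };
//    float R[4];
//    math_backend::float_mat4x4::multiply_float4(I, P, R);
//    // R must equal P for the identity matrix on every code path.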