mirror of
https://github.com/TorqueGameEngines/Torque3D.git
synced 2026-03-20 12:50:57 +00:00
beginning the implementation of overriding the math functions with sse2 sse41 and avx2 functions
142 lines
4.1 KiB
C++
142 lines
4.1 KiB
C++
#include "platform/platform.h"
|
|
#include "math/mMath.h"
|
|
#include "math/util/frustum.h"
|
|
#include <math.h> // Caution!!! Possible platform specific include
|
|
#include "math/mMathFn.h"
|
|
|
|
//################################################################
|
|
// SSE2 Functions - minimum baseline
|
|
//################################################################
|
|
#include <emmintrin.h>
|
|
|
|
/// Normalizes the 3-component vector stored at @p p in place (SSE2 path).
///
/// @param p  Pointer to three contiguous floats (x, y, z). All three are read
///           and then overwritten; nothing beyond p[2] is accessed.
///
/// When the squared length evaluates to exactly zero the vector is replaced
/// with (0, 0, 1) instead of dividing by zero.
static void m_point3F_normalize_sse2(float* p)
{
   // Lanes 0..2 carry x, y, z. Lane 3 is forced to zero so it contributes
   // nothing to the horizontal sum below.
   const __m128 vec = _mm_set_ps(0.0f, p[2], p[1], p[0]);

   // Horizontal sum of squares. Rotating the squares by one lane and adding
   // leaves x*x in lane 0 and z*z + y*y in lane 2; movehl then brings lane 2
   // down so one scalar add yields x*x + y*y + z*z in lane 0.
   const __m128 sq = _mm_mul_ps(vec, vec);
   __m128 sum = _mm_add_ps(sq, _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(2, 1, 0, 3)));
   sum = _mm_add_ss(sum, _mm_movehl_ps(sum, sum));

   if (_mm_cvtss_f32(sum) == 0.0f)
   {
      // Zero-length fallback: unit vector along +Z.
      p[0] = 0.0f;
      p[1] = 0.0f;
      p[2] = 1.0f;
      return;
   }

   // Exact (not approximated) 1/sqrt. The root is taken with the SSE scalar
   // square-root instruction, so this function does not depend on std::sqrt
   // being declared by <math.h>.
   const float invLength = 1.0f / _mm_cvtss_f32(_mm_sqrt_ss(sum));
   const __m128 scaled = _mm_mul_ps(vec, _mm_set1_ps(invLength));

   // Spill to a local buffer and copy back only x, y, z so that the caller's
   // memory past p[2] is never written.
   float lanes[4];
   _mm_storeu_ps(lanes, scaled);
   p[0] = lanes[0];
   p[1] = lanes[1];
   p[2] = lanes[2];
}
|
|
|
|
/// Rescales the 3-component vector stored at @p p in place so that its length
/// becomes @p val (SSE2 path).
///
/// @param p    Pointer to three contiguous floats (x, y, z). All three are
///             read and then overwritten; nothing beyond p[2] is accessed.
/// @param val  Target length; each component is multiplied by val / length.
///
/// When the squared length evaluates to exactly zero the vector is replaced
/// with (0, 0, val) instead of dividing by zero.
static void m_point3F_normalize_f_sse2(float* p, float val)
{
   // Lanes 0..2 carry x, y, z. Lane 3 is forced to zero so it contributes
   // nothing to the horizontal sum below.
   const __m128 vec = _mm_set_ps(0.0f, p[2], p[1], p[0]);

   // Horizontal sum of squares. Rotating the squares by one lane and adding
   // leaves x*x in lane 0 and z*z + y*y in lane 2; movehl then brings lane 2
   // down so one scalar add yields x*x + y*y + z*z in lane 0.
   const __m128 sq = _mm_mul_ps(vec, vec);
   __m128 sum = _mm_add_ps(sq, _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(2, 1, 0, 3)));
   sum = _mm_add_ss(sum, _mm_movehl_ps(sum, sum));

   if (_mm_cvtss_f32(sum) == 0.0f)
   {
      // Zero-length fallback: a vector of length val along +Z
      // (not a unit vector unless val == 1).
      p[0] = 0.0f;
      p[1] = 0.0f;
      p[2] = val;
      return;
   }

   // Exact (not approximated) val/sqrt. The root is taken with the SSE scalar
   // square-root instruction, so this function does not depend on std::sqrt
   // being declared by <math.h>.
   const float factor = val / _mm_cvtss_f32(_mm_sqrt_ss(sum));
   const __m128 scaled = _mm_mul_ps(vec, _mm_set1_ps(factor));

   // Spill to a local buffer and copy back only x, y, z so that the caller's
   // memory past p[2] is never written.
   float lanes[4];
   _mm_storeu_ps(lanes, scaled);
   p[0] = lanes[0];
   p[1] = lanes[1];
   p[2] = lanes[2];
}
|
|
|
|
/// Multiplies a 4x4 float matrix by a 4-component vector (SSE2 path).
///
/// @param m    16 contiguous floats; row r occupies m[4*r .. 4*r+3].
/// @param p    4 contiguous floats, the vector to transform.
/// @param out  Receives 4 floats: out[r] is the dot product of row r with p.
///
/// Every input is loaded before the first element of @p out is written.
static void matF_x_point4F_sse2(const float* m, const float* p, float* out)
{
   const __m128 vec = _mm_loadu_ps(p);

   // Lane-wise products of each matrix row with the vector.
   __m128 products[4];
   for (int row = 0; row < 4; ++row)
   {
      products[row] = _mm_mul_ps(_mm_loadu_ps(m + row * 4), vec);
   }

   // Reduce each product register to a scalar as (v0 + v1) + (v2 + v3).
   for (int row = 0; row < 4; ++row)
   {
      const __m128 v = products[row];
      // Swap neighbours inside each pair: (v1, v0, v3, v2).
      const __m128 swapped = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));
      // Lane 0 = v0 + v1, lane 2 = v2 + v3.
      const __m128 pairSums = _mm_add_ps(v, swapped);
      // Bring lane 2 of pairSums down to lane 0.
      const __m128 upperPair = _mm_movehl_ps(swapped, pairSums);
      out[row] = _mm_cvtss_f32(_mm_add_ss(pairSums, upperPair));
   }
}
|
|
|
|
/// Computes the 4x4 matrix product R = A * B (SSE2 path).
///
/// @param A  Left operand, 16 contiguous floats; row r occupies A[4*r .. 4*r+3].
/// @param B  Right operand, same layout.
/// @param R  Receives the 16-float product, same layout.
///
/// Unaligned loads and stores are used throughout. All of B is loaded up
/// front; each row of A is loaded immediately before the matching row of R
/// is stored.
static void m_matF_x_matF_sse2(const float* A, const float* B, float* R)
{
   const __m128 bRows[4] = {
      _mm_loadu_ps(B),
      _mm_loadu_ps(B + 4),
      _mm_loadu_ps(B + 8),
      _mm_loadu_ps(B + 12)
   };

   for (int offset = 0; offset != 16; offset += 4)
   {
      const __m128 aRow = _mm_loadu_ps(A + offset);

      // Row of R = a.x * B[0] + a.y * B[1] + a.z * B[2] + a.w * B[3], with
      // each component of the A row broadcast across a full register and
      // the four terms summed pairwise.
      const __m128 termX = _mm_mul_ps(_mm_shuffle_ps(aRow, aRow, _MM_SHUFFLE(0, 0, 0, 0)), bRows[0]);
      const __m128 termY = _mm_mul_ps(_mm_shuffle_ps(aRow, aRow, _MM_SHUFFLE(1, 1, 1, 1)), bRows[1]);
      const __m128 termZ = _mm_mul_ps(_mm_shuffle_ps(aRow, aRow, _MM_SHUFFLE(2, 2, 2, 2)), bRows[2]);
      const __m128 termW = _mm_mul_ps(_mm_shuffle_ps(aRow, aRow, _MM_SHUFFLE(3, 3, 3, 3)), bRows[3]);

      const __m128 sumXY = _mm_add_ps(termX, termY);
      const __m128 sumZW = _mm_add_ps(termZ, termW);

      _mm_storeu_ps(R + offset, _mm_add_ps(sumXY, sumZW));
   }
}
|
|
|
|
void mInstallLibrary_SSE2()
|
|
{
|
|
m_point3F_normalize = m_point3F_normalize_sse2;
|
|
m_point3F_normalize_f = m_point3F_normalize_f_sse2;
|
|
m_matF_x_point4F = matF_x_point4F_sse2;
|
|
m_matF_x_matF = m_matF_x_matF_sse2;
|
|
m_matF_x_matF_aligned = m_matF_x_matF_sse2;
|
|
}
|