mirror of
https://github.com/TorqueGameEngines/Torque3D.git
synced 2026-03-20 12:50:57 +00:00
108 lines
3 KiB
C++
108 lines
3 KiB
C++
|
|
#include "platform/platform.h"
|
||
|
|
#include "math/mMath.h"
|
||
|
|
#include "math/util/frustum.h"
|
||
|
|
#include <math.h> // Caution!!! Possible platform specific include
|
||
|
|
#include "math/mMathFn.h"
|
||
|
|
|
||
|
|
|
||
|
|
//################################################################
|
||
|
|
// SSE4.1 Functions
|
||
|
|
//################################################################
|
||
|
|
#include <smmintrin.h> // SSE4.1
|
||
|
|
|
||
|
|
static void m_point3F_normalize_sse41(float* p)
|
||
|
|
{
|
||
|
|
// [x y z 0]
|
||
|
|
__m128 v = _mm_set_ps(0.0f, p[2], p[1], p[0]);
|
||
|
|
|
||
|
|
// dot = x*x + y*y + z*z
|
||
|
|
__m128 dot = _mm_dp_ps(v, v, 0x71); // xyz, result in x
|
||
|
|
|
||
|
|
float lenSq = _mm_cvtss_f32(dot);
|
||
|
|
|
||
|
|
if (lenSq != 0.0f)
|
||
|
|
{
|
||
|
|
float invLen = 1.0f / sqrtf(lenSq);
|
||
|
|
__m128 scale = _mm_set1_ps(invLen);
|
||
|
|
v = _mm_mul_ps(v, scale);
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
// fallback [0,0,1]
|
||
|
|
v = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
|
||
|
|
}
|
||
|
|
|
||
|
|
p[0] = _mm_cvtss_f32(v);
|
||
|
|
p[1] = _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)));
|
||
|
|
p[2] = _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)));
|
||
|
|
}
|
||
|
|
|
||
|
|
static void m_point3F_normalize_f_sse41(float* p, float val)
|
||
|
|
{
|
||
|
|
// [x y z 0]
|
||
|
|
__m128 v = _mm_set_ps(0.0f, p[2], p[1], p[0]);
|
||
|
|
|
||
|
|
// dot = x*x + y*y + z*z
|
||
|
|
__m128 dot = _mm_dp_ps(v, v, 0x71); // xyz, result in x
|
||
|
|
|
||
|
|
float lenSq = _mm_cvtss_f32(dot);
|
||
|
|
|
||
|
|
if (lenSq != 0.0f)
|
||
|
|
{
|
||
|
|
float invLen = val / sqrtf(lenSq);
|
||
|
|
__m128 scale = _mm_set1_ps(invLen);
|
||
|
|
v = _mm_mul_ps(v, scale);
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
// fallback [0,0,1]
|
||
|
|
v = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
|
||
|
|
}
|
||
|
|
|
||
|
|
p[0] = _mm_cvtss_f32(v);
|
||
|
|
p[1] = _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)));
|
||
|
|
p[2] = _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)));
|
||
|
|
}
|
||
|
|
|
||
|
|
void matF_x_point4F_sse41(const float* m, const float* p, float* out)
|
||
|
|
{
|
||
|
|
__m128 point = _mm_loadu_ps(p);
|
||
|
|
|
||
|
|
__m128 r0 = _mm_loadu_ps(m + 0);
|
||
|
|
__m128 r1 = _mm_loadu_ps(m + 4);
|
||
|
|
__m128 r2 = _mm_loadu_ps(m + 8);
|
||
|
|
__m128 r3 = _mm_loadu_ps(m + 12);
|
||
|
|
|
||
|
|
out[0] = _mm_cvtss_f32(_mm_dp_ps(r0, point, 0xF1));
|
||
|
|
out[1] = _mm_cvtss_f32(_mm_dp_ps(r1, point, 0xF2));
|
||
|
|
out[2] = _mm_cvtss_f32(_mm_dp_ps(r2, point, 0xF4));
|
||
|
|
out[3] = _mm_cvtss_f32(_mm_dp_ps(r3, point, 0xF8));
|
||
|
|
}
|
||
|
|
|
||
|
|
static void m_matF_x_matF_sse41(const float* A, const float* B, float* R)
|
||
|
|
{
|
||
|
|
__m128 col0 = _mm_set_ps(B[12], B[8], B[4], B[0]);
|
||
|
|
__m128 col1 = _mm_set_ps(B[13], B[9], B[5], B[1]);
|
||
|
|
__m128 col2 = _mm_set_ps(B[14], B[10], B[6], B[2]);
|
||
|
|
__m128 col3 = _mm_set_ps(B[15], B[11], B[7], B[3]);
|
||
|
|
|
||
|
|
for (int i = 0; i < 4; i++)
|
||
|
|
{
|
||
|
|
__m128 row = _mm_loadu_ps(A + i * 4);
|
||
|
|
|
||
|
|
R[i * 4 + 0] = _mm_cvtss_f32(_mm_dp_ps(row, col0, 0xF1));
|
||
|
|
R[i * 4 + 1] = _mm_cvtss_f32(_mm_dp_ps(row, col1, 0xF1));
|
||
|
|
R[i * 4 + 2] = _mm_cvtss_f32(_mm_dp_ps(row, col2, 0xF1));
|
||
|
|
R[i * 4 + 3] = _mm_cvtss_f32(_mm_dp_ps(row, col3, 0xF1));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
void mInstallLibrary_SSE41()
|
||
|
|
{
|
||
|
|
m_point3F_normalize = m_point3F_normalize_sse41;
|
||
|
|
m_point3F_normalize_f = m_point3F_normalize_f_sse41;
|
||
|
|
m_matF_x_point4F = matF_x_point4F_sse41;
|
||
|
|
m_matF_x_matF = m_matF_x_matF_sse41;
|
||
|
|
m_matF_x_matF_aligned = m_matF_x_matF_sse41;
|
||
|
|
}
|