// Torque3D/Engine/source/math/simd/mMath_SSE41.cpp
// (repository viewer metadata: 108 lines, 3 KiB, C++)

#include "platform/platform.h"
#include "math/mMath.h"
#include "math/util/frustum.h"
#include <math.h> // Caution!!! Possible platform specific include
#include "math/mMathFn.h"
//################################################################
// SSE4.1 Functions
//################################################################
#include <smmintrin.h> // SSE4.1
// Normalizes the three floats at p in place.
// Each component is multiplied by 1 / sqrt(x*x + y*y + z*z). When that squared
// length compares equal to zero the vector is replaced with (0, 0, 1) instead.
static void m_point3F_normalize_sse41(float* p)
{
   // Lane layout is [x, y, z, 0]; the fourth lane is zero padding.
   const __m128 vec = _mm_setr_ps(p[0], p[1], p[2], 0.0f);

   // Mask 0x71: multiply lanes 0-2 only and deliver the sum in lane 0.
   const float lengthSquared = _mm_cvtss_f32(_mm_dp_ps(vec, vec, 0x71));

   if (lengthSquared == 0.0f)
   {
      // Degenerate input: fall back to the +Z unit vector.
      p[0] = 0.0f;
      p[1] = 0.0f;
      p[2] = 1.0f;
      return;
   }

   // Scale all lanes at once, then copy back only the three we own.
   const float reciprocal = 1.0f / sqrtf(lengthSquared);
   float lanes[4];
   _mm_storeu_ps(lanes, _mm_mul_ps(vec, _mm_set1_ps(reciprocal)));

   p[0] = lanes[0];
   p[1] = lanes[1];
   p[2] = lanes[2];
}
// Rescales the three floats at p in place.
// Each component is multiplied by val / sqrt(x*x + y*y + z*z). When that
// squared length compares equal to zero the vector is replaced with (0, 0, 1);
// note that val is not applied to this fallback.
static void m_point3F_normalize_f_sse41(float* p, float val)
{
   // Lane layout is [x, y, z, 0]; the fourth lane is zero padding.
   const __m128 vec = _mm_setr_ps(p[0], p[1], p[2], 0.0f);

   // Mask 0x71: multiply lanes 0-2 only and deliver the sum in lane 0.
   const float lengthSquared = _mm_cvtss_f32(_mm_dp_ps(vec, vec, 0x71));

   if (lengthSquared == 0.0f)
   {
      // Degenerate input: fall back to the +Z unit vector.
      p[0] = 0.0f;
      p[1] = 0.0f;
      p[2] = 1.0f;
      return;
   }

   // Scale all lanes at once, then copy back only the three we own.
   const float factor = val / sqrtf(lengthSquared);
   float lanes[4];
   _mm_storeu_ps(lanes, _mm_mul_ps(vec, _mm_set1_ps(factor)));

   p[0] = lanes[0];
   p[1] = lanes[1];
   p[2] = lanes[2];
}
// Multiplies a 4x4 matrix by a 4-component point: out = m * p.
//   m   - 16 floats read as four consecutive rows of four.
//   p   - 4 floats.
//   out - receives 4 floats; out[i] is the dot product of row i of m with p.
// Every input is loaded before out is written, so out may be the same
// storage as p.
void matF_x_point4F_sse41(const float* m, const float* p, float* out)
{
   const __m128 point = _mm_loadu_ps(p);

   // The low nibble of the _mm_dp_ps mask selects which lane receives the
   // sum; all other lanes are zeroed. Each row's result therefore lands in
   // its own lane (0x1, 0x2, 0x4, 0x8). Reading lane 0 of the 0xF2/0xF4/0xF8
   // results would always yield 0, so the lanes are merged instead.
   const __m128 x = _mm_dp_ps(_mm_loadu_ps(m + 0), point, 0xF1);
   const __m128 y = _mm_dp_ps(_mm_loadu_ps(m + 4), point, 0xF2);
   const __m128 z = _mm_dp_ps(_mm_loadu_ps(m + 8), point, 0xF4);
   const __m128 w = _mm_dp_ps(_mm_loadu_ps(m + 12), point, 0xF8);

   // Each vector is zero outside its own lane, so a bitwise OR assembles
   // [x, y, z, w] without shuffles.
   _mm_storeu_ps(out, _mm_or_ps(_mm_or_ps(x, y), _mm_or_ps(z, w)));
}
// Multiplies two 4x4 matrices stored as 16 consecutive floats each:
//   R[i*4 + j] = sum over k of A[i*4 + k] * B[k*4 + j]
// B is read in full before anything is written to R, and each row of A is
// loaded before the matching row of R is written.
static void m_matF_x_matF_sse41(const float* A, const float* B, float* R)
{
   // Gather B's columns once so every output element is a single dot product.
   __m128 bColumns[4];
   for (int c = 0; c < 4; ++c)
   {
      bColumns[c] = _mm_setr_ps(B[c], B[c + 4], B[c + 8], B[c + 12]);
   }

   for (int r = 0; r < 4; ++r)
   {
      const __m128 aRow = _mm_loadu_ps(A + 4 * r);
      float* dst = R + 4 * r;

      for (int c = 0; c < 4; ++c)
      {
         // Mask 0xF1: use all four lanes, deliver the sum in lane 0.
         dst[c] = _mm_cvtss_f32(_mm_dp_ps(aRow, bColumns[c], 0xF1));
      }
   }
}
// Points the math library's function pointers at the SSE4.1 implementations
// defined in this file.
void mInstallLibrary_SSE41()
{
   // Matrix routines. The same unaligned-load implementation is installed
   // for both the regular and the "aligned" matrix multiply entry points.
   m_matF_x_matF = m_matF_x_matF_sse41;
   m_matF_x_matF_aligned = m_matF_x_matF_sse41;
   m_matF_x_point4F = matF_x_point4F_sse41;

   // Point3F normalization routines.
   m_point3F_normalize = m_point3F_normalize_sse41;
   m_point3F_normalize_f = m_point3F_normalize_f_sse41;
}