mirror of
https://github.com/TorqueGameEngines/Torque3D.git
synced 2026-03-21 05:10:53 +00:00
basic simd math function overrides
Begin overriding the math functions with SSE2, SSE4.1, and AVX2 implementations.
This commit is contained in:
parent
2b375bfea4
commit
a7d92c344d
5 changed files with 375 additions and 14 deletions
107
Engine/source/math/simd/mMath_SSE41.cpp
Normal file
107
Engine/source/math/simd/mMath_SSE41.cpp
Normal file
|
|
@ -0,0 +1,107 @@
|
|||
#include "platform/platform.h"
|
||||
#include "math/mMath.h"
|
||||
#include "math/util/frustum.h"
|
||||
#include <math.h> // Caution!!! Possible platform specific include
|
||||
#include "math/mMathFn.h"
|
||||
|
||||
|
||||
//################################################################
|
||||
// SSE4.1 Functions
|
||||
//################################################################
|
||||
#include <smmintrin.h> // SSE4.1
|
||||
|
||||
/// Normalizes the three-component vector at p in place (SSE4.1 path).
///
/// A vector whose squared length compares equal to zero cannot be normalized
/// and is replaced by the unit Z axis (0, 0, 1).
///
/// @param p  Three contiguous floats (x, y, z); read, then overwritten.
static void m_point3F_normalize_sse41(float* p)
{
   // Pack as [x y z 0]; the zero w lane never contributes to the length.
   const __m128 xyz = _mm_set_ps(0.0f, p[2], p[1], p[0]);

   // Mask 0x71: multiply lanes x/y/z, write the sum into lane 0 only.
   const float lengthSquared = _mm_cvtss_f32(_mm_dp_ps(xyz, xyz, 0x71));

   if (lengthSquared == 0.0f)
   {
      // Degenerate input: fall back to [0, 0, 1].
      p[0] = 0.0f;
      p[1] = 0.0f;
      p[2] = 1.0f;
      return;
   }

   const float reciprocal = 1.0f / sqrtf(lengthSquared);

   // Scale all lanes at once, then copy out the three we care about.
   float lanes[4];
   _mm_storeu_ps(lanes, _mm_mul_ps(xyz, _mm_set1_ps(reciprocal)));

   p[0] = lanes[0];
   p[1] = lanes[1];
   p[2] = lanes[2];
}
|
||||
|
||||
/// Rescales the three-component vector at p in place so that its length
/// becomes val (SSE4.1 path).
///
/// A vector whose squared length compares equal to zero has no direction, so
/// it is replaced by the Z axis scaled to the requested length: (0, 0, val).
/// This keeps the result length consistent with the non-degenerate branch.
///
/// @param p    Three contiguous floats (x, y, z); read, then overwritten.
/// @param val  Desired length of the resulting vector.
static void m_point3F_normalize_f_sse41(float* p, float val)
{
   // [x y z 0]
   __m128 v = _mm_set_ps(0.0f, p[2], p[1], p[0]);

   // dot = x*x + y*y + z*z
   // Mask 0x71: multiply lanes x/y/z, write the sum into lane 0 only.
   __m128 dot = _mm_dp_ps(v, v, 0x71);

   float lenSq = _mm_cvtss_f32(dot);

   if (lenSq != 0.0f)
   {
      // Fold the target length into the reciprocal so one multiply suffices.
      float scaleFactor = val / sqrtf(lenSq);
      v = _mm_mul_ps(v, _mm_set1_ps(scaleFactor));
   }
   else
   {
      // fallback [0,0,val] -- length val, not a unit vector
      v = _mm_set_ps(0.0f, val, 0.0f, 0.0f);
   }

   p[0] = _mm_cvtss_f32(v);
   p[1] = _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)));
   p[2] = _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)));
}
|
||||
|
||||
/// Multiplies a 4x4 row-major matrix by a 4-component point (SSE4.1 path).
///
/// out[i] = dot(row i of m, p) for i in 0..3.
///
/// All inputs are loaded before out is written, so out may alias p.
///
/// @param m    16 contiguous floats, four rows of four.
/// @param p    4 contiguous floats (x, y, z, w).
/// @param out  4 contiguous floats receiving the product.
void matF_x_point4F_sse41(const float* m, const float* p, float* out)
{
   const __m128 point = _mm_loadu_ps(p);

   const __m128 r0 = _mm_loadu_ps(m + 0);
   const __m128 r1 = _mm_loadu_ps(m + 4);
   const __m128 r2 = _mm_loadu_ps(m + 8);
   const __m128 r3 = _mm_loadu_ps(m + 12);

   // The low nibble of the mask selects the destination lane; every other
   // lane is zeroed. Reading lane 0 of a result written to lane 1/2/3 would
   // always give 0, so route each dot product to its own lane and merge.
   const __m128 d0 = _mm_dp_ps(r0, point, 0xF1); // -> lane 0
   const __m128 d1 = _mm_dp_ps(r1, point, 0xF2); // -> lane 1
   const __m128 d2 = _mm_dp_ps(r2, point, 0xF4); // -> lane 2
   const __m128 d3 = _mm_dp_ps(r3, point, 0xF8); // -> lane 3

   // The lanes are disjoint and the rest are +0.0 (all-zero bits), so a
   // bitwise OR assembles the full result vector.
   const __m128 result = _mm_or_ps(_mm_or_ps(d0, d1), _mm_or_ps(d2, d3));

   _mm_storeu_ps(out, result);
}
|
||||
|
||||
/// Computes the 4x4 row-major matrix product R = A * B (SSE4.1 path).
///
/// R[i][j] is the dot product of row i of A with column j of B. Only
/// unaligned loads are used, so no alignment is required of any pointer.
///
/// @param A  Left operand, 16 contiguous floats.
/// @param B  Right operand, 16 contiguous floats.
/// @param R  Destination, 16 contiguous floats.
static void m_matF_x_matF_sse41(const float* A, const float* B, float* R)
{
   // Load the rows of B, then transpose in registers so that each register
   // ends up holding one column of B.
   __m128 bCol0 = _mm_loadu_ps(B + 0);
   __m128 bCol1 = _mm_loadu_ps(B + 4);
   __m128 bCol2 = _mm_loadu_ps(B + 8);
   __m128 bCol3 = _mm_loadu_ps(B + 12);
   _MM_TRANSPOSE4_PS(bCol0, bCol1, bCol2, bCol3);

   const float* aRow = A;
   float* rRow = R;

   for (int r = 0; r < 4; ++r, aRow += 4, rRow += 4)
   {
      // The row is fetched before any element of the matching output row is
      // written.
      const __m128 lhs = _mm_loadu_ps(aRow);

      // Mask 0xF1: multiply all four lanes, deliver the sum in lane 0.
      rRow[0] = _mm_cvtss_f32(_mm_dp_ps(lhs, bCol0, 0xF1));
      rRow[1] = _mm_cvtss_f32(_mm_dp_ps(lhs, bCol1, 0xF1));
      rRow[2] = _mm_cvtss_f32(_mm_dp_ps(lhs, bCol2, 0xF1));
      rRow[3] = _mm_cvtss_f32(_mm_dp_ps(lhs, bCol3, 0xF1));
   }
}
|
||||
|
||||
void mInstallLibrary_SSE41()
|
||||
{
|
||||
m_point3F_normalize = m_point3F_normalize_sse41;
|
||||
m_point3F_normalize_f = m_point3F_normalize_f_sse41;
|
||||
m_matF_x_point4F = matF_x_point4F_sse41;
|
||||
m_matF_x_matF = m_matF_x_matF_sse41;
|
||||
m_matF_x_matF_aligned = m_matF_x_matF_sse41;
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue