#include "platform/platform.h"
#include "math/mMath.h"
#include "math/util/frustum.h"
#include <math.h> // Caution!!! Possible platform specific include
#include "math/mMathFn.h"

//################################################################
// SSE4.1 Functions
//################################################################
#include <smmintrin.h> // SSE4.1

// Note on the _mm_dp_ps immediate used throughout this file:
//   bits 7..4 choose which lanes take part in the multiply/sum,
//   bits 3..0 choose which output lanes receive the sum (all others
//   are written as 0.0f).
// A result that is read back with _mm_cvtss_f32 must therefore be routed to
// lane 0 (low nibble 0x1), otherwise the read yields 0.0f.

// Rescales the 3-component vector at p, in place, so that its length is
// `len`. A zero-length input has no direction to preserve, so it is replaced
// by [0, 0, len].
static void m_point3F_set_length_sse41(float* p, float len)
{
   // [x y z 0] - assembled lane by lane so only p[0..2] are read.
   __m128 v = _mm_set_ps(0.0f, p[2], p[1], p[0]);

   // 0x71: sum x*x + y*y + z*z, deliver the sum in lane 0.
   float lenSq = _mm_cvtss_f32(_mm_dp_ps(v, v, 0x71));

   if (lenSq != 0.0f)
   {
      v = _mm_mul_ps(v, _mm_set1_ps(len / sqrtf(lenSq)));
   }
   else
   {
      // _mm_set_ps takes lanes high-to-low, so this is [0, 0, len, 0].
      v = _mm_set_ps(0.0f, len, 0.0f, 0.0f);
   }

   // Write back exactly three floats; p[3] is never touched.
   p[0] = _mm_cvtss_f32(v);
   p[1] = _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)));
   p[2] = _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)));
}

// Normalizes the 3-component vector at p to unit length, in place.
// A zero-length input becomes [0, 0, 1].
static void m_point3F_normalize_sse41(float* p)
{
   m_point3F_set_length_sse41(p, 1.0f);
}

// Rescales the 3-component vector at p to length `val`, in place.
// A zero-length input becomes [0, 0, val], so the result has the requested
// length in every case (previously the fallback ignored `val`).
static void m_point3F_normalize_f_sse41(float* p, float val)
{
   m_point3F_set_length_sse41(p, val);
}

// Multiplies the 4x4 matrix m (16 floats, rows stored contiguously) by the
// 4-component point p: out[i] = dot(row i of m, p).
// Every input is loaded before out is written, so out may alias p or m.
void matF_x_point4F_sse41(const float* m, const float* p, float* out)
{
   __m128 point = _mm_loadu_ps(p);

   // Each dot product lands in its own lane (low nibble 1/2/4/8) with the
   // other lanes zeroed, so the four partial vectors can be OR-ed into one.
   // The earlier code used these masks but then read lane 0 of each result,
   // which left out[1..3] at 0.0f.
   __m128 x = _mm_dp_ps(_mm_loadu_ps(m + 0), point, 0xF1);
   __m128 y = _mm_dp_ps(_mm_loadu_ps(m + 4), point, 0xF2);
   __m128 z = _mm_dp_ps(_mm_loadu_ps(m + 8), point, 0xF4);
   __m128 w = _mm_dp_ps(_mm_loadu_ps(m + 12), point, 0xF8);

   _mm_storeu_ps(out, _mm_or_ps(_mm_or_ps(x, y), _mm_or_ps(z, w)));
}

// Computes R = A * B for 4x4 matrices stored as 16 floats with rows
// contiguous: R[i*4 + j] = dot(row i of A, column j of B).
// Only unaligned loads and stores are used, so no alignment is required.
static void m_matF_x_matF_sse41(const float* A, const float* B, float* R)
{
   // Gather the columns of B once, before anything is written to R.
   __m128 col0 = _mm_set_ps(B[12], B[8], B[4], B[0]);
   __m128 col1 = _mm_set_ps(B[13], B[9], B[5], B[1]);
   __m128 col2 = _mm_set_ps(B[14], B[10], B[6], B[2]);
   __m128 col3 = _mm_set_ps(B[15], B[11], B[7], B[3]);

   for (int i = 0; i < 4; i++)
   {
      // Row i of A is loaded before row i of R is stored.
      __m128 row = _mm_loadu_ps(A + i * 4);

      // One dot product per output lane; the unselected lanes are zero, so
      // OR-ing the four results packs the finished row.
      __m128 r0 = _mm_dp_ps(row, col0, 0xF1);
      __m128 r1 = _mm_dp_ps(row, col1, 0xF2);
      __m128 r2 = _mm_dp_ps(row, col2, 0xF4);
      __m128 r3 = _mm_dp_ps(row, col3, 0xF8);

      _mm_storeu_ps(R + i * 4, _mm_or_ps(_mm_or_ps(r0, r1), _mm_or_ps(r2, r3)));
   }
}

// Points the math library's function-pointer slots at the SSE4.1 routines
// defined in this file.
void mInstallLibrary_SSE41()
{
   m_point3F_normalize = m_point3F_normalize_sse41;
   m_point3F_normalize_f = m_point3F_normalize_f_sse41;
   m_matF_x_point4F = matF_x_point4F_sse41;
   m_matF_x_matF = m_matF_x_matF_sse41;
   // The same routine serves the aligned slot: it makes no alignment
   // assumptions, so aligned input is handled as well.
   m_matF_x_matF_aligned = m_matF_x_matF_sse41;
}