#include "platform/platform.h"
#include "math/mMath.h"
#include "math/util/frustum.h"
// NOTE(review): the header names of the two angle-bracket includes were
// missing from the reviewed copy; <math.h> and <emmintrin.h> are restored
// here as the most likely originals -- confirm against upstream.
#include <math.h>   // Caution!!! Possible platform specific include
#include "math/mMathFn.h"

//################################################################
// SSE2 Functions - minimum baseline
//################################################################
#include <cmath>        // std::sqrt
#include <emmintrin.h>

/// Rescales the 3-component vector at @p p, in place, to length @p val.
///
/// @param p    Three consecutive floats (x, y, z); read, then overwritten.
/// @param val  Length the result should have.
///
/// When the squared length compares equal to zero there is no direction to
/// preserve, so the result is set to (0, 0, val).
static void m_point3F_normalize_f_sse2(float* p, float val)
{
    // Lanes 0..2 carry x, y, z. Lane 3 is zero so it adds nothing to the sum.
    const __m128 vec = _mm_set_ps(0.0f, p[2], p[1], p[0]);
    const __m128 sq = _mm_mul_ps(vec, vec);

    // Adding a one-lane rotation leaves x*x in lane 0 and y*y + z*z in lane 2;
    // movehl brings lane 2 down so add_ss yields x*x + (y*y + z*z) in lane 0.
    __m128 sum = _mm_add_ps(sq, _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(2, 1, 0, 3)));
    sum = _mm_add_ss(sum, _mm_movehl_ps(sum, sum));
    const float squared = _mm_cvtss_f32(sum);

    __m128 result;
    if (squared != 0.0f)
    {
        // Exact scale factor (full-precision sqrt and divide, no rsqrt estimate).
        const float factor = val / std::sqrt(squared);
        result = _mm_mul_ps(vec, _mm_set1_ps(factor));
    }
    else
    {
        // Zero-length fallback: (0, 0, val).
        result = _mm_set_ps(0.0f, val, 0.0f, 0.0f);
    }

    // Write lanes 0..2 back; lane 3 is discarded.
    p[0] = _mm_cvtss_f32(result);
    p[1] = _mm_cvtss_f32(_mm_shuffle_ps(result, result, _MM_SHUFFLE(1, 1, 1, 1)));
    p[2] = _mm_cvtss_f32(_mm_shuffle_ps(result, result, _MM_SHUFFLE(2, 2, 2, 2)));
}

/// Normalizes the 3-component vector at @p p, in place, to unit length.
///
/// @param p  Three consecutive floats (x, y, z); read, then overwritten.
///
/// A vector whose squared length compares equal to zero becomes (0, 0, 1).
static void m_point3F_normalize_sse2(float* p)
{
    // Unit-length normalization is the target-length routine with val == 1:
    // same factor (1.0f / sqrt) and same zero-length fallback.
    m_point3F_normalize_f_sse2(p, 1.0f);
}

static void matF_x_point4F_sse2(const float* m, const float* p, float* out) { __m128 point = _mm_loadu_ps(p); __m128 r0 = _mm_loadu_ps(m + 0); __m128 r1 = _mm_loadu_ps(m + 4); __m128 r2 = _mm_loadu_ps(m + 8); __m128 r3 = _mm_loadu_ps(m + 12); // Multiply rows by vector __m128 m0 = _mm_mul_ps(r0, point); __m128 m1 = _mm_mul_ps(r1, point); __m128 m2 = _mm_mul_ps(r2, point); __m128 m3 = _mm_mul_ps(r3, point); // Horizontal add auto dot4 = [](__m128 v) -> float { __m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1)); __m128 sums = _mm_add_ps(v, shuf); shuf = _mm_movehl_ps(shuf, sums); sums = _mm_add_ss(sums, shuf); return _mm_cvtss_f32(sums); }; out[0] = dot4(m0); out[1] = dot4(m1); out[2] = dot4(m2); out[3] = dot4(m3); } static void m_matF_x_matF_sse2(const float* A, const float* B, float* R) { __m128 b0 = _mm_loadu_ps(B + 0); __m128 b1 = _mm_loadu_ps(B + 4); __m128 b2 = _mm_loadu_ps(B + 8); __m128 b3 = _mm_loadu_ps(B + 12); for (int i = 0; i < 4; i++) { __m128 a = _mm_loadu_ps(A + i * 4); __m128 xxxx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); __m128 yyyy = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); __m128 zzzz = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); __m128 wwww = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)); __m128 row = _mm_add_ps( _mm_add_ps(_mm_mul_ps(xxxx, b0), _mm_mul_ps(yyyy, b1)), _mm_add_ps(_mm_mul_ps(zzzz, b2), _mm_mul_ps(wwww, b3)) ); _mm_storeu_ps(R + i * 4, row); } } void mInstallLibrary_SSE2() { m_point3F_normalize = m_point3F_normalize_sse2; m_point3F_normalize_f = m_point3F_normalize_f_sse2; m_matF_x_point4F = matF_x_point4F_sse2; m_matF_x_matF = m_matF_x_matF_sse2; m_matF_x_matF_aligned = m_matF_x_matF_sse2; }