#include "platform/platform.h"
#include "math/mMath.h"
#include "math/util/frustum.h"
// NOTE(review): the header name on the next line was missing in the reviewed
// text; <math.h> is assumed - confirm against the repository copy.
#include <math.h>    // Caution!!! Possible platform specific include
#include <stddef.h>  // size_t
#include "math/mMathFn.h"

//################################################################
// AVX 2 Functions
//################################################################
#include <immintrin.h>

// Collects eight F32 values spaced `stride` elements apart, starting at
// `base`, into one 256-bit register. Lane k holds base[k * stride].
static inline __m256 loadStrided8_avx2(const F32* base, const size_t stride)
{
   // _mm256_set_ps lists its arguments from the highest lane down to lane 0.
   return _mm256_set_ps(base[7 * stride], base[6 * stride],
                        base[5 * stride], base[4 * stride],
                        base[3 * stride], base[2 * stride],
                        base[1 * stride], base[0]);
}

// Computes output[i] = dot(refVector, point i) for numPoints points.
//
//   refVector   - three floats (x, y, z) dotted against every point.
//   dotPoints   - base of the point data; point i starts at
//                 dotPoints[i * pointStride] and supplies three floats.
//   numPoints   - number of points to process.
//   pointStride - distance between consecutive points, measured in F32
//                 elements.
//                 NOTE(review): this is an element count, not a byte count -
//                 confirm it matches the other m_point3F_bulk_dot
//                 implementations that can be installed.
//   output      - receives numPoints results; no alignment is required
//                 because the vector path stores with an unaligned store.
void m_point3F_bulk_dot_avx2(const F32* refVector,
                             const F32* dotPoints,
                             const U32 numPoints,
                             const U32 pointStride,
                             F32* output)
{
   // Index arithmetic is done in size_t so that (point index * stride) does
   // not wrap at 32 bits when size_t is wider than U32.
   const size_t count  = numPoints;
   const size_t stride = pointStride;

   const __m256 refX = _mm256_set1_ps(refVector[0]);
   const __m256 refY = _mm256_set1_ps(refVector[1]);
   const __m256 refZ = _mm256_set1_ps(refVector[2]);

   size_t i = 0;

   // Main loop: eight points per iteration (one float per 256-bit lane).
   // `count - i >= 8` cannot wrap because i never exceeds count.
   for (; count - i >= 8; i += 8)
   {
      const F32* block = dotPoints + i * stride;

      const __m256 x = loadStrided8_avx2(block + 0, stride);
      const __m256 y = loadStrided8_avx2(block + 1, stride);
      const __m256 z = loadStrided8_avx2(block + 2, stride);

      // x*rx, then fused multiply-adds for the y and z terms.
      __m256 dot = _mm256_mul_ps(x, refX);
      dot = _mm256_fmadd_ps(y, refY, dot);
      dot = _mm256_fmadd_ps(z, refZ, dot);

      _mm256_storeu_ps(output + i, dot);
   }

   // Scalar tail for the last (numPoints % 8) points.
   for (; i < count; i++)
   {
      const F32* pPoint = dotPoints + i * stride;
      output[i] = refVector[0] * pPoint[0] + refVector[1] * pPoint[1] + refVector[2] * pPoint[2];
   }
}

// 4x4 matrix product C = A * B on 16-float arrays indexed as [row * 4 + col]:
//   C[i*4 + j] = A[i*4+0]*B[0*4+j] + A[i*4+1]*B[1*4+j]
//              + A[i*4+2]*B[2*4+j] + A[i*4+3]*B[3*4+j]
//
// Uses 128-bit unaligned loads and stores, so none of the pointers need any
// particular alignment. All rows of B are read before anything is written
// and each row of A is read before the matching row of C is stored, so C may
// be the same matrix as A or B.
void default_matF_x_matF_AVX2(const F32* A, const F32* B, F32* C)
{
   // The rows of B do not change between iterations; load them once.
   const __m128 b0 = _mm_loadu_ps(B + 0 * 4);
   const __m128 b1 = _mm_loadu_ps(B + 1 * 4);
   const __m128 b2 = _mm_loadu_ps(B + 2 * 4);
   const __m128 b3 = _mm_loadu_ps(B + 3 * 4);

   for (int row = 0; row < 4; row++)
   {
      const F32* a = A + row * 4;

      // Each element of A's row is broadcast across a register and scales
      // the matching row of B; the four scaled rows sum to the output row.
      __m128 res = _mm_mul_ps(_mm_set1_ps(a[0]), b0);
      res = _mm_add_ps(res, _mm_mul_ps(_mm_set1_ps(a[1]), b1));
      res = _mm_add_ps(res, _mm_mul_ps(_mm_set1_ps(a[2]), b2));
      res = _mm_add_ps(res, _mm_mul_ps(_mm_set1_ps(a[3]), b3));

      _mm_storeu_ps(C + row * 4, res);
   }
}

// Points the math library's dispatch pointers at the routines in this file.
// The same matrix routine serves both the plain and the "aligned" entry
// points; it only issues unaligned loads and stores.
void mInstallLibrary_AVX2()
{
   m_point3F_bulk_dot    = m_point3F_bulk_dot_avx2;
   m_matF_x_matF         = default_matF_x_matF_AVX2;
   m_matF_x_matF_aligned = default_matF_x_matF_AVX2;
}