From ac6ec05690e52c85c9858a134d02050b8054559b Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Thu, 5 Mar 2026 14:16:11 +0000 Subject: [PATCH] fix drift turns out we require the dot product to be in all rows --- Engine/source/math/isa/avx/avx_intrinsics.h | 157 +++++++++++++++++- Engine/source/math/isa/avx2/avx2_intrinsics.h | 157 +++++++++++++++++- Engine/source/math/isa/sse2/sse2_intrinsics.h | 96 +++++++++-- .../source/math/isa/sse41/sse41_intrinsics.h | 81 ++++++++- 4 files changed, 456 insertions(+), 35 deletions(-) diff --git a/Engine/source/math/isa/avx/avx_intrinsics.h b/Engine/source/math/isa/avx/avx_intrinsics.h index 2dcdf513b..28a5fa4fa 100644 --- a/Engine/source/math/isa/avx/avx_intrinsics.h +++ b/Engine/source/math/isa/avx/avx_intrinsics.h @@ -136,7 +136,8 @@ namespace { f32x4 r = _mm_rcp_ps(b); f32x4 two = _mm_set1_ps(2.0f); - return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); } // Divide fast ( b = recip eg 1/b) @@ -144,14 +145,12 @@ namespace inline f32x4 v_rsqrt_nr(f32x4 x) { - f32x4 r = _mm_rsqrt_ps(x); - f32x4 half = _mm_set1_ps(0.5f); f32x4 three = _mm_set1_ps(3.0f); - r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); - - return _mm_mul_ps(r, half); + f32x4 r = _mm_rsqrt_ps(x); + f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r); + return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr)); } //------------------------------------------------------ @@ -161,13 +160,13 @@ namespace // full dot4 inline f32x4 v_dot4(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1 + return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into all lanes } // dot3 (ignores w) inline f32x4 v_dot3(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1 + return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into all lanes } // cross product xyz only. 
@@ -392,4 +391,146 @@ namespace f32x4 c0 = v_cross(r1, r2); return v_dot3(r0, c0); // splatted determinant } + + //--------------------------------------------------------- + // BATCH INTRINSICS + //--------------------------------------------------------- + typedef __m256 f32x8; + + struct vec4_batch8 + { + f32x8 x; + f32x8 y; + f32x8 z; + f32x8 w; + }; + + struct vec3_batch8 + { + f32x8 x; + f32x8 y; + f32x8 z; + }; + + struct vec4_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + f32x4 w; + }; + + struct vec3_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + }; + + inline vec3_batch8 load_vec3_batch8(const float* ptr) + { + // ptr points to x0 + // layout: x y z x y z ... + + vec3_batch8 r; + + r.x = _mm256_set_ps( + ptr[21], ptr[18], ptr[15], ptr[12], + ptr[9], ptr[6], ptr[3], ptr[0]); + + r.y = _mm256_set_ps( + ptr[22], ptr[19], ptr[16], ptr[13], + ptr[10], ptr[7], ptr[4], ptr[1]); + + r.z = _mm256_set_ps( + ptr[23], ptr[20], ptr[17], ptr[14], + ptr[11], ptr[8], ptr[5], ptr[2]); + + return r; + } + + inline vec3_batch4 load_vec3_batch4(const float* ptr) + { + vec3_batch4 r; + + r.x = _mm_set_ps( + ptr[9], ptr[6], ptr[3], ptr[0]); + + r.y = _mm_set_ps( + ptr[10], ptr[7], ptr[4], ptr[1]); + + r.z = _mm_set_ps( + ptr[11], ptr[8], ptr[5], ptr[2]); + + return r; + } + + // Store the result back out to a float array 8 at a time + inline void store_f32x8(float* out, f32x8 v) + { + _mm256_storeu_ps(out, v); + } + + // Store the result back out to a float array 4 at a time + inline void store_f32x4(float* out, f32x4 v) + { + _mm_storeu_ps(out, v); + } + + // Store the result back to a float3 array with size of 8 + inline void store_vec3_batch8(float* out, const vec3_batch8& v) + { + alignas(32) float xs[8]; + alignas(32) float ys[8]; + alignas(32) float zs[8]; + + _mm256_store_ps(xs, v.x); + _mm256_store_ps(ys, v.y); + _mm256_store_ps(zs, v.z); + + for (int i = 0; i < 8; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + // Store 
the result back to a float3 array with size of 4 + inline void store_vec3_batch4(float* out, const vec3_batch4& v) + { + alignas(16) float xs[8]; + alignas(16) float ys[8]; + alignas(16) float zs[8]; + + _mm_store_ps(xs, v.x); + _mm_store_ps(ys, v.y); + _mm_store_ps(zs, v.z); + + for (int i = 0; i < 4; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b) + { + f32x4 mulx = _mm_mul_ps(a.x, b.x); + f32x4 muly = _mm_mul_ps(a.y, b.y); + f32x4 mulz = _mm_mul_ps(a.z, b.z); + + return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); + } + + inline f32x8 vec3_batch8_dot(const vec3_batch8& a, const vec3_batch8& b) + { + f32x8 mulx = _mm256_mul_ps(a.x, b.x); + f32x8 muly = _mm256_mul_ps(a.y, b.y); + f32x8 mulz = _mm256_mul_ps(a.z, b.z); + + return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz); + } + } diff --git a/Engine/source/math/isa/avx2/avx2_intrinsics.h b/Engine/source/math/isa/avx2/avx2_intrinsics.h index e276e4c6e..bb367f198 100644 --- a/Engine/source/math/isa/avx2/avx2_intrinsics.h +++ b/Engine/source/math/isa/avx2/avx2_intrinsics.h @@ -136,7 +136,8 @@ namespace { f32x4 r = _mm_rcp_ps(b); f32x4 two = _mm_set1_ps(2.0f); - return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); } // Divide fast ( b = recip eg 1/b) @@ -144,14 +145,12 @@ namespace inline f32x4 v_rsqrt_nr(f32x4 x) { - f32x4 r = _mm_rsqrt_ps(x); - f32x4 half = _mm_set1_ps(0.5f); f32x4 three = _mm_set1_ps(3.0f); - r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); - - return _mm_mul_ps(r, half); + f32x4 r = _mm_rsqrt_ps(x); + f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r); + return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr)); } //------------------------------------------------------ @@ -161,13 +160,13 @@ namespace // full dot4 
inline f32x4 v_dot4(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0xF1); // f32x4, 4 lanes into lane 1 + return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into all lanes } // dot3 (ignores w) inline f32x4 v_dot3(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0x71); // f32x4, 3 last lanes into lane 1 + return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into all lanes } // cross product xyz only. @@ -392,4 +391,146 @@ namespace f32x4 c0 = v_cross(r1, r2); return v_dot3(r0, c0); // splatted determinant } + + //--------------------------------------------------------- + // BATCH INTRINSICS + //--------------------------------------------------------- + typedef __m256 f32x8; + + struct vec4_batch8 + { + f32x8 x; + f32x8 y; + f32x8 z; + f32x8 w; + }; + + struct vec3_batch8 + { + f32x8 x; + f32x8 y; + f32x8 z; + }; + + struct vec4_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + f32x4 w; + }; + + struct vec3_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + }; + + inline vec3_batch8 load_vec3_batch8(const float* ptr) + { + // ptr points to x0 + // layout: x y z x y z ... 
+ + vec3_batch8 r; + + r.x = _mm256_set_ps( + ptr[21], ptr[18], ptr[15], ptr[12], + ptr[9], ptr[6], ptr[3], ptr[0]); + + r.y = _mm256_set_ps( + ptr[22], ptr[19], ptr[16], ptr[13], + ptr[10], ptr[7], ptr[4], ptr[1]); + + r.z = _mm256_set_ps( + ptr[23], ptr[20], ptr[17], ptr[14], + ptr[11], ptr[8], ptr[5], ptr[2]); + + return r; + } + + inline vec3_batch4 load_vec3_batch4(const float* ptr) + { + vec3_batch4 r; + + r.x = _mm_set_ps( + ptr[9], ptr[6], ptr[3], ptr[0]); + + r.y = _mm_set_ps( + ptr[10], ptr[7], ptr[4], ptr[1]); + + r.z = _mm_set_ps( + ptr[11], ptr[8], ptr[5], ptr[2]); + + return r; + } + + // Store the result back out to a float array 8 at a time + inline void store_f32x8(float* out, f32x8 v) + { + _mm256_storeu_ps(out, v); + } + + // Store the result back out to a float array 4 at a time + inline void store_f32x4(float* out, f32x4 v) + { + _mm_storeu_ps(out, v); + } + + // Store the result back to a float3 array with size of 8 + inline void store_vec3_batch8(float* out, const vec3_batch8& v) + { + alignas(32) float xs[8]; + alignas(32) float ys[8]; + alignas(32) float zs[8]; + + _mm256_store_ps(xs, v.x); + _mm256_store_ps(ys, v.y); + _mm256_store_ps(zs, v.z); + + for (int i = 0; i < 8; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + // Store the result back to a float3 array with size of 4 + inline void store_vec3_batch4(float* out, const vec3_batch4& v) + { + alignas(16) float xs[8]; + alignas(16) float ys[8]; + alignas(16) float zs[8]; + + _mm_store_ps(xs, v.x); + _mm_store_ps(ys, v.y); + _mm_store_ps(zs, v.z); + + for (int i = 0; i < 4; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b) + { + f32x4 mulx = _mm_mul_ps(a.x, b.x); + f32x4 muly = _mm_mul_ps(a.y, b.y); + f32x4 mulz = _mm_mul_ps(a.z, b.z); + + return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); + } + + inline f32x8 
vec3_batch8_dot(const vec3_batch8& a, const vec3_batch8& b) + { + f32x8 mulx = _mm256_mul_ps(a.x, b.x); + f32x8 muly = _mm256_mul_ps(a.y, b.y); + f32x8 mulz = _mm256_mul_ps(a.z, b.z); + + return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz); + } } diff --git a/Engine/source/math/isa/sse2/sse2_intrinsics.h b/Engine/source/math/isa/sse2/sse2_intrinsics.h index 63243d2d3..0150a78a5 100644 --- a/Engine/source/math/isa/sse2/sse2_intrinsics.h +++ b/Engine/source/math/isa/sse2/sse2_intrinsics.h @@ -149,7 +149,7 @@ namespace return _mm_and_ps(v, mask); } - //------------------------------------------------------ + //------------------------------------------------------ // Fast recip //------------------------------------------------------ @@ -158,7 +158,8 @@ namespace { f32x4 r = _mm_rcp_ps(b); f32x4 two = _mm_set1_ps(2.0f); - return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); } // Divide fast ( b = recip eg 1/b) @@ -166,14 +167,12 @@ namespace inline f32x4 v_rsqrt_nr(f32x4 x) { - f32x4 r = _mm_rsqrt_ps(x); - f32x4 half = _mm_set1_ps(0.5f); f32x4 three = _mm_set1_ps(3.0f); - r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); - - return _mm_mul_ps(r, half); + f32x4 r = _mm_rsqrt_ps(x); + f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r); + return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr)); } //------------------------------------------------------ @@ -184,9 +183,9 @@ namespace inline f32x4 v_dot4(f32x4 a, f32x4 b) { f32x4 prod = _mm_mul_ps(a, b); // multiply element-wise - f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1)); + f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2)); prod = _mm_add_ps(prod, shuf); - shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2)); + shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1)); prod = _mm_add_ps(prod, shuf); return prod; 
// f32x4, all lanes = dot(a,b) } @@ -305,10 +304,11 @@ namespace f32x4 z = v_dot4(m.r2, v); f32x4 w = v_dot4(m.r3, v); - // Pack lowest lane of each into a vector - f32x4 xy = _mm_unpacklo_ps(x, y); - f32x4 zw = _mm_unpacklo_ps(z, w); - return _mm_movelh_ps(xy, zw); + // combine to a vector + return _mm_set_ps(_mm_cvtss_f32(w), + _mm_cvtss_f32(z), + _mm_cvtss_f32(y), + _mm_cvtss_f32(x)); } inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v) @@ -531,4 +531,74 @@ namespace f32x4 c0 = v_cross(r1, r2); return v_dot3(r0, c0); // splatted determinant } + + //--------------------------------------------------------- + // BATCH INTRINSICS + //--------------------------------------------------------- + + struct vec4_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + f32x4 w; + }; + + struct vec3_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + }; + + + inline vec3_batch4 load_vec3_batch4(const float* ptr) + { + vec3_batch4 r; + + r.x = _mm_set_ps( + ptr[9], ptr[6], ptr[3], ptr[0]); + + r.y = _mm_set_ps( + ptr[10], ptr[7], ptr[4], ptr[1]); + + r.z = _mm_set_ps( + ptr[11], ptr[8], ptr[5], ptr[2]); + + return r; + } + + // Store the result back out to a float array 4 at a time + inline void store_f32x4(float* out, f32x4 v) + { + _mm_storeu_ps(out, v); + } + + // Store the result back to a float3 array with size of 4 + inline void store_vec3_batch4(float* out, const vec3_batch4& v) + { + alignas(16) float xs[8]; + alignas(16) float ys[8]; + alignas(16) float zs[8]; + + _mm_store_ps(xs, v.x); + _mm_store_ps(ys, v.y); + _mm_store_ps(zs, v.z); + + for (int i = 0; i < 4; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b) + { + f32x4 mulx = _mm_mul_ps(a.x, b.x); + f32x4 muly = _mm_mul_ps(a.y, b.y); + f32x4 mulz = _mm_mul_ps(a.z, b.z); + + return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); + } } diff --git a/Engine/source/math/isa/sse41/sse41_intrinsics.h 
b/Engine/source/math/isa/sse41/sse41_intrinsics.h index 58d66ca6c..4134480aa 100644 --- a/Engine/source/math/isa/sse41/sse41_intrinsics.h +++ b/Engine/source/math/isa/sse41/sse41_intrinsics.h @@ -148,7 +148,8 @@ namespace { f32x4 r = _mm_rcp_ps(b); f32x4 two = _mm_set1_ps(2.0f); - return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(b, r))); + r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r))); } // Divide fast ( b = recip eg 1/b) @@ -156,14 +157,12 @@ namespace inline f32x4 v_rsqrt_nr(f32x4 x) { - f32x4 r = _mm_rsqrt_ps(x); - f32x4 half = _mm_set1_ps(0.5f); f32x4 three = _mm_set1_ps(3.0f); - r = _mm_mul_ps(r, _mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(x, r), r))); - - return _mm_mul_ps(r, half); + f32x4 r = _mm_rsqrt_ps(x); + f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r); + return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr)); } //------------------------------------------------------ @@ -502,4 +501,74 @@ namespace f32x4 c0 = v_cross(r1, r2); return v_dot3(r0, c0); // splatted determinant } + + //--------------------------------------------------------- + // BATCH INTRINSICS + //--------------------------------------------------------- + + struct vec4_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + f32x4 w; + }; + + struct vec3_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + }; + + + inline vec3_batch4 load_vec3_batch4(const float* ptr) + { + vec3_batch4 r; + + r.x = _mm_set_ps( + ptr[9], ptr[6], ptr[3], ptr[0]); + + r.y = _mm_set_ps( + ptr[10], ptr[7], ptr[4], ptr[1]); + + r.z = _mm_set_ps( + ptr[11], ptr[8], ptr[5], ptr[2]); + + return r; + } + + // Store the result back out to a float array 4 at a time + inline void store_f32x4(float* out, f32x4 v) + { + _mm_storeu_ps(out, v); + } + + // Store the result back to a float3 array with size of 4 + inline void store_vec3_batch4(float* out, const vec3_batch4& v) + { + alignas(16) float xs[8]; + alignas(16) float ys[8]; + alignas(16) 
float zs[8]; + + _mm_store_ps(xs, v.x); + _mm_store_ps(ys, v.y); + _mm_store_ps(zs, v.z); + + for (int i = 0; i < 4; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b) + { + f32x4 mulx = _mm_mul_ps(a.x, b.x); + f32x4 muly = _mm_mul_ps(a.y, b.y); + f32x4 mulz = _mm_mul_ps(a.z, b.z); + + return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); + } }