// Torque3D/Engine/source/math/isa/sse2/sse2_intrinsics.h
// SSE2 SIMD intrinsic wrappers: vector/point load-store, swizzles,
// arithmetic, 4x4 row-major matrix ops, and batched (SoA) float3 ops.
#pragma once
#include <emmintrin.h> // SSE2
#include <xmmintrin.h> // SSE
namespace
{
typedef __m128 f32x4;
//------------------------------------------------------
// Vector/Point Load / Store
//------------------------------------------------------

// Read four consecutive floats (unaligned) into a SIMD register.
inline f32x4 v_load(const float* p)
{
   return _mm_loadu_ps(p);
}

// Write all four lanes to memory (unaligned).
inline void v_store(float* dst, f32x4 v)
{
   _mm_storeu_ps(dst, v);
}

// Broadcast a scalar into every lane.
inline f32x4 v_set1(float s)
{
   return _mm_set1_ps(s);
}

// All-zero register.
inline f32x4 v_zero()
{
   return _mm_setzero_ps();
}

// Return lane 0 as a scalar float.
inline float v_extract0(f32x4 v)
{
   return _mm_cvtss_f32(v);
}
//------------------------------------------------------
// Float3 helpers (safe loading into 4 lanes)
//------------------------------------------------------
// Load a 3-float direction vector; the w lane is cleared to 0.
inline f32x4 v_load3_vec(const float* p)
{
   return _mm_set_ps(0.0f, p[2], p[1], p[0]);
}

// Load a 3-float position; the w lane is set to 1.
inline f32x4 v_load3_pos(const float* p)
{
   return _mm_set_ps(1.0f, p[2], p[1], p[0]);
}

// Build a vector from four scalars in (x, y, z, w) lane order.
// (_mm_set_ps takes its arguments highest lane first.)
inline f32x4 v_set(float x, float y, float z, float w)
{
   return _mm_set_ps(w, z, y, x);
}

// Replace v's w lane with w's w lane; x/y/z come from v.
inline f32x4 v_insert_w(f32x4 v, f32x4 w)
{
   const f32x4 laneW = _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0));
   return _mm_or_ps(_mm_and_ps(laneW, w), _mm_andnot_ps(laneW, v));
}
inline void v_store3(float* dst, f32x4 v)
{
alignas(16) float tmp[4]; // temp storage
_mm_store_ps(tmp, v); // store all 4 lanes
dst[0] = tmp[0];
dst[1] = tmp[1];
dst[2] = tmp[2];
}
//------------------------------------------------------
// Mask helpers
//------------------------------------------------------
// Lane mask with x/y/z set and w clear.
inline f32x4 v_mask_xyz()
{
   return _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1));
}

// Arbitrary runtime swizzle: result = (v[x], v[y], v[z], v[w]).
// Goes through memory because shuffle immediates must be compile-time.
inline f32x4 v_swizzle_mask(f32x4 v, int x, int y, int z, int w)
{
   alignas(16) float lane[4];
   _mm_store_ps(lane, v);
   return _mm_set_ps(lane[w], lane[z], lane[y], lane[x]);
}

// Broadcast lane x of v to all four lanes (runtime index).
inline f32x4 v_swizzle_singular_mask(f32x4 v, int x)
{
   alignas(16) float lane[4];
   _mm_store_ps(lane, v);
   return _mm_set1_ps(lane[x]);
}

// (x, x, z, z)
inline f32x4 v_swizzle_lo(f32x4 v1)
{
   return _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 2, 0, 0));
}

// (y, y, w, w)
inline f32x4 v_swizzle_hi(f32x4 v1)
{
   return _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 3, 1, 1));
}

// Keep x/y/z from newv but take the w lane from original.
inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
{
   const f32x4 laneW = _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0));
   return _mm_or_ps(_mm_and_ps(laneW, original), _mm_andnot_ps(laneW, newv));
}
//------------------------------------------------------
// Shuffle helpers
//------------------------------------------------------
// Runtime two-source shuffle: (v1[x], v1[y], v2[z], v2[w]).
inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w)
{
   alignas(16) float lo[4], hi[4];
   _mm_store_ps(lo, v1);
   _mm_store_ps(hi, v2);
   return _mm_set_ps(hi[w], hi[z], lo[y], lo[x]);
}

// (v1.x, v1.y, v2.x, v2.y) -- low halves of both inputs.
inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2)
{
   return _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 1, 0));
}

// (v1.z, v1.w, v2.z, v2.w) -- high halves of both inputs.
inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2)
{
   return _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 2, 3, 2));
}
//------------------------------------------------------
// Simple Arithmatic
//------------------------------------------------------
// Element-wise multiply.
inline f32x4 v_mul(f32x4 a, f32x4 b)
{
   return _mm_mul_ps(a, b);
}

// Element-wise IEEE divide (exact, slower than v_div).
inline f32x4 v_div_exact(f32x4 a, f32x4 b)
{
   return _mm_div_ps(a, b);
}

// Element-wise add.
inline f32x4 v_add(f32x4 a, f32x4 b)
{
   return _mm_add_ps(a, b);
}

// Element-wise subtract.
inline f32x4 v_sub(f32x4 a, f32x4 b)
{
   return _mm_sub_ps(a, b);
}

// Absolute value: clear the sign bit of every lane.
inline f32x4 v_abs(f32x4 v)
{
   const __m128 signClear = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
   return _mm_and_ps(v, signClear);
}
//------------------------------------------------------
// Fast recip
//------------------------------------------------------
// Fast recip 1/b.
// Reciprocal estimate refined with two Newton-Raphson iterations:
// r' = 2r - b*r*r. Do not feed b = 0 (estimate is inf, refinement NaN).
inline f32x4 v_rcp_nr(f32x4 b)
{
   f32x4 r = _mm_rcp_ps(b);
   // first refinement (removed the unused 'two' constant from here)
   r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
   // second refinement
   return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
}
// Divide fast ( b = recip eg 1/b)
inline f32x4 v_div(f32x4 a, f32x4 b)
{
   return _mm_mul_ps(a, v_rcp_nr(b));
}
// Reciprocal square root with one Newton-Raphson step:
// r' = 0.5 * r * (3 - x*r*r).
inline f32x4 v_rsqrt_nr(f32x4 x)
{
   f32x4 half = _mm_set1_ps(0.5f);
   f32x4 three = _mm_set1_ps(3.0f);
   f32x4 r = _mm_rsqrt_ps(x);
   f32x4 xrr = _mm_mul_ps(_mm_mul_ps(x, r), r);
   return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, xrr));
}
//------------------------------------------------------
// Vector intrinsic functions
//------------------------------------------------------
// full dot4 -- dot(a, b) over all four lanes, result splatted to
// every lane of the return value.
inline f32x4 v_dot4(f32x4 a, f32x4 b)
{
   f32x4 prod = _mm_mul_ps(a, b); // element-wise products
   // BUGFIX: the previous shuffle sequence only produced the full dot
   // product in lanes 0-1; lanes 2-3 held partial sums, breaking any
   // lane-wise consumer of the result (e.g. scaling by the splatted
   // determinant). Use the same swap-pairs/swap-halves pattern as
   // v_dot3 and v_hadd4 so every lane is correct.
   f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1)); // (y, x, w, z)
   prod = _mm_add_ps(prod, shuf); // (x+y, x+y, z+w, z+w)
   shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2)); // (z+w, z+w, x+y, x+y)
   prod = _mm_add_ps(prod, shuf);
   return prod; // f32x4, all lanes = dot(a,b)
}
// dot3 (ignores w) -- x*x'+y*y'+z*z', splatted to all four lanes.
inline f32x4 v_dot3(f32x4 a, f32x4 b)
{
   // Zero the w product, then a 4-lane horizontal add splats the sum.
   f32x4 prod = _mm_and_ps(_mm_mul_ps(a, b), v_mask_xyz());
   f32x4 swap = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1)); // swap pairs
   f32x4 sums = _mm_add_ps(prod, swap);
   swap = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)); // swap halves
   return _mm_add_ps(sums, swap); // all lanes = dot3(a,b)
}
// cross product xyz only (the w lane of the result is always 0).
inline f32x4 v_cross(f32x4 a, f32x4 b)
{
   // Shuffle-once form: shuffle(a*rot(b) - rot(a)*b) with rot = (y,z,x).
   f32x4 aRot = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // (a.y, a.z, a.x, a.w)
   f32x4 bRot = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // (b.y, b.z, b.x, b.w)
   f32x4 diff = _mm_sub_ps(_mm_mul_ps(a, bRot), _mm_mul_ps(aRot, b));
   return _mm_shuffle_ps(diff, diff, _MM_SHUFFLE(3, 0, 2, 1));
}
// Normalize the xyz part of v. A zero-length input selects the
// fallback axis (0, 0, 1, 0) instead of producing NaNs.
inline f32x4 v_normalize3(f32x4 v)
{
   const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0}
   f32x4 lenSq = v_dot3(v, v);
   f32x4 scaled = _mm_mul_ps(v, v_rsqrt_nr(lenSq));
   f32x4 zeroLen = _mm_cmpeq_ps(lenSq, _mm_setzero_ps());
   // branchless select (vbsl equivalent)
   return _mm_or_ps(_mm_and_ps(zeroLen, fallback),
                    _mm_andnot_ps(zeroLen, scaled));
}
// adds all 4 lanes together; the sum is splatted into every lane.
inline f32x4 v_hadd4(f32x4 a)
{
   // Swap adjacent pairs and add, then swap halves and add again.
   f32x4 pairSwap = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
   f32x4 pairSum = _mm_add_ps(a, pairSwap);
   f32x4 halfSwap = _mm_shuffle_ps(pairSum, pairSum, _MM_SHUFFLE(1, 0, 3, 2));
   return _mm_add_ps(pairSum, halfSwap);
}
//------------------------------------------------------
// Matrix type (row-major 4x4)
//------------------------------------------------------
// Row-major 4x4 matrix held as four SIMD row registers.
struct f32x4x4
{
f32x4 r0; // row 0
f32x4 r1; // row 1
f32x4 r2; // row 2
f32x4 r3; // row 3
};
//------------------------------------------------------
// Matrix Load / Store
//------------------------------------------------------
// Load a row-major 4x4 matrix from 16 consecutive floats (unaligned).
inline f32x4x4 m_load(const float* m)
{
   return { v_load(m + 0), v_load(m + 4), v_load(m + 8), v_load(m + 12) };
}

// Store a row-major 4x4 matrix to 16 consecutive floats (unaligned).
inline void m_store(float* dst, const f32x4x4& m)
{
   v_store(dst + 0, m.r0);
   v_store(dst + 4, m.r1);
   v_store(dst + 8, m.r2);
   v_store(dst + 12, m.r3);
}

// 4x4 identity matrix.
inline f32x4x4 m_identity()
{
   return {
      v_set(1.0f, 0.0f, 0.0f, 0.0f),
      v_set(0.0f, 1.0f, 0.0f, 0.0f),
      v_set(0.0f, 0.0f, 1.0f, 0.0f),
      v_set(0.0f, 0.0f, 0.0f, 1.0f)
   };
}

// 4x4 all-zero matrix.
inline f32x4x4 m_zero()
{
   const f32x4 z = v_zero();
   return { z, z, z, z };
}
//------------------------------------------------------
// Matrix × Vector
//------------------------------------------------------
// Matrix * vector (full 4 components): one dot product per row.
inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v)
{
   f32x4 x = v_dot4(m.r0, v);
   f32x4 y = v_dot4(m.r1, v);
   f32x4 z = v_dot4(m.r2, v);
   f32x4 w = v_dot4(m.r3, v);
   // Pack lane 0 of each dot into (x, y, z, w) without scalar round-trips.
   f32x4 xy = _mm_unpacklo_ps(x, y); // (x, y, x, y)
   f32x4 zw = _mm_unpacklo_ps(z, w); // (z, w, z, w)
   return _mm_movelh_ps(xy, zw);     // (x, y, z, w)
}

// Matrix * vector using only the 3x3 block; result w lane is 0.
inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v)
{
   f32x4 x = v_dot3(m.r0, v);
   f32x4 y = v_dot3(m.r1, v);
   f32x4 z = v_dot3(m.r2, v);
   f32x4 xy = _mm_unpacklo_ps(x, y);                 // (x, y, x, y)
   f32x4 z0 = _mm_unpacklo_ps(z, _mm_setzero_ps()); // (z, 0, z, 0)
   return _mm_movelh_ps(xy, z0);                     // (x, y, z, 0)
}
//------------------------------------------------------
// Matrix × Matrix
//------------------------------------------------------
// Matrix * matrix (row-major): C = A * B.
inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b)
{
   // Transpose B once so each element of the product is one 4-wide dot.
   f32x4 c0 = b.r0;
   f32x4 c1 = b.r1;
   f32x4 c2 = b.r2;
   f32x4 c3 = b.r3;
   _MM_TRANSPOSE4_PS(c0, c1, c2, c3);
   auto rowTimesB = [&](f32x4 row)
   {
      f32x4 e0 = v_dot4(row, c0);
      f32x4 e1 = v_dot4(row, c1);
      f32x4 e2 = v_dot4(row, c2);
      f32x4 e3 = v_dot4(row, c3);
      // pack lane 0 of each dot into one output row
      return _mm_movelh_ps(_mm_unpacklo_ps(e0, e1), _mm_unpacklo_ps(e2, e3));
   };
   return { rowTimesB(a.r0), rowTimesB(a.r1), rowTimesB(a.r2), rowTimesB(a.r3) };
}
//------------------------------------------------------
// Transpose
//------------------------------------------------------
// Transpose a 4x4 matrix.
inline f32x4x4 m_transpose(const f32x4x4& m)
{
   f32x4 t0 = m.r0, t1 = m.r1, t2 = m.r2, t3 = m.r3;
   _MM_TRANSPOSE4_PS(t0, t1, t2, t3);
   return { t0, t1, t2, t3 };
}
// Write columns c0/c1/c2 as the ROWS of the upper-left 3x3 of dst --
// i.e. dst receives the transpose of the matrix whose columns are
// c0..c2. dst uses a 4-float row stride (indices 3, 7, 11 and row 3
// are never written) -- presumably a row-major 4x4; verify caller.
inline void m_set3x3_transpose(float* dst, f32x4 c0, f32x4 c1, f32x4 c2)
{
// row 0 = x components of each column
dst[0] = v_extract0(c0); // c0.x
dst[1] = v_extract0(c1); // c1.x
dst[2] = v_extract0(c2); // c2.x
// row 1 = y components (shuffle lane 1 down to lane 0, then extract)
dst[4] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 1))); // c0.y
dst[5] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 1))); // c1.y
dst[6] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 1))); // c2.y
// row 2 = z components
dst[8] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 2))); // c0.z
dst[9] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 2))); // c1.z
dst[10] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 2))); // c2.z
}
// Inverse via cross products of the rows scaled by 1/det, transposed.
// NOTE(review): this cross-product adjugate is only valid for special
// matrix forms -- v_cross zeroes lane 3, so the w column never
// participates. In particular, for a matrix whose bottom row is
// (0,0,0,1), cross(c, d) degenerates to zero and det becomes 0.
// Verify against a scalar reference inverse; for affine transforms
// prefer m_inverse_affine / m_inverse_rigid below.
// NOTE(review): invDet is applied lane-wise, which requires v_dot4 to
// splat the dot product into all four lanes -- confirm it does.
inline f32x4x4 m_inverse(const f32x4x4& m)
{
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;
// cross products of adjacent row pairs (w lanes are zeroed by v_cross)
f32x4 c0 = v_cross(b, c);
f32x4 c1 = v_cross(c, d);
f32x4 c2 = v_cross(d, a);
f32x4 c3 = v_cross(a, b);
// determinant estimate (see notes above)
f32x4 det = v_dot4(a, c1);
f32x4 invDet = v_rcp_nr(det);
f32x4x4 adj;
adj.r0 = _mm_mul_ps(c1, invDet);
adj.r1 = _mm_mul_ps(c2, invDet);
adj.r2 = _mm_mul_ps(c3, invDet);
adj.r3 = _mm_mul_ps(c0, invDet);
return m_transpose(adj);
}
// Inverse of a rigid transform (rotation + translation; row-major with
// the translation in the w lane of rows 0-2): the rotation block is
// simply transposed and the new translation is -(R' * T).
inline f32x4x4 m_inverse_rigid(const f32x4x4& m)
{
   f32x4x4 t = m_transpose(m);
   // Extract translation T = (m03, m13, m23, 0).
   f32x4 T = v_set(
      _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))),
      _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))),
      _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))),
      0.0f
   );
   // newT = -(R' * T); lanes x/y/z hold the new translation components.
   f32x4 newT = m_mul_vec4(t, T);
   newT = _mm_sub_ps(v_zero(), newT);
   // BUGFIX: splat the matching component of newT into each row's w
   // lane. The previous code passed newT directly, so v_preserve_w
   // copied newT's w lane (garbage: -|T|^2) into every row instead of
   // newT.x / newT.y / newT.z.
   t.r0 = v_preserve_w(t.r0, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(0, 0, 0, 0)));
   t.r1 = v_preserve_w(t.r1, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(1, 1, 1, 1)));
   t.r2 = v_preserve_w(t.r2, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(2, 2, 2, 2)));
   t.r3 = v_set(0, 0, 0, 1);
   return t;
}
// Inverse of an affine transform (row-major, translation in the w lane
// of rows 0-2): invert the upper 3x3 via the cofactor/cross-product
// trick, then set the new translation to -(inv(R) * T).
inline f32x4x4 m_inverse_affine(const f32x4x4& m)
{
   // Upper 3x3 rows with the translation lane cleared.
   f32x4 r0 = _mm_and_ps(m.r0, v_mask_xyz());
   f32x4 r1 = _mm_and_ps(m.r1, v_mask_xyz());
   f32x4 r2 = _mm_and_ps(m.r2, v_mask_xyz());
   // Cofactors via cross products.
   f32x4 c0 = v_cross(r1, r2);
   f32x4 c1 = v_cross(r2, r0);
   f32x4 c2 = v_cross(r0, r1);
   // Determinant of the 3x3 block and its reciprocal (v_dot3 splats).
   f32x4 det = v_dot3(r0, c0);
   f32x4 invDet = v_rcp_nr(det);
   // Inverse 3x3 = transpose of the scaled cofactor matrix.
   f32x4 i0 = _mm_mul_ps(c0, invDet);
   f32x4 i1 = _mm_mul_ps(c1, invDet);
   f32x4 i2 = _mm_mul_ps(c2, invDet);
   f32x4 t0 = i0;
   f32x4 t1 = i1;
   f32x4 t2 = i2;
   f32x4 t3 = _mm_setzero_ps();
   _MM_TRANSPOSE4_PS(t0, t1, t2, t3);
   // Original translation T = (m03, m13, m23, 0).
   alignas(16) float tmp[4];
   tmp[0] = _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3)));
   tmp[1] = _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3)));
   tmp[2] = _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3)));
   tmp[3] = 0.0f;
   f32x4 T = v_load(tmp);
   // New translation = -(inv(R) * T).
   f32x4 newT = m_mul_vec4({ t0, t1, t2, t3 }, T);
   newT = _mm_sub_ps(v_zero(), newT);
   // BUGFIX: splat the matching component of newT into each row's w
   // lane. The previous code passed newT unsplatted, so v_preserve_w
   // copied newT's w lane (always 0) into every row and the computed
   // translation was silently dropped.
   f32x4x4 out;
   out.r0 = v_preserve_w(t0, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(0, 0, 0, 0)));
   out.r1 = v_preserve_w(t1, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(1, 1, 1, 1)));
   out.r2 = v_preserve_w(t2, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(2, 2, 2, 2)));
   out.r3 = _mm_set_ps(1, 0, 0, 0);
   return out;
}
// Determinant, splatted into all lanes via v_hadd4.
// NOTE(review): v_cross zeroes lane 3, so the w column of the matrix
// never contributes. When the bottom row is (0,0,0,1), c0 and c1
// vanish and this reduces to r0 . (r1 x r2) -- the correct affine
// determinant. For a general 4x4 this formula is suspect; verify
// against a scalar cofactor expansion before relying on it.
inline f32x4 m_determinant(const f32x4x4& m)
{
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;
f32x4 c0 = v_cross(c, d);
f32x4 c1 = v_cross(d, b);
f32x4 c2 = v_cross(b, c);
f32x4 term0 = _mm_mul_ps(a, c0);
f32x4 term1 = _mm_mul_ps(a, c1);
f32x4 term2 = _mm_mul_ps(a, c2);
f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2));
return v_hadd4(det);
}
// Determinant of the upper 3x3 block: r0 . (r1 x r2), with the
// translation lanes masked off first. Result splatted to all lanes.
inline f32x4 m_determinant_affine(const f32x4x4& m)
{
   const f32x4 xyz = v_mask_xyz();
   f32x4 a = _mm_and_ps(m.r0, xyz);
   f32x4 b = _mm_and_ps(m.r1, xyz);
   f32x4 c = _mm_and_ps(m.r2, xyz);
   return v_dot3(a, v_cross(b, c));
}
//---------------------------------------------------------
// BATCH INTRINSICS
//---------------------------------------------------------
// Four float4s in SoA (structure-of-arrays) layout: lane i of each
// member belongs to vector i.
struct vec4_batch4
{
f32x4 x; // x components of vectors 0-3
f32x4 y; // y components of vectors 0-3
f32x4 z; // z components of vectors 0-3
f32x4 w; // w components of vectors 0-3
};
// Gather four packed float3s (12 consecutive floats) into SoA lanes.
// If fillW is true the w lanes are filled with 'w'; otherwise they are
// zeroed.
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
   vec4_batch4 r;
   r.x = _mm_set_ps(ptr[9], ptr[6], ptr[3], ptr[0]);
   r.y = _mm_set_ps(ptr[10], ptr[7], ptr[4], ptr[1]);
   r.z = _mm_set_ps(ptr[11], ptr[8], ptr[5], ptr[2]);
   // BUGFIX: always initialize w. Previously r.w was left uninitialized
   // when fillW was false, so any later read of it was undefined.
   r.w = fillW ? _mm_set1_ps(w) : _mm_setzero_ps();
   return r;
}
// Store the result back out to a float array 4 at a time
inline void store_f32x4(float* out, f32x4 v)
{
_mm_storeu_ps(out, v);
}
// Scatter SoA lanes back to four packed float3s (12 floats at 'out';
// w lanes are discarded).
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
{
   // Scratch arrays were oversized ([8]); only 4 lanes are ever stored.
   alignas(16) float xs[4];
   alignas(16) float ys[4];
   alignas(16) float zs[4];
   _mm_store_ps(xs, v.x);
   _mm_store_ps(ys, v.y);
   _mm_store_ps(zs, v.z);
   for (int i = 0; i < 4; ++i)
   {
      out[i * 3 + 0] = xs[i];
      out[i * 3 + 1] = ys[i];
      out[i * 3 + 2] = zs[i];
   }
}
// Four 3-component dot products at once (SoA form); w is ignored.
inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b)
{
   f32x4 acc = _mm_mul_ps(a.x, b.x);
   acc = _mm_add_ps(acc, _mm_mul_ps(a.y, b.y));
   acc = _mm_add_ps(acc, _mm_mul_ps(a.z, b.z));
   return acc;
}
// Batch 4 mul_Vec4: transform four positions (w treated as 1) by the
// upper 3 rows of a row-major matrix; m points at rows of 4 floats.
// NOTE: the w member of the result is not written.
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
{
   // Broadcast each matrix element across all four position lanes.
   const f32x4 r00 = _mm_set1_ps(m[0]);
   const f32x4 r01 = _mm_set1_ps(m[1]);
   const f32x4 r02 = _mm_set1_ps(m[2]);
   const f32x4 r03 = _mm_set1_ps(m[3]);
   const f32x4 r10 = _mm_set1_ps(m[4]);
   const f32x4 r11 = _mm_set1_ps(m[5]);
   const f32x4 r12 = _mm_set1_ps(m[6]);
   const f32x4 r13 = _mm_set1_ps(m[7]);
   const f32x4 r20 = _mm_set1_ps(m[8]);
   const f32x4 r21 = _mm_set1_ps(m[9]);
   const f32x4 r22 = _mm_set1_ps(m[10]);
   const f32x4 r23 = _mm_set1_ps(m[11]);
   vec4_batch4 out;
   // out.x = row0 . (x, y, z, 1)
   out.x = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v.x, r00), _mm_mul_ps(v.y, r01)),
                      _mm_add_ps(_mm_mul_ps(v.z, r02), r03));
   // out.y = row1 . (x, y, z, 1)
   out.y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v.x, r10), _mm_mul_ps(v.y, r11)),
                      _mm_add_ps(_mm_mul_ps(v.z, r12), r13));
   // out.z = row2 . (x, y, z, 1)
   out.z = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v.x, r20), _mm_mul_ps(v.y, r21)),
                      _mm_add_ps(_mm_mul_ps(v.z, r22), r23));
   return out;
}
}