mirror of
https://github.com/TorqueGameEngines/Torque3D.git
synced 2026-03-24 06:39:26 +00:00
655 lines
17 KiB
C
#pragma once
|
||
#include <emmintrin.h> // SSE2
|
||
#include <xmmintrin.h> // SSE
|
||
|
||
namespace
|
||
{
|
||
typedef __m128 f32x4;
|
||
|
||
//------------------------------------------------------
|
||
// Vector/Point Load / Store
|
||
//------------------------------------------------------
|
||
|
||
// Load 4 contiguous floats (unaligned) into a SIMD register.
inline f32x4 v_load(const float* p) { f32x4 r = _mm_loadu_ps(p); return r; }

// Store all 4 lanes to memory (unaligned).
inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }

// Broadcast a scalar into all 4 lanes.
inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }

// All-zero vector.
inline f32x4 v_zero() { return _mm_setzero_ps(); }

// Read lane 0 back as a scalar float.
inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }
|
||
|
||
//------------------------------------------------------
|
||
// Float3 helpers (safe loading into 4 lanes)
|
||
//------------------------------------------------------
|
||
|
||
// Load 3 floats as a direction vector; lane w is forced to 0.
inline f32x4 v_load3_vec(const float* p)
{
    return _mm_setr_ps(p[0], p[1], p[2], 0.0f);
}

// Load 3 floats as a position; lane w is forced to 1.
inline f32x4 v_load3_pos(const float* p)
{
    return _mm_setr_ps(p[0], p[1], p[2], 1.0f);
}
|
||
|
||
// Build a vector from 4 scalars in (x, y, z, w) lane order.
inline f32x4 v_set(float x, float y, float z, float w)
{
    return _mm_setr_ps(x, y, z, w);
}
|
||
|
||
// Result = { v.x, v.y, v.z, w.w } — take lane w from the second argument.
inline f32x4 v_insert_w(f32x4 v, f32x4 w)
{
    // Intermediate {v.z, v.z, w.w, w.w}, then pick v.x, v.y, v.z, w.w.
    f32x4 zw = _mm_shuffle_ps(v, w, _MM_SHUFFLE(3, 3, 2, 2));
    return _mm_shuffle_ps(v, zw, _MM_SHUFFLE(2, 0, 1, 0));
}
|
||
|
||
// Store lanes x, y, z to dst; dst[3] is never written.
inline void v_store3(float* dst, f32x4 v)
{
    _mm_store_ss(dst + 0, v);
    _mm_store_ss(dst + 1, _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)));
    _mm_store_ss(dst + 2, _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)));
}
|
||
|
||
//------------------------------------------------------
|
||
// Mask helpers
|
||
//------------------------------------------------------
|
||
|
||
// Bit mask with lanes x,y,z all ones and lane w zero (for masking off w).
inline f32x4 v_mask_xyz() { return _mm_castsi128_ps(_mm_srli_si128(_mm_set1_epi32(-1), 4)); }
|
||
|
||
// Runtime-index swizzle: result = { v[x], v[y], v[z], v[w] }.
// Indices must be 0..3. Goes through memory, so it is slow; prefer
// _mm_shuffle_ps when the indices are compile-time constants.
inline f32x4 v_swizzle_mask(f32x4 v, int x, int y, int z, int w)
{
    alignas(16) float lane[4];
    _mm_store_ps(lane, v);

    alignas(16) float out[4] = { lane[x], lane[y], lane[z], lane[w] };
    return _mm_load_ps(out);
}
|
||
|
||
// Broadcast lane x (runtime index, 0..3) into all 4 lanes.
inline f32x4 v_swizzle_singular_mask(f32x4 v, int x)
{
    alignas(16) float lane[4];
    _mm_store_ps(lane, v);

    return _mm_set1_ps(lane[x]);
}
|
||
|
||
// Duplicate the even lanes: {x, y, z, w} -> {x, x, z, z}.
inline f32x4 v_swizzle_lo(f32x4 v)
{
    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 0, 0));
}

// Duplicate the odd lanes: {x, y, z, w} -> {y, y, w, w}.
inline f32x4 v_swizzle_hi(f32x4 v)
{
    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 1, 1));
}
|
||
|
||
// Take x, y, z from newv and keep lane w from original.
inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
{
    // Intermediate {newv.z, newv.z, original.w, original.w},
    // then pick newv.x, newv.y, newv.z, original.w.
    f32x4 zw = _mm_shuffle_ps(newv, original, _MM_SHUFFLE(3, 3, 2, 2));
    return _mm_shuffle_ps(newv, zw, _MM_SHUFFLE(2, 0, 1, 0));
}
|
||
|
||
//------------------------------------------------------
|
||
// Shuffle helpers
|
||
//------------------------------------------------------
|
||
|
||
// Runtime-index shuffle across two vectors:
// result = { v1[x], v1[y], v2[z], v2[w] }. Indices 0..3; slow memory path.
inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w)
{
    alignas(16) float lo[4], hi[4];
    _mm_store_ps(lo, v1);
    _mm_store_ps(hi, v2);

    alignas(16) float out[4] = { lo[x], lo[y], hi[z], hi[w] };
    return _mm_load_ps(out);
}
|
||
|
||
// Low halves of both inputs: {a.x, a.y, b.x, b.y}.
inline f32x4 v_shuffle_hilo(f32x4 a, f32x4 b)
{
    return _mm_movelh_ps(a, b);
}

// High halves of both inputs: {a.z, a.w, b.z, b.w}.
inline f32x4 v_shuffle_lohi(f32x4 a, f32x4 b)
{
    return _mm_movehl_ps(b, a);
}
|
||
|
||
|
||
//------------------------------------------------------
|
||
// Simple Arithmetic
|
||
//------------------------------------------------------
|
||
|
||
// Lane-wise a * b.
inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }

// Lane-wise a / b (full-precision IEEE divide; see v_div for the fast path).
inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }

// Lane-wise a + b.
inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }

// Lane-wise a - b.
inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
|
||
|
||
// Lane-wise absolute value: strip the sign bit of every lane.
inline f32x4 v_abs(f32x4 v)
{
    // -0.0f has only the sign bit set; andnot clears exactly that bit.
    return _mm_andnot_ps(_mm_set1_ps(-0.0f), v);
}
|
||
|
||
//------------------------------------------------------
|
||
// Fast recip
|
||
//------------------------------------------------------
|
||
|
||
// Fast reciprocal 1/b: hardware estimate (~12-bit) refined with two
// Newton-Raphson steps (r' = 2r - b*r*r), giving near-full float precision.
// (Removed an unused local `two` left over from an earlier formulation.)
inline f32x4 v_rcp_nr(f32x4 b)
{
    f32x4 r = _mm_rcp_ps(b);
    // First refinement.
    r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
    // Second refinement.
    return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
}
|
||
|
||
// Divide fast ( b = recip eg 1/b)
|
||
inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); }
|
||
|
||
// Fast 1/sqrt(x): hardware estimate refined by one Newton-Raphson step,
// r' = 0.5 * r * (3 - x*r*r).
inline f32x4 v_rsqrt_nr(f32x4 x)
{
    const f32x4 kHalf  = _mm_set1_ps(0.5f);
    const f32x4 kThree = _mm_set1_ps(3.0f);

    f32x4 est = _mm_rsqrt_ps(x);
    f32x4 xrr = _mm_mul_ps(_mm_mul_ps(x, est), est);
    return _mm_mul_ps(_mm_mul_ps(kHalf, est), _mm_sub_ps(kThree, xrr));
}
|
||
|
||
//------------------------------------------------------
|
||
// Vector intrinsic functions
|
||
//------------------------------------------------------
|
||
|
||
// 4-lane dot product with the result broadcast to ALL four lanes.
// Bug fix: the previous fold order only produced the dot in lanes 0 and 1
// and left garbage in lanes 2/3, while callers (e.g. m_inverse via
// v_rcp_nr(det)) consume every lane of the result.
inline f32x4 v_dot4(f32x4 a, f32x4 b)
{
    f32x4 p = _mm_mul_ps(a, b);
    // {x,y,z,w} + {y,x,w,z} -> {x+y, x+y, z+w, z+w}
    f32x4 t = _mm_add_ps(p, _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 3, 0, 1)));
    // + {z+w, z+w, x+y, x+y} -> full sum in every lane
    return _mm_add_ps(t, _mm_shuffle_ps(t, t, _MM_SHUFFLE(1, 0, 3, 2)));
}
|
||
|
||
// Dot product of the x,y,z lanes (w ignored); result splat to all lanes.
inline f32x4 v_dot3(f32x4 a, f32x4 b)
{
    // Mask off the w term so it cannot contribute to the sum.
    const f32x4 xyzMask = _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1));
    f32x4 p = _mm_and_ps(_mm_mul_ps(a, b), xyzMask);

    // Two swap-and-add folds broadcast the sum to every lane.
    f32x4 t = _mm_add_ps(p, _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_add_ps(t, _mm_shuffle_ps(t, t, _MM_SHUFFLE(1, 0, 3, 2)));
}
|
||
|
||
// 3-component cross product. Lane w of the result is always 0 for finite
// inputs (it folds to a.w*b.w - a.w*b.w).
inline f32x4 v_cross(f32x4 a, f32x4 b)
{
    // Classic two-shuffle formulation: cross(a,b) = (a*b.yzx - a.yzx*b).yzx
    f32x4 aYZX = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
    f32x4 bYZX = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));

    f32x4 t = _mm_sub_ps(_mm_mul_ps(a, bYZX), _mm_mul_ps(aYZX, b));

    return _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 0, 2, 1));
}
|
||
|
||
inline f32x4 v_normalize3(f32x4 v)
|
||
{
|
||
const f32x4 zero = _mm_setzero_ps();
|
||
const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0}
|
||
f32x4 dot = v_dot3(v, v);
|
||
|
||
f32x4 inv = v_rsqrt_nr(dot);
|
||
f32x4 isZero = _mm_cmpeq_ps(dot, zero);
|
||
|
||
f32x4 norm = _mm_mul_ps(v, inv);
|
||
|
||
// vbsl equivalent
|
||
f32x4 result = _mm_or_ps(
|
||
_mm_and_ps(isZero, fallback),
|
||
_mm_andnot_ps(isZero, norm)
|
||
);
|
||
|
||
return result;
|
||
}
|
||
|
||
// Horizontal add: sum of all 4 lanes, broadcast to every lane (SSE2 only).
inline f32x4 v_hadd4(f32x4 a)
{
    // {x,y,z,w} + {y,x,w,z} -> pair sums, then fold the halves together.
    __m128 t = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_add_ps(t, _mm_shuffle_ps(t, t, _MM_SHUFFLE(1, 0, 3, 2)));
}
|
||
|
||
//------------------------------------------------------
|
||
// Matrix type (row-major 4x4)
|
||
//------------------------------------------------------
|
||
|
||
// Row-major 4x4 matrix held as four SIMD rows.
struct f32x4x4
{
    f32x4 r0; // row 0
    f32x4 r1; // row 1
    f32x4 r2; // row 2
    f32x4 r3; // row 3
};
|
||
|
||
//------------------------------------------------------
|
||
// Matrix Load / Store
|
||
//------------------------------------------------------
|
||
|
||
// Load a row-major 4x4 matrix from 16 contiguous floats (unaligned).
inline f32x4x4 m_load(const float* m)
{
    f32x4x4 out;
    out.r0 = _mm_loadu_ps(m);
    out.r1 = _mm_loadu_ps(m + 4);
    out.r2 = _mm_loadu_ps(m + 8);
    out.r3 = _mm_loadu_ps(m + 12);
    return out;
}
|
||
|
||
// Write the 4 rows back to 16 contiguous floats (unaligned).
inline void m_store(float* dst, const f32x4x4& m)
{
    _mm_storeu_ps(dst,      m.r0);
    _mm_storeu_ps(dst + 4,  m.r1);
    _mm_storeu_ps(dst + 8,  m.r2);
    _mm_storeu_ps(dst + 12, m.r3);
}
|
||
|
||
// Identity matrix (row-major).
inline f32x4x4 m_identity()
{
    f32x4x4 m;
    m.r0 = _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f);
    m.r1 = _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f);
    m.r2 = _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f);
    m.r3 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
    return m;
}
|
||
|
||
// All-zero matrix.
inline f32x4x4 m_zero()
{
    const f32x4 z = _mm_setzero_ps();
    f32x4x4 m = { z, z, z, z };
    return m;
}
|
||
|
||
//------------------------------------------------------
|
||
// Matrix × Vector
|
||
//------------------------------------------------------
|
||
|
||
inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v)
|
||
{
|
||
f32x4 x = v_dot4(m.r0, v);
|
||
f32x4 y = v_dot4(m.r1, v);
|
||
f32x4 z = v_dot4(m.r2, v);
|
||
f32x4 w = v_dot4(m.r3, v);
|
||
|
||
// combine to a vector
|
||
return _mm_set_ps(_mm_cvtss_f32(w),
|
||
_mm_cvtss_f32(z),
|
||
_mm_cvtss_f32(y),
|
||
_mm_cvtss_f32(x));
|
||
}
|
||
|
||
inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v)
|
||
{
|
||
f32x4 x = v_dot3(m.r0, v);
|
||
f32x4 y = v_dot3(m.r1, v);
|
||
f32x4 z = v_dot3(m.r2, v);
|
||
|
||
// combine to a vector
|
||
return _mm_set_ps(0.0f,
|
||
_mm_cvtss_f32(z),
|
||
_mm_cvtss_f32(y),
|
||
_mm_cvtss_f32(x));
|
||
}
|
||
|
||
//------------------------------------------------------
|
||
// Matrix × Matrix
|
||
//------------------------------------------------------
|
||
|
||
inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b)
|
||
{
|
||
// Transpose B once for dot products
|
||
f32x4 b0 = b.r0;
|
||
f32x4 b1 = b.r1;
|
||
f32x4 b2 = b.r2;
|
||
f32x4 b3 = b.r3;
|
||
|
||
// Transpose (SSE2)
|
||
_MM_TRANSPOSE4_PS(b0, b1, b2, b3);
|
||
|
||
f32x4x4 C;
|
||
|
||
auto mul_row = [&](f32x4 arow)
|
||
{
|
||
f32x4 x = v_dot4(arow, b0);
|
||
f32x4 y = v_dot4(arow, b1);
|
||
f32x4 z = v_dot4(arow, b2);
|
||
f32x4 w = v_dot4(arow, b3);
|
||
|
||
f32x4 xy = _mm_unpacklo_ps(x, y);
|
||
f32x4 zw = _mm_unpacklo_ps(z, w);
|
||
return _mm_movelh_ps(xy, zw);
|
||
};
|
||
|
||
C.r0 = mul_row(a.r0);
|
||
C.r1 = mul_row(a.r1);
|
||
C.r2 = mul_row(a.r2);
|
||
C.r3 = mul_row(a.r3);
|
||
|
||
return C;
|
||
}
|
||
|
||
//------------------------------------------------------
|
||
// Transpose
|
||
//------------------------------------------------------
|
||
|
||
// Matrix transpose via the unpack/move sequence (the same instruction
// pattern _MM_TRANSPOSE4_PS expands to, without mutating locals in place).
inline f32x4x4 m_transpose(const f32x4x4& m)
{
    f32x4 lo01 = _mm_unpacklo_ps(m.r0, m.r1); // {m00, m10, m01, m11}
    f32x4 lo23 = _mm_unpacklo_ps(m.r2, m.r3); // {m20, m30, m21, m31}
    f32x4 hi01 = _mm_unpackhi_ps(m.r0, m.r1); // {m02, m12, m03, m13}
    f32x4 hi23 = _mm_unpackhi_ps(m.r2, m.r3); // {m22, m32, m23, m33}

    f32x4x4 out;
    out.r0 = _mm_movelh_ps(lo01, lo23);
    out.r1 = _mm_movehl_ps(lo23, lo01);
    out.r2 = _mm_movelh_ps(hi01, hi23);
    out.r3 = _mm_movehl_ps(hi23, hi01);
    return out;
}
|
||
|
||
// Write three column vectors as the rows of a 3x3 block inside a
// destination with a 4-float row stride (dst[3], dst[7], dst[11] untouched).
inline void m_set3x3_transpose(float* dst, f32x4 c0, f32x4 c1, f32x4 c2)
{
    alignas(16) float a[4], b[4], c[4];
    _mm_store_ps(a, c0);
    _mm_store_ps(b, c1);
    _mm_store_ps(c, c2);

    dst[0] = a[0];  dst[1] = b[0];  dst[2]  = c[0];
    dst[4] = a[1];  dst[5] = b[1];  dst[6]  = c[1];
    dst[8] = a[2];  dst[9] = b[2];  dst[10] = c[2];
}
|
||
|
||
// 4x4 inverse built from cross products of the rows.
// NOTE(review): the crosses use only the xyz lanes, so this is not a
// general 4x4 inverse; it appears intended for a restricted matrix class
// (e.g. affine) -- verify against a reference implementation. It also
// multiplies every lane of the adjugate rows by v_rcp_nr(det), while the
// current v_dot4 only guarantees the dot value in lanes 0 and 1, so lanes
// beyond the first two of the transposed result may be scaled by garbage
// -- TODO confirm before relying on full-matrix output.
inline f32x4x4 m_inverse(const f32x4x4& m)
{
    f32x4 a = m.r0;
    f32x4 b = m.r1;
    f32x4 c = m.r2;
    f32x4 d = m.r3;

    // Pairwise row cross products (xyz only; w lanes come out 0).
    f32x4 c0 = v_cross(b, c);
    f32x4 c1 = v_cross(c, d);
    f32x4 c2 = v_cross(d, a);
    f32x4 c3 = v_cross(a, b);

    // "Determinant" = a . (c x d); reciprocal multiplied lane-wise below.
    f32x4 det = v_dot4(a, c1);
    f32x4 invDet = v_rcp_nr(det);

    // Adjugate rows scaled by 1/det, then transposed into the result.
    f32x4x4 adj;
    adj.r0 = _mm_mul_ps(c1, invDet);
    adj.r1 = _mm_mul_ps(c2, invDet);
    adj.r2 = _mm_mul_ps(c3, invDet);
    adj.r3 = _mm_mul_ps(c0, invDet);

    return m_transpose(adj);
}
|
||
|
||
inline f32x4x4 m_inverse_rigid(const f32x4x4& m)
|
||
{
|
||
f32x4x4 t = m_transpose(m);
|
||
|
||
// Extract translation
|
||
f32x4 T = v_set(
|
||
_mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))),
|
||
_mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))),
|
||
_mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))),
|
||
0.0f
|
||
);
|
||
|
||
f32x4 newT = m_mul_vec4(t, T);
|
||
newT = _mm_sub_ps(v_zero(), newT);
|
||
|
||
t.r0 = v_preserve_w(t.r0, newT);
|
||
t.r1 = v_preserve_w(t.r1, newT);
|
||
t.r2 = v_preserve_w(t.r2, newT);
|
||
t.r3 = v_set(0, 0, 0, 1);
|
||
|
||
return t;
|
||
}
|
||
|
||
inline f32x4x4 m_inverse_affine(const f32x4x4& m)
|
||
{
|
||
// Extract upper 3x3 rows
|
||
f32x4 r0 = m.r0;
|
||
f32x4 r1 = m.r1;
|
||
f32x4 r2 = m.r2;
|
||
|
||
// Zero translation
|
||
r0 = _mm_and_ps(r0, v_mask_xyz());
|
||
r1 = _mm_and_ps(r1, v_mask_xyz());
|
||
r2 = _mm_and_ps(r2, v_mask_xyz());
|
||
|
||
// Compute cofactors via cross products
|
||
f32x4 c0 = v_cross(r1, r2);
|
||
f32x4 c1 = v_cross(r2, r0);
|
||
f32x4 c2 = v_cross(r0, r1);
|
||
|
||
// Determinant
|
||
f32x4 det = v_dot3(r0, c0);
|
||
|
||
// Reciprocal determinant
|
||
f32x4 invDet = v_rcp_nr(det);
|
||
|
||
// Inverse 3x3 (transpose of cofactor matrix)
|
||
f32x4 i0 = _mm_mul_ps(c0, invDet);
|
||
f32x4 i1 = _mm_mul_ps(c1, invDet);
|
||
f32x4 i2 = _mm_mul_ps(c2, invDet);
|
||
|
||
// Transpose 3x3
|
||
f32x4 t0 = i0;
|
||
f32x4 t1 = i1;
|
||
f32x4 t2 = i2;
|
||
f32x4 t3 = _mm_setzero_ps();
|
||
|
||
_MM_TRANSPOSE4_PS(t0, t1, t2, t3);
|
||
|
||
// Extract translation
|
||
alignas(16) float tmp[4];
|
||
tmp[0] = _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3)));
|
||
tmp[1] = _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3)));
|
||
tmp[2] = _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3)));
|
||
tmp[3] = 0.0f;
|
||
|
||
f32x4 T = v_load(tmp);
|
||
|
||
// New translation = -R' * T
|
||
f32x4 newT = m_mul_vec4({ t0, t1, t2, t3 }, T);
|
||
newT = _mm_sub_ps(v_zero(), newT);
|
||
|
||
// Assemble final matrix
|
||
f32x4x4 out;
|
||
out.r0 = v_preserve_w(t0, newT);
|
||
out.r1 = v_preserve_w(t1, newT);
|
||
out.r2 = v_preserve_w(t2, newT);
|
||
out.r3 = _mm_set_ps(1, 0, 0, 0);
|
||
|
||
return out;
|
||
}
|
||
|
||
// Determinant from row cross products.
// NOTE(review): every w component is discarded by v_cross, so this is NOT
// a general 4x4 determinant. For matrices whose last row is {0,0,0,1} it
// does reduce to the correct value (c0 and c1 vanish because d.xyz = 0 and
// the sum collapses to r0 . (r1 x r2)) -- confirm callers only pass such
// matrices, or use full cofactor expansion for the general case.
inline f32x4 m_determinant(const f32x4x4& m)
{
    f32x4 a = m.r0;
    f32x4 b = m.r1;
    f32x4 c = m.r2;
    f32x4 d = m.r3;

    // xyz-only cross products; w lanes of each come out 0.
    f32x4 c0 = v_cross(c, d);
    f32x4 c1 = v_cross(d, b);
    f32x4 c2 = v_cross(b, c);

    // a scaled by each cross; the w terms are all zero.
    f32x4 term0 = _mm_mul_ps(a, c0);
    f32x4 term1 = _mm_mul_ps(a, c1);
    f32x4 term2 = _mm_mul_ps(a, c2);

    f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2));

    // Horizontal sum -> scalar triple-product sum splat to all lanes.
    return v_hadd4(det);
}
|
||
|
||
// Determinant of the upper 3x3 block: r0 . (r1 x r2), splat to all lanes
// (see v_dot3). Equals the full determinant when the last row is {0,0,0,1}.
inline f32x4 m_determinant_affine(const f32x4x4& m)
{
    f32x4 r0 = m.r0;
    f32x4 r1 = m.r1;
    f32x4 r2 = m.r2;

    // Clear the w lanes so the translation column cannot contribute.
    r0 = _mm_and_ps(r0, v_mask_xyz());
    r1 = _mm_and_ps(r1, v_mask_xyz());
    r2 = _mm_and_ps(r2, v_mask_xyz());

    f32x4 c0 = v_cross(r1, r2);
    return v_dot3(r0, c0); // splatted determinant
}
|
||
|
||
//---------------------------------------------------------
|
||
// BATCH INTRINSICS
|
||
//---------------------------------------------------------
|
||
|
||
// Structure-of-arrays batch of four 4D vectors: lane i of each member
// holds that component of vector i (x = the four x's, and so on).
struct vec4_batch4
{
    f32x4 x; // x components of vectors 0..3
    f32x4 y; // y components of vectors 0..3
    f32x4 z; // z components of vectors 0..3
    f32x4 w; // w components of vectors 0..3
};
|
||
|
||
|
||
// Gather four packed float3s (ptr[0..11]) into SoA form.
// If fillW, lane w of every vector is set to `w`; otherwise w is zeroed.
// (Previously w was left uninitialized when !fillW, so copying or reading
// the returned struct touched an indeterminate value.)
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
    vec4_batch4 r;

    r.x = _mm_set_ps(ptr[9],  ptr[6], ptr[3], ptr[0]);
    r.y = _mm_set_ps(ptr[10], ptr[7], ptr[4], ptr[1]);
    r.z = _mm_set_ps(ptr[11], ptr[8], ptr[5], ptr[2]);
    r.w = fillW ? _mm_set1_ps(w) : _mm_setzero_ps();

    return r;
}
|
||
|
||
// Unaligned store of all 4 lanes to a float array.
inline void store_f32x4(float* out, f32x4 v)
{
    _mm_storeu_ps(out, v);
}
|
||
|
||
// Scatter the SoA batch back to four packed float3s (out[0..11]).
// The w member is ignored.
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
{
    alignas(16) float xs[4], ys[4], zs[4];
    _mm_store_ps(xs, v.x);
    _mm_store_ps(ys, v.y);
    _mm_store_ps(zs, v.z);

    for (int i = 0; i < 4; ++i)
    {
        float* dst = out + i * 3;
        dst[0] = xs[i];
        dst[1] = ys[i];
        dst[2] = zs[i];
    }
}
|
||
|
||
// Four 3-component dot products at once: lane i = dot3(a_i, b_i).
inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b)
{
    f32x4 acc = _mm_mul_ps(a.x, b.x);
    acc = _mm_add_ps(acc, _mm_mul_ps(a.y, b.y));
    return _mm_add_ps(acc, _mm_mul_ps(a.z, b.z));
}
|
||
|
||
// Batch 4 mul_Vec4.
|
||
// Transform four points (SoA) by the upper 3 rows of a row-major 4x4:
// out_i = M * {p_i, 1}. The translation (m[r][3]) is always added, so the
// input w member is ignored. Lane w of the result is set to 1 (positions);
// previously it was left uninitialized.
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
{
    // One output component: x*row[0] + y*row[1] + z*row[2] + row[3],
    // evaluated for 4 points at once (same add tree as before).
    auto dotRow = [&](const float* row)
    {
        return _mm_add_ps(
            _mm_add_ps(_mm_mul_ps(v.x, _mm_set1_ps(row[0])),
                       _mm_mul_ps(v.y, _mm_set1_ps(row[1]))),
            _mm_add_ps(_mm_mul_ps(v.z, _mm_set1_ps(row[2])),
                       _mm_set1_ps(row[3])));
    };

    vec4_batch4 r;
    r.x = dotRow(m + 0);
    r.y = dotRow(m + 4);
    r.z = dotRow(m + 8);
    r.w = _mm_set1_ps(1.0f); // positions carry w = 1 (robustness fix)
    return r;
}
|
||
}
|