2026-03-03 19:09:00 +00:00
|
|
|
#pragma once
|
|
|
|
|
#include <cmath> // for sqrtf, etc.
|
|
|
|
|
#include "../mConstants.h"
|
|
|
|
|
|
|
|
|
|
namespace math_backend::mat44
|
|
|
|
|
{
|
|
|
|
|
//------------------------------------------------------------------
|
|
|
|
|
// Matrix Transpose
|
|
|
|
|
inline void mat44_transpose_impl(float* m)
|
|
|
|
|
{
|
|
|
|
|
f32x4x4 ma = m_load(m);
|
|
|
|
|
f32x4x4 mr = m_transpose(ma);
|
|
|
|
|
m_store(m, mr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inline float mat44_get_determinant(const float* m)
|
|
|
|
|
{
|
|
|
|
|
f32x4x4 ma = m_load(m);
|
|
|
|
|
return v_extract0(m_determinant_affine(ma));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Matrix Scale: Float3 (assume w = 1.0f)
|
|
|
|
|
inline void mat44_scale_impl(float* m, const float* s)
|
|
|
|
|
{
|
|
|
|
|
f32x4x4 ma = m_load(m);
|
|
|
|
|
f32x4 va = v_load3_pos(s);
|
|
|
|
|
|
|
|
|
|
ma.r0 = v_mul(ma.r0, va);
|
|
|
|
|
ma.r1 = v_mul(ma.r1, va);
|
|
|
|
|
ma.r2 = v_mul(ma.r2, va);
|
|
|
|
|
ma.r3 = v_mul(ma.r3, va);
|
|
|
|
|
m_store(m, ma);
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-05 18:54:28 +00:00
|
|
|
// Transform a plane stored as (n.x, n.y, n.z, d) by the matrix `m` combined
// with a separate per-axis `scale`, writing the transformed plane to
// `plane_result`.
//
// The normal is transformed with the inverse-transpose (inverse scale, then
// the rotation rows of m) and re-normalized; the distance is recomputed from
// a transformed point on the plane.
//
// NOTE(review): the point on the plane is reconstructed as -d * n, which
// assumes the incoming normal is unit length — confirm with callers.
inline void mat44_transform_plane_impl(const float* m, const float* scale, const float* plane, float* plane_result)
{
    f32x4x4 M = m_load(m);

    f32x4 plane_v = v_load(plane);
    f32x4 scale_v = v_load3_vec(scale);   // (sx, sy, sz, 0)
    f32x4 invScale = v_rcp_nr(scale_v);   // reciprocal with Newton-Raphson refinement

    // normal = plane.xyz
    f32x4 normal = plane_v;

    // apply Inv(s)
    normal = v_mul(normal, invScale);

    // multiply by Inv(Tr(m)) (only the rotation part matters):
    // broadcast each normal component and accumulate against the matrix rows.
    f32x4 nx = v_mul(v_swizzle_singular_mask(normal, 0), M.r0);
    f32x4 ny = v_mul(v_swizzle_singular_mask(normal, 1), M.r1);
    f32x4 nz = v_mul(v_swizzle_singular_mask(normal, 2), M.r2);

    normal = v_add(v_add(nx, ny), nz);

    // Re-normalize: scaling/shear can change the normal's length.
    normal = v_normalize3(normal);

    // compute point on plane: p = -d * n (see unit-length assumption above)
    float d = v_extract0(v_swizzle_singular_mask(plane_v, 3));

    f32x4 point = v_mul(plane_v, v_set1(-d));
    point = v_preserve_w(point, v_set1(1.0f));   // force w = 1 so translation applies

    // apply scale
    point = v_mul(point, scale_v);

    // transform point by matrix
    point = m_mul_vec4(M, point);

    // compute new plane distance: d' = -dot(p', n')
    float newD = -v_extract0(v_dot3(point, normal));

    alignas(16) float n[4];
    v_store(n, normal);

    plane_result[0] = n[0];
    plane_result[1] = n[1];
    plane_result[2] = n[2];
    plane_result[3] = newD;
}
|
|
|
|
|
|
2026-03-03 19:09:00 +00:00
|
|
|
inline void mat44_get_scale_impl(const float* m, float* s)
|
|
|
|
|
{
|
|
|
|
|
f32x4x4 ma = m_load(m);
|
|
|
|
|
|
|
|
|
|
// squared lengths
|
|
|
|
|
f32x4 len2_x = v_dot3(ma.r0, ma.r0);
|
|
|
|
|
f32x4 len2_y = v_dot3(ma.r1, ma.r1);
|
|
|
|
|
f32x4 len2_z = v_dot3(ma.r2, ma.r2);
|
|
|
|
|
|
|
|
|
|
// extract and sqrt
|
|
|
|
|
s[0] = 1.0f / v_extract0(v_rsqrt_nr(len2_x));
|
|
|
|
|
s[1] = 1.0f / v_extract0(v_rsqrt_nr(len2_y));
|
|
|
|
|
s[2] = 1.0f / v_extract0(v_rsqrt_nr(len2_z));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Matrix Scale Uniform: Float value (assume w = 1.0f)
|
|
|
|
|
inline void mat44_scale_uniform(float* m, float s)
|
|
|
|
|
{
|
|
|
|
|
f32x4x4 ma = m_load(m);
|
|
|
|
|
|
|
|
|
|
// (s, s, s, 1)
|
|
|
|
|
f32x4 scale = v_set(s, s, s, 1.0f);
|
|
|
|
|
|
|
|
|
|
// Scale only rotation rows (xyz part)
|
|
|
|
|
ma.r0 = v_mul(ma.r0, scale);
|
|
|
|
|
ma.r1 = v_mul(ma.r1, scale);
|
|
|
|
|
ma.r2 = v_mul(ma.r2, scale);
|
|
|
|
|
m_store(m, ma);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Matrix Inverse
|
|
|
|
|
inline void mat44_inverse_impl(float* m)
|
|
|
|
|
{
|
|
|
|
|
f32x4x4 ma = m_load(m);
|
|
|
|
|
|
|
|
|
|
// Compute cofactors using cross products
|
|
|
|
|
f32x4x4 mTemp;
|
|
|
|
|
mTemp.r0 = v_cross(ma.r1, ma.r2);
|
|
|
|
|
mTemp.r1 = v_cross(ma.r2, ma.r0);
|
|
|
|
|
mTemp.r2 = v_cross(ma.r0, ma.r1);
|
|
|
|
|
|
|
|
|
|
// Determinant = dot(ma.r0, c0)
|
|
|
|
|
f32x4 det = v_dot3(ma.r0, mTemp.r0);
|
|
|
|
|
f32x4 invDet = v_rcp_nr(det);
|
|
|
|
|
|
|
|
|
|
// Scale cofactors
|
|
|
|
|
mTemp.r0 = v_mul(mTemp.r0, invDet);
|
|
|
|
|
mTemp.r1 = v_mul(mTemp.r1, invDet);
|
|
|
|
|
mTemp.r2 = v_mul(mTemp.r2, invDet);
|
|
|
|
|
|
|
|
|
|
// Store inverse 3x3 (transpose of cofactor matrix)
|
|
|
|
|
|
|
|
|
|
mTemp = m_transpose(mTemp);
|
2026-03-04 09:56:29 +00:00
|
|
|
mTemp.r3 = ma.r3;
|
2026-03-03 19:09:00 +00:00
|
|
|
|
|
|
|
|
// ---- Translation ----
|
|
|
|
|
|
|
|
|
|
// Load original translation
|
|
|
|
|
f32x4 T = v_set(m[3], m[7], m[11], 0.0f);
|
|
|
|
|
|
|
|
|
|
// Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2)
|
|
|
|
|
f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0));
|
|
|
|
|
result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1)));
|
|
|
|
|
result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2)));
|
|
|
|
|
result = v_mul(result, v_set1(-1.0f));
|
|
|
|
|
|
|
|
|
|
m_store(m, mTemp);
|
|
|
|
|
|
|
|
|
|
// Store translation
|
|
|
|
|
m[3] = v_extract0(result);
|
|
|
|
|
m[7] = v_extract0(v_swizzle_singular_mask(result, 1));
|
|
|
|
|
m[11] = v_extract0(v_swizzle_singular_mask(result, 2));
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-04 09:56:29 +00:00
|
|
|
// Matrix Inverse
|
|
|
|
|
inline void mat44_inverse_to_impl(const float* m, float* dst)
|
|
|
|
|
{
|
|
|
|
|
f32x4x4 ma = m_load(m);
|
|
|
|
|
|
|
|
|
|
// Compute cofactors using cross products
|
|
|
|
|
f32x4x4 mTemp;
|
|
|
|
|
mTemp.r0 = v_cross(ma.r1, ma.r2);
|
|
|
|
|
mTemp.r1 = v_cross(ma.r2, ma.r0);
|
|
|
|
|
mTemp.r2 = v_cross(ma.r0, ma.r1);
|
|
|
|
|
|
|
|
|
|
// Determinant = dot(ma.r0, c0)
|
|
|
|
|
f32x4 det = v_dot3(ma.r0, mTemp.r0);
|
|
|
|
|
f32x4 invDet = v_rcp_nr(det);
|
|
|
|
|
|
|
|
|
|
// Scale cofactors
|
|
|
|
|
mTemp.r0 = v_mul(mTemp.r0, invDet);
|
|
|
|
|
mTemp.r1 = v_mul(mTemp.r1, invDet);
|
|
|
|
|
mTemp.r2 = v_mul(mTemp.r2, invDet);
|
|
|
|
|
|
|
|
|
|
// Store inverse 3x3 (transpose of cofactor matrix)
|
|
|
|
|
|
|
|
|
|
mTemp = m_transpose(mTemp);
|
|
|
|
|
mTemp.r3 = ma.r3;
|
|
|
|
|
|
|
|
|
|
// ---- Translation ----
|
|
|
|
|
|
|
|
|
|
// Load original translation
|
2026-03-04 10:25:47 +00:00
|
|
|
f32x4 T = v_set(m[3], m[7], m[11], 0.0f);
|
2026-03-04 09:56:29 +00:00
|
|
|
|
|
|
|
|
// Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2)
|
|
|
|
|
f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0));
|
|
|
|
|
result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1)));
|
|
|
|
|
result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2)));
|
|
|
|
|
result = v_mul(result, v_set1(-1.0f));
|
|
|
|
|
|
|
|
|
|
m_store(dst, mTemp);
|
|
|
|
|
|
|
|
|
|
// Store translation
|
|
|
|
|
dst[3] = v_extract0(result);
|
|
|
|
|
dst[7] = v_extract0(v_swizzle_singular_mask(result, 1));
|
|
|
|
|
dst[11] = v_extract0(v_swizzle_singular_mask(result, 2));
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-03 19:09:00 +00:00
|
|
|
// Matrix Affine Inverse
|
|
|
|
|
// Matrix Affine Inverse (in place).
// Fast inverse for a rigid transform: assumes the upper 3x3 is orthonormal
// (pure rotation), so inv(R) == R^T, with translation in the fourth column
// (m[3], m[7], m[11]). The bottom row is reset to (0, 0, 0, 1).
inline void mat44_affine_inverse_impl(float* m)
{
    f32x4x4 ma = m_load(m);

    // inv(R) = R^T for an orthonormal basis.
    f32x4x4 mTemp = m_transpose(ma);
    mTemp.r3 = v_set(0, 0, 0, 1);

    // ---- Translation ----
    // Load original translation (read from m BEFORE the store below).
    f32x4 T = v_set(m[3], m[7], m[11], 0.0f);

    // Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2) == -(R^T * T),
    // i.e. the inverse translation expressed against the transposed basis.
    f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0));
    result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1)));
    result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2)));
    result = v_mul(result, v_set1(-1.0f));

    m_store(m, mTemp);

    // Store translation into the fourth column (overwrites the transposed
    // w lanes written by the store above).
    m[3] = v_extract0(result);
    m[7] = v_extract0(v_swizzle_singular_mask(result, 1));
    m[11] = v_extract0(v_swizzle_singular_mask(result, 2));
}
|
|
|
|
|
|
2026-03-04 21:22:22 +00:00
|
|
|
// Re-orthonormalize the rotation part of the matrix in place (e.g. to remove
// accumulated drift after many concatenations). Works on COLUMNS: the matrix
// is transposed so columns become rows, the basis is rebuilt from the first
// two columns with cross products, normalized, and transposed back. The
// fourth column/row (translation, bottom row) passes through unchanged.
inline void mat44_normalize_impl(float* m)
{
    // Load the matrix into SIMD registers
    f32x4x4 mat = m_load(m);

    // Transpose: now rows are columns
    mat = m_transpose(mat);

    // Extract columns (which are now rows)
    f32x4 col0 = mat.r0;
    f32x4 col1 = mat.r1;

    // Rebuild orthonormal basis: col2 from col0/col1, then col1 re-derived
    // so all three are mutually perpendicular (Gram-Schmidt via crosses).
    f32x4 col2 = v_cross(col0, col1);
    col1 = v_cross(col2, col0);

    // Normalize columns (xyz lanes)
    col0 = v_normalize3(col0);
    col1 = v_normalize3(col1);
    col2 = v_normalize3(col2);

    // Write back directly into transposed matrix; mat.r3 (the original
    // fourth column) is deliberately untouched.
    mat.r0 = col0;
    mat.r1 = col1;
    mat.r2 = col2;

    // Transpose back to row-major
    mat = m_transpose(mat);

    // Store back
    m_store(m, mat);
}
|
|
|
|
|
|
|
|
|
|
// Matrix Multiply: a * b
|
|
|
|
|
inline void mat44_mul_mat44_impl(const float* a, const float* b, float* r)
|
|
|
|
|
{
|
|
|
|
|
f32x4x4 ma = m_load(a);
|
|
|
|
|
f32x4x4 mb = m_load(b);
|
|
|
|
|
|
|
|
|
|
f32x4x4 mr = m_mul(ma, mb);
|
|
|
|
|
m_store(r, mr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Vector Multiply: m * p (assume w = 1.0f)
|
|
|
|
|
inline void mat44_mul_pos3_impl(const float *m, const float *p, float* r)
|
|
|
|
|
{
|
|
|
|
|
f32x4x4 ma = m_load(m);
|
|
|
|
|
f32x4 va = v_load3_pos(p);
|
|
|
|
|
f32x4 vr = m_mul_vec4(ma, va);
|
|
|
|
|
v_store3(r, vr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Vector Multiply: m * v (assume w = 0.0f)
|
|
|
|
|
inline void mat44_mul_vec3_impl(const float* m, const float* v, float* r)
|
|
|
|
|
{
|
|
|
|
|
f32x4x4 ma = m_load(m);
|
|
|
|
|
f32x4 va = v_load3_vec(v);
|
2026-03-04 21:22:22 +00:00
|
|
|
f32x4 vr = m_mul_vec3(ma, va);
|
2026-03-03 19:09:00 +00:00
|
|
|
v_store3(r, vr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Vector Multiply: m * p (full [4x4] * [1x4])
|
|
|
|
|
inline void mat44_mul_float4_impl(const float* m, const float* p, float* r)
|
|
|
|
|
{
|
|
|
|
|
f32x4x4 ma = m_load(m);
|
|
|
|
|
f32x4 va = v_load(p);
|
|
|
|
|
f32x4 vr = m_mul_vec4(ma, va);
|
|
|
|
|
v_store(r, vr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//--------------------------------------------------
|
|
|
|
|
// MATRIX ROTATION FUNCTIONS
|
|
|
|
|
//--------------------------------------------------
|
|
|
|
|
|
|
|
|
|
inline void mat44_rotation_x_impl(float* m, float angle)
|
|
|
|
|
{
|
|
|
|
|
float c = cosf(angle), s = sinf(angle);
|
|
|
|
|
f32x4x4 mr = m_identity();
|
|
|
|
|
mr.r1 = v_set(0, c, s, 0);
|
|
|
|
|
mr.r2 = v_set(0, -s, c, 0);
|
|
|
|
|
m_store(m, mr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inline void mat44_rotation_y_impl(float* m, float angle)
|
|
|
|
|
{
|
|
|
|
|
float c = cosf(angle), s = sinf(angle);
|
|
|
|
|
f32x4x4 mr = m_identity();
|
|
|
|
|
mr.r0 = v_set(c, 0, -s, 0);
|
|
|
|
|
mr.r2 = v_set(s, 0, c, 0);
|
|
|
|
|
m_store(m, mr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inline void mat44_rotation_z_impl(float* m, float angle)
|
|
|
|
|
{
|
|
|
|
|
float c = cosf(angle), s = sinf(angle);
|
|
|
|
|
f32x4x4 mr = m_identity();
|
|
|
|
|
mr.r0 = v_set(c, s, 0, 0);
|
|
|
|
|
mr.r1 = v_set(-s, c, 0, 0);
|
|
|
|
|
m_store(m, mr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Compose rotation from Euler angles (pitch=X, yaw=Y, roll=Z)
|
|
|
|
|
inline void mat44_rotation_euler_impl(float* m, float pitch, float yaw, float roll)
|
|
|
|
|
{
|
|
|
|
|
f32x4x4 rx, ry, rz;
|
|
|
|
|
mat44_rotation_x_impl((float*)&rx, pitch);
|
|
|
|
|
mat44_rotation_y_impl((float*)&ry, yaw);
|
|
|
|
|
mat44_rotation_z_impl((float*)&rz, roll);
|
|
|
|
|
|
|
|
|
|
f32x4x4 r = m_mul(rz, m_mul(ry, rx));
|
|
|
|
|
m_store(m, r);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Build a view (world-to-camera) matrix looking from `eye` toward `target`
// with the given approximate `up` vector.
// NOTE(review): forward is +Z toward the target (left-handed style basis with
// xaxis = up x zaxis) — confirm against the renderer's convention.
inline void mat44_lookat_impl(float* m, const float* eye, const float* target, const float* up)
{
    f32x4 vEye = v_load3_pos(eye);
    f32x4 vTarget = v_load3_pos(target);
    f32x4 vUp = v_load3_vec(up);

    // Forward (z+)
    f32x4 zaxis = v_normalize3(v_sub(vTarget, vEye));

    // Right (x+)
    f32x4 xaxis = v_normalize3(v_cross(vUp, zaxis));

    // Up (y+) — already unit length since zaxis and xaxis are orthonormal
    f32x4 yaxis = v_cross(zaxis, xaxis);

    // Compute translation components: -dot(axis, eye)
    f32x4 t_x = v_mul(v_dot3(xaxis, vEye), v_set1(-1.0f));
    f32x4 t_y = v_mul(v_dot3(yaxis, vEye), v_set1(-1.0f));
    f32x4 t_z = v_mul(v_dot3(zaxis, vEye), v_set1(-1.0f));

    // Rows hold the camera axes with the translation packed into the w lane
    // (presumably v_insert_w takes lane 0 of its second argument — verify).
    f32x4x4 view;
    view.r0 = v_insert_w(xaxis, t_x);
    view.r1 = v_insert_w(yaxis, t_y);
    view.r2 = v_insert_w(zaxis, t_z);
    view.r3 = v_set(0, 0, 0, 1.0f);

    m_store(m, view);
}
|
|
|
|
|
|
|
|
|
|
inline void mat44_perspective_impl(float* m, float fovY, float aspect, float znear, float zfar)
|
|
|
|
|
{
|
|
|
|
|
float f = 1.0f / tanf(fovY * 0.5f);
|
|
|
|
|
float nf = 1.0f / (znear - zfar);
|
|
|
|
|
|
|
|
|
|
f32x4x4 mp = m_zero();
|
|
|
|
|
mp.r0 = v_set(f / aspect, 0, 0, 0);
|
|
|
|
|
mp.r1 = v_set(0, f, 0, 0);
|
|
|
|
|
mp.r2 = v_set(0, 0, (zfar + znear) * nf, 2 * zfar * znear * nf);
|
|
|
|
|
mp.r3 = v_set(0, 0, -1, 0); // row-major projection
|
|
|
|
|
m_store(m, mp);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inline void mat44_orthographic_impl(float* m, float left, float right, float bottom, float top, float znear, float zfar)
|
|
|
|
|
{
|
|
|
|
|
f32x4x4 mo = m_zero();
|
|
|
|
|
mo.r0 = v_set(2.0f / (right - left), 0, 0, -(right + left) / (right - left));
|
|
|
|
|
mo.r1 = v_set(0, 2.0f / (top - bottom), 0, -(top + bottom) / (top - bottom));
|
|
|
|
|
mo.r2 = v_set(0, 0, -2.0f / (zfar - znear), -(zfar + znear) / (zfar - znear));
|
|
|
|
|
mo.r3 = v_set(0, 0, 0, 1.0f);
|
|
|
|
|
m_store(m, mo);
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-05 18:54:28 +00:00
|
|
|
//--------------------------------------------------
|
|
|
|
|
// MATRIX BATCH FUNCTIONS
|
|
|
|
|
//--------------------------------------------------
|
|
|
|
|
|
2026-03-05 20:04:33 +00:00
|
|
|
// Transform `count` tightly-packed xyz positions (w assumed 1.0f) by the
// matrix m, writing packed xyz results to `result`. Processes in three tiers:
// 8-wide (AVX builds only), then 4-wide, then a scalar tail.
// NOTE(review): the trailing `false` argument to the batch loaders is opaque
// from here (alignment flag?) — confirm against the batch helpers.
inline void mat44_batch_mul_pos3(const float* m, const float* points, int count, float* result)
{
    int i = 0;

    // AVX has 8 lanes to play with
#if defined(MATH_SIMD_AVX2) || defined(MATH_SIMD_AVX)
    // 8-wide AVX only
    for (; i + 8 <= count; i += 8)
    {
        vec4_batch8 va = load_vec3_batch8(&points[i*3], 1.0f, false);
        vec4_batch8 vr = m_mul_pos3_batch8(m, va);
        store_vec3_batch8(&result[i*3], vr);
    }
#endif // MATH_SIMD_AVX2 || MATH_SIMD_AVX

    // 4-wide
    for (; i + 4 <= count; i += 4)
    {
        vec4_batch4 va = load_vec3_batch4(&points[i * 3], 1.0f, false);
        vec4_batch4 vr = m_mul_pos3_batch4(m, va);
        store_vec3_batch4(&result[i * 3], vr);
    }

    // Scalar tail: one position at a time.
    for (; i < count; ++i)
    {
        size_t idx = i * 3;
        mat44_mul_pos3_impl(m, &points[idx], &result[idx]);
    }
}
|
|
|
|
|
|
2026-03-03 19:09:00 +00:00
|
|
|
} // namespace math_backend::mat44
|