Merge pull request #1680 from marauder2k9-torque/SIMD-Math-and-ISA-integration
Some checks failed
Linux Build / Ubuntu Latest GCC (push) Has been cancelled
MacOSX Build / MacOSX Latest Clang (push) Has been cancelled
Windows Build / Windows Latest MSVC (push) Has been cancelled

SIMD math and ISA integration
This commit is contained in:
Brian Roberts 2026-03-13 12:50:17 -05:00 committed by GitHub
commit 6700cfd993
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
49 changed files with 5232 additions and 212 deletions

View file

@@ -130,7 +130,7 @@ torqueAddSourceDirectories("windowManager" "windowManager/torque" "windowManager
torqueAddSourceDirectories("scene" "scene/culling" "scene/zones" "scene/mixin")
# Handle math
torqueAddSourceDirectories("math" "math/util")
torqueAddSourceDirectories("math" "math/util" "math/public" "math/impl") # note impl must skip the .inl files, never use them in engine code.
# Handle persistence
set(TORQUE_INCLUDE_DIRECTORIES ${TORQUE_INCLUDE_DIRECTORIES} "persistence/rapidjson")
@@ -496,6 +496,37 @@ else()
set_target_properties(${TORQUE_APP_NAME} PROPERTIES LINK_FLAGS "-Wl,-rpath,./")
endif()
string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" ARCH)
set(IS_X86 FALSE)
set(IS_ARM FALSE)
if(ARCH MATCHES "x86_64|amd64|i[3-6]86")
set(IS_X86 TRUE)
endif()
if(ARCH MATCHES "arm64|aarch64")
set(IS_ARM TRUE)
endif()
# The scalar backend is always available
add_math_backend(scalar MATH_SIMD_SCALAR)
message(STATUS "Processor: ${CMAKE_SYSTEM_PROCESSOR}")
message(STATUS "IS_X86=${IS_X86}")
message(STATUS "IS_ARM=${IS_ARM}")
# x86 family
if(IS_X86)
add_math_backend(sse2 MATH_SIMD_SSE2)
add_math_backend(sse41 MATH_SIMD_SSE41)
add_math_backend(avx MATH_SIMD_AVX)
add_math_backend(avx2 MATH_SIMD_AVX2)
endif()
# ARM family
if(IS_ARM)
add_math_backend(neon MATH_SIMD_NEON)
endif()
if(MSVC)
# Match projectGenerator naming for executables
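The backend list above pairs each ISA with a MATH_SIMD_* define. Judging from the per-ISA source files later in this diff, each add_math_backend target presumably compiles the shared, ISA-agnostic .inl bodies against a matching intrinsics header (the macro body itself is not part of this excerpt); a minimal sketch of that translation-unit pattern:

// built with AVX flags and -DMATH_SIMD_AVX (names as used later in this diff)
#include "avx_intrinsics.h" // supplies the v_* / m_* primitives for this ISA
#include "float3_impl.inl" // ISA-agnostic bodies compiled against those primitives
// an install_avx() function then points the gFloat3 table at these symbols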

View file

@@ -3406,7 +3406,7 @@ MatrixF MeshRoad::getNodeTransform( U32 idx )
mat.setColumn( 2, node.normal );
mat.setColumn( 3, node.point );
AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!");
AssertFatal(mat.determinant() != 0.0f, "no inverse!");
return mat;
}
@@ -3456,7 +3456,7 @@ void MeshRoad::calcSliceTransform( U32 idx, MatrixF &mat )
mat.setColumn( 2, slice.normal );
mat.setColumn( 3, slice.p1 );
AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!");
AssertFatal(mat.determinant() != 0.0f, "no inverse!");
}
F32 MeshRoad::getRoadLength() const

View file

@@ -2139,7 +2139,7 @@ MatrixF River::getNodeTransform( U32 idx ) const
mat.setColumn( 2, node.normal );
mat.setColumn( 3, node.point );
AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!");
AssertFatal( mat.determinant() != 0.0f, "no inverse!");
return mat;
}

View file

@@ -0,0 +1,122 @@
#pragma once
#include <cmath> // for sqrtf, etc.
#include "../mConstants.h"
// float3 backend: 3-component vectors loaded safely into 4 SIMD lanes (w padded)
namespace math_backend::float3
{
//----------------------------------------------------------
// Add two float3 vectors: r = a + b
inline void float3_add_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load3_vec(a);
f32x4 vb = v_load3_vec(b);
f32x4 vr = v_add(va, vb);
v_store3(r, vr);
}
// Subtract: r = a - b
inline void float3_sub_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load3_vec(a);
f32x4 vb = v_load3_vec(b);
f32x4 vr = v_sub(va, vb);
v_store3(r, vr);
}
// Multiply element-wise: r = a * b
inline void float3_mul_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load3_vec(a);
f32x4 vb = v_load3_vec(b);
f32x4 vr = v_mul(va, vb);
v_store3(r, vr);
}
// Multiply by scalar: r = a * s
inline void float3_mul_scalar_impl(const float* a, float s, float* r)
{
f32x4 va = v_load3_vec(a);
f32x4 vs = v_set1(s);
f32x4 vr = v_mul(va, vs);
v_store3(r, vr);
}
// Divide element-wise: r = a / b
inline void float3_div_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load3_vec(a);
f32x4 vb = v_load3_vec(b);
f32x4 vr = v_div(va, vb);
v_store3(r, vr);
}
// Divide by scalar: r = a / s
inline void float3_div_scalar_impl(const float* a, float s, float* r)
{
f32x4 va = v_load3_vec(a);
f32x4 vs = v_set1(s);
f32x4 vr = v_div(va, vs);
v_store3(r, vr);
}
// Dot product: returns scalar
inline float float3_dot_impl(const float* a, const float* b)
{
f32x4 va = v_load3_vec(a);
f32x4 vb = v_load3_vec(b);
f32x4 vdot = v_dot3(va, vb);
return v_extract0(vdot); // first lane is the sum of 3 elements
}
// Length squared
inline float float3_length_squared_impl(const float* a)
{
return float3_dot_impl(a, a);
}
// Length
inline float float3_length_impl(const float* a)
{
return std::sqrt(float3_length_squared_impl(a));
}
// Normalize in-place
inline void float3_normalize_impl(float* a)
{
f32x4 va = v_load3_vec(a);
f32x4 vr = v_normalize3(va);
v_store3(a, vr);
}
// Normalize in-place to magnitude r: a = normalize(a) * r
inline void float3_normalize_mag_impl(float* a, float r)
{
f32x4 va = v_load3_vec(a);
// invLen = r / sqrt(dot(a,a)) = r * rsqrt(dot(a,a))
f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot3(va, va)));
f32x4 vnorm = v_mul(va, invLen);
v_store3(a, vnorm);
}
// Linear interpolation: r = from + (to - from) * f
inline void float3_lerp_impl(const float* from, const float* to, float f, float* r)
{
f32x4 vfrom = v_load3_vec(from);
f32x4 vto = v_load3_vec(to);
f32x4 vf = v_set1(f);
f32x4 vr = v_add(vfrom, v_mul(vf, v_sub(vto, vfrom)));
v_store3(r, vr);
}
inline void float3_cross_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load3_vec(a);
f32x4 vb = v_load3_vec(b);
f32x4 vcross = v_cross(va, vb);
v_store3(r, vcross);
}
}
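For orientation, a small usage sketch of this layer, calling the impls directly (engine code is expected to reach them through the dispatch tables instead):

float a[3] = { 1.0f, 2.0f, 3.0f };
float b[3] = { 4.0f, 5.0f, 6.0f };
float r[3];
math_backend::float3::float3_add_impl(a, b, r); // r = {5, 7, 9}
float d = math_backend::float3::float3_dot_impl(a, b); // d = 4 + 10 + 18 = 32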

View file

@@ -0,0 +1,123 @@
#pragma once
#include <cmath> // for sqrtf, etc.
#include "../mConstants.h"
namespace math_backend::float4
{
//----------------------------------------------------------
// Add two float4 vectors: r = a + b
inline void float4_add_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load(a);
f32x4 vb = v_load(b);
f32x4 vr = v_add(va, vb);
v_store(r, vr);
}
// Subtract: r = a - b
inline void float4_sub_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load(a);
f32x4 vb = v_load(b);
f32x4 vr = v_sub(va, vb);
v_store(r, vr);
}
// Multiply element-wise: r = a * b
inline void float4_mul_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load(a);
f32x4 vb = v_load(b);
f32x4 vr = v_mul(va, vb);
v_store(r, vr);
}
// Multiply by scalar: r = a * s
inline void float4_mul_scalar_impl(const float* a, float s, float* r)
{
f32x4 va = v_load(a);
f32x4 vs = v_set1(s);
f32x4 vr = v_mul(va, vs);
v_store(r, vr);
}
// Divide element-wise: r = a / b
inline void float4_div_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load(a);
f32x4 vb = v_load(b);
f32x4 vr = v_div(va, vb);
v_store(r, vr);
}
// Divide by scalar: r = a / s
inline void float4_div_scalar_impl(const float* a, float s, float* r)
{
f32x4 va = v_load(a);
f32x4 vs = v_set1(s);
f32x4 vr = v_div(va, vs);
v_store(r, vr);
}
// Dot product: returns scalar
inline float float4_dot_impl(const float* a, const float* b)
{
f32x4 va = v_load(a);
f32x4 vb = v_load(b);
f32x4 vdot = v_dot4(va, vb); // calls ISA-specific implementation
return v_extract0(vdot);
}
// Length squared
inline float float4_length_squared_impl(const float* a)
{
return float4_dot_impl(a, a);
}
// Length
inline float float4_length_impl(const float* a)
{
return std::sqrt(float4_length_squared_impl(a));
}
// Normalize in-place
inline void float4_normalize_impl(float* a)
{
f32x4 va = v_load(a);
f32x4 invLen = v_rsqrt_nr(v_dot4(va, va)); // fully abstracted
f32x4 vnorm = v_mul(va, invLen);
v_store(a, vnorm);
}
// Normalize in-place to magnitude r: a = normalize(a) * r
inline void float4_normalize_mag_impl(float* a, float r)
{
f32x4 va = v_load(a);
// invLen = r / sqrt(dot(a,a)) = r * rsqrt(dot(a,a))
f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot4(va, va)));
f32x4 vnorm = v_mul(va, invLen);
v_store(a, vnorm);
}
// Linear interpolation: r = from + (to - from) * f
inline void float4_lerp_impl(const float* from, const float* to, float f, float* r)
{
f32x4 vfrom = v_load(from);
f32x4 vto = v_load(to);
f32x4 vf = v_set1(f);
f32x4 vr = v_add(vfrom, v_mul(vf, v_sub(vto, vfrom)));
v_store(r, vr);
}
inline void float4_cross_impl(const float* a, const float* b, float* r)
{
f32x4 va = v_load(a);
f32x4 vb = v_load(b);
f32x4 vcross = v_cross(va, vb);
v_store(r, vcross);
}
} // namespace math_backend::float4

View file

@@ -0,0 +1,424 @@
#pragma once
#include <cmath> // for sqrtf, etc.
#include "../mConstants.h"
namespace math_backend::mat44
{
//------------------------------------------------------------------
// Matrix Transpose
inline void mat44_transpose_impl(float* m)
{
f32x4x4 ma = m_load(m);
f32x4x4 mr = m_transpose(ma);
m_store(m, mr);
}
inline float mat44_get_determinant(const float* m)
{
f32x4x4 ma = m_load(m);
return v_extract0(m_determinant_affine(ma));
}
// Matrix Scale: Float3 (assume w = 1.0f)
inline void mat44_scale_impl(float* m, const float* s)
{
f32x4x4 ma = m_load(m);
f32x4 va = v_load3_pos(s);
ma.r0 = v_mul(ma.r0, va);
ma.r1 = v_mul(ma.r1, va);
ma.r2 = v_mul(ma.r2, va);
ma.r3 = v_mul(ma.r3, va);
m_store(m, ma);
}
inline void mat44_transform_plane_impl(const float* m, const float* scale, const float* plane, float* plane_result)
{
f32x4x4 M = m_load(m);
f32x4 plane_v = v_load(plane);
f32x4 scale_v = v_load3_vec(scale);
f32x4 invScale = v_rcp_nr(scale_v);
// normal = plane.xyz
f32x4 normal = plane_v;
// apply Inv(s)
normal = v_mul(normal, invScale);
// multiply by Inv(Tr(m)) (only the rotation part matters)
f32x4 nx = v_mul(v_swizzle_singular_mask(normal, 0), M.r0);
f32x4 ny = v_mul(v_swizzle_singular_mask(normal, 1), M.r1);
f32x4 nz = v_mul(v_swizzle_singular_mask(normal, 2), M.r2);
normal = v_add(v_add(nx, ny), nz);
normal = v_normalize3(normal);
// compute point on plane
float d = v_extract0(v_swizzle_singular_mask(plane_v, 3));
f32x4 point = v_mul(plane_v, v_set1(-d));
point = v_preserve_w(point, v_set1(1.0f));
// apply scale
point = v_mul(point, scale_v);
// transform point by matrix
point = m_mul_vec4(M, point);
// compute new plane distance
float newD = -v_extract0(v_dot3(point, normal));
alignas(16) float n[4];
v_store(n, normal);
plane_result[0] = n[0];
plane_result[1] = n[1];
plane_result[2] = n[2];
plane_result[3] = newD;
}
inline void mat44_get_scale_impl(const float* m, float* s)
{
f32x4x4 ma = m_load(m);
// squared lengths
f32x4 len2_x = v_dot3(ma.r0, ma.r0);
f32x4 len2_y = v_dot3(ma.r1, ma.r1);
f32x4 len2_z = v_dot3(ma.r2, ma.r2);
// extract and sqrt
s[0] = 1.0f / v_extract0(v_rsqrt_nr(len2_x));
s[1] = 1.0f / v_extract0(v_rsqrt_nr(len2_y));
s[2] = 1.0f / v_extract0(v_rsqrt_nr(len2_z));
}
// Matrix Scale Uniform: Float value (assume w = 1.0f)
inline void mat44_scale_uniform(float* m, float s)
{
f32x4x4 ma = m_load(m);
// (s, s, s, 1)
f32x4 scale = v_set(s, s, s, 1.0f);
// Scale only rotation rows (xyz part)
ma.r0 = v_mul(ma.r0, scale);
ma.r1 = v_mul(ma.r1, scale);
ma.r2 = v_mul(ma.r2, scale);
m_store(m, ma);
}
// Matrix Inverse
inline void mat44_inverse_impl(float* m)
{
f32x4x4 ma = m_load(m);
// Compute cofactors using cross products
f32x4x4 mTemp;
mTemp.r0 = v_cross(ma.r1, ma.r2);
mTemp.r1 = v_cross(ma.r2, ma.r0);
mTemp.r2 = v_cross(ma.r0, ma.r1);
// Determinant = dot(ma.r0, c0)
f32x4 det = v_dot3(ma.r0, mTemp.r0);
f32x4 invDet = v_rcp_nr(det);
// Scale cofactors
mTemp.r0 = v_mul(mTemp.r0, invDet);
mTemp.r1 = v_mul(mTemp.r1, invDet);
mTemp.r2 = v_mul(mTemp.r2, invDet);
// ---- Translation ----
// Load original translation
f32x4 T = v_set(m[3], m[7], m[11], 0.0f);
// Compute t' = -(inv(M) * t); before the transpose, mTemp's rows are the
// columns of inv(M), so inv(M)*t = Tx*mTemp.r0 + Ty*mTemp.r1 + Tz*mTemp.r2
f32x4 result = v_mul(mTemp.r0, v_swizzle_singular_mask(T, 0));
result = v_add(result, v_mul(mTemp.r1, v_swizzle_singular_mask(T, 1)));
result = v_add(result, v_mul(mTemp.r2, v_swizzle_singular_mask(T, 2)));
result = v_mul(result, v_set1(-1.0f));
// Store inverse 3x3 (transpose of cofactor matrix)
mTemp = m_transpose(mTemp);
mTemp.r3 = ma.r3;
m_store(m, mTemp);
// Store translation
m[3] = v_extract0(result);
m[7] = v_extract0(v_swizzle_singular_mask(result, 1));
m[11] = v_extract0(v_swizzle_singular_mask(result, 2));
}
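// Recap of the identity used above: with r0, r1, r2 the rows of the upper-left
// 3x3 block, det = dot(r0, cross(r1, r2)) and
// inv3x3 = (1/det) * transpose([ cross(r1,r2) ; cross(r2,r0) ; cross(r0,r1) ]),
// i.e. the scaled cross products are the columns of the inverse.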
// Matrix Inverse
inline void mat44_inverse_to_impl(const float* m, float* dst)
{
f32x4x4 ma = m_load(m);
// Compute cofactors using cross products
f32x4x4 mTemp;
mTemp.r0 = v_cross(ma.r1, ma.r2);
mTemp.r1 = v_cross(ma.r2, ma.r0);
mTemp.r2 = v_cross(ma.r0, ma.r1);
// Determinant = dot(ma.r0, c0)
f32x4 det = v_dot3(ma.r0, mTemp.r0);
f32x4 invDet = v_rcp_nr(det);
// Scale cofactors
mTemp.r0 = v_mul(mTemp.r0, invDet);
mTemp.r1 = v_mul(mTemp.r1, invDet);
mTemp.r2 = v_mul(mTemp.r2, invDet);
// ---- Translation ----
// Load original translation
f32x4 T = v_set(m[3], m[7], m[11], 0.0f);
// Compute t' = -(inv(M) * t); before the transpose, mTemp's rows are the
// columns of inv(M), so inv(M)*t = Tx*mTemp.r0 + Ty*mTemp.r1 + Tz*mTemp.r2
f32x4 result = v_mul(mTemp.r0, v_swizzle_singular_mask(T, 0));
result = v_add(result, v_mul(mTemp.r1, v_swizzle_singular_mask(T, 1)));
result = v_add(result, v_mul(mTemp.r2, v_swizzle_singular_mask(T, 2)));
result = v_mul(result, v_set1(-1.0f));
// Store inverse 3x3 (transpose of cofactor matrix)
mTemp = m_transpose(mTemp);
mTemp.r3 = ma.r3;
m_store(dst, mTemp);
// Store translation
dst[3] = v_extract0(result);
dst[7] = v_extract0(v_swizzle_singular_mask(result, 1));
dst[11] = v_extract0(v_swizzle_singular_mask(result, 2));
}
// Matrix Affine Inverse
inline void mat44_affine_inverse_impl(float* m)
{
f32x4x4 ma = m_load(m);
f32x4x4 mTemp = m_transpose(ma);
mTemp.r3 = v_set(0, 0, 0, 1);
// ---- Translation ----
// Load original translation
f32x4 T = v_set(m[3], m[7], m[11], 0.0f);
// Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2)
f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0));
result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1)));
result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2)));
result = v_mul(result, v_set1(-1.0f));
m_store(m, mTemp);
// Store translation
m[3] = v_extract0(result);
m[7] = v_extract0(v_swizzle_singular_mask(result, 1));
m[11] = v_extract0(v_swizzle_singular_mask(result, 2));
}
inline void mat44_normalize_impl(float* m)
{
// Load the matrix into SIMD registers
f32x4x4 mat = m_load(m);
// Transpose: now rows are columns
mat = m_transpose(mat);
// Extract columns (which are now rows)
f32x4 col0 = mat.r0;
f32x4 col1 = mat.r1;
// Rebuild orthonormal basis
f32x4 col2 = v_cross(col0, col1);
col1 = v_cross(col2, col0);
// Normalize columns
col0 = v_normalize3(col0);
col1 = v_normalize3(col1);
col2 = v_normalize3(col2);
// Write back directly into transposed matrix
mat.r0 = col0;
mat.r1 = col1;
mat.r2 = col2;
// Transpose back to row-major
mat = m_transpose(mat);
// Store back
m_store(m, mat);
}
// Matrix Multiply: a * b
inline void mat44_mul_mat44_impl(const float* a, const float* b, float* r)
{
f32x4x4 ma = m_load(a);
f32x4x4 mb = m_load(b);
f32x4x4 mr = m_mul(ma, mb);
m_store(r, mr);
}
// Vector Multiply: m * p (assume w = 1.0f)
inline void mat44_mul_pos3_impl(const float *m, const float *p, float* r)
{
f32x4x4 ma = m_load(m);
f32x4 va = v_load3_pos(p);
f32x4 vr = m_mul_vec4(ma, va);
v_store3(r, vr);
}
// Vector Multiply: m * v (assume w = 0.0f)
inline void mat44_mul_vec3_impl(const float* m, const float* v, float* r)
{
f32x4x4 ma = m_load(m);
f32x4 va = v_load3_vec(v);
f32x4 vr = m_mul_vec3(ma, va);
v_store3(r, vr);
}
// Vector Multiply: m * p (full [4x4] * [1x4])
inline void mat44_mul_float4_impl(const float* m, const float* p, float* r)
{
f32x4x4 ma = m_load(m);
f32x4 va = v_load(p);
f32x4 vr = m_mul_vec4(ma, va);
v_store(r, vr);
}
//--------------------------------------------------
// MATRIX ROTATION FUNCTIONS
//--------------------------------------------------
inline void mat44_rotation_x_impl(float* m, float angle)
{
float c = cosf(angle), s = sinf(angle);
f32x4x4 mr = m_identity();
mr.r1 = v_set(0, c, s, 0);
mr.r2 = v_set(0, -s, c, 0);
m_store(m, mr);
}
inline void mat44_rotation_y_impl(float* m, float angle)
{
float c = cosf(angle), s = sinf(angle);
f32x4x4 mr = m_identity();
mr.r0 = v_set(c, 0, -s, 0);
mr.r2 = v_set(s, 0, c, 0);
m_store(m, mr);
}
inline void mat44_rotation_z_impl(float* m, float angle)
{
float c = cosf(angle), s = sinf(angle);
f32x4x4 mr = m_identity();
mr.r0 = v_set(c, s, 0, 0);
mr.r1 = v_set(-s, c, 0, 0);
m_store(m, mr);
}
// Compose rotation from Euler angles (pitch=X, yaw=Y, roll=Z)
inline void mat44_rotation_euler_impl(float* m, float pitch, float yaw, float roll)
{
f32x4x4 rx, ry, rz;
mat44_rotation_x_impl((float*)&rx, pitch);
mat44_rotation_y_impl((float*)&ry, yaw);
mat44_rotation_z_impl((float*)&rz, roll);
f32x4x4 r = m_mul(rz, m_mul(ry, rx));
m_store(m, r);
}
inline void mat44_lookat_impl(float* m, const float* eye, const float* target, const float* up)
{
f32x4 vEye = v_load3_pos(eye);
f32x4 vTarget = v_load3_pos(target);
f32x4 vUp = v_load3_vec(up);
// Forward (z+)
f32x4 zaxis = v_normalize3(v_sub(vTarget, vEye));
// Right (x+)
f32x4 xaxis = v_normalize3(v_cross(vUp, zaxis));
// Up (y+)
f32x4 yaxis = v_cross(zaxis, xaxis);
// Compute translation components: -dot(axis, eye)
f32x4 t_x = v_mul(v_dot3(xaxis, vEye), v_set1(-1.0f));
f32x4 t_y = v_mul(v_dot3(yaxis, vEye), v_set1(-1.0f));
f32x4 t_z = v_mul(v_dot3(zaxis, vEye), v_set1(-1.0f));
f32x4x4 view;
view.r0 = v_insert_w(xaxis, t_x);
view.r1 = v_insert_w(yaxis, t_y);
view.r2 = v_insert_w(zaxis, t_z);
view.r3 = v_set(0, 0, 0, 1.0f);
m_store(m, view);
}
inline void mat44_perspective_impl(float* m, float fovY, float aspect, float znear, float zfar)
{
float f = 1.0f / tanf(fovY * 0.5f);
float nf = 1.0f / (znear - zfar);
f32x4x4 mp = m_zero();
mp.r0 = v_set(f / aspect, 0, 0, 0);
mp.r1 = v_set(0, f, 0, 0);
mp.r2 = v_set(0, 0, (zfar + znear) * nf, 2 * zfar * znear * nf);
mp.r3 = v_set(0, 0, -1, 0); // row-major projection
m_store(m, mp);
}
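// For reference, with f = 1/tan(fovY/2) and nf = 1/(znear - zfar) as above, the
// rows form the standard right-handed projection with clip z in [-1, 1]:
// [ f/aspect  0  0                0               ]
// [ 0         f  0                0               ]
// [ 0         0  (zfar+znear)*nf  2*zfar*znear*nf ]
// [ 0         0  -1               0               ]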
inline void mat44_orthographic_impl(float* m, float left, float right, float bottom, float top, float znear, float zfar)
{
f32x4x4 mo = m_zero();
mo.r0 = v_set(2.0f / (right - left), 0, 0, -(right + left) / (right - left));
mo.r1 = v_set(0, 2.0f / (top - bottom), 0, -(top + bottom) / (top - bottom));
mo.r2 = v_set(0, 0, -2.0f / (zfar - znear), -(zfar + znear) / (zfar - znear));
mo.r3 = v_set(0, 0, 0, 1.0f);
m_store(m, mo);
}
//--------------------------------------------------
// MATRIX BATCH FUNCTIONS
//--------------------------------------------------
inline void mat44_batch_mul_pos3(const float* m, const float* points, int count, float* result)
{
int i = 0;
// AVX has 8 lanes to play with
#if defined(MATH_SIMD_AVX2) || defined(MATH_SIMD_AVX)
// 8-wide AVX only
for (; i + 8 <= count; i += 8)
{
vec4_batch8 va = load_vec3_batch8(&points[i*3], 1.0f, false);
vec4_batch8 vr = m_mul_pos3_batch8(m, va);
store_vec3_batch8(&result[i*3], vr);
}
#endif // MATH_SIMD_AVX2 || MATH_SIMD_AVX
// 4-wide
for (; i + 4 <= count; i += 4)
{
vec4_batch4 va = load_vec3_batch4(&points[i * 3], 1.0f, false);
vec4_batch4 vr = m_mul_pos3_batch4(m, va);
store_vec3_batch4(&result[i * 3], vr);
}
for (; i < count; ++i)
{
size_t idx = i * 3;
mat44_mul_pos3_impl(m, &points[idx], &result[idx]);
}
}
} // namespace math_backend::mat44
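A hedged usage sketch of the batch path (the data here is illustrative): with 10 points, an AVX build takes points 0-7 through the 8-wide loop and the last two through the scalar tail:

float m[16] = { 1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1 }; // row-major transform (identity)
float pts[10 * 3] = {}; // packed xyz triples
float out[10 * 3];
math_backend::mat44::mat44_batch_mul_pos3(m, pts, 10, out);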

View file

@@ -0,0 +1,500 @@
#include "platform/platform.h"
#include "math/public/float4_dispatch.h"
#include "math/public/float3_dispatch.h"
#include "math/public/mat44_dispatch.h"
#include "math/mConstants.h"
#include "math/mMatrix.h"
#include <cmath> // for sqrtf, etc.
namespace math_backend::float4::dispatch
{
void install_scalar()
{
gFloat4.add = [](const float* a, const float* b, float* r) {
for (int i = 0; i < 4; i++) r[i] = a[i] + b[i];
};
gFloat4.sub = [](const float* a, const float* b, float* r) {
for (int i = 0; i < 4; i++) r[i] = a[i] - b[i];
};
gFloat4.mul = [](const float* a, const float* b, float* r) {
for (int i = 0; i < 4; i++) r[i] = a[i] * b[i];
};
gFloat4.mul_scalar = [](const float* a, float s, float* r) {
for (int i = 0; i < 4; i++) r[i] = a[i] * s;
};
gFloat4.div = [](const float* a, const float* b, float* r) {
for (int i = 0; i < 4; i++) r[i] = a[i] / b[i];
};
gFloat4.div_scalar = [](const float* a, float s, float* r) {
float denom = 1.0f / s;
for (int i = 0; i < 4; i++) r[i] = a[i] * denom;
};
gFloat4.dot = [](const float* a, const float* b) {
float sum = 0.f;
for (int i = 0; i < 4; i++) sum += a[i] * b[i];
return sum;
};
gFloat4.length = [](const float* a) {
float sum = 0.f;
for (int i = 0; i < 4; i++) sum += a[i] * a[i];
return std::sqrt(sum);
};
gFloat4.lengthSquared = [](const float* a) {
float sum = 0.f;
for (int i = 0; i < 4; i++) sum += a[i] * a[i];
return (sum);
};
gFloat4.normalize = [](float* a) {
float len = gFloat4.length(a);
if (len > POINT_EPSILON)
{
float denom = 1.0f / len;
for (int i = 0; i < 4; i++)
a[i] *= denom;
}
};
gFloat4.normalize_mag = [](float* a, float f) {
float len = gFloat4.length(a);
if (len > POINT_EPSILON)
{
float denom = f / len;
for (int i = 0; i < 4; i++) a[i] *= denom;
}
};
gFloat4.lerp = [](const float* from, const float* to, float f, float* r) {
for (int i = 0; i < 4; i++) r[i] = from[i] + (to[i] - from[i]) * f;
};
gFloat4.cross = [](const float* a, const float* b, float* r) {
const float ax = a[0];
const float ay = a[1];
const float az = a[2];
const float bx = b[0];
const float by = b[1];
const float bz = b[2];
r[0] = ay * bz - az * by;
r[1] = az * bx - ax * bz;
r[2] = ax * by - ay * bx;
};
}
}
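// gFloat4 is the module's function-pointer table, declared in
// math/public/float4_dispatch.h (not shown in this excerpt). A plausible shape,
// for orientation only:
// struct Float4Ops {
//     void (*add)(const float* a, const float* b, float* r);
//     float (*dot)(const float* a, const float* b);
//     void (*normalize)(float* a);
//     // ... one pointer per operation installed above
// };
// extern Float4Ops gFloat4;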
namespace math_backend::float3::dispatch
{
void install_scalar()
{
gFloat3.add = [](const float* a, const float* b, float* r) {
for (int i = 0; i < 3; i++) r[i] = a[i] + b[i];
};
gFloat3.sub = [](const float* a, const float* b, float* r) {
for (int i = 0; i < 3; i++) r[i] = a[i] - b[i];
};
gFloat3.mul = [](const float* a, const float* b, float* r) {
for (int i = 0; i < 3; i++) r[i] = a[i] * b[i];
};
gFloat3.mul_scalar = [](const float* a, float s, float* r) {
for (int i = 0; i < 3; i++) r[i] = a[i] * s;
};
gFloat3.div = [](const float* a, const float* b, float* r) {
for (int i = 0; i < 3; i++) r[i] = a[i] / b[i];
};
gFloat3.div_scalar = [](const float* a, float s, float* r) {
float denom = 1.0f / s;
for (int i = 0; i < 3; i++) r[i] = a[i] * denom;
};
gFloat3.dot = [](const float* a, const float* b) {
float sum = 0.f;
for (int i = 0; i < 3; i++) sum += a[i] * b[i];
return sum;
};
gFloat3.length = [](const float* a) {
float sum = 0.f;
for (int i = 0; i < 3; i++) sum += a[i] * a[i];
return std::sqrt(sum);
};
gFloat3.lengthSquared = [](const float* a) {
float sum = 0.f;
for (int i = 0; i < 3; i++) sum += a[i] * a[i];
return (sum);
};
gFloat3.normalize = [](float* a) {
float len = gFloat3.length(a);
if (len > POINT_EPSILON)
{
float denom = 1.0f / len;
for (int i = 0; i < 3; i++) a[i] *= denom;
}
};
gFloat3.normalize_mag = [](float* a, float f) {
float len = gFloat3.length(a);
if (len > POINT_EPSILON)
{
float denom = f / len;
for (int i = 0; i < 3; i++) a[i] *= denom;
}
};
gFloat3.lerp = [](const float* from, const float* to, float f, float* r) {
for (int i = 0; i < 3; i++) r[i] = from[i] + (to[i] - from[i]) * f;
};
gFloat3.cross = [](const float* a, const float* b, float* r) {
const float ax = a[0];
const float ay = a[1];
const float az = a[2];
const float bx = b[0];
const float by = b[1];
const float bz = b[2];
r[0] = ay * bz - az * by;
r[1] = az * bx - ax * bz;
r[2] = ax * by - ay * bx;
};
}
}
inline void swap(float& a, float& b)
{
float temp = a;
a = b;
b = temp;
}
namespace math_backend::mat44::dispatch
{
void install_scalar()
{
gMat44.transpose = [](float* a) {
swap(a[1], a[4]);
swap(a[2], a[8]);
swap(a[3], a[12]);
swap(a[6], a[9]);
swap(a[7], a[13]);
swap(a[11], a[14]);
};
gMat44.determinant = [](const float* m) {
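// determinant of the upper-left 3x3 block (sufficient for affine transforms)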
return m[0] * (m[5] * m[10] - m[6] * m[9]) +
m[4] * (m[2] * m[9] - m[1] * m[10]) +
m[8] * (m[1] * m[6] - m[2] * m[5]);
};
gMat44.mul_vec3 = [](const float* a, const float* b, float* r) {
#ifdef TORQUE_COMPILER_GCC
const F32 v0 = b[0], v1 = b[1], v2 = b[2];
const F32 m0 = a[0], m1 = a[1], m2 = a[2];
const F32 m4 = a[4], m5 = a[5], m6 = a[6];
const F32 m8 = a[8], m9 = a[9], m10 = a[10];
r[0] = m0 * v0 + m1 * v1 + m2 * v2;
r[1] = m4 * v0 + m5 * v1 + m6 * v2;
r[2] = m8 * v0 + m9 * v1 + m10 * v2;
#else
r[0] = a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
r[1] = a[4] * b[0] + a[5] * b[1] + a[6] * b[2];
r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2];
#endif
};
gMat44.inverse = [](float* m) {
// using Cramer's Rule to find the inverse:
// Minv = (1/det(M)) * adjoint(M)
float det = gMat44.determinant(m);
AssertFatal(det != 0.0f, "MatrixF::inverse: singular matrix, no inverse.");
float invDet = 1.0f / det;
float temp[16];
temp[0] = (m[5] * m[10] - m[6] * m[9]) * invDet;
temp[1] = (m[9] * m[2] - m[10] * m[1]) * invDet;
temp[2] = (m[1] * m[6] - m[2] * m[5]) * invDet;
temp[4] = (m[6] * m[8] - m[4] * m[10]) * invDet;
temp[5] = (m[10] * m[0] - m[8] * m[2]) * invDet;
temp[6] = (m[2] * m[4] - m[0] * m[6]) * invDet;
temp[8] = (m[4] * m[9] - m[5] * m[8]) * invDet;
temp[9] = (m[8] * m[1] - m[9] * m[0]) * invDet;
temp[10] = (m[0] * m[5] - m[1] * m[4]) * invDet;
m[0] = temp[0];
m[1] = temp[1];
m[2] = temp[2];
m[4] = temp[4];
m[5] = temp[5];
m[6] = temp[6];
m[8] = temp[8];
m[9] = temp[9];
m[10] = temp[10];
// invert the translation
temp[0] = -m[3];
temp[1] = -m[7];
temp[2] = -m[11];
gMat44.mul_vec3(m, temp, &temp[4]);
m[3] = temp[4];
m[7] = temp[5];
m[11] = temp[6];
};
gMat44.inverse_to = [](const float* m, float* d) {
// using Cramer's Rule to find the inverse:
// Minv = (1/det(M)) * adjoint(M)
F32 det = gMat44.determinant(m);
AssertFatal(det != 0.0f, "MatrixF::inverse: singular matrix, no inverse.");
F32 invDet = 1.0f / det;
d[0] = (m[5] * m[10] - m[6] * m[9]) * invDet;
d[1] = (m[9] * m[2] - m[10] * m[1]) * invDet;
d[2] = (m[1] * m[6] - m[2] * m[5]) * invDet;
d[4] = (m[6] * m[8] - m[4] * m[10]) * invDet;
d[5] = (m[10] * m[0] - m[8] * m[2]) * invDet;
d[6] = (m[2] * m[4] - m[0] * m[6]) * invDet;
d[8] = (m[4] * m[9] - m[5] * m[8]) * invDet;
d[9] = (m[8] * m[1] - m[9] * m[0]) * invDet;
d[10] = (m[0] * m[5] - m[1] * m[4]) * invDet;
// invert the translation
F32 temp[6];
temp[0] = -m[3];
temp[1] = -m[7];
temp[2] = -m[11];
gMat44.mul_vec3(d, temp, &temp[3]);
d[3] = temp[3];
d[7] = temp[4];
d[11] = temp[5];
d[12] = m[12];
d[13] = m[13];
d[14] = m[14];
d[15] = m[15];
};
gMat44.affine_inverse = [](float* a) {
F32 temp[16];
dMemcpy(temp, a, 16 * sizeof(F32));
// Transpose rotation
a[1] = temp[4];
a[4] = temp[1];
a[2] = temp[8];
a[8] = temp[2];
a[6] = temp[9];
a[9] = temp[6];
a[3] = -(temp[0] * temp[3] + temp[4] * temp[7] + temp[8] * temp[11]);
a[7] = -(temp[1] * temp[3] + temp[5] * temp[7] + temp[9] * temp[11]);
a[11] = -(temp[2] * temp[3] + temp[6] * temp[7] + temp[10] * temp[11]);
};
gMat44.scale = [](float* a, const float* s) {
// Note, doesn't allow scaling w...
a[0] *= s[0]; a[1] *= s[1]; a[2] *= s[2];
a[4] *= s[0]; a[5] *= s[1]; a[6] *= s[2];
a[8] *= s[0]; a[9] *= s[1]; a[10] *= s[2];
a[12] *= s[0]; a[13] *= s[1]; a[14] *= s[2];
};
gMat44.get_scale = [](const float* a, float* s) {
// column lengths of the 3x3 part give the scale factors
s[0] = std::sqrt(a[0] * a[0] + a[4] * a[4] + a[8] * a[8]);
s[1] = std::sqrt(a[1] * a[1] + a[5] * a[5] + a[9] * a[9]);
s[2] = std::sqrt(a[2] * a[2] + a[6] * a[6] + a[10] * a[10]);
};
gMat44.mul_float4 = [](const float* a, const float* b, float* r) {
AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!");
r[0] = a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
r[1] = a[4] * b[0] + a[5] * b[1] + a[6] * b[2] + a[7] * b[3];
r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2] + a[11] * b[3];
r[3] = a[12] * b[0] + a[13] * b[1] + a[14] * b[2] + a[15] * b[3];
};
gMat44.mul_pos3 = [](const float* a, const float* b, float* r) {
AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!");
r[0] = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3];
r[1] = a[4]*b[0] + a[5]*b[1] + a[6]*b[2] + a[7];
r[2] = a[8]*b[0] + a[9]*b[1] + a[10]*b[2] + a[11];
};
gMat44.mul_vec3 = [](const float* a, const float* b, float* r) {
AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!");
r[0] = a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
r[1] = a[4] * b[0] + a[5] * b[1] + a[6] * b[2];
r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2];
};
gMat44.mul_mat44 = [](const float* a, const float* b, float* mresult) {
mresult[0] = a[0]*b[0] + a[1]*b[4] + a[2]*b[8] + a[3]*b[12];
mresult[1] = a[0]*b[1] + a[1]*b[5] + a[2]*b[9] + a[3]*b[13];
mresult[2] = a[0]*b[2] + a[1]*b[6] + a[2]*b[10] + a[3]*b[14];
mresult[3] = a[0]*b[3] + a[1]*b[7] + a[2]*b[11] + a[3]*b[15];
mresult[4] = a[4]*b[0] + a[5]*b[4] + a[6]*b[8] + a[7]*b[12];
mresult[5] = a[4]*b[1] + a[5]*b[5] + a[6]*b[9] + a[7]*b[13];
mresult[6] = a[4]*b[2] + a[5]*b[6] + a[6]*b[10] + a[7]*b[14];
mresult[7] = a[4]*b[3] + a[5]*b[7] + a[6]*b[11] + a[7]*b[15];
mresult[8] = a[8]*b[0] + a[9]*b[4] + a[10]*b[8] + a[11]*b[12];
mresult[9] = a[8]*b[1] + a[9]*b[5] + a[10]*b[9] + a[11]*b[13];
mresult[10]= a[8]*b[2] + a[9]*b[6] + a[10]*b[10]+ a[11]*b[14];
mresult[11]= a[8]*b[3] + a[9]*b[7] + a[10]*b[11]+ a[11]*b[15];
mresult[12]= a[12]*b[0]+ a[13]*b[4]+ a[14]*b[8] + a[15]*b[12];
mresult[13]= a[12]*b[1]+ a[13]*b[5]+ a[14]*b[9] + a[15]*b[13];
mresult[14]= a[12]*b[2]+ a[13]*b[6]+ a[14]*b[10]+ a[15]*b[14];
mresult[15]= a[12]*b[3]+ a[13]*b[7]+ a[14]*b[11]+ a[15]*b[15];
};
gMat44.transform_plane = [](const F32* m, const F32* s, const F32* p, F32* presult) {
// We take in a matrix, a scale factor, and a plane equation. We want to output
// the resultant normal
// We have T = m*s
// To multiply the normal, we want Inv(Tr(m*s))
// Inv(Tr(ms)) = Inv(Tr(s) * Tr(m))
// = Inv(Tr(m)) * Inv(Tr(s))
//
// Inv(Tr(s)) = Inv(s) = diag(1/x, 1/y, 1/z, 1)
//
// Since m is an affine matrix, Tr(m) has R in its upper-left 3x3 block and
// (x y z 1) as its bottom row, so
// Inv(Tr(m)) = [ R^-1  0 ]
//              [ A B C 1 ]
// Where:
//
// P = (x, y, z)
// A = -(Row(0, r) * P);
// B = -(Row(1, r) * P);
// C = -(Row(2, r) * P);
MatrixF invScale(true);
F32* pScaleElems = invScale;
pScaleElems[MatrixF::idx(0, 0)] = 1.0f / s[0];
pScaleElems[MatrixF::idx(1, 1)] = 1.0f / s[1];
pScaleElems[MatrixF::idx(2, 2)] = 1.0f / s[2];
const Point3F shear(m[MatrixF::idx(3, 0)], m[MatrixF::idx(3, 1)], m[MatrixF::idx(3, 2)]);
const Point3F row0(m[MatrixF::idx(0, 0)], m[MatrixF::idx(0, 1)], m[MatrixF::idx(0, 2)]);
const Point3F row1(m[MatrixF::idx(1, 0)], m[MatrixF::idx(1, 1)], m[MatrixF::idx(1, 2)]);
const Point3F row2(m[MatrixF::idx(2, 0)], m[MatrixF::idx(2, 1)], m[MatrixF::idx(2, 2)]);
const F32 A = -mDot(row0, shear);
const F32 B = -mDot(row1, shear);
const F32 C = -mDot(row2, shear);
MatrixF invTrMatrix(true);
F32* destMat = invTrMatrix;
destMat[MatrixF::idx(0, 0)] = m[MatrixF::idx(0, 0)];
destMat[MatrixF::idx(1, 0)] = m[MatrixF::idx(1, 0)];
destMat[MatrixF::idx(2, 0)] = m[MatrixF::idx(2, 0)];
destMat[MatrixF::idx(0, 1)] = m[MatrixF::idx(0, 1)];
destMat[MatrixF::idx(1, 1)] = m[MatrixF::idx(1, 1)];
destMat[MatrixF::idx(2, 1)] = m[MatrixF::idx(2, 1)];
destMat[MatrixF::idx(0, 2)] = m[MatrixF::idx(0, 2)];
destMat[MatrixF::idx(1, 2)] = m[MatrixF::idx(1, 2)];
destMat[MatrixF::idx(2, 2)] = m[MatrixF::idx(2, 2)];
destMat[MatrixF::idx(0, 3)] = A;
destMat[MatrixF::idx(1, 3)] = B;
destMat[MatrixF::idx(2, 3)] = C;
invTrMatrix.mul(invScale);
Point3F norm(p[0], p[1], p[2]);
Point3F point = norm * -p[3];
invTrMatrix.mulP(norm);
norm.normalize();
MatrixF temp;
dMemcpy(temp, m, sizeof(F32) * 16);
point.x *= s[0];
point.y *= s[1];
point.z *= s[2];
temp.mulP(point);
PlaneF resultPlane(point, norm);
presult[0] = resultPlane.x;
presult[1] = resultPlane.y;
presult[2] = resultPlane.z;
presult[3] = resultPlane.d;
};
gMat44.normalize = [](float* a) {
F32 col0[3], col1[3], col2[3];
// extract columns 0 and 1
col0[0] = a[0];
col0[1] = a[4];
col0[2] = a[8];
col1[0] = a[1];
col1[1] = a[5];
col1[2] = a[9];
// rebuild the orthogonal basis: col2 = col0 x col1, then col1 = col2 x col0
math_backend::float3::dispatch::gFloat3.cross(col0, col1, col2);
math_backend::float3::dispatch::gFloat3.cross(col2, col0, col1);
math_backend::float3::dispatch::gFloat3.normalize(col0);
math_backend::float3::dispatch::gFloat3.normalize(col1);
math_backend::float3::dispatch::gFloat3.normalize(col2);
// store the normalized columns
a[0] = col0[0];
a[4] = col0[1];
a[8] = col0[2];
a[1] = col1[0];
a[5] = col1[1];
a[9] = col1[2];
a[2] = col2[0];
a[6] = col2[1];
a[10] = col2[2];
};
gMat44.batch_mul_pos3 = [](const float* m, const float* pts, int count, float* result_ptrs) {
int i = 0;
for (; i < count; i++)
{
int idx = i * 3;
gMat44.mul_pos3(m, &pts[idx], &result_ptrs[idx]);
}
};
}
}
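A plausible startup sequence, assuming backend selection happens once at runtime (the CPU-feature query below is hypothetical and not part of this diff):

void installMathBackend()
{
// scalar baseline first, so every table entry is valid
math_backend::float3::dispatch::install_scalar();
math_backend::float4::dispatch::install_scalar();
math_backend::mat44::dispatch::install_scalar();
#if defined(MATH_SIMD_AVX)
if (cpuSupportsAVX()) // hypothetical feature query
{
math_backend::float3::dispatch::install_avx();
math_backend::float4::dispatch::install_avx();
math_backend::mat44::dispatch::install_avx();
}
#endif
}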

View file

@@ -0,0 +1,625 @@
#pragma once
#include <immintrin.h> // AVX
namespace
{
typedef __m128 f32x4;
//------------------------------------------------------
// Load / Store
//------------------------------------------------------
// Load 4 floats from memory into a SIMD register
inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }
inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }
inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
inline f32x4 v_zero() { return _mm_setzero_ps(); }
inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }
//------------------------------------------------------
// Mask helpers
//------------------------------------------------------
inline f32x4 v_mask_xyz() { return _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)); } // all-ones bits in x,y,z and zero in w, for bitwise masking
inline f32x4 v_swizzle_mask(f32x4 v1, const int x, const int y, const int z, const int w)
{
__m128i idx = _mm_set_epi32(w, z, y, x);
return _mm_permutevar_ps(v1, idx);
}
inline f32x4 v_swizzle_singular_mask(f32x4 v1, const int x)
{
__m128i idx = _mm_set1_epi32(x);
return _mm_permutevar_ps(v1, idx);
}
inline f32x4 v_swizzle_lo(f32x4 v1)
{
return _mm_moveldup_ps(v1);
}
inline f32x4 v_swizzle_hi(f32x4 v1)
{
return _mm_movehdup_ps(v1);
}
inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
{
return _mm_blend_ps(newv, original, 0b1000);
}
//------------------------------------------------------
// Shuffle helpers
//------------------------------------------------------
inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w)
{
// result = (v1[x], v1[y], v2[z], v2[w])
f32x4 combined_lo = _mm_permutevar_ps(v1, _mm_set_epi32(y, x, y, x));
f32x4 combined_hi = _mm_permutevar_ps(v2, _mm_set_epi32(w, z, w, z));
return _mm_movelh_ps(combined_lo, combined_hi);
}
inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2)
{
return _mm_movelh_ps(v1, v2);
}
inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2)
{
return _mm_movehl_ps(v2, v1);
}
//------------------------------------------------------
// Float3 helpers (safe loading into 4 lanes)
//------------------------------------------------------
inline f32x4 v_load3_vec(const float* p) // w = 0
{
return _mm_set_ps(0.0f, p[2], p[1], p[0]);
}
inline f32x4 v_load3_pos(const float* p) // w = 1
{
return _mm_set_ps(1.0f, p[2], p[1], p[0]);
}
inline void v_store3(float* dst, f32x4 v)
{
alignas(16) float tmp[4]; // temp storage
_mm_store_ps(tmp, v); // store all 4 lanes
dst[0] = tmp[0];
dst[1] = tmp[1];
dst[2] = tmp[2];
}
inline f32x4 v_set(float x, float y, float z, float w)
{
return _mm_set_ps(w, z, y, x);
}
inline f32x4 v_insert_w(f32x4 v, f32x4 w)
{
return _mm_insert_ps(v, w, 0x30);
}
//------------------------------------------------------
// Simple Arithmetic
//------------------------------------------------------
// Element-wise multiply
inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
// Element-wise divide
inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
// Element-wise add
inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
// Element-wise subtract
inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
//------------------------------------------------------
// Fast recip
//------------------------------------------------------
// Fast reciprocal 1/b, refined with two Newton-Raphson steps: r' = 2r - b*r*r
inline f32x4 v_rcp_nr(f32x4 b)
{
f32x4 r = _mm_rcp_ps(b);
r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
}
// Fast divide: a * (1/b) using the refined reciprocal above
inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); }
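// Reciprocal square root with one Newton-Raphson refinement: r' = 0.5 * r * (3 - x*r*r)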
inline f32x4 v_rsqrt_nr(f32x4 x)
{
f32x4 half = _mm_set1_ps(0.5f);
f32x4 three = _mm_set1_ps(3.0f);
f32x4 r = _mm_rsqrt_ps(x);
f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r);
return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr));
}
//------------------------------------------------------
// Vector intrinsic functions
//------------------------------------------------------
// full dot4
inline f32x4 v_dot4(f32x4 a, f32x4 b)
{
return _mm_dp_ps(a, b, 0xFF); // multiply all 4 lanes, broadcast the sum to all lanes
}
// dot3 (ignores w)
inline f32x4 v_dot3(f32x4 a, f32x4 b)
{
return _mm_dp_ps(a, b, 0x7F); // multiply the x,y,z lanes, broadcast the sum to all lanes
}
// cross product xyz only.
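// Uses the identity a x b = ((a * b_yzx) - (a_yzx * b))_yzx, which needs only three shuffles.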
inline f32x4 v_cross(f32x4 a, f32x4 b)
{
f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b));
return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
}
inline f32x4 v_normalize3(f32x4 v)
{
const f32x4 zero = _mm_setzero_ps();
const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0}
f32x4 dot = v_dot3(v, v);
f32x4 inv = v_rsqrt_nr(dot);
f32x4 isZero = _mm_cmpeq_ps(dot, zero);
f32x4 norm = _mm_mul_ps(v, inv);
return _mm_blendv_ps(norm, fallback, isZero);
}
// adds all 4 lanes together.
inline f32x4 v_hadd4(f32x4 a)
{
// two SSE3 horizontal adds fold all 4 lanes into every lane
__m128 sum = _mm_hadd_ps(a, a);
return _mm_hadd_ps(sum, sum);
}
//------------------------------------------------------
// Matrix type (row-major 4x4)
//------------------------------------------------------
struct f32x4x4
{
f32x4 r0;
f32x4 r1;
f32x4 r2;
f32x4 r3;
};
//------------------------------------------------------
// Matrix Load / Store
//------------------------------------------------------
inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major)
{
f32x4x4 out;
out.r0 = v_load(m + 0);
out.r1 = v_load(m + 4);
out.r2 = v_load(m + 8);
out.r3 = v_load(m + 12);
return out;
}
inline void m_store(float* dst, const f32x4x4& m)
{
v_store(dst + 0, m.r0);
v_store(dst + 4, m.r1);
v_store(dst + 8, m.r2);
v_store(dst + 12, m.r3);
}
inline f32x4x4 m_identity()
{
f32x4x4 m;
m.r0 = _mm_set_ps(0, 0, 0, 1);
m.r1 = _mm_set_ps(0, 0, 1, 0);
m.r2 = _mm_set_ps(0, 1, 0, 0);
m.r3 = _mm_set_ps(1, 0, 0, 0);
return m;
}
inline f32x4x4 m_zero()
{
f32x4 z = v_zero();
return { z, z, z, z };
}
//------------------------------------------------------
// Matrix × Vector
//------------------------------------------------------
inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v)
{
f32x4 x = v_dot4(m.r0, v);
f32x4 y = v_dot4(m.r1, v);
f32x4 z = v_dot4(m.r2, v);
f32x4 w = v_dot4(m.r3, v);
// combine to a vector
return _mm_set_ps(_mm_cvtss_f32(w),
_mm_cvtss_f32(z),
_mm_cvtss_f32(y),
_mm_cvtss_f32(x));
}
inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v)
{
f32x4 x = v_dot3(m.r0, v);
f32x4 y = v_dot3(m.r1, v);
f32x4 z = v_dot3(m.r2, v);
// combine to a vector
return _mm_set_ps(0.0f,
_mm_cvtss_f32(z),
_mm_cvtss_f32(y),
_mm_cvtss_f32(x));
}
//------------------------------------------------------
// Matrix × Matrix
//------------------------------------------------------
inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b)
{
// Transpose B once for dot products
f32x4 b0 = b.r0;
f32x4 b1 = b.r1;
f32x4 b2 = b.r2;
f32x4 b3 = b.r3;
// Transpose (SSE2)
_MM_TRANSPOSE4_PS(b0, b1, b2, b3);
f32x4x4 C;
auto mul_row = [&](f32x4 arow)
{
f32x4 x = v_dot4(arow, b0);
f32x4 y = v_dot4(arow, b1);
f32x4 z = v_dot4(arow, b2);
f32x4 w = v_dot4(arow, b3);
f32x4 xy = _mm_unpacklo_ps(x, y);
f32x4 zw = _mm_unpacklo_ps(z, w);
return _mm_movelh_ps(xy, zw);
};
C.r0 = mul_row(a.r0);
C.r1 = mul_row(a.r1);
C.r2 = mul_row(a.r2);
C.r3 = mul_row(a.r3);
return C;
}
//------------------------------------------------------
// Transpose
//------------------------------------------------------
inline f32x4x4 m_transpose(const f32x4x4& m)
{
f32x4 r0 = m.r0;
f32x4 r1 = m.r1;
f32x4 r2 = m.r2;
f32x4 r3 = m.r3;
_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
return { r0, r1, r2, r3 };
}
inline f32x4x4 m_inverse_rigid(const f32x4x4& m)
{
f32x4x4 t = m_transpose(m);
// Extract translation
f32x4 T = v_set(
_mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))),
_mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))),
_mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))),
0.0f
);
f32x4 newT = m_mul_vec4(t, T);
newT = _mm_sub_ps(v_zero(), newT);
// place each component of -R^T t into the matching row's w lane
t.r0 = v_preserve_w(t.r0, v_swizzle_singular_mask(newT, 0));
t.r1 = v_preserve_w(t.r1, v_swizzle_singular_mask(newT, 1));
t.r2 = v_preserve_w(t.r2, v_swizzle_singular_mask(newT, 2));
t.r3 = v_set(0, 0, 0, 1);
return t;
}
inline f32x4 m_determinant(const f32x4x4& m)
{
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;
f32x4 c0 = v_cross(c, d);
f32x4 c1 = v_cross(d, b);
f32x4 c2 = v_cross(b, c);
f32x4 term0 = _mm_mul_ps(a, c0);
f32x4 term1 = _mm_mul_ps(a, c1);
f32x4 term2 = _mm_mul_ps(a, c2);
f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2));
return v_hadd4(det);
}
inline f32x4 m_determinant_affine(const f32x4x4& m)
{
f32x4 r0 = m.r0;
f32x4 r1 = m.r1;
f32x4 r2 = m.r2;
r0 = _mm_and_ps(r0, v_mask_xyz());
r1 = _mm_and_ps(r1, v_mask_xyz());
r2 = _mm_and_ps(r2, v_mask_xyz());
f32x4 c0 = v_cross(r1, r2);
return v_dot3(r0, c0); // splatted determinant
}
//---------------------------------------------------------
// BATCH INTRINSICS
//---------------------------------------------------------
typedef __m256 f32x8;
struct vec4_batch8
{
f32x8 x;
f32x8 y;
f32x8 z;
f32x8 w;
};
struct vec4_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
f32x4 w;
};
inline vec4_batch8 load_vec3_batch8(const float* ptr, float w, bool fillW)
{
// AoS -> SoA: for ptr = {x0,y0,z0, x1,y1,z1, ...}, lane i of r.x/r.y/r.z holds
// point i, matching the lane order store_vec3_batch8 writes back out
vec4_batch8 r;
r.x = _mm256_setr_ps(ptr[0], ptr[3], ptr[6], ptr[9],
ptr[12], ptr[15], ptr[18], ptr[21]);
r.y = _mm256_setr_ps(ptr[1], ptr[4], ptr[7], ptr[10],
ptr[13], ptr[16], ptr[19], ptr[22]);
r.z = _mm256_setr_ps(ptr[2], ptr[5], ptr[8], ptr[11],
ptr[14], ptr[17], ptr[20], ptr[23]);
if (fillW)
{
r.w = _mm256_set1_ps(w);
}
return r;
}
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
vec4_batch4 r;
r.x = _mm_setr_ps(ptr[0], ptr[3], ptr[6], ptr[9]);
r.y = _mm_setr_ps(ptr[1], ptr[4], ptr[7], ptr[10]);
r.z = _mm_setr_ps(ptr[2], ptr[5], ptr[8], ptr[11]);
if (fillW)
{
r.w = _mm_set1_ps(w);
}
return r;
}
// Store the result back out to a float array 8 at a time
inline void store_f32x8(float* out, f32x8 v)
{
_mm256_storeu_ps(out, v);
}
// Store the result back out to a float array 4 at a time
inline void store_f32x4(float* out, f32x4 v)
{
_mm_storeu_ps(out, v);
}
// Store the result back to a float3 array with size of 8
inline void store_vec3_batch8(float* out, const vec4_batch8& v)
{
alignas(32) float xs[8];
alignas(32) float ys[8];
alignas(32) float zs[8];
_mm256_store_ps(xs, v.x);
_mm256_store_ps(ys, v.y);
_mm256_store_ps(zs, v.z);
for (int i = 0; i < 8; ++i)
{
out[i * 3 + 0] = xs[i];
out[i * 3 + 1] = ys[i];
out[i * 3 + 2] = zs[i];
}
}
// Store the result back to a float3 array, 4 points at a time
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
{
alignas(16) float xs[4];
alignas(16) float ys[4];
alignas(16) float zs[4];
_mm_store_ps(xs, v.x);
_mm_store_ps(ys, v.y);
_mm_store_ps(zs, v.z);
for (int i = 0; i < 4; ++i)
{
out[i * 3 + 0] = xs[i];
out[i * 3 + 1] = ys[i];
out[i * 3 + 2] = zs[i];
}
}
inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b)
{
f32x4 mulx = _mm_mul_ps(a.x, b.x);
f32x4 muly = _mm_mul_ps(a.y, b.y);
f32x4 mulz = _mm_mul_ps(a.z, b.z);
return _mm_add_ps(_mm_add_ps(mulx, muly), mulz);
}
inline f32x8 vec3_batch8_dot(const vec4_batch8& a, const vec4_batch8& b)
{
f32x8 mulx = _mm256_mul_ps(a.x, b.x);
f32x8 muly = _mm256_mul_ps(a.y, b.y);
f32x8 mulz = _mm256_mul_ps(a.z, b.z);
return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz);
}
// Batch-8 pos3 transform (w assumed 1: the translation column is added).
inline vec4_batch8 m_mul_pos3_batch8(const float* m, const vec4_batch8& v)
{
vec4_batch8 r;
__m256 m00 = _mm256_set1_ps(m[0]);
__m256 m01 = _mm256_set1_ps(m[1]);
__m256 m02 = _mm256_set1_ps(m[2]);
__m256 m03 = _mm256_set1_ps(m[3]);
__m256 m10 = _mm256_set1_ps(m[4]);
__m256 m11 = _mm256_set1_ps(m[5]);
__m256 m12 = _mm256_set1_ps(m[6]);
__m256 m13 = _mm256_set1_ps(m[7]);
__m256 m20 = _mm256_set1_ps(m[8]);
__m256 m21 = _mm256_set1_ps(m[9]);
__m256 m22 = _mm256_set1_ps(m[10]);
__m256 m23 = _mm256_set1_ps(m[11]);
//
// row0 dot
//
r.x = _mm256_add_ps(
_mm256_add_ps(
_mm256_mul_ps(v.x, m00),
_mm256_mul_ps(v.y, m01)),
_mm256_add_ps(
_mm256_mul_ps(v.z, m02), m03));
//
// row1 dot
//
r.y = _mm256_add_ps(
_mm256_add_ps(
_mm256_mul_ps(v.x, m10),
_mm256_mul_ps(v.y, m11)),
_mm256_add_ps(
_mm256_mul_ps(v.z, m12), m13));
//
// row2 dot
//
r.z = _mm256_add_ps(
_mm256_add_ps(
_mm256_mul_ps(v.x, m20),
_mm256_mul_ps(v.y, m21)),
_mm256_add_ps(
_mm256_mul_ps(v.z, m22), m23));
return r;
}
// Batch-4 pos3 transform (w assumed 1: the translation column is added).
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
{
vec4_batch4 r;
f32x4 m00 = _mm_set1_ps(m[0]);
f32x4 m01 = _mm_set1_ps(m[1]);
f32x4 m02 = _mm_set1_ps(m[2]);
f32x4 m03 = _mm_set1_ps(m[3]);
f32x4 m10 = _mm_set1_ps(m[4]);
f32x4 m11 = _mm_set1_ps(m[5]);
f32x4 m12 = _mm_set1_ps(m[6]);
f32x4 m13 = _mm_set1_ps(m[7]);
f32x4 m20 = _mm_set1_ps(m[8]);
f32x4 m21 = _mm_set1_ps(m[9]);
f32x4 m22 = _mm_set1_ps(m[10]);
f32x4 m23 = _mm_set1_ps(m[11]);
//
// row0 dot
//
r.x = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m00),
_mm_mul_ps(v.y, m01)),
_mm_add_ps(
_mm_mul_ps(v.z, m02), m03));
//
// row1 dot
//
r.y = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m10),
_mm_mul_ps(v.y, m11)),
_mm_add_ps(
_mm_mul_ps(v.z, m12), m13));
//
// row2 dot
//
r.z = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m20),
_mm_mul_ps(v.y, m21)),
_mm_add_ps(
_mm_mul_ps(v.z, m22), m23));
return r;
}
}

View file

@@ -0,0 +1,26 @@
#include "avx_intrinsics.h"
#include "float3_dispatch.h"
#include <immintrin.h> // AVX/AVX2 intrinsics
#include "float3_impl.inl"
namespace math_backend::float3::dispatch
{
// Install AVX backend
void install_avx()
{
gFloat3.add = float3_add_impl;
gFloat3.sub = float3_sub_impl;
gFloat3.mul = float3_mul_impl;
gFloat3.mul_scalar = float3_mul_scalar_impl;
gFloat3.div = float3_div_impl;
gFloat3.div_scalar = float3_div_scalar_impl;
gFloat3.dot = float3_dot_impl;
gFloat3.length = float3_length_impl;
gFloat3.lengthSquared = float3_length_squared_impl;
gFloat3.normalize = float3_normalize_impl;
gFloat3.normalize_mag = float3_normalize_mag_impl;
gFloat3.lerp = float3_lerp_impl;
gFloat3.cross = float3_cross_impl;
}
}

View file

@@ -0,0 +1,25 @@
#include "avx_intrinsics.h"
#include "float4_dispatch.h"
#include "float4_impl.inl"
namespace math_backend::float4::dispatch
{
// Install AVX backend
void install_avx()
{
gFloat4.add = float4_add_impl;
gFloat4.sub = float4_sub_impl;
gFloat4.mul = float4_mul_impl;
gFloat4.mul_scalar = float4_mul_scalar_impl;
gFloat4.div = float4_div_impl;
gFloat4.div_scalar = float4_div_scalar_impl;
gFloat4.dot = float4_dot_impl;
gFloat4.length = float4_length_impl;
gFloat4.lengthSquared = float4_length_squared_impl;
gFloat4.normalize = float4_normalize_impl;
gFloat4.normalize_mag = float4_normalize_mag_impl;
gFloat4.lerp = float4_lerp_impl;
gFloat4.cross = float4_cross_impl;
}
}

View file

@@ -0,0 +1,26 @@
#include "avx_intrinsics.h"
#include "mat44_dispatch.h"
#include "mat44_impl.inl"
namespace math_backend::mat44::dispatch
{
void install_avx()
{
gMat44.transpose = mat44_transpose_impl;
gMat44.inverse = mat44_inverse_impl;
gMat44.inverse_to = mat44_inverse_to_impl;
gMat44.affine_inverse = mat44_affine_inverse_impl;
gMat44.mul_mat44 = mat44_mul_mat44_impl;
gMat44.mul_pos3 = mat44_mul_pos3_impl;
gMat44.mul_vec3 = mat44_mul_vec3_impl;
gMat44.mul_float4 = mat44_mul_float4_impl;
gMat44.transform_plane = mat44_transform_plane_impl;
gMat44.scale = mat44_scale_impl;
gMat44.get_scale = mat44_get_scale_impl;
gMat44.normalize = mat44_normalize_impl;
gMat44.determinant = mat44_get_determinant;
gMat44.batch_mul_pos3 = mat44_batch_mul_pos3;
}
}

View file

@@ -0,0 +1,625 @@
#pragma once
#include <immintrin.h> // AVX2
namespace
{
typedef __m128 f32x4;
//------------------------------------------------------
// Load / Store
//------------------------------------------------------
// Load 4 floats from memory into a SIMD register
inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }
inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }
inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
inline f32x4 v_zero() { return _mm_setzero_ps(); }
inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }
//------------------------------------------------------
// Mask helpers
//------------------------------------------------------
inline f32x4 v_mask_xyz() { return _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)); } // all-ones bits in x,y,z and zero in w, for bitwise masking
inline f32x4 v_swizzle_mask(f32x4 v1, const int x, const int y, const int z, const int w)
{
__m128i idx = _mm_set_epi32(w, z, y, x);
return _mm_permutevar_ps(v1, idx);
}
inline f32x4 v_swizzle_singular_mask(f32x4 v1, const int x)
{
__m128i idx = _mm_set1_epi32(x);
return _mm_permutevar_ps(v1, idx);
}
inline f32x4 v_swizzle_lo(f32x4 v1)
{
return _mm_moveldup_ps(v1);
}
inline f32x4 v_swizzle_hi(f32x4 v1)
{
return _mm_movehdup_ps(v1);
}
inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
{
return _mm_blend_ps(newv, original, 0b1000);
}
//------------------------------------------------------
// Shuffle helpers
//------------------------------------------------------
inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w)
{
// result = (v1[x], v1[y], v2[z], v2[w])
f32x4 combined_lo = _mm_permutevar_ps(v1, _mm_set_epi32(y, x, y, x));
f32x4 combined_hi = _mm_permutevar_ps(v2, _mm_set_epi32(w, z, w, z));
return _mm_movelh_ps(combined_lo, combined_hi);
}
inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2)
{
return _mm_movelh_ps(v1, v2);
}
inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2)
{
return _mm_movehl_ps(v2, v1);
}
//------------------------------------------------------
// Float3 helpers (safe loading into 4 lanes)
//------------------------------------------------------
inline f32x4 v_load3_vec(const float* p) // w = 0
{
return _mm_set_ps(0.0f, p[2], p[1], p[0]);
}
inline f32x4 v_load3_pos(const float* p) // w = 1
{
return _mm_set_ps(1.0f, p[2], p[1], p[0]);
}
inline void v_store3(float* dst, f32x4 v)
{
alignas(16) float tmp[4]; // temp storage
_mm_store_ps(tmp, v); // store all 4 lanes
dst[0] = tmp[0];
dst[1] = tmp[1];
dst[2] = tmp[2];
}
inline f32x4 v_set(float x, float y, float z, float w)
{
return _mm_set_ps(w, z, y, x);
}
inline f32x4 v_insert_w(f32x4 v, f32x4 w)
{
return _mm_insert_ps(v, w, 0x30);
}
//------------------------------------------------------
// Simple Arithmetic
//------------------------------------------------------
// Element-wise multiply
inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
// Element-wise divide
inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
// Element-wise add
inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
// Element-wise subtract
inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
//------------------------------------------------------
// Fast recip
//------------------------------------------------------
// Fast reciprocal 1/b, refined with two Newton-Raphson steps: r' = 2r - b*r*r
inline f32x4 v_rcp_nr(f32x4 b)
{
f32x4 r = _mm_rcp_ps(b);
r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
}
// Fast divide: a * (1/b) using the refined reciprocal above
inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); }
inline f32x4 v_rsqrt_nr(f32x4 x)
{
f32x4 half = _mm_set1_ps(0.5f);
f32x4 three = _mm_set1_ps(3.0f);
f32x4 r = _mm_rsqrt_ps(x);
f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r);
return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr));
}
//------------------------------------------------------
// Vector intrinsic functions
//------------------------------------------------------
// full dot4
inline f32x4 v_dot4(f32x4 a, f32x4 b)
{
return _mm_dp_ps(a, b, 0xFF); // multiply all 4 lanes, broadcast the sum to all lanes
}
// dot3 (ignores w)
inline f32x4 v_dot3(f32x4 a, f32x4 b)
{
return _mm_dp_ps(a, b, 0x7F); // multiply the x,y,z lanes, broadcast the sum to all lanes
}
// cross product xyz only.
inline f32x4 v_cross(f32x4 a, f32x4 b)
{
f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b));
return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
}
inline f32x4 v_normalize3(f32x4 v)
{
const f32x4 zero = _mm_setzero_ps();
const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0}
f32x4 dot = v_dot3(v, v);
f32x4 inv = v_rsqrt_nr(dot);
f32x4 isZero = _mm_cmpeq_ps(dot, zero);
f32x4 norm = _mm_mul_ps(v, inv);
return _mm_blendv_ps(norm, fallback, isZero);
}
// adds all 4 lanes together.
inline f32x4 v_hadd4(f32x4 a)
{
// two SSE3 horizontal adds fold all 4 lanes into every lane
__m128 sum = _mm_hadd_ps(a, a);
return _mm_hadd_ps(sum, sum);
}
//------------------------------------------------------
// Matrix type (row-major 4x4)
//------------------------------------------------------
struct f32x4x4
{
f32x4 r0;
f32x4 r1;
f32x4 r2;
f32x4 r3;
};
//------------------------------------------------------
// Matrix Load / Store
//------------------------------------------------------
inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major)
{
f32x4x4 out;
out.r0 = v_load(m + 0);
out.r1 = v_load(m + 4);
out.r2 = v_load(m + 8);
out.r3 = v_load(m + 12);
return out;
}
inline void m_store(float* dst, const f32x4x4& m)
{
v_store(dst + 0, m.r0);
v_store(dst + 4, m.r1);
v_store(dst + 8, m.r2);
v_store(dst + 12, m.r3);
}
inline f32x4x4 m_identity()
{
f32x4x4 m;
m.r0 = _mm_set_ps(0, 0, 0, 1);
m.r1 = _mm_set_ps(0, 0, 1, 0);
m.r2 = _mm_set_ps(0, 1, 0, 0);
m.r3 = _mm_set_ps(1, 0, 0, 0);
return m;
}
inline f32x4x4 m_zero()
{
f32x4 z = v_zero();
return { z, z, z, z };
}
//------------------------------------------------------
// Matrix × Vector
//------------------------------------------------------
inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v)
{
f32x4 x = v_dot4(m.r0, v);
f32x4 y = v_dot4(m.r1, v);
f32x4 z = v_dot4(m.r2, v);
f32x4 w = v_dot4(m.r3, v);
// combine to a vector
return _mm_set_ps(_mm_cvtss_f32(w),
_mm_cvtss_f32(z),
_mm_cvtss_f32(y),
_mm_cvtss_f32(x));
}
inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v)
{
f32x4 x = v_dot3(m.r0, v);
f32x4 y = v_dot3(m.r1, v);
f32x4 z = v_dot3(m.r2, v);
// combine to a vector
return _mm_set_ps(0.0f,
_mm_cvtss_f32(z),
_mm_cvtss_f32(y),
_mm_cvtss_f32(x));
}
//------------------------------------------------------
// Matrix × Matrix
//------------------------------------------------------
inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b)
{
// Transpose B once for dot products
f32x4 b0 = b.r0;
f32x4 b1 = b.r1;
f32x4 b2 = b.r2;
f32x4 b3 = b.r3;
// Transpose (SSE2)
_MM_TRANSPOSE4_PS(b0, b1, b2, b3);
f32x4x4 C;
auto mul_row = [&](f32x4 arow)
{
f32x4 x = v_dot4(arow, b0);
f32x4 y = v_dot4(arow, b1);
f32x4 z = v_dot4(arow, b2);
f32x4 w = v_dot4(arow, b3);
f32x4 xy = _mm_unpacklo_ps(x, y);
f32x4 zw = _mm_unpacklo_ps(z, w);
return _mm_movelh_ps(xy, zw);
};
C.r0 = mul_row(a.r0);
C.r1 = mul_row(a.r1);
C.r2 = mul_row(a.r2);
C.r3 = mul_row(a.r3);
return C;
}
//------------------------------------------------------
// Transpose
//------------------------------------------------------
inline f32x4x4 m_transpose(const f32x4x4& m)
{
f32x4 r0 = m.r0;
f32x4 r1 = m.r1;
f32x4 r2 = m.r2;
f32x4 r3 = m.r3;
_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
return { r0, r1, r2, r3 };
}
inline f32x4x4 m_inverse_rigid(const f32x4x4& m)
{
f32x4x4 t = m_transpose(m);
// Extract translation
f32x4 T = v_set(
_mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))),
_mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))),
_mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))),
0.0f
);
f32x4 newT = m_mul_vec4(t, T);
newT = _mm_sub_ps(v_zero(), newT);
// place each component of -R^T t into the matching row's w lane
t.r0 = v_preserve_w(t.r0, v_swizzle_singular_mask(newT, 0));
t.r1 = v_preserve_w(t.r1, v_swizzle_singular_mask(newT, 1));
t.r2 = v_preserve_w(t.r2, v_swizzle_singular_mask(newT, 2));
t.r3 = v_set(0, 0, 0, 1);
return t;
}
inline f32x4 m_determinant(const f32x4x4& m)
{
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;
f32x4 c0 = v_cross(c, d);
f32x4 c1 = v_cross(d, b);
f32x4 c2 = v_cross(b, c);
f32x4 term0 = _mm_mul_ps(a, c0);
f32x4 term1 = _mm_mul_ps(a, c1);
f32x4 term2 = _mm_mul_ps(a, c2);
f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2));
return v_hadd4(det);
}
inline f32x4 m_determinant_affine(const f32x4x4& m)
{
f32x4 r0 = m.r0;
f32x4 r1 = m.r1;
f32x4 r2 = m.r2;
r0 = _mm_and_ps(r0, v_mask_xyz());
r1 = _mm_and_ps(r1, v_mask_xyz());
r2 = _mm_and_ps(r2, v_mask_xyz());
f32x4 c0 = v_cross(r1, r2);
return v_dot3(r0, c0); // splatted determinant
}
//---------------------------------------------------------
// BATCH INTRINSICS
//---------------------------------------------------------
typedef __m256 f32x8;
struct vec4_batch8
{
f32x8 x;
f32x8 y;
f32x8 z;
f32x8 w;
};
struct vec4_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
f32x4 w;
};
inline vec4_batch8 load_vec3_batch8(const float* ptr, float w, bool fillW)
{
vec4_batch8 r;
// _mm256_set_ps takes arguments highest lane first; keep point i in lane i so
// store_vec3_batch8 writes the results back in the original order
r.x = _mm256_set_ps(ptr[21], ptr[18], ptr[15], ptr[12],
ptr[9], ptr[6], ptr[3], ptr[0]);
r.y = _mm256_set_ps(ptr[22], ptr[19], ptr[16], ptr[13],
ptr[10], ptr[7], ptr[4], ptr[1]);
r.z = _mm256_set_ps(ptr[23], ptr[20], ptr[17], ptr[14],
ptr[11], ptr[8], ptr[5], ptr[2]);
if (fillW)
{
r.w = _mm256_set1_ps(w);
}
return r;
}
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
vec4_batch4 r;
// _mm_set_ps takes arguments highest lane first; keep point i in lane i
// (matches the SSE2/SSE4.1 versions of this loader)
r.x = _mm_set_ps(ptr[9], ptr[6], ptr[3], ptr[0]);
r.y = _mm_set_ps(ptr[10], ptr[7], ptr[4], ptr[1]);
r.z = _mm_set_ps(ptr[11], ptr[8], ptr[5], ptr[2]);
if (fillW)
{
r.w = _mm_set1_ps(w);
}
return r;
}
// Store the result back out to a float array 8 at a time
inline void store_f32x8(float* out, f32x8 v)
{
_mm256_storeu_ps(out, v);
}
// Store the result back out to a float array 4 at a time
inline void store_f32x4(float* out, f32x4 v)
{
_mm_storeu_ps(out, v);
}
// Store the result back to a float3 array with size of 8
inline void store_vec3_batch8(float* out, const vec4_batch8& v)
{
alignas(32) float xs[8];
alignas(32) float ys[8];
alignas(32) float zs[8];
_mm256_store_ps(xs, v.x);
_mm256_store_ps(ys, v.y);
_mm256_store_ps(zs, v.z);
for (int i = 0; i < 8; ++i)
{
out[i * 3 + 0] = xs[i];
out[i * 3 + 1] = ys[i];
out[i * 3 + 2] = zs[i];
}
}
// Store the result back to a float3 array with size of 4
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
{
alignas(16) float xs[8];
alignas(16) float ys[8];
alignas(16) float zs[8];
_mm_store_ps(xs, v.x);
_mm_store_ps(ys, v.y);
_mm_store_ps(zs, v.z);
for (int i = 0; i < 4; ++i)
{
out[i * 3 + 0] = xs[i];
out[i * 3 + 1] = ys[i];
out[i * 3 + 2] = zs[i];
}
}
inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b)
{
f32x4 mulx = _mm_mul_ps(a.x, b.x);
f32x4 muly = _mm_mul_ps(a.y, b.y);
f32x4 mulz = _mm_mul_ps(a.z, b.z);
return _mm_add_ps(_mm_add_ps(mulx, muly), mulz);
}
inline f32x8 vec3_batch8_dot(const vec4_batch8& a, const vec4_batch8& b)
{
f32x8 mulx = _mm256_mul_ps(a.x, b.x);
f32x8 muly = _mm256_mul_ps(a.y, b.y);
f32x8 mulz = _mm256_mul_ps(a.z, b.z);
return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz);
}
// Batch-8 matrix * position transform (w assumed 1; uses rows 0-2 of the row-major matrix).
inline vec4_batch8 m_mul_pos3_batch8(const float* m, const vec4_batch8& v)
{
vec4_batch8 r;
__m256 m00 = _mm256_set1_ps(m[0]);
__m256 m01 = _mm256_set1_ps(m[1]);
__m256 m02 = _mm256_set1_ps(m[2]);
__m256 m03 = _mm256_set1_ps(m[3]);
__m256 m10 = _mm256_set1_ps(m[4]);
__m256 m11 = _mm256_set1_ps(m[5]);
__m256 m12 = _mm256_set1_ps(m[6]);
__m256 m13 = _mm256_set1_ps(m[7]);
__m256 m20 = _mm256_set1_ps(m[8]);
__m256 m21 = _mm256_set1_ps(m[9]);
__m256 m22 = _mm256_set1_ps(m[10]);
__m256 m23 = _mm256_set1_ps(m[11]);
//
// row0 dot
//
r.x = _mm256_add_ps(
_mm256_add_ps(
_mm256_mul_ps(v.x, m00),
_mm256_mul_ps(v.y, m01)),
_mm256_add_ps(
_mm256_mul_ps(v.z, m02), m03));
//
// row1 dot
//
r.y = _mm256_add_ps(
_mm256_add_ps(
_mm256_mul_ps(v.x, m10),
_mm256_mul_ps(v.y, m11)),
_mm256_add_ps(
_mm256_mul_ps(v.z, m12), m13));
//
// row2 dot
//
r.z = _mm256_add_ps(
_mm256_add_ps(
_mm256_mul_ps(v.x, m20),
_mm256_mul_ps(v.y, m21)),
_mm256_add_ps(
_mm256_mul_ps(v.z, m22), m23));
return r;
}
// Batch-4 matrix * position transform (w assumed 1; uses rows 0-2 of the row-major matrix).
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
{
vec4_batch4 r;
f32x4 m00 = _mm_set1_ps(m[0]);
f32x4 m01 = _mm_set1_ps(m[1]);
f32x4 m02 = _mm_set1_ps(m[2]);
f32x4 m03 = _mm_set1_ps(m[3]);
f32x4 m10 = _mm_set1_ps(m[4]);
f32x4 m11 = _mm_set1_ps(m[5]);
f32x4 m12 = _mm_set1_ps(m[6]);
f32x4 m13 = _mm_set1_ps(m[7]);
f32x4 m20 = _mm_set1_ps(m[8]);
f32x4 m21 = _mm_set1_ps(m[9]);
f32x4 m22 = _mm_set1_ps(m[10]);
f32x4 m23 = _mm_set1_ps(m[11]);
//
// row0 dot
//
r.x = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m00),
_mm_mul_ps(v.y, m01)),
_mm_add_ps(
_mm_mul_ps(v.z, m02), m03));
//
// row1 dot
//
r.y = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m10),
_mm_mul_ps(v.y, m11)),
_mm_add_ps(
_mm_mul_ps(v.z, m12), m13));
//
// row2 dot
//
r.z = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m20),
_mm_mul_ps(v.y, m21)),
_mm_add_ps(
_mm_mul_ps(v.z, m22), m23));
return r;
}
}
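A minimal usage sketch for the batch helpers above; the driver transform_points_avx is hypothetical, not part of this patch. It stages packed xyz positions into SoA registers, transforms eight points per iteration, and finishes the remainder with scalar code:
inline void transform_points_avx(const float* m /* 16 floats, row-major */,
const float* in, int count, float* out)
{
int i = 0;
for (; i + 8 <= count; i += 8)
{
vec4_batch8 p = load_vec3_batch8(in + i * 3, 1.0f, true); // SoA load, w = 1
vec4_batch8 r = m_mul_pos3_batch8(m, p); // rows 0-2 of m
store_vec3_batch8(out + i * 3, r); // back to packed xyz
}
for (; i < count; ++i) // scalar tail
{
const float* p = in + i * 3;
float* o = out + i * 3;
o[0] = m[0] * p[0] + m[1] * p[1] + m[2] * p[2] + m[3];
o[1] = m[4] * p[0] + m[5] * p[1] + m[6] * p[2] + m[7];
o[2] = m[8] * p[0] + m[9] * p[1] + m[10] * p[2] + m[11];
}
}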

View file

@ -0,0 +1,26 @@
#include "avx2_intrinsics.h"
#include "float3_dispatch.h"
#include <immintrin.h> // AVX/AVX2 intrinsics
#include "float3_impl.inl"
namespace math_backend::float3::dispatch
{
// Install AVX2 backend
void install_avx2()
{
gFloat3.add = float3_add_impl;
gFloat3.sub = float3_sub_impl;
gFloat3.mul = float3_mul_impl;
gFloat3.mul_scalar = float3_mul_scalar_impl;
gFloat3.div = float3_div_impl;
gFloat3.div_scalar = float3_div_scalar_impl;
gFloat3.dot = float3_dot_impl;
gFloat3.length = float3_length_impl;
gFloat3.lengthSquared = float3_length_squared_impl;
gFloat3.normalize = float3_normalize_impl;
gFloat3.normalize_mag = float3_normalize_mag_impl;
gFloat3.lerp = float3_lerp_impl;
gFloat3.cross = float3_cross_impl;
}
}
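For context, a sketch of the kind of table these installers populate. The real declarations live in float3_dispatch.h; the struct name below is an assumption, with field names taken from the assignments above and signatures from the _impl functions:
struct Float3OpsSketch
{
void (*add)(const float* a, const float* b, float* r);
void (*sub)(const float* a, const float* b, float* r);
void (*mul)(const float* a, const float* b, float* r);
// ...one function pointer per operation assigned above...
};
// A single process-wide table (gFloat3 in the real code) is rebound once at
// startup by whichever install_*() matches the best ISA the CPU reports.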

View file

@ -0,0 +1,25 @@
#include "avx2_intrinsics.h"
#include "float4_dispatch.h"
#include "float4_impl.inl"
namespace math_backend::float4::dispatch
{
// Install AVX2 backend
void install_avx2()
{
gFloat4.add = float4_add_impl;
gFloat4.sub = float4_sub_impl;
gFloat4.mul = float4_mul_impl;
gFloat4.mul_scalar = float4_mul_scalar_impl;
gFloat4.div = float4_div_impl;
gFloat4.div_scalar = float4_div_scalar_impl;
gFloat4.dot = float4_dot_impl;
gFloat4.length = float4_length_impl;
gFloat4.lengthSquared = float4_length_squared_impl;
gFloat4.normalize = float4_normalize_impl;
gFloat4.normalize_mag = float4_normalize_mag_impl;
gFloat4.lerp = float4_lerp_impl;
gFloat4.cross = float4_cross_impl;
}
}

View file

@ -0,0 +1,25 @@
#include "avx2_intrinsics.h"
#include "mat44_dispatch.h"
#include "mat44_impl.inl"
namespace math_backend::mat44::dispatch
{
void install_avx2()
{
gMat44.transpose = mat44_transpose_impl;
gMat44.inverse = mat44_inverse_impl;
gMat44.inverse_to = mat44_inverse_to_impl;
gMat44.affine_inverse = mat44_affine_inverse_impl;
gMat44.mul_mat44 = mat44_mul_mat44_impl;
gMat44.mul_pos3 = mat44_mul_pos3_impl;
gMat44.mul_vec3 = mat44_mul_vec3_impl;
gMat44.mul_float4 = mat44_mul_float4_impl;
gMat44.transform_plane = mat44_transform_plane_impl;
gMat44.scale = mat44_scale_impl;
gMat44.get_scale = mat44_get_scale_impl;
gMat44.normalize = mat44_normalize_impl;
gMat44.determinant = mat44_get_determinant;
gMat44.batch_mul_pos3 = mat44_batch_mul_pos3;
}
}

View file

@ -0,0 +1,25 @@
#include "neon_intrinsics.h"
#include "float3_dispatch.h"
#include "float3_impl.inl"
namespace math_backend::float3::dispatch
{
// Install NEON backend
void install_neon()
{
gFloat3.add = float3_add_impl;
gFloat3.sub = float3_sub_impl;
gFloat3.mul = float3_mul_impl;
gFloat3.mul_scalar = float3_mul_scalar_impl;
gFloat3.div = float3_div_impl;
gFloat3.div_scalar = float3_div_scalar_impl;
gFloat3.dot = float3_dot_impl;
gFloat3.length = float3_length_impl;
gFloat3.lengthSquared = float3_length_squared_impl;
gFloat3.normalize = float3_normalize_impl;
gFloat3.normalize_mag = float3_normalize_mag_impl;
gFloat3.lerp = float3_lerp_impl;
gFloat3.cross = float3_cross_impl;
}
}

View file

@ -0,0 +1,25 @@
#include "neon_intrinsics.h"
#include "float4_dispatch.h"
#include "float4_impl.inl"
namespace math_backend::float4::dispatch
{
// Install NEON backend
void install_neon()
{
gFloat4.add = float4_add_impl;
gFloat4.sub = float4_sub_impl;
gFloat4.mul = float4_mul_impl;
gFloat4.mul_scalar = float4_mul_scalar_impl;
gFloat4.div = float4_div_impl;
gFloat4.div_scalar = float4_div_scalar_impl;
gFloat4.dot = float4_dot_impl;
gFloat4.length = float4_length_impl;
gFloat4.lengthSquared = float4_length_squared_impl;
gFloat4.normalize = float4_normalize_impl;
gFloat4.normalize_mag = float4_normalize_mag_impl;
gFloat4.lerp = float4_lerp_impl;
gFloat4.cross = float4_cross_impl;
}
}

View file

@ -0,0 +1,23 @@
#include "neon_intrinsics.h"
#include "mat44_dispatch.h"
#include "mat44_impl.inl"
namespace math_backend::mat44::dispatch
{
void install_neon()
{
gMat44.transpose = mat44_transpose_impl;
gMat44.inverse = mat44_inverse_impl;
gMat44.inverse_to = mat44_inverse_to_impl;
gMat44.affine_inverse = mat44_affine_inverse_impl;
gMat44.mul_mat44 = mat44_mul_mat44_impl;
gMat44.mul_pos3 = mat44_mul_pos3_impl;
gMat44.mul_vec3 = mat44_mul_vec3_impl;
gMat44.mul_float4 = mat44_mul_float4_impl;
gMat44.scale = mat44_scale_impl;
gMat44.get_scale = mat44_get_scale_impl;
gMat44.normalize = mat44_normalize_impl;
gMat44.determinant = mat44_get_determinant;
}
}

View file

@ -0,0 +1,504 @@
#pragma once
#include <arm_neon.h>
namespace
{
typedef float32x4_t f32x4;
//------------------------------------------------------
// Load / Store
//------------------------------------------------------
inline f32x4 v_load(const float* p) { return vld1q_f32(p); }
inline void v_store(float* dst, f32x4 v) { vst1q_f32(dst, v); }
inline f32x4 v_set1(float s) { return vdupq_n_f32(s); }
inline f32x4 v_zero() { return vdupq_n_f32(0.0f); }
inline float v_extract0(f32x4 v) { return vgetq_lane_f32(v, 0); }
inline f32x4 v_set(float x, float y, float z, float w)
{
return { x, y, z, w }; // lane0=x, lane1=y, lane2=z, lane3=w
}
inline f32x4 v_insert_w(f32x4 v, f32x4 w)
{
// extract scalar value from w lane0
float w_val = vgetq_lane_f32(w, 0);
// broadcast w_val to all lanes (we only need lane3 later)
f32x4 w_broadcast = vdupq_n_f32(w_val);
// mask to select only lane3
uint32x4_t mask = {0, 0, 0, 0xFFFFFFFF};
// vbslq: if mask bit=1, take from first argument, else take from second
return vbslq_f32(mask, w_broadcast, v);
}
//------------------------------------------------------
// Mask helpers
//------------------------------------------------------
inline f32x4 v_mask_xyz()
{
uint32x4_t mask = {
0xFFFFFFFF,
0xFFFFFFFF,
0xFFFFFFFF,
0x00000000
};
return vreinterpretq_f32_u32(mask);
}
inline f32x4 v_and(f32x4 a, f32x4 b)
{
return vreinterpretq_f32_u32(
vandq_u32(
vreinterpretq_u32_f32(a),
vreinterpretq_u32_f32(b)
)
);
}
inline f32x4 v_swizzle_singular_mask(f32x4 v, int x)
{
// base byte index of the float lane
uint8x16_t base = vdupq_n_u8((uint8_t)(x * 4));
// byte offsets inside a float (0,1,2,3 repeated 4 times)
const uint8x16_t offsets = {
0,1,2,3, 0,1,2,3,
0,1,2,3, 0,1,2,3
};
uint8x16_t idx = vaddq_u8(base, offsets);
return vreinterpretq_f32_u8(
vqtbl1q_u8(
vreinterpretq_u8_f32(v),
idx
)
);
}
inline f32x4 v_swizzle_lo(f32x4 v)
{
float32x4x2_t t = vzipq_f32(v, v);
return t.val[0]; // [x, x, z, z]
}
inline f32x4 v_swizzle_hi(f32x4 v)
{
float32x4x2_t t = vzipq_f32(v, v);
return t.val[1]; // [y, y, w, w]
}
inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
{
// all-ones bits in lane 3 only; reinterpreting float 1.0f would give the
// partial bit pattern 0x3F800000 and corrupt the blended lane
uint32x4_t mask = {0, 0, 0, 0xFFFFFFFF};
return vbslq_f32(mask, original, newv); // lane 3 from original, xyz from newv
}
//------------------------------------------------------
// Float3 helpers
//------------------------------------------------------
inline f32x4 v_load3_vec(const float* p) // w = 0
{
float tmp[4] = { p[0], p[1], p[2], 0.0f };
return vld1q_f32(tmp);
}
inline f32x4 v_load3_pos(const float* p) // w = 1
{
float tmp[4] = { p[0], p[1], p[2], 1.0f };
return vld1q_f32(tmp);
}
inline void v_store3(float* dst, f32x4 v)
{
float tmp[4];
vst1q_f32(tmp, v);
dst[0] = tmp[0];
dst[1] = tmp[1];
dst[2] = tmp[2];
}
//------------------------------------------------------
// Simple Arithmetic
//------------------------------------------------------
inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); }
inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return vdivq_f32(a, b); } // only NEON64
inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); }
inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); }
//------------------------------------------------------
// Fast recip
//------------------------------------------------------
inline f32x4 v_rcp_nr(f32x4 b)
{
f32x4 r = vrecpeq_f32(b);
r = vmulq_f32(r, vrecpsq_f32(b, r)); // Newton-Raphson
r = vmulq_f32(r, vrecpsq_f32(b, r));
return r;
}
inline f32x4 v_div(f32x4 a, f32x4 b)
{
return vmulq_f32(a, v_rcp_nr(b));
}
inline f32x4 v_rsqrt_nr(f32x4 x)
{
f32x4 r = vrsqrteq_f32(x);
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r,r), x)); // refine
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r,r), x));
return r;
}
//------------------------------------------------------
// Vector intrinsic functions
//------------------------------------------------------
inline f32x4 v_dot4(f32x4 a, f32x4 b)
{
f32x4 mul = vmulq_f32(a, b);
float32x2_t sum2 = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul));
float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1);
return vdupq_n_f32(sum);
}
inline f32x4 v_dot3(f32x4 a, f32x4 b)
{
float32x4_t mask = {1.0f, 1.0f, 1.0f, 0.0f};
f32x4 mul = vmulq_f32(a, b);
mul = vmulq_f32(mul, mask);
float32x2_t sum2 = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul));
float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1);
return vdupq_n_f32(sum);
}
inline f32x4 v_cross(f32x4 a, f32x4 b)
{
f32x4 a_yzx = { vgetq_lane_f32(a,1),
vgetq_lane_f32(a,2),
vgetq_lane_f32(a,0),
0.0f };
f32x4 b_zxy = { vgetq_lane_f32(b,2),
vgetq_lane_f32(b,0),
vgetq_lane_f32(b,1),
0.0f };
f32x4 a_zxy = { vgetq_lane_f32(a,2),
vgetq_lane_f32(a,0),
vgetq_lane_f32(a,1),
0.0f };
f32x4 b_yzx = { vgetq_lane_f32(b,1),
vgetq_lane_f32(b,2),
vgetq_lane_f32(b,0),
0.0f };
return vsubq_f32(
vmulq_f32(a_yzx, b_zxy),
vmulq_f32(a_zxy, b_yzx)
);
}
inline f32x4 v_normalize3(f32x4 v)
{
const float32x4_t zero = vdupq_n_f32(0.0f);
const float32x4_t fallback = {0.0f, 0.0f, 1.0f, 0.0f};
f32x4 dot = v_dot3(v, v);
// dot == 0?
uint32x4_t isZero = vceqq_f32(dot, zero);
f32x4 inv = v_rsqrt_nr(dot);
f32x4 norm = vmulq_f32(v, inv);
// Select fallback when zero
return vbslq_f32(isZero, fallback, norm);
}
inline f32x4 v_hadd4(f32x4 a)
{
float32x2_t sum2 = vpadd_f32(vget_low_f32(a), vget_high_f32(a));
float sum = vget_lane_f32(sum2,0) + vget_lane_f32(sum2,1);
return vdupq_n_f32(sum);
}
//------------------------------------------------------
// Matrix type (row-major 4x4)
//------------------------------------------------------
struct f32x4x4
{
f32x4 r0;
f32x4 r1;
f32x4 r2;
f32x4 r3;
};
inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major)
{
f32x4x4 out;
out.r0 = v_load(m + 0);
out.r1 = v_load(m + 4);
out.r2 = v_load(m + 8);
out.r3 = v_load(m + 12);
return out;
}
inline void m_store(float* dst, const f32x4x4& m)
{
v_store(dst + 0, m.r0);
v_store(dst + 4, m.r1);
v_store(dst + 8, m.r2);
v_store(dst + 12, m.r3);
}
inline f32x4x4 m_identity()
{
f32x4x4 m;
m.r0 = {1,0,0,0};
m.r1 = {0,1,0,0};
m.r2 = {0,0,1,0};
m.r3 = {0,0,0,1};
return m;
}
inline f32x4x4 m_zero()
{
f32x4 z = v_zero();
return { z, z, z, z };
}
inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v)
{
f32x4 x = v_dot4(m.r0, v);
f32x4 y = v_dot4(m.r1, v);
f32x4 z = v_dot4(m.r2, v);
f32x4 w = v_dot4(m.r3, v);
return {
v_extract0(x),
v_extract0(y),
v_extract0(z),
v_extract0(w)
};
}
inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v)
{
f32x4 x = v_dot3(m.r0, v);
f32x4 y = v_dot3(m.r1, v);
f32x4 z = v_dot3(m.r2, v);
return {
v_extract0(x),
v_extract0(y),
v_extract0(z),
0.0f
};
}
inline f32x4x4 m_transpose(const f32x4x4& m)
{
float32x4x2_t t0 = vtrnq_f32(m.r0, m.r1);
float32x4x2_t t1 = vtrnq_f32(m.r2, m.r3);
float32x2_t a0 = vget_low_f32(t0.val[0]);
float32x2_t a1 = vget_high_f32(t0.val[0]);
float32x2_t a2 = vget_low_f32(t1.val[0]);
float32x2_t a3 = vget_high_f32(t1.val[0]);
float32x2_t b0 = vget_low_f32(t0.val[1]);
float32x2_t b1 = vget_high_f32(t0.val[1]);
float32x2_t b2 = vget_low_f32(t1.val[1]);
float32x2_t b3 = vget_high_f32(t1.val[1]);
f32x4x4 out;
out.r0 = vcombine_f32(a0, a2);
out.r1 = vcombine_f32(b0, b2);
out.r2 = vcombine_f32(a1, a3);
out.r3 = vcombine_f32(b1, b3);
return out;
}
inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b)
{
f32x4x4 bt = m_transpose(b);
auto mul_row = [&](f32x4 row)
{
f32x4 x = v_dot4(row, bt.r0);
f32x4 y = v_dot4(row, bt.r1);
f32x4 z = v_dot4(row, bt.r2);
f32x4 w = v_dot4(row, bt.r3);
return f32x4{
v_extract0(x),
v_extract0(y),
v_extract0(z),
v_extract0(w)
};
};
f32x4x4 C;
C.r0 = mul_row(a.r0);
C.r1 = mul_row(a.r1);
C.r2 = mul_row(a.r2);
C.r3 = mul_row(a.r3);
return C;
}
inline f32x4 m_determinant(const f32x4x4& m)
{
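// NOTE: v_cross ignores the w lanes, so this cofactor shortcut equals the full
// 4x4 determinant only when the bottom row is (0, 0, 0, 1), i.e. the engine's
// usual affine transforms; a general projective matrix needs a full expansion.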
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;
f32x4 c0 = v_cross(c, d);
f32x4 c1 = v_cross(d, b);
f32x4 c2 = v_cross(b, c);
f32x4 term0 = vmulq_f32(a, c0);
f32x4 term1 = vmulq_f32(a, c1);
f32x4 term2 = vmulq_f32(a, c2);
f32x4 det = vaddq_f32(term0, vaddq_f32(term1, term2));
return v_hadd4(det);
}
inline f32x4 m_determinant_affine(const f32x4x4& m)
{
f32x4 r0 = v_and(m.r0, v_mask_xyz());
f32x4 r1 = v_and(m.r1, v_mask_xyz());
f32x4 r2 = v_and(m.r2, v_mask_xyz());
f32x4 c0 = v_cross(r1, r2);
return v_dot3(r0, c0);
}
inline f32x4x4 m_inverse(const f32x4x4& m)
{
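// NOTE: this adjugate-via-cross shortcut treats each row as a 3-vector. For a
// standard transform whose bottom row d is (0, 0, 0, 1), cross(c, d) is zero
// and the computed det collapses to 0, so prefer the affine inverse path for
// such matrices.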
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;
f32x4 c0 = v_cross(b, c);
f32x4 c1 = v_cross(c, d);
f32x4 c2 = v_cross(d, a);
f32x4 c3 = v_cross(a, b);
f32x4 det = v_dot4(a, c1);
f32x4 invDet = v_rcp_nr(det);
f32x4x4 adj;
adj.r0 = vmulq_f32(c1, invDet);
adj.r1 = vmulq_f32(c2, invDet);
adj.r2 = vmulq_f32(c3, invDet);
adj.r3 = vmulq_f32(c0, invDet);
return m_transpose(adj);
}
struct vec4_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
f32x4 w;
};
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
vec4_batch4 r;
r.x = (f32x4){ ptr[0], ptr[3], ptr[6], ptr[9] };
r.y = (f32x4){ ptr[1], ptr[4], ptr[7], ptr[10] };
r.z = (f32x4){ ptr[2], ptr[5], ptr[8], ptr[11] };
if (fillW)
{
r.w = vdupq_n_f32(w);
}
return r;
}
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
{
alignas(16) float xs[4];
alignas(16) float ys[4];
alignas(16) float zs[4];
vst1q_f32(xs, v.x);
vst1q_f32(ys, v.y);
vst1q_f32(zs, v.z);
for (int i = 0; i < 4; ++i)
{
out[i * 3 + 0] = xs[i];
out[i * 3 + 1] = ys[i];
out[i * 3 + 2] = zs[i];
}
}
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
{
vec4_batch4 r;
float32x4_t m00 = vdupq_n_f32(m[0]);
float32x4_t m01 = vdupq_n_f32(m[1]);
float32x4_t m02 = vdupq_n_f32(m[2]);
float32x4_t m03 = vdupq_n_f32(m[3]);
float32x4_t m10 = vdupq_n_f32(m[4]);
float32x4_t m11 = vdupq_n_f32(m[5]);
float32x4_t m12 = vdupq_n_f32(m[6]);
float32x4_t m13 = vdupq_n_f32(m[7]);
float32x4_t m20 = vdupq_n_f32(m[8]);
float32x4_t m21 = vdupq_n_f32(m[9]);
float32x4_t m22 = vdupq_n_f32(m[10]);
float32x4_t m23 = vdupq_n_f32(m[11]);
// row0 dot
r.x = vaddq_f32(
vaddq_f32(
vmulq_f32(v.x, m00),
vmulq_f32(v.y, m01)),
vaddq_f32(
vmulq_f32(v.z, m02),
m03));
// row1 dot
r.y = vaddq_f32(
vaddq_f32(
vmulq_f32(v.x, m10),
vmulq_f32(v.y, m11)),
vaddq_f32(
vmulq_f32(v.z, m12),
m13));
// row2 dot
r.z = vaddq_f32(
vaddq_f32(
vmulq_f32(v.x, m20),
vmulq_f32(v.y, m21)),
vaddq_f32(
vmulq_f32(v.z, m22),
m23));
return r;
}
}

View file

@ -0,0 +1,26 @@
#include "sse2_intrinsics.h"
#include "float3_dispatch.h"
#include <emmintrin.h> // SSE2 intrinsics
#include "float3_impl.inl"
namespace math_backend::float3::dispatch
{
// Install SSE2 backend
void install_sse2()
{
gFloat3.add = float3_add_impl;
gFloat3.sub = float3_sub_impl;
gFloat3.mul = float3_mul_impl;
gFloat3.mul_scalar = float3_mul_scalar_impl;
gFloat3.div = float3_div_impl;
gFloat3.div_scalar = float3_div_scalar_impl;
gFloat3.dot = float3_dot_impl;
gFloat3.length = float3_length_impl;
gFloat3.lengthSquared = float3_length_squared_impl;
gFloat3.normalize = float3_normalize_impl;
gFloat3.normalize_mag = float3_normalize_mag_impl;
gFloat3.lerp = float3_lerp_impl;
gFloat3.cross = float3_cross_impl;
}
}

View file

@ -0,0 +1,26 @@
#include "sse2_intrinsics.h"
#include "float4_dispatch.h"
#include "float4_impl.inl"
namespace math_backend::float4::dispatch
{
// Install SSE2 backend
void install_sse2()
{
gFloat4.add = float4_add_impl;
gFloat4.sub = float4_sub_impl;
gFloat4.mul = float4_mul_impl;
gFloat4.mul_scalar = float4_mul_scalar_impl;
gFloat4.div = float4_div_impl;
gFloat4.div_scalar = float4_div_scalar_impl;
gFloat4.dot = float4_dot_impl;
gFloat4.length = float4_length_impl;
gFloat4.lengthSquared = float4_length_squared_impl;
gFloat4.normalize = float4_normalize_impl;
gFloat4.normalize_mag = float4_normalize_mag_impl;
gFloat4.lerp = float4_lerp_impl;
gFloat4.cross = float4_cross_impl;
}
}

View file

@ -0,0 +1,25 @@
#include "sse2_intrinsics.h"
#include "mat44_dispatch.h"
#include "mat44_impl.inl"
namespace math_backend::mat44::dispatch
{
void install_sse2()
{
gMat44.transpose = mat44_transpose_impl;
gMat44.inverse = mat44_inverse_impl;
gMat44.inverse_to = mat44_inverse_to_impl;
gMat44.affine_inverse = mat44_affine_inverse_impl;
gMat44.mul_mat44 = mat44_mul_mat44_impl;
gMat44.mul_pos3 = mat44_mul_pos3_impl;
gMat44.mul_vec3 = mat44_mul_vec3_impl;
gMat44.mul_float4 = mat44_mul_float4_impl;
gMat44.transform_plane = mat44_transform_plane_impl;
gMat44.scale = mat44_scale_impl;
gMat44.get_scale = mat44_get_scale_impl;
gMat44.normalize = mat44_normalize_impl;
gMat44.determinant = mat44_get_determinant;
gMat44.batch_mul_pos3 = mat44_batch_mul_pos3;
}
}

View file

@ -0,0 +1,655 @@
#pragma once
#include <emmintrin.h> // SSE2
#include <xmmintrin.h> // SSE
namespace
{
typedef __m128 f32x4;
//------------------------------------------------------
// Vector/Point Load / Store
//------------------------------------------------------
// Load 4 floats from memory into a SIMD register
inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }
inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }
inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
inline f32x4 v_zero() { return _mm_setzero_ps(); }
inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }
//------------------------------------------------------
// Float3 helpers (safe loading into 4 lanes)
//------------------------------------------------------
inline f32x4 v_load3_vec(const float* p) // w = 0
{
return _mm_set_ps(0.0f, p[2], p[1], p[0]);
}
inline f32x4 v_load3_pos(const float* p) // w = 1
{
return _mm_set_ps(1.0f, p[2], p[1], p[0]);
}
inline f32x4 v_set(float x, float y, float z, float w)
{
return _mm_set_ps(w, z, y, x);
}
inline f32x4 v_insert_w(f32x4 v, f32x4 w)
{
f32x4 mask = _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0));
return _mm_or_ps( _mm_and_ps(mask, w), _mm_andnot_ps(mask, v));
}
inline void v_store3(float* dst, f32x4 v)
{
alignas(16) float tmp[4]; // temp storage
_mm_store_ps(tmp, v); // store all 4 lanes
dst[0] = tmp[0];
dst[1] = tmp[1];
dst[2] = tmp[2];
}
//------------------------------------------------------
// Mask helpers
//------------------------------------------------------
inline f32x4 v_mask_xyz() { return _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)); }
inline f32x4 v_swizzle_mask(f32x4 v, int x, int y, int z, int w)
{
alignas(16) float tmp[4];
_mm_store_ps(tmp, v);
return _mm_set_ps(
tmp[w],
tmp[z],
tmp[y],
tmp[x]
);
}
inline f32x4 v_swizzle_singular_mask(f32x4 v, int x)
{
alignas(16) float tmp[4];
_mm_store_ps(tmp, v);
return _mm_set1_ps(tmp[x]);
}
inline f32x4 v_swizzle_lo(f32x4 v1)
{
return _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2,2,0,0));
}
inline f32x4 v_swizzle_hi(f32x4 v1)
{
return _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 3, 1, 1));
}
inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
{
f32x4 mask = _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)); // lane3 = 1
return _mm_or_ps(_mm_and_ps(mask, original), _mm_andnot_ps(mask, newv));
}
//------------------------------------------------------
// Shuffle helpers
//------------------------------------------------------
inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w)
{
alignas(16) float a[4], b[4];
_mm_store_ps(a, v1);
_mm_store_ps(b, v2);
return _mm_set_ps(
b[w],
b[z],
a[y],
a[x]
);
}
inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2)
{
return _mm_movelh_ps(v1, v2);
}
inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2)
{
return _mm_movehl_ps(v2, v1);
}
//------------------------------------------------------
// Simple Arithmetic
//------------------------------------------------------
// Element-wise multiply
inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
// Element-wise divide
inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
// Element-wise add
inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
// Element-wise subtract
inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
// Absolute value.
inline f32x4 v_abs(f32x4 v)
{
const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
return _mm_and_ps(v, mask);
}
//------------------------------------------------------
// Fast recip
//------------------------------------------------------
// Fast recip 1/b
inline f32x4 v_rcp_nr(f32x4 b)
{
f32x4 r = _mm_rcp_ps(b);
// two Newton-Raphson refinements: r' = 2r - b*r*r
r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
}
// Divide fast ( b = recip eg 1/b)
inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); }
inline f32x4 v_rsqrt_nr(f32x4 x)
{
f32x4 half = _mm_set1_ps(0.5f);
f32x4 three = _mm_set1_ps(3.0f);
f32x4 r = _mm_rsqrt_ps(x);
f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r);
return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr));
}
//------------------------------------------------------
// Vector intrinsic functions
//------------------------------------------------------
// full dot4
inline f32x4 v_dot4(f32x4 a, f32x4 b)
{
f32x4 prod = _mm_mul_ps(a, b); // multiply element-wise
// pairwise sums, then swap halves, so the result is splatted to every lane;
// callers such as m_inverse scale whole rows by rcp(det) and rely on this
f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1)); // [p1,p0,p3,p2]
prod = _mm_add_ps(prod, shuf); // [x+y, x+y, z+w, z+w]
shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2)); // swap halves
prod = _mm_add_ps(prod, shuf);
return prod; // f32x4, all lanes = dot(a,b)
}
// dot3 (ignores w)
inline f32x4 v_dot3(f32x4 a, f32x4 b)
{
f32x4 prod = _mm_mul_ps(a, b);
prod = _mm_and_ps(prod, v_mask_xyz()); // zero w
f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1));
prod = _mm_add_ps(prod, shuf);
shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2));
prod = _mm_add_ps(prod, shuf);
return prod; // f32x4, all lanes = dot(a,b)
}
// cross product xyz only.
inline f32x4 v_cross(f32x4 a, f32x4 b)
{
f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
f32x4 c = _mm_sub_ps( _mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b));
return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
}
inline f32x4 v_normalize3(f32x4 v)
{
const f32x4 zero = _mm_setzero_ps();
const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0}
f32x4 dot = v_dot3(v, v);
f32x4 inv = v_rsqrt_nr(dot);
f32x4 isZero = _mm_cmpeq_ps(dot, zero);
f32x4 norm = _mm_mul_ps(v, inv);
// vbsl equivalent
f32x4 result = _mm_or_ps(
_mm_and_ps(isZero, fallback),
_mm_andnot_ps(isZero, norm)
);
return result;
}
// adds all 4 lanes together.
inline f32x4 v_hadd4(f32x4 a)
{
// sum all 4 lanes in SSE2
__m128 shuf = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); // swap pairs
__m128 sums = _mm_add_ps(a, shuf);
shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2));
return _mm_add_ps(sums, shuf);
}
//------------------------------------------------------
// Matrix type (row-major 4x4)
//------------------------------------------------------
struct f32x4x4
{
f32x4 r0;
f32x4 r1;
f32x4 r2;
f32x4 r3;
};
//------------------------------------------------------
// Matrix Load / Store
//------------------------------------------------------
inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major)
{
f32x4x4 out;
out.r0 = v_load(m + 0);
out.r1 = v_load(m + 4);
out.r2 = v_load(m + 8);
out.r3 = v_load(m + 12);
return out;
}
inline void m_store(float* dst, const f32x4x4& m)
{
v_store(dst + 0, m.r0);
v_store(dst + 4, m.r1);
v_store(dst + 8, m.r2);
v_store(dst + 12, m.r3);
}
inline f32x4x4 m_identity()
{
f32x4x4 m;
m.r0 = _mm_set_ps(0, 0, 0, 1);
m.r1 = _mm_set_ps(0, 0, 1, 0);
m.r2 = _mm_set_ps(0, 1, 0, 0);
m.r3 = _mm_set_ps(1, 0, 0, 0);
return m;
}
inline f32x4x4 m_zero()
{
f32x4 z = v_zero();
return { z, z, z, z };
}
//------------------------------------------------------
// Matrix × Vector
//------------------------------------------------------
inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v)
{
f32x4 x = v_dot4(m.r0, v);
f32x4 y = v_dot4(m.r1, v);
f32x4 z = v_dot4(m.r2, v);
f32x4 w = v_dot4(m.r3, v);
// combine to a vector
return _mm_set_ps(_mm_cvtss_f32(w),
_mm_cvtss_f32(z),
_mm_cvtss_f32(y),
_mm_cvtss_f32(x));
}
inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v)
{
f32x4 x = v_dot3(m.r0, v);
f32x4 y = v_dot3(m.r1, v);
f32x4 z = v_dot3(m.r2, v);
// combine to a vector
return _mm_set_ps(0.0f,
_mm_cvtss_f32(z),
_mm_cvtss_f32(y),
_mm_cvtss_f32(x));
}
//------------------------------------------------------
// Matrix × Matrix
//------------------------------------------------------
inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b)
{
// Transpose B once for dot products
f32x4 b0 = b.r0;
f32x4 b1 = b.r1;
f32x4 b2 = b.r2;
f32x4 b3 = b.r3;
// Transpose (SSE2)
_MM_TRANSPOSE4_PS(b0, b1, b2, b3);
f32x4x4 C;
auto mul_row = [&](f32x4 arow)
{
f32x4 x = v_dot4(arow, b0);
f32x4 y = v_dot4(arow, b1);
f32x4 z = v_dot4(arow, b2);
f32x4 w = v_dot4(arow, b3);
f32x4 xy = _mm_unpacklo_ps(x, y);
f32x4 zw = _mm_unpacklo_ps(z, w);
return _mm_movelh_ps(xy, zw);
};
C.r0 = mul_row(a.r0);
C.r1 = mul_row(a.r1);
C.r2 = mul_row(a.r2);
C.r3 = mul_row(a.r3);
return C;
}
//------------------------------------------------------
// Transpose
//------------------------------------------------------
inline f32x4x4 m_transpose(const f32x4x4& m)
{
f32x4 r0 = m.r0;
f32x4 r1 = m.r1;
f32x4 r2 = m.r2;
f32x4 r3 = m.r3;
_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
return { r0, r1, r2, r3 };
}
inline void m_set3x3_transpose(float* dst, f32x4 c0, f32x4 c1, f32x4 c2)
{
dst[0] = v_extract0(c0); // c0.x
dst[1] = v_extract0(c1); // c1.x
dst[2] = v_extract0(c2); // c2.x
dst[4] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 1))); // c0.y
dst[5] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 1))); // c1.y
dst[6] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 1))); // c2.y
dst[8] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 2))); // c0.z
dst[9] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 2))); // c1.z
dst[10] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 2))); // c2.z
}
inline f32x4x4 m_inverse(const f32x4x4& m)
{
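// NOTE: this adjugate-via-cross shortcut treats each row as a 3-vector. For a
// standard transform whose bottom row d is (0, 0, 0, 1), cross(c, d) is zero
// and the computed det collapses to 0; prefer m_inverse_rigid / m_inverse_affine
// for such matrices.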
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;
f32x4 c0 = v_cross(b, c);
f32x4 c1 = v_cross(c, d);
f32x4 c2 = v_cross(d, a);
f32x4 c3 = v_cross(a, b);
f32x4 det = v_dot4(a, c1);
f32x4 invDet = v_rcp_nr(det);
f32x4x4 adj;
adj.r0 = _mm_mul_ps(c1, invDet);
adj.r1 = _mm_mul_ps(c2, invDet);
adj.r2 = _mm_mul_ps(c3, invDet);
adj.r3 = _mm_mul_ps(c0, invDet);
return m_transpose(adj);
}
inline f32x4x4 m_inverse_rigid(const f32x4x4& m)
{
f32x4x4 t = m_transpose(m);
// Extract translation
f32x4 T = v_set(
_mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))),
_mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))),
_mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))),
0.0f
);
f32x4 newT = m_mul_vec4(t, T);
newT = _mm_sub_ps(v_zero(), newT);
// Each row needs its own component of -R^T * T in w; v_preserve_w pulls lane 3,
// so splat the wanted lane first (passing newT directly would write newT.w into every row)
t.r0 = v_preserve_w(t.r0, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(0, 0, 0, 0))); // w = newT.x
t.r1 = v_preserve_w(t.r1, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(1, 1, 1, 1))); // w = newT.y
t.r2 = v_preserve_w(t.r2, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(2, 2, 2, 2))); // w = newT.z
t.r3 = v_set(0, 0, 0, 1);
return t;
}
inline f32x4x4 m_inverse_affine(const f32x4x4& m)
{
// Extract upper 3x3 rows
f32x4 r0 = m.r0;
f32x4 r1 = m.r1;
f32x4 r2 = m.r2;
// Zero translation
r0 = _mm_and_ps(r0, v_mask_xyz());
r1 = _mm_and_ps(r1, v_mask_xyz());
r2 = _mm_and_ps(r2, v_mask_xyz());
// Compute cofactors via cross products
f32x4 c0 = v_cross(r1, r2);
f32x4 c1 = v_cross(r2, r0);
f32x4 c2 = v_cross(r0, r1);
// Determinant
f32x4 det = v_dot3(r0, c0);
// Reciprocal determinant
f32x4 invDet = v_rcp_nr(det);
// Inverse 3x3 (transpose of cofactor matrix)
f32x4 i0 = _mm_mul_ps(c0, invDet);
f32x4 i1 = _mm_mul_ps(c1, invDet);
f32x4 i2 = _mm_mul_ps(c2, invDet);
// Transpose 3x3
f32x4 t0 = i0;
f32x4 t1 = i1;
f32x4 t2 = i2;
f32x4 t3 = _mm_setzero_ps();
_MM_TRANSPOSE4_PS(t0, t1, t2, t3);
// Extract translation
alignas(16) float tmp[4];
tmp[0] = _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3)));
tmp[1] = _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3)));
tmp[2] = _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3)));
tmp[3] = 0.0f;
f32x4 T = v_load(tmp);
// New translation = -R' * T
f32x4 newT = m_mul_vec4({ t0, t1, t2, t3 }, T);
newT = _mm_sub_ps(v_zero(), newT);
// Assemble final matrix
f32x4x4 out;
// Splat each component of the new translation before blending into lane 3
out.r0 = v_preserve_w(t0, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(0, 0, 0, 0))); // w = newT.x
out.r1 = v_preserve_w(t1, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(1, 1, 1, 1))); // w = newT.y
out.r2 = v_preserve_w(t2, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(2, 2, 2, 2))); // w = newT.z
out.r3 = _mm_set_ps(1, 0, 0, 0);
return out;
}
inline f32x4 m_determinant(const f32x4x4& m)
{
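// NOTE: v_cross ignores the w lanes, so this cofactor shortcut equals the full
// 4x4 determinant only when the bottom row is (0, 0, 0, 1), i.e. the engine's
// usual affine transforms; a general projective matrix needs a full expansion.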
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;
f32x4 c0 = v_cross(c, d);
f32x4 c1 = v_cross(d, b);
f32x4 c2 = v_cross(b, c);
f32x4 term0 = _mm_mul_ps(a, c0);
f32x4 term1 = _mm_mul_ps(a, c1);
f32x4 term2 = _mm_mul_ps(a, c2);
f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2));
return v_hadd4(det);
}
inline f32x4 m_determinant_affine(const f32x4x4& m)
{
f32x4 r0 = m.r0;
f32x4 r1 = m.r1;
f32x4 r2 = m.r2;
r0 = _mm_and_ps(r0, v_mask_xyz());
r1 = _mm_and_ps(r1, v_mask_xyz());
r2 = _mm_and_ps(r2, v_mask_xyz());
f32x4 c0 = v_cross(r1, r2);
return v_dot3(r0, c0); // splatted determinant
}
//---------------------------------------------------------
// BATCH INTRINSICS
//---------------------------------------------------------
struct vec4_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
f32x4 w;
};
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
vec4_batch4 r;
r.x = _mm_set_ps(
ptr[9], ptr[6], ptr[3], ptr[0]);
r.y = _mm_set_ps(
ptr[10], ptr[7], ptr[4], ptr[1]);
r.z = _mm_set_ps(
ptr[11], ptr[8], ptr[5], ptr[2]);
if (fillW)
{
r.w = _mm_set1_ps(w);
}
return r;
}
// Store the result back out to a float array 4 at a time
inline void store_f32x4(float* out, f32x4 v)
{
_mm_storeu_ps(out, v);
}
// Store the result back to a float3 array with size of 4
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
{
alignas(16) float xs[4];
alignas(16) float ys[4];
alignas(16) float zs[4];
_mm_store_ps(xs, v.x);
_mm_store_ps(ys, v.y);
_mm_store_ps(zs, v.z);
for (int i = 0; i < 4; ++i)
{
out[i * 3 + 0] = xs[i];
out[i * 3 + 1] = ys[i];
out[i * 3 + 2] = zs[i];
}
}
inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b)
{
f32x4 mulx = _mm_mul_ps(a.x, b.x);
f32x4 muly = _mm_mul_ps(a.y, b.y);
f32x4 mulz = _mm_mul_ps(a.z, b.z);
return _mm_add_ps(_mm_add_ps(mulx, muly), mulz);
}
// Batch-4 matrix * position transform (w assumed 1; uses rows 0-2 of the row-major matrix).
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
{
vec4_batch4 r;
f32x4 m00 = _mm_set1_ps(m[0]);
f32x4 m01 = _mm_set1_ps(m[1]);
f32x4 m02 = _mm_set1_ps(m[2]);
f32x4 m03 = _mm_set1_ps(m[3]);
f32x4 m10 = _mm_set1_ps(m[4]);
f32x4 m11 = _mm_set1_ps(m[5]);
f32x4 m12 = _mm_set1_ps(m[6]);
f32x4 m13 = _mm_set1_ps(m[7]);
f32x4 m20 = _mm_set1_ps(m[8]);
f32x4 m21 = _mm_set1_ps(m[9]);
f32x4 m22 = _mm_set1_ps(m[10]);
f32x4 m23 = _mm_set1_ps(m[11]);
//
// row0 dot
//
r.x = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m00),
_mm_mul_ps(v.y, m01)),
_mm_add_ps(
_mm_mul_ps(v.z, m02), m03));
//
// row1 dot
//
r.y = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m10),
_mm_mul_ps(v.y, m11)),
_mm_add_ps(
_mm_mul_ps(v.z, m12), m13));
//
// row2 dot
//
r.z = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m20),
_mm_mul_ps(v.y, m21)),
_mm_add_ps(
_mm_mul_ps(v.z, m22), m23));
return r;
}
}
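As a usage sketch (mat44_concat is a hypothetical helper, not part of this patch), composing two row-major matrices with the helpers above is a load/multiply/store round trip:
inline void mat44_concat(const float* a16, const float* b16, float* out16)
{
f32x4x4 A = m_load(a16); // 16 floats each, row-major
f32x4x4 B = m_load(b16);
m_store(out16, m_mul(A, B)); // out = A * B
}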

View file

@ -0,0 +1,26 @@
#include "sse41_intrinsics.h"
#include "float3_dispatch.h"
#include <smmintrin.h> // SSE41 intrinsics
#include "float3_impl.inl"
namespace math_backend::float3::dispatch
{
// Install SSE41 backend
void install_sse41()
{
gFloat3.add = float3_add_impl;
gFloat3.sub = float3_sub_impl;
gFloat3.mul = float3_mul_impl;
gFloat3.mul_scalar = float3_mul_scalar_impl;
gFloat3.div = float3_div_impl;
gFloat3.div_scalar = float3_div_scalar_impl;
gFloat3.dot = float3_dot_impl;
gFloat3.length = float3_length_impl;
gFloat3.lengthSquared = float3_length_squared_impl;
gFloat3.normalize = float3_normalize_impl;
gFloat3.normalize_mag = float3_normalize_mag_impl;
gFloat3.lerp = float3_lerp_impl;
gFloat3.cross = float3_cross_impl;
}
}

View file

@ -0,0 +1,25 @@
#include "sse41_intrinsics.h"
#include "float4_dispatch.h"
#include "float4_impl.inl"
namespace math_backend::float4::dispatch
{
// Install SSE41 backend
void install_sse41()
{
gFloat4.add = float4_add_impl;
gFloat4.sub = float4_sub_impl;
gFloat4.mul = float4_mul_impl;
gFloat4.mul_scalar = float4_mul_scalar_impl;
gFloat4.div = float4_div_impl;
gFloat4.div_scalar = float4_div_scalar_impl;
gFloat4.dot = float4_dot_impl;
gFloat4.length = float4_length_impl;
gFloat4.lengthSquared = float4_length_squared_impl;
gFloat4.normalize = float4_normalize_impl;
gFloat4.normalize_mag = float4_normalize_mag_impl;
gFloat4.lerp = float4_lerp_impl;
gFloat4.cross = float4_cross_impl;
}
}

View file

@ -0,0 +1,25 @@
#include "sse41_intrinsics.h"
#include "mat44_dispatch.h"
#include "mat44_impl.inl"
namespace math_backend::mat44::dispatch
{
void install_sse41()
{
gMat44.transpose = mat44_transpose_impl;
gMat44.inverse = mat44_inverse_impl;
gMat44.inverse_to = mat44_inverse_to_impl;
gMat44.affine_inverse = mat44_affine_inverse_impl;
gMat44.mul_mat44 = mat44_mul_mat44_impl;
gMat44.mul_pos3 = mat44_mul_pos3_impl;
gMat44.mul_vec3 = mat44_mul_vec3_impl;
gMat44.mul_float4 = mat44_mul_float4_impl;
gMat44.transform_plane = mat44_transform_plane_impl;
gMat44.scale = mat44_scale_impl;
gMat44.get_scale = mat44_get_scale_impl;
gMat44.normalize = mat44_normalize_impl;
gMat44.determinant = mat44_get_determinant;
gMat44.batch_mul_pos3 = mat44_batch_mul_pos3;
}
}

View file

@ -0,0 +1,625 @@
#pragma once
#include <smmintrin.h> // SSE4.1
namespace
{
typedef __m128 f32x4;
//------------------------------------------------------
// Load / Store
//------------------------------------------------------
// Load 4 floats from memory into a SIMD register
inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }
inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }
inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
inline f32x4 v_zero() { return _mm_setzero_ps(); }
inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }
//------------------------------------------------------
// Mask helpers
//------------------------------------------------------
// all-ones bits in the x/y/z lanes; this feeds _mm_and_ps, so it must be a bit
// mask (blending the float values 1.0f would AND rows with 0x3F800000)
inline f32x4 v_mask_xyz() { return _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)); }
inline f32x4 v_swizzle_mask(f32x4 v, int x, int y, int z, int w)
{
alignas(16) float tmp[4];
_mm_store_ps(tmp, v);
return _mm_set_ps(
tmp[w],
tmp[z],
tmp[y],
tmp[x]
);
}
inline f32x4 v_swizzle_singular_mask(f32x4 v, int x)
{
alignas(16) float tmp[4];
_mm_store_ps(tmp, v);
return _mm_set1_ps(tmp[x]);
}
inline f32x4 v_swizzle_lo(f32x4 v1)
{
return _mm_moveldup_ps(v1);
}
inline f32x4 v_swizzle_hi(f32x4 v1)
{
return _mm_movehdup_ps(v1);
}
inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
{
return _mm_blend_ps(newv, original, 0b1000);
}
//------------------------------------------------------
// Shuffle helpers
//------------------------------------------------------
inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w)
{
alignas(16) float a[4], b[4];
_mm_store_ps(a, v1);
_mm_store_ps(b, v2);
return _mm_set_ps(
b[w],
b[z],
a[y],
a[x]
);
}
inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2)
{
return _mm_movelh_ps(v1, v2);
}
inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2)
{
return _mm_movehl_ps(v2, v1);
}
//------------------------------------------------------
// Float3 helpers (safe loading into 4 lanes)
//------------------------------------------------------
inline f32x4 v_load3_vec(const float* p) // w = 0
{
return _mm_set_ps(0.0f, p[2], p[1], p[0]);
}
inline f32x4 v_load3_pos(const float* p) // w = 1
{
return _mm_set_ps(1.0f, p[2], p[1], p[0]);
}
inline void v_store3(float* dst, f32x4 v)
{
alignas(16) float tmp[4]; // temp storage
_mm_store_ps(tmp, v); // store all 4 lanes
dst[0] = tmp[0];
dst[1] = tmp[1];
dst[2] = tmp[2];
}
inline f32x4 v_set(float x, float y, float z, float w)
{
return _mm_set_ps(w, z, y, x);
}
inline f32x4 v_insert_w(f32x4 v, f32x4 w)
{
return _mm_insert_ps(v, w, 0x30);
}
//------------------------------------------------------
// Simple Arithmetic
//------------------------------------------------------
// Element-wise multiply
inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
// Element-wise divide
inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
// Element-wise add
inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
// Element-wise subtract
inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
//------------------------------------------------------
// Fast recip
//------------------------------------------------------
// Fast recip 1/b
inline f32x4 v_rcp_nr(f32x4 b)
{
f32x4 r = _mm_rcp_ps(b);
// two Newton-Raphson refinements: r' = 2r - b*r*r
r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
}
// Divide fast ( b = recip eg 1/b)
inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); }
inline f32x4 v_rsqrt_nr(f32x4 x)
{
f32x4 half = _mm_set1_ps(0.5f);
f32x4 three = _mm_set1_ps(3.0f);
f32x4 r = _mm_rsqrt_ps(x);
f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r);
return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr));
}
//------------------------------------------------------
// Vector intrinsic functions
//------------------------------------------------------
// full dot4
inline f32x4 v_dot4(f32x4 a, f32x4 b)
{
return _mm_dp_ps(a, b, 0xFF); // dot over all 4 lanes, result broadcast to every lane
}
// dot3 (ignores w)
inline f32x4 v_dot3(f32x4 a, f32x4 b)
{
return _mm_dp_ps(a, b, 0x7F); // dot over the x/y/z lanes, result broadcast to every lane
}
// cross product xyz only.
inline f32x4 v_cross(f32x4 a, f32x4 b)
{
f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b));
return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
}
inline f32x4 v_normalize3(f32x4 v)
{
const f32x4 zero = _mm_setzero_ps();
const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0}
f32x4 dot = v_dot3(v, v);
f32x4 inv = v_rsqrt_nr(dot);
f32x4 isZero = _mm_cmpeq_ps(dot, zero);
f32x4 norm = _mm_mul_ps(v, inv);
return _mm_blendv_ps(norm, fallback, isZero);
}
// adds all 4 lanes together.
inline f32x4 v_hadd4(f32x4 a)
{
// sum all 4 lanes (SSE3 haddps, available on SSE4.1 targets)
__m128 sum = _mm_hadd_ps(a, a);
return _mm_hadd_ps(sum, sum);
}
//------------------------------------------------------
// Matrix type (row-major 4x4)
//------------------------------------------------------
struct f32x4x4
{
f32x4 r0;
f32x4 r1;
f32x4 r2;
f32x4 r3;
};
//------------------------------------------------------
// Matrix Load / Store
//------------------------------------------------------
inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major)
{
f32x4x4 out;
out.r0 = v_load(m + 0);
out.r1 = v_load(m + 4);
out.r2 = v_load(m + 8);
out.r3 = v_load(m + 12);
return out;
}
inline void m_store(float* dst, const f32x4x4& m)
{
_mm_storeu_ps(dst + 0, m.r0);
_mm_storeu_ps(dst + 4, m.r1);
_mm_storeu_ps(dst + 8, m.r2);
_mm_storeu_ps(dst + 12, m.r3);
}
inline f32x4x4 m_identity()
{
f32x4x4 m;
m.r0 = _mm_set_ps(0, 0, 0, 1);
m.r1 = _mm_set_ps(0, 0, 1, 0);
m.r2 = _mm_set_ps(0, 1, 0, 0);
m.r3 = _mm_set_ps(1, 0, 0, 0);
return m;
}
inline f32x4x4 m_zero()
{
f32x4 z = v_zero();
return { z, z, z, z };
}
//------------------------------------------------------
// Matrix × Vector
//------------------------------------------------------
inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v)
{
f32x4 x = v_dot4(m.r0, v);
f32x4 y = v_dot4(m.r1, v);
f32x4 z = v_dot4(m.r2, v);
f32x4 w = v_dot4(m.r3, v);
// combine to a vector
return _mm_set_ps(_mm_cvtss_f32(w),
_mm_cvtss_f32(z),
_mm_cvtss_f32(y),
_mm_cvtss_f32(x));
}
inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v)
{
f32x4 x = v_dot3(m.r0, v);
f32x4 y = v_dot3(m.r1, v);
f32x4 z = v_dot3(m.r2, v);
// combine to a vector
return _mm_set_ps(0.0f,
_mm_cvtss_f32(z),
_mm_cvtss_f32(y),
_mm_cvtss_f32(x));
}
//------------------------------------------------------
// Matrix × Matrix
//------------------------------------------------------
inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b)
{
// Transpose B once for dot products
f32x4 b0 = b.r0;
f32x4 b1 = b.r1;
f32x4 b2 = b.r2;
f32x4 b3 = b.r3;
// Transpose (SSE2)
_MM_TRANSPOSE4_PS(b0, b1, b2, b3);
f32x4x4 C;
auto mul_row = [&](f32x4 arow)
{
f32x4 x = v_dot4(arow, b0);
f32x4 y = v_dot4(arow, b1);
f32x4 z = v_dot4(arow, b2);
f32x4 w = v_dot4(arow, b3);
f32x4 xy = _mm_unpacklo_ps(x, y);
f32x4 zw = _mm_unpacklo_ps(z, w);
return _mm_movelh_ps(xy, zw);
};
C.r0 = mul_row(a.r0);
C.r1 = mul_row(a.r1);
C.r2 = mul_row(a.r2);
C.r3 = mul_row(a.r3);
return C;
}
//------------------------------------------------------
// Transpose
//------------------------------------------------------
inline f32x4x4 m_transpose(const f32x4x4& m)
{
f32x4 r0 = m.r0;
f32x4 r1 = m.r1;
f32x4 r2 = m.r2;
f32x4 r3 = m.r3;
_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
return { r0, r1, r2, r3 };
}
inline void m_set3x3_transpose(float* dst, f32x4 c0, f32x4 c1, f32x4 c2)
{
dst[0] = v_extract0(c0); // c0.x
dst[1] = v_extract0(c1); // c1.x
dst[2] = v_extract0(c2); // c2.x
dst[4] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 1))); // c0.y
dst[5] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 1))); // c1.y
dst[6] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 1))); // c2.y
dst[8] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 2))); // c0.z
dst[9] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 2))); // c1.z
dst[10] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 2))); // c2.z
}
inline f32x4x4 m_inverse(const f32x4x4& m)
{
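// NOTE: this adjugate-via-cross shortcut treats each row as a 3-vector. For a
// standard transform whose bottom row d is (0, 0, 0, 1), cross(c, d) is zero
// and the computed det collapses to 0; prefer m_inverse_rigid / m_inverse_affine
// for such matrices.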
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;
f32x4 c0 = v_cross(b, c);
f32x4 c1 = v_cross(c, d);
f32x4 c2 = v_cross(d, a);
f32x4 c3 = v_cross(a, b);
f32x4 det = v_dot4(a, c1);
f32x4 invDet = v_rcp_nr(det);
f32x4x4 adj;
adj.r0 = _mm_mul_ps(c1, invDet);
adj.r1 = _mm_mul_ps(c2, invDet);
adj.r2 = _mm_mul_ps(c3, invDet);
adj.r3 = _mm_mul_ps(c0, invDet);
return m_transpose(adj);
}
inline f32x4x4 m_inverse_rigid(const f32x4x4& m)
{
f32x4x4 t = m_transpose(m);
// Extract translation
f32x4 T = v_set(
_mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))),
_mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))),
_mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))),
0.0f
);
f32x4 newT = m_mul_vec4(t, T);
newT = _mm_sub_ps(v_zero(), newT);
// Each row needs its own component of -R^T * T in w; v_preserve_w pulls lane 3,
// so splat the wanted lane first (passing newT directly would write newT.w into every row)
t.r0 = v_preserve_w(t.r0, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(0, 0, 0, 0))); // w = newT.x
t.r1 = v_preserve_w(t.r1, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(1, 1, 1, 1))); // w = newT.y
t.r2 = v_preserve_w(t.r2, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(2, 2, 2, 2))); // w = newT.z
t.r3 = v_set(0, 0, 0, 1);
return t;
}
inline f32x4x4 m_inverse_affine(const f32x4x4& m)
{
// Extract upper 3x3 rows
f32x4 r0 = m.r0;
f32x4 r1 = m.r1;
f32x4 r2 = m.r2;
// Zero translation
r0 = _mm_and_ps(r0, v_mask_xyz());
r1 = _mm_and_ps(r1, v_mask_xyz());
r2 = _mm_and_ps(r2, v_mask_xyz());
// Compute cofactors via cross products
f32x4 c0 = v_cross(r1, r2);
f32x4 c1 = v_cross(r2, r0);
f32x4 c2 = v_cross(r0, r1);
// Determinant
f32x4 det = v_dot3(r0, c0);
// Reciprocal determinant
f32x4 invDet = v_rcp_nr(det);
// Inverse 3x3 (transpose of cofactor matrix)
f32x4 i0 = _mm_mul_ps(c0, invDet);
f32x4 i1 = _mm_mul_ps(c1, invDet);
f32x4 i2 = _mm_mul_ps(c2, invDet);
// Transpose 3x3
f32x4 t0 = i0;
f32x4 t1 = i1;
f32x4 t2 = i2;
f32x4 t3 = _mm_setzero_ps();
_MM_TRANSPOSE4_PS(t0, t1, t2, t3);
// Extract translation
alignas(16) float tmp[4];
tmp[0] = _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3)));
tmp[1] = _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3)));
tmp[2] = _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3)));
tmp[3] = 0.0f;
f32x4 T = v_load(tmp);
// New translation = -R' * T
f32x4 newT = m_mul_vec4({ t0, t1, t2, t3 }, T);
newT = _mm_sub_ps(v_zero(), newT);
// Assemble final matrix
f32x4x4 out;
// Splat each component of the new translation before blending into lane 3
out.r0 = v_preserve_w(t0, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(0, 0, 0, 0))); // w = newT.x
out.r1 = v_preserve_w(t1, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(1, 1, 1, 1))); // w = newT.y
out.r2 = v_preserve_w(t2, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(2, 2, 2, 2))); // w = newT.z
out.r3 = _mm_set_ps(1, 0, 0, 0);
return out;
}
inline f32x4 m_determinant(const f32x4x4& m)
{
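// NOTE: v_cross ignores the w lanes, so this cofactor shortcut equals the full
// 4x4 determinant only when the bottom row is (0, 0, 0, 1), i.e. the engine's
// usual affine transforms; a general projective matrix needs a full expansion.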
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;
f32x4 c0 = v_cross(c, d);
f32x4 c1 = v_cross(d, b);
f32x4 c2 = v_cross(b, c);
f32x4 term0 = _mm_mul_ps(a, c0);
f32x4 term1 = _mm_mul_ps(a, c1);
f32x4 term2 = _mm_mul_ps(a, c2);
f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2));
return v_hadd4(det);
}
inline f32x4 m_determinant_affine(const f32x4x4& m)
{
f32x4 r0 = m.r0;
f32x4 r1 = m.r1;
f32x4 r2 = m.r2;
r0 = _mm_and_ps(r0, v_mask_xyz());
r1 = _mm_and_ps(r1, v_mask_xyz());
r2 = _mm_and_ps(r2, v_mask_xyz());
f32x4 c0 = v_cross(r1, r2);
return v_dot3(r0, c0); // splatted determinant
}
//---------------------------------------------------------
// BATCH INTRINSICS
//---------------------------------------------------------
struct vec4_batch4
{
f32x4 x;
f32x4 y;
f32x4 z;
f32x4 w;
};
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
vec4_batch4 r;
r.x = _mm_set_ps(
ptr[9], ptr[6], ptr[3], ptr[0]);
r.y = _mm_set_ps(
ptr[10], ptr[7], ptr[4], ptr[1]);
r.z = _mm_set_ps(
ptr[11], ptr[8], ptr[5], ptr[2]);
if (fillW)
{
r.w = _mm_set1_ps(w);
}
return r;
}
// Store the result back out to a float array 4 at a time
inline void store_f32x4(float* out, f32x4 v)
{
_mm_storeu_ps(out, v);
}
// Store the result back to a float3 array with size of 4
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
{
alignas(16) float xs[4];
alignas(16) float ys[4];
alignas(16) float zs[4];
_mm_store_ps(xs, v.x);
_mm_store_ps(ys, v.y);
_mm_store_ps(zs, v.z);
for (int i = 0; i < 4; ++i)
{
out[i * 3 + 0] = xs[i];
out[i * 3 + 1] = ys[i];
out[i * 3 + 2] = zs[i];
}
}
inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b)
{
f32x4 mulx = _mm_mul_ps(a.x, b.x);
f32x4 muly = _mm_mul_ps(a.y, b.y);
f32x4 mulz = _mm_mul_ps(a.z, b.z);
return _mm_add_ps(_mm_add_ps(mulx, muly), mulz);
}
// Batch-4 matrix * position transform (w assumed 1; uses rows 0-2 of the row-major matrix).
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
{
vec4_batch4 r;
f32x4 m00 = _mm_set1_ps(m[0]);
f32x4 m01 = _mm_set1_ps(m[1]);
f32x4 m02 = _mm_set1_ps(m[2]);
f32x4 m03 = _mm_set1_ps(m[3]);
f32x4 m10 = _mm_set1_ps(m[4]);
f32x4 m11 = _mm_set1_ps(m[5]);
f32x4 m12 = _mm_set1_ps(m[6]);
f32x4 m13 = _mm_set1_ps(m[7]);
f32x4 m20 = _mm_set1_ps(m[8]);
f32x4 m21 = _mm_set1_ps(m[9]);
f32x4 m22 = _mm_set1_ps(m[10]);
f32x4 m23 = _mm_set1_ps(m[11]);
//
// row0 dot
//
r.x = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m00),
_mm_mul_ps(v.y, m01)),
_mm_add_ps(
_mm_mul_ps(v.z, m02), m03));
//
// row1 dot
//
r.y = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m10),
_mm_mul_ps(v.y, m11)),
_mm_add_ps(
_mm_mul_ps(v.z, m12), m13));
//
// row2 dot
//
r.z = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(v.x, m20),
_mm_mul_ps(v.y, m21)),
_mm_add_ps(
_mm_mul_ps(v.z, m22), m23));
return r;
}
}

View file

@ -151,7 +151,7 @@ inline void m_matF_x_vectorF(const F32 *m, const F32 *v, F32 *vresult)
inline bool mIsEqual( F32 a, F32 b, const F32 epsilon = __EQUAL_CONST_F )
{
F32 diff = a - b;
return diff > -epsilon && diff < epsilon;
}
inline bool mIsZero(const F32 val, const F32 epsilon = __EQUAL_CONST_F )
@ -207,16 +207,16 @@ inline F32 mFmod(const F32 val, const F32 mod)
return fmod(val, mod);
}
inline S32 mRound(const F32 val)
{
return (S32)floor(val + 0.5f);
}
inline F32 mRound(const F32 val, const S32 n)
{
S32 place = (S32) pow(10.0f, n);
return mFloor((val*place)+0.5)/place;
}
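// e.g. mRound(3.14159f, 2): place = 100, mFloor(314.659f) / 100 = 3.14f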
inline F32 mRoundF(const F32 val, const F32 step)
@ -261,15 +261,15 @@ inline F32 mClampF(F32 val, F32 low, F32 high)
inline S32 mWrap(S32 val, S32 low, S32 high)
{
int len = high - low;
return low + (val >= 0 ? val % len : -val % len ? len - (-val % len) : 0);
}
inline F32 mWrapF(F32 val, F32 low, F32 high)
{
F32 t = fmod(val - low, high - low);
return t < 0 ? t + high : t + low;
}
/// Template function for doing a linear interpolation between any two
@ -497,7 +497,7 @@ inline F64 mRadToDeg(F64 r)
inline bool mIsNaN_F( const F32 x )
{
// If x is a floating point variable, then (x != x) will be TRUE if x has the value NaN.
// This is only going to work if the compiler is IEEE 748 compliant.
//
// Tested and working on VC2k5

View file

@ -152,8 +152,8 @@ static void m_point3F_interpolate_C(const F32 *from, const F32 *to, F32 factor,
#ifdef TORQUE_COMPILER_GCC
// remove possibility of aliases
const F32 inverse = 1.0f - factor;
const F32 from0 = from[0], from1 = from[1], from2 = from[2];
const F32 to0 = to[0], to1 = to[1], to2 = to[2];
result[0] = from0 * inverse + to0 * factor;
result[1] = from1 * inverse + to1 * factor;
@ -182,8 +182,8 @@ static void m_point3D_interpolate_C(const F64 *from, const F64 *to, F64 factor,
#ifdef TORQUE_COMPILER_GCC
// remove possibility of aliases
const F64 inverse = 1.0f - factor;
const F64 from0 = from[0], from1 = from[1], from2 = from[2];
const F64 to0 = to[0], to1 = to[1], to2 = to[2];
result[0] = from0 * inverse + to0 * factor;
result[1] = from1 * inverse + to1 * factor;
@ -436,8 +436,8 @@ static void affineInvertTo(const F32 * m, F32 * out)
F32 invDet = 1.0f / (m[idx(0,0)] * d1 + m[idx(0,1)] * d2 + m[idx(0,2)] * d3);
F32 m00 = m[idx(0,0)] * invDet;
F32 m01 = m[idx(0,1)] * invDet;
F32 m02 = m[idx(0,2)] * invDet;
F32 * result = out;
@ -950,4 +950,3 @@ void mInstallLibrary_C()
m_matF_x_scale_x_planeF = m_matF_x_scale_x_planeF_C;
m_matF_x_box3F = m_matF_x_box3F_C;
}

View file

@ -50,13 +50,16 @@
#ifndef _CONSOLE_H_
#include "console/console.h"
#endif
#ifndef _MATH_BACKEND_H_
#include "math/public/math_backend.h"
#endif
#ifndef USE_TEMPLATE_MATRIX
/// 4x4 Matrix Class
///
/// This runs at F32 precision.
using namespace math_backend::mat44::dispatch;
class MatrixF
{
friend class MatrixFEngineExport;
@ -128,7 +131,7 @@ public:
EulerF toEuler() const;
F32 determinant() const {
return m_matF_determinant(*this);
return GetMat44().determinant(*this);
}
/// Compute the inverse of the matrix.
@ -228,6 +231,8 @@ public:
void mul( Point4F& p ) const; ///< M * p -> p (full [4x4] * [1x4])
void mulP( Point3F& p ) const; ///< M * p -> p (assume w = 1.0f)
void mulP( const Point3F &p, Point3F *d) const; ///< M * p -> d (assume w = 1.0f)
void batch_mulP(Point3F* points, U32 count) const;
void batch_mulP(const Point3F* points, U32 count, Point3F* out) const;
void mulV( VectorF& p ) const; ///< M * v -> v (assume w = 0.0f)
void mulV( const VectorF &p, Point3F *d) const; ///< M * v -> d (assume w = 0.0f)
@ -372,71 +377,66 @@ inline MatrixF& MatrixF::identity()
inline MatrixF& MatrixF::inverse()
{
m_matF_inverse(m);
GetMat44().inverse(m);
return (*this);
}
inline void MatrixF::invertTo( MatrixF *out )
{
m_matF_invert_to(m,*out);
GetMat44().inverse_to(*this, *out);
}
inline MatrixF& MatrixF::affineInverse()
{
// AssertFatal(isAffine() == true, "Error, this matrix is not an affine transform");
m_matF_affineInverse(m);
GetMat44().affine_inverse(m);
return (*this);
}
inline MatrixF& MatrixF::transpose()
{
m_matF_transpose(m);
GetMat44().transpose(m);
return (*this);
}
inline MatrixF& MatrixF::scale(const Point3F& p)
{
m_matF_scale(m,p);
GetMat44().scale(m, p);
return *this;
}
inline Point3F MatrixF::getScale() const
{
Point3F scale;
scale.x = mSqrt(m[0]*m[0] + m[4] * m[4] + m[8] * m[8]);
scale.y = mSqrt(m[1]*m[1] + m[5] * m[5] + m[9] * m[9]);
scale.z = mSqrt(m[2]*m[2] + m[6] * m[6] + m[10] * m[10]);
GetMat44().get_scale(m, scale);
return scale;
}
inline void MatrixF::normalize()
{
m_matF_normalize(m);
GetMat44().normalize(m);
}
inline MatrixF& MatrixF::mul( const MatrixF &a )
{ // M * a -> M
AssertFatal(&a != this, "MatrixF::mul - a.mul(a) is invalid!");
MatrixF tempThis(*this);
m_matF_x_matF(tempThis, a, *this);
GetMat44().mul_mat44(tempThis, a, *this);
return (*this);
}
inline MatrixF& MatrixF::mulL( const MatrixF &a )
{ // a * M -> M
AssertFatal(&a != this, "MatrixF::mulL - a.mul(a) is invalid!");
MatrixF tempThis(*this);
m_matF_x_matF(a, tempThis, *this);
GetMat44().mul_mat44(a, tempThis, *this);
return (*this);
}
inline MatrixF& MatrixF::mul( const MatrixF &a, const MatrixF &b )
{ // a * b -> M
AssertFatal((&a != this) && (&b != this), "MatrixF::mul - a.mul(a, b) a.mul(b, a) a.mul(a, a) is invalid!");
m_matF_x_matF(a, b, *this);
GetMat44().mul_mat44(a, b, *this);
return (*this);
}
@ -461,7 +461,7 @@ inline MatrixF& MatrixF::mul(const MatrixF &a, const F32 b)
inline void MatrixF::mul( Point4F& p ) const
{
Point4F temp;
m_matF_x_point4F(*this, &p.x, &temp.x);
GetMat44().mul_float4(*this, p, temp);
p = temp;
}
@ -469,28 +469,62 @@ inline void MatrixF::mulP( Point3F& p) const
{
// M * p -> d
Point3F d;
m_matF_x_point3F(*this, &p.x, &d.x);
GetMat44().mul_pos3(*this, p, d);
p = d;
}
inline void MatrixF::mulP( const Point3F &p, Point3F *d) const
{
// M * p -> d
m_matF_x_point3F(*this, &p.x, &d->x);
GetMat44().mul_pos3(*this, p, *d);
}
inline void MatrixF::batch_mulP(Point3F* points, U32 count) const
{
// In-place transform: the backend kernel may not support aliased
// source/destination buffers, so stage the results in a temporary.
Point3F* temp = new Point3F[count];
GetMat44().batch_mul_pos3(m, reinterpret_cast<const float*>(points), count, reinterpret_cast<float*>(temp));
for (U32 i = 0; i < count; ++i)
points[i] = temp[i];
delete[] temp;
}
inline void MatrixF::batch_mulP(const Point3F* points, U32 count, Point3F* out) const
{
// The caller supplies a distinct destination buffer: let the backend
// write straight into it, no temporary or copy-back needed.
GetMat44().batch_mul_pos3(m, reinterpret_cast<const float*>(points), count, reinterpret_cast<float*>(out));
}
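A hedged usage sketch, not part of this commit: a hypothetical caller showing how the batched overload replaces a per-point loop with a single call through the dispatch table.

// Hypothetical caller, for illustration only.
void transformVerts(const MatrixF& objToWorld, Point3F* verts, U32 count)
{
   objToWorld.batch_mulP(verts, count); // in-place, w assumed 1.0
}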
inline void MatrixF::mulV( VectorF& v) const
{
// M * v -> v
VectorF temp;
m_matF_x_vectorF(*this, &v.x, &temp.x);
GetMat44().mul_vec3(*this, v, temp);
v = temp;
}
inline void MatrixF::mulV( const VectorF &v, Point3F *d) const
{
// M * v -> d
m_matF_x_vectorF(*this, &v.x, &d->x);
GetMat44().mul_vec3(*this, v, *d);
}
inline void MatrixF::mul(Box3F& b) const
@ -634,14 +668,14 @@ inline MatrixF operator * ( const MatrixF &m1, const MatrixF &m2 )
{
// temp = m1 * m2
MatrixF temp;
m_matF_x_matF(m1, m2, temp);
GetMat44().mul_mat44(m1, m2, temp);
return temp;
}
inline MatrixF& MatrixF::operator *= ( const MatrixF &m1 )
{
MatrixF tempThis(*this);
m_matF_x_matF(tempThis, m1, *this);
GetMat44().mul_mat44(tempThis, m1, *this);
return (*this);
}
@ -665,7 +699,9 @@ inline bool MatrixF::isNaN()
inline void mTransformPlane(const MatrixF& mat, const Point3F& scale, const PlaneF& plane, PlaneF * result)
{
m_matF_x_scale_x_planeF(mat, &scale.x, &plane.x, &result->x);
GetMat44().transform_plane(mat, scale, plane, *result);
}
#else // !USE_TEMPLATE_MATRIX

View file

@ -29,6 +29,11 @@
#ifndef _MPOINT2_H_
#include "math/mPoint2.h"
#endif
#ifndef _MATH_BACKEND_H_
#include "math/public/math_backend.h"
#endif
#include <array>
//------------------------------------------------------------------------------
/// 3D integer point
@ -97,6 +102,7 @@ public:
class Point3D;
//------------------------------------------------------------------------------
using namespace math_backend::float3::dispatch;
class Point3F
{
//-------------------------------------- Public data
@ -497,7 +503,8 @@ inline void Point3F::setMax(const Point3F& _test)
inline void Point3F::interpolate(const Point3F& _from, const Point3F& _to, F32 _factor)
{
AssertFatal(_factor >= 0.0f && _factor <= 1.0f, "Out of bound interpolation factor");
m_point3F_interpolate( _from, _to, _factor, *this);
GetFloat3().lerp(_from, _to, _factor, *this);
}
inline void Point3F::zero()
@ -585,31 +592,27 @@ inline void Point3F::neg()
inline void Point3F::convolve(const Point3F& c)
{
x *= c.x;
y *= c.y;
z *= c.z;
GetFloat3().mul(*this, c, *this);
}
inline void Point3F::convolveInverse(const Point3F& c)
{
x /= c.x;
y /= c.y;
z /= c.z;
GetFloat3().div(*this, c, *this);
}
inline F32 Point3F::lenSquared() const
{
return (x * x) + (y * y) + (z * z);
return GetFloat3().lengthSquared(*this);
}
inline F32 Point3F::len() const
{
return mSqrt(x*x + y*y + z*z);
return GetFloat3().length(*this);
}
inline void Point3F::normalize()
{
m_point3F_normalize(*this);
GetFloat3().normalize(*this);
}
inline F32 Point3F::magnitudeSafe() const
@ -626,18 +629,13 @@ inline F32 Point3F::magnitudeSafe() const
inline void Point3F::normalizeSafe()
{
F32 vmag = magnitudeSafe();
if( vmag > POINT_EPSILON )
{
*this *= F32(1.0 / vmag);
}
if (vmag > POINT_EPSILON) // keep the zero-length guard; the backend normalize is not guaranteed safe for degenerate vectors
GetFloat3().normalize(*this);
}
inline void Point3F::normalize(F32 val)
{
m_point3F_normalize_f(*this, val);
GetFloat3().normalize_mag(*this, val);
}
inline bool Point3F::operator==(const Point3F& _test) const
@ -652,52 +650,49 @@ inline bool Point3F::operator!=(const Point3F& _test) const
inline Point3F Point3F::operator+(const Point3F& _add) const
{
return Point3F(x + _add.x, y + _add.y, z + _add.z);
Point3F temp;
GetFloat3().add(*this, _add, temp);
return temp;
}
inline Point3F Point3F::operator-(const Point3F& _rSub) const
{
return Point3F(x - _rSub.x, y - _rSub.y, z - _rSub.z);
Point3F temp;
GetFloat3().sub(*this, _rSub, temp);
return temp;
}
inline Point3F& Point3F::operator+=(const Point3F& _add)
{
x += _add.x;
y += _add.y;
z += _add.z;
GetFloat3().add(*this, _add, *this);
return *this;
}
inline Point3F& Point3F::operator-=(const Point3F& _rSub)
{
x -= _rSub.x;
y -= _rSub.y;
z -= _rSub.z;
GetFloat3().sub(*this, _rSub, *this);
return *this;
}
inline Point3F Point3F::operator*(F32 _mul) const
{
return Point3F(x * _mul, y * _mul, z * _mul);
Point3F temp;
GetFloat3().mul_scalar(*this, _mul, temp);
return temp;
}
inline Point3F Point3F::operator/(F32 _div) const
{
AssertFatal(_div != 0.0f, "Error, div by zero attempted");
F32 inv = 1.0f / _div;
return Point3F(x * inv, y * inv, z * inv);
Point3F temp;
GetFloat3().div_scalar(*this, _div, temp);
return temp;
}
inline Point3F& Point3F::operator*=(F32 _mul)
{
x *= _mul;
y *= _mul;
z *= _mul;
GetFloat3().mul_scalar(*this, _mul, *this);
return *this;
}
@ -705,39 +700,35 @@ inline Point3F& Point3F::operator/=(F32 _div)
{
AssertFatal(_div != 0.0f, "Error, div by zero attempted");
F32 inv = 1.0f / _div;
x *= inv;
y *= inv;
z *= inv;
GetFloat3().div_scalar(*this, _div, *this);
return *this;
}
inline Point3F Point3F::operator*(const Point3F &_vec) const
{
return Point3F(x * _vec.x, y * _vec.y, z * _vec.z);
Point3F temp;
GetFloat3().mul(*this, _vec, temp);
return temp;
}
inline Point3F& Point3F::operator*=(const Point3F &_vec)
{
x *= _vec.x;
y *= _vec.y;
z *= _vec.z;
GetFloat3().mul(*this, _vec, *this);
return *this;
}
inline Point3F Point3F::operator/(const Point3F &_vec) const
{
AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted");
return Point3F(x / _vec.x, y / _vec.y, z / _vec.z);
Point3F temp;
GetFloat3().div(*this, _vec, temp);
return temp;
}
inline Point3F& Point3F::operator/=(const Point3F &_vec)
{
AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted");
x /= _vec.x;
y /= _vec.y;
z /= _vec.z;
GetFloat3().div(*this, _vec, *this);
return *this;
}
@ -989,7 +980,9 @@ inline Point3I operator*(S32 mul, const Point3I& multiplicand)
inline Point3F operator*(F32 mul, const Point3F& multiplicand)
{
return multiplicand * mul;
Point3F temp;
GetFloat3().mul_scalar(multiplicand, mul, temp);
return temp;
}
inline Point3D operator*(F64 mul, const Point3D& multiplicand)
@ -999,7 +992,7 @@ inline Point3D operator*(F64 mul, const Point3D& multiplicand)
inline F32 mDot(const Point3F &p1, const Point3F &p2)
{
return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z);
return GetFloat3().dot(p1, p2);
}
inline F64 mDot(const Point3D &p1, const Point3D &p2)
@ -1009,9 +1002,7 @@ inline F64 mDot(const Point3D &p1, const Point3D &p2)
inline void mCross(const Point3F &a, const Point3F &b, Point3F *res)
{
res->x = (a.y * b.z) - (a.z * b.y);
res->y = (a.z * b.x) - (a.x * b.z);
res->z = (a.x * b.y) - (a.y * b.x);
GetFloat3().cross(a, b, *res);
}
inline void mCross(const Point3D &a, const Point3D &b, Point3D *res)
@ -1024,7 +1015,7 @@ inline void mCross(const Point3D &a, const Point3D &b, Point3D *res)
inline Point3F mCross(const Point3F &a, const Point3F &b)
{
Point3F r;
mCross( a, b, &r );
GetFloat3().cross(a, b, r);
return r;
}

View file

@ -26,10 +26,12 @@
#ifndef _MMATHFN_H_
#include "math/mMathFn.h"
#endif
#ifndef _MPOINT3_H_
#include "math/mPoint3.h"
#endif
#ifndef _MATH_BACKEND_H_
#include "math/public/math_backend.h"
#endif
//------------------------------------------------------------------------------
@ -61,6 +63,8 @@ class Point4I
/// Uses F32 internally.
///
/// Useful for representing quaternions and other 4d beasties.
using namespace math_backend::float4::dispatch;
class Point4F
{
//-------------------------------------- Public data
@ -152,15 +156,12 @@ inline void Point4F::set(F32 _x, F32 _y, F32 _z, F32 _w)
inline F32 Point4F::len() const
{
return mSqrt(x*x + y*y + z*z + w*w);
return GetFloat4().length(*this);
}
inline void Point4F::interpolate(const Point4F& _from, const Point4F& _to, F32 _factor)
{
x = (_from.x * (1.0f - _factor)) + (_to.x * _factor);
y = (_from.y * (1.0f - _factor)) + (_to.y * _factor);
z = (_from.z * (1.0f - _factor)) + (_to.z * _factor);
w = (_from.w * (1.0f - _factor)) + (_to.w * _factor);
GetFloat4().lerp(_from, _to, _factor, *this);
}
inline void Point4F::zero()
@ -193,55 +194,55 @@ inline Point4F& Point4F::operator/=(F32 scalar)
if (mIsZero(scalar))
return *this;
F32 denom = 1 / scalar;
x *= denom;
y *= denom;
z *= denom;
w *= denom;
GetFloat4().div_scalar(*this, scalar, *this);
return *this;
}
inline Point4F Point4F::operator+(const Point4F& _add) const
{
return Point4F( x + _add.x, y + _add.y, z + _add.z, w + _add.w );
Point4F res;
GetFloat4().add(*this, _add, res);
return res;
}
inline Point4F& Point4F::operator+=(const Point4F& _add)
{
x += _add.x;
y += _add.y;
z += _add.z;
w += _add.w;
GetFloat4().add(*this, _add, *this);
return *this;
}
inline Point4F Point4F::operator-(const Point4F& _rSub) const
{
return Point4F( x - _rSub.x, y - _rSub.y, z - _rSub.z, w - _rSub.w );
Point4F res;
GetFloat4().sub(*this, _rSub, res);
return res;
}
inline Point4F Point4F::operator*(const Point4F &_vec) const
{
return Point4F(x * _vec.x, y * _vec.y, z * _vec.z, w * _vec.w);
Point4F res;
GetFloat4().mul(*this, _vec, res);
return res;
}
inline Point4F Point4F::operator*(F32 _mul) const
{
return Point4F(x * _mul, y * _mul, z * _mul, w * _mul);
Point4F res;
GetFloat4().mul_scalar(*this, _mul, res);
return res;
}
inline Point4F Point4F::operator /(F32 t) const
{
F32 f = 1.0f / t;
return Point4F( x * f, y * f, z * f, w * f );
Point4F res;
GetFloat4().div_scalar(*this, t, res);
return res;
}
inline F32 mDot(const Point4F &p1, const Point4F &p2)
{
return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z + p1.w*p2.w);
return GetFloat4().dot(p1, p2);
}
//------------------------------------------------------------------------------

View file

@ -0,0 +1,39 @@
#pragma once
#ifndef _FLOAT3_DISPATCH_H_
#define _FLOAT3_DISPATCH_H_
#include <cstdint>
namespace math_backend::float3::dispatch
{
struct Float3Funcs
{
void (*add)(const float*, const float*, float*) = nullptr;
void (*sub)(const float*, const float*, float*) = nullptr;
void (*mul)(const float*, const float*, float*) = nullptr;
void (*mul_scalar)(const float*, float, float*) = nullptr;
void (*div)(const float*, const float*, float*) = nullptr;
void (*div_scalar)(const float*, float, float*) = nullptr;
float (*dot)(const float*, const float*) = nullptr;
float (*length)(const float*) = nullptr;
float (*lengthSquared)(const float*) = nullptr;
void (*normalize)(float*) = nullptr;
void (*normalize_mag)(float*, float) = nullptr;
void (*lerp)(const float*, const float*, float, float*) = nullptr;
void (*cross)(const float*, const float*, float*) = nullptr;
};
// Global dispatch table
extern Float3Funcs gFloat3;
// Backend installers (defined in ISA libraries)
void install_scalar();
void install_sse2();
void install_sse41();
void install_avx();
void install_avx2();
void install_neon();
}
#endif // !_FLOAT3_DISPATCH_H_
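For readers new to the dispatch-table pattern: each ISA library fills the function pointers with its own kernels. A minimal sketch of what the scalar installer might look like follows; the kernel names are illustrative assumptions, not the commit's actual impl code, and the real installer assigns every pointer in the table.

// Sketch only; kernel names are hypothetical.
#include "math/public/float3_dispatch.h"
#include <cmath>

namespace math_backend::float3::dispatch
{
   namespace
   {
      void scalar_add(const float* a, const float* b, float* r)
      {
         r[0] = a[0] + b[0]; r[1] = a[1] + b[1]; r[2] = a[2] + b[2];
      }
      float scalar_dot(const float* a, const float* b)
      {
         return a[0]*b[0] + a[1]*b[1] + a[2]*b[2];
      }
      void scalar_normalize(float* v)
      {
         float len = std::sqrt(scalar_dot(v, v));
         if (len > 0.0f) { v[0] /= len; v[1] /= len; v[2] /= len; }
      }
   }

   void install_scalar()
   {
      gFloat3.add       = &scalar_add;
      gFloat3.dot       = &scalar_dot;
      gFloat3.normalize = &scalar_normalize;
      // ...remaining pointers are assigned the same way...
   }
}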

View file

@ -0,0 +1,39 @@
#pragma once
#ifndef _FLOAT4_DISPATCH_H_
#define _FLOAT4_DISPATCH_H_
#include <cstdint>
namespace math_backend::float4::dispatch
{
struct Float4Funcs
{
void (*add)(const float*, const float*, float*) = nullptr;
void (*sub)(const float*, const float*, float*) = nullptr;
void (*mul)(const float*, const float*, float*) = nullptr;
void (*mul_scalar)(const float*, float, float*) = nullptr;
void (*div)(const float*, const float*, float*) = nullptr;
void (*div_scalar)(const float*, float, float*) = nullptr;
float (*dot)(const float*, const float*) = nullptr;
float (*length)(const float*) = nullptr;
float (*lengthSquared)(const float*) = nullptr;
void (*normalize)(float*) = nullptr;
void (*normalize_mag)(float*, float) = nullptr;
void (*lerp)(const float*, const float*, float, float*) = nullptr;
void (*cross)(const float*, const float*, float*) = nullptr;
};
// Global dispatch table
extern Float4Funcs gFloat4;
// Backend installers (defined in ISA libraries)
void install_scalar();
void install_sse2();
void install_sse41();
void install_avx();
void install_avx2();
void install_neon();
}
#endif // !_FLOAT4_DISPATCH_H_
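As a companion sketch, here is roughly what an SSE2 kernel behind the four-lane table could look like using raw intrinsics. This is an assumption about the impl style, not the commit's actual code; unaligned loads are used because Point4F carries no alignment guarantee.

#include <emmintrin.h> // SSE2

// Hypothetical SSE2 kernel: r = a + b over four float lanes.
static void sse2_add4(const float* a, const float* b, float* r)
{
   __m128 va = _mm_loadu_ps(a); // unaligned: Point4F is not alignas(16)
   __m128 vb = _mm_loadu_ps(b);
   _mm_storeu_ps(r, _mm_add_ps(va, vb));
}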

View file

@ -0,0 +1,39 @@
#pragma once
#ifndef _MAT44_DISPATCH_H_
#define _MAT44_DISPATCH_H_
namespace math_backend::mat44::dispatch
{
struct Mat44Funcs
{
void (*transpose)(float*) = nullptr;
void (*inverse)(float*) = nullptr;
void (*inverse_to)(const float*, float*) = nullptr;
void (*affine_inverse)(float*) = nullptr;
void (*normalize)(float*) = nullptr;
void (*mul_mat44)(const float* a, const float* b, float* r) = nullptr;
void (*mul_pos3)(const float* a, const float* b, float* r) = nullptr;
void (*mul_vec3)(const float* a, const float* b, float* r) = nullptr;
void (*mul_float4)(const float* a, const float* b, float* r) = nullptr;
void (*transform_plane)(const float* m, const float* s, const float* p, float* r) = nullptr;
float (*determinant)(const float*) = nullptr;
void (*scale)(float*, const float*) = nullptr;
void (*get_scale)(const float*, float*) = nullptr;
void (*batch_mul_pos3)(const float* m, const float* pts, int count, float* result) = nullptr;
};
// Global dispatch table
extern Mat44Funcs gMat44;
// Backend installers (defined in ISA libraries)
void install_scalar();
void install_sse2();
void install_sse41();
void install_avx();
void install_avx2();
void install_neon();
}
#endif // !_MAT44_DISPATCH_H_
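A scalar reference for the heaviest entry in the table, mul_mat44, assuming MatrixF's row-major storage (m[row * 4 + col]). The no-aliasing requirement on r matches how MatrixF::mul stages a temporary before calling through the table. Sketch only, not the commit's impl code.

// Hypothetical scalar kernel: r = a * b, row-major 4x4; r must not alias a or b.
static void scalar_mul_mat44(const float* a, const float* b, float* r)
{
   for (int row = 0; row < 4; ++row)
      for (int col = 0; col < 4; ++col)
      {
         float sum = 0.0f;
         for (int k = 0; k < 4; ++k)
            sum += a[row * 4 + k] * b[k * 4 + col];
         r[row * 4 + col] = sum;
      }
}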

View file

@ -0,0 +1,103 @@
#include "math/public/math_backend.h"
namespace math_backend::float4::dispatch
{
// Single definition of the global dispatch table
Float4Funcs gFloat4{};
}
namespace math_backend::float3::dispatch
{
// Single definition of the global dispatch table
Float3Funcs gFloat3{};
}
namespace math_backend::mat44::dispatch
{
Mat44Funcs gMat44{};
}
namespace math_backend
{
// Use an anonymous namespace for the static initializer
namespace {
struct ScalarInitializer
{
ScalarInitializer()
{
// Install scalar defaults immediately for all types
float4::dispatch::install_scalar();
float3::dispatch::install_scalar();
mat44::dispatch::install_scalar();
}
};
// Static instance ensures constructor runs before main()
ScalarInitializer g_scalarInitializer;
}
}
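Both bootstrap paths exist because C++ gives no ordering guarantee between static initializers in different translation units: another file-scope constructor may call into the math types before g_scalarInitializer runs, and the call_once fallback in the getters covers exactly that window.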
math_backend::backend math_backend::choose_backend(U32 cpu_flags)
{
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86)
if (cpu_flags & CPU_PROP_AVX2) return backend::avx2;
if (cpu_flags & CPU_PROP_AVX) return backend::avx;
if (cpu_flags & CPU_PROP_SSE4_1) return backend::sse41;
if (cpu_flags & CPU_PROP_SSE2) return backend::sse2;
#elif defined(__aarch64__) || defined(_M_ARM64) || defined(__ARM_NEON)
if (cpu_flags & CPU_PROP_NEON) return backend::neon;
#endif
return backend::scalar;
}
void math_backend::install_from_cpu_flags(U32 cpu_flags)
{
g_backend = choose_backend(cpu_flags);
switch (g_backend)
{
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86)
case backend::avx2:
float4::dispatch::install_avx2();
float3::dispatch::install_avx2();
mat44::dispatch::install_avx2();
break;
case backend::avx:
float4::dispatch::install_avx();
float3::dispatch::install_avx();
mat44::dispatch::install_avx();
break;
case backend::sse41:
float4::dispatch::install_sse41();
float3::dispatch::install_sse41();
mat44::dispatch::install_sse41();
break;
case backend::sse2:
float4::dispatch::install_sse2();
float3::dispatch::install_sse2();
mat44::dispatch::install_sse2();
break;
#elif defined(__aarch64__) || defined(_M_ARM64) || defined(__ARM_NEON)
case backend::neon:
float4::dispatch::install_neon();
float3::dispatch::install_neon();
mat44::dispatch::install_neon();
break;
#endif
default:
float4::dispatch::install_scalar();
float3::dispatch::install_scalar();
mat44::dispatch::install_scalar();
break;
}
}

View file

@ -0,0 +1,95 @@
#pragma once
#ifndef _MATH_BACKEND_H_
#define _MATH_BACKEND_H_
#ifndef _MCONSTANTS_H_
#include "math/mConstants.h"
#endif
#ifndef _PLATFORMASSERT_H_
#include "platform/platformAssert.h"
#endif
#ifndef _FLOAT4_DISPATCH_H_
#include "math/public/float4_dispatch.h"
#endif
#ifndef _FLOAT3_DISPATCH_H_
#include "math/public/float3_dispatch.h"
#endif
#ifndef _MAT44_DISPATCH_H_
#include "math/public/mat44_dispatch.h"
#endif
namespace math_backend
{
enum class backend
{
scalar,
sse2,
sse41,
avx,
avx2,
neon
};
inline backend g_backend = backend::scalar; // inline (C++17): one shared definition across translation units, not a per-TU static copy
backend choose_backend(U32 cpu_flags);
void install_from_cpu_flags(U32 cpu_flags);
}
#include <mutex>
namespace math_backend::float4::dispatch
{
//--------------------------------------------------
// Thread-safe getter, ensures scalar installed if empty
//--------------------------------------------------
inline Float4Funcs& GetFloat4()
{
if (gFloat4.mul == nullptr)
{
static std::once_flag once;
std::call_once(once, []{
install_scalar();
});
}
return gFloat4;
}
}
namespace math_backend::float3::dispatch
{
//--------------------------------------------------
// Thread-safe getter, ensures scalar installed if empty
//--------------------------------------------------
inline Float3Funcs& GetFloat3()
{
if (gFloat3.mul == nullptr)
{
static std::once_flag once;
std::call_once(once, []{
install_scalar();
});
}
return gFloat3;
}
}
namespace math_backend::mat44::dispatch
{
//--------------------------------------------------
// Thread-safe getter, ensures scalar installed if empty
//--------------------------------------------------
inline Mat44Funcs& GetMat44()
{
if (gMat44.mul_mat44 == nullptr)
{
static std::once_flag once;
std::call_once(once, []{
install_scalar();
});
}
return gMat44;
}
}
#endif // !_MATH_BACKEND_H_
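A hypothetical illustration of why the fallback matters: a file-scope constant that normalizes during static initialization still gets correct (scalar) results, even if Math::init() has not yet installed an ISA backend.

// Runs before main(); GetFloat3() lazily installs the scalar table.
static const Point3F gDiagonal = []
{
   Point3F v(1.0f, 1.0f, 1.0f);
   v.normalize(); // dispatches through gFloat3
   return v;
}();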

View file

@ -381,8 +381,10 @@ void FrustumData::_update() const
// Transform the points into the desired culling space.
for( U32 i = 0; i < mPoints.size(); ++ i )
mTransform.mulP( mPoints[ i ] );
mTransform.batch_mulP(mPoints.address(), mPoints.size());
// Update the axis aligned bounding box from
// the newly transformed points.

View file

@ -75,10 +75,12 @@ enum ProcessorProperties
CPU_PROP_SSE4_1 = (1<<7), ///< Supports SSE4_1 instruction set extension.
CPU_PROP_SSE4_2 = (1<<8), ///< Supports SSE4_2 instruction set extension.
CPU_PROP_AVX = (1<<9), ///< Supports AVX256 instruction set extension.
CPU_PROP_MP = (1<<10), ///< This is a multi-processor system.
CPU_PROP_LE = (1<<11), ///< This processor is LITTLE ENDIAN.
CPU_PROP_64bit = (1<<12), ///< This processor is 64-bit capable
CPU_PROP_NEON = (1<<13), ///< Supports the Arm Neon instruction set extension.
CPU_PROP_AVX2 = (1<<10), ///< Supports AVX2 instruction set extension.
CPU_PROP_AVX512 = (1<<11), ///< Supports AVX512 instruction set extension.
CPU_PROP_MP = (1<<12), ///< This is a multi-processor system.
CPU_PROP_LE = (1<<13), ///< This processor is LITTLE ENDIAN.
CPU_PROP_64bit = (1<<14), ///< This processor is 64-bit capable
CPU_PROP_NEON = (1<<15), ///< Supports the Arm Neon instruction set extension.
};
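Note that inserting CPU_PROP_AVX2 and CPU_PROP_AVX512 renumbers CPU_PROP_MP and everything after it, so anything that hard-coded the old bit values rather than the enum names needs a rebuild. Symbolic checks are unaffected:

// Unaffected by the renumbering, since it uses the enum symbolically.
if (Platform::SystemInfo.processor.properties & CPU_PROP_AVX2)
   Con::printf("AVX2 path available");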
/// Processor info manager.

View file

@ -25,6 +25,7 @@
#import "math/mMath.h"
#import "core/strings/stringFunctions.h"
#include "console/engineAPI.h"
#include "math/public/math_backend.h"
extern void mInstallLibrary_C();
@ -107,14 +108,10 @@ void Math::init(U32 properties)
Con::printf("Math Init:");
Con::printf(" Installing Standard C extensions");
mInstallLibrary_C();
#ifdef TORQUE_CPU_X86
if( properties & CPU_PROP_SSE )
{
Con::printf( " Installing SSE extensions" );
}
#endif
Con::printf( " Installing ISA extensions" );
math_backend::install_from_cpu_flags(properties);
Con::printf(" ");
}

View file

@ -27,6 +27,7 @@
#include "math/mMath.h"
#include "core/strings/stringFunctions.h"
#include "console/engineAPI.h"
#include "math/public/math_backend.h"
extern void mInstallLibrary_C();
extern void mInstallLibrary_ASM();
@ -90,29 +91,14 @@ void Math::init(U32 properties)
Con::printf(" Installing Standard C extensions");
mInstallLibrary_C();
Con::printf(" Installing ISA extensions");
math_backend::install_from_cpu_flags(properties);
#if defined(TORQUE_CPU_X32) || defined(TORQUE_CPU_X64)
Con::printf(" Installing Assembly extensions");
mInstallLibrary_ASM();
#endif
if (properties & CPU_PROP_FPU)
{
Con::printf(" Installing FPU extensions");
}
if (properties & CPU_PROP_MMX)
{
Con::printf(" Installing MMX extensions");
}
#if !defined(__MWERKS__) || (__MWERKS__ >= 0x2400)
if (properties & CPU_PROP_SSE)
{
Con::printf(" Installing SSE extensions");
}
#endif //mwerks>2.4
Con::printf(" ");
}

View file

@ -72,13 +72,15 @@ enum CpuFlags
BIT_XSAVE_RESTORE = BIT(27),
BIT_AVX = BIT(28),
BIT_AVX2 = BIT(5), // CPUID.(EAX=7,ECX=0):EBX bit 5
BIT_AVX512F = BIT(16) // CPUID.(EAX=7,ECX=0):EBX bit 16
};
static void detectCpuFeatures(Platform::SystemInfo_struct::Processor &processor)
{
S32 cpuInfo[4];
__cpuid(cpuInfo, 1);
//U32 eax = cpuInfo[0]; // eax
int nIds = cpuInfo[0];
U32 edx = cpuInfo[3]; // edx
U32 ecx = cpuInfo[2]; // ecx
@ -90,6 +92,18 @@ static void detectCpuFeatures(Platform::SystemInfo_struct::Processor &processor)
processor.properties |= (ecx & BIT_SSE4_1) ? CPU_PROP_SSE4_1 : 0;
processor.properties |= (ecx & BIT_SSE4_2) ? CPU_PROP_SSE4_2 : 0;
if (nIds >= 7)
{
int ext[4];
__cpuidex(ext, 7, 0);
if (ext[1] & (BIT_AVX2)) // EBX bit 5
processor.properties |= CPU_PROP_AVX2;
if (ext[1] & (BIT_AVX512F)) // AVX-512 Foundation
processor.properties |= CPU_PROP_AVX512;
}
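One hedged caveat, not addressed in this commit: like AVX, AVX-512 requires OS support for the wider ZMM register state, which is visible through XCR0. A possible follow-up hardening, using the MSVC _xgetbv intrinsic alongside the __cpuidex call already used here:

// Sketch: gate CPU_PROP_AVX512 on OS state support as well.
// XCR0 bits: SSE(1), AVX(2), opmask(5), ZMM_Hi256(6), Hi16_ZMM(7) -> 0xE6.
if ((ecx & BIT_XSAVE_RESTORE) && (ext[1] & BIT_AVX512F))
{
   unsigned long long xcr0 = _xgetbv(0);
   if ((xcr0 & 0xE6) == 0xE6)
      processor.properties |= CPU_PROP_AVX512;
}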
// AVX detection requires that xsaverestore is supported
if (ecx & BIT_XSAVE_RESTORE && ecx & BIT_AVX)
{
@ -176,6 +190,10 @@ void Processor::init()
Con::printf(" SSE4.2 detected" );
if (Platform::SystemInfo.processor.properties & CPU_PROP_AVX)
Con::printf(" AVX detected");
if (Platform::SystemInfo.processor.properties & CPU_PROP_AVX2)
Con::printf(" AVX2 detected");
if (Platform::SystemInfo.processor.properties & CPU_PROP_AVX512)
Con::printf(" AVX512 detected");
if (Platform::SystemInfo.processor.properties & CPU_PROP_MP)
Con::printf(" MultiCore CPU detected [%i cores, %i logical]", Platform::SystemInfo.processor.numPhysicalProcessors, Platform::SystemInfo.processor.numLogicalProcessors);

View file

@ -25,7 +25,7 @@
#include "console/engineAPI.h"
#include "math/mMath.h"
#include "math/public/math_backend.h"
extern void mInstallLibrary_C();
extern void mInstallLibrary_ASM();
@ -98,24 +98,12 @@ void Math::init(U32 properties)
Con::printf(" Installing Standard C extensions");
mInstallLibrary_C();
Con::printf(" Installing ISA extensions");
math_backend::install_from_cpu_flags(properties);
Con::printf(" Installing Assembly extensions");
mInstallLibrary_ASM();
if (properties & CPU_PROP_FPU)
{
Con::printf(" Installing FPU extensions");
}
if (properties & CPU_PROP_MMX)
{
Con::printf(" Installing MMX extensions");
}
if (properties & CPU_PROP_SSE)
{
Con::printf(" Installing SSE extensions");
}
Con::printf(" ");
}

View file

@ -250,7 +250,7 @@ MatrixF AssimpAppNode::getNodeTransform(F32 time)
// Check for inverted node coordinate spaces => can happen when modelers
// use the 'mirror' tool in their 3d app. Shows up as negative <scale>
// transforms in the collada model.
if (m_matF_determinant(nodeTransform) < 0.0f)
if (nodeTransform.determinant() < 0.0f)
{
// Mark this node as inverted so we can mirror mesh geometry, then
// de-invert the transform matrix

View file

@ -1099,7 +1099,7 @@ void ColladaAppMesh::lookupSkinData()
// Inverted node coordinate spaces (negative scale factor) are corrected
// in ColladaAppNode::getNodeTransform, so need to apply the same operation
// here to match
if (m_matF_determinant(invBind) < 0.0f)
if (invBind.determinant() < 0.0f)
initialTransforms[iJoint].scale(Point3F(1, 1, -1));
initialTransforms[iJoint].mul(invBind);

View file

@ -192,7 +192,7 @@ MatrixF ColladaAppNode::getNodeTransform(F32 time)
// Check for inverted node coordinate spaces => can happen when modelers
// use the 'mirror' tool in their 3d app. Shows up as negative <scale>
// transforms in the collada model.
if (m_matF_determinant(nodeTransform) < 0.0f)
if (nodeTransform.determinant() < 0.0f)
{
// Mark this node as inverted so we can mirror mesh geometry, then
// de-invert the transform matrix

View file

@ -36,6 +36,8 @@ void FPSTracker::reset()
{
fpsNext = (F32)Platform::getRealMilliseconds()/1000.0f + mUpdateInterval;
fpsAccumTime = 0.0f;
fpsAccumVirtualTime = 0.0f;
fpsRealLast = 0.0f;
fpsReal = 0.0f;
fpsRealMin = 0.000001f; // Avoid division by zero.
@ -51,42 +53,60 @@ void FPSTracker::update()
F32 realSeconds = (F32)Platform::getRealMilliseconds()/1000.0f;
F32 virtualSeconds = (F32)Platform::getVirtualMilliseconds()/1000.0f;
fpsFrames++;
if (fpsFrames > 1)
if (fpsRealLast == 0.0f)
{
fpsReal = fpsReal*(1.0-alpha) + (realSeconds-fpsRealLast)*alpha;
fpsVirtual = fpsVirtual*(1.0-alpha) + (virtualSeconds-fpsVirtualLast)*alpha;
if( fpsFrames > 10 ) // Wait a few frames before updating these.
{
// Update min/max. This is a bit counter-intuitive, as the comparisons are
// inversed because these are all one-over-x values.
if( fpsReal > fpsRealMin )
fpsRealMin = fpsReal;
if( fpsReal < fpsRealMax )
fpsRealMax = fpsReal;
}
fpsRealLast = realSeconds;
fpsVirtualLast = virtualSeconds;
return;
}
fpsRealLast = realSeconds;
F32 frameTimeReal = realSeconds - fpsRealLast;
F32 frameTimeVirtual = virtualSeconds - fpsVirtualLast;
fpsRealLast = realSeconds;
fpsVirtualLast = virtualSeconds;
// update variables every few frames
F32 update = fpsRealLast - fpsNext;
if (update > 0.5f)
{
F32 delta = realSeconds - fpsNext;
Con::setVariable( "fps::frameDelta",avar("%g", delta));
Con::setVariable( "fps::real", avar( "%4.1f", 1.0f / fpsReal ) );
Con::setVariable( "fps::realMin", avar( "%4.1f", 1.0f / fpsRealMin ) );
Con::setVariable( "fps::realMax", avar( "%4.1f", 1.0f / fpsRealMax ) );
Con::setVariable( "fps::virtual", avar( "%4.1f", 1.0f / fpsVirtual ) );
// Accumulate for windowed FPS calculation
fpsFrames++;
fpsAccumTime += frameTimeReal;
fpsAccumVirtualTime += frameTimeVirtual;
if (update > mUpdateInterval)
fpsNext = fpsRealLast + mUpdateInterval;
else
fpsNext += mUpdateInterval;
// Only update console values at interval
if (realSeconds >= fpsNext)
{
fpsReal = 0.0f;
fpsVirtual = 0.0f;
if (fpsAccumTime > 0.0f)
fpsReal = fpsFrames / fpsAccumTime;
if (fpsAccumVirtualTime > 0.0f)
fpsVirtual = fpsFrames / fpsAccumVirtualTime;
// Update min/max (these are FPS now). fpsRealMin starts at a tiny
// epsilon in reset(), so seed it from the first window rather than
// comparing against that sentinel.
if (fpsReal > 0.0f)
{
if (fpsRealMin <= 0.000001f || fpsReal < fpsRealMin)
fpsRealMin = fpsReal;
if (fpsReal > fpsRealMax)
fpsRealMax = fpsReal;
}
// frameDelta = actual accumulated real time over window
Con::setVariable("fps::frameTimeMs", avar("%4.3f", (fpsAccumTime / fpsFrames) * 1000.0f));
Con::setVariable("fps::frameDelta", avar("%g", fpsAccumTime));
Con::setVariable("fps::real", avar("%4.1f", fpsReal));
Con::setVariable("fps::realMin", avar("%4.1f", fpsRealMin));
Con::setVariable("fps::realMax", avar("%4.1f", fpsRealMax));
Con::setVariable("fps::virtual", avar("%4.1f", fpsVirtual));
// Reset window
fpsFrames = 0;
fpsAccumTime = 0.0f;
fpsAccumVirtualTime = 0.0f;
fpsNext = realSeconds + mUpdateInterval;
}
}
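Worked example of the windowed average: 30 frames accumulated over 0.5 s of real time reports fps::real = 60.0 and fps::frameTimeMs = 16.667, then the window resets.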

View file

@ -27,6 +27,8 @@
struct FPSTracker
{
F32 fpsAccumTime;
F32 fpsAccumVirtualTime;
F32 fpsRealLast;
F32 fpsReal;
F32 fpsRealMin;
@ -48,4 +50,4 @@ struct FPSTracker
extern FPSTracker gFPS;
#endif
#endif